Compare commits

...

10 Commits

Author SHA1 Message Date
Carl Pearson
80d18fca11 don't use poetry, update Perlmutter environment 2023-03-09 10:24:58 -08:00
Carl Pearson
07035904ee import datasets -> from lib import datasets 2022-11-02 08:51:02 -07:00
Carl Pearson
a57793a6f4 create directory if it doesn't exist 2022-11-02 08:50:33 -07:00
Carl Pearson
f65421a4cf rzvernal 2022-11-02 08:50:08 -07:00
Carl William Pearson
34bc594b04 ignore .python-version 2022-01-28 13:14:43 -07:00
Carl William Pearson
4584ce14ee add square datasets 2022-01-28 13:14:30 -07:00
Carl William Pearson
29f5289068 ascicgpu 2022-01-28 13:14:19 -07:00
Carl Pearson
800485d984 don't download enormous matrices 2021-12-01 14:58:08 -08:00
Carl Pearson
ecd6cebf05 readme 2021-12-01 14:32:07 -08:00
Carl Pearson
9800c3b5f9 automatically download nonzero datatype metadata 2021-12-01 14:27:44 -08:00
12 changed files with 153 additions and 175 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
__pycache__ __pycache__
.python-version

11
Pipfile Normal file
View File

@@ -0,0 +1,11 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
[dev-packages]
[requires]
python_version = "3.9"

View File

@@ -1,22 +1,21 @@
# ss-downloader # ss-downloader
Install poetry & Python 3.8+
``` ```
curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python - pipenv shell
pip install -r requirements.txt
``` ```
## how to use ## how to use
``` ```
source load-env.sh source load-env.sh
poetry run python list.py python list.py
poetry run python download.py <dataset name> python download.py <dataset name>
``` ```
To download all datasets To download all datasets
``` ```
poetry run python download.py all python download.py all
``` ```
You can move the datasets due to relative symlinks. For example: You can move the datasets due to relative symlinks. For example:
@@ -37,7 +36,13 @@ Then, a relative symlink is created from the `$SCRATCH/<subset>/<matrix>.mtx` fi
This makes use of a [fork of the `ssgetpy`](github.com/cwpearson/ssgetpy) package with a faster download limit. This makes use of a [fork of the `ssgetpy`](github.com/cwpearson/ssgetpy) package with a faster download limit.
ssgetpy does not discriminate "real" datatype from "integer" datatype, as shown on the suitesparse collection website. ssgetpy does not discriminate "real" datatype from "integer" datatype, as shown on the suitesparse collection website.
Therefore, `lists.py` maintains a manually-curated list of `integer` datatype matrices to facilitate discrimination. Therefore, we access https://sparse.tamu.edu/files/ss_index.mat to determine that metadata for each file.
## Transfer data to a different filesystem
```
rsync -rzvh --links --info=progress2 pearson@cori.nersc.gov:$SS_DIR/ .
```
## how this was done ## how this was done
@@ -48,4 +53,4 @@ poetry add ssgetpy
``` ```
poetry install poetry install
``` ```

View File

@@ -2,8 +2,15 @@ import os
import sys import sys
from pathlib import Path from pathlib import Path
from lib import matrix
try: try:
DIR = Path(os.environ["SS_DIR"]) DIR = Path(os.environ["SS_DIR"])
except KeyError as e: except KeyError as e:
print("ERROR: $SS_DIR not set") print("ERROR: $SS_DIR not set")
sys.exit(1) sys.exit(1)
SS_ROOT_URL = "https://sparse.tamu.edu"

View File

@@ -3,7 +3,7 @@ import sys
import ssgetpy import ssgetpy
from lib import lists from lib import dtypes
Dataset = collections.namedtuple("Dataset", ["name", "mats"]) Dataset = collections.namedtuple("Dataset", ["name", "mats"])
@@ -15,18 +15,19 @@ def safe_dir_name(s):
t = t.lower() t = t.lower()
return t return t
def mat_is_integer(mat): def mat_is_real(mat):
return mat.name in lists.INTEGER_MATS val = dtypes.DTYPES[(mat.group, mat.name)] == "real"
return val
def filter_reject_integer(mats): def filter_keep_real(mats):
return [mat for mat in mats if not mat_is_integer(mat)] return [mat for mat in mats if mat_is_real(mat)]
def mat_is_small(mat): def mat_is_small(mat):
return (mat.rows < 1_000 and mat.cols < 1_000) \ return (mat.rows < 1_000 and mat.cols < 1_000) \
or mat.nnz < 20_000 or mat.nnz < 20_000
def mat_is_large(mat): def mat_is_large(mat):
return (mat.rows > 1_000_000 and mat.cols < 1_000_000) \ return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
or mat.nnz > 20_000_000 or mat.nnz > 20_000_000
def filter_reject_large(mats): def filter_reject_large(mats):
@@ -35,10 +36,13 @@ def filter_reject_large(mats):
def filter_reject_small(mats): def filter_reject_small(mats):
return [mat for mat in mats if not mat_is_small(mat)] return [mat for mat in mats if not mat_is_small(mat)]
def filter_keep_square(mats):
return [mat for mat in mats if mat.rows == mat.cols]
## all real-valued matrices ## all real-valued matrices
REAL_MATS = Dataset( REAL_MATS = Dataset(
name = "reals", name = "reals",
mats = filter_reject_integer(ssgetpy.search( mats = filter_keep_real(ssgetpy.search(
dtype='real', dtype='real',
limit=1_000_000 limit=1_000_000
)) ))
@@ -66,7 +70,7 @@ for kind in kinds:
) )
REGULAR_REAL_MATS = Dataset( REGULAR_REAL_MATS = Dataset(
name="regular_reals", name="regular_reals",
mats = filter_reject_integer(mats) mats = filter_keep_real(mats)
) )
## keep "small" matrices ## keep "small" matrices
@@ -79,6 +83,15 @@ REAL_SMALL_MATS = Dataset (
mats = filter_reject_large(REAL_MATS.mats) mats = filter_reject_large(REAL_MATS.mats)
) )
REGULAR_SQUARE_REAL_SMALL_MATS = Dataset (
name = "regular_square_reals_small",
mats = filter_keep_square(REGULAR_REAL_SMALL_MATS.mats)
)
SQUARE_REAL_SMALL_MATS = Dataset (
name = "square_reals_small",
mats = filter_keep_square(REAL_SMALL_MATS.mats)
)
## keep "medium" matrices ## keep "medium" matrices
REGULAR_REAL_MED_MATS = Dataset ( REGULAR_REAL_MED_MATS = Dataset (
name = "regular_reals_med", name = "regular_reals_med",
@@ -91,12 +104,14 @@ REAL_MED_MATS = Dataset (
## export all datasets ## export all datasets
DATASETS = [ DATASETS = [
REAL_MATS, # REAL_MATS,
REAL_SMALL_MATS, REAL_SMALL_MATS,
REAL_MED_MATS, REAL_MED_MATS,
REGULAR_REAL_MATS, # REGULAR_REAL_MATS,
REGULAR_REAL_SMALL_MATS, REGULAR_REAL_SMALL_MATS,
REGULAR_REAL_MED_MATS REGULAR_REAL_MED_MATS,
REGULAR_SQUARE_REAL_SMALL_MATS,
SQUARE_REAL_SMALL_MATS,
] ]
def get_kinds(): def get_kinds():
@@ -114,7 +129,7 @@ for kind in get_kinds():
name = "kind_"+safe_dir_name(kind), name = "kind_"+safe_dir_name(kind),
mats = filter_reject_large( \ mats = filter_reject_large( \
filter_reject_small( \ filter_reject_small( \
filter_reject_integer(ssgetpy.search( filter_keep_real(ssgetpy.search(
kind=kind, kind=kind,
dtype='real', dtype='real',
limit=1_000_000 limit=1_000_000

58
lib/dtypes.py Normal file
View File

@@ -0,0 +1,58 @@
"""export a map that is (group, name) -> dtype for all mats"""
import requests
import datetime
import os
import scipy.io
from lib import config
def download_ss_index(path):
    """Download the SuiteSparse metadata index (ss_index.mat) to *path*.

    Creates parent directories as needed. Raises requests.HTTPError on a
    non-2xx response instead of writing the error body to disk.
    """
    path_dir = path.parent
    path_dir.mkdir(parents=True, exist_ok=True)
    # Fetch BEFORE opening the file: previously the file was opened (and
    # truncated) first, so a failed request left an empty ss_index.mat
    # behind, which ensure_ss_index() would then mistake for a valid copy.
    req = requests.get(config.SS_ROOT_URL + "/files/ss_index.mat")
    req.raise_for_status()  # surface HTTP errors (404/500/...) explicitly
    with open(path, "wb") as f:
        f.write(req.content)
def ensure_ss_index(path):
    """Ensure a reasonably fresh ss_index.mat exists at *path*.

    Downloads the file if it is missing, and re-downloads it when the
    local copy is older than 90 days.
    """
    if not os.path.exists(path):
        download_ss_index(path)
    # Bug fix: stat the file we were asked about, not the hard-coded
    # config.DIR / ".ss_index.mat". The two coincide for the one current
    # caller, but the parameter was silently ignored before.
    mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(path))
    if datetime.datetime.utcnow() - mtime > datetime.timedelta(days=90):
        download_ss_index(path)
# download metadata file if missing (module import triggers a network
# fetch the first time, or after the cached copy goes stale)
local = config.DIR / ".ss_index.mat"
ensure_ss_index(local)
# load metadata and convert to a database
mat = scipy.io.loadmat(config.DIR / ".ss_index.mat", squeeze_me=True)
s = mat["ss_index"].item()
# for i,x in enumerate(s):
#     print(i, x)
# NOTE(review): field positions below (1=group names, 2=matrix names,
# 19=RB type codes) are assumed from the ss_index.mat layout — confirm
# against the SuiteSparse index format if it ever changes.
groups = s[1]
names = s[2]
# 3 letters, first letter:
# r=real, p=binary, c=complex, i=integer
rbtype = s[19]
def dtype_from_rbtype(rbtype):
    """Translate a Rutherford-Boeing type code into a dtype name.

    *rbtype* is a 3-letter code whose first letter encodes the datatype:
    r=real, p=binary (pattern), c=complex, i=integer.

    Raises LookupError with a descriptive message for empty or
    unrecognized codes. (The original raised a bare LookupError, and an
    IndexError for the empty string; IndexError is a LookupError
    subclass, so callers catching LookupError are unaffected.)
    """
    mapping = {"r": "real", "p": "binary", "c": "complex", "i": "integer"}
    if not rbtype or rbtype[0] not in mapping:
        raise LookupError(f"unrecognized RB type code: {rbtype!r}")
    return mapping[rbtype[0]]
# Build the (group, name) -> dtype lookup table this module exports.
DTYPES = {}
for idx, mat_name in enumerate(names):
    DTYPES[(groups[idx], mat_name)] = dtype_from_rbtype(rbtype[idx])

17
lib/matrix.py Normal file
View File

@@ -0,0 +1,17 @@
class Matrix:
    """Lightweight record describing one SuiteSparse matrix."""

    def __init__(self, group, name, dtype, nrows, ncols, nnz):
        self.group = group
        self.name = name
        self.dtype = dtype
        # Dimensions and nnz are coerced to int (callers may pass strings).
        self.nrows = int(nrows)
        self.ncols = int(ncols)
        self.nnz = int(nnz)

    def to_tuple(self):
        """Return the record as a plain (group, name, dtype, nrows, ncols, nnz) tuple."""
        fields = (self.group, self.name, self.dtype,
                  self.nrows, self.ncols, self.nnz)
        return fields

    def __repr__(self):
        return repr(self.to_tuple())

    def url(self):
        """Return the MatrixMarket download URL for this matrix."""
        return f"https://sparse.tamu.edu/MM/{self.group}/{self.name}.tar.gz"

View File

@@ -1,4 +1,4 @@
import datasets from lib import datasets
for ds in datasets.DATASETS: for ds in datasets.DATASETS:
print(f"{ds.name}: {len(ds.mats)} matrices") print(f"{ds.name}: {len(ds.mats)} matrices")

View File

@@ -2,6 +2,8 @@
host=`hostname` host=`hostname`
if [[ "$NERSC_HOST" == cori ]]; then if [[ "$NERSC_HOST" == cori ]]; then
echo \$NERSC_HOST matched cori echo \$NERSC_HOST matched cori
@@ -13,9 +15,20 @@ if [[ "$NERSC_HOST" == cori ]]; then
elif [[ "$NERSC_HOST" == perlmutter ]]; then elif [[ "$NERSC_HOST" == perlmutter ]]; then
echo \$NERSC_HOST matched perlmutter echo \$NERSC_HOST matched perlmutter
module load cray-python/3.9.4.1 echo module load cray-python/3.9.13.1
which python module load cray-python/3.9.13.1
export SS_DIR="$CFS"/m3918/pearson export SS_DIR="$CFS"/m3918/pearson
echo "\$SS_DIR = $SS_DIR" echo "\$SS_DIR = $SS_DIR"
elif [[ `hostname` =~ ascicgpu030 ]]; then
echo hostname matched ascicgpu030
export SS_DIR="$HOME/suitesparse"
echo "\$SS_DIR = $SS_DIR"
elif [[ `hostname` =~ rzvernal ]]; then
echo hostname matched rzvernal
export SS_DIR="/usr/workspace/cwpears/suitesparse"
echo "\$SS_DIR = $SS_DIR"
fi fi

136
poetry.lock generated
View File

@@ -1,136 +0,0 @@
[[package]]
name = "certifi"
version = "2021.10.8"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "charset-normalizer"
version = "2.0.7"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = false
python-versions = ">=3.5.0"
[package.extras]
unicode_backport = ["unicodedata2"]
[[package]]
name = "colorama"
version = "0.4.4"
description = "Cross-platform colored terminal text."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "idna"
version = "3.3"
description = "Internationalized Domain Names in Applications (IDNA)"
category = "main"
optional = false
python-versions = ">=3.5"
[[package]]
name = "requests"
version = "2.26.0"
description = "Python HTTP for Humans."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
[package.dependencies]
certifi = ">=2017.4.17"
charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""}
idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""}
urllib3 = ">=1.21.1,<1.27"
[package.extras]
socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"]
[[package]]
name = "ssgetpy"
version = "1.0-pre2"
description = ""
category = "main"
optional = false
python-versions = ">3.5.2"
develop = false
[package.dependencies]
requests = ">=2.22"
tqdm = ">=4.41"
[package.source]
type = "git"
url = "https://github.com/cwpearson/ssgetpy.git"
reference = "be00d2a"
resolved_reference = "be00d2ad64c55d32291ba820f3d040524b1c5b0e"
[[package]]
name = "tqdm"
version = "4.62.3"
description = "Fast, Extensible Progress Meter"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
dev = ["py-make (>=0.1.0)", "twine", "wheel"]
notebook = ["ipywidgets (>=6)"]
telegram = ["requests"]
[[package]]
name = "urllib3"
version = "1.26.7"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
[package.extras]
brotli = ["brotlipy (>=0.6.0)"]
secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[metadata]
lock-version = "1.1"
python-versions = "^3.7"
content-hash = "4a624c76d5d28333a13081a3fe5fba3eadcdfc09ac0963d1f1ecd89eb03451aa"
[metadata.files]
certifi = [
{file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"},
{file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"},
]
charset-normalizer = [
{file = "charset-normalizer-2.0.7.tar.gz", hash = "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0"},
{file = "charset_normalizer-2.0.7-py3-none-any.whl", hash = "sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"},
]
colorama = [
{file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
{file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"},
]
idna = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
{file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
]
requests = [
{file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"},
{file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"},
]
ssgetpy = []
tqdm = [
{file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"},
{file = "tqdm-4.62.3.tar.gz", hash = "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"},
]
urllib3 = [
{file = "urllib3-1.26.7-py2.py3-none-any.whl", hash = "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"},
{file = "urllib3-1.26.7.tar.gz", hash = "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece"},
]

View File

@@ -1,15 +0,0 @@
[tool.poetry]
name = "ss-downloader"
version = "0.1.0"
description = ""
authors = ["Carl Pearson <cwpears@sandia.gov>"]
[tool.poetry.dependencies]
python = "^3.7"
ssgetpy = {git = "https://github.com/cwpearson/ssgetpy.git", rev = "be00d2a"}
[tool.poetry.dev-dependencies]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

2
requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
ssgetpy
scipy