Compare commits
10 Commits
4a09bc2d33
...
80d18fca11
Author | SHA1 | Date | |
---|---|---|---|
![]() |
80d18fca11 | ||
![]() |
07035904ee | ||
![]() |
a57793a6f4 | ||
![]() |
f65421a4cf | ||
![]() |
34bc594b04 | ||
![]() |
4584ce14ee | ||
![]() |
29f5289068 | ||
![]() |
800485d984 | ||
![]() |
ecd6cebf05 | ||
![]() |
9800c3b5f9 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1 +1,2 @@
|
||||
__pycache__
|
||||
.python-version
|
11
Pipfile
Normal file
11
Pipfile
Normal file
@@ -0,0 +1,11 @@
|
||||
[[source]]
|
||||
url = "https://pypi.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
|
||||
[dev-packages]
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
21
README.md
21
README.md
@@ -1,22 +1,21 @@
|
||||
# ss-downloader
|
||||
|
||||
Install poetry & Python 3.8+
|
||||
|
||||
```
|
||||
curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python -
|
||||
pipenv shell
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## how to use
|
||||
|
||||
```
|
||||
source load-env.sh
|
||||
poetry run python list.py
|
||||
poetry run python download.py <dataset name>
|
||||
python list.py
|
||||
python download.py <dataset name>
|
||||
```
|
||||
|
||||
To download all datasets
|
||||
```
|
||||
poetry run python download.py all
|
||||
python download.py all
|
||||
```
|
||||
|
||||
Because the symlinks are relative, you can move the datasets. For example:
|
||||
@@ -37,7 +36,13 @@ Then, a relative symlink is created from the `$SCRATCH/<subset>/<matrix>.mtx` fi
|
||||
|
||||
This makes use of a [fork of the `ssgetpy` package](https://github.com/cwpearson/ssgetpy) with a faster download limit.
|
||||
ssgetpy does not distinguish the "real" datatype from the "integer" datatype shown on the SuiteSparse collection website.
|
||||
Therefore, `lists.py` maintains a manually-curated list of `integer` datatype matrices to facilitate discrimination.
|
||||
Therefore, we access https://sparse.tamu.edu/files/ss_index.mat to determine the datatype of each matrix.
|
||||
|
||||
## Transfer data to a different filesystem
|
||||
|
||||
```
|
||||
rsync -rzvh --links --info=progress2 pearson@cori.nersc.gov:$SS_DIR/ .
|
||||
```
|
||||
|
||||
## how this was done
|
||||
|
||||
@@ -48,4 +53,4 @@ poetry add ssgetpy
|
||||
|
||||
```
|
||||
poetry install
|
||||
```
|
||||
```
|
||||
|
@@ -2,8 +2,15 @@ import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from lib import matrix
|
||||
|
||||
# Root directory for downloaded data, taken from the SS_DIR environment
# variable. The program cannot proceed without it, so exit with status 1.
try:
    DIR = Path(os.environ["SS_DIR"])
except KeyError:
    # Diagnostics belong on stderr so they never mix with piped stdout output.
    print("ERROR: $SS_DIR not set", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
SS_ROOT_URL = "https://sparse.tamu.edu"
|
||||
|
||||
|
||||
|
||||
|
@@ -3,7 +3,7 @@ import sys
|
||||
|
||||
import ssgetpy
|
||||
|
||||
from lib import lists
|
||||
from lib import dtypes
|
||||
|
||||
Dataset = collections.namedtuple("Dataset", ["name", "mats"])
|
||||
|
||||
@@ -15,18 +15,19 @@ def safe_dir_name(s):
|
||||
t = t.lower()
|
||||
return t
|
||||
|
||||
def mat_is_integer(mat):
|
||||
return mat.name in lists.INTEGER_MATS
|
||||
def mat_is_real(mat):
    """Return True when the dtype recorded for (mat.group, mat.name) is "real".

    The dtype is looked up in `dtypes.DTYPES`; a matrix missing from that
    table raises KeyError.
    """
    recorded = dtypes.DTYPES[(mat.group, mat.name)]
    return recorded == "real"
|
||||
|
||||
def filter_reject_integer(mats):
|
||||
return [mat for mat in mats if not mat_is_integer(mat)]
|
||||
def filter_keep_real(mats):
    """Return a list of the entries of `mats` for which `mat_is_real` holds."""
    return list(filter(mat_is_real, mats))
|
||||
|
||||
def mat_is_small(mat):
    """Return True when `mat` counts as small.

    A matrix is small if it has fewer than 20,000 nonzeros, or if both its
    row count and its column count are below 1,000.
    """
    if mat.nnz < 20_000:
        return True
    return mat.rows < 1_000 and mat.cols < 1_000
|
||||
|
||||
def mat_is_large(mat):
    """Return True when `mat` counts as large.

    A matrix is large if both its row count and its column count exceed
    1,000,000, or if it has more than 20,000,000 nonzeros.
    """
    # Both dimensions must exceed the limit; the earlier `cols < 1_000_000`
    # comparison classified tall-and-narrow matrices as large.
    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
        or mat.nnz > 20_000_000
|
||||
|
||||
def filter_reject_large(mats):
|
||||
@@ -35,10 +36,13 @@ def filter_reject_large(mats):
|
||||
def filter_reject_small(mats):
    """Return a list of the entries of `mats` that `mat_is_small` does not flag."""
    kept = []
    for candidate in mats:
        if mat_is_small(candidate):
            continue
        kept.append(candidate)
    return kept
|
||||
|
||||
def filter_keep_square(mats):
    """Return a list of the entries of `mats` whose row and column counts match."""
    return list(filter(lambda entry: entry.rows == entry.cols, mats))
|
||||
|
||||
## all real-valued matrices
|
||||
REAL_MATS = Dataset(
|
||||
name = "reals",
|
||||
mats = filter_reject_integer(ssgetpy.search(
|
||||
mats = filter_keep_real(ssgetpy.search(
|
||||
dtype='real',
|
||||
limit=1_000_000
|
||||
))
|
||||
@@ -66,7 +70,7 @@ for kind in kinds:
|
||||
)
|
||||
REGULAR_REAL_MATS = Dataset(
|
||||
name="regular_reals",
|
||||
mats = filter_reject_integer(mats)
|
||||
mats = filter_keep_real(mats)
|
||||
)
|
||||
|
||||
## keep "small" matrices
|
||||
@@ -79,6 +83,15 @@ REAL_SMALL_MATS = Dataset (
|
||||
mats = filter_reject_large(REAL_MATS.mats)
|
||||
)
|
||||
|
||||
REGULAR_SQUARE_REAL_SMALL_MATS = Dataset (
|
||||
name = "regular_square_reals_small",
|
||||
mats = filter_keep_square(REGULAR_REAL_SMALL_MATS.mats)
|
||||
)
|
||||
SQUARE_REAL_SMALL_MATS = Dataset (
|
||||
name = "square_reals_small",
|
||||
mats = filter_keep_square(REAL_SMALL_MATS.mats)
|
||||
)
|
||||
|
||||
## keep "medium" matrices
|
||||
REGULAR_REAL_MED_MATS = Dataset (
|
||||
name = "regular_reals_med",
|
||||
@@ -91,12 +104,14 @@ REAL_MED_MATS = Dataset (
|
||||
|
||||
## export all datasets
|
||||
DATASETS = [
|
||||
REAL_MATS,
|
||||
# REAL_MATS,
|
||||
REAL_SMALL_MATS,
|
||||
REAL_MED_MATS,
|
||||
REGULAR_REAL_MATS,
|
||||
# REGULAR_REAL_MATS,
|
||||
REGULAR_REAL_SMALL_MATS,
|
||||
REGULAR_REAL_MED_MATS
|
||||
REGULAR_REAL_MED_MATS,
|
||||
REGULAR_SQUARE_REAL_SMALL_MATS,
|
||||
SQUARE_REAL_SMALL_MATS,
|
||||
]
|
||||
|
||||
def get_kinds():
|
||||
@@ -114,7 +129,7 @@ for kind in get_kinds():
|
||||
name = "kind_"+safe_dir_name(kind),
|
||||
mats = filter_reject_large( \
|
||||
filter_reject_small( \
|
||||
filter_reject_integer(ssgetpy.search(
|
||||
filter_keep_real(ssgetpy.search(
|
||||
kind=kind,
|
||||
dtype='real',
|
||||
limit=1_000_000
|
||||
|
58
lib/dtypes.py
Normal file
58
lib/dtypes.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""export a map that is (group, name) -> dtype for all mats"""
|
||||
|
||||
import requests
|
||||
import datetime
|
||||
import os
|
||||
|
||||
import scipy.io
|
||||
|
||||
from lib import config
|
||||
|
||||
def download_ss_index(path):
    """Download the collection index (`/files/ss_index.mat`) and store it at `path`.

    Args:
        path: destination as a `pathlib.Path`; missing parent directories
            are created.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    path_dir = path.parent
    path_dir.mkdir(parents=True, exist_ok=True)

    # Fetch and validate before opening the destination, so a failed request
    # never truncates an existing index or stores an error page as a .mat file.
    req = requests.get(config.SS_ROOT_URL + "/files/ss_index.mat", timeout=60)
    req.raise_for_status()

    with open(path, "wb") as f:
        f.write(req.content)
|
||||
|
||||
def ensure_ss_index(path):
    """Make sure an index file no older than 90 days exists at `path`.

    The index is downloaded when the file is missing, and downloaded again
    when its modification time is more than 90 days in the past.

    Args:
        path: location of the cached index file.
    """
    if not os.path.exists(path):
        download_ss_index(path)
        return  # just written, so it cannot be stale

    # Check the age of the file we were asked about (not a hard-coded
    # location), using timezone-aware timestamps.
    utc = datetime.timezone.utc
    mtime = datetime.datetime.fromtimestamp(os.path.getmtime(path), tz=utc)
    if datetime.datetime.now(tz=utc) - mtime > datetime.timedelta(days=90):
        download_ss_index(path)
|
||||
|
||||
# download metadata file if missing (or stale)
local = config.DIR / ".ss_index.mat"
ensure_ss_index(local)


# load metadata and convert to a database
# Load the same path that was just ensured, rather than rebuilding it.
mat = scipy.io.loadmat(local, squeeze_me=True)

# The "ss_index" entry is unpacked into a sequence of per-field arrays.
# To inspect the field positions:
#   for i, x in enumerate(s):
#       print(i, x)
s = mat["ss_index"].item()
groups = s[1]
names = s[2]
# 3 letters, first letter:
# r=real, p=binary, c=complex, i=integer
rbtype = s[19]
|
||||
|
||||
def dtype_from_rbtype(rbtype):
    """Translate an rbtype code into a dtype name.

    Only the first character of the code is examined:
    r=real, p=binary, c=complex, i=integer.

    Args:
        rbtype: the type code string (e.g. three letters such as "rsa").

    Returns:
        One of "real", "binary", "complex" or "integer".

    Raises:
        LookupError: if the code is empty or its first letter is not one of
            the four known letters.
    """
    names_by_letter = {
        "r": "real",
        "p": "binary",
        "c": "complex",
        "i": "integer",
    }
    try:
        return names_by_letter[rbtype[0]]
    except (KeyError, IndexError):
        # KeyError and IndexError are both LookupError subclasses; re-raise as
        # a plain LookupError that names the offending code.
        raise LookupError(f"unknown rbtype {rbtype!r}") from None
|
||||
|
||||
# (group, name) -> dtype name for every entry of the index, built by walking
# the three parallel arrays in lockstep.
DTYPES = {
    (group, name): dtype_from_rbtype(code)
    for group, name, code in zip(groups, names, rbtype)
}
|
17
lib/matrix.py
Normal file
17
lib/matrix.py
Normal file
@@ -0,0 +1,17 @@
|
||||
class Matrix:
    """Record describing one matrix: group, name, dtype and dimensions."""

    def __init__(self, group, name, dtype, nrows, ncols, nnz):
        """Store the fields, coercing the three counts to `int`.

        Args:
            group: group the matrix belongs to.
            name: matrix name within the group.
            dtype: dtype label for the matrix.
            nrows: number of rows (anything `int()` accepts).
            ncols: number of columns (anything `int()` accepts).
            nnz: number of nonzeros (anything `int()` accepts).
        """
        self.group = group
        self.name = name
        self.dtype = dtype
        self.nrows = int(nrows)
        self.ncols = int(ncols)
        self.nnz = int(nnz)

    def to_tuple(self):
        """Return (group, name, dtype, nrows, ncols, nnz) as a tuple."""
        return (self.group, self.name, self.dtype, self.nrows, self.ncols, self.nnz)

    def __repr__(self):
        """Return the repr of `to_tuple()`."""
        return repr(self.to_tuple())

    def url(self, root_url="https://sparse.tamu.edu"):
        """Return the download URL: <root>/MM/<group>/<name>.tar.gz.

        Args:
            root_url: site root to build the URL from; defaults to the
                public collection site. A trailing slash is ignored.
        """
        root = root_url.rstrip("/")
        return "/".join((root, "MM", self.group, self.name + ".tar.gz"))
|
4
list.py
4
list.py
@@ -1,4 +1,4 @@
|
||||
import datasets
|
||||
from lib import datasets
|
||||
|
||||
for ds in datasets.DATASETS:
|
||||
print(f"{ds.name}: {len(ds.mats)} matrices")
|
||||
print(f"{ds.name}: {len(ds.mats)} matrices")
|
||||
|
17
load-env.sh
17
load-env.sh
@@ -2,6 +2,8 @@
|
||||
|
||||
host=`hostname`
|
||||
|
||||
|
||||
|
||||
if [[ "$NERSC_HOST" == cori ]]; then
|
||||
echo \$NERSC_HOST matched cori
|
||||
|
||||
@@ -13,9 +15,20 @@ if [[ "$NERSC_HOST" == cori ]]; then
|
||||
elif [[ "$NERSC_HOST" == perlmutter ]]; then
|
||||
echo \$NERSC_HOST matched perlmutter
|
||||
|
||||
module load cray-python/3.9.4.1
|
||||
which python
|
||||
echo module load cray-python/3.9.13.1
|
||||
module load cray-python/3.9.13.1
|
||||
|
||||
export SS_DIR="$CFS"/m3918/pearson
|
||||
echo "\$SS_DIR = $SS_DIR"
|
||||
elif [[ `hostname` =~ ascicgpu030 ]]; then
|
||||
echo hostname matched ascicgpu030
|
||||
|
||||
export SS_DIR="$HOME/suitesparse"
|
||||
echo "\$SS_DIR = $SS_DIR"
|
||||
elif [[ `hostname` =~ rzvernal ]]; then
|
||||
echo hostname matched rzvernal
|
||||
|
||||
export SS_DIR="/usr/workspace/cwpears/suitesparse"
|
||||
echo "\$SS_DIR = $SS_DIR"
|
||||
fi
|
||||
|
||||
|
136
poetry.lock
generated
136
poetry.lock
generated
@@ -1,136 +0,0 @@
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2021.10.8"
|
||||
description = "Python package for providing Mozilla's CA Bundle."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "2.0.7"
|
||||
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.5.0"
|
||||
|
||||
[package.extras]
|
||||
unicode_backport = ["unicodedata2"]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.4"
|
||||
description = "Cross-platform colored terminal text."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.3"
|
||||
description = "Internationalized Domain Names in Applications (IDNA)"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.26.0"
|
||||
description = "Python HTTP for Humans."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
|
||||
|
||||
[package.dependencies]
|
||||
certifi = ">=2017.4.17"
|
||||
charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""}
|
||||
idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""}
|
||||
urllib3 = ">=1.21.1,<1.27"
|
||||
|
||||
[package.extras]
|
||||
socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
|
||||
use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"]
|
||||
|
||||
[[package]]
|
||||
name = "ssgetpy"
|
||||
version = "1.0-pre2"
|
||||
description = ""
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">3.5.2"
|
||||
develop = false
|
||||
|
||||
[package.dependencies]
|
||||
requests = ">=2.22"
|
||||
tqdm = ">=4.41"
|
||||
|
||||
[package.source]
|
||||
type = "git"
|
||||
url = "https://github.com/cwpearson/ssgetpy.git"
|
||||
reference = "be00d2a"
|
||||
resolved_reference = "be00d2ad64c55d32291ba820f3d040524b1c5b0e"
|
||||
|
||||
[[package]]
|
||||
name = "tqdm"
|
||||
version = "4.62.3"
|
||||
description = "Fast, Extensible Progress Meter"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[package.extras]
|
||||
dev = ["py-make (>=0.1.0)", "twine", "wheel"]
|
||||
notebook = ["ipywidgets (>=6)"]
|
||||
telegram = ["requests"]
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "1.26.7"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
|
||||
|
||||
[package.extras]
|
||||
brotli = ["brotlipy (>=0.6.0)"]
|
||||
secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
|
||||
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = "^3.7"
|
||||
content-hash = "4a624c76d5d28333a13081a3fe5fba3eadcdfc09ac0963d1f1ecd89eb03451aa"
|
||||
|
||||
[metadata.files]
|
||||
certifi = [
|
||||
{file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"},
|
||||
{file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"},
|
||||
]
|
||||
charset-normalizer = [
|
||||
{file = "charset-normalizer-2.0.7.tar.gz", hash = "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0"},
|
||||
{file = "charset_normalizer-2.0.7-py3-none-any.whl", hash = "sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"},
|
||||
]
|
||||
colorama = [
|
||||
{file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
|
||||
{file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"},
|
||||
]
|
||||
idna = [
|
||||
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
|
||||
{file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
|
||||
]
|
||||
requests = [
|
||||
{file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"},
|
||||
{file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"},
|
||||
]
|
||||
ssgetpy = []
|
||||
tqdm = [
|
||||
{file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"},
|
||||
{file = "tqdm-4.62.3.tar.gz", hash = "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"},
|
||||
]
|
||||
urllib3 = [
|
||||
{file = "urllib3-1.26.7-py2.py3-none-any.whl", hash = "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"},
|
||||
{file = "urllib3-1.26.7.tar.gz", hash = "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece"},
|
||||
]
|
@@ -1,15 +0,0 @@
|
||||
[tool.poetry]
|
||||
name = "ss-downloader"
|
||||
version = "0.1.0"
|
||||
description = ""
|
||||
authors = ["Carl Pearson <cwpears@sandia.gov>"]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.7"
|
||||
ssgetpy = {git = "https://github.com/cwpearson/ssgetpy.git", rev = "be00d2a"}
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
ssgetpy
|
||||
scipy
|
Reference in New Issue
Block a user