automatically download nonzero datatype metadata

This commit is contained in:
Carl Pearson
2021-12-01 14:27:44 -08:00
parent 4a09bc2d33
commit 9800c3b5f9
7 changed files with 182 additions and 14 deletions

View File

@@ -39,6 +39,12 @@ This makes use of a [fork of the `ssgetpy`](github.com/cwpearson/ssgetpy) packag
ssgetpy does not discriminate "real" datatype from "integer" datatype, as shown on the suitesparse collection website. ssgetpy does not discriminate "real" datatype from "integer" datatype, as shown on the suitesparse collection website.
Therefore, `lists.py` maintains a manually-curated list of `integer` datatype matrices to facilitate discrimination. Therefore, `lists.py` maintains a manually-curated list of `integer` datatype matrices to facilitate discrimination.
## Transfer data to a different filesystem
```
rsync -rzvh --links pearson@cori.nersc.gov:$SS_DIR/ .
```
## how this was done ## how this was done
``` ```
@@ -48,4 +54,4 @@ poetry add ssgetpy
``` ```
poetry install poetry install
``` ```

View File

@@ -2,8 +2,15 @@ import os
import sys import sys
from pathlib import Path from pathlib import Path
from lib import matrix
try: try:
DIR = Path(os.environ["SS_DIR"]) DIR = Path(os.environ["SS_DIR"])
except KeyError as e: except KeyError as e:
print("ERROR: $SS_DIR not set") print("ERROR: $SS_DIR not set")
sys.exit(1) sys.exit(1)
SS_ROOT_URL = "https://sparse.tamu.edu"

View File

@@ -3,7 +3,7 @@ import sys
import ssgetpy import ssgetpy
from lib import lists from lib import dtypes
Dataset = collections.namedtuple("Dataset", ["name", "mats"]) Dataset = collections.namedtuple("Dataset", ["name", "mats"])
@@ -15,18 +15,19 @@ def safe_dir_name(s):
t = t.lower() t = t.lower()
return t return t
def mat_is_integer(mat): def mat_is_real(mat):
return mat.name in lists.INTEGER_MATS val = dtypes.DTYPES[(mat.group, mat.name)] == "real"
return val
def filter_reject_integer(mats): def filter_keep_real(mats):
return [mat for mat in mats if not mat_is_integer(mat)] return [mat for mat in mats if mat_is_real(mat)]
def mat_is_small(mat): def mat_is_small(mat):
return (mat.rows < 1_000 and mat.cols < 1_000) \ return (mat.rows < 1_000 and mat.cols < 1_000) \
or mat.nnz < 20_000 or mat.nnz < 20_000
def mat_is_large(mat): def mat_is_large(mat):
return (mat.rows > 1_000_000 and mat.cols < 1_000_000) \ return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
or mat.nnz > 20_000_000 or mat.nnz > 20_000_000
def filter_reject_large(mats): def filter_reject_large(mats):
@@ -38,7 +39,7 @@ def filter_reject_small(mats):
## all real-valued matrices ## all real-valued matrices
REAL_MATS = Dataset( REAL_MATS = Dataset(
name = "reals", name = "reals",
mats = filter_reject_integer(ssgetpy.search( mats = filter_keep_real(ssgetpy.search(
dtype='real', dtype='real',
limit=1_000_000 limit=1_000_000
)) ))
@@ -66,7 +67,7 @@ for kind in kinds:
) )
REGULAR_REAL_MATS = Dataset( REGULAR_REAL_MATS = Dataset(
name="regular_reals", name="regular_reals",
mats = filter_reject_integer(mats) mats = filter_keep_real(mats)
) )
## keep "small" matrices ## keep "small" matrices
@@ -91,7 +92,7 @@ REAL_MED_MATS = Dataset (
## export all datasets ## export all datasets
DATASETS = [ DATASETS = [
REAL_MATS, # REAL_MATS,
REAL_SMALL_MATS, REAL_SMALL_MATS,
REAL_MED_MATS, REAL_MED_MATS,
REGULAR_REAL_MATS, REGULAR_REAL_MATS,
@@ -114,7 +115,7 @@ for kind in get_kinds():
name = "kind_"+safe_dir_name(kind), name = "kind_"+safe_dir_name(kind),
mats = filter_reject_large( \ mats = filter_reject_large( \
filter_reject_small( \ filter_reject_small( \
filter_reject_integer(ssgetpy.search( filter_keep_real(ssgetpy.search(
kind=kind, kind=kind,
dtype='real', dtype='real',
limit=1_000_000 limit=1_000_000

54
lib/dtypes.py Normal file
View File

@@ -0,0 +1,54 @@
"""export a map that is (group, name) -> dtype for all mats"""
import requests
import datetime
import os
import scipy.io
from lib import config
def download_ss_index(path):
    """Download the SuiteSparse index file and store it at `path`.

    The file is fetched from ``config.SS_ROOT_URL + "/files/ss_index.mat"``.

    Args:
        path: destination file; it is overwritten if it already exists.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    req = requests.get(config.SS_ROOT_URL + "/files/ss_index.mat")
    # Fail loudly rather than saving an HTTP error page as the index.
    req.raise_for_status()
    # Open the destination only once the payload is in hand, so a failed
    # request cannot leave behind an empty file or truncate a good copy.
    with open(path, "wb") as f:
        f.write(req.content)
def ensure_ss_index(path):
    """Make sure an index file no older than 90 days exists at `path`.

    The file is downloaded when it is missing, and downloaded again when its
    modification time is more than 90 days in the past.

    Args:
        path: location of the locally cached index file.
    """
    if not os.path.exists(path):
        download_ss_index(path)
        return  # a file written just now cannot be stale

    # Check the age of the file we were asked about, not a hard-coded
    # location, so the function works for any `path`.
    utc = datetime.timezone.utc
    mtime = datetime.datetime.fromtimestamp(os.path.getmtime(path), tz=utc)
    if datetime.datetime.now(tz=utc) - mtime > datetime.timedelta(days=90):
        download_ss_index(path)
# Make sure a reasonably fresh copy of the index metadata is on disk.
local = config.DIR / ".ss_index.mat"
ensure_ss_index(local)

# Read the metadata; the "ss_index" entry unpacks to a sequence of fields.
mat = scipy.io.loadmat(local, squeeze_me=True)
s = mat["ss_index"].item()
for i, x in enumerate(s):
    print(i, x)

# Fields used to build the database: per-matrix group and name ...
groups, names = s[1], s[2]
# ... and a 3-letter type code whose first letter gives the value type:
# r=real, p=binary, c=complex, i=integer
rbtype = s[19]
def dtype_from_rbtype(rbtype):
    """Translate a type code into a datatype name.

    Only the first character of `rbtype` is examined:
    ``r`` -> "real", ``p`` -> "binary", ``c`` -> "complex", ``i`` -> "integer".

    Args:
        rbtype: the type code, e.g. ``"rua"``.

    Returns:
        One of "real", "binary", "complex" or "integer".

    Raises:
        LookupError: if `rbtype` is empty or its first character is not one
            of the letters above.
    """
    by_first_letter = {
        "r": "real",
        "p": "binary",
        "c": "complex",
        "i": "integer",
    }
    try:
        return by_first_letter[rbtype[0]]
    except (KeyError, IndexError):
        # Report the offending code; a bare LookupError is hard to trace
        # back to a specific entry of the index.
        raise LookupError(f"unrecognized rbtype {rbtype!r}") from None
# (group, name) -> datatype name, one entry per matrix listed in the index.
DTYPES = {
    (group, name): dtype_from_rbtype(code)
    for group, name, code in zip(groups, names, rbtype)
}

17
lib/matrix.py Normal file
View File

@@ -0,0 +1,17 @@
class Matrix:
    """Record describing one matrix: its identity, datatype and sizes."""

    def __init__(self, group, name, dtype, nrows, ncols, nnz):
        """Store the descriptive fields; the three sizes are coerced to int."""
        self.group, self.name, self.dtype = group, name, dtype
        self.nrows, self.ncols, self.nnz = int(nrows), int(ncols), int(nnz)

    def to_tuple(self):
        """Return (group, name, dtype, nrows, ncols, nnz)."""
        identity = (self.group, self.name, self.dtype)
        sizes = (self.nrows, self.ncols, self.nnz)
        return identity + sizes

    def __repr__(self):
        """Use the repr of the tuple form."""
        as_tuple = self.to_tuple()
        return repr(as_tuple)

    def url(self):
        """Return the URL of this matrix's .tar.gz archive under MM/<group>/."""
        archive = self.name + ".tar.gz"
        return "/".join(["https://sparse.tamu.edu", "MM", self.group, archive])

86
poetry.lock generated
View File

@@ -33,6 +33,14 @@ category = "main"
optional = false optional = false
python-versions = ">=3.5" python-versions = ">=3.5"
[[package]]
name = "numpy"
version = "1.21.4"
description = "NumPy is the fundamental package for array computing with Python."
category = "main"
optional = false
python-versions = ">=3.7,<3.11"
[[package]] [[package]]
name = "requests" name = "requests"
version = "2.26.0" version = "2.26.0"
@@ -51,6 +59,17 @@ urllib3 = ">=1.21.1,<1.27"
socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"]
[[package]]
name = "scipy"
version = "1.7.3"
description = "SciPy: Scientific Library for Python"
category = "main"
optional = false
python-versions = ">=3.7,<3.11"
[package.dependencies]
numpy = ">=1.16.5,<1.23.0"
[[package]] [[package]]
name = "ssgetpy" name = "ssgetpy"
version = "1.0-pre2" version = "1.0-pre2"
@@ -101,8 +120,8 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[metadata] [metadata]
lock-version = "1.1" lock-version = "1.1"
python-versions = "^3.7" python-versions = ">=3.7,<3.11"
content-hash = "4a624c76d5d28333a13081a3fe5fba3eadcdfc09ac0963d1f1ecd89eb03451aa" content-hash = "5a1bf7fe65d1fe23f7c34d44076cc157e3343699790a742492686d6198fb88eb"
[metadata.files] [metadata.files]
certifi = [ certifi = [
@@ -121,10 +140,73 @@ idna = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
{file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
] ]
numpy = [
{file = "numpy-1.21.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8890b3360f345e8360133bc078d2dacc2843b6ee6059b568781b15b97acbe39f"},
{file = "numpy-1.21.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:69077388c5a4b997442b843dbdc3a85b420fb693ec8e33020bb24d647c164fa5"},
{file = "numpy-1.21.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e89717274b41ebd568cd7943fc9418eeb49b1785b66031bc8a7f6300463c5898"},
{file = "numpy-1.21.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b78ecfa070460104934e2caf51694ccd00f37d5e5dbe76f021b1b0b0d221823"},
{file = "numpy-1.21.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:615d4e328af7204c13ae3d4df7615a13ff60a49cb0d9106fde07f541207883ca"},
{file = "numpy-1.21.4-cp310-cp310-win_amd64.whl", hash = "sha256:1403b4e2181fc72664737d848b60e65150f272fe5a1c1cbc16145ed43884065a"},
{file = "numpy-1.21.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:74b85a17528ca60cf98381a5e779fc0264b4a88b46025e6bcbe9621f46bb3e63"},
{file = "numpy-1.21.4-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:92aafa03da8658609f59f18722b88f0a73a249101169e28415b4fa148caf7e41"},
{file = "numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5d95668e727c75b3f5088ec7700e260f90ec83f488e4c0aaccb941148b2cd377"},
{file = "numpy-1.21.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5162ec777ba7138906c9c274353ece5603646c6965570d82905546579573f73"},
{file = "numpy-1.21.4-cp37-cp37m-win32.whl", hash = "sha256:81225e58ef5fce7f1d80399575576fc5febec79a8a2742e8ef86d7b03beef49f"},
{file = "numpy-1.21.4-cp37-cp37m-win_amd64.whl", hash = "sha256:32fe5b12061f6446adcbb32cf4060a14741f9c21e15aaee59a207b6ce6423469"},
{file = "numpy-1.21.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c449eb870616a7b62e097982c622d2577b3dbc800aaf8689254ec6e0197cbf1e"},
{file = "numpy-1.21.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2e4ed57f45f0aa38beca2a03b6532e70e548faf2debbeb3291cfc9b315d9be8f"},
{file = "numpy-1.21.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1247ef28387b7bb7f21caf2dbe4767f4f4175df44d30604d42ad9bd701ebb31f"},
{file = "numpy-1.21.4-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:34f3456f530ae8b44231c63082c8899fe9c983fd9b108c997c4b1c8c2d435333"},
{file = "numpy-1.21.4-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4c9c23158b87ed0e70d9a50c67e5c0b3f75bcf2581a8e34668d4e9d7474d76c6"},
{file = "numpy-1.21.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4799be6a2d7d3c33699a6f77201836ac975b2e1b98c2a07f66a38f499cb50ce"},
{file = "numpy-1.21.4-cp38-cp38-win32.whl", hash = "sha256:bc988afcea53e6156546e5b2885b7efab089570783d9d82caf1cfd323b0bb3dd"},
{file = "numpy-1.21.4-cp38-cp38-win_amd64.whl", hash = "sha256:170b2a0805c6891ca78c1d96ee72e4c3ed1ae0a992c75444b6ab20ff038ba2cd"},
{file = "numpy-1.21.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fde96af889262e85aa033f8ee1d3241e32bf36228318a61f1ace579df4e8170d"},
{file = "numpy-1.21.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c885bfc07f77e8fee3dc879152ba993732601f1f11de248d4f357f0ffea6a6d4"},
{file = "numpy-1.21.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9e6f5f50d1eff2f2f752b3089a118aee1ea0da63d56c44f3865681009b0af162"},
{file = "numpy-1.21.4-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ad010846cdffe7ec27e3f933397f8a8d6c801a48634f419e3d075db27acf5880"},
{file = "numpy-1.21.4-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c74c699b122918a6c4611285cc2cad4a3aafdb135c22a16ec483340ef97d573c"},
{file = "numpy-1.21.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9864424631775b0c052f3bd98bc2712d131b3e2cd95d1c0c68b91709170890b0"},
{file = "numpy-1.21.4-cp39-cp39-win32.whl", hash = "sha256:b1e2312f5b8843a3e4e8224b2b48fe16119617b8fc0a54df8f50098721b5bed2"},
{file = "numpy-1.21.4-cp39-cp39-win_amd64.whl", hash = "sha256:e3c3e990274444031482a31280bf48674441e0a5b55ddb168f3a6db3e0c38ec8"},
{file = "numpy-1.21.4-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a3deb31bc84f2b42584b8c4001c85d1934dbfb4030827110bc36bfd11509b7bf"},
{file = "numpy-1.21.4.zip", hash = "sha256:e6c76a87633aa3fa16614b61ccedfae45b91df2767cf097aa9c933932a7ed1e0"},
]
requests = [ requests = [
{file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"}, {file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"},
{file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"}, {file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"},
] ]
scipy = [
{file = "scipy-1.7.3-1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:c9e04d7e9b03a8a6ac2045f7c5ef741be86727d8f49c45db45f244bdd2bcff17"},
{file = "scipy-1.7.3-1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b0e0aeb061a1d7dcd2ed59ea57ee56c9b23dd60100825f98238c06ee5cc4467e"},
{file = "scipy-1.7.3-1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:b78a35c5c74d336f42f44106174b9851c783184a85a3fe3e68857259b37b9ffb"},
{file = "scipy-1.7.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:173308efba2270dcd61cd45a30dfded6ec0085b4b6eb33b5eb11ab443005e088"},
{file = "scipy-1.7.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:21b66200cf44b1c3e86495e3a436fc7a26608f92b8d43d344457c54f1c024cbc"},
{file = "scipy-1.7.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceebc3c4f6a109777c0053dfa0282fddb8893eddfb0d598574acfb734a926168"},
{file = "scipy-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7eaea089345a35130bc9a39b89ec1ff69c208efa97b3f8b25ea5d4c41d88094"},
{file = "scipy-1.7.3-cp310-cp310-win_amd64.whl", hash = "sha256:304dfaa7146cffdb75fbf6bb7c190fd7688795389ad060b970269c8576d038e9"},
{file = "scipy-1.7.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:033ce76ed4e9f62923e1f8124f7e2b0800db533828c853b402c7eec6e9465d80"},
{file = "scipy-1.7.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4d242d13206ca4302d83d8a6388c9dfce49fc48fdd3c20efad89ba12f785bf9e"},
{file = "scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8499d9dd1459dc0d0fe68db0832c3d5fc1361ae8e13d05e6849b358dc3f2c279"},
{file = "scipy-1.7.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca36e7d9430f7481fc7d11e015ae16fbd5575615a8e9060538104778be84addf"},
{file = "scipy-1.7.3-cp37-cp37m-win32.whl", hash = "sha256:e2c036492e673aad1b7b0d0ccdc0cb30a968353d2c4bf92ac8e73509e1bf212c"},
{file = "scipy-1.7.3-cp37-cp37m-win_amd64.whl", hash = "sha256:866ada14a95b083dd727a845a764cf95dd13ba3dc69a16b99038001b05439709"},
{file = "scipy-1.7.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:65bd52bf55f9a1071398557394203d881384d27b9c2cad7df9a027170aeaef93"},
{file = "scipy-1.7.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:f99d206db1f1ae735a8192ab93bd6028f3a42f6fa08467d37a14eb96c9dd34a3"},
{file = "scipy-1.7.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5f2cfc359379c56b3a41b17ebd024109b2049f878badc1e454f31418c3a18436"},
{file = "scipy-1.7.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb7ae2c4dbdb3c9247e07acc532f91077ae6dbc40ad5bd5dca0bb5a176ee9bda"},
{file = "scipy-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c2d250074cfa76715d58830579c64dff7354484b284c2b8b87e5a38321672c"},
{file = "scipy-1.7.3-cp38-cp38-win32.whl", hash = "sha256:87069cf875f0262a6e3187ab0f419f5b4280d3dcf4811ef9613c605f6e4dca95"},
{file = "scipy-1.7.3-cp38-cp38-win_amd64.whl", hash = "sha256:7edd9a311299a61e9919ea4192dd477395b50c014cdc1a1ac572d7c27e2207fa"},
{file = "scipy-1.7.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eef93a446114ac0193a7b714ce67659db80caf940f3232bad63f4c7a81bc18df"},
{file = "scipy-1.7.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb326658f9b73c07081300daba90a8746543b5ea177184daed26528273157294"},
{file = "scipy-1.7.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:93378f3d14fff07572392ce6a6a2ceb3a1f237733bd6dcb9eb6a2b29b0d19085"},
{file = "scipy-1.7.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edad1cf5b2ce1912c4d8ddad20e11d333165552aba262c882e28c78bbc09dbf6"},
{file = "scipy-1.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d1cc2c19afe3b5a546ede7e6a44ce1ff52e443d12b231823268019f608b9b12"},
{file = "scipy-1.7.3-cp39-cp39-win32.whl", hash = "sha256:2c56b820d304dffcadbbb6cbfbc2e2c79ee46ea291db17e288e73cd3c64fefa9"},
{file = "scipy-1.7.3-cp39-cp39-win_amd64.whl", hash = "sha256:3f78181a153fa21c018d346f595edd648344751d7f03ab94b398be2ad083ed3e"},
{file = "scipy-1.7.3.tar.gz", hash = "sha256:ab5875facfdef77e0a47d5fd39ea178b58e60e454a4c85aa1e52fcb80db7babf"},
]
ssgetpy = [] ssgetpy = []
tqdm = [ tqdm = [
{file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"}, {file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"},

View File

@@ -5,8 +5,9 @@ description = ""
authors = ["Carl Pearson <cwpears@sandia.gov>"] authors = ["Carl Pearson <cwpears@sandia.gov>"]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.7" python = ">=3.7,<3.11"
ssgetpy = {git = "https://github.com/cwpearson/ssgetpy.git", rev = "be00d2a"} ssgetpy = {git = "https://github.com/cwpearson/ssgetpy.git", rev = "be00d2a"}
scipy = "^1.7.3"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]