From 9800c3b5f9d25eff1e552dd0fbec7f138ed5969f Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 1 Dec 2021 14:27:44 -0800 Subject: [PATCH] automatically download nonzero datatype metadata --- README.md | 8 ++++- lib/config.py | 7 ++++ lib/datasets.py | 21 ++++++------ lib/dtypes.py | 54 +++++++++++++++++++++++++++++++ lib/matrix.py | 17 ++++++++++ poetry.lock | 86 +++++++++++++++++++++++++++++++++++++++++++++++-- pyproject.toml | 3 +- 7 files changed, 182 insertions(+), 14 deletions(-) create mode 100644 lib/dtypes.py create mode 100644 lib/matrix.py diff --git a/README.md b/README.md index be8c93d..8e392a4 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,12 @@ This makes use of a [fork of the `ssgetpy`](github.com/cwpearson/ssgetpy) packag ssgetpy does not discriminate "real" datatype from "integer" datatype, as shown on the suitesparse collection website. Therefore, `lists.py` maintains a manually-curated list of `integer` datatype matrices to facilitate discrimination. +## Transfer data to a different filesystem + +``` +rsync -rzvh --links pearson@cori.nersc.gov:$SS_DIR/ . +``` + ## how this was done ``` @@ -48,4 +54,4 @@ poetry add ssgetpy ``` poetry install -``` \ No newline at end of file +``` diff --git a/lib/config.py b/lib/config.py index e00fdb9..2d7d367 100644 --- a/lib/config.py +++ b/lib/config.py @@ -2,8 +2,15 @@ import os import sys from pathlib import Path +from lib import matrix + try: DIR = Path(os.environ["SS_DIR"]) except KeyError as e: print("ERROR: $SS_DIR not set") sys.exit(1) + +SS_ROOT_URL = "https://sparse.tamu.edu" + + + diff --git a/lib/datasets.py b/lib/datasets.py index fb39ad1..d98f2bb 100644 --- a/lib/datasets.py +++ b/lib/datasets.py @@ -3,7 +3,7 @@ import sys import ssgetpy -from lib import lists +from lib import dtypes Dataset = collections.namedtuple("Dataset", ["name", "mats"]) @@ -15,18 +15,19 @@ def safe_dir_name(s): t = t.lower() return t -def mat_is_integer(mat): - return mat.name in lists.INTEGER_MATS +def mat_is_real(mat): + val = dtypes.DTYPES[(mat.group, mat.name)] == "real" + return val -def filter_reject_integer(mats): - return [mat for mat in mats if not mat_is_integer(mat)] +def filter_keep_real(mats): + return [mat for mat in mats if mat_is_real(mat)] def mat_is_small(mat): return (mat.rows < 1_000 and mat.cols < 1_000) \ or mat.nnz < 20_000 def mat_is_large(mat): - return (mat.rows > 1_000_000 and mat.cols < 1_000_000) \ + return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \ or mat.nnz > 20_000_000 def filter_reject_large(mats): @@ -38,7 +39,7 @@ def filter_reject_small(mats): ## all real-valued matrices REAL_MATS = Dataset( name = "reals", - mats = filter_reject_integer(ssgetpy.search( + mats = filter_keep_real(ssgetpy.search( dtype='real', limit=1_000_000 )) @@ -66,7 +67,7 @@ for kind in kinds: ) REGULAR_REAL_MATS = Dataset( name="regular_reals", - mats = filter_reject_integer(mats) + mats = filter_keep_real(mats) ) ## keep "small" matrices @@ -91,7 +92,7 @@ REAL_MED_MATS = Dataset ( ## export all datasets DATASETS = [ - REAL_MATS, + # REAL_MATS, REAL_SMALL_MATS, REAL_MED_MATS, REGULAR_REAL_MATS, @@ -114,7 +115,7 @@ for kind in get_kinds(): name = "kind_"+safe_dir_name(kind), mats = filter_reject_large( \ filter_reject_small( \ - filter_reject_integer(ssgetpy.search( + filter_keep_real(ssgetpy.search( kind=kind, dtype='real', limit=1_000_000 diff --git a/lib/dtypes.py b/lib/dtypes.py new file mode 100644 index 0000000..350e85e --- /dev/null +++ b/lib/dtypes.py @@ -0,0 +1,54 @@ +"""export a map that is (group, name) -> dtype for all mats""" + +import requests +import datetime +import os + +import scipy.io + +from lib import config + +def download_ss_index(path): + with open(path, "wb") as f: + req = requests.get(config.SS_ROOT_URL + "/files/ss_index.mat") + f.write(req.content) + +def ensure_ss_index(path): + if not os.path.exists(path): + download_ss_index(path) + mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(config.DIR / ".ss_index.mat")) + if datetime.datetime.utcnow() - mtime > datetime.timedelta(days=90): + download_ss_index(path) + +# download metadata file if missing +local = config.DIR / ".ss_index.mat" +ensure_ss_index(local) + + +# load metadata and convert to a database +mat = scipy.io.loadmat(config.DIR / ".ss_index.mat", squeeze_me=True) + +s = mat["ss_index"].item() +for i,x in enumerate(s): + print(i, x) +groups = s[1] +names = s[2] +# 3 letters, first letter: +# r=real, p=binary, c=complex, i=integer +rbtype = s[19] + +def dtype_from_rbtype(rbtype): + if rbtype[0] == "r": + return "real" + elif rbtype[0] == "p": + return "binary" + elif rbtype[0] == "c": + return "complex" + elif rbtype[0] == "i": + return "integer" + else: + raise LookupError + +DTYPES = {} +for i in range(len(names)): + DTYPES[(groups[i], names[i])] = dtype_from_rbtype(rbtype[i]) \ No newline at end of file diff --git a/lib/matrix.py b/lib/matrix.py new file mode 100644 index 0000000..f05ab2c --- /dev/null +++ b/lib/matrix.py @@ -0,0 +1,17 @@ +class Matrix: + def __init__(self, group, name, dtype, nrows, ncols, nnz): + self.group = group + self.name = name + self.dtype = dtype + self.nrows = int(nrows) + self.ncols = int(ncols) + self.nnz = int(nnz) + + def to_tuple(self): + return (self.group, self.name, self.dtype, self.nrows, self.ncols, self.nnz) + + def __repr__(self): + return repr(self.to_tuple()) + + def url(self): + return "/".join(("https://sparse.tamu.edu", "MM", self.group, self.name + ".tar.gz")) \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 2394c05..afd32f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -33,6 +33,14 @@ category = "main" optional = false python-versions = ">=3.5" +[[package]] +name = "numpy" +version = "1.21.4" +description = "NumPy is the fundamental package for array computing with Python." +category = "main" +optional = false +python-versions = ">=3.7,<3.11" + [[package]] name = "requests" version = "2.26.0" @@ -51,6 +59,17 @@ urllib3 = ">=1.21.1,<1.27" socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] +[[package]] +name = "scipy" +version = "1.7.3" +description = "SciPy: Scientific Library for Python" +category = "main" +optional = false +python-versions = ">=3.7,<3.11" + +[package.dependencies] +numpy = ">=1.16.5,<1.23.0" + [[package]] name = "ssgetpy" version = "1.0-pre2" @@ -101,8 +120,8 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "1.1" -python-versions = "^3.7" -content-hash = "4a624c76d5d28333a13081a3fe5fba3eadcdfc09ac0963d1f1ecd89eb03451aa" +python-versions = ">=3.7,<3.11" +content-hash = "5a1bf7fe65d1fe23f7c34d44076cc157e3343699790a742492686d6198fb88eb" [metadata.files] certifi = [ @@ -121,10 +140,73 @@ idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] +numpy = [ + {file = "numpy-1.21.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8890b3360f345e8360133bc078d2dacc2843b6ee6059b568781b15b97acbe39f"}, + {file = "numpy-1.21.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:69077388c5a4b997442b843dbdc3a85b420fb693ec8e33020bb24d647c164fa5"}, + {file = "numpy-1.21.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e89717274b41ebd568cd7943fc9418eeb49b1785b66031bc8a7f6300463c5898"}, + {file = "numpy-1.21.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b78ecfa070460104934e2caf51694ccd00f37d5e5dbe76f021b1b0b0d221823"}, + {file = "numpy-1.21.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:615d4e328af7204c13ae3d4df7615a13ff60a49cb0d9106fde07f541207883ca"}, + {file = "numpy-1.21.4-cp310-cp310-win_amd64.whl", hash = "sha256:1403b4e2181fc72664737d848b60e65150f272fe5a1c1cbc16145ed43884065a"}, + {file = "numpy-1.21.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:74b85a17528ca60cf98381a5e779fc0264b4a88b46025e6bcbe9621f46bb3e63"}, + {file = "numpy-1.21.4-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:92aafa03da8658609f59f18722b88f0a73a249101169e28415b4fa148caf7e41"}, + {file = "numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5d95668e727c75b3f5088ec7700e260f90ec83f488e4c0aaccb941148b2cd377"}, + {file = "numpy-1.21.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5162ec777ba7138906c9c274353ece5603646c6965570d82905546579573f73"}, + {file = "numpy-1.21.4-cp37-cp37m-win32.whl", hash = "sha256:81225e58ef5fce7f1d80399575576fc5febec79a8a2742e8ef86d7b03beef49f"}, + {file = "numpy-1.21.4-cp37-cp37m-win_amd64.whl", hash = "sha256:32fe5b12061f6446adcbb32cf4060a14741f9c21e15aaee59a207b6ce6423469"}, + {file = "numpy-1.21.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c449eb870616a7b62e097982c622d2577b3dbc800aaf8689254ec6e0197cbf1e"}, + {file = "numpy-1.21.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2e4ed57f45f0aa38beca2a03b6532e70e548faf2debbeb3291cfc9b315d9be8f"}, + {file = "numpy-1.21.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1247ef28387b7bb7f21caf2dbe4767f4f4175df44d30604d42ad9bd701ebb31f"}, + {file = "numpy-1.21.4-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:34f3456f530ae8b44231c63082c8899fe9c983fd9b108c997c4b1c8c2d435333"}, + {file = "numpy-1.21.4-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4c9c23158b87ed0e70d9a50c67e5c0b3f75bcf2581a8e34668d4e9d7474d76c6"}, + {file = "numpy-1.21.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4799be6a2d7d3c33699a6f77201836ac975b2e1b98c2a07f66a38f499cb50ce"}, + {file = "numpy-1.21.4-cp38-cp38-win32.whl", hash = "sha256:bc988afcea53e6156546e5b2885b7efab089570783d9d82caf1cfd323b0bb3dd"}, + {file = "numpy-1.21.4-cp38-cp38-win_amd64.whl", hash = "sha256:170b2a0805c6891ca78c1d96ee72e4c3ed1ae0a992c75444b6ab20ff038ba2cd"}, + {file = "numpy-1.21.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fde96af889262e85aa033f8ee1d3241e32bf36228318a61f1ace579df4e8170d"}, + {file = "numpy-1.21.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c885bfc07f77e8fee3dc879152ba993732601f1f11de248d4f357f0ffea6a6d4"}, + {file = "numpy-1.21.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9e6f5f50d1eff2f2f752b3089a118aee1ea0da63d56c44f3865681009b0af162"}, + {file = "numpy-1.21.4-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ad010846cdffe7ec27e3f933397f8a8d6c801a48634f419e3d075db27acf5880"}, + {file = "numpy-1.21.4-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c74c699b122918a6c4611285cc2cad4a3aafdb135c22a16ec483340ef97d573c"}, + {file = "numpy-1.21.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9864424631775b0c052f3bd98bc2712d131b3e2cd95d1c0c68b91709170890b0"}, + {file = "numpy-1.21.4-cp39-cp39-win32.whl", hash = "sha256:b1e2312f5b8843a3e4e8224b2b48fe16119617b8fc0a54df8f50098721b5bed2"}, + {file = "numpy-1.21.4-cp39-cp39-win_amd64.whl", hash = "sha256:e3c3e990274444031482a31280bf48674441e0a5b55ddb168f3a6db3e0c38ec8"}, + {file = "numpy-1.21.4-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a3deb31bc84f2b42584b8c4001c85d1934dbfb4030827110bc36bfd11509b7bf"}, + {file = "numpy-1.21.4.zip", hash = "sha256:e6c76a87633aa3fa16614b61ccedfae45b91df2767cf097aa9c933932a7ed1e0"}, +] requests = [ {file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"}, {file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"}, ] +scipy = [ + {file = "scipy-1.7.3-1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:c9e04d7e9b03a8a6ac2045f7c5ef741be86727d8f49c45db45f244bdd2bcff17"}, + {file = "scipy-1.7.3-1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b0e0aeb061a1d7dcd2ed59ea57ee56c9b23dd60100825f98238c06ee5cc4467e"}, + {file = "scipy-1.7.3-1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:b78a35c5c74d336f42f44106174b9851c783184a85a3fe3e68857259b37b9ffb"}, + {file = "scipy-1.7.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:173308efba2270dcd61cd45a30dfded6ec0085b4b6eb33b5eb11ab443005e088"}, + {file = "scipy-1.7.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:21b66200cf44b1c3e86495e3a436fc7a26608f92b8d43d344457c54f1c024cbc"}, + {file = "scipy-1.7.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceebc3c4f6a109777c0053dfa0282fddb8893eddfb0d598574acfb734a926168"}, + {file = "scipy-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7eaea089345a35130bc9a39b89ec1ff69c208efa97b3f8b25ea5d4c41d88094"}, + {file = "scipy-1.7.3-cp310-cp310-win_amd64.whl", hash = "sha256:304dfaa7146cffdb75fbf6bb7c190fd7688795389ad060b970269c8576d038e9"}, + {file = "scipy-1.7.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:033ce76ed4e9f62923e1f8124f7e2b0800db533828c853b402c7eec6e9465d80"}, + {file = "scipy-1.7.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4d242d13206ca4302d83d8a6388c9dfce49fc48fdd3c20efad89ba12f785bf9e"}, + {file = "scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8499d9dd1459dc0d0fe68db0832c3d5fc1361ae8e13d05e6849b358dc3f2c279"}, + {file = "scipy-1.7.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca36e7d9430f7481fc7d11e015ae16fbd5575615a8e9060538104778be84addf"}, + {file = "scipy-1.7.3-cp37-cp37m-win32.whl", hash = "sha256:e2c036492e673aad1b7b0d0ccdc0cb30a968353d2c4bf92ac8e73509e1bf212c"}, + {file = "scipy-1.7.3-cp37-cp37m-win_amd64.whl", hash = "sha256:866ada14a95b083dd727a845a764cf95dd13ba3dc69a16b99038001b05439709"}, + {file = "scipy-1.7.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:65bd52bf55f9a1071398557394203d881384d27b9c2cad7df9a027170aeaef93"}, + {file = "scipy-1.7.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:f99d206db1f1ae735a8192ab93bd6028f3a42f6fa08467d37a14eb96c9dd34a3"}, + {file = "scipy-1.7.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5f2cfc359379c56b3a41b17ebd024109b2049f878badc1e454f31418c3a18436"}, + {file = "scipy-1.7.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb7ae2c4dbdb3c9247e07acc532f91077ae6dbc40ad5bd5dca0bb5a176ee9bda"}, + {file = "scipy-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c2d250074cfa76715d58830579c64dff7354484b284c2b8b87e5a38321672c"}, + {file = "scipy-1.7.3-cp38-cp38-win32.whl", hash = "sha256:87069cf875f0262a6e3187ab0f419f5b4280d3dcf4811ef9613c605f6e4dca95"}, + {file = "scipy-1.7.3-cp38-cp38-win_amd64.whl", hash = "sha256:7edd9a311299a61e9919ea4192dd477395b50c014cdc1a1ac572d7c27e2207fa"}, + {file = "scipy-1.7.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eef93a446114ac0193a7b714ce67659db80caf940f3232bad63f4c7a81bc18df"}, + {file = "scipy-1.7.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb326658f9b73c07081300daba90a8746543b5ea177184daed26528273157294"}, + {file = "scipy-1.7.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:93378f3d14fff07572392ce6a6a2ceb3a1f237733bd6dcb9eb6a2b29b0d19085"}, + {file = "scipy-1.7.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edad1cf5b2ce1912c4d8ddad20e11d333165552aba262c882e28c78bbc09dbf6"}, + {file = "scipy-1.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d1cc2c19afe3b5a546ede7e6a44ce1ff52e443d12b231823268019f608b9b12"}, + {file = "scipy-1.7.3-cp39-cp39-win32.whl", hash = "sha256:2c56b820d304dffcadbbb6cbfbc2e2c79ee46ea291db17e288e73cd3c64fefa9"}, + {file = "scipy-1.7.3-cp39-cp39-win_amd64.whl", hash = "sha256:3f78181a153fa21c018d346f595edd648344751d7f03ab94b398be2ad083ed3e"}, + {file = "scipy-1.7.3.tar.gz", hash = "sha256:ab5875facfdef77e0a47d5fd39ea178b58e60e454a4c85aa1e52fcb80db7babf"}, +] ssgetpy = [] tqdm = [ {file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"}, diff --git a/pyproject.toml b/pyproject.toml index a845f40..dd8457e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,8 +5,9 @@ description = "" authors = ["Carl Pearson "] [tool.poetry.dependencies] -python = "^3.7" +python = ">=3.7,<3.11" ssgetpy = {git = "https://github.com/cwpearson/ssgetpy.git", rev = "be00d2a"} +scipy = "^1.7.3" [tool.poetry.dev-dependencies]