Compare commits

...

10 Commits

Author SHA1 Message Date
Carl Pearson
80d18fca11 don't use poetry, update Perlmutter environment 2023-03-09 10:24:58 -08:00
Carl Pearson
07035904ee import datasets -> from lib import datasets 2022-11-02 08:51:02 -07:00
Carl Pearson
a57793a6f4 create directory if it doesn't exist 2022-11-02 08:50:33 -07:00
Carl Pearson
f65421a4cf rzvernal 2022-11-02 08:50:08 -07:00
Carl William Pearson
34bc594b04 ignore .python-version 2022-01-28 13:14:43 -07:00
Carl William Pearson
4584ce14ee add square datasets 2022-01-28 13:14:30 -07:00
Carl William Pearson
29f5289068 ascicgpu 2022-01-28 13:14:19 -07:00
Carl Pearson
800485d984 don't download enormous matrices 2021-12-01 14:58:08 -08:00
Carl Pearson
ecd6cebf05 readme 2021-12-01 14:32:07 -08:00
Carl Pearson
9800c3b5f9 automatically download nonzero datatype metadata 2021-12-01 14:27:44 -08:00
12 changed files with 153 additions and 175 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
__pycache__ __pycache__
.python-version

11
Pipfile Normal file
View File

@@ -0,0 +1,11 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
[dev-packages]
[requires]
python_version = "3.9"

View File

@@ -1,22 +1,21 @@
# ss-downloader # ss-downloader
Install poetry & Python 3.8+
``` ```
curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python - pipenv shell
pip install -r requirements.txt
``` ```
## how to use ## how to use
``` ```
source load-env.sh source load-env.sh
poetry run python list.py python list.py
poetry run python download.py <dataset name> python download.py <dataset name>
``` ```
To download all datasets To download all datasets
``` ```
poetry run python download.py all python download.py all
``` ```
You can move the datasets due to relative symlinks. For example: You can move the datasets due to relative symlinks. For example:
@@ -37,7 +36,13 @@ Then, a relative symlink is created from the `$SCRATCH/<subset>/<matrix>.mtx` fi
This makes use of a [fork of the `ssgetpy`](github.com/cwpearson/ssgetpy) package with a faster download limit. This makes use of a [fork of the `ssgetpy`](github.com/cwpearson/ssgetpy) package with a faster download limit.
ssgetpy does not discriminate "real" datatype from "integer" datatype, as shown on the suitesparse collection website. ssgetpy does not discriminate "real" datatype from "integer" datatype, as shown on the suitesparse collection website.
Therefore, `lists.py` maintains a manually-curated list of `integer` datatype matrices to facilitate discrimination. Therefore, we access https://sparse.tamu.edu/files/ss_index.mat to determine that metadata for each file.
## Transfer data to a different filesystem
```
rsync -rzvh --links --info=progress2 pearson@cori.nersc.gov:$SS_DIR/ .
```
## how this was done ## how this was done
@@ -48,4 +53,4 @@ poetry add ssgetpy
``` ```
poetry install poetry install
``` ```

View File

@@ -2,8 +2,15 @@ import os
import sys import sys
from pathlib import Path from pathlib import Path
from lib import matrix
try: try:
DIR = Path(os.environ["SS_DIR"]) DIR = Path(os.environ["SS_DIR"])
except KeyError as e: except KeyError as e:
print("ERROR: $SS_DIR not set") print("ERROR: $SS_DIR not set")
sys.exit(1) sys.exit(1)
SS_ROOT_URL = "https://sparse.tamu.edu"

View File

@@ -3,7 +3,7 @@ import sys
import ssgetpy import ssgetpy
from lib import lists from lib import dtypes
Dataset = collections.namedtuple("Dataset", ["name", "mats"]) Dataset = collections.namedtuple("Dataset", ["name", "mats"])
@@ -15,18 +15,19 @@ def safe_dir_name(s):
t = t.lower() t = t.lower()
return t return t
def mat_is_integer(mat): def mat_is_real(mat):
return mat.name in lists.INTEGER_MATS val = dtypes.DTYPES[(mat.group, mat.name)] == "real"
return val
def filter_reject_integer(mats): def filter_keep_real(mats):
return [mat for mat in mats if not mat_is_integer(mat)] return [mat for mat in mats if mat_is_real(mat)]
def mat_is_small(mat): def mat_is_small(mat):
return (mat.rows < 1_000 and mat.cols < 1_000) \ return (mat.rows < 1_000 and mat.cols < 1_000) \
or mat.nnz < 20_000 or mat.nnz < 20_000
def mat_is_large(mat): def mat_is_large(mat):
return (mat.rows > 1_000_000 and mat.cols < 1_000_000) \ return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
or mat.nnz > 20_000_000 or mat.nnz > 20_000_000
def filter_reject_large(mats): def filter_reject_large(mats):
@@ -35,10 +36,13 @@ def filter_reject_large(mats):
def filter_reject_small(mats): def filter_reject_small(mats):
return [mat for mat in mats if not mat_is_small(mat)] return [mat for mat in mats if not mat_is_small(mat)]
def filter_keep_square(mats):
return [mat for mat in mats if mat.rows == mat.cols]
## all real-valued matrices ## all real-valued matrices
REAL_MATS = Dataset( REAL_MATS = Dataset(
name = "reals", name = "reals",
mats = filter_reject_integer(ssgetpy.search( mats = filter_keep_real(ssgetpy.search(
dtype='real', dtype='real',
limit=1_000_000 limit=1_000_000
)) ))
@@ -66,7 +70,7 @@ for kind in kinds:
) )
REGULAR_REAL_MATS = Dataset( REGULAR_REAL_MATS = Dataset(
name="regular_reals", name="regular_reals",
mats = filter_reject_integer(mats) mats = filter_keep_real(mats)
) )
## keep "small" matrices ## keep "small" matrices
@@ -79,6 +83,15 @@ REAL_SMALL_MATS = Dataset (
mats = filter_reject_large(REAL_MATS.mats) mats = filter_reject_large(REAL_MATS.mats)
) )
REGULAR_SQUARE_REAL_SMALL_MATS = Dataset (
name = "regular_square_reals_small",
mats = filter_keep_square(REGULAR_REAL_SMALL_MATS.mats)
)
SQUARE_REAL_SMALL_MATS = Dataset (
name = "square_reals_small",
mats = filter_keep_square(REAL_SMALL_MATS.mats)
)
## keep "medium" matrices ## keep "medium" matrices
REGULAR_REAL_MED_MATS = Dataset ( REGULAR_REAL_MED_MATS = Dataset (
name = "regular_reals_med", name = "regular_reals_med",
@@ -91,12 +104,14 @@ REAL_MED_MATS = Dataset (
## export all datasets ## export all datasets
DATASETS = [ DATASETS = [
REAL_MATS, # REAL_MATS,
REAL_SMALL_MATS, REAL_SMALL_MATS,
REAL_MED_MATS, REAL_MED_MATS,
REGULAR_REAL_MATS, # REGULAR_REAL_MATS,
REGULAR_REAL_SMALL_MATS, REGULAR_REAL_SMALL_MATS,
REGULAR_REAL_MED_MATS REGULAR_REAL_MED_MATS,
REGULAR_SQUARE_REAL_SMALL_MATS,
SQUARE_REAL_SMALL_MATS,
] ]
def get_kinds(): def get_kinds():
@@ -114,7 +129,7 @@ for kind in get_kinds():
name = "kind_"+safe_dir_name(kind), name = "kind_"+safe_dir_name(kind),
mats = filter_reject_large( \ mats = filter_reject_large( \
filter_reject_small( \ filter_reject_small( \
filter_reject_integer(ssgetpy.search( filter_keep_real(ssgetpy.search(
kind=kind, kind=kind,
dtype='real', dtype='real',
limit=1_000_000 limit=1_000_000

58
lib/dtypes.py Normal file
View File

@@ -0,0 +1,58 @@
"""export a map that is (group, name) -> dtype for all mats"""
import requests
import datetime
import os
import scipy.io
from lib import config
def download_ss_index(path):
    """Download the SuiteSparse metadata index (ss_index.mat) to *path*.

    Creates parent directories as needed. Raises requests.HTTPError on a
    non-2xx response instead of writing the error body to disk.
    """
    path_dir = path.parent
    path_dir.mkdir(parents=True, exist_ok=True)
    # Fetch BEFORE opening the file: previously the file was opened (and
    # truncated) first, so a failed request left an empty ss_index.mat
    # behind, which ensure_ss_index() would then mistake for a valid copy.
    req = requests.get(config.SS_ROOT_URL + "/files/ss_index.mat")
    req.raise_for_status()  # surface HTTP errors (404/500/...) explicitly
    with open(path, "wb") as f:
        f.write(req.content)
def ensure_ss_index(path):
    """Ensure a reasonably fresh ss_index.mat exists at *path*.

    Downloads the file if it is missing, and re-downloads it when the
    local copy is older than 90 days.
    """
    if not os.path.exists(path):
        download_ss_index(path)
    # Bug fix: stat the file we were asked about, not the hard-coded
    # config.DIR / ".ss_index.mat". The two coincide for the one current
    # caller, but the parameter was silently ignored before.
    mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(path))
    if datetime.datetime.utcnow() - mtime > datetime.timedelta(days=90):
        download_ss_index(path)
# download metadata file if missing (module import triggers a network
# fetch the first time, or after the cached copy goes stale)
local = config.DIR / ".ss_index.mat"
ensure_ss_index(local)
# load metadata and convert to a database
mat = scipy.io.loadmat(config.DIR / ".ss_index.mat", squeeze_me=True)
s = mat["ss_index"].item()
# for i,x in enumerate(s):
#     print(i, x)
# NOTE(review): field positions below (1=group names, 2=matrix names,
# 19=RB type codes) are assumed from the ss_index.mat layout — confirm
# against the SuiteSparse index format if it ever changes.
groups = s[1]
names = s[2]
# 3 letters, first letter:
# r=real, p=binary, c=complex, i=integer
rbtype = s[19]
def dtype_from_rbtype(rbtype):
    """Translate a Rutherford-Boeing type code into a dtype name.

    *rbtype* is a 3-letter code whose first letter encodes the datatype:
    r=real, p=binary (pattern), c=complex, i=integer.

    Raises LookupError with a descriptive message for empty or
    unrecognized codes. (The original raised a bare LookupError, and an
    IndexError for the empty string; IndexError is a LookupError
    subclass, so callers catching LookupError are unaffected.)
    """
    mapping = {"r": "real", "p": "binary", "c": "complex", "i": "integer"}
    if not rbtype or rbtype[0] not in mapping:
        raise LookupError(f"unrecognized RB type code: {rbtype!r}")
    return mapping[rbtype[0]]
# Build the (group, name) -> dtype lookup table this module exports.
DTYPES = {}
for idx, mat_name in enumerate(names):
    DTYPES[(groups[idx], mat_name)] = dtype_from_rbtype(rbtype[idx])

17
lib/matrix.py Normal file
View File

@@ -0,0 +1,17 @@
class Matrix:
    """Lightweight record describing one SuiteSparse matrix."""

    def __init__(self, group, name, dtype, nrows, ncols, nnz):
        self.group = group
        self.name = name
        self.dtype = dtype
        # Dimensions and nnz are coerced to int (callers may pass strings).
        self.nrows = int(nrows)
        self.ncols = int(ncols)
        self.nnz = int(nnz)

    def to_tuple(self):
        """Return the record as a plain (group, name, dtype, nrows, ncols, nnz) tuple."""
        fields = (self.group, self.name, self.dtype,
                  self.nrows, self.ncols, self.nnz)
        return fields

    def __repr__(self):
        return repr(self.to_tuple())

    def url(self):
        """Return the MatrixMarket download URL for this matrix."""
        return f"https://sparse.tamu.edu/MM/{self.group}/{self.name}.tar.gz"

View File

@@ -1,4 +1,4 @@
import datasets from lib import datasets
for ds in datasets.DATASETS: for ds in datasets.DATASETS:
print(f"{ds.name}: {len(ds.mats)} matrices") print(f"{ds.name}: {len(ds.mats)} matrices")

View File

@@ -2,6 +2,8 @@
host=`hostname` host=`hostname`
if [[ "$NERSC_HOST" == cori ]]; then if [[ "$NERSC_HOST" == cori ]]; then
echo \$NERSC_HOST matched cori echo \$NERSC_HOST matched cori
@@ -13,9 +15,20 @@ if [[ "$NERSC_HOST" == cori ]]; then
elif [[ "$NERSC_HOST" == perlmutter ]]; then elif [[ "$NERSC_HOST" == perlmutter ]]; then
echo \$NERSC_HOST matched perlmutter echo \$NERSC_HOST matched perlmutter
module load cray-python/3.9.4.1 echo module load cray-python/3.9.13.1
which python module load cray-python/3.9.13.1
export SS_DIR="$CFS"/m3918/pearson export SS_DIR="$CFS"/m3918/pearson
echo "\$SS_DIR = $SS_DIR" echo "\$SS_DIR = $SS_DIR"
elif [[ `hostname` =~ ascicgpu030 ]]; then
echo hostname matched ascicgpu030
export SS_DIR="$HOME/suitesparse"
echo "\$SS_DIR = $SS_DIR"
elif [[ `hostname` =~ rzvernal ]]; then
echo hostname matched rzvernal
export SS_DIR="/usr/workspace/cwpears/suitesparse"
echo "\$SS_DIR = $SS_DIR"
fi fi

136
poetry.lock generated
View File

@@ -1,136 +0,0 @@
[[package]]
name = "certifi"
version = "2021.10.8"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "charset-normalizer"
version = "2.0.7"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = false
python-versions = ">=3.5.0"
[package.extras]
unicode_backport = ["unicodedata2"]
[[package]]
name = "colorama"
version = "0.4.4"
description = "Cross-platform colored terminal text."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "idna"
version = "3.3"
description = "Internationalized Domain Names in Applications (IDNA)"
category = "main"
optional = false
python-versions = ">=3.5"
[[package]]
name = "requests"
version = "2.26.0"
description = "Python HTTP for Humans."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
[package.dependencies]
certifi = ">=2017.4.17"
charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""}
idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""}
urllib3 = ">=1.21.1,<1.27"
[package.extras]
socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"]
[[package]]
name = "ssgetpy"
version = "1.0-pre2"
description = ""
category = "main"
optional = false
python-versions = ">3.5.2"
develop = false
[package.dependencies]
requests = ">=2.22"
tqdm = ">=4.41"
[package.source]
type = "git"
url = "https://github.com/cwpearson/ssgetpy.git"
reference = "be00d2a"
resolved_reference = "be00d2ad64c55d32291ba820f3d040524b1c5b0e"
[[package]]
name = "tqdm"
version = "4.62.3"
description = "Fast, Extensible Progress Meter"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
dev = ["py-make (>=0.1.0)", "twine", "wheel"]
notebook = ["ipywidgets (>=6)"]
telegram = ["requests"]
[[package]]
name = "urllib3"
version = "1.26.7"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
[package.extras]
brotli = ["brotlipy (>=0.6.0)"]
secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[metadata]
lock-version = "1.1"
python-versions = "^3.7"
content-hash = "4a624c76d5d28333a13081a3fe5fba3eadcdfc09ac0963d1f1ecd89eb03451aa"
[metadata.files]
certifi = [
{file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"},
{file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"},
]
charset-normalizer = [
{file = "charset-normalizer-2.0.7.tar.gz", hash = "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0"},
{file = "charset_normalizer-2.0.7-py3-none-any.whl", hash = "sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"},
]
colorama = [
{file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
{file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"},
]
idna = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
{file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
]
requests = [
{file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"},
{file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"},
]
ssgetpy = []
tqdm = [
{file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"},
{file = "tqdm-4.62.3.tar.gz", hash = "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"},
]
urllib3 = [
{file = "urllib3-1.26.7-py2.py3-none-any.whl", hash = "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"},
{file = "urllib3-1.26.7.tar.gz", hash = "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece"},
]

View File

@@ -1,15 +0,0 @@
[tool.poetry]
name = "ss-downloader"
version = "0.1.0"
description = ""
authors = ["Carl Pearson <cwpears@sandia.gov>"]
[tool.poetry.dependencies]
python = "^3.7"
ssgetpy = {git = "https://github.com/cwpearson/ssgetpy.git", rev = "be00d2a"}
[tool.poetry.dev-dependencies]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

2
requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
ssgetpy
scipy