Compare commits

10 Commits

Author SHA1 Message Date
Carl Pearson
80d18fca11 don't use poetry, update Perlmutter environment 2023-03-09 10:24:58 -08:00
Carl Pearson
07035904ee import datasets -> from lib import datasets 2022-11-02 08:51:02 -07:00
Carl Pearson
a57793a6f4 create directory if it doesn't exist 2022-11-02 08:50:33 -07:00
Carl Pearson
f65421a4cf rzvernal 2022-11-02 08:50:08 -07:00
Carl William Pearson
34bc594b04 ignore .python-version 2022-01-28 13:14:43 -07:00
Carl William Pearson
4584ce14ee add square datasets 2022-01-28 13:14:30 -07:00
Carl William Pearson
29f5289068 ascicgpu 2022-01-28 13:14:19 -07:00
Carl Pearson
800485d984 don't download enormous matrices 2021-12-01 14:58:08 -08:00
Carl Pearson
ecd6cebf05 readme 2021-12-01 14:32:07 -08:00
Carl Pearson
9800c3b5f9 automatically download nonzero datatype metadata 2021-12-01 14:27:44 -08:00
12 changed files with 153 additions and 175 deletions

.gitignore (vendored): 1 line changed

@@ -1 +1,2 @@
__pycache__
+.python-version

Pipfile (new file): 11 lines added

@@ -0,0 +1,11 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]

[dev-packages]

[requires]
python_version = "3.9"

README.md

@@ -1,22 +1,21 @@
# ss-downloader
Install poetry & Python 3.8+
```
-curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python -
+pipenv shell
+pip install -r requirements.txt
```
## how to use
```
source load-env.sh
-poetry run python list.py
-poetry run python download.py <dataset name>
+python list.py
+python download.py <dataset name>
```
To download all datasets
```
-poetry run python download.py all
+python download.py all
```
You can move the datasets because the symlinks are relative. For example:
@@ -37,7 +36,13 @@ Then, a relative symlink is created from the `$SCRATCH/<subset>/<matrix>.mtx` fi
This makes use of a [fork of the `ssgetpy`](https://github.com/cwpearson/ssgetpy) package with a higher download limit.
ssgetpy does not distinguish the "real" datatype from the "integer" datatype shown on the SuiteSparse collection website.
-Therefore, `lists.py` maintains a manually-curated list of `integer` datatype matrices to facilitate discrimination.
+Therefore, we access https://sparse.tamu.edu/files/ss_index.mat to determine that metadata for each file.
## Transfer data to a different filesystem
```
rsync -rzvh --links --info=progress2 pearson@cori.nersc.gov:$SS_DIR/ .
```
## how this was done
@@ -48,4 +53,4 @@ poetry add ssgetpy
```
poetry install
-```
+```

lib/config.py

@@ -2,8 +2,15 @@ import os
import sys
from pathlib import Path
from lib import matrix
try:
    DIR = Path(os.environ["SS_DIR"])
except KeyError as e:
    print("ERROR: $SS_DIR not set")
    sys.exit(1)
SS_ROOT_URL = "https://sparse.tamu.edu"

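With this change, anything importing `lib.config` fails fast unless `$SS_DIR` is set, which `load-env.sh` normally does per host. A minimal sketch of that contract, using an illustrative path:

```
import os

# Illustrative only: load-env.sh normally exports SS_DIR for the current host.
os.environ.setdefault("SS_DIR", "/tmp/suitesparse")

from lib import config
print(config.DIR)          # /tmp/suitesparse
print(config.SS_ROOT_URL)  # https://sparse.tamu.edu
```
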
lib/datasets.py

@@ -3,7 +3,7 @@ import sys
import ssgetpy
from lib import lists
from lib import dtypes
Dataset = collections.namedtuple("Dataset", ["name", "mats"])
@@ -15,18 +15,19 @@ def safe_dir_name(s):
    t = t.lower()
    return t
def mat_is_integer(mat):
    return mat.name in lists.INTEGER_MATS
def mat_is_real(mat):
    val = dtypes.DTYPES[(mat.group, mat.name)] == "real"
    return val
def filter_reject_integer(mats):
    return [mat for mat in mats if not mat_is_integer(mat)]
def filter_keep_real(mats):
    return [mat for mat in mats if mat_is_real(mat)]
def mat_is_small(mat):
    return (mat.rows < 1_000 and mat.cols < 1_000) \
        or mat.nnz < 20_000
def mat_is_large(mat):
-    return (mat.rows > 1_000_000 and mat.cols < 1_000_000) \
+    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
        or mat.nnz > 20_000_000
def filter_reject_large(mats):
@@ -35,10 +36,13 @@ def filter_reject_large(mats):
def filter_reject_small(mats):
    return [mat for mat in mats if not mat_is_small(mat)]
+def filter_keep_square(mats):
+    return [mat for mat in mats if mat.rows == mat.cols]
## all real-valued matrices
REAL_MATS = Dataset(
    name = "reals",
-    mats = filter_reject_integer(ssgetpy.search(
+    mats = filter_keep_real(ssgetpy.search(
        dtype='real',
        limit=1_000_000
    ))
@@ -66,7 +70,7 @@ for kind in kinds:
)
REGULAR_REAL_MATS = Dataset(
    name="regular_reals",
-    mats = filter_reject_integer(mats)
+    mats = filter_keep_real(mats)
)
## keep "small" matrices
@@ -79,6 +83,15 @@ REAL_SMALL_MATS = Dataset (
    mats = filter_reject_large(REAL_MATS.mats)
)
+REGULAR_SQUARE_REAL_SMALL_MATS = Dataset (
+    name = "regular_square_reals_small",
+    mats = filter_keep_square(REGULAR_REAL_SMALL_MATS.mats)
+)
+SQUARE_REAL_SMALL_MATS = Dataset (
+    name = "square_reals_small",
+    mats = filter_keep_square(REAL_SMALL_MATS.mats)
+)
## keep "medium" matrices
REGULAR_REAL_MED_MATS = Dataset (
    name = "regular_reals_med",
@@ -91,12 +104,14 @@ REAL_MED_MATS = Dataset (
## export all datasets
DATASETS = [
-    REAL_MATS,
+    # REAL_MATS,
    REAL_SMALL_MATS,
    REAL_MED_MATS,
-    REGULAR_REAL_MATS,
+    # REGULAR_REAL_MATS,
    REGULAR_REAL_SMALL_MATS,
-    REGULAR_REAL_MED_MATS
+    REGULAR_REAL_MED_MATS,
+    REGULAR_SQUARE_REAL_SMALL_MATS,
+    SQUARE_REAL_SMALL_MATS,
]
def get_kinds():
@@ -114,7 +129,7 @@ for kind in get_kinds():
name = "kind_"+safe_dir_name(kind),
mats = filter_reject_large( \
filter_reject_small( \
filter_reject_integer(ssgetpy.search(
filter_keep_real(ssgetpy.search(
kind=kind,
dtype='real',
limit=1_000_000

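For reference, the size cutoffs used by `mat_is_small`/`mat_is_large` above, restated as a self-contained sketch; `Mat` is a hypothetical stand-in for the records returned by `ssgetpy.search`:

```
from collections import namedtuple

# Hypothetical stand-in for ssgetpy's matrix records.
Mat = namedtuple("Mat", ["rows", "cols", "nnz"])

def mat_is_small(mat):
    # small: under 1,000 rows and columns, or under 20,000 nonzeros
    return (mat.rows < 1_000 and mat.cols < 1_000) or mat.nnz < 20_000

def mat_is_large(mat):
    # large: over 1,000,000 rows and columns, or over 20,000,000 nonzeros
    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) or mat.nnz > 20_000_000

assert mat_is_small(Mat(rows=500, cols=500, nnz=50_000))          # small by shape
assert mat_is_large(Mat(rows=2_000_000, cols=2_000_000, nnz=1))   # large by shape
assert not mat_is_small(Mat(rows=5_000, cols=5_000, nnz=50_000))  # neither: "medium"
```
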
lib/dtypes.py (new file): 58 lines added

@@ -0,0 +1,58 @@
"""export a map that is (group, name) -> dtype for all mats"""
import requests
import datetime
import os
import scipy.io
from lib import config
def download_ss_index(path):
    path_dir = path.parent
    path_dir.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as f:
        req = requests.get(config.SS_ROOT_URL + "/files/ss_index.mat")
        f.write(req.content)
def ensure_ss_index(path):
    if not os.path.exists(path):
        download_ss_index(path)
    mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(config.DIR / ".ss_index.mat"))
    if datetime.datetime.utcnow() - mtime > datetime.timedelta(days=90):
        download_ss_index(path)
# download metadata file if missing
local = config.DIR / ".ss_index.mat"
ensure_ss_index(local)
# load metadata and convert to a database
mat = scipy.io.loadmat(config.DIR / ".ss_index.mat", squeeze_me=True)
s = mat["ss_index"].item()
# for i,x in enumerate(s):
# print(i, x)
groups = s[1]
names = s[2]
# 3 letters, first letter:
# r=real, p=binary, c=complex, i=integer
rbtype = s[19]
def dtype_from_rbtype(rbtype):
    if rbtype[0] == "r":
        return "real"
    elif rbtype[0] == "p":
        return "binary"
    elif rbtype[0] == "c":
        return "complex"
    elif rbtype[0] == "i":
        return "integer"
    else:
        raise LookupError
DTYPES = {}
for i in range(len(names)):
    DTYPES[(groups[i], names[i])] = dtype_from_rbtype(rbtype[i])

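Importing `lib.dtypes` therefore downloads the collection index on first use, refreshes it after 90 days, and exposes the `DTYPES` map. A minimal usage sketch; the matrix shown is just an example, and the printed value depends on the live index:

```
from lib import dtypes  # requires $SS_DIR to be set (see load-env.sh)

# Keys are (group, name) pairs; values are "real", "binary", "complex", or "integer".
print(dtypes.DTYPES[("HB", "bcsstk01")])  # expected: real
print(len(dtypes.DTYPES))                 # one entry per matrix in the collection
```
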
lib/matrix.py (new file): 17 lines added

@@ -0,0 +1,17 @@
class Matrix:
    def __init__(self, group, name, dtype, nrows, ncols, nnz):
        self.group = group
        self.name = name
        self.dtype = dtype
        self.nrows = int(nrows)
        self.ncols = int(ncols)
        self.nnz = int(nnz)
    def to_tuple(self):
        return (self.group, self.name, self.dtype, self.nrows, self.ncols, self.nnz)
    def __repr__(self):
        return repr(self.to_tuple())
    def url(self):
        return "/".join(("https://sparse.tamu.edu", "MM", self.group, self.name + ".tar.gz"))

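A short usage sketch for the new `Matrix` class; the field values here are illustrative:

```
from lib.matrix import Matrix

# Illustrative values for a small SuiteSparse matrix.
m = Matrix(group="HB", name="bcsstk01", dtype="real", nrows=48, ncols=48, nnz=400)
print(m)        # ('HB', 'bcsstk01', 'real', 48, 48, 400)
print(m.url())  # https://sparse.tamu.edu/MM/HB/bcsstk01.tar.gz
```
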
list.py

@@ -1,4 +1,4 @@
-import datasets
+from lib import datasets
for ds in datasets.DATASETS:
-    print(f"{ds.name}: {len(ds.mats)} matrices")
+    print(f"{ds.name}: {len(ds.mats)} matrices")

load-env.sh

@@ -2,6 +2,8 @@
host=`hostname`
if [[ "$NERSC_HOST" == cori ]]; then
echo \$NERSC_HOST matched cori
@@ -13,9 +15,20 @@ if [[ "$NERSC_HOST" == cori ]]; then
elif [[ "$NERSC_HOST" == perlmutter ]]; then
echo \$NERSC_HOST matched perlmutter
module load cray-python/3.9.4.1
which python
echo module load cray-python/3.9.13.1
module load cray-python/3.9.13.1
export SS_DIR="$CFS"/m3918/pearson
echo "\$SS_DIR = $SS_DIR"
elif [[ `hostname` =~ ascicgpu030 ]]; then
echo hostname matched ascicgpu030
export SS_DIR="$HOME/suitesparse"
echo "\$SS_DIR = $SS_DIR"
elif [[ `hostname` =~ rzvernal ]]; then
echo hostname matched rzvernal
export SS_DIR="/usr/workspace/cwpears/suitesparse"
echo "\$SS_DIR = $SS_DIR"
fi

poetry.lock (generated, deleted): 136 lines removed

@@ -1,136 +0,0 @@
[[package]]
name = "certifi"
version = "2021.10.8"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "charset-normalizer"
version = "2.0.7"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = false
python-versions = ">=3.5.0"
[package.extras]
unicode_backport = ["unicodedata2"]
[[package]]
name = "colorama"
version = "0.4.4"
description = "Cross-platform colored terminal text."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "idna"
version = "3.3"
description = "Internationalized Domain Names in Applications (IDNA)"
category = "main"
optional = false
python-versions = ">=3.5"
[[package]]
name = "requests"
version = "2.26.0"
description = "Python HTTP for Humans."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
[package.dependencies]
certifi = ">=2017.4.17"
charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""}
idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""}
urllib3 = ">=1.21.1,<1.27"
[package.extras]
socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"]
[[package]]
name = "ssgetpy"
version = "1.0-pre2"
description = ""
category = "main"
optional = false
python-versions = ">3.5.2"
develop = false
[package.dependencies]
requests = ">=2.22"
tqdm = ">=4.41"
[package.source]
type = "git"
url = "https://github.com/cwpearson/ssgetpy.git"
reference = "be00d2a"
resolved_reference = "be00d2ad64c55d32291ba820f3d040524b1c5b0e"
[[package]]
name = "tqdm"
version = "4.62.3"
description = "Fast, Extensible Progress Meter"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
dev = ["py-make (>=0.1.0)", "twine", "wheel"]
notebook = ["ipywidgets (>=6)"]
telegram = ["requests"]
[[package]]
name = "urllib3"
version = "1.26.7"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
[package.extras]
brotli = ["brotlipy (>=0.6.0)"]
secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[metadata]
lock-version = "1.1"
python-versions = "^3.7"
content-hash = "4a624c76d5d28333a13081a3fe5fba3eadcdfc09ac0963d1f1ecd89eb03451aa"
[metadata.files]
certifi = [
{file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"},
{file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"},
]
charset-normalizer = [
{file = "charset-normalizer-2.0.7.tar.gz", hash = "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0"},
{file = "charset_normalizer-2.0.7-py3-none-any.whl", hash = "sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"},
]
colorama = [
{file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
{file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"},
]
idna = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
{file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
]
requests = [
{file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"},
{file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"},
]
ssgetpy = []
tqdm = [
{file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"},
{file = "tqdm-4.62.3.tar.gz", hash = "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"},
]
urllib3 = [
{file = "urllib3-1.26.7-py2.py3-none-any.whl", hash = "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"},
{file = "urllib3-1.26.7.tar.gz", hash = "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece"},
]

pyproject.toml (deleted): 15 lines removed

@@ -1,15 +0,0 @@
[tool.poetry]
name = "ss-downloader"
version = "0.1.0"
description = ""
authors = ["Carl Pearson <cwpears@sandia.gov>"]
[tool.poetry.dependencies]
python = "^3.7"
ssgetpy = {git = "https://github.com/cwpearson/ssgetpy.git", rev = "be00d2a"}
[tool.poetry.dev-dependencies]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

requirements.txt (new file): 2 lines added

@@ -0,0 +1,2 @@
ssgetpy
scipy