add unused.py, refactor common code a bit

Carl Pearson
2021-12-01 08:28:45 -08:00
parent 84bdee85ce
commit 4a09bc2d33
7 changed files with 83 additions and 48 deletions


@@ -19,6 +19,16 @@ To download all datasets
 poetry run python download.py all
 ```
+
+The symlinks are relative, so you can move the datasets. For example:
+```
+rsync -azvh $SCRATCH/ ~/cfm_m3918/pearson
+```
+
+## how to use (unsupported platform)
+
+Set the `SS_DIR` environment variable to the directory where the dataset folders should be generated.
 ## What it does
 Downloads subsets of the suitesparse collection to different directories.
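
For the unsupported-platform path above, a minimal sketch of the workflow (the directory is hypothetical; `download.py all` is the repository's own entry point):
```
export SS_DIR="$HOME/suitesparse-data"   # hypothetical writable directory
poetry run python download.py all       # dataset folders are generated under $SS_DIR
```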


@@ -2,7 +2,9 @@ import os
 from pathlib import Path, PurePath
 import sys
-import datasets
+from lib import config
+from lib import datasets
 
 def ensure_dir(path):
     print("ensure", path)
@@ -13,7 +15,7 @@ def ensure_dir(path):
 def ensure_matrix_download(dir, mat):
     if os.path.exists(dir / mat.name / (mat.name + ".mtx")):
-        print(f"SKIP {mat.name}: already exists")
+        # already downloaded
         return
     mat.download(format='MM', destpath=dir, extract=True)
@@ -29,7 +31,7 @@ def ensure_matrix_link(downDir, linkDir, mat):
     try:
         os.symlink(src, dst)
     except FileExistsError:
-        pass # dir already exists
+        pass # symlink already exists
     return
 
 def download_dataset(dataset):
@@ -37,12 +39,10 @@ def download_dataset(dataset):
     print(len(mats))
 
-    # scratch directory
-    scratchPath = Path(os.environ["SCRATCH"])
     # where matrices will be downloaded
-    downDir = scratchPath / "suitesparse"
+    downDir = config.DIR / "suitesparse"
     # where the matrix will be linked to
-    linkDir = scratchPath / dataset.name
+    linkDir = config.DIR / dataset.name
     ensure_dir(downDir)
     ensure_dir(linkDir)

lib/config.py (new file)

@@ -0,0 +1,9 @@
+import os
+import sys
+from pathlib import Path
+
+try:
+    DIR = Path(os.environ["SS_DIR"])
+except KeyError as e:
+    print("ERROR: $SS_DIR not set")
+    sys.exit(1)
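
Because the lookup runs at import time, any script that imports `lib.config` fails fast when the variable is unset; a hypothetical session:
```
$ unset SS_DIR
$ poetry run python download.py all
ERROR: $SS_DIR not set
```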


@@ -3,7 +3,7 @@ import sys
 import ssgetpy
-import lists
+from lib import lists
 
 Dataset = collections.namedtuple("Dataset", ["name", "mats"])
@@ -15,35 +15,30 @@ def safe_dir_name(s):
     t = t.lower()
     return t
 
-def filter_reject_blacklist(mats):
-    filtered = []
-    for mat in mats:
-        if mat.name in lists.INTEGER_MATS:
-            print(f"BLACKLIST {mat.name}")
-            continue
-        filtered += [mat]
-    return filtered
+def mat_is_integer(mat):
+    return mat.name in lists.INTEGER_MATS
+
+def filter_reject_integer(mats):
+    return [mat for mat in mats if not mat_is_integer(mat)]
+
+def mat_is_small(mat):
+    return (mat.rows < 1_000 and mat.cols < 1_000) \
+        or mat.nnz < 20_000
+
+def mat_is_large(mat):
+    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
+        or mat.nnz > 20_000_000
 
 def filter_reject_large(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows > 1_000_000 or mat.cols > 1_000_000 or mat.nnz > 20_000_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_large(mat)]
 
 def filter_reject_small(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows < 1_000 or mat.cols < 1_000 or mat.nnz < 20_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_small(mat)]
 
 ## all real-valued matrices
 REAL_MATS = Dataset(
     name = "reals",
-    mats = filter_reject_blacklist(ssgetpy.search(
+    mats = filter_reject_integer(ssgetpy.search(
         dtype='real',
         limit=1_000_000
     ))
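
The refactor also tightens the size tests from `or` to `and` on the dimensions: a matrix is now small (or large) only when both its row and column counts qualify, or when its nonzero count does on its own. A minimal sketch of the new predicate, using a hypothetical stand-in for ssgetpy's matrix records:
```python
import collections

# hypothetical stand-in for an ssgetpy matrix entry
Mat = collections.namedtuple("Mat", ["name", "rows", "cols", "nnz"])

def mat_is_small(mat):
    return (mat.rows < 1_000 and mat.cols < 1_000) \
        or mat.nnz < 20_000

# tall, skinny matrix: one small dimension alone no longer makes it "small"
m = Mat(name="example", rows=5_000, cols=100, nnz=50_000)
print(mat_is_small(m))  # False; the old or-based filter would have dropped it
```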
@@ -61,10 +56,7 @@ kinds = [
     "Theoretical/Quantum Chemistry Problem",
     "Thermal Problem",
 ]
-REGULAR_REAL_MATS = Dataset(
-    name = "regular_reals",
-    mats = []
-)
 mats = []
 for kind in kinds:
     mats += ssgetpy.search(
@@ -74,7 +66,7 @@ for kind in kinds:
     )
 REGULAR_REAL_MATS = Dataset(
     name="regular_reals",
-    mats = filter_reject_blacklist(mats)
+    mats = filter_reject_integer(mats)
 )
 
 ## keep "small" matrices
@@ -97,12 +89,6 @@ REAL_MED_MATS = Dataset (
     mats = filter_reject_large(filter_reject_small(REAL_MATS.mats))
 )
 
 ## export all datasets
 DATASETS = [
     REAL_MATS,
@@ -115,26 +101,24 @@ DATASETS = [
 def get_kinds():
     """return set of unique kind fields"""
     mats = ssgetpy.search(
         limit=1_000_000
     )
     kinds = set()
     for mat in mats:
         kinds.add(mat.kind)
-    print(f"kinds: {kinds}")
     return kinds
 
 for kind in get_kinds():
     d = Dataset(
         name = "kind_"+safe_dir_name(kind),
-        mats = filter_reject_blacklist(ssgetpy.search(
+        mats = filter_reject_large( \
+            filter_reject_small( \
+            filter_reject_integer(ssgetpy.search(
             kind=kind,
             dtype='real',
             limit=1_000_000
-        ))
+        ))))
     )
     if len(d.mats) > 0:
         DATASETS += [d]


@@ -4,14 +4,18 @@ host=`hostname`
 if [[ "$NERSC_HOST" == cori ]]; then
     echo \$NERSC_HOST matched cori
     module load cray-python/3.8.5.0
     which python
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 elif [[ "$NERSC_HOST" == perlmutter ]]; then
     echo \$NERSC_HOST matched perlmutter
     module load cray-python/3.9.4.1
     which python
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 fi

unused.py (new file)

@@ -0,0 +1,28 @@
+"""list unused data in suitesparse"""
+
+import os
+from pathlib import Path, PurePath
+import sys
+
+from lib import config
+from lib import datasets
+
+used = set()
+for dataset in datasets.DATASETS:
+    # check if dataset directory exists
+    if not os.path.isdir(config.DIR / dataset.name):
+        continue
+    for f in os.listdir(config.DIR / dataset.name):
+        if f.endswith(".mtx"):
+            used.add(f[:-4])
+
+for f in os.listdir(config.DIR / "suitesparse"):
+    if f not in used:
+        print(os.path.abspath(config.DIR / "suitesparse" / f))
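
A hypothetical invocation, assuming the same environment the downloader uses:
```
export SS_DIR=/path/to/data    # same directory the datasets were downloaded into
poetry run python unused.py    # prints absolute paths of entries under
                               # $SS_DIR/suitesparse that no dataset links to
```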