From 4a09bc2d3327d2665f96f8010f33902ed5b00dcb Mon Sep 17 00:00:00 2001
From: Carl Pearson
Date: Wed, 1 Dec 2021 08:28:45 -0800
Subject: [PATCH] add unused.py, refactor common code a bit

---
 README.md                      | 10 ++++++
 download.py                    | 14 ++++----
 lib/config.py                  |  9 +++++
 datasets.py => lib/datasets.py | 62 +++++++++++++---------------------
 lists.py => lib/lists.py       |  0
 load-env.sh                    |  8 +++--
 unused.py                      | 28 +++++++++++++++
 7 files changed, 83 insertions(+), 48 deletions(-)
 create mode 100644 lib/config.py
 rename datasets.py => lib/datasets.py (70%)
 rename lists.py => lib/lists.py (100%)
 create mode 100644 unused.py

diff --git a/README.md b/README.md
index 4148540..be8c93d 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,16 @@ To download all datasets
 poetry run python download.py all
 ```
 
+The dataset directories contain relative symlinks, so they can be moved wholesale. For example:
+
+```
+rsync -azvh $SCRATCH/ ~/cfm_m3918/pearson
+```
+
+## how to use (unsupported platform)
+
+Set the `SS_DIR` environment variable to the directory where the dataset folders should be generated.
+
 ## What it does
 
 Downloads subsets of the suitesparse collection to different directories.
diff --git a/download.py b/download.py
index ce2b8f1..c87399f 100755
--- a/download.py
+++ b/download.py
@@ -2,7 +2,9 @@ import os
 from pathlib import Path, PurePath
 import sys
 
-import datasets
+from lib import config
+from lib import datasets
+
 
 def ensure_dir(path):
     print("ensure", path)
@@ -13,7 +15,7 @@ def ensure_dir(path):
 
 def ensure_matrix_download(dir, mat):
     if os.path.exists(dir / mat.name / (mat.name + ".mtx")):
-        print(f"SKIP {mat.name}: already exists")
+        # already downloaded
         return
     mat.download(format='MM', destpath=dir, extract=True)
@@ -29,7 +31,7 @@ def ensure_matrix_link(downDir, linkDir, mat):
     try:
         os.symlink(src, dst)
     except FileExistsError:
-        pass # dir already exists
+        pass # symlink already exists
     return
 
 def download_dataset(dataset):
@@ -37,12 +39,10 @@ def download_dataset(dataset):
 
     print(len(mats))
 
-    # scratch directory
-    scratchPath = Path(os.environ["SCRATCH"])
     # where matrices will be downloaded
-    downDir = scratchPath / "suitesparse"
+    downDir = config.DIR / "suitesparse"
     # where the matrix will be linked to
-    linkDir = scratchPath / dataset.name
+    linkDir = config.DIR / dataset.name
 
     ensure_dir(downDir)
     ensure_dir(linkDir)
diff --git a/lib/config.py b/lib/config.py
new file mode 100644
index 0000000..e00fdb9
--- /dev/null
+++ b/lib/config.py
@@ -0,0 +1,9 @@
+import os
+import sys
+from pathlib import Path
+
+try:
+    DIR = Path(os.environ["SS_DIR"])
+except KeyError:
+    print("ERROR: $SS_DIR not set", file=sys.stderr)
+    sys.exit(1)
diff --git a/datasets.py b/lib/datasets.py
similarity index 70%
rename from datasets.py
rename to lib/datasets.py
index 805a720..fb39ad1 100644
--- a/datasets.py
+++ b/lib/datasets.py
@@ -3,7 +3,7 @@ import sys
 
 import ssgetpy
 
-import lists
+from lib import lists
 
 Dataset = collections.namedtuple("Dataset", ["name", "mats"])
 
@@ -15,35 +15,30 @@ def safe_dir_name(s):
     t = t.lower()
     return t
 
-def filter_reject_blacklist(mats):
-    filtered = []
-    for mat in mats:
-        if mat.name in lists.INTEGER_MATS:
-            print(f"BLACKLIST {mat.name}")
-            continue
-        filtered += [mat]
-    return filtered
+def mat_is_integer(mat):
+    return mat.name in lists.INTEGER_MATS
+
+def filter_reject_integer(mats):
+    return [mat for mat in mats if not mat_is_integer(mat)]
+
+def mat_is_small(mat):
+    return (mat.rows < 1_000 and mat.cols < 1_000) \
+        or mat.nnz < 20_000
+
+def mat_is_large(mat):
+    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
+        or mat.nnz > 20_000_000
 
 def filter_reject_large(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows > 1_000_000 or mat.cols > 1_000_000 or mat.nnz > 20_000_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_large(mat)]
 
 def filter_reject_small(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows < 1_000 or mat.cols < 1_000 or mat.nnz < 20_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_small(mat)]
 
 ## all real-valued matrices
 REAL_MATS = Dataset(
     name = "reals",
-    mats = filter_reject_blacklist(ssgetpy.search(
+    mats = filter_reject_integer(ssgetpy.search(
         dtype='real',
         limit=1_000_000
     ))
@@ -61,10 +56,7 @@ kinds = [
     "Theoretical/Quantum Chemistry Problem",
     "Thermal Problem",
 ]
-REGULAR_REAL_MATS = Dataset(
-    name = "regular_reals",
-    mats = []
-)
+
 mats = []
 for kind in kinds:
     mats += ssgetpy.search(
@@ -74,7 +66,7 @@ for kind in kinds:
 )
 REGULAR_REAL_MATS = Dataset(
     name="regular_reals",
-    mats = filter_reject_blacklist(mats)
+    mats = filter_reject_integer(mats)
 )
 
 ## keep "small" matrices
@@ -97,12 +89,6 @@ REAL_MED_MATS = Dataset (
     mats = filter_reject_large(filter_reject_small(REAL_MATS.mats))
 )
 
-
-
-
-
-
-
 ## export all datasets
 DATASETS = [
     REAL_MATS,
@@ -115,26 +101,24 @@ DATASETS = [
 
 def get_kinds():
     """return set of unique kind fields"""
-
     mats = ssgetpy.search(
         limit=1_000_000
     )
-
     kinds = set()
     for mat in mats:
         kinds.add(mat.kind)
-    print(f"kinds: {kinds}")
-
     return kinds
 
 for kind in get_kinds():
     d = Dataset(
         name = "kind_"+safe_dir_name(kind),
-        mats = filter_reject_blacklist(ssgetpy.search(
+        mats = filter_reject_large(
+            filter_reject_small(
+            filter_reject_integer(ssgetpy.search(
             kind=kind,
             dtype='real',
             limit=1_000_000
-        ))
+        ))))
     )
     if len(d.mats) > 0:
         DATASETS += [d]
diff --git a/lists.py b/lib/lists.py
similarity index 100%
rename from lists.py
rename to lib/lists.py
diff --git a/load-env.sh b/load-env.sh
index b03d67d..7a12acd 100644
--- a/load-env.sh
+++ b/load-env.sh
@@ -4,14 +4,18 @@ host=`hostname`
 
 if [[ "$NERSC_HOST" == cori ]]; then
     echo \$NERSC_HOST matched cori
-    module load cray-python/3.8.5.0
+    module load cray-python/3.8.5.0
 
     which python
+
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 elif [[ "$NERSC_HOST" == perlmutter ]]; then
     echo \$NERSC_HOST matched perlmutter
 
     module load cray-python/3.9.4.1
-    which python
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 fi
diff --git a/unused.py b/unused.py
new file mode 100644
index 0000000..ae68a08
--- /dev/null
+++ b/unused.py
@@ -0,0 +1,28 @@
+"""list unused data in suitesparse"""
+
+import os
+from pathlib import Path, PurePath
+import sys
+
+from lib import config
+from lib import datasets
+
+used = set()
+
+for dataset in datasets.DATASETS:
+
+    # skip datasets whose directory was never generated
+    if not os.path.isdir(config.DIR / dataset.name):
+        continue
+
+    # record every matrix name this dataset links to
+    for f in os.listdir(config.DIR / dataset.name):
+        if f.endswith(".mtx"):
+            used.add(f[:-4])
+
+# anything downloaded but not referenced by a dataset is unused
+for f in os.listdir(config.DIR / "suitesparse"):
+    if f not in used:
+        print(os.path.abspath(config.DIR / "suitesparse" / f))
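
A note for reviewers, not part of the patch: the refactor moves all path
configuration behind lib/config.py, which reads $SS_DIR once at import time.
Below is a minimal sketch of driving the refactored modules, assuming the lib
layout from this patch ("/tmp/ss" is only a stand-in path; on NERSC machines
load-env.sh exports the real one):

    import os
    os.environ.setdefault("SS_DIR", "/tmp/ss")  # stand-in; must be set before importing lib.config

    from lib import config, datasets  # lib.config exits early if $SS_DIR is unset

    # show each generated dataset and how many matrices it selected
    for ds in datasets.DATASETS:
        print(ds.name, len(ds.mats))

Since unused.py prints absolute paths of downloaded matrices that no dataset
references, its output could, for example, be piped to a cleanup command:
poetry run python unused.py | xargs rm -r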