add unused.py, refactor common code a bit

Carl Pearson
2021-12-01 08:28:45 -08:00
parent 84bdee85ce
commit 4a09bc2d33
7 changed files with 83 additions and 48 deletions

View File

@@ -19,6 +19,16 @@ To download all datasets
```
poetry run python download.py all
```
The dataset folders use relative symlinks, so the whole tree can be moved or copied. For example:
```
rsync -azvh $SCRATCH/ ~/cfm_m3918/pearson
```
## How to use (unsupported platform)
Set the `SS_DIR` environment variable to the directory where the dataset folders should be generated.
## What it does
Downloads subsets of the SuiteSparse collection into per-dataset directories.

View File

@@ -2,7 +2,9 @@ import os
from pathlib import Path, PurePath
import sys
-import datasets
+from lib import config
+from lib import datasets
def ensure_dir(path):
    print("ensure", path)
@@ -13,7 +15,7 @@ def ensure_dir(path):
def ensure_matrix_download(dir, mat):
    if os.path.exists(dir / mat.name / (mat.name + ".mtx")):
-        # already downloaded
+        print(f"SKIP {mat.name}: already exists")
        return
    mat.download(format='MM', destpath=dir, extract=True)
@@ -29,7 +31,7 @@ def ensure_matrix_link(downDir, linkDir, mat):
    try:
        os.symlink(src, dst)
    except FileExistsError:
-        pass # dir already exists
+        pass # symlink already exists
    return
def download_dataset(dataset):
@@ -37,12 +39,10 @@ def download_dataset(dataset):
    print(len(mats))

-    # scratch directory
-    scratchPath = Path(os.environ["SCRATCH"])
    # where matrices will be downloaded
-    downDir = scratchPath / "suitesparse"
+    downDir = config.DIR / "suitesparse"
    # where the matrix will be linked to
-    linkDir = scratchPath / dataset.name
+    linkDir = config.DIR / dataset.name
    ensure_dir(downDir)
    ensure_dir(linkDir)
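The README's claim that the tree can be moved depends on these symlinks being relative. The hunk doesn't show how `src` and `dst` are computed, so the sketch below is an assumption based on the README, and `ensure_relative_link` is a hypothetical helper, not the repo's code:

```python
import os
from pathlib import Path

def ensure_relative_link(downDir: Path, linkDir: Path, name: str):
    """Hypothetical: link linkDir/name -> downDir/name with a relative
    target, so an rsync of the whole tree keeps the links valid."""
    dst = linkDir / name
    # relative target, e.g. ../suitesparse/<name>
    rel = os.path.relpath(downDir / name, start=linkDir)
    try:
        os.symlink(rel, dst)
    except FileExistsError:
        pass  # symlink already exists
```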

lib/config.py Normal file (+9)
View File

@@ -0,0 +1,9 @@
import os
import sys
from pathlib import Path

try:
    DIR = Path(os.environ["SS_DIR"])
except KeyError as e:
    print("ERROR: $SS_DIR not set")
    sys.exit(1)
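Everything else in the repo can then resolve paths against this one root. A minimal usage sketch (the example paths are illustrative):

```python
# requires SS_DIR in the environment, e.g. SS_DIR=/tmp/ss
from lib import config

print(config.DIR / "suitesparse")  # download location used by download.py
print(config.DIR / "reals")        # where a dataset's symlinks land
```

Failing at import time with `sys.exit(1)` puts every entry point behind the same early error when `SS_DIR` is unset.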

View File

@@ -3,7 +3,7 @@ import sys
import ssgetpy
-import lists
+from lib import lists
Dataset = collections.namedtuple("Dataset", ["name", "mats"])
@@ -15,35 +15,30 @@ def safe_dir_name(s):
    t = t.lower()
    return t
-def filter_reject_blacklist(mats):
-    filtered = []
-    for mat in mats:
-        if mat.name in lists.INTEGER_MATS:
-            print(f"BLACKLIST {mat.name}")
-            continue
-        filtered += [mat]
-    return filtered
+def mat_is_integer(mat):
+    return mat.name in lists.INTEGER_MATS
+
+def filter_reject_integer(mats):
+    return [mat for mat in mats if not mat_is_integer(mat)]
+
+def mat_is_small(mat):
+    return (mat.rows < 1_000 and mat.cols < 1_000) \
+        or mat.nnz < 20_000
+
+def mat_is_large(mat):
+    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
+        or mat.nnz > 20_000_000

 def filter_reject_large(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows > 1_000_000 or mat.cols > 1_000_000 or mat.nnz > 20_000_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_large(mat)]

 def filter_reject_small(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows < 1_000 or mat.cols < 1_000 or mat.nnz < 20_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_small(mat)]
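To see the refactored predicates in action, here is a quick illustrative check with stand-in matrix records; the `Mat` namedtuple is a mock, not part of the repo, and the import path is an assumption (note that importing `lib.datasets` also runs the `ssgetpy` queries at import time):

```python
import collections
from lib.datasets import filter_reject_large, filter_reject_small  # assumed path

# mock carrying only the fields the predicates read
Mat = collections.namedtuple("Mat", ["name", "rows", "cols", "nnz"])

mats = [
    Mat("tiny",   100,       100,       5_000),       # small: rows/cols < 1_000
    Mat("medium", 50_000,    50_000,    1_000_000),   # passes both filters
    Mat("huge",   2_000_000, 2_000_000, 50_000_000),  # large: nnz > 20_000_000
]

print([m.name for m in filter_reject_large(filter_reject_small(mats))])
# -> ['medium']
```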
## all real-valued matrices
REAL_MATS = Dataset(
    name = "reals",
-    mats = filter_reject_blacklist(ssgetpy.search(
+    mats = filter_reject_integer(ssgetpy.search(
        dtype='real',
        limit=1_000_000
    ))
@@ -61,10 +56,7 @@ kinds = [
"Theoretical/Quantum Chemistry Problem",
"Thermal Problem",
]
REGULAR_REAL_MATS = Dataset(
name = "regular_reals",
mats = []
)
mats = []
for kind in kinds:
mats += ssgetpy.search(
@@ -74,7 +66,7 @@ for kind in kinds:
    )

REGULAR_REAL_MATS = Dataset(
    name="regular_reals",
-    mats = filter_reject_blacklist(mats)
+    mats = filter_reject_integer(mats)
)
## keep "small" matrices
@@ -97,12 +89,6 @@ REAL_MED_MATS = Dataset (
    mats = filter_reject_large(filter_reject_small(REAL_MATS.mats))
)
## export all datasets
DATASETS = [
    REAL_MATS,
@@ -115,26 +101,24 @@ DATASETS = [
def get_kinds():
    """return set of unique kind fields"""
    mats = ssgetpy.search(
        limit=1_000_000
    )
    kinds = set()
    for mat in mats:
        kinds.add(mat.kind)
    print(f"kinds: {kinds}")
    return kinds
for kind in get_kinds():
    d = Dataset(
        name = "kind_"+safe_dir_name(kind),
-        mats = filter_reject_blacklist(ssgetpy.search(
+        mats = filter_reject_large( \
+            filter_reject_small( \
+            filter_reject_integer(ssgetpy.search(
            kind=kind,
            dtype='real',
            limit=1_000_000
-        ))
+        ))))
    )
    if len(d.mats) > 0:
        DATASETS += [d]
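For a sense of how these per-kind datasets are consumed downstream, a hedged sketch; the loop below is illustrative, not the repo's actual driver:

```python
from lib import datasets

# report the per-kind datasets the loop above registered
for d in datasets.DATASETS:
    if d.name.startswith("kind_"):
        print(f"{d.name}: {len(d.mats)} matrices")
```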

View File

@@ -4,14 +4,18 @@ host=`hostname`
if [[ "$NERSC_HOST" == cori ]]; then
echo \$NERSC_HOST matched cori
module load cray-python/3.8.5.0
module load cray-python/3.8.5.0
which python
export SS_DIR="$CFS"/m3918/pearson
echo "\$SS_DIR = $SS_DIR"
elif [[ "$NERSC_HOST" == perlmutter ]]; then
echo \$NERSC_HOST matched perlmutter
module load cray-python/3.9.4.1
which python
export SS_DIR="$CFS"/m3918/pearson
echo "\$SS_DIR = $SS_DIR"
fi

unused.py Normal file (+28)
View File

@@ -0,0 +1,28 @@
"""list unused data in suitesparse"""
import os
from pathlib import Path, PurePath
import sys
from lib import config
from lib import datasets
used = set()
for dataset in datasets.DATASETS:
# check if dataset directory exists
if not os.path.isdir(config.DIR / dataset.name):
continue
for f in os.listdir(config.DIR / dataset.name):
if f.endswith(".mtx"):
used.add(f[:-4])
for f in os.listdir(config.DIR / "suitesparse"):
if f not in used:
print(os.path.abspath(config.DIR / "suitesparse" / f))
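The scan above amounts to a set difference between what was downloaded and what any dataset links to. A compact, purely illustrative restatement under the same assumptions:

```python
import os
from lib import config
from lib import datasets

# names linked from any existing dataset directory
used = {
    f[:-4]                                    # strip ".mtx"
    for d in datasets.DATASETS
    if os.path.isdir(config.DIR / d.name)
    for f in os.listdir(config.DIR / d.name)
    if f.endswith(".mtx")
}

# downloaded but never linked by any dataset
for name in sorted(set(os.listdir(config.DIR / "suitesparse")) - used):
    print(config.DIR / "suitesparse" / name)
```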