From 4a09bc2d3327d2665f96f8010f33902ed5b00dcb Mon Sep 17 00:00:00 2001
From: Carl Pearson
Date: Wed, 1 Dec 2021 08:28:45 -0800
Subject: [PATCH] add unused.py, refactor common code a bit

---
 README.md                      | 10 ++++++
 download.py                    | 14 ++++----
 lib/config.py                  |  9 +++++
 datasets.py => lib/datasets.py | 62 +++++++++++++---------------------
 lists.py => lib/lists.py       |  0
 load-env.sh                    |  8 +++--
 unused.py                      | 28 +++++++++++++++
 7 files changed, 83 insertions(+), 48 deletions(-)
 create mode 100644 lib/config.py
 rename datasets.py => lib/datasets.py (70%)
 rename lists.py => lib/lists.py (100%)
 create mode 100644 unused.py

diff --git a/README.md b/README.md
index 4148540..be8c93d 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,16 @@ To download all datasets
 poetry run python download.py all
 ```
 
+The dataset directories contain relative symlinks, so they can be moved wholesale. For example:
+
+```
+rsync -azvh $SCRATCH/ ~/cfm_m3918/pearson
+```
+
+## how to use (unsupported platform)
+
+Set the `SS_DIR` environment variable to the directory where the dataset folders should be generated.
+
 ## What it does
 
 Downloads subsets of the suitesparse collection to different directories.
diff --git a/download.py b/download.py
index ce2b8f1..c87399f 100755
--- a/download.py
+++ b/download.py
@@ -2,7 +2,9 @@ import os
 from pathlib import Path, PurePath
 import sys
 
-import datasets
+from lib import config
+from lib import datasets
+
 
 def ensure_dir(path):
     print("ensure", path)
@@ -13,7 +15,7 @@ def ensure_dir(path):
 
 def ensure_matrix_download(dir, mat):
     if os.path.exists(dir / mat.name / (mat.name + ".mtx")):
-        print(f"SKIP {mat.name}: already exists")
+        # already downloaded
         return
     mat.download(format='MM', destpath=dir, extract=True)
@@ -29,7 +31,7 @@ def ensure_matrix_link(downDir, linkDir, mat):
     try:
         os.symlink(src, dst)
     except FileExistsError:
-        pass # dir already exists
+        pass # symlink already exists
     return
 
 def download_dataset(dataset):
@@ -37,12 +39,10 @@ def download_dataset(dataset):
 
     print(len(mats))
 
-    # scratch directory
-    scratchPath = Path(os.environ["SCRATCH"])
     # where matrices will be downloaded
-    downDir = scratchPath / "suitesparse"
+    downDir = config.DIR / "suitesparse"
     # where the matrix will be linked to
-    linkDir = scratchPath / dataset.name
+    linkDir = config.DIR / dataset.name
 
     ensure_dir(downDir)
     ensure_dir(linkDir)
diff --git a/lib/config.py b/lib/config.py
new file mode 100644
index 0000000..e00fdb9
--- /dev/null
+++ b/lib/config.py
@@ -0,0 +1,9 @@
+import os
+import sys
+from pathlib import Path
+
+try:
+    DIR = Path(os.environ["SS_DIR"])
+except KeyError:
+    print("ERROR: $SS_DIR not set", file=sys.stderr)
+    sys.exit(1)
diff --git a/datasets.py b/lib/datasets.py
similarity index 70%
rename from datasets.py
rename to lib/datasets.py
index 805a720..fb39ad1 100644
--- a/datasets.py
+++ b/lib/datasets.py
@@ -3,7 +3,7 @@ import sys
 
 import ssgetpy
 
-import lists
+from lib import lists
 
 Dataset = collections.namedtuple("Dataset", ["name", "mats"])
 
@@ -15,35 +15,30 @@ def safe_dir_name(s):
     t = t.lower()
     return t
 
-def filter_reject_blacklist(mats):
-    filtered = []
-    for mat in mats:
-        if mat.name in lists.INTEGER_MATS:
-            print(f"BLACKLIST {mat.name}")
-            continue
-        filtered += [mat]
-    return filtered
+def mat_is_integer(mat):
+    return mat.name in lists.INTEGER_MATS
+
+def filter_reject_integer(mats):
+    return [mat for mat in mats if not mat_is_integer(mat)]
+
+def mat_is_small(mat):
+    return (mat.rows < 1_000 and mat.cols < 1_000) \
+        or mat.nnz < 20_000
+
+def mat_is_large(mat):
+    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
+        or mat.nnz > 20_000_000
 
 def filter_reject_large(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows > 1_000_000 or mat.cols > 1_000_000 or mat.nnz > 20_000_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_large(mat)]
 
 def filter_reject_small(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows < 1_000 or mat.cols < 1_000 or mat.nnz < 20_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_small(mat)]
 
 ## all real-valued matrices
 REAL_MATS = Dataset(
     name = "reals",
-    mats = filter_reject_blacklist(ssgetpy.search(
+    mats = filter_reject_integer(ssgetpy.search(
         dtype='real',
         limit=1_000_000
     ))
@@ -61,10 +56,7 @@ kinds = [
     "Theoretical/Quantum Chemistry Problem",
     "Thermal Problem",
 ]
-REGULAR_REAL_MATS = Dataset(
-    name = "regular_reals",
-    mats = []
-)
+
 mats = []
 for kind in kinds:
     mats += ssgetpy.search(
@@ -74,7 +66,7 @@ for kind in kinds:
 )
 REGULAR_REAL_MATS = Dataset(
     name="regular_reals",
-    mats = filter_reject_blacklist(mats)
+    mats = filter_reject_integer(mats)
 )
 
 ## keep "small" matrices
@@ -97,12 +89,6 @@ REAL_MED_MATS = Dataset (
     mats = filter_reject_large(filter_reject_small(REAL_MATS.mats))
 )
 
-
-
-
-
-
-
 ## export all datasets
 DATASETS = [
     REAL_MATS,
@@ -115,26 +101,24 @@ DATASETS = [
 
 def get_kinds():
     """return set of unique kind fields"""
-
     mats = ssgetpy.search(
         limit=1_000_000
     )
-
     kinds = set()
     for mat in mats:
         kinds.add(mat.kind)
-    print(f"kinds: {kinds}")
-
     return kinds
 
 for kind in get_kinds():
     d = Dataset(
         name = "kind_"+safe_dir_name(kind),
-        mats = filter_reject_blacklist(ssgetpy.search(
+        mats = filter_reject_large(
+            filter_reject_small(
+            filter_reject_integer(ssgetpy.search(
             kind=kind,
             dtype='real',
             limit=1_000_000
-        ))
+        ))))
     )
     if len(d.mats) > 0:
         DATASETS += [d]
diff --git a/lists.py b/lib/lists.py
similarity index 100%
rename from lists.py
rename to lib/lists.py
diff --git a/load-env.sh b/load-env.sh
index b03d67d..7a12acd 100644
--- a/load-env.sh
+++ b/load-env.sh
@@ -4,14 +4,18 @@ host=`hostname`
 
 if [[ "$NERSC_HOST" == cori ]]; then
     echo \$NERSC_HOST matched cori
-    module load cray-python/3.8.5.0
+    module load cray-python/3.8.5.0
 
     which python
+
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 elif [[ "$NERSC_HOST" == perlmutter ]]; then
     echo \$NERSC_HOST matched perlmutter
 
     module load cray-python/3.9.4.1
-    which python
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 fi
diff --git a/unused.py b/unused.py
new file mode 100644
index 0000000..ae68a08
--- /dev/null
+++ b/unused.py
@@ -0,0 +1,28 @@
+"""list unused data in suitesparse"""
+
+import os
+from pathlib import Path, PurePath
+import sys
+
+from lib import config
+from lib import datasets
+
+used = set()
+
+for dataset in datasets.DATASETS:
+
+    # skip datasets whose directory was never generated
+    if not os.path.isdir(config.DIR / dataset.name):
+        continue
+
+    # record every matrix name this dataset links to
+    for f in os.listdir(config.DIR / dataset.name):
+        if f.endswith(".mtx"):
+            used.add(f[:-4])
+
+# anything downloaded but not referenced by a dataset is unused
+for f in os.listdir(config.DIR / "suitesparse"):
+    if f not in used:
+        print(os.path.abspath(config.DIR / "suitesparse" / f))
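
A note for reviewers, not part of the patch: the refactor moves all path
configuration behind lib/config.py, which reads $SS_DIR once at import time.
Below is a minimal sketch of driving the refactored modules, assuming the lib
layout from this patch ("/tmp/ss" is only a stand-in path; on NERSC machines
load-env.sh exports the real one):

    import os
    os.environ.setdefault("SS_DIR", "/tmp/ss")  # stand-in; must be set before importing lib.config

    from lib import config, datasets  # lib.config exits early if $SS_DIR is unset

    # show each generated dataset and how many matrices it selected
    for ds in datasets.DATASETS:
        print(ds.name, len(ds.mats))

Since unused.py prints absolute paths of downloaded matrices that no dataset
references, its output could, for example, be piped to a cleanup command:
poetry run python unused.py | xargs rm -r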