add unused.py, refactor common code a bit

Carl Pearson
2021-12-01 08:28:45 -08:00
parent 84bdee85ce
commit 4a09bc2d33
7 changed files with 83 additions and 48 deletions


@@ -19,6 +19,16 @@ To download all datasets
 poetry run python download.py all
 ```
+
+The symlinks are relative, so you can move the datasets. For example:
+```
+rsync -azvh $SCRATCH/ ~/cfm_m3918/pearson
+```
+
+## how to use (unsupported platform)
+
+Set the `SS_DIR` environment variable to the directory where the dataset folders should be generated.
 ## What it does
 Downloads subsets of the suitesparse collection to different directories.
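
For the unsupported-platform path above, a minimal sketch of the workflow (the directory is hypothetical; `download.py all` is the repository's own entry point):
```
export SS_DIR="$HOME/suitesparse-data"   # hypothetical writable directory
poetry run python download.py all       # dataset folders are generated under $SS_DIR
```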


@@ -2,7 +2,9 @@ import os
 from pathlib import Path, PurePath
 import sys
-import datasets
+from lib import config
+from lib import datasets
 
 def ensure_dir(path):
     print("ensure", path)
@@ -13,7 +15,7 @@ def ensure_dir(path):
 def ensure_matrix_download(dir, mat):
     if os.path.exists(dir / mat.name / (mat.name + ".mtx")):
-        print(f"SKIP {mat.name}: already exists")
+        # already downloaded
         return
     mat.download(format='MM', destpath=dir, extract=True)
@@ -29,7 +31,7 @@ def ensure_matrix_link(downDir, linkDir, mat):
     try:
         os.symlink(src, dst)
     except FileExistsError:
-        pass # dir already exists
+        pass # symlink already exists
     return
 
 def download_dataset(dataset):
@@ -37,12 +39,10 @@ def download_dataset(dataset):
     print(len(mats))
 
-    # scratch directory
-    scratchPath = Path(os.environ["SCRATCH"])
     # where matrices will be downloaded
-    downDir = scratchPath / "suitesparse"
+    downDir = config.DIR / "suitesparse"
     # where the matrix will be linked to
-    linkDir = scratchPath / dataset.name
+    linkDir = config.DIR / dataset.name
     ensure_dir(downDir)
     ensure_dir(linkDir)

lib/config.py (new file)

@@ -0,0 +1,9 @@
+import os
+import sys
+from pathlib import Path
+
+try:
+    DIR = Path(os.environ["SS_DIR"])
+except KeyError as e:
+    print("ERROR: $SS_DIR not set")
+    sys.exit(1)
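
Because the lookup runs at import time, any script that imports `lib.config` fails fast when the variable is unset; a hypothetical session:
```
$ unset SS_DIR
$ poetry run python download.py all
ERROR: $SS_DIR not set
```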


@@ -3,7 +3,7 @@ import sys
 import ssgetpy
-import lists
+from lib import lists
 
 Dataset = collections.namedtuple("Dataset", ["name", "mats"])
@@ -15,35 +15,30 @@ def safe_dir_name(s):
     t = t.lower()
     return t
 
-def filter_reject_blacklist(mats):
-    filtered = []
-    for mat in mats:
-        if mat.name in lists.INTEGER_MATS:
-            print(f"BLACKLIST {mat.name}")
-            continue
-        filtered += [mat]
-    return filtered
+def mat_is_integer(mat):
+    return mat.name in lists.INTEGER_MATS
+
+def filter_reject_integer(mats):
+    return [mat for mat in mats if not mat_is_integer(mat)]
+
+def mat_is_small(mat):
+    return (mat.rows < 1_000 and mat.cols < 1_000) \
+        or mat.nnz < 20_000
+
+def mat_is_large(mat):
+    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
+        or mat.nnz > 20_000_000
 
 def filter_reject_large(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows > 1_000_000 or mat.cols > 1_000_000 or mat.nnz > 20_000_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_large(mat)]
 
 def filter_reject_small(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows < 1_000 or mat.cols < 1_000 or mat.nnz < 20_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_small(mat)]
 
 ## all real-valued matrices
 REAL_MATS = Dataset(
     name = "reals",
-    mats = filter_reject_blacklist(ssgetpy.search(
+    mats = filter_reject_integer(ssgetpy.search(
         dtype='real',
         limit=1_000_000
     ))
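
The refactor also tightens the size tests from `or` to `and` on the dimensions: a matrix is now small (or large) only when both its row and column counts qualify, or when its nonzero count does on its own. A minimal sketch of the new predicate, using a hypothetical stand-in for ssgetpy's matrix records:
```python
import collections

# hypothetical stand-in for an ssgetpy matrix entry
Mat = collections.namedtuple("Mat", ["name", "rows", "cols", "nnz"])

def mat_is_small(mat):
    return (mat.rows < 1_000 and mat.cols < 1_000) \
        or mat.nnz < 20_000

# tall, skinny matrix: one small dimension alone no longer makes it "small"
m = Mat(name="example", rows=5_000, cols=100, nnz=50_000)
print(mat_is_small(m))  # False; the old or-based filter would have dropped it
```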
@@ -61,10 +56,7 @@ kinds = [
     "Theoretical/Quantum Chemistry Problem",
     "Thermal Problem",
 ]
-REGULAR_REAL_MATS = Dataset(
-    name = "regular_reals",
-    mats = []
-)
 mats = []
 for kind in kinds:
     mats += ssgetpy.search(
@@ -74,7 +66,7 @@ for kind in kinds:
     )
 REGULAR_REAL_MATS = Dataset(
     name="regular_reals",
-    mats = filter_reject_blacklist(mats)
+    mats = filter_reject_integer(mats)
 )
 
 ## keep "small" matrices
@@ -97,12 +89,6 @@ REAL_MED_MATS = Dataset (
     mats = filter_reject_large(filter_reject_small(REAL_MATS.mats))
 )
 
 ## export all datasets
 DATASETS = [
     REAL_MATS,
@@ -115,26 +101,24 @@ DATASETS = [
 def get_kinds():
     """return set of unique kind fields"""
     mats = ssgetpy.search(
         limit=1_000_000
     )
     kinds = set()
     for mat in mats:
         kinds.add(mat.kind)
-    print(f"kinds: {kinds}")
     return kinds
 
 for kind in get_kinds():
     d = Dataset(
         name = "kind_"+safe_dir_name(kind),
-        mats = filter_reject_blacklist(ssgetpy.search(
+        mats = filter_reject_large( \
+            filter_reject_small( \
+            filter_reject_integer(ssgetpy.search(
             kind=kind,
             dtype='real',
             limit=1_000_000
-        ))
+        ))))
     )
     if len(d.mats) > 0:
         DATASETS += [d]


@@ -4,14 +4,18 @@ host=`hostname`
 if [[ "$NERSC_HOST" == cori ]]; then
     echo \$NERSC_HOST matched cori
     module load cray-python/3.8.5.0
     which python
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 elif [[ "$NERSC_HOST" == perlmutter ]]; then
     echo \$NERSC_HOST matched perlmutter
     module load cray-python/3.9.4.1
     which python
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 fi

unused.py (new file)

@@ -0,0 +1,28 @@
+"""list unused data in suitesparse"""
+
+import os
+from pathlib import Path, PurePath
+import sys
+
+from lib import config
+from lib import datasets
+
+used = set()
+for dataset in datasets.DATASETS:
+    # check if dataset directory exists
+    if not os.path.isdir(config.DIR / dataset.name):
+        continue
+    for f in os.listdir(config.DIR / dataset.name):
+        if f.endswith(".mtx"):
+            used.add(f[:-4])
+
+for f in os.listdir(config.DIR / "suitesparse"):
+    if f not in used:
+        print(os.path.abspath(config.DIR / "suitesparse" / f))
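
A hypothetical invocation, assuming the same environment the downloader uses:
```
export SS_DIR=/path/to/data    # same directory the datasets were downloaded into
poetry run python unused.py    # prints absolute paths of entries under
                               # $SS_DIR/suitesparse that no dataset links to
```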