add unused.py, refactor common code a bit

Carl Pearson
2021-12-01 08:28:45 -08:00
parent 84bdee85ce
commit 4a09bc2d33
7 changed files with 83 additions and 48 deletions

View File

@@ -19,6 +19,16 @@ To download all datasets
```
poetry run python download.py all
```
The dataset folders use relative symlinks, so the whole tree can be moved or copied. For example:
```
rsync -azvh $SCRATCH/ ~/cfm_m3918/pearson
```
## How to use (unsupported platform)
Set the `SS_DIR` environment variable to the directory where the dataset folders should be generated.
## What it does
Downloads subsets of the SuiteSparse collection into per-dataset directories.

View File

@@ -2,7 +2,9 @@ import os
from pathlib import Path, PurePath
import sys
-import datasets
+from lib import config
+from lib import datasets
def ensure_dir(path):
    print("ensure", path)
@@ -13,7 +15,7 @@ def ensure_dir(path):
def ensure_matrix_download(dir, mat):
    if os.path.exists(dir / mat.name / (mat.name + ".mtx")):
-        # already downloaded
+        print(f"SKIP {mat.name}: already exists")
        return
    mat.download(format='MM', destpath=dir, extract=True)
@@ -29,7 +31,7 @@ def ensure_matrix_link(downDir, linkDir, mat):
    try:
        os.symlink(src, dst)
    except FileExistsError:
-        pass # dir already exists
+        pass # symlink already exists
    return
def download_dataset(dataset):
@@ -37,12 +39,10 @@ def download_dataset(dataset):
    print(len(mats))

-    # scratch directory
-    scratchPath = Path(os.environ["SCRATCH"])
    # where matrices will be downloaded
-    downDir = scratchPath / "suitesparse"
+    downDir = config.DIR / "suitesparse"
    # where the matrix will be linked to
-    linkDir = scratchPath / dataset.name
+    linkDir = config.DIR / dataset.name
    ensure_dir(downDir)
    ensure_dir(linkDir)
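The README's claim that the tree can be moved depends on these symlinks being relative. The hunk doesn't show how `src` and `dst` are computed, so the sketch below is an assumption based on the README, and `ensure_relative_link` is a hypothetical helper, not the repo's code:

```python
import os
from pathlib import Path

def ensure_relative_link(downDir: Path, linkDir: Path, name: str):
    """Hypothetical: link linkDir/name -> downDir/name with a relative
    target, so an rsync of the whole tree keeps the links valid."""
    dst = linkDir / name
    # relative target, e.g. ../suitesparse/<name>
    rel = os.path.relpath(downDir / name, start=linkDir)
    try:
        os.symlink(rel, dst)
    except FileExistsError:
        pass  # symlink already exists
```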

lib/config.py Normal file (+9)
View File

@@ -0,0 +1,9 @@
import os
import sys
from pathlib import Path

try:
    DIR = Path(os.environ["SS_DIR"])
except KeyError as e:
    print("ERROR: $SS_DIR not set")
    sys.exit(1)
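Everything else in the repo can then resolve paths against this one root. A minimal usage sketch (the example paths are illustrative):

```python
# requires SS_DIR in the environment, e.g. SS_DIR=/tmp/ss
from lib import config

print(config.DIR / "suitesparse")  # download location used by download.py
print(config.DIR / "reals")        # where a dataset's symlinks land
```

Failing at import time with `sys.exit(1)` puts every entry point behind the same early error when `SS_DIR` is unset.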

View File

@@ -3,7 +3,7 @@ import sys
import ssgetpy
-import lists
+from lib import lists
Dataset = collections.namedtuple("Dataset", ["name", "mats"])
@@ -15,35 +15,30 @@ def safe_dir_name(s):
    t = t.lower()
    return t
-def filter_reject_blacklist(mats):
-    filtered = []
-    for mat in mats:
-        if mat.name in lists.INTEGER_MATS:
-            print(f"BLACKLIST {mat.name}")
-            continue
-        filtered += [mat]
-    return filtered
+def mat_is_integer(mat):
+    return mat.name in lists.INTEGER_MATS
+
+def filter_reject_integer(mats):
+    return [mat for mat in mats if not mat_is_integer(mat)]
+
+def mat_is_small(mat):
+    return (mat.rows < 1_000 and mat.cols < 1_000) \
+        or mat.nnz < 20_000
+
+def mat_is_large(mat):
+    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
+        or mat.nnz > 20_000_000

 def filter_reject_large(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows > 1_000_000 or mat.cols > 1_000_000 or mat.nnz > 20_000_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_large(mat)]

 def filter_reject_small(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows < 1_000 or mat.cols < 1_000 or mat.nnz < 20_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_small(mat)]
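To see the refactored predicates in action, here is a quick illustrative check with stand-in matrix records; the `Mat` namedtuple is a mock, not part of the repo, and the import path is an assumption (note that importing `lib.datasets` also runs the `ssgetpy` queries at import time):

```python
import collections
from lib.datasets import filter_reject_large, filter_reject_small  # assumed path

# mock carrying only the fields the predicates read
Mat = collections.namedtuple("Mat", ["name", "rows", "cols", "nnz"])

mats = [
    Mat("tiny",   100,       100,       5_000),       # small: rows/cols < 1_000
    Mat("medium", 50_000,    50_000,    1_000_000),   # passes both filters
    Mat("huge",   2_000_000, 2_000_000, 50_000_000),  # large: nnz > 20_000_000
]

print([m.name for m in filter_reject_large(filter_reject_small(mats))])
# -> ['medium']
```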
## all real-valued matrices
REAL_MATS = Dataset(
    name = "reals",
-    mats = filter_reject_blacklist(ssgetpy.search(
+    mats = filter_reject_integer(ssgetpy.search(
        dtype='real',
        limit=1_000_000
    ))
@@ -61,10 +56,7 @@ kinds = [
"Theoretical/Quantum Chemistry Problem",
"Thermal Problem",
]
REGULAR_REAL_MATS = Dataset(
name = "regular_reals",
mats = []
)
mats = []
for kind in kinds:
mats += ssgetpy.search(
@@ -74,7 +66,7 @@ for kind in kinds:
    )

REGULAR_REAL_MATS = Dataset(
    name="regular_reals",
-    mats = filter_reject_blacklist(mats)
+    mats = filter_reject_integer(mats)
)
## keep "small" matrices
@@ -97,12 +89,6 @@ REAL_MED_MATS = Dataset (
    mats = filter_reject_large(filter_reject_small(REAL_MATS.mats))
)
## export all datasets
DATASETS = [
    REAL_MATS,
@@ -115,26 +101,24 @@ DATASETS = [
def get_kinds():
    """return set of unique kind fields"""
    mats = ssgetpy.search(
        limit=1_000_000
    )
    kinds = set()
    for mat in mats:
        kinds.add(mat.kind)
    print(f"kinds: {kinds}")
    return kinds
for kind in get_kinds():
    d = Dataset(
        name = "kind_"+safe_dir_name(kind),
-        mats = filter_reject_blacklist(ssgetpy.search(
+        mats = filter_reject_large( \
+            filter_reject_small( \
+            filter_reject_integer(ssgetpy.search(
            kind=kind,
            dtype='real',
            limit=1_000_000
-        ))
+        ))))
    )
    if len(d.mats) > 0:
        DATASETS += [d]
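For a sense of how these per-kind datasets are consumed downstream, a hedged sketch; the loop below is illustrative, not the repo's actual driver:

```python
from lib import datasets

# report the per-kind datasets the loop above registered
for d in datasets.DATASETS:
    if d.name.startswith("kind_"):
        print(f"{d.name}: {len(d.mats)} matrices")
```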

View File

@@ -4,14 +4,18 @@ host=`hostname`
if [[ "$NERSC_HOST" == cori ]]; then
echo \$NERSC_HOST matched cori
module load cray-python/3.8.5.0
module load cray-python/3.8.5.0
which python
export SS_DIR="$CFS"/m3918/pearson
echo "\$SS_DIR = $SS_DIR"
elif [[ "$NERSC_HOST" == perlmutter ]]; then
echo \$NERSC_HOST matched perlmutter
module load cray-python/3.9.4.1
which python
export SS_DIR="$CFS"/m3918/pearson
echo "\$SS_DIR = $SS_DIR"
fi

unused.py Normal file (+28)
View File

@@ -0,0 +1,28 @@
"""list unused data in suitesparse"""
import os
from pathlib import Path, PurePath
import sys
from lib import config
from lib import datasets
used = set()
for dataset in datasets.DATASETS:
# check if dataset directory exists
if not os.path.isdir(config.DIR / dataset.name):
continue
for f in os.listdir(config.DIR / dataset.name):
if f.endswith(".mtx"):
used.add(f[:-4])
for f in os.listdir(config.DIR / "suitesparse"):
if f not in used:
print(os.path.abspath(config.DIR / "suitesparse" / f))
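The scan above amounts to a set difference between what was downloaded and what any dataset links to. A compact, purely illustrative restatement under the same assumptions:

```python
import os
from lib import config
from lib import datasets

# names linked from any existing dataset directory
used = {
    f[:-4]                                    # strip ".mtx"
    for d in datasets.DATASETS
    if os.path.isdir(config.DIR / d.name)
    for f in os.listdir(config.DIR / d.name)
    if f.endswith(".mtx")
}

# downloaded but never linked by any dataset
for name in sorted(set(os.listdir(config.DIR / "suitesparse")) - used):
    print(config.DIR / "suitesparse" / name)
```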