add unused.py, refactor common code a bit
This commit is contained in:
10
README.md
10
README.md
@@ -19,6 +19,16 @@ To download all datasets
|
||||
poetry run python download.py all
|
||||
```
|
||||
|
||||
Because the datasets are linked with relative symlinks, you can move them without breaking the links. For example:
|
||||
|
||||
```
|
||||
rsync -azvh $SCRATCH/ ~/cfm_m3918/pearson
|
||||
```
|
||||
|
||||
## how to use (unsupported platform)
|
||||
|
||||
Set the `SS_DIR` environment variable to the directory where you want the dataset folders to be generated.
|
||||
|
||||
## What it does
|
||||
|
||||
Downloads subsets of the suitesparse collection to different directories.
|
||||
|
14
download.py
14
download.py
@@ -2,7 +2,9 @@ import os
|
||||
from pathlib import Path, PurePath
|
||||
import sys
|
||||
|
||||
import datasets
|
||||
from lib import config
|
||||
from lib import datasets
|
||||
|
||||
|
||||
def ensure_dir(path):
|
||||
print("ensure", path)
|
||||
@@ -13,7 +15,7 @@ def ensure_dir(path):
|
||||
|
||||
def ensure_matrix_download(dir, mat):
|
||||
if os.path.exists(dir / mat.name / (mat.name + ".mtx")):
|
||||
print(f"SKIP {mat.name}: already exists")
|
||||
# already downloaded
|
||||
return
|
||||
mat.download(format='MM', destpath=dir, extract=True)
|
||||
|
||||
@@ -29,7 +31,7 @@ def ensure_matrix_link(downDir, linkDir, mat):
|
||||
try:
|
||||
os.symlink(src, dst)
|
||||
except FileExistsError:
|
||||
pass # dir already exists
|
||||
pass # symlink already exists
|
||||
return
|
||||
|
||||
def download_dataset(dataset):
|
||||
@@ -37,12 +39,10 @@ def download_dataset(dataset):
|
||||
|
||||
print(len(mats))
|
||||
|
||||
# scratch directory
|
||||
scratchPath = Path(os.environ["SCRATCH"])
|
||||
# where matrices will be downloaded
|
||||
downDir = scratchPath / "suitesparse"
|
||||
downDir = config.DIR / "suitesparse"
|
||||
# where the matrix will be linked to
|
||||
linkDir = scratchPath / dataset.name
|
||||
linkDir = config.DIR / dataset.name
|
||||
ensure_dir(downDir)
|
||||
ensure_dir(linkDir)
|
||||
|
||||
|
9
lib/config.py
Normal file
9
lib/config.py
Normal file
@@ -0,0 +1,9 @@
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# DIR is the root directory taken from the SS_DIR environment variable.
# Importing this module without SS_DIR set terminates the process with
# exit status 1.
try:
    DIR = Path(os.environ["SS_DIR"])
except KeyError:
    # Report on stderr so the diagnostic is not mixed into regular output.
    print("ERROR: $SS_DIR not set", file=sys.stderr)
    sys.exit(1)
|
@@ -3,7 +3,7 @@ import sys
|
||||
|
||||
import ssgetpy
|
||||
|
||||
import lists
|
||||
from lib import lists
|
||||
|
||||
Dataset = collections.namedtuple("Dataset", ["name", "mats"])
|
||||
|
||||
@@ -15,35 +15,30 @@ def safe_dir_name(s):
|
||||
t = t.lower()
|
||||
return t
|
||||
|
||||
def filter_reject_blacklist(mats):
|
||||
filtered = []
|
||||
for mat in mats:
|
||||
if mat.name in lists.INTEGER_MATS:
|
||||
print(f"BLACKLIST {mat.name}")
|
||||
continue
|
||||
filtered += [mat]
|
||||
return filtered
|
||||
def mat_is_integer(mat):
    """Return True when the name of `mat` appears in `lists.INTEGER_MATS`."""
    name = mat.name
    return name in lists.INTEGER_MATS
|
||||
|
||||
def filter_reject_integer(mats):
    """Return a new list of the entries of `mats` for which
    `mat_is_integer` is false, keeping their original order."""
    kept = []
    for candidate in mats:
        if mat_is_integer(candidate):
            continue
        kept.append(candidate)
    return kept
|
||||
|
||||
def mat_is_small(mat):
    """Return True when `mat` counts as small.

    A matrix is small when both `rows` and `cols` are below 1,000, or
    when `nnz` is below 20,000.
    """
    # Dimensions are checked first; nnz is only consulted when they fail.
    if mat.rows < 1_000 and mat.cols < 1_000:
        return True
    return mat.nnz < 20_000
|
||||
|
||||
def mat_is_large(mat):
    """Return True when `mat` counts as large.

    A matrix is large when both `rows` and `cols` exceed 1,000,000, or
    when `nnz` exceeds 20,000,000.
    """
    # The column test uses `>` (not `<`) so that it mirrors mat_is_small:
    # both dimensions must exceed the limit for the dimension test to hit.
    exceeds_dims = mat.rows > 1_000_000 and mat.cols > 1_000_000
    return exceeds_dims or mat.nnz > 20_000_000
|
||||
|
||||
def filter_reject_large(mats):
|
||||
filtered = []
|
||||
for mat in mats:
|
||||
if mat.rows > 1_000_000 or mat.cols > 1_000_000 or mat.nnz > 20_000_000:
|
||||
continue
|
||||
filtered += [mat]
|
||||
return filtered
|
||||
return [mat for mat in mats if not mat_is_large(mat)]
|
||||
|
||||
def filter_reject_small(mats):
|
||||
filtered = []
|
||||
for mat in mats:
|
||||
if mat.rows < 1_000 or mat.cols < 1_000 or mat.nnz < 20_000:
|
||||
continue
|
||||
filtered += [mat]
|
||||
return filtered
|
||||
return [mat for mat in mats if not mat_is_small(mat)]
|
||||
|
||||
## all real-valued matrices
|
||||
REAL_MATS = Dataset(
|
||||
name = "reals",
|
||||
mats = filter_reject_blacklist(ssgetpy.search(
|
||||
mats = filter_reject_integer(ssgetpy.search(
|
||||
dtype='real',
|
||||
limit=1_000_000
|
||||
))
|
||||
@@ -61,10 +56,7 @@ kinds = [
|
||||
"Theoretical/Quantum Chemistry Problem",
|
||||
"Thermal Problem",
|
||||
]
|
||||
REGULAR_REAL_MATS = Dataset(
|
||||
name = "regular_reals",
|
||||
mats = []
|
||||
)
|
||||
|
||||
mats = []
|
||||
for kind in kinds:
|
||||
mats += ssgetpy.search(
|
||||
@@ -74,7 +66,7 @@ for kind in kinds:
|
||||
)
|
||||
REGULAR_REAL_MATS = Dataset(
|
||||
name="regular_reals",
|
||||
mats = filter_reject_blacklist(mats)
|
||||
mats = filter_reject_integer(mats)
|
||||
)
|
||||
|
||||
## keep "small" matrices
|
||||
@@ -97,12 +89,6 @@ REAL_MED_MATS = Dataset (
|
||||
mats = filter_reject_large(filter_reject_small(REAL_MATS.mats))
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## export all datasets
|
||||
DATASETS = [
|
||||
REAL_MATS,
|
||||
@@ -115,26 +101,24 @@ DATASETS = [
|
||||
|
||||
def get_kinds():
|
||||
"""return set of unique kind fields"""
|
||||
|
||||
mats = ssgetpy.search(
|
||||
limit=1_000_000
|
||||
)
|
||||
|
||||
kinds = set()
|
||||
for mat in mats:
|
||||
kinds.add(mat.kind)
|
||||
print(f"kinds: {kinds}")
|
||||
|
||||
return kinds
|
||||
|
||||
for kind in get_kinds():
|
||||
d = Dataset(
|
||||
name = "kind_"+safe_dir_name(kind),
|
||||
mats = filter_reject_blacklist(ssgetpy.search(
|
||||
mats = filter_reject_large( \
|
||||
filter_reject_small( \
|
||||
filter_reject_integer(ssgetpy.search(
|
||||
kind=kind,
|
||||
dtype='real',
|
||||
limit=1_000_000
|
||||
))
|
||||
))))
|
||||
)
|
||||
if len(d.mats) > 0:
|
||||
DATASETS += [d]
|
@@ -4,14 +4,18 @@ host=`hostname`
|
||||
|
||||
if [[ "$NERSC_HOST" == cori ]]; then
|
||||
echo \$NERSC_HOST matched cori
|
||||
module load cray-python/3.8.5.0
|
||||
|
||||
module load cray-python/3.8.5.0
|
||||
which python
|
||||
|
||||
export SS_DIR="$CFS"/m3918/pearson
|
||||
echo "\$SS_DIR = $SS_DIR"
|
||||
elif [[ "$NERSC_HOST" == perlmutter ]]; then
|
||||
echo \$NERSC_HOST matched perlmutter
|
||||
|
||||
module load cray-python/3.9.4.1
|
||||
|
||||
which python
|
||||
export SS_DIR="$CFS"/m3918/pearson
|
||||
echo "\$SS_DIR = $SS_DIR"
|
||||
fi
|
||||
|
||||
|
28
unused.py
Normal file
28
unused.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""list unused data in suitesparse"""
|
||||
|
||||
import os
|
||||
from pathlib import Path, PurePath
|
||||
import sys
|
||||
|
||||
from lib import config
|
||||
from lib import datasets
|
||||
|
||||
# Names (without the ".mtx" suffix) of every .mtx entry found in an
# existing dataset directory.
used = set()

for dataset in datasets.DATASETS:
    dataset_dir = config.DIR / dataset.name

    # skip datasets whose directory does not exist
    if not os.path.isdir(dataset_dir):
        continue

    used.update(
        entry[:-4]
        for entry in os.listdir(dataset_dir)
        if entry.endswith(".mtx")
    )

# Print the absolute path of every suitesparse entry that no dataset uses.
suitesparse_dir = config.DIR / "suitesparse"
for entry in os.listdir(suitesparse_dir):
    if entry in used:
        continue
    print(os.path.abspath(suitesparse_dir / entry))
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user