add unused.py, refactor common code a bit
README.md (+10 lines)

@@ -19,6 +19,16 @@ To download all datasets
 poetry run python download.py all
 ```
 
+You can move the datasets because the symlinks are relative. For example:
+
+```
+rsync -azvh $SCRATCH/ ~/cfm_m3918/pearson
+```
+
+## how to use (unsupported platform)
+
+Set the `SS_DIR` environment variable to the directory where the dataset folders should be generated.
+
 ## What it does
 
 Downloads subsets of the suitesparse collection to different directories.
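The README's claim that the tree survives an `rsync` rests on the links being relative. Below is a minimal standalone sketch of that idea, not the repo's code: `matDir`, `linkDir`, and `example_mat` are hypothetical stand-ins, since the diff does not show how `ensure_matrix_link` builds its `src`/`dst`.

```python
import os
import shutil
import tempfile
from pathlib import Path

root = Path(tempfile.mkdtemp())
matDir = root / "suitesparse" / "example_mat"   # stand-in for a downloaded matrix
linkDir = root / "reals"                        # stand-in dataset directory
matDir.mkdir(parents=True)
linkDir.mkdir()

# The link target is computed relative to the directory holding the link,
# so it never mentions root and keeps working after the tree is moved.
target = os.path.relpath(matDir, start=linkDir)  # "../suitesparse/example_mat"
os.symlink(target, linkDir / "example_mat")

# Relocate the entire tree; the link still resolves inside the new location.
moved = Path(tempfile.mkdtemp()) / "pearson"
shutil.move(str(root), str(moved))
print((moved / "reals" / "example_mat").resolve())
```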
download.py (14 lines changed)

@@ -2,7 +2,9 @@ import os
 from pathlib import Path, PurePath
 import sys
 
-import datasets
+from lib import config
+from lib import datasets
+
 
 
 def ensure_dir(path):
@@ -13,7 +15,7 @@ def ensure_dir(path):
 
 def ensure_matrix_download(dir, mat):
     if os.path.exists(dir / mat.name / (mat.name + ".mtx")):
-        print(f"SKIP {mat.name}: already exists")
+        # already downloaded
         return
     mat.download(format='MM', destpath=dir, extract=True)
 
@@ -29,7 +31,7 @@ def ensure_matrix_link(downDir, linkDir, mat):
     try:
         os.symlink(src, dst)
     except FileExistsError:
-        pass # dir already exists
+        pass # symlink already exists
     return
 
 def download_dataset(dataset):
@@ -37,12 +39,10 @@ def download_dataset(dataset):
 
     print(len(mats))
 
-    # scratch directory
-    scratchPath = Path(os.environ["SCRATCH"])
     # where matrices will be downloaded
-    downDir = scratchPath / "suitesparse"
+    downDir = config.DIR / "suitesparse"
     # where the matrix will be linked to
-    linkDir = scratchPath / dataset.name
+    linkDir = config.DIR / dataset.name
     ensure_dir(downDir)
     ensure_dir(linkDir)
 
lib/config.py (new file, +9 lines)

@@ -0,0 +1,9 @@
+import os
+import sys
+from pathlib import Path
+
+try:
+    DIR = Path(os.environ["SS_DIR"])
+except KeyError as e:
+    print("ERROR: $SS_DIR not set")
+    sys.exit(1)
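Resolving `SS_DIR` once at import time is a fail-fast choice: any script that imports `lib.config` exits immediately with a readable message when the variable is unset, instead of hitting a bare `KeyError` mid-download. From the repo root, something like `SS_DIR=/tmp/ss python -c "from lib import config; print(config.DIR)"` (a hypothetical smoke test, not part of the commit) prints the resolved path, while the same command without `SS_DIR` prints the error and exits with status 1.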
lib/datasets.py

@@ -3,7 +3,7 @@ import sys
 
 import ssgetpy
 
-import lists
+from lib import lists
 
 Dataset = collections.namedtuple("Dataset", ["name", "mats"])
 
@@ -15,35 +15,30 @@ def safe_dir_name(s):
     t = t.lower()
     return t
 
-def filter_reject_blacklist(mats):
-    filtered = []
-    for mat in mats:
-        if mat.name in lists.INTEGER_MATS:
-            print(f"BLACKLIST {mat.name}")
-            continue
-        filtered += [mat]
-    return filtered
+def mat_is_integer(mat):
+    return mat.name in lists.INTEGER_MATS
+
+def filter_reject_integer(mats):
+    return [mat for mat in mats if not mat_is_integer(mat)]
+
+def mat_is_small(mat):
+    return (mat.rows < 1_000 and mat.cols < 1_000) \
+        or mat.nnz < 20_000
+
+def mat_is_large(mat):
+    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
+        or mat.nnz > 20_000_000
 
 def filter_reject_large(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows > 1_000_000 or mat.cols > 1_000_000 or mat.nnz > 20_000_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_large(mat)]
 
 def filter_reject_small(mats):
-    filtered = []
-    for mat in mats:
-        if mat.rows < 1_000 or mat.cols < 1_000 or mat.nnz < 20_000:
-            continue
-        filtered += [mat]
-    return filtered
+    return [mat for mat in mats if not mat_is_small(mat)]
 
 ## all real-valued matrices
 REAL_MATS = Dataset(
     name = "reals",
-    mats = filter_reject_blacklist(ssgetpy.search(
+    mats = filter_reject_integer(ssgetpy.search(
         dtype='real',
         limit=1_000_000
     ))
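The refactor above turns each accumulate-and-continue loop into a named predicate plus a list comprehension, which also lets later code compose the filters (see the `filter_reject_large(filter_reject_small(filter_reject_integer(...)))` chain further down). A small self-contained demo of the size predicates on stand-in matrices; `Mat` here is a hypothetical substitute, though real `ssgetpy` results expose the same `rows`/`cols`/`nnz` fields the diff uses:

```python
import collections

# Hypothetical stand-in for ssgetpy's matrix records.
Mat = collections.namedtuple("Mat", ["name", "rows", "cols", "nnz"])

def mat_is_small(mat):
    return (mat.rows < 1_000 and mat.cols < 1_000) or mat.nnz < 20_000

def mat_is_large(mat):
    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) or mat.nnz > 20_000_000

mats = [
    Mat("tiny",   10,        10,        50),
    Mat("medium", 50_000,    50_000,    5_000_000),
    Mat("huge",   2_000_000, 2_000_000, 80_000_000),
]

# Compose the rejection filters the same way datasets.py chains them.
medium = [m for m in mats if not mat_is_small(m) and not mat_is_large(m)]
print([m.name for m in medium])   # -> ['medium']
```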
@@ -61,10 +56,7 @@ kinds = [
     "Theoretical/Quantum Chemistry Problem",
     "Thermal Problem",
 ]
-REGULAR_REAL_MATS = Dataset(
-    name = "regular_reals",
-    mats = []
-)
 mats = []
 for kind in kinds:
     mats += ssgetpy.search(
@@ -74,7 +66,7 @@ for kind in kinds:
     )
 REGULAR_REAL_MATS = Dataset(
     name="regular_reals",
-    mats = filter_reject_blacklist(mats)
+    mats = filter_reject_integer(mats)
 )
 
 ## keep "small" matrices
@@ -97,12 +89,6 @@ REAL_MED_MATS = Dataset (
     mats = filter_reject_large(filter_reject_small(REAL_MATS.mats))
 )
 
-
-
-
-
-
-
 ## export all datasets
 DATASETS = [
     REAL_MATS,
@@ -115,26 +101,24 @@ DATASETS = [
 
 def get_kinds():
     """return set of unique kind fields"""
 
     mats = ssgetpy.search(
         limit=1_000_000
     )
 
     kinds = set()
     for mat in mats:
         kinds.add(mat.kind)
-    print(f"kinds: {kinds}")
 
     return kinds
 
 for kind in get_kinds():
     d = Dataset(
         name = "kind_"+safe_dir_name(kind),
-        mats = filter_reject_blacklist(ssgetpy.search(
+        mats = filter_reject_large( \
+            filter_reject_small( \
+            filter_reject_integer(ssgetpy.search(
             kind=kind,
             dtype='real',
             limit=1_000_000
-        ))
+        ))))
     )
     if len(d.mats) > 0:
         DATASETS += [d]
(shell setup script)

@@ -4,14 +4,18 @@ host=`hostname`
 
 if [[ "$NERSC_HOST" == cori ]]; then
     echo \$NERSC_HOST matched cori
     module load cray-python/3.8.5.0
     which python
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 elif [[ "$NERSC_HOST" == perlmutter ]]; then
     echo \$NERSC_HOST matched perlmutter
 
     module load cray-python/3.9.4.1
 
     which python
+    export SS_DIR="$CFS"/m3918/pearson
+    echo "\$SS_DIR = $SS_DIR"
 fi
 
unused.py (new file, +28 lines)

@@ -0,0 +1,28 @@
+"""list unused data in suitesparse"""
+
+import os
+from pathlib import Path, PurePath
+import sys
+
+from lib import config
+from lib import datasets
+
+used = set()
+
+for dataset in datasets.DATASETS:
+
+    # check if dataset directory exists
+    if not os.path.isdir(config.DIR / dataset.name):
+        continue
+
+    for f in os.listdir(config.DIR / dataset.name):
+        if f.endswith(".mtx"):
+            used.add(f[:-4])
+
+for f in os.listdir(config.DIR / "suitesparse"):
+    if f not in used:
+        print(os.path.abspath(config.DIR / "suitesparse" / f))
+
+
+
+
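Since unused.py emits one absolute path per line, its output composes with ordinary shell tools; for instance, `poetry run python unused.py | xargs du -sh` (a hypothetical invocation, assuming `SS_DIR` is exported as in the setup script above) would report how much space each orphaned matrix directory occupies.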