commit 7e68d4f712c94890675fd9258a9b759bc0d815dc
Author: Carl Pearson
Date:   Tue Nov 23 07:33:20 2021 -0800

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7ee7f5b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+poetry.lock
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1abff8b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,26 @@
+# ss-downloader
+
+Install poetry & Python 3.8+
+
+```
+curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python -
+```
+
+## how to use
+
+```
+source load-env.sh
+poetry run python list.py
+poetry run python download.py
+```
+
+## how this was done
+
+```
+poetry init
+poetry add ssgetpy
+```
+
+```
+poetry install
+```
\ No newline at end of file
diff --git a/datasets.py b/datasets.py
new file mode 100644
index 0000000..c406e18
--- /dev/null
+++ b/datasets.py
@@ -0,0 +1,93 @@
+"""Named groups of SuiteSparse matrices, looked up through ssgetpy on import."""
+
+import collections
+
+import ssgetpy
+
+# a Dataset pairs a label with the list of matrices it contains
+Dataset = collections.namedtuple("Dataset", ["name", "mats"])
+
+
+def filter_reject_blacklist(mats):
+    """Return the matrices in `mats` whose name does not end in "_b"."""
+    return [mat for mat in mats if not mat.name.endswith("_b")]
+
+
+def filter_reject_large(mats):
+    """Return the matrices with at most 1M rows, 1M cols, and 20M nonzeros."""
+    return [
+        mat for mat in mats
+        if not (mat.rows > 1_000_000 or mat.cols > 1_000_000 or mat.nnz > 20_000_000)
+    ]
+
+
+def filter_reject_small(mats):
+    """Return the matrices with at least 1K rows, 1K cols, and 20K nonzeros."""
+    return [
+        mat for mat in mats
+        if not (mat.rows < 1_000 or mat.cols < 1_000 or mat.nnz < 20_000)
+    ]
+
+
+## all real-valued matrices
+REAL_MATS = Dataset(
+    name = "all_reals",
+    mats = filter_reject_blacklist(ssgetpy.search(
+        dtype='real',
+        limit=1_000_000
+    ))
+)
+
+## certain matrices with regular structure
+kinds = [
+    "2D/3D",
+    "Acoustics Problem",
+    "Materials Problem",
+    "Structural Problem",
+    "Computational Fluid Dynamics Problem",
+    "Model Reduction Problem",
+    "Semiconductor Device Problem",
+    "Theoretical/Quantum Chemistry Problem",
+    "Thermal Problem",
+]
+mats = []
+for kind in kinds:
+    mats += ssgetpy.search(
+        kind=kind,
+        dtype='real',
+        limit=1_000_000
+    )
+REGULAR_REAL_MATS = Dataset(
+    name="regular_reals",
+    mats = filter_reject_blacklist(mats)
+)
+
+## keep "small" matrices
+REGULAR_REAL_SMALL_MATS = Dataset (
+    name = "regular_reals_small",
+    mats = filter_reject_large(REGULAR_REAL_MATS.mats)
+)
+REAL_SMALL_MATS = Dataset (
+    name = "reals_small",
+    mats = filter_reject_large(REAL_MATS.mats)
+)
+
+## keep "medium" matrices
+REGULAR_REAL_MED_MATS = Dataset (
+    name = "regular_reals_med",
+    mats = filter_reject_large(filter_reject_small(REGULAR_REAL_MATS.mats))
+)
+REAL_MED_MATS = Dataset (
+    name = "reals_med",
+    mats = filter_reject_large(filter_reject_small(REAL_MATS.mats))
+)
+
+## export all datasets
+DATASETS = [
+    REAL_MATS,
+    REAL_SMALL_MATS,
+    REAL_MED_MATS,
+    REGULAR_REAL_MATS,
+    REGULAR_REAL_SMALL_MATS,
+    REGULAR_REAL_MED_MATS
+]
\ No newline at end of file
diff --git a/download.py b/download.py
new file mode 100755
index 0000000..c67d9a2
--- /dev/null
+++ b/download.py
@@ -0,0 +1,50 @@
+"""Download every matrix in the all-reals dataset into $SCRATCH/suitesparse."""
+
+import os
+from pathlib import Path
+import sys
+
+import datasets
+
+# datasets exports Dataset(name, mats) tuples; the matrix list is the .mats field
+mats = datasets.REAL_MATS.mats
+
+print(len(mats))
+
+# SCRATCH must be set in the environment; os.environ raises KeyError otherwise
+scratchPath = Path(os.environ["SCRATCH"])
+downPath = scratchPath / "suitesparse"
+print("ensure", downPath)
+os.makedirs(downPath, exist_ok=True)  # fine if the dir already exists
+
+for mat in mats:
+
+    print(mat.name)
+
+    if os.path.exists(downPath / mat.name / (mat.name + ".mtx")):
+        print(f"skipping {mat.name}: already exists")
+        continue
+
+    mat.download(format='MM', destpath=downPath, extract=True)
+
+    # TODO: check download for a type that is not 'real' and remove if so
+
+    # TODO: check for non-coordinate and remove, if so
+
+    # many mats include rhs/whatever in extracted. toss that.
+    files = os.listdir(downPath / mat.name)
+    for f in files:
+        if f != (mat.name + ".mtx"):
+            print("DELETE ", f)
+            os.remove(downPath / mat.name / f)
+
+    files = os.listdir(downPath / mat.name)
+    if not files:
+        os.rmdir(downPath / mat.name)
+
+
+# blacklist:
+# cavity(\d+)_[bx].mtx
+# circuit(\d+)_[bx].mtx
+# other things that end in _b.mtx?
+#
\ No newline at end of file
diff --git a/list.py b/list.py
new file mode 100644
index 0000000..7dbfaaf
--- /dev/null
+++ b/list.py
@@ -0,0 +1,4 @@
+import datasets
+
+for ds in datasets.DATASETS:
+    print(f"{ds.name}: {len(ds.mats)} matrices")
\ No newline at end of file
diff --git a/load-env.sh b/load-env.sh
new file mode 100644
index 0000000..b03d67d
--- /dev/null
+++ b/load-env.sh
@@ -0,0 +1,17 @@
+#! /bin/bash
+
+host=`hostname`
+
+if [[ "$NERSC_HOST" == cori ]]; then
+    echo \$NERSC_HOST matched cori
+    module load cray-python/3.8.5.0
+
+    which python
+elif [[ "$NERSC_HOST" == perlmutter ]]; then
+    echo \$NERSC_HOST matched perlmutter
+
+    module load cray-python/3.9.4.1
+
+    which python
+fi
+
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..a2c58ba
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,15 @@
+[tool.poetry]
+name = "ss-downloader"
+version = "0.1.0"
+description = ""
+authors = ["Carl Pearson "]
+
+[tool.poetry.dependencies]
+python = "^3.8"
+ssgetpy = "^1.0rc2"
+
+[tool.poetry.dev-dependencies]
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"