automatically download nonzero datatype metadata
@@ -2,8 +2,15 @@ import os
 import sys
 from pathlib import Path
 
+from lib import matrix
+
+try:
+    DIR = Path(os.environ["SS_DIR"])
+except KeyError as e:
+    print("ERROR: $SS_DIR not set")
+    sys.exit(1)
 
 SS_ROOT_URL = "https://sparse.tamu.edu"
@@ -3,7 +3,7 @@ import sys
 
 import ssgetpy
 
-from lib import lists
+from lib import dtypes
 
 Dataset = collections.namedtuple("Dataset", ["name", "mats"])
@@ -15,18 +15,19 @@ def safe_dir_name(s):
     t = t.lower()
     return t
 
-def mat_is_integer(mat):
-    return mat.name in lists.INTEGER_MATS
+def mat_is_real(mat):
+    val = dtypes.DTYPES[(mat.group, mat.name)] == "real"
+    return val
 
-def filter_reject_integer(mats):
-    return [mat for mat in mats if not mat_is_integer(mat)]
+def filter_keep_real(mats):
+    return [mat for mat in mats if mat_is_real(mat)]
 
 def mat_is_small(mat):
     return (mat.rows < 1_000 and mat.cols < 1_000) \
         or mat.nnz < 20_000
 
 def mat_is_large(mat):
-    return (mat.rows > 1_000_000 and mat.cols < 1_000_000) \
+    return (mat.rows > 1_000_000 and mat.cols > 1_000_000) \
         or mat.nnz > 20_000_000
 
 def filter_reject_large(mats):
@@ -38,7 +39,7 @@ def filter_reject_small(mats):
 ## all real-valued matrices
 REAL_MATS = Dataset(
     name = "reals",
-    mats = filter_reject_integer(ssgetpy.search(
+    mats = filter_keep_real(ssgetpy.search(
         dtype='real',
         limit=1_000_000
     ))
@@ -66,7 +67,7 @@ for kind in kinds:
     )
 REGULAR_REAL_MATS = Dataset(
     name="regular_reals",
-    mats = filter_reject_integer(mats)
+    mats = filter_keep_real(mats)
 )
 
 ## keep "small" matrices
@@ -91,7 +92,7 @@ REAL_MED_MATS = Dataset (
 
 ## export all datasets
 DATASETS = [
-    REAL_MATS,
+    # REAL_MATS,
     REAL_SMALL_MATS,
     REAL_MED_MATS,
     REGULAR_REAL_MATS,
@@ -114,7 +115,7 @@ for kind in get_kinds():
         name = "kind_"+safe_dir_name(kind),
         mats = filter_reject_large( \
             filter_reject_small( \
-                filter_reject_integer(ssgetpy.search(
+                filter_keep_real(ssgetpy.search(
                     kind=kind,
                     dtype='real',
                     limit=1_000_000
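For orientation, a minimal sketch of the Dataset shape the hunks above build on, assuming only the collections.namedtuple definition already shown in this file; the sample values are illustrative:

    import collections

    Dataset = collections.namedtuple("Dataset", ["name", "mats"])

    # each dataset pairs a directory-safe name with a list of matrix records
    demo = Dataset(name="reals", mats=[])
    print(demo.name, len(demo.mats))  # -> reals 0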
lib/dtypes.py (new file, 54 lines)
@@ -0,0 +1,54 @@
+"""export a map that is (group, name) -> dtype for all mats"""
+
+import requests
+import datetime
+import os
+
+import scipy.io
+
+from lib import config
+
+def download_ss_index(path):
+    with open(path, "wb") as f:
+        req = requests.get(config.SS_ROOT_URL + "/files/ss_index.mat")
+        f.write(req.content)
+
+def ensure_ss_index(path):
+    if not os.path.exists(path):
+        download_ss_index(path)
+    mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(config.DIR / ".ss_index.mat"))
+    if datetime.datetime.utcnow() - mtime > datetime.timedelta(days=90):
+        download_ss_index(path)
+
+# download metadata file if missing
+local = config.DIR / ".ss_index.mat"
+ensure_ss_index(local)
+
+
+# load metadata and convert to a database
+mat = scipy.io.loadmat(config.DIR / ".ss_index.mat", squeeze_me=True)
+
+s = mat["ss_index"].item()
+for i,x in enumerate(s):
+    print(i, x)
+groups = s[1]
+names = s[2]
+# 3 letters, first letter:
+# r=real, p=binary, c=complex, i=integer
+rbtype = s[19]
+
+def dtype_from_rbtype(rbtype):
+    if rbtype[0] == "r":
+        return "real"
+    elif rbtype[0] == "p":
+        return "binary"
+    elif rbtype[0] == "c":
+        return "complex"
+    elif rbtype[0] == "i":
+        return "integer"
+    else:
+        raise LookupError
+
+DTYPES = {}
+for i in range(len(names)):
+    DTYPES[(groups[i], names[i])] = dtype_from_rbtype(rbtype[i])
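A minimal usage sketch of the DTYPES map exported by the new module, assuming the index has already been downloaded; the (group, name) pair below is only an illustrative key:

    from lib import dtypes

    # keys are (group, name) pairs taken from ss_index.mat; values are
    # "real", "binary", "complex", or "integer"
    print(dtypes.DTYPES[("SomeGroup", "some_matrix")])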
lib/matrix.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+class Matrix:
+    def __init__(self, group, name, dtype, nrows, ncols, nnz):
+        self.group = group
+        self.name = name
+        self.dtype = dtype
+        self.nrows = int(nrows)
+        self.ncols = int(ncols)
+        self.nnz = int(nnz)
+
+    def to_tuple(self):
+        return (self.group, self.name, self.dtype, self.nrows, self.ncols, self.nnz)
+
+    def __repr__(self):
+        return repr(self.to_tuple())
+
+    def url(self):
+        return "/".join(("https://sparse.tamu.edu", "MM", self.group, self.name + ".tar.gz"))
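A minimal sketch of how the Matrix record above can be constructed and queried; the field values are made up for illustration:

    from lib.matrix import Matrix

    m = Matrix("SomeGroup", "some_matrix", "real", 100, 100, 500)
    print(m)        # repr of (group, name, dtype, nrows, ncols, nnz)
    print(m.url())  # tarball URL under https://sparse.tamu.edu/MM/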