From cbd26aa0cb530060804fbd11f69908d5eac1b0d2 Mon Sep 17 00:00:00 2001 From: Ethan Ho Date: Thu, 11 May 2023 17:16:22 -0500 Subject: [PATCH 1/3] Add parquet support --- poetry.lock | 46 ++++++++++++---------------------------- src/memoize/dataframe.py | 27 ++++++++++++++--------- src/memoize/main.py | 8 +++---- 3 files changed, 35 insertions(+), 46 deletions(-) diff --git a/poetry.lock b/poetry.lock index 575c259..cfaffd0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,29 +1,13 @@ -[[package]] -name = "attrs" -version = "22.2.0" -description = "Classes Without Boilerplate" -category = "dev" -optional = false -python-versions = ">=3.6" - -[package.extras] -cov = ["attrs", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs"] -docs = ["furo", "sphinx", "myst-parser", "zope.interface", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier"] -tests = ["attrs", "zope.interface"] -tests-no-zope = ["hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist", "cloudpickle", "mypy (>=0.971,<0.990)", "pytest-mypy-plugins"] -tests_no_zope = ["hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist", "cloudpickle", "mypy (>=0.971,<0.990)", "pytest-mypy-plugins"] - [[package]] name = "boto3" -version = "1.26.53" +version = "1.26.130" description = "The AWS SDK for Python" category = "main" optional = true python-versions = ">= 3.7" [package.dependencies] -botocore = ">=1.29.53,<1.30.0" +botocore = ">=1.29.130,<1.30.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -32,7 +16,7 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.29.53" +version = "1.29.130" description = "Low-level, data-driven core of boto 3." category = "main" optional = true @@ -44,7 +28,7 @@ python-dateutil = ">=2.1,<3.0.0" urllib3 = ">=1.25.4,<1.27" [package.extras] -crt = ["awscrt (==0.15.3)"] +crt = ["awscrt (==0.16.9)"] [[package]] name = "colorama" @@ -56,7 +40,7 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7 [[package]] name = "exceptiongroup" -version = "1.1.0" +version = "1.1.1" description = "Backport of PEP 654 (exception groups)" category = "dev" optional = false @@ -83,7 +67,7 @@ python-versions = ">=3.7" [[package]] name = "numpy" -version = "1.24.1" +version = "1.24.3" description = "Fundamental package for array computing in Python" category = "main" optional = true @@ -91,7 +75,7 @@ python-versions = ">=3.8" [[package]] name = "packaging" -version = "23.0" +version = "23.1" description = "Core utilities for Python packages" category = "dev" optional = false @@ -131,14 +115,13 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "pytest" -version = "7.2.1" +version = "7.3.1" description = "pytest: simple powerful testing with Python" category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" @@ -147,7 +130,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "pytest-dotenv" @@ -174,18 +157,18 @@ six = ">=1.5" [[package]] name = "python-dotenv" -version = "0.21.0" +version = "1.0.0" description = "Read key-value pairs from a .env file and set them as environment variables" category = "dev" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" [package.extras] cli = ["click (>=5.0)"] [[package]] name = "pytz" -version = "2022.7.1" +version = "2023.3" description = "World timezone definitions, modern and historical" category = "main" optional = true @@ -193,7 +176,7 @@ python-versions = "*" [[package]] name = "s3transfer" -version = "0.6.0" +version = "0.6.1" description = "An Amazon S3 Transfer Manager" category = "main" optional = true @@ -223,7 +206,7 @@ python-versions = ">=3.7" [[package]] name = "urllib3" -version = "1.26.14" +version = "1.26.15" description = "HTTP library with thread-safe connection pooling, file post, and more." category = "main" optional = true @@ -244,7 +227,6 @@ python-versions = "^3.8" content-hash = "0cde0b85158a582efc26687ca81d9f8d6dac31fdc20570547c927319c4600bbe" [metadata.files] -attrs = [] boto3 = [] botocore = [] colorama = [] diff --git a/src/memoize/dataframe.py b/src/memoize/dataframe.py index fc204a0..b96852c 100644 --- a/src/memoize/dataframe.py +++ b/src/memoize/dataframe.py @@ -14,16 +14,24 @@ from .main import _clean_func_name, _get_hist_fps, _make_key - -def _read_csv(fp: str) -> pd.DataFrame: +def _read(ext: str, fp: str) -> pd.DataFrame: """Reads DataFrame from CSV file at `fp`.""" - return pd.read_csv(fp) + if ext == 'csv': + return pd.read_csv(fp) + elif ext == 'parquet': + return pd.read_parquet(fp) + else: + raise Exception(f"Unsupported file extension {ext}") -def _write_csv(fp: str, df: pd.DataFrame): - """Write DataFrame to CSV file at `fp` from DataFrame `df`.""" - write_index = bool(df.index.name) - return df.to_csv(fp, index=write_index) +def _write(ext: str, fp: str, df: pd.DataFrame): + if ext == 'csv': + write_index = bool(df.index.name) + return df.to_csv(fp, index=write_index) + elif ext == 'parquet': + return df.to_parquet(fp) + else: + raise Exception(f"Unsupported file extension {ext}") def memoize_df( @@ -31,7 +39,6 @@ def memoize_df( cache_dir: Optional[str] = '/tmp/memoize', ext: str = 'csv', log_func: Callable = print, - ignore_invalid: bool = True, cache_lifetime_days: int = 0 ) -> Callable: """ @@ -61,7 +68,7 @@ def memoize_dec(*args, **kwargs): if not kwargs.get('_memoize_force_refresh'): for hist_fp in hist_fps: log_func(f"Using cached call from {hist_fp}") - return _read_csv(hist_fp) + return _read(ext, hist_fp) # Else run the function and store cached result result = func(*args, **kwargs) @@ -71,7 +78,7 @@ def memoize_dec(*args, **kwargs): f"Failed to write return value of function '{funcname}' to CSV file. " f"Expected a pandas.DataFrame, received {type(result)}." ) - _write_csv(fp, result) + _write(ext, fp, result) return result return memoize_dec return add_memoize_dec diff --git a/src/memoize/main.py b/src/memoize/main.py index 5bdc376..83f3623 100644 --- a/src/memoize/main.py +++ b/src/memoize/main.py @@ -16,7 +16,7 @@ def _make_key(func_name: str, args: List, kwargs: Dict, maxlen: int = None) -> s d['_func_name'] = func_name d['_args'] = args hl = hashlib.new('sha256') - hl.update(json.dumps(d, sort_keys=True).encode()) + hl.update(json.dumps(d, sort_keys=True).encode()) as_str = hl.hexdigest() if maxlen: as_str = as_str[:maxlen] @@ -48,7 +48,7 @@ def _get_hist_fps(query: str, cache_lifetime_days: int = None) -> List[str]: except Exception as err: raise dt_grps.append(item) - + fps = [ file['fp'] for file in # Sort filepaths starting with most recent @@ -92,7 +92,7 @@ def memoize(stub: Optional[str] = None, raise Exception(f'{cache_dir=} exists but is not a directory') else: os.makedirs(cache_dir) - stub = stub if stub else date.today().strftime('%Y%m%d') + stub = stub if stub else date.today().strftime('%Y%m%d') def add_memoize_dec(func): funcname = _clean_func_name(func.__name__) @@ -118,7 +118,7 @@ def memoize_dec(*args, **kwargs): text = json.dumps(cache) f.write(text) return cache[key] - + # Else run the function and store cached result result = func(*args, **kwargs) cache[key] = result From 840ba5741ab4db415564046b45e41fdbd89948a9 Mon Sep 17 00:00:00 2001 From: Ethan Ho Date: Fri, 12 May 2023 15:24:06 -0500 Subject: [PATCH 2/3] Fix parquet rw --- src/memoize/dataframe.py | 3 +++ tests/test_dataframe.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/memoize/dataframe.py b/src/memoize/dataframe.py index b96852c..aee40ee 100644 --- a/src/memoize/dataframe.py +++ b/src/memoize/dataframe.py @@ -29,6 +29,9 @@ def _write(ext: str, fp: str, df: pd.DataFrame): write_index = bool(df.index.name) return df.to_csv(fp, index=write_index) elif ext == 'parquet': + if not pd.api.types.is_object_dtype(df.columns.dtype): + print(f"WARNING: Converting column names to string dtype") + df.columns = df.columns.astype(str) return df.to_parquet(fp) else: raise Exception(f"Unsupported file extension {ext}") diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index ff78898..1cde5d3 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -10,8 +10,9 @@ def example_func(foo: int): return df -def test_memoize(): - wrapped = memoize_df(cache_lifetime_days=None)(example_func) +@pytest.mark.parametrize('ext', ['csv', 'parquet']) +def test_memoize(ext): + wrapped = memoize_df(cache_lifetime_days=None, ext=ext)(example_func) print(wrapped(2)) print(wrapped(3)) print(wrapped(5)) From e644516c515a621e54c4165f14349d21fde9ede8 Mon Sep 17 00:00:00 2001 From: Ethan Ho Date: Fri, 12 May 2023 15:25:39 -0500 Subject: [PATCH 3/3] Version tick --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 156987e..94cc1f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "memoize" -version = "0.2.1" +version = "0.3.0" description = "Python3 memoization decorator" authors = ["Ethan Ho "] license = "MIT"