From 930d24299966a17481914929bd461865446fea5b Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 14:48:58 +0200 Subject: [PATCH 01/67] Ensure correct boolean dtype in misc table index --- audformat/core/table.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/audformat/core/table.py b/audformat/core/table.py index 18a4b863..f2d18d2f 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -847,6 +847,11 @@ def _load_csv(self, path: str): float_precision="round_trip", ) + # Ensure bool values are stored as boolean, + # as pandas.read_csv() + # does not set this correctly + df.index = utils._maybe_convert_pandas_dtype(df.index) + # For an empty CSV file # converters will not set the correct dtype # and we need to correct it manually From 8d38ba96bdd1e1d4db863707446afcd2520c7f55 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 15:09:54 +0200 Subject: [PATCH 02/67] Remove unneeded code --- audformat/core/table.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index f2d18d2f..18a4b863 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -847,11 +847,6 @@ def _load_csv(self, path: str): float_precision="round_trip", ) - # Ensure bool values are stored as boolean, - # as pandas.read_csv() - # does not set this correctly - df.index = utils._maybe_convert_pandas_dtype(df.index) - # For an empty CSV file # converters will not set the correct dtype # and we need to correct it manually From 06f3a34064d663f18ae6bf72fb7f9bdfa4218b54 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 20 Mar 2024 13:32:14 +0100 Subject: [PATCH 03/67] Use pyarrow to read CSV files --- audformat/core/common.py | 31 +++++++ audformat/core/table.py | 180 ++++++++++++++++++++++++++++----------- pyproject.toml | 1 + 3 files changed, 164 insertions(+), 48 deletions(-) diff --git a/audformat/core/common.py b/audformat/core/common.py index 833b44fc..e8733735 100644 --- a/audformat/core/common.py +++ b/audformat/core/common.py @@ -6,6 +6,7 @@ import oyaml as yaml import pandas as pd +import pyarrow as pa from audformat import define from audformat.core.errors import BadKeyError @@ -388,3 +389,33 @@ def to_pandas_dtype(dtype: str) -> typing.Optional[str]: return "string" elif dtype == define.DataType.TIME: return "timedelta64[ns]" + + +def to_pyarrow_dtype(dtype: str) -> typing.Optional[str]: + r"""Convert audformat to pyarrow dtype. + + For ``"object"`` as ``dtype`` + there is no equivalent, + and we don't return a value here. + We let ``pyarrow`` decide, + which dtype fits best in that case. 
+ + Args: + dtype: audformat dtype + + Returns: + pyarrow dtype + + """ + if dtype == define.DataType.BOOL: + return pa.bool_() + elif dtype == define.DataType.DATE: + return pa.timestamp("ns") + elif dtype == define.DataType.FLOAT: + return pa.float64() + elif dtype == define.DataType.INTEGER: + return pa.int64() + elif dtype == define.DataType.STRING: + return pa.string() + elif dtype == define.DataType.TIME: + return pa.string() diff --git a/audformat/core/table.py b/audformat/core/table.py index 18a4b863..e3a287b2 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -6,6 +6,8 @@ import typing import pandas as pd +import pyarrow as pa +import pyarrow.csv as csv import audeer @@ -15,7 +17,7 @@ from audformat.core.common import HeaderBase from audformat.core.common import HeaderDict from audformat.core.common import to_audformat_dtype -from audformat.core.common import to_pandas_dtype +from audformat.core.common import to_pyarrow_dtype from audformat.core.errors import BadIdError from audformat.core.index import filewise_index from audformat.core.index import index_type @@ -801,68 +803,150 @@ def _get_by_index( raise NotImplementedError() def _load_csv(self, path: str): + r"""Load table from CSV file. + + The loaded table is stored under ``self._df``. + + Loading a CSV file with :func:`pd.read_csv()` is slower + than the method applied here. + We first load the CSV file as a :class:`pa.Table` + and convert it to a dataframe afterwards. + + Args: + path: path to table, including file extension + + """ schemes = self.db.schemes - converters = {} - dtypes = {} + # === DTYPES === + + # Collect dtypes + # of the CSV file, + # by inspecting the audformat schemes, + # and the index + # associated with the table. + dtypes = [] + + # Collect columns, + # that cannot directly be converted to pyarrow + timedelta_columns = [] + boolean_columns = [] + object_columns = [] + integer_columns = [] + + # --- Index --- if hasattr(self, "type"): # filewise or segmented table - dtypes[define.IndexField.FILE] = define.DataType.STRING + index_columns = [] + name = define.IndexField.FILE + dtypes.append((name, to_pyarrow_dtype(define.DataType.STRING))) + index_columns.append(name) if self.type == define.IndexType.SEGMENTED: - dtypes[define.IndexField.START] = define.DataType.TIME - dtypes[define.IndexField.END] = define.DataType.TIME + for name in [define.IndexField.START, define.IndexField.END]: + dtypes.append((name, to_pyarrow_dtype(define.DataType.TIME))) + index_columns.append(name) + timedelta_columns.append(name) else: # misc table - dtypes = self.levels - - # index columns - levels = list(dtypes) - dtypes = {level: to_pandas_dtype(dtype) for level, dtype in dtypes.items()} + index_columns = list(self.levels.keys()) + for name, dtype in self.levels.items(): + _dtype = to_pyarrow_dtype(dtype) + if _dtype is not None: + dtypes.append((name, _dtype)) + if dtype == define.DataType.TIME: + timedelta_columns.append(name) + elif dtype == define.DataType.INTEGER: + integer_columns.append(name) + else: + object_columns.append(name) - # other columns + # --- Columns --- + categories = {} columns = list(self.columns) for column_id, column in self.columns.items(): if column.scheme_id is not None: - dtypes[column_id] = schemes[column.scheme_id].to_pandas_dtype() - else: - dtypes[column_id] = "object" - - # replace dtype with converter for dates or timestamps - dtypes_wo_converters = {} - for column_id, dtype in dtypes.items(): - if dtype == "datetime64[ns]": - converters[column_id] = lambda x: 
pd.to_datetime(x) - elif dtype == "timedelta64[ns]": - converters[column_id] = lambda x: pd.to_timedelta(x) + scheme = schemes[column.scheme_id] + if scheme.labels is not None: + categories[column_id] = scheme._labels_to_list() + dtype = to_pyarrow_dtype(scheme.dtype) + if dtype is not None: + dtypes.append((column_id, dtype)) + if scheme.dtype == define.DataType.TIME: + timedelta_columns.append(column_id) + elif scheme.dtype == define.DataType.BOOL: + boolean_columns.append(column_id) + elif scheme.dtype == define.DataType.INTEGER: + integer_columns.append(column_id) + else: + object_columns.append(column_id) else: - dtypes_wo_converters[column_id] = dtype + object_columns.append(column_id) - # read csv - df = pd.read_csv( + schema = pa.schema(dtypes) + table = csv.read_csv( path, - usecols=levels + columns, - dtype=dtypes_wo_converters, - index_col=levels, - converters=converters, - float_precision="round_trip", + read_options=csv.ReadOptions( + column_names=index_columns + columns, + skip_rows=1, + ), + convert_options=csv.ConvertOptions( + column_types=schema, + strings_can_be_null=True, + ), ) - - # For an empty CSV file - # converters will not set the correct dtype - # and we need to correct it manually - if len(df) == 0: - # fix index - converter_dtypes = { - level: dtype - for level, dtype in dtypes.items() - if level in converters and level in levels - } - df.index = utils.set_index_dtypes(df.index, converter_dtypes) - # fix columns - for column_id in columns: - if column_id in converters: - dtype = dtypes[column_id] - df[column_id] = df[column_id].astype(dtype) + df = table.to_pandas( + deduplicate_objects=False, + types_mapper={ + pa.string(): pd.StringDtype(), + }.get, # we have to provide a callable, not a dict + ) + # Free no longer needed memory + del table + # Adjust dtypes, that cannot be handled by pyarrow + for column in timedelta_columns: + if len(df) == 0: + # For an empty dataframe, map() will not set the correct dtype + df[column] = df[column].astype("timedelta64[ns]") + else: + df[column] = df[column].map( + # "coerce" will set errors to NaT, + # and catches the case where the input is already + lambda x: pd.to_timedelta(x, errors="coerce") + ) + for column in boolean_columns: + df[column] = df[column].map(lambda x: pd.NA if x is None else x) + df[column] = df[column].astype(pd.BooleanDtype()) + for column in object_columns: + df[column] = df[column].astype("object") + df[column] = df[column].replace(pd.NA, None) + for column in integer_columns: + df[column] = df[column].astype("Int64") + for column, labels in categories.items(): + if len(labels) > 0 and isinstance(labels[0], int): + # allow nullable + labels = pd.array(labels, dtype="int64") + dtype = pd.api.types.CategoricalDtype( + categories=labels, + ordered=False, + ) + df[column] = df[column].astype(dtype) + + # Set index + # + # When assigning more than one column, + # a MultiIndex is assigned. + # As the MultiIndex does not preserve dtypes, + # we need to set them manually. 
+ # + if len(index_columns) > 0: + index_dtypes = {column: df[column].dtype for column in index_columns} + df.set_index(index_columns, inplace=True) + if len(index_columns) > 1: + df.index = utils.set_index_dtypes(df.index, index_dtypes) + elif len(index_columns) > 0: + # Ensure pd.BooleanDtype is used for pd.Index + if index_dtypes[index_columns[0]] == bool: + df.index = df.index.astype(pd.BooleanDtype()) self._df = df diff --git a/pyproject.toml b/pyproject.toml index 3d263b93..1100e75d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ 'iso-639', 'iso3166', 'oyaml', + 'pyarrow', 'pyyaml >=5.4.1', 'pandas >=1.4.1', ] From e5045d03813d638144c152e1d3753b5528dfcbf1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 15:58:26 +0200 Subject: [PATCH 04/67] Start debugging --- audformat/core/table.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index e3a287b2..337a1bd2 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -915,7 +915,7 @@ def _load_csv(self, path: str): ) for column in boolean_columns: df[column] = df[column].map(lambda x: pd.NA if x is None else x) - df[column] = df[column].astype(pd.BooleanDtype()) + df[column] = df[column].astype("boolean") for column in object_columns: df[column] = df[column].astype("object") df[column] = df[column].replace(pd.NA, None) @@ -940,13 +940,11 @@ def _load_csv(self, path: str): # if len(index_columns) > 0: index_dtypes = {column: df[column].dtype for column in index_columns} + print(f"{self.levels=}") + print(f"{index_dtypes=}") df.set_index(index_columns, inplace=True) - if len(index_columns) > 1: + if len(index_columns) > 0: df.index = utils.set_index_dtypes(df.index, index_dtypes) - elif len(index_columns) > 0: - # Ensure pd.BooleanDtype is used for pd.Index - if index_dtypes[index_columns[0]] == bool: - df.index = df.index.astype(pd.BooleanDtype()) self._df = df From 463c15f1e4bbc33fa71bfc143ad2b64583eaa0e9 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 16:20:26 +0200 Subject: [PATCH 05/67] Continue debugging --- audformat/core/table.py | 21 +++++++++++++++------ tests/test_misc_table.py | 1 + 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 337a1bd2..ab0a6655 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -857,9 +857,12 @@ def _load_csv(self, path: str): timedelta_columns.append(name) elif dtype == define.DataType.INTEGER: integer_columns.append(name) + elif dtype == define.DataType.BOOL: + boolean_columns.append(name) else: object_columns.append(name) + print(f"{dtypes=}") # --- Columns --- categories = {} columns = list(self.columns) @@ -894,12 +897,14 @@ def _load_csv(self, path: str): strings_can_be_null=True, ), ) + print(f"{table=}") df = table.to_pandas( deduplicate_objects=False, types_mapper={ pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) + print(f"{df=}") # Free no longer needed memory del table # Adjust dtypes, that cannot be handled by pyarrow @@ -938,13 +943,17 @@ def _load_csv(self, path: str): # As the MultiIndex does not preserve dtypes, # we need to set them manually. 
# - if len(index_columns) > 0: - index_dtypes = {column: df[column].dtype for column in index_columns} - print(f"{self.levels=}") - print(f"{index_dtypes=}") + # if len(index_columns) > 0: + # index_dtypes = {column: df[column].dtype for column in index_columns} + # dtypes = { + # level: to_pandas_dtype(dtype) + # for level, dtype in self.levels.items() + # } + # print(f"{self.levels=}") + # print(f"{index_dtypes=}") df.set_index(index_columns, inplace=True) - if len(index_columns) > 0: - df.index = utils.set_index_dtypes(df.index, index_dtypes) + # if len(index_columns) > 0: + # df.index = utils.set_index_dtypes(df.index, index_dtypes) self._df = df diff --git a/tests/test_misc_table.py b/tests/test_misc_table.py index 7d9bfa41..48da78f9 100644 --- a/tests/test_misc_table.py +++ b/tests/test_misc_table.py @@ -907,6 +907,7 @@ def test_dtype_multiindex( assert list(db["misc"].levels.values()) == expected_audformat_dtypes assert list(db["misc"].index.dtypes) == expected_pandas_dtypes + print(f"{db['misc'].index=}") db_root = tmpdir.join("db") db.save(db_root, storage_format="csv") db_new = audformat.Database.load(db_root) From e0b831ef12136b85177fa9a7cd719f23262dac46 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 16:36:11 +0200 Subject: [PATCH 06/67] Fix tests --- audformat/core/table.py | 21 ++++++--------------- tests/test_misc_table.py | 8 +++++++- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index ab0a6655..3741d74d 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -862,7 +862,6 @@ def _load_csv(self, path: str): else: object_columns.append(name) - print(f"{dtypes=}") # --- Columns --- categories = {} columns = list(self.columns) @@ -897,14 +896,12 @@ def _load_csv(self, path: str): strings_can_be_null=True, ), ) - print(f"{table=}") df = table.to_pandas( deduplicate_objects=False, types_mapper={ pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) - print(f"{df=}") # Free no longer needed memory del table # Adjust dtypes, that cannot be handled by pyarrow @@ -940,20 +937,14 @@ def _load_csv(self, path: str): # # When assigning more than one column, # a MultiIndex is assigned. - # As the MultiIndex does not preserve dtypes, - # we need to set them manually. + # As the MultiIndex does not preserve pandas dtypes, + # we need to restore them manually. 
# - # if len(index_columns) > 0: - # index_dtypes = {column: df[column].dtype for column in index_columns} - # dtypes = { - # level: to_pandas_dtype(dtype) - # for level, dtype in self.levels.items() - # } - # print(f"{self.levels=}") - # print(f"{index_dtypes=}") + if len(index_columns) > 1: + index_dtypes = {column: df[column].dtype for column in index_columns} df.set_index(index_columns, inplace=True) - # if len(index_columns) > 0: - # df.index = utils.set_index_dtypes(df.index, index_dtypes) + if len(index_columns) > 1: + df.index = utils.set_index_dtypes(df.index, index_dtypes) self._df = df diff --git a/tests/test_misc_table.py b/tests/test_misc_table.py index 48da78f9..683ad4a6 100644 --- a/tests/test_misc_table.py +++ b/tests/test_misc_table.py @@ -511,6 +511,13 @@ def test_dtype_column( "index_object, index_values, index_dtype, " "expected_pandas_dtype, expected_audformat_dtype", [ + ( + pd.Index, + ["0"], + None, + "object", + audformat.define.DataType.OBJECT, + ), ( pd.Index, [], @@ -907,7 +914,6 @@ def test_dtype_multiindex( assert list(db["misc"].levels.values()) == expected_audformat_dtypes assert list(db["misc"].index.dtypes) == expected_pandas_dtypes - print(f"{db['misc'].index=}") db_root = tmpdir.join("db") db.save(db_root, storage_format="csv") db_new = audformat.Database.load(db_root) From f48a00b87c872b1077d4265e521201d594b6d839 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 10:54:54 +0200 Subject: [PATCH 07/67] Remove unneeded code --- audformat/core/table.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 3741d74d..354ebe1e 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -902,8 +902,6 @@ def _load_csv(self, path: str): pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) - # Free no longer needed memory - del table # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: if len(df) == 0: From b548774c65dca434f840dcc61935493064edade1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 13:35:36 +0200 Subject: [PATCH 08/67] Improve code --- audformat/core/common.py | 3 ++ audformat/core/table.py | 81 +++++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/audformat/core/common.py b/audformat/core/common.py index e8733735..5888fea9 100644 --- a/audformat/core/common.py +++ b/audformat/core/common.py @@ -418,4 +418,7 @@ def to_pyarrow_dtype(dtype: str) -> typing.Optional[str]: elif dtype == define.DataType.STRING: return pa.string() elif dtype == define.DataType.TIME: + # A better fitting type would be `pa.duration("ns")`, + # but this is not yet supported + # when reading CSV files return pa.string() diff --git a/audformat/core/table.py b/audformat/core/table.py index 354ebe1e..3feb398b 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -807,9 +807,9 @@ def _load_csv(self, path: str): The loaded table is stored under ``self._df``. - Loading a CSV file with :func:`pd.read_csv()` is slower + Loading a CSV file with :func:`pandas.read_csv()` is slower than the method applied here. - We first load the CSV file as a :class:`pa.Table` + We first load the CSV file as a :class:`pyarrow.Table` and convert it to a dataframe afterwards. 
Args: @@ -820,47 +820,53 @@ def _load_csv(self, path: str): # === DTYPES === - # Collect dtypes + # Collect pyarrow dtypes # of the CSV file, # by inspecting the audformat schemes, # and the index # associated with the table. - dtypes = [] + # The dtypes are used to create + # the pyarrow.Schema + # used when reading the CSV file + pyarrow_dtypes = [] # Collect columns, - # that cannot directly be converted to pyarrow + # that cannot directly be converted + # from pyarrow to pandas timedelta_columns = [] boolean_columns = [] object_columns = [] integer_columns = [] + # Collect columns, + # belonging to the index + index_columns = [] + # --- Index --- if hasattr(self, "type"): + levels = {} # filewise or segmented table - index_columns = [] - name = define.IndexField.FILE - dtypes.append((name, to_pyarrow_dtype(define.DataType.STRING))) - index_columns.append(name) + levels[define.IndexField.FILE] = define.DataType.STRING if self.type == define.IndexType.SEGMENTED: - for name in [define.IndexField.START, define.IndexField.END]: - dtypes.append((name, to_pyarrow_dtype(define.DataType.TIME))) - index_columns.append(name) - timedelta_columns.append(name) + # segmented table + for level in [define.IndexField.START, define.IndexField.END]: + levels[level] = define.DataType.TIME else: # misc table - index_columns = list(self.levels.keys()) - for name, dtype in self.levels.items(): - _dtype = to_pyarrow_dtype(dtype) - if _dtype is not None: - dtypes.append((name, _dtype)) - if dtype == define.DataType.TIME: - timedelta_columns.append(name) - elif dtype == define.DataType.INTEGER: - integer_columns.append(name) - elif dtype == define.DataType.BOOL: - boolean_columns.append(name) - else: - object_columns.append(name) + levels = self.levels + index_columns += list(levels.keys()) + for name, dtype in levels.items(): + pyarrow_dtype = to_pyarrow_dtype(dtype) + if pyarrow_dtype is not None: + pyarrow_dtypes.append((name, pyarrow_dtype)) + if dtype == define.DataType.TIME: + timedelta_columns.append(name) + elif dtype == define.DataType.INTEGER: + integer_columns.append(name) + elif dtype == define.DataType.BOOL: + boolean_columns.append(name) + else: + object_columns.append(name) # --- Columns --- categories = {} @@ -870,9 +876,9 @@ def _load_csv(self, path: str): scheme = schemes[column.scheme_id] if scheme.labels is not None: categories[column_id] = scheme._labels_to_list() - dtype = to_pyarrow_dtype(scheme.dtype) - if dtype is not None: - dtypes.append((column_id, dtype)) + pyarrow_dtype = to_pyarrow_dtype(scheme.dtype) + if pyarrow_dtype is not None: + pyarrow_dtypes.append((column_id, pyarrow_dtype)) if scheme.dtype == define.DataType.TIME: timedelta_columns.append(column_id) elif scheme.dtype == define.DataType.BOOL: @@ -884,7 +890,7 @@ def _load_csv(self, path: str): else: object_columns.append(column_id) - schema = pa.schema(dtypes) + schema = pa.schema(pyarrow_dtypes) table = csv.read_csv( path, read_options=csv.ReadOptions( @@ -904,17 +910,8 @@ def _load_csv(self, path: str): ) # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: - if len(df) == 0: - # For an empty dataframe, map() will not set the correct dtype - df[column] = df[column].astype("timedelta64[ns]") - else: - df[column] = df[column].map( - # "coerce" will set errors to NaT, - # and catches the case where the input is already - lambda x: pd.to_timedelta(x, errors="coerce") - ) + df[column] = df[column].astype("timedelta64[ns]") for column in boolean_columns: - df[column] = df[column].map(lambda x: 
pd.NA if x is None else x) df[column] = df[column].astype("boolean") for column in object_columns: df[column] = df[column].astype("object") @@ -935,8 +932,8 @@ def _load_csv(self, path: str): # # When assigning more than one column, # a MultiIndex is assigned. - # As the MultiIndex does not preserve pandas dtypes, - # we need to restore them manually. + # Setting a MultiIndex does not always preserve pandas dtypes, + # so we need to set them manually. # if len(index_columns) > 1: index_dtypes = {column: df[column].dtype for column in index_columns} From abb07d9d81632f138b76c505a79e8b3845bf31d0 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 13:44:34 +0200 Subject: [PATCH 09/67] Fix test for older pandas versions --- audformat/core/table.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 3feb398b..19c662d6 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -818,7 +818,7 @@ def _load_csv(self, path: str): """ schemes = self.db.schemes - # === DTYPES === + # === Infer dtypes === # Collect pyarrow dtypes # of the CSV file, @@ -890,6 +890,7 @@ def _load_csv(self, path: str): else: object_columns.append(column_id) + # === Read CSV === schema = pa.schema(pyarrow_dtypes) table = csv.read_csv( path, @@ -908,8 +909,14 @@ def _load_csv(self, path: str): pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) + + # === Adjust dtypes === + # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: + # Older versions of pandas cannot convert None to timedelta + # df[column] = df[column].map(lambda x: pd.NA if x is None else x) + df[column] = df[column].fillna(pd.NA) df[column] = df[column].astype("timedelta64[ns]") for column in boolean_columns: df[column] = df[column].astype("boolean") @@ -928,8 +935,8 @@ def _load_csv(self, path: str): ) df[column] = df[column].astype(dtype) - # Set index - # + # === Set index === + # When assigning more than one column, # a MultiIndex is assigned. 
# Setting a MultiIndex does not always preserve pandas dtypes, From 48c9da580918c7a59042e0263f3543d7874cb8cf Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 13:45:01 +0200 Subject: [PATCH 10/67] Exclude benchmark folder from tests --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 1100e75d..3d497a66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,7 @@ addopts = ''' --cov-report term-missing --cov-report xml --ignore=docs/ + --ignore=benchmarks/ ''' From e556c90509fc349156e3e123c28023f2382efad1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 13:58:28 +0200 Subject: [PATCH 11/67] Test other implementation --- audformat/core/table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 19c662d6..53ead674 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -915,8 +915,8 @@ def _load_csv(self, path: str): # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: # Older versions of pandas cannot convert None to timedelta - # df[column] = df[column].map(lambda x: pd.NA if x is None else x) - df[column] = df[column].fillna(pd.NA) + df[column] = df[column].map(lambda x: pd.NA if x is None else x) + # df[column] = df[column].fillna(pd.NA) df[column] = df[column].astype("timedelta64[ns]") for column in boolean_columns: df[column] = df[column].astype("boolean") From b07f1ac8b7b9555ba05fc4a23cea112cc7d88c32 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 14:00:50 +0200 Subject: [PATCH 12/67] Remove support for Python 3.8 --- .github/workflows/test.yml | 2 -- audformat/core/table.py | 3 --- pyproject.toml | 5 ++--- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 021def1d..2060e9bd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,8 +15,6 @@ jobs: os: [ ubuntu-20.04, windows-latest, macOS-latest ] python-version: [ '3.10' ] include: - - os: ubuntu-latest - python-version: '3.8' - os: ubuntu-latest python-version: '3.9' - os: ubuntu-latest diff --git a/audformat/core/table.py b/audformat/core/table.py index 53ead674..fc196e55 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -914,9 +914,6 @@ def _load_csv(self, path: str): # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: - # Older versions of pandas cannot convert None to timedelta - df[column] = df[column].map(lambda x: pd.NA if x is None else x) - # df[column] = df[column].fillna(pd.NA) df[column] = df[column].astype("timedelta64[ns]") for column in boolean_columns: df[column] = df[column].astype("boolean") diff --git a/pyproject.toml b/pyproject.toml index 3d497a66..14ad2128 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,13 +23,12 @@ classifiers = [ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.8' +requires-python = '>=3.9' dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', @@ -38,7 +37,7 @@ dependencies = [ 'oyaml', 'pyarrow', 'pyyaml >=5.4.1', - 'pandas >=1.4.1', + 'pandas >=2.1.0', ] # Get version dynamically from git # (needs 
setuptools_scm tools config below) From b1e0b69e9989680613e1563450b0c30515f3ba73 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 12:02:17 +0200 Subject: [PATCH 13/67] Store tables as PARQUET --- audformat/core/define.py | 3 + audformat/core/table.py | 179 +++++++++++++++++++++++++++++++++------ tests/test_database.py | 29 +++++-- tests/test_table.py | 126 +++++++++++++++++++++++---- 4 files changed, 292 insertions(+), 45 deletions(-) diff --git a/audformat/core/define.py b/audformat/core/define.py index 37cffa4c..addd9f79 100644 --- a/audformat/core/define.py +++ b/audformat/core/define.py @@ -337,6 +337,9 @@ class TableStorageFormat(DefineBase): CSV = "csv" """File extension for tables stored in CSV format.""" + PARQUET = "parquet" + """File extension for tables stored in PARQUET format.""" + PICKLE = "pkl" """File extension for tables stored in PKL format.""" diff --git a/audformat/core/table.py b/audformat/core/table.py index fc196e55..fe4e8636 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa import pyarrow.csv as csv +import pyarrow.parquet as parquet import audeer @@ -460,44 +461,60 @@ def load( """ path = audeer.path(path) - pkl_file = f"{path}.{define.TableStorageFormat.PICKLE}" csv_file = f"{path}.{define.TableStorageFormat.CSV}" + parquet_file = f"{path}.{define.TableStorageFormat.PARQUET}" + pkl_file = f"{path}.{define.TableStorageFormat.PICKLE}" - if not os.path.exists(pkl_file) and not os.path.exists(csv_file): + if ( + not os.path.exists(pkl_file) + and not os.path.exists(csv_file) + and not os.path.exists(parquet_file) + ): raise RuntimeError( - f"No file found for table with path '{path}.{{pkl|csv}}'" + f"No file found for table with path '{path}.{{csv|parquet|pkl}}'" ) - # Load from PKL if file exists and is newer then CSV file. - # If both are written by Database.save() this is the case + # Load from PKL if file exists + # and is newer than CSV or PARQUET file. + # If files are written by Database.save() + # this is always the case # as it stores first the PKL file pickled = False if os.path.exists(pkl_file): - if os.path.exists(csv_file) and os.path.getmtime( - csv_file - ) > os.path.getmtime(pkl_file): - raise RuntimeError( - f"The table CSV file '{csv_file}' is newer " - f"than the table PKL file '{pkl_file}'. " - "If you want to load from the CSV file, " - "please delete the PKL file. " - "If you want to load from the PKL file, " - "please delete the CSV file." - ) + for file in [parquet_file, csv_file]: + if os.path.exists(file) and os.path.getmtime(file) > os.path.getmtime( + pkl_file + ): + ext = audeer.file_extension(file).upper() + raise RuntimeError( + f"The table {ext} file '{file}' is newer " + f"than the table PKL file '{pkl_file}'. " + f"If you want to load from the {ext} file, " + "please delete the PKL file. " + "If you want to load from the PKL file, " + f"please delete the {ext} file." + ) pickled = True if pickled: try: self._load_pickled(pkl_file) except (AttributeError, ValueError, EOFError) as ex: - # if exception is raised (e.g. unsupported pickle protocol) - # try to load from CSV and save it again + # If exception is raised + # (e.g. 
unsupported pickle protocol) + # try to load from PARQUET or CSV + # and save it again # otherwise raise error - if os.path.exists(csv_file): + if os.path.exists(parquet_file): + self._load_parquet(parquet_file) + self._save_pickled(pkl_file) + elif os.path.exists(csv_file): self._load_csv(csv_file) self._save_pickled(pkl_file) else: raise ex + elif os.path.exists(parquet_file): + self._load_parquet(parquet_file) else: self._load_csv(csv_file) @@ -563,7 +580,7 @@ def save( self, path: str, *, - storage_format: str = define.TableStorageFormat.CSV, + storage_format: str = define.TableStorageFormat.PARQUET, update_other_formats: bool = True, ): r"""Save table data to disk. @@ -583,16 +600,24 @@ def save( path = audeer.path(path) define.TableStorageFormat._assert_has_attribute_value(storage_format) - pickle_file = path + f".{define.TableStorageFormat.PICKLE}" - csv_file = path + f".{define.TableStorageFormat.CSV}" + parquet_file = f"{path}.{define.TableStorageFormat.PARQUET}" + pickle_file = f"{path}.{define.TableStorageFormat.PICKLE}" + csv_file = f"{path}.{define.TableStorageFormat.CSV}" - # Make sure the CSV file is always written first + # Make sure the CSV|PARQUET file is always written first # as it is expected to be older by load() if storage_format == define.TableStorageFormat.PICKLE: - if update_other_formats and os.path.exists(csv_file): + if update_other_formats and os.path.exists(parquet_file): + self._save_parquet(parquet_file) + elif update_other_formats and os.path.exists(csv_file): self._save_csv(csv_file) self._save_pickled(pickle_file) + if storage_format == define.TableStorageFormat.PARQUET: + self._save_parquet(parquet_file) + if update_other_formats and os.path.exists(pickle_file): + self._save_pickled(pickle_file) + if storage_format == define.TableStorageFormat.CSV: self._save_csv(csv_file) if update_other_formats and os.path.exists(pickle_file): @@ -947,6 +972,97 @@ def _load_csv(self, path: str): self._df = df + def _load_parquet(self, path: str): + r"""Load table from PARQUET file. + + The loaded table is stored under ``self._df``. 
+ + Args: + path: path to table, including file extension + + """ + schemes = self.db.schemes + + # === Infer dtypes === + + # Collect columns, + # that cannot directly be converted + # from pyarrow to pandas + object_columns = [] + + # Collect columns, + # belonging to the index + index_columns = [] + + # --- Index --- + if hasattr(self, "type"): + levels = {} + # filewise or segmented table + levels[define.IndexField.FILE] = define.DataType.STRING + if self.type == define.IndexType.SEGMENTED: + # segmented table + for level in [define.IndexField.START, define.IndexField.END]: + levels[level] = define.DataType.TIME + else: + # misc table + levels = self.levels + index_columns += list(levels.keys()) + for name, dtype in levels.items(): + if dtype == define.DataType.OBJECT: + object_columns.append(name) + + # --- Columns --- + categories = {} + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = schemes[column.scheme_id] + if scheme.labels is not None: + categories[column_id] = scheme._labels_to_list() + if scheme.dtype == define.DataType.OBJECT: + object_columns.append(column_id) + else: + object_columns.append(column_id) + + # === Read CSV === + table = parquet.read_table(path) + df = table.to_pandas( + deduplicate_objects=False, + types_mapper={ + pa.string(): pd.StringDtype(), + }.get, # we have to provide a callable, not a dict + ) + + # === Adjust dtypes === + + # Adjust dtypes, that cannot be handled by pyarrow + for column in object_columns: + df[column] = df[column].astype("object") + df[column] = df[column].replace(pd.NA, None) + for column, labels in categories.items(): + if len(labels) > 0 and isinstance(labels[0], int): + # allow nullable + labels = pd.array(labels, dtype="int64") + dtype = pd.api.types.CategoricalDtype( + categories=labels, + ordered=False, + ) + df[column] = df[column].astype(dtype) + + # === Set index === + + # When assigning more than one column, + # a MultiIndex is assigned. + # Setting a MultiIndex does not always preserve pandas dtypes, + # so we need to set them manually. + # + if len(index_columns) > 1: + index_dtypes = {column: df[column].dtype for column in index_columns} + df.set_index(index_columns, inplace=True) + if len(index_columns) > 1: + df.index = utils.set_index_dtypes(df.index, index_dtypes) + + self._df = df + def _load_pickled(self, path: str): # Older versions of audformat used xz compression # which produced smaller files, @@ -976,10 +1092,23 @@ def _save_csv(self, path: str): # Load table before opening CSV file # to avoid creating a CSV file # that is newer than the PKL file - df = self.df + df = self.df # loads table with open(path, "w") as fp: df.to_csv(fp, encoding="utf-8") + def _save_parquet(self, path: str): + # Load table before opening PARQUET file + # to avoid creating a PARQUET file + # that is newer than the PKL file + df = self.df # loads table + table = pa.Table.from_pandas( + df.reset_index(), + preserve_index=False, + # TODO: check if faster when providing schema? 
+ # schema=self._schema, + ) + parquet.write_table(table, path) + def _save_pickled(self, path: str): self.df.to_pickle( path, diff --git a/tests/test_database.py b/tests/test_database.py index dee4e658..67dfa2cf 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -446,6 +446,12 @@ def test_map_files(num_workers): @pytest.mark.parametrize( "db, storage_format, load_data, num_workers", [ + ( + audformat.testing.create_db(minimal=True), + audformat.define.TableStorageFormat.PARQUET, + False, + 1, + ), ( audformat.testing.create_db(minimal=True), audformat.define.TableStorageFormat.CSV, @@ -458,6 +464,12 @@ def test_map_files(num_workers): False, 1, ), + ( + audformat.testing.create_db(), + audformat.define.TableStorageFormat.PARQUET, + False, + 4, + ), ( audformat.testing.create_db(), audformat.define.TableStorageFormat.CSV, @@ -479,6 +491,11 @@ def test_map_files(num_workers): ], ) def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): + all_formats = audformat.define.TableStorageFormat._attribute_values() + non_cache_formats = [ + ext for ext in all_formats if ext != audformat.define.TableStorageFormat.PICKLE + ] + assert db.root is None audformat.testing.create_attachment_files(db, tmpdir) db.save( @@ -490,7 +507,7 @@ def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): expected_formats = [storage_format] for table_id in db.tables: - for ext in audformat.define.TableStorageFormat._attribute_values(): + for ext in all_formats: table_file = os.path.join(tmpdir, f"db.{table_id}.{ext}") if ext in expected_formats: assert os.path.exists(table_file) @@ -498,7 +515,7 @@ def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): assert not os.path.exists(table_file) # Test update other formats - if storage_format == audformat.define.TableStorageFormat.CSV and db.tables: + if storage_format in non_cache_formats and db.tables: db2 = audformat.testing.create_db() assert db2.root is None db2.save( @@ -508,7 +525,7 @@ def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): ) assert db.root == tmpdir - # Load prefers PKL files over CSV files, + # Load prefers PKL files, # which means we are loading the second database here db_load = audformat.Database.load( tmpdir, @@ -621,14 +638,16 @@ def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): # Test missing table if db.tables: table_id = list(db.tables)[0] - for ext in audformat.define.TableStorageFormat._attribute_values(): + for ext in all_formats: table_file = os.path.join(tmpdir, f"db.{table_id}.{ext}") if os.path.exists(table_file): os.remove(table_file) # The replace part handles Windows paths table_path = table_file[:-4].replace("\\", "\\\\") - error_msg = r"No file found for table with path " rf"'{table_path}.{{pkl|csv}}'" + error_msg = ( + r"No file found for table with path " rf"'{table_path}.{{csv|parquet|pkl}}'" + ) with pytest.raises(RuntimeError, match=error_msg): db = audformat.Database.load( tmpdir, diff --git a/tests/test_table.py b/tests/test_table.py index 348e455c..d4ece3d9 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1,4 +1,5 @@ import os +import re import typing import numpy as np @@ -1118,22 +1119,25 @@ def test_load(tmpdir): with pytest.raises(EOFError): table_loaded.load(path_no_ext) - # repeat with CSV file as fall back - table.save( - path_no_ext, - storage_format=audformat.define.TableStorageFormat.CSV, - ) - with open(path_pkl, "wb"): - pass - table_loaded = audformat.Table() - 
table_loaded.columns = table.columns - table_loaded._db = table._db - table_loaded.load(path_no_ext) - pd.testing.assert_frame_equal(table.df, table_loaded.df) + # repeat with CSV|PARQUET file as fall back + for ext in [ + audformat.define.TableStorageFormat.CSV, + audformat.define.TableStorageFormat.PARQUET, + ]: + table.save(path_no_ext, storage_format=ext) + with open(path_pkl, "wb"): + pass + table_loaded = audformat.Table() + table_loaded.columns = table.columns + table_loaded._db = table._db + table_loaded.load(path_no_ext) + pd.testing.assert_frame_equal(table.df, table_loaded.df) - # check if pickle file was recovered from CSV - df = pd.read_pickle(path_pkl) - pd.testing.assert_frame_equal(table.df, df) + # check if pickle file was recovered + df = pd.read_pickle(path_pkl) + pd.testing.assert_frame_equal(table.df, df) + + os.remove(f"{path_no_ext}.{ext}") def test_load_old_pickle(tmpdir): @@ -1403,6 +1407,98 @@ def test_pick_index(table, index, expected): pd.testing.assert_index_equal(table.index, expected) +@pytest.mark.parametrize( + "storage_format", + [ + pytest.param( + "csv", + marks=pytest.mark.skip(reason="CSV does not support numpy arrays"), + ), + "parquet", + "pkl", + ], +) +def test_save_and_load(tmpdir, storage_format): + r"""Test saving and loading of a table. + + Ensures the table dataframe representation + is identical after saving and loading a table. + + Args: + tmpdir: tmpdir fixture + storage_format: storage format + the table should be written to disk. + This will also be used as file extension + + """ + db = audformat.testing.create_db() + + # Extend database with more table/scheme combinations + db.schemes["int-labels"] = audformat.Scheme( + dtype=audformat.define.DataType.INTEGER, + labels=[0, 1], + ) + db.schemes["object"] = audformat.Scheme(audformat.define.DataType.OBJECT) + index = pd.MultiIndex.from_arrays( + [[0, 1], ["a", "b"]], + names=["idx1", "idx2"], + ) + index = audformat.utils.set_index_dtypes( + index, + { + "idx1": audformat.define.DataType.INTEGER, + "idx2": audformat.define.DataType.OBJECT, + }, + ) + db["multi-misc"] = audformat.MiscTable(index) + db["multi-misc"]["int"] = audformat.Column(scheme_id="int-labels") + db["multi-misc"]["int"].set([0, pd.NA]) + db["multi-misc"]["bool"] = audformat.Column(scheme_id="bool") + db["multi-misc"]["bool"].set([True, pd.NA]) + db["multi-misc"]["arrays"] = audformat.Column(scheme_id="object") + db["multi-misc"]["arrays"].set([np.array([0, 1]), np.array([2, 3])]) + db["multi-misc"]["lists"] = audformat.Column(scheme_id="object") + db["multi-misc"]["lists"].set([[0, 1], [2, 3]]) + + for table_id in list(db): + expected_df = db[table_id].get() + path_wo_ext = audeer.path(tmpdir, table_id) + path = f"{path_wo_ext}.{storage_format}" + db[table_id].save(path_wo_ext, storage_format=storage_format) + assert os.path.exists(path) + db[table_id].load(path_wo_ext) + pd.testing.assert_frame_equal(db[table_id].df, expected_df) + + +@pytest.mark.parametrize( + "storage_format, expected_error, expected_error_msg", + [ + ( + "non-existing", + audformat.errors.BadValueError, + re.escape( + "Bad value 'non-existing', expected one of ['csv', 'parquet', 'pkl']" + ), + ), + ], +) +def test_save_errors(tmpdir, storage_format, expected_error, expected_error_msg): + r"""Test errors when saving a table. + + Args: + tmpdir: tmpdir fixture + storage_format: storage format of table + expected_error: expected error, e.g. 
``ValueError`` + expected_error_msg: expected test of error message + + """ + db = audformat.testing.create_db() + table_id = list(db)[0] + path_wo_ext = audeer.path(tmpdir, table_id) + with pytest.raises(expected_error, match=expected_error_msg): + db[table_id].save(path_wo_ext, storage_format=storage_format) + + @pytest.mark.parametrize( "num_files,num_segments_per_file,values", [ From 68c764cc8f84ceb7aaa829bd81e7f4e94856f110 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 14:37:41 +0200 Subject: [PATCH 14/67] Cleanup code + Table.levels --- audformat/core/database.py | 1 + audformat/core/table.py | 361 ++++++++++++++++++------------------- 2 files changed, 174 insertions(+), 188 deletions(-) diff --git a/audformat/core/database.py b/audformat/core/database.py index 5eb72e68..8c26bedf 100644 --- a/audformat/core/database.py +++ b/audformat/core/database.py @@ -121,6 +121,7 @@ class Database(HeaderBase): tables: table: type: filewise + levels: {file: str} media_id: audio columns: column: {scheme_id: emotion, rater_id: rater} diff --git a/audformat/core/table.py b/audformat/core/table.py index fe4e8636..a3091494 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -819,6 +819,102 @@ def assert_equal( return self + def _convert_pyarrow_dtypes( + self, + df: pd.DataFrame, + *, + convert_all: bool = False, + ) -> pd.DataFrame: + r"""Convert dtypes that are not handled by pyarrow. + + This adjusts dtypes in a dataframe, + that could not be set correctly + when converting to the dataframe + from pyarrow. + + Args: + df: dataframe, + convert_all: if ``False``, + converts all columns with + ``"object"`` audformat dtype, + and all columns with a scheme with labels. + If ``"True"``, + it converts additionally all columns with + ``"bool"``, ``"int"``, and ``"time"`` audformat dtypes + + Returns: + dataframe with converted dtypes + + """ + # Collect columns with dtypes, + # that cannot directly be converted + # from pyarrow to pandas + bool_columns = [] + int_columns = [] + time_columns = [] + object_columns = [] + + # Collect columns + # with scheme labels + labeled_columns = [] + + # Collect columns, + # belonging to the index + index_columns = [] + + # --- Index --- + index_columns += list(self.levels.keys()) + for level, dtype in self.levels.items(): + if dtype == define.DataType.BOOL: + bool_columns.append(level) + elif dtype == define.DataType.INTEGER: + int_columns.append(level) + elif dtype == define.DataType.TIME: + time_columns.append(level) + elif dtype == define.DataType.OBJECT: + object_columns.append(level) + + # --- Columns --- + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = self.db.schemes[column.scheme_id] + if scheme.labels is not None: + labeled_columns.append(column_id) + elif scheme.dtype == define.DataType.BOOL: + bool_columns.append(column_id) + elif scheme.dtype == define.DataType.INTEGER: + int_columns.append(column_id) + elif scheme.dtype == define.DataType.TIME: + time_columns.append(column_id) + elif scheme.dtype == define.DataType.OBJECT: + object_columns.append(column_id) + else: + # No scheme defaults to `object` dtype + object_columns.append(column_id) + + if convert_all: + for column in bool_columns: + df[column] = df[column].astype("boolean") + for column in int_columns: + df[column] = df[column].astype("Int64") + for column in time_columns: + df[column] = df[column].astype("timedelta64[ns]") + for column in object_columns: + df[column] = df[column].astype("object") + df[column] = 
df[column].replace(pd.NA, None) + for column in labeled_columns: + scheme = self.db.schemes[self.columns[column].scheme_id] + labels = scheme._labels_to_list() + if len(labels) > 0 and isinstance(labels[0], int): + # allow nullable + labels = pd.array(labels, dtype="int64") + dtype = pd.api.types.CategoricalDtype( + categories=labels, + ordered=False, + ) + df[column] = df[column].astype(dtype) + return df + def _get_by_index( self, index: pd.Index, @@ -841,134 +937,43 @@ def _load_csv(self, path: str): path: path to table, including file extension """ - schemes = self.db.schemes - - # === Infer dtypes === - # Collect pyarrow dtypes - # of the CSV file, - # by inspecting the audformat schemes, - # and the index - # associated with the table. - # The dtypes are used to create + # of all columns, + # including index columns. + # The dtypes are stored as a tuple + # ``(column, dtype)``, + # and are used to create # the pyarrow.Schema # used when reading the CSV file pyarrow_dtypes = [] - - # Collect columns, - # that cannot directly be converted - # from pyarrow to pandas - timedelta_columns = [] - boolean_columns = [] - object_columns = [] - integer_columns = [] - - # Collect columns, - # belonging to the index - index_columns = [] - - # --- Index --- - if hasattr(self, "type"): - levels = {} - # filewise or segmented table - levels[define.IndexField.FILE] = define.DataType.STRING - if self.type == define.IndexType.SEGMENTED: - # segmented table - for level in [define.IndexField.START, define.IndexField.END]: - levels[level] = define.DataType.TIME - else: - # misc table - levels = self.levels - index_columns += list(levels.keys()) - for name, dtype in levels.items(): - pyarrow_dtype = to_pyarrow_dtype(dtype) - if pyarrow_dtype is not None: - pyarrow_dtypes.append((name, pyarrow_dtype)) - if dtype == define.DataType.TIME: - timedelta_columns.append(name) - elif dtype == define.DataType.INTEGER: - integer_columns.append(name) - elif dtype == define.DataType.BOOL: - boolean_columns.append(name) - else: - object_columns.append(name) - - # --- Columns --- - categories = {} - columns = list(self.columns) + # Index + for level, dtype in self.levels.items(): + if dtype != define.DataType.OBJECT: + pyarrow_dtypes.append((level, to_pyarrow_dtype(dtype))) + # Columns for column_id, column in self.columns.items(): if column.scheme_id is not None: - scheme = schemes[column.scheme_id] - if scheme.labels is not None: - categories[column_id] = scheme._labels_to_list() - pyarrow_dtype = to_pyarrow_dtype(scheme.dtype) - if pyarrow_dtype is not None: - pyarrow_dtypes.append((column_id, pyarrow_dtype)) - if scheme.dtype == define.DataType.TIME: - timedelta_columns.append(column_id) - elif scheme.dtype == define.DataType.BOOL: - boolean_columns.append(column_id) - elif scheme.dtype == define.DataType.INTEGER: - integer_columns.append(column_id) - else: - object_columns.append(column_id) - else: - object_columns.append(column_id) + dtype = self.db.schemes[column.scheme_id].dtype + if dtype != define.DataType.OBJECT: + pyarrow_dtypes.append((column_id, to_pyarrow_dtype(dtype))) - # === Read CSV === - schema = pa.schema(pyarrow_dtypes) + # Read CSV file table = csv.read_csv( path, read_options=csv.ReadOptions( - column_names=index_columns + columns, + column_names=list(self.levels.keys()) + list(self.columns.keys()), skip_rows=1, ), convert_options=csv.ConvertOptions( - column_types=schema, + column_types=pa.schema(pyarrow_dtypes), strings_can_be_null=True, ), ) - df = table.to_pandas( - deduplicate_objects=False, 
- types_mapper={ - pa.string(): pd.StringDtype(), - }.get, # we have to provide a callable, not a dict - ) - - # === Adjust dtypes === - - # Adjust dtypes, that cannot be handled by pyarrow - for column in timedelta_columns: - df[column] = df[column].astype("timedelta64[ns]") - for column in boolean_columns: - df[column] = df[column].astype("boolean") - for column in object_columns: - df[column] = df[column].astype("object") - df[column] = df[column].replace(pd.NA, None) - for column in integer_columns: - df[column] = df[column].astype("Int64") - for column, labels in categories.items(): - if len(labels) > 0 and isinstance(labels[0], int): - # allow nullable - labels = pd.array(labels, dtype="int64") - dtype = pd.api.types.CategoricalDtype( - categories=labels, - ordered=False, - ) - df[column] = df[column].astype(dtype) + df = self._pyarrow_table_to_dataframe(table) - # === Set index === - - # When assigning more than one column, - # a MultiIndex is assigned. - # Setting a MultiIndex does not always preserve pandas dtypes, - # so we need to set them manually. - # - if len(index_columns) > 1: - index_dtypes = {column: df[column].dtype for column in index_columns} - df.set_index(index_columns, inplace=True) - if len(index_columns) > 1: - df.index = utils.set_index_dtypes(df.index, index_dtypes) + # Adjust dtypes and set index + df = self._convert_pyarrow_dtypes(df, convert_all=True) + df = self._set_index(df, list(self.levels.keys())) self._df = df @@ -981,85 +986,13 @@ def _load_parquet(self, path: str): path: path to table, including file extension """ - schemes = self.db.schemes - - # === Infer dtypes === - - # Collect columns, - # that cannot directly be converted - # from pyarrow to pandas - object_columns = [] - - # Collect columns, - # belonging to the index - index_columns = [] - - # --- Index --- - if hasattr(self, "type"): - levels = {} - # filewise or segmented table - levels[define.IndexField.FILE] = define.DataType.STRING - if self.type == define.IndexType.SEGMENTED: - # segmented table - for level in [define.IndexField.START, define.IndexField.END]: - levels[level] = define.DataType.TIME - else: - # misc table - levels = self.levels - index_columns += list(levels.keys()) - for name, dtype in levels.items(): - if dtype == define.DataType.OBJECT: - object_columns.append(name) - - # --- Columns --- - categories = {} - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - scheme = schemes[column.scheme_id] - if scheme.labels is not None: - categories[column_id] = scheme._labels_to_list() - if scheme.dtype == define.DataType.OBJECT: - object_columns.append(column_id) - else: - object_columns.append(column_id) - - # === Read CSV === + # Read PARQUET file table = parquet.read_table(path) - df = table.to_pandas( - deduplicate_objects=False, - types_mapper={ - pa.string(): pd.StringDtype(), - }.get, # we have to provide a callable, not a dict - ) - - # === Adjust dtypes === - - # Adjust dtypes, that cannot be handled by pyarrow - for column in object_columns: - df[column] = df[column].astype("object") - df[column] = df[column].replace(pd.NA, None) - for column, labels in categories.items(): - if len(labels) > 0 and isinstance(labels[0], int): - # allow nullable - labels = pd.array(labels, dtype="int64") - dtype = pd.api.types.CategoricalDtype( - categories=labels, - ordered=False, - ) - df[column] = df[column].astype(dtype) - - # === Set index === + df = self._pyarrow_table_to_dataframe(table) - # When assigning more than one column, - # a MultiIndex is 
assigned. - # Setting a MultiIndex does not always preserve pandas dtypes, - # so we need to set them manually. - # - if len(index_columns) > 1: - index_dtypes = {column: df[column].dtype for column in index_columns} - df.set_index(index_columns, inplace=True) - if len(index_columns) > 1: - df.index = utils.set_index_dtypes(df.index, index_dtypes) + # Adjust dtypes and set index + df = self._convert_pyarrow_dtypes(df) + df = self._set_index(df, list(self.levels.keys())) self._df = df @@ -1088,6 +1021,23 @@ def _load_pickled(self, path: str): self._df = df + def _pyarrow_table_to_dataframe(self, table: pa.Table) -> pd.DataFrame: + r"""Convert pyarrow table to pandas dataframe. + + Args: + table: pyarrow table + + Returns: + dataframe + + """ + return table.to_pandas( + deduplicate_objects=False, + types_mapper={ + pa.string(): pd.StringDtype(), + }.get, # we have to provide a callable, not a dict + ) + def _save_csv(self, path: str): # Load table before opening CSV file # to avoid creating a CSV file @@ -1149,6 +1099,31 @@ def _set_column(self, column_id: str, column: Column) -> Column: return column + def _set_index(self, df: pd.DataFrame, columns: typing.Sequence) -> pd.DataFrame: + r"""Set columns as index. + + Setting of index columns is performed inplace! + + Args: + df: dataframe + columns: columns to be set as index of dataframe + + Returns: + updated dataframe + + """ + # When assigning more than one column, + # a MultiIndex is assigned. + # Setting a MultiIndex does not always preserve pandas dtypes, + # so we need to set them manually. + # + if len(columns) > 1: + dtypes = {column: df[column].dtype for column in columns} + df.set_index(columns, inplace=True) + if len(columns) > 1: + df.index = utils.set_index_dtypes(df.index, dtypes) + return df + class MiscTable(Base): r"""Miscellaneous table. @@ -1348,6 +1323,7 @@ class Table(Base): >>> table["values"] = Column() >>> table type: filewise + levels: {file: str} split_id: test columns: values: {} @@ -1439,6 +1415,15 @@ def __init__( """ + levels = {} + levels[define.IndexField.FILE] = define.DataType.STRING + if self.type == define.IndexType.SEGMENTED: + levels[define.IndexField.START] = define.DataType.TIME + levels[define.IndexField.END] = define.DataType.TIME + + self.levels = levels + r"""Index levels.""" + super().__init__( index, split_id=split_id, From fdc96bdb11e00692fa8ea0bd68a43f1c95bd5b61 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 14:44:34 +0200 Subject: [PATCH 15/67] Use dict for CSV dtype mappings --- audformat/core/common.py | 34 ---------------------------------- audformat/core/table.py | 21 ++++++++++++++++----- 2 files changed, 16 insertions(+), 39 deletions(-) diff --git a/audformat/core/common.py b/audformat/core/common.py index 5888fea9..833b44fc 100644 --- a/audformat/core/common.py +++ b/audformat/core/common.py @@ -6,7 +6,6 @@ import oyaml as yaml import pandas as pd -import pyarrow as pa from audformat import define from audformat.core.errors import BadKeyError @@ -389,36 +388,3 @@ def to_pandas_dtype(dtype: str) -> typing.Optional[str]: return "string" elif dtype == define.DataType.TIME: return "timedelta64[ns]" - - -def to_pyarrow_dtype(dtype: str) -> typing.Optional[str]: - r"""Convert audformat to pyarrow dtype. - - For ``"object"`` as ``dtype`` - there is no equivalent, - and we don't return a value here. - We let ``pyarrow`` decide, - which dtype fits best in that case. 
- - Args: - dtype: audformat dtype - - Returns: - pyarrow dtype - - """ - if dtype == define.DataType.BOOL: - return pa.bool_() - elif dtype == define.DataType.DATE: - return pa.timestamp("ns") - elif dtype == define.DataType.FLOAT: - return pa.float64() - elif dtype == define.DataType.INTEGER: - return pa.int64() - elif dtype == define.DataType.STRING: - return pa.string() - elif dtype == define.DataType.TIME: - # A better fitting type would be `pa.duration("ns")`, - # but this is not yet supported - # when reading CSV files - return pa.string() diff --git a/audformat/core/table.py b/audformat/core/table.py index a3091494..125eb921 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -18,7 +18,6 @@ from audformat.core.common import HeaderBase from audformat.core.common import HeaderDict from audformat.core.common import to_audformat_dtype -from audformat.core.common import to_pyarrow_dtype from audformat.core.errors import BadIdError from audformat.core.index import filewise_index from audformat.core.index import index_type @@ -946,16 +945,28 @@ def _load_csv(self, path: str): # the pyarrow.Schema # used when reading the CSV file pyarrow_dtypes = [] + # Mapping from audformat to pyarrow dtypes + to_pyarrow_dtype = { + define.DataType.BOOL: pa.bool_(), + define.DataType.DATE: pa.timestamp("ns"), + define.DataType.FLOAT: pa.float64(), + define.DataType.INTEGER: pa.int64(), + define.DataType.STRING: pa.string(), + # A better fitting type would be `pa.duration("ns")`, + # but this is not yet supported + # when reading CSV files + define.DataType.TIME: pa.string(), + } # Index for level, dtype in self.levels.items(): - if dtype != define.DataType.OBJECT: - pyarrow_dtypes.append((level, to_pyarrow_dtype(dtype))) + if dtype in to_pyarrow_dtype: + pyarrow_dtypes.append((level, to_pyarrow_dtype[dtype])) # Columns for column_id, column in self.columns.items(): if column.scheme_id is not None: dtype = self.db.schemes[column.scheme_id].dtype - if dtype != define.DataType.OBJECT: - pyarrow_dtypes.append((column_id, to_pyarrow_dtype(dtype))) + if dtype in to_pyarrow_dtype: + pyarrow_dtypes.append((column_id, to_pyarrow_dtype[dtype])) # Read CSV file table = csv.read_csv( From e865813c7bef449b90022985f0d948da60a40dad Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:19:15 +0200 Subject: [PATCH 16/67] Rename helper function --- audformat/core/table.py | 196 ++++++++++++++++++++-------------------- 1 file changed, 98 insertions(+), 98 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 125eb921..610cd0a3 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -818,102 +818,6 @@ def assert_equal( return self - def _convert_pyarrow_dtypes( - self, - df: pd.DataFrame, - *, - convert_all: bool = False, - ) -> pd.DataFrame: - r"""Convert dtypes that are not handled by pyarrow. - - This adjusts dtypes in a dataframe, - that could not be set correctly - when converting to the dataframe - from pyarrow. - - Args: - df: dataframe, - convert_all: if ``False``, - converts all columns with - ``"object"`` audformat dtype, - and all columns with a scheme with labels. 
- If ``"True"``, - it converts additionally all columns with - ``"bool"``, ``"int"``, and ``"time"`` audformat dtypes - - Returns: - dataframe with converted dtypes - - """ - # Collect columns with dtypes, - # that cannot directly be converted - # from pyarrow to pandas - bool_columns = [] - int_columns = [] - time_columns = [] - object_columns = [] - - # Collect columns - # with scheme labels - labeled_columns = [] - - # Collect columns, - # belonging to the index - index_columns = [] - - # --- Index --- - index_columns += list(self.levels.keys()) - for level, dtype in self.levels.items(): - if dtype == define.DataType.BOOL: - bool_columns.append(level) - elif dtype == define.DataType.INTEGER: - int_columns.append(level) - elif dtype == define.DataType.TIME: - time_columns.append(level) - elif dtype == define.DataType.OBJECT: - object_columns.append(level) - - # --- Columns --- - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - scheme = self.db.schemes[column.scheme_id] - if scheme.labels is not None: - labeled_columns.append(column_id) - elif scheme.dtype == define.DataType.BOOL: - bool_columns.append(column_id) - elif scheme.dtype == define.DataType.INTEGER: - int_columns.append(column_id) - elif scheme.dtype == define.DataType.TIME: - time_columns.append(column_id) - elif scheme.dtype == define.DataType.OBJECT: - object_columns.append(column_id) - else: - # No scheme defaults to `object` dtype - object_columns.append(column_id) - - if convert_all: - for column in bool_columns: - df[column] = df[column].astype("boolean") - for column in int_columns: - df[column] = df[column].astype("Int64") - for column in time_columns: - df[column] = df[column].astype("timedelta64[ns]") - for column in object_columns: - df[column] = df[column].astype("object") - df[column] = df[column].replace(pd.NA, None) - for column in labeled_columns: - scheme = self.db.schemes[self.columns[column].scheme_id] - labels = scheme._labels_to_list() - if len(labels) > 0 and isinstance(labels[0], int): - # allow nullable - labels = pd.array(labels, dtype="int64") - dtype = pd.api.types.CategoricalDtype( - categories=labels, - ordered=False, - ) - df[column] = df[column].astype(dtype) - return df - def _get_by_index( self, index: pd.Index, @@ -983,7 +887,7 @@ def _load_csv(self, path: str): df = self._pyarrow_table_to_dataframe(table) # Adjust dtypes and set index - df = self._convert_pyarrow_dtypes(df, convert_all=True) + df = self._pyarrow_convert_dtypes(df, convert_all=True) df = self._set_index(df, list(self.levels.keys())) self._df = df @@ -1002,7 +906,7 @@ def _load_parquet(self, path: str): df = self._pyarrow_table_to_dataframe(table) # Adjust dtypes and set index - df = self._convert_pyarrow_dtypes(df) + df = self._pyarrow_convert_dtypes(df) df = self._set_index(df, list(self.levels.keys())) self._df = df @@ -1032,6 +936,102 @@ def _load_pickled(self, path: str): self._df = df + def _pyarrow_convert_dtypes( + self, + df: pd.DataFrame, + *, + convert_all: bool = False, + ) -> pd.DataFrame: + r"""Convert dtypes that are not handled by pyarrow. + + This adjusts dtypes in a dataframe, + that could not be set correctly + when converting to the dataframe + from pyarrow. + + Args: + df: dataframe, + convert_all: if ``False``, + converts all columns with + ``"object"`` audformat dtype, + and all columns with a scheme with labels. 
+ If ``"True"``, + it converts additionally all columns with + ``"bool"``, ``"int"``, and ``"time"`` audformat dtypes + + Returns: + dataframe with converted dtypes + + """ + # Collect columns with dtypes, + # that cannot directly be converted + # from pyarrow to pandas + bool_columns = [] + int_columns = [] + time_columns = [] + object_columns = [] + + # Collect columns + # with scheme labels + labeled_columns = [] + + # Collect columns, + # belonging to the index + index_columns = [] + + # --- Index --- + index_columns += list(self.levels.keys()) + for level, dtype in self.levels.items(): + if dtype == define.DataType.BOOL: + bool_columns.append(level) + elif dtype == define.DataType.INTEGER: + int_columns.append(level) + elif dtype == define.DataType.TIME: + time_columns.append(level) + elif dtype == define.DataType.OBJECT: + object_columns.append(level) + + # --- Columns --- + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = self.db.schemes[column.scheme_id] + if scheme.labels is not None: + labeled_columns.append(column_id) + elif scheme.dtype == define.DataType.BOOL: + bool_columns.append(column_id) + elif scheme.dtype == define.DataType.INTEGER: + int_columns.append(column_id) + elif scheme.dtype == define.DataType.TIME: + time_columns.append(column_id) + elif scheme.dtype == define.DataType.OBJECT: + object_columns.append(column_id) + else: + # No scheme defaults to `object` dtype + object_columns.append(column_id) + + if convert_all: + for column in bool_columns: + df[column] = df[column].astype("boolean") + for column in int_columns: + df[column] = df[column].astype("Int64") + for column in time_columns: + df[column] = df[column].astype("timedelta64[ns]") + for column in object_columns: + df[column] = df[column].astype("object") + df[column] = df[column].replace(pd.NA, None) + for column in labeled_columns: + scheme = self.db.schemes[self.columns[column].scheme_id] + labels = scheme._labels_to_list() + if len(labels) > 0 and isinstance(labels[0], int): + # allow nullable + labels = pd.array(labels, dtype="int64") + dtype = pd.api.types.CategoricalDtype( + categories=labels, + ordered=False, + ) + df[column] = df[column].astype(dtype) + return df + def _pyarrow_table_to_dataframe(self, table: pa.Table) -> pd.DataFrame: r"""Convert pyarrow table to pandas dataframe. 
From eee02d3721fb51fb4ba6560fa174d89f471b01a0 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:21:59 +0200 Subject: [PATCH 17/67] Simplify code --- audformat/core/table.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 610cd0a3..b74ec5d3 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -884,11 +884,7 @@ def _load_csv(self, path: str): strings_can_be_null=True, ), ) - df = self._pyarrow_table_to_dataframe(table) - - # Adjust dtypes and set index - df = self._pyarrow_convert_dtypes(df, convert_all=True) - df = self._set_index(df, list(self.levels.keys())) + df = self._pyarrow_table_to_dataframe(table, from_csv=True) self._df = df @@ -905,10 +901,6 @@ def _load_parquet(self, path: str): table = parquet.read_table(path) df = self._pyarrow_table_to_dataframe(table) - # Adjust dtypes and set index - df = self._pyarrow_convert_dtypes(df) - df = self._set_index(df, list(self.levels.keys())) - self._df = df def _load_pickled(self, path: str): @@ -1032,22 +1024,34 @@ def _pyarrow_convert_dtypes( df[column] = df[column].astype(dtype) return df - def _pyarrow_table_to_dataframe(self, table: pa.Table) -> pd.DataFrame: + def _pyarrow_table_to_dataframe( + self, + table: pa.Table, + *, + from_csv: bool = False, + ) -> pd.DataFrame: r"""Convert pyarrow table to pandas dataframe. Args: table: pyarrow table + from_csv: if ``True`` it assumes, + that ``table`` was created by reading a CSV file, + and it will convert all needed dtypes Returns: dataframe """ - return table.to_pandas( + df = table.to_pandas( deduplicate_objects=False, types_mapper={ pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) + # Adjust dtypes and set index + df = self._pyarrow_convert_dtypes(df, convert_all=from_csv) + df = self._set_index(df, list(self.levels.keys())) + return df def _save_csv(self, path: str): # Load table before opening CSV file From cb4a42fd04168a59e9d4036f198d32918f507638 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:27:57 +0200 Subject: [PATCH 18/67] Add helper function for CSV schema --- audformat/core/table.py | 84 ++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index b74ec5d3..10927505 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -840,39 +840,6 @@ def _load_csv(self, path: str): path: path to table, including file extension """ - # Collect pyarrow dtypes - # of all columns, - # including index columns. 
- # The dtypes are stored as a tuple - # ``(column, dtype)``, - # and are used to create - # the pyarrow.Schema - # used when reading the CSV file - pyarrow_dtypes = [] - # Mapping from audformat to pyarrow dtypes - to_pyarrow_dtype = { - define.DataType.BOOL: pa.bool_(), - define.DataType.DATE: pa.timestamp("ns"), - define.DataType.FLOAT: pa.float64(), - define.DataType.INTEGER: pa.int64(), - define.DataType.STRING: pa.string(), - # A better fitting type would be `pa.duration("ns")`, - # but this is not yet supported - # when reading CSV files - define.DataType.TIME: pa.string(), - } - # Index - for level, dtype in self.levels.items(): - if dtype in to_pyarrow_dtype: - pyarrow_dtypes.append((level, to_pyarrow_dtype[dtype])) - # Columns - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - dtype = self.db.schemes[column.scheme_id].dtype - if dtype in to_pyarrow_dtype: - pyarrow_dtypes.append((column_id, to_pyarrow_dtype[dtype])) - - # Read CSV file table = csv.read_csv( path, read_options=csv.ReadOptions( @@ -880,7 +847,7 @@ def _load_csv(self, path: str): skip_rows=1, ), convert_options=csv.ConvertOptions( - column_types=pa.schema(pyarrow_dtypes), + column_types=self._pyarrow_csv_schema(), strings_can_be_null=True, ), ) @@ -1024,6 +991,55 @@ def _pyarrow_convert_dtypes( df[column] = df[column].astype(dtype) return df + def _pyarrow_csv_schema(self) -> pa.Schema: + r"""Data type mapping for reading CSV file with pyarrow. + + This provides a schema, + defining pyarrow dtypes + for the columns of a CSV file. + + The dtypes are extracted from the audformat schemes, + and converted to the pyarrow dtypes. + + Returns: + pyarrow schema for reading a CSV file + + """ + # Mapping from audformat to pyarrow dtypes + to_pyarrow_dtype = { + define.DataType.BOOL: pa.bool_(), + define.DataType.DATE: pa.timestamp("ns"), + define.DataType.FLOAT: pa.float64(), + define.DataType.INTEGER: pa.int64(), + define.DataType.STRING: pa.string(), + # A better fitting type would be `pa.duration("ns")`, + # but this is not yet supported + # when reading CSV files + define.DataType.TIME: pa.string(), + } + + # Collect pyarrow dtypes + # of all columns, + # including index columns. 
+ # The dtypes are stored as a tuple + # ``(column, dtype)``, + # and are used to create + # the pyarrow.Schema + # used when reading the CSV file + pyarrow_dtypes = [] + # Index + for level, dtype in self.levels.items(): + if dtype in to_pyarrow_dtype: + pyarrow_dtypes.append((level, to_pyarrow_dtype[dtype])) + # Columns + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + dtype = self.db.schemes[column.scheme_id].dtype + if dtype in to_pyarrow_dtype: + pyarrow_dtypes.append((column_id, to_pyarrow_dtype[dtype])) + + return pa.schema(pyarrow_dtypes) + def _pyarrow_table_to_dataframe( self, table: pa.Table, From c89bc33bfd0a9fec895e211c4524f73df96bb44f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:02:22 +0200 Subject: [PATCH 19/67] Fix typo in docstring --- audformat/core/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audformat/core/utils.py b/audformat/core/utils.py index 4e5d6015..19b33a74 100644 --- a/audformat/core/utils.py +++ b/audformat/core/utils.py @@ -2052,7 +2052,7 @@ def _is_same_dtype(d1, d2) -> bool: def _levels(obj): - r"""List of dtypes of object.""" + r"""List of levels of object.""" if isinstance(obj, pd.MultiIndex): return list(obj.names) else: From e485d573dadf63e5bd71103e409e958bd3ad8bd1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:06:35 +0200 Subject: [PATCH 20/67] Remove levels attribute --- audformat/core/table.py | 60 +++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 10927505..4747bd6a 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -826,6 +826,19 @@ def _get_by_index( # Returns `df, df_is_copy` raise NotImplementedError() + def _levels_and_dtypes(self) -> typing.Dict[str, str]: + r"""Levels and dtypes of index columns. + + Returns: + dictionary with index levels (column names) + and associated audformat data type + + """ + # The returned dictionary is used + # to infer index column names and dtypes + # when reading CSV files. + raise NotImplementedError() # pragma: no cover + def _load_csv(self, path: str): r"""Load table from CSV file. 
@@ -840,10 +853,12 @@ def _load_csv(self, path: str): path: path to table, including file extension """ + levels = list(self._levels_and_dtypes().keys()) + columns = list(self.columns.keys()) table = csv.read_csv( path, read_options=csv.ReadOptions( - column_names=list(self.levels.keys()) + list(self.columns.keys()), + column_names=levels + columns, skip_rows=1, ), convert_options=csv.ConvertOptions( @@ -939,8 +954,8 @@ def _pyarrow_convert_dtypes( index_columns = [] # --- Index --- - index_columns += list(self.levels.keys()) - for level, dtype in self.levels.items(): + index_columns += list(self._levels_and_dtypes.keys()) + for level, dtype in self._levels_and_dtypes.items(): if dtype == define.DataType.BOOL: bool_columns.append(level) elif dtype == define.DataType.INTEGER: @@ -1028,7 +1043,7 @@ def _pyarrow_csv_schema(self) -> pa.Schema: # used when reading the CSV file pyarrow_dtypes = [] # Index - for level, dtype in self.levels.items(): + for level, dtype in self._levels_and_dtypes.items(): if dtype in to_pyarrow_dtype: pyarrow_dtypes.append((level, to_pyarrow_dtype[dtype])) # Columns @@ -1066,7 +1081,8 @@ def _pyarrow_table_to_dataframe( ) # Adjust dtypes and set index df = self._pyarrow_convert_dtypes(df, convert_all=from_csv) - df = self._set_index(df, list(self.levels.keys())) + index_columns = list(self._levels_and_dtypes.keys()) + df = self._set_index(df, index_columns) return df def _save_csv(self, path: str): @@ -1315,6 +1331,16 @@ def __init__( def _get_by_index(self, index: pd.Index) -> pd.DataFrame: return self.df.loc[index] + def _levels_and_dtypes(self) -> typing.Dict[str, str]: + r"""Levels and dtypes of index columns. + + Returns: + dictionary with index levels (column names) + and associated audformat data type + + """ + return self.levels + class Table(Base): r"""Table conform to :ref:`table specifications `. @@ -1446,15 +1472,6 @@ def __init__( """ - levels = {} - levels[define.IndexField.FILE] = define.DataType.STRING - if self.type == define.IndexType.SEGMENTED: - levels[define.IndexField.START] = define.DataType.TIME - levels[define.IndexField.END] = define.DataType.TIME - - self.levels = levels - r"""Index levels.""" - super().__init__( index, split_id=split_id, @@ -1725,6 +1742,21 @@ def _get_by_index( return result + def _levels_and_dtypes(self) -> typing.Dict[str, str]: + r"""Levels and dtypes of index columns. + + Returns: + dictionary with index levels (column names) + and associated audformat data type + + """ + levels_and_dtypes = {} + levels_and_dtypes[define.IndexField.FILE] = define.DataType.STRING + if self.type == define.IndexType.SEGMENTED: + levels_and_dtypes[define.IndexField.START] = define.DataType.TIME + levels_and_dtypes[define.IndexField.END] = define.DataType.TIME + return levels_and_dtypes + def _assert_table_index( table: Base, From 2a359f1bce9ed3396374eebd0bfe0adb4a9b6b04 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:52:16 +0200 Subject: [PATCH 21/67] Merge stash --- audformat/core/table.py | 3 +-- audformat/core/utils.py | 59 ++++++++++++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 4747bd6a..f95f5631 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1316,8 +1316,7 @@ def __init__( f"{levels}, " f"but names must be non-empty and unique." 
) - - dtypes = [to_audformat_dtype(dtype) for dtype in utils._dtypes(index)] + dtypes = utils._audformat_dtypes(index) self.levels = {level: dtype for level, dtype in zip(levels, dtypes)} super().__init__( diff --git a/audformat/core/utils.py b/audformat/core/utils.py index 19b33a74..fdb0b411 100644 --- a/audformat/core/utils.py +++ b/audformat/core/utils.py @@ -929,8 +929,7 @@ def is_index_alike( # check dtypes dtypes = set() for obj in objs: - ds = [to_audformat_dtype(dtype) for dtype in _dtypes(obj)] - dtypes.add(tuple(ds)) + dtypes.add(tuple(_audformat_dtypes(obj))) if len(dtypes) > 1: return False @@ -2017,7 +2016,7 @@ def _assert_index_alike( dtypes = [] for obj in objs: - ds = [to_audformat_dtype(dtype) for dtype in _dtypes(obj)] + ds = _audformat_dtypes(obj) dtypes.append(tuple(ds) if len(ds) > 1 else ds[0]) dtypes = list(dict.fromkeys(dtypes)) if len(dtypes) > 1: @@ -2026,12 +2025,18 @@ def _assert_index_alike( raise ValueError(msg) -def _dtypes(obj): - r"""List of dtypes of object.""" - if isinstance(obj, pd.MultiIndex): - return list(obj.dtypes) - else: - return [obj.dtype] +def _audformat_dtypes(index) -> typing.List[str]: + r"""List of audformat data types of index. + + Args: + index: index + + Returns: + audformat data types of index + + """ + dtypes = _pandas_dtypes(index) + return [to_audformat_dtype(dtype) for dtype in dtypes] def _is_same_dtype(d1, d2) -> bool: @@ -2051,12 +2056,20 @@ def _is_same_dtype(d1, d2) -> bool: return d1.name == d2.name -def _levels(obj): - r"""List of levels of object.""" - if isinstance(obj, pd.MultiIndex): - return list(obj.names) +def _levels(index) -> typing.List[str]: + r"""List of levels of index. + + Args: + index: index + + Returns: + index levels + + """ + if isinstance(index, pd.MultiIndex): + return list(index.names) else: - return [obj.name] + return [index.name] def _maybe_convert_filewise_index( @@ -2101,7 +2114,7 @@ def _maybe_convert_pandas_dtype( """ levels = _levels(index) - dtypes = _dtypes(index) + dtypes = _pandas_dtypes(index) # Ensure integers are stored as Int64 int_dtypes = { @@ -2152,3 +2165,19 @@ def _maybe_convert_single_level_multi_index( objs[idx].index = obj.index.get_level_values(0) return objs + + +def _pandas_dtypes(index) -> typing.List[typing.Any]: + r"""List of pandas dtypes of index. 
+ + Args: + index: index + + Returns: + pandas data types of index + + """ + if isinstance(index, pd.MultiIndex): + return list(index.dtypes) + else: + return [index.dtype] From 01678d9ead14b738efe34a6407443cecb304ef0f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:52:56 +0200 Subject: [PATCH 22/67] Remove levels from doctest output --- audformat/core/database.py | 1 - audformat/core/table.py | 1 - 2 files changed, 2 deletions(-) diff --git a/audformat/core/database.py b/audformat/core/database.py index 8c26bedf..5eb72e68 100644 --- a/audformat/core/database.py +++ b/audformat/core/database.py @@ -121,7 +121,6 @@ class Database(HeaderBase): tables: table: type: filewise - levels: {file: str} media_id: audio columns: column: {scheme_id: emotion, rater_id: rater} diff --git a/audformat/core/table.py b/audformat/core/table.py index f95f5631..1a14538d 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1379,7 +1379,6 @@ class Table(Base): >>> table["values"] = Column() >>> table type: filewise - levels: {file: str} split_id: test columns: values: {} From 92306d8f72a9fc4faa5a91f661d9ec160510fd73 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:57:58 +0200 Subject: [PATCH 23/67] Convert method to property --- audformat/core/table.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 1a14538d..a353505f 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -826,6 +826,7 @@ def _get_by_index( # Returns `df, df_is_copy` raise NotImplementedError() + @property def _levels_and_dtypes(self) -> typing.Dict[str, str]: r"""Levels and dtypes of index columns. @@ -837,6 +838,9 @@ def _levels_and_dtypes(self) -> typing.Dict[str, str]: # The returned dictionary is used # to infer index column names and dtypes # when reading CSV files. + # This means the names and dtypes cannot be inferred + # from the index itself, + # but need to be known before. raise NotImplementedError() # pragma: no cover def _load_csv(self, path: str): @@ -853,7 +857,7 @@ def _load_csv(self, path: str): path: path to table, including file extension """ - levels = list(self._levels_and_dtypes().keys()) + levels = list(self._levels_and_dtypes.keys()) columns = list(self.columns.keys()) table = csv.read_csv( path, @@ -1330,6 +1334,7 @@ def __init__( def _get_by_index(self, index: pd.Index) -> pd.DataFrame: return self.df.loc[index] + @property def _levels_and_dtypes(self) -> typing.Dict[str, str]: r"""Levels and dtypes of index columns. @@ -1740,6 +1745,7 @@ def _get_by_index( return result + @property def _levels_and_dtypes(self) -> typing.Dict[str, str]: r"""Levels and dtypes of index columns. 
From 2b727b9ebd5061721fddf77b14607d298957070b Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:58:38 +0200 Subject: [PATCH 24/67] Add comment --- audformat/core/table.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index a353505f..ac0be0e8 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -954,7 +954,8 @@ def _pyarrow_convert_dtypes( labeled_columns = [] # Collect columns, - # belonging to the index + # belonging to the table index + # (not the index of the provided dataframe) index_columns = [] # --- Index --- From ec50279229aeacff6562ef6405446660b5f6fb6f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:33:15 +0200 Subject: [PATCH 25/67] Simplify code --- audformat/core/table.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index ac0be0e8..815f7f4a 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1099,12 +1099,8 @@ def _save_csv(self, path: str): df.to_csv(fp, encoding="utf-8") def _save_parquet(self, path: str): - # Load table before opening PARQUET file - # to avoid creating a PARQUET file - # that is newer than the PKL file - df = self.df # loads table table = pa.Table.from_pandas( - df.reset_index(), + self.df.reset_index(), preserve_index=False, # TODO: check if faster when providing schema? # schema=self._schema, From f6820ea3fbea869b7716fcacb7331b14fdcca63a Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:47:31 +0200 Subject: [PATCH 26/67] Simplify code --- audformat/core/table.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 815f7f4a..5eff43c1 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1099,12 +1099,7 @@ def _save_csv(self, path: str): df.to_csv(fp, encoding="utf-8") def _save_parquet(self, path: str): - table = pa.Table.from_pandas( - self.df.reset_index(), - preserve_index=False, - # TODO: check if faster when providing schema? - # schema=self._schema, - ) + table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) parquet.write_table(table, path) def _save_pickled(self, path: str): From fe50e53e441f566ceb62b46179e2bae1e120fd7f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 12:18:38 +0200 Subject: [PATCH 27/67] Add test for md5sum of parquet file --- tests/test_table.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_table.py b/tests/test_table.py index d4ece3d9..1dc5addc 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1208,6 +1208,35 @@ def test_map(table, map): pd.testing.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "table_id, expected_md5sum", + [ + ("files", "a856aef8ec9d5e4b1752a13ad68cc0c2"), + ], +) +def test_parquet_reproducibility(tmpdir, table_id, expected_md5sum): + r"""Test reproducibility of binary PARQUET files. + + When storing the same dataframe + to different PARQUET files, + those files should have an identical + MD5sum, + which should also be reproducible + across different pandas and pyarrow versions. 
+ + """ + db = audformat.testing.create_db() + path_wo_ext = audeer.path(tmpdir, table_id) + path = f"{path_wo_ext}.parquet" + db[table_id].save(path_wo_ext, storage_format="parquet") + assert audeer.md5(path) == expected_md5sum + # Repeat writing after loading table + db[table_id].load(path_wo_ext) + os.remove(path) + db[table_id].save(path_wo_ext, storage_format="parquet") + assert audeer.md5(path) == expected_md5sum + + @pytest.mark.parametrize( "files", [ From f9d564e1d15865eec6ac8c8d7dccdd19725ded02 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 12:48:05 +0200 Subject: [PATCH 28/67] Switch back to snappy compression --- audformat/core/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 5eff43c1..0c4b1b73 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1100,7 +1100,7 @@ def _save_csv(self, path: str): def _save_parquet(self, path: str): table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) - parquet.write_table(table, path) + parquet.write_table(table, path, compression="snappy") def _save_pickled(self, path: str): self.df.to_pickle( From c53d8cc99e495cb6b48072115a1598f818a1fd62 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 12:48:52 +0200 Subject: [PATCH 29/67] Fix linter --- audformat/core/table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 0c4b1b73..27aaa2be 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -17,7 +17,6 @@ from audformat.core.column import Column from audformat.core.common import HeaderBase from audformat.core.common import HeaderDict -from audformat.core.common import to_audformat_dtype from audformat.core.errors import BadIdError from audformat.core.index import filewise_index from audformat.core.index import index_type From 0636a302cd73acc99b5e063caddccb16c2347391 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 14:23:40 +0200 Subject: [PATCH 30/67] Store hash inside parquet file --- audformat/core/table.py | 32 ++++++++++++++++++++++++++++++++ tests/test_table.py | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 27aaa2be..be9b030f 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1099,6 +1099,38 @@ def _save_csv(self, path: str): def _save_parquet(self, path: str): table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) + + # Add hash of dataframe + # to the metadata, + # which pyarrow stores inside the schema. + # See https://stackoverflow.com/a/58978449 + try: + metadata = {"hash": utils.hash(self.df)} + except TypeError: + # Levels/columns with dtype "object" might not be hashable, + # e.g. when storing numpy arrays. + # We convert them to strings in this case. 
+ # + # Index + df = self.df.copy() + update_index_dtypes = { + level: "string" + for level, dtype in self._levels_and_dtypes.items() + if dtype == define.DataType.OBJECT + } + df.index = utils.set_index_dtypes(df.index, update_index_dtypes) + # Columns + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = self.db.schemes[column.scheme_id] + if scheme.dtype == define.DataType.OBJECT: + df[column_id] = df[column_id].astype("string") + else: + # No scheme defaults to `object` dtype + df[column_id] = df[column_id].astype("string") + metadata = {"hash": utils.hash(df)} + + table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) parquet.write_table(table, path, compression="snappy") def _save_pickled(self, path: str): diff --git a/tests/test_table.py b/tests/test_table.py index 1dc5addc..2800cf0d 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1,9 +1,11 @@ import os +import random import re import typing import numpy as np import pandas as pd +import pyarrow.parquet as parquet import pytest import audeer @@ -1209,32 +1211,48 @@ def test_map(table, map): @pytest.mark.parametrize( - "table_id, expected_md5sum", + "table_id, expected_hash", [ - ("files", "a856aef8ec9d5e4b1752a13ad68cc0c2"), + ("files", "-4778271914368537359"), + ("segments", "6154135801036965154"), + ("misc", "8941499293930597709"), ], ) -def test_parquet_reproducibility(tmpdir, table_id, expected_md5sum): +def test_parquet_reproducibility(tmpdir, table_id, expected_hash): r"""Test reproducibility of binary PARQUET files. When storing the same dataframe to different PARQUET files, - those files should have an identical - MD5sum, - which should also be reproducible - across different pandas and pyarrow versions. + the files will slightly vary + and have different MD5 sums. + + To provide a reproducible hash, + in order to judge if a table has changed, + we calculate the hash of the table + and store it in the metadata + of the schema + of a the table. 
""" + random.seed(1) # ensure the same random table values are created db = audformat.testing.create_db() + + # Check that the output of audfromat.utils.hash() does not change + assert audformat.utils.hash(db[table_id].df) == expected_hash + + # Write to PARQUET file and check if correct hash is stored path_wo_ext = audeer.path(tmpdir, table_id) path = f"{path_wo_ext}.parquet" db[table_id].save(path_wo_ext, storage_format="parquet") - assert audeer.md5(path) == expected_md5sum - # Repeat writing after loading table + metadata = parquet.read_schema(path).metadata + assert metadata[b"hash"].decode() == expected_hash + + # Load table from PARQUET file, and overwrite it db[table_id].load(path_wo_ext) os.remove(path) db[table_id].save(path_wo_ext, storage_format="parquet") - assert audeer.md5(path) == expected_md5sum + metadata = parquet.read_schema(path).metadata + assert metadata[b"hash"].decode() == expected_hash @pytest.mark.parametrize( From 77eb826df1dd6c9f41cc71af25a9c5e94d908029 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 14:34:19 +0200 Subject: [PATCH 31/67] Fix code coverage --- tests/test_table.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_table.py b/tests/test_table.py index 2800cf0d..215ae8a0 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1506,6 +1506,8 @@ def test_save_and_load(tmpdir, storage_format): db["multi-misc"]["arrays"].set([np.array([0, 1]), np.array([2, 3])]) db["multi-misc"]["lists"] = audformat.Column(scheme_id="object") db["multi-misc"]["lists"].set([[0, 1], [2, 3]]) + db["multi-misc"]["no-scheme"] = audformat.Column() + db["multi-misc"]["no-scheme"].set([0, 1]) for table_id in list(db): expected_df = db[table_id].get() From 4a54cb008a6951c3267e65b733284901b170987c Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 15:41:51 +0200 Subject: [PATCH 32/67] Stay with CSV as default table format --- audformat/core/database.py | 2 +- audformat/core/table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/audformat/core/database.py b/audformat/core/database.py index 5eb72e68..0a84447a 100644 --- a/audformat/core/database.py +++ b/audformat/core/database.py @@ -979,7 +979,7 @@ def save( r"""Save database to disk. Creates a header ``/.yaml`` - and for every table a file ``/..[csv,pkl]``. + and for every table a file ``/..[csv,parquet,pkl]``. Existing files will be overwritten. If ``update_other_formats`` is provided, diff --git a/audformat/core/table.py b/audformat/core/table.py index be9b030f..cf3f0fcd 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -578,7 +578,7 @@ def save( self, path: str, *, - storage_format: str = define.TableStorageFormat.PARQUET, + storage_format: str = define.TableStorageFormat.CSV, update_other_formats: bool = True, ): r"""Save table data to disk. 
From 13a7769474c03e03c9bd437653134ad970f424fa Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 10:42:15 +0200 Subject: [PATCH 33/67] Test pyarrow==15.0.2 --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2060e9bd..fe51ddb3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,6 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt + pip install "pyarrow==15.0.2" - name: Test with pytest run: | From 6b07a24e661d13f41b3d6fdbf1dce7b6fec65376 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 10:50:26 +0200 Subject: [PATCH 34/67] Test pyarrow==14.0.2 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fe51ddb3..5455be87 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==15.0.2" + pip install "pyarrow==14.0.2" - name: Test with pytest run: | From 563a892ecbdae820809094b6e484f452a3d43598 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 10:54:18 +0200 Subject: [PATCH 35/67] Test pyarrow==13.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5455be87..1a2ca341 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==14.0.2" + pip install "pyarrow==13.0" - name: Test with pytest run: | From 4b451ef64619b33b2dbfc2623f3d367920f1393e Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 10:57:31 +0200 Subject: [PATCH 36/67] Test pyarrow==12.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1a2ca341..b18e4863 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==13.0" + pip install "pyarrow==12.0" - name: Test with pytest run: | From 63188ae16e0f34a3c704cac160f5f05951cf1037 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:00:45 +0200 Subject: [PATCH 37/67] Test pyarrow==11.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b18e4863..49c7d3f1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==12.0" + pip install "pyarrow==11.0" - name: Test with pytest run: | From e2eee7fcade1115463a2c7050f7a8c31596ffea2 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:03:29 +0200 Subject: [PATCH 38/67] Test pyarrow==10.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 49c7d3f1..95f038a5 100644 
--- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==11.0" + pip install "pyarrow==10.0" - name: Test with pytest run: | From bf8dd5998a6f0a77ad63df20154e267085e7b1a4 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:06:11 +0200 Subject: [PATCH 39/67] Test pyarrow==10.0.1 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 95f038a5..20f6abc0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==10.0" + pip install "pyarrow==10.0.1" - name: Test with pytest run: | From 83cac4f0464d498b8d0bd1f453f94b294516d003 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:09:45 +0200 Subject: [PATCH 40/67] Require pyarrow>=10.0.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 14ad2128..b007b679 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ 'iso-639', 'iso3166', 'oyaml', - 'pyarrow', + 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', 'pandas >=2.1.0', ] From c78da845add0bd12ee26bbfd1770892d60b59634 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:29:37 +0200 Subject: [PATCH 41/67] Test pandas<2.1.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 20f6abc0..1e12c40d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==10.0.1" + pip install "pandas<2.1.0" - name: Test with pytest run: | From 263f97026d556f75104b4ced538d60440ce5b4fd Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:33:46 +0200 Subject: [PATCH 42/67] Add explanations for requirements --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b007b679..327844de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.9' +requires-python = '>=3.9' # pandas >=2.1.0 dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', @@ -37,7 +37,7 @@ dependencies = [ 'oyaml', 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', - 'pandas >=2.1.0', + 'pandas >=2.1.0', # support in timedelta ] # Get version dynamically from git # (needs setuptools_scm tools config below) From d51d01db2f49e723e0135562d92dc8d03a203205 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:40:07 +0200 Subject: [PATCH 43/67] Add test using minimum pip requirements --- .github/workflows/test.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1e12c40d..dcfede44 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,11 +14,15 @@ jobs: matrix: os: [ ubuntu-20.04, windows-latest, macOS-latest ] 
python-version: [ '3.10' ] + requirements: [ 'newest' ] include: - os: ubuntu-latest python-version: '3.9' - os: ubuntu-latest python-version: '3.11' + - os: ubuntu-latest + python-version: '3.9' + requirements: 'minimum' steps: - uses: actions/checkout@v4 @@ -47,7 +51,15 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pandas<2.1.0" + + - name: Downgrade to minimum dependencies + run: | + pip install "audeer==2.0.0" + pip install "audiofile>=0.4.0" + pip install "pandas==2.1.0" + pip install "pyarrow==10.0.1" + pip install "pyyaml==5.4.1" + if: matrix.requirements == 'minimum' - name: Test with pytest run: | From f889b75ab9add2e4167b3123ae98d53b630d5d87 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:55:10 +0200 Subject: [PATCH 44/67] Fix alphabetical order of requirements --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 327844de..6e4bc361 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,9 +35,9 @@ dependencies = [ 'iso-639', 'iso3166', 'oyaml', + 'pandas >=2.1.0', # support in timedelta 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', - 'pandas >=2.1.0', # support in timedelta ] # Get version dynamically from git # (needs setuptools_scm tools config below) From 96df9ac54b0933e4d63c82c2e27d5e2ad537c358 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:55:25 +0200 Subject: [PATCH 45/67] Enhance test matrix definition --- .github/workflows/test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index dcfede44..992a7b4b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,7 +14,6 @@ jobs: matrix: os: [ ubuntu-20.04, windows-latest, macOS-latest ] python-version: [ '3.10' ] - requirements: [ 'newest' ] include: - os: ubuntu-latest python-version: '3.9' From f37de7e51e63c208c5c999133948d61e71ff430d Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:55:50 +0200 Subject: [PATCH 46/67] Debug failing test --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 992a7b4b..c5e3bb18 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -53,11 +53,11 @@ jobs: - name: Downgrade to minimum dependencies run: | - pip install "audeer==2.0.0" - pip install "audiofile>=0.4.0" + # pip install "audeer==2.0.0" + # pip install "audiofile>=0.4.0" pip install "pandas==2.1.0" pip install "pyarrow==10.0.1" - pip install "pyyaml==5.4.1" + # pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' - name: Test with pytest From 17ea1d9d524f9118aee57346bc5b41daac689ebd Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 16:28:08 +0200 Subject: [PATCH 47/67] Test different hash method --- audformat/core/table.py | 64 +++++++++++++++++++++++++---------------- tests/test_table.py | 17 ++++++++--- 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index cf3f0fcd..5489ba14 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1,6 +1,8 @@ from __future__ import annotations # allow typing without string import copy +import hashlib +import io import os import pickle import typing @@ -1100,35 +1102,47 @@ def _save_csv(self, path: str): def _save_parquet(self, path: 
str): table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) + # audformat.utils.hash() cannot be used due to: + # * https://github.com/audeering/audformat/issues/434 + # * https://github.com/audeering/audformat/issues/433 + # # Add hash of dataframe + # # to the metadata, + # # which pyarrow stores inside the schema. + # # See https://stackoverflow.com/a/58978449 + # try: + # metadata = {"hash": utils.hash(self.df)} + # except TypeError: + # # Levels/columns with dtype "object" might not be hashable, + # # e.g. when storing numpy arrays. + # # We convert them to strings in this case. + # # + # # Index + # df = self.df.copy() + # update_index_dtypes = { + # level: "string" + # for level, dtype in self._levels_and_dtypes.items() + # if dtype == define.DataType.OBJECT + # } + # df.index = utils.set_index_dtypes(df.index, update_index_dtypes) + # # Columns + # for column_id, column in self.columns.items(): + # if column.scheme_id is not None: + # scheme = self.db.schemes[column.scheme_id] + # if scheme.dtype == define.DataType.OBJECT: + # df[column_id] = df[column_id].astype("string") + # else: + # # No scheme defaults to `object` dtype + # df[column_id] = df[column_id].astype("string") + # metadata = {"hash": utils.hash(df)} + # Add hash of dataframe # to the metadata, # which pyarrow stores inside the schema. # See https://stackoverflow.com/a/58978449 - try: - metadata = {"hash": utils.hash(self.df)} - except TypeError: - # Levels/columns with dtype "object" might not be hashable, - # e.g. when storing numpy arrays. - # We convert them to strings in this case. - # - # Index - df = self.df.copy() - update_index_dtypes = { - level: "string" - for level, dtype in self._levels_and_dtypes.items() - if dtype == define.DataType.OBJECT - } - df.index = utils.set_index_dtypes(df.index, update_index_dtypes) - # Columns - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - scheme = self.db.schemes[column.scheme_id] - if scheme.dtype == define.DataType.OBJECT: - df[column_id] = df[column_id].astype("string") - else: - # No scheme defaults to `object` dtype - df[column_id] = df[column_id].astype("string") - metadata = {"hash": utils.hash(df)} + buffer = io.BytesIO() + self.df.to_parquet(buffer) + hash_df = hashlib.sha256(buffer.getbuffer()).hexdigest() + metadata = {"hash": hash_df} table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) parquet.write_table(table, path, compression="snappy") diff --git a/tests/test_table.py b/tests/test_table.py index 215ae8a0..fb8a03ac 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1213,9 +1213,18 @@ def test_map(table, map): @pytest.mark.parametrize( "table_id, expected_hash", [ - ("files", "-4778271914368537359"), - ("segments", "6154135801036965154"), - ("misc", "8941499293930597709"), + ( + "files", + "b079f9c2331d924a0388dde079cde55c7dcf6bf2bae851d77dc5cba5b33c31e1", + ), + ( + "segments", + "741e139f7adae5199539ec8260f3a55a868038865a3f5a385ea172a5ca72960b", + ), + ( + "misc", + "cb09eb7d3adaf7d45dfff0606c6ab61a1a03333aa1b8351febbba20d8c22a63d", + ), ], ) def test_parquet_reproducibility(tmpdir, table_id, expected_hash): @@ -1238,7 +1247,7 @@ def test_parquet_reproducibility(tmpdir, table_id, expected_hash): db = audformat.testing.create_db() # Check that the output of audfromat.utils.hash() does not change - assert audformat.utils.hash(db[table_id].df) == expected_hash + # assert audformat.utils.hash(db[table_id].df) == expected_hash # Write to PARQUET file and check if 
correct hash is stored path_wo_ext = audeer.path(tmpdir, table_id) From 495e09514de40721e21ee432c1ccf5011527c651 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 17:06:41 +0200 Subject: [PATCH 48/67] Use different hashing approach --- audformat/core/table.py | 48 +++++++++++++++++++++++++++++++++++------ tests/test_table.py | 6 +++--- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 5489ba14..f71be8c8 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -2,7 +2,6 @@ import copy import hashlib -import io import os import pickle import typing @@ -1100,7 +1099,9 @@ def _save_csv(self, path: str): df.to_csv(fp, encoding="utf-8") def _save_parquet(self, path: str): - table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) + df = self.df.reset_index() + + table = pa.Table.from_pandas(df, preserve_index=False) # audformat.utils.hash() cannot be used due to: # * https://github.com/audeering/audformat/issues/434 @@ -1138,11 +1139,44 @@ def _save_parquet(self, path: str): # Add hash of dataframe # to the metadata, # which pyarrow stores inside the schema. - # See https://stackoverflow.com/a/58978449 - buffer = io.BytesIO() - self.df.to_parquet(buffer) - hash_df = hashlib.sha256(buffer.getbuffer()).hexdigest() - metadata = {"hash": hash_df} + # See https://stackoverflow.com/a/58978449. + # + # The hashing method was suggested at + # https://github.com/pandas-dev/pandas/issues/46705#issuecomment-1094123442 + # as pandas.util.hash_pandas_object() + # ignores column and index names + # buffer = io.BytesIO() + # self.df.to_parquet(buffer) + # hash_df = hashlib.sha256(buffer.getbuffer()).hexdigest() + # metadata = {"hash": hash_df} + schema_str = table.schema.to_string( + show_field_metadata=False, + show_schema_metadata=False, + ) + try: + hash_data = utils.hash(df) + except TypeError: + # Levels/columns with dtype "object" might not be hashable, + # e.g. when storing numpy arrays. + # We convert them to strings in this case. 
+ # + # Index + for level, dtype in self._levels_and_dtypes.items(): + if dtype == define.DataType.OBJECT: + df[level] = df[level].astype("string") + # Columns + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = self.db.schemes[column.scheme_id] + if scheme.dtype == define.DataType.OBJECT: + df[column_id] = df[column_id].astype("string") + else: + # No scheme defaults to `object` dtype + df[column_id] = df[column_id].astype("string") + hash_data = utils.hash(df) + + hash_table = hashlib.sha256((hash_data + schema_str).encode()).hexdigest() + metadata = {"hash": hash_table} table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) parquet.write_table(table, path, compression="snappy") diff --git a/tests/test_table.py b/tests/test_table.py index fb8a03ac..45c5cf71 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1215,15 +1215,15 @@ def test_map(table, map): [ ( "files", - "b079f9c2331d924a0388dde079cde55c7dcf6bf2bae851d77dc5cba5b33c31e1", + "a6031ff402141834ec9ca3886e8672261a2671b534aaae798cf5918f12b9db14", ), ( "segments", - "741e139f7adae5199539ec8260f3a55a868038865a3f5a385ea172a5ca72960b", + "8bb0c5da4aaf1c4b145361a1542ebd2f3857fabc6fdc3cf80deba1307109f5dc", ), ( "misc", - "cb09eb7d3adaf7d45dfff0606c6ab61a1a03333aa1b8351febbba20d8c22a63d", + "ecc24f9ab8c25995017396f363987990d7421507532ee78da57cab0ca2e4b680", ), ], ) From f374fe0aa0518b1e89b63a2824176e4d8ac6fffb Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 14 Jun 2024 11:46:20 +0200 Subject: [PATCH 49/67] Require pandas>=2.2.0 and fix hashes --- .github/workflows/test.yml | 2 +- audformat/core/table.py | 76 ++++++++++++++------------------------ pyproject.toml | 4 +- tests/test_table.py | 6 +-- 4 files changed, 33 insertions(+), 55 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c5e3bb18..e728034b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,7 +55,7 @@ jobs: run: | # pip install "audeer==2.0.0" # pip install "audiofile>=0.4.0" - pip install "pandas==2.1.0" + pip install "pandas==2.2.0" pip install "pyarrow==10.0.1" # pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' diff --git a/audformat/core/table.py b/audformat/core/table.py index f71be8c8..a78529cc 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1103,67 +1103,44 @@ def _save_parquet(self, path: str): table = pa.Table.from_pandas(df, preserve_index=False) - # audformat.utils.hash() cannot be used due to: - # * https://github.com/audeering/audformat/issues/434 - # * https://github.com/audeering/audformat/issues/433 - # # Add hash of dataframe - # # to the metadata, - # # which pyarrow stores inside the schema. - # # See https://stackoverflow.com/a/58978449 - # try: - # metadata = {"hash": utils.hash(self.df)} - # except TypeError: - # # Levels/columns with dtype "object" might not be hashable, - # # e.g. when storing numpy arrays. - # # We convert them to strings in this case. 
- # # - # # Index - # df = self.df.copy() - # update_index_dtypes = { - # level: "string" - # for level, dtype in self._levels_and_dtypes.items() - # if dtype == define.DataType.OBJECT - # } - # df.index = utils.set_index_dtypes(df.index, update_index_dtypes) - # # Columns - # for column_id, column in self.columns.items(): - # if column.scheme_id is not None: - # scheme = self.db.schemes[column.scheme_id] - # if scheme.dtype == define.DataType.OBJECT: - # df[column_id] = df[column_id].astype("string") - # else: - # # No scheme defaults to `object` dtype - # df[column_id] = df[column_id].astype("string") - # metadata = {"hash": utils.hash(df)} - # Add hash of dataframe # to the metadata, # which pyarrow stores inside the schema. # See https://stackoverflow.com/a/58978449. # - # The hashing method was suggested at - # https://github.com/pandas-dev/pandas/issues/46705#issuecomment-1094123442 - # as pandas.util.hash_pandas_object() - # ignores column and index names - # buffer = io.BytesIO() - # self.df.to_parquet(buffer) - # hash_df = hashlib.sha256(buffer.getbuffer()).hexdigest() - # metadata = {"hash": hash_df} + # This allows us to track if a PARQUET file changes over time. + # We cannot rely on md5 sums of the file, + # as the file is written in a non-deterministic way. + table_hash = hashlib.md5() + + # Hash of schema (columns + dtypes) schema_str = table.schema.to_string( + # schema.metadata contains pandas related information, + # and the used pyarrow and pandas version, + # and needs to be excluded show_field_metadata=False, show_schema_metadata=False, ) + schema_hash = hashlib.md5(schema_str.encode()) + table_hash.update(schema_hash.digest()) + + # Hash data try: - hash_data = utils.hash(df) + data_hash = utils.hash(self.df) except TypeError: # Levels/columns with dtype "object" might not be hashable, # e.g. when storing numpy arrays. # We convert them to strings in this case. 
- # + # Index - for level, dtype in self._levels_and_dtypes.items(): - if dtype == define.DataType.OBJECT: - df[level] = df[level].astype("string") + df = self.df.copy() + update_index_dtypes = { + level: "string" + for level, dtype in self._levels_and_dtypes.items() + if dtype == define.DataType.OBJECT + } + df.index = utils.set_index_dtypes(df.index, update_index_dtypes) + # Columns for column_id, column in self.columns.items(): if column.scheme_id is not None: @@ -1173,12 +1150,13 @@ def _save_parquet(self, path: str): else: # No scheme defaults to `object` dtype df[column_id] = df[column_id].astype("string") - hash_data = utils.hash(df) + data_hash = utils.hash(df) - hash_table = hashlib.sha256((hash_data + schema_str).encode()).hexdigest() - metadata = {"hash": hash_table} + table_hash.update(data_hash.encode()) + metadata = {"hash": table_hash.hexdigest()} table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) + parquet.write_table(table, path, compression="snappy") def _save_pickled(self, path: str): diff --git a/pyproject.toml b/pyproject.toml index 6e4bc361..b0f45140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,14 +28,14 @@ classifiers = [ 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.9' # pandas >=2.1.0 +requires-python = '>=3.9' # pandas >=2.2.0 dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', 'iso-639', 'iso3166', 'oyaml', - 'pandas >=2.1.0', # support in timedelta + 'pandas >=2.2.0', # hash values, see https://github.com/pandas-dev/pandas/issues/58999 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', ] diff --git a/tests/test_table.py b/tests/test_table.py index 45c5cf71..57539e00 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1215,15 +1215,15 @@ def test_map(table, map): [ ( "files", - "a6031ff402141834ec9ca3886e8672261a2671b534aaae798cf5918f12b9db14", + "4d0295654694751bdcd12be86b89b73e", ), ( "segments", - "8bb0c5da4aaf1c4b145361a1542ebd2f3857fabc6fdc3cf80deba1307109f5dc", + "d2a9b84d03abde24ae84cf647a019b71", ), ( "misc", - "ecc24f9ab8c25995017396f363987990d7421507532ee78da57cab0ca2e4b680", + "6b6faecc836354bd89472095c1fa746a", ), ], ) From 18e3ada5d8be7f60964a808c2caaadf3c5c5b400 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 14 Jun 2024 11:49:22 +0200 Subject: [PATCH 50/67] CI: re-enable all minimal requriements --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e728034b..8411bb10 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -53,11 +53,11 @@ jobs: - name: Downgrade to minimum dependencies run: | - # pip install "audeer==2.0.0" - # pip install "audiofile>=0.4.0" + pip install "audeer==2.0.0" + pip install "audiofile>=0.4.0" pip install "pandas==2.2.0" pip install "pyarrow==10.0.1" - # pip install "pyyaml==5.4.1" + pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' - name: Test with pytest From bc0c68fb501020259cee71f2a99a2388684ff494 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 14 Jun 2024 14:37:48 +0200 Subject: [PATCH 51/67] Hashing algorithm to respect row order --- audformat/core/table.py | 133 +++++++++++++++++++++++++--------------- tests/test_table.py | 104 +++++++++++++++++++++++++++++-- 2 files changed, 184 insertions(+), 53 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index a78529cc..d0621c16 100644 --- 
a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1099,61 +1099,36 @@ def _save_csv(self, path: str): df.to_csv(fp, encoding="utf-8") def _save_parquet(self, path: str): - df = self.df.reset_index() + r"""Save table as PARQUET file. - table = pa.Table.from_pandas(df, preserve_index=False) + A PARQUET file is written in a non-deterministic way, + and we cannot track changes by its MD5 sum. + To make changes trackable, + we store a hash in its metadata. - # Add hash of dataframe - # to the metadata, - # which pyarrow stores inside the schema. - # See https://stackoverflow.com/a/58978449. - # - # This allows us to track if a PARQUET file changes over time. - # We cannot rely on md5 sums of the file, - # as the file is written in a non-deterministic way. - table_hash = hashlib.md5() + The hash is calculated from the pyarrow schema + (to track column names and data types) + and the pandas dataframes + (to track values and order or rows), + from which the PARQUET file is generated. - # Hash of schema (columns + dtypes) - schema_str = table.schema.to_string( - # schema.metadata contains pandas related information, - # and the used pyarrow and pandas version, - # and needs to be excluded - show_field_metadata=False, - show_schema_metadata=False, - ) - schema_hash = hashlib.md5(schema_str.encode()) - table_hash.update(schema_hash.digest()) + The hash of the PARQUET can then be read by:: - # Hash data - try: - data_hash = utils.hash(self.df) - except TypeError: - # Levels/columns with dtype "object" might not be hashable, - # e.g. when storing numpy arrays. - # We convert them to strings in this case. - - # Index - df = self.df.copy() - update_index_dtypes = { - level: "string" - for level, dtype in self._levels_and_dtypes.items() - if dtype == define.DataType.OBJECT - } - df.index = utils.set_index_dtypes(df.index, update_index_dtypes) - - # Columns - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - scheme = self.db.schemes[column.scheme_id] - if scheme.dtype == define.DataType.OBJECT: - df[column_id] = df[column_id].astype("string") - else: - # No scheme defaults to `object` dtype - df[column_id] = df[column_id].astype("string") - data_hash = utils.hash(df) + pyarrow.parquet.read_schema(path).metadata[b"hash"].decode() + + Args: + path: path, including file extension - table_hash.update(data_hash.encode()) + """ + table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) + + # Create hash of table + table_hash = hashlib.md5() + table_hash.update(_schema_hash(table)) + table_hash.update(_dataframe_hash(self.df)) + # Store in metadata of file, + # see https://stackoverflow.com/a/58978449 metadata = {"hash": table_hash.hexdigest()} table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) @@ -1855,6 +1830,46 @@ def _assert_table_index( ) +def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: + """Hash a dataframe. 
+ + The hash value takes into account: + + * index of dataframe + * values of the dataframe + * order of dataframe rows + + It does not consider: + + * column names of dataframe + * dtypes of dataframe + + Args: + df: dataframe + max_rows: if not ``None``, + the maximum number of rows, + taken into account for hashing + + Returns: + MD5 hash in bytes + + """ + # Idea for implementation from + # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410 + md5 = hashlib.md5() + if max_rows is not None and len(df) > max_rows: # pragma: nocover (not yet used) + df = df.sample(n=max_rows, random_state=0) + # Hash length, as we have to track if this changes + md5.update(str(len(df)).encode("utf-8")) + try: + md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8")) + except TypeError: + # Use pickle if pandas cannot hash the object, + # e.g. if it contains numpy.arrays. + md5.update(f"{pickle.dumps(df, pickle.HIGHEST_PROTOCOL)}".encode("utf-8")) + return md5.digest() + + def _maybe_convert_dtype_to_string( index: pd.Index, ) -> pd.Index: @@ -1877,3 +1892,23 @@ def _maybe_update_scheme( for scheme in table.db.schemes.values(): if table._id == scheme.labels: scheme.replace_labels(table._id) + + +def _schema_hash(table: pa.Table) -> bytes: + r"""Hash pyarrow table schema. + + Args: + table: pyarrow table + + Returns: + MD5 hash in bytes + + """ + schema_str = table.schema.to_string( + # schema.metadata contains pandas related information, + # and the used pyarrow and pandas version, + # and needs to be excluded + show_field_metadata=False, + show_schema_metadata=False, + ) + return hashlib.md5(schema_str.encode()).digest() diff --git a/tests/test_table.py b/tests/test_table.py index 57539e00..4b367660 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1210,24 +1210,120 @@ def test_map(table, map): pd.testing.assert_frame_equal(result, expected) +@pytest.mark.parametrize("storage_format", ["csv", "parquet"]) +def test_hash(tmpdir, storage_format): + r"""Test if PARQUET file hash changes with table. + + We store a MD5 sum associated with the dataframe, + that was used to create the file, + in the metadata of the PARQUET file. + Those MD5 sum is supposed to change, + if any of the table rows, (index) columns changes, + the data type of the entries changes, + or the name of a column changes. 
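For illustration only, this behaviour can be sketched outside the test suite (database name, scheme, and values are made up; the stored_hash() helper exists only in this sketch)::

    import audeer
    import audformat
    from pyarrow import parquet

    def stored_hash(path):
        return parquet.read_schema(path).metadata[b"hash"].decode()

    db = audformat.Database("mydb")
    db.schemes["int"] = audformat.Scheme("int")
    db["table"] = audformat.Table(audformat.filewise_index(["f1", "f2"]))
    db["table"]["column"] = audformat.Column(scheme_id="int")
    db["table"]["column"].set([0, 1])
    db_root = audeer.mkdir("./db")
    db.save(db_root, storage_format="parquet")
    hash_before = stored_hash(audeer.path(db_root, "db.table.parquet"))

    # Changing a single value is expected to change the stored hash,
    # whereas re-saving an identical table keeps it unchanged
    db["table"]["column"].set([1, 1])
    db.save(db_root, storage_format="parquet")
    assert stored_hash(audeer.path(db_root, "db.table.parquet")) != hash_before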
+ + Args: + tmpdir: tmpdir fixture + storage_format: storage format of table file + + """ + + def get_md5(path: str) -> str: + r"""Get MD5 sum for table file.""" + ext = audeer.file_extension(path) + if ext == "csv": + md5 = audeer.md5(path) + elif ext == "parquet": + md5 = parquet.read_schema(path).metadata[b"hash"].decode() + return md5 + + db_root = audeer.path(tmpdir, "db") + db = audformat.Database("mydb") + db.schemes["int"] = audformat.Scheme("int") + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + + table_file = audeer.path(db_root, f"db.table.{storage_format}") + assert os.path.exists(table_file) + md5 = get_md5(table_file) + + # Replace table with identical copy + table = db["table"].copy() + db["table"] = table + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) == md5 + + # Change order of rows + index = audformat.segmented_index(["f2", "f1"], [1, 0], [2, 1]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([1, 0]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + # Change index entry + index = audformat.segmented_index(["f1", "f1"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + # Change data entry + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([1, 0]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + # Change column name + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["col"] = audformat.Column(scheme_id="int") + db["table"]["col"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + # Change order of columns + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["col1"] = audformat.Column(scheme_id="int") + db["table"]["col1"].set([0, 1]) + db["table"]["col2"] = audformat.Column(scheme_id="int") + db["table"]["col2"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + md5 = get_md5(table_file) + db["table"] = audformat.Table(index) + db["table"]["col2"] = audformat.Column(scheme_id="int") + db["table"]["col2"].set([0, 1]) + db["table"]["col1"] = audformat.Column(scheme_id="int") + db["table"]["col1"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + @pytest.mark.parametrize( "table_id, expected_hash", [ ( "files", - "4d0295654694751bdcd12be86b89b73e", + "9caa6722e65a04ddbce1cda2238c9126", ), ( "segments", - "d2a9b84d03abde24ae84cf647a019b71", + "37c9d9dc4f937a6e97ec72a080055e49", ), ( "misc", - "6b6faecc836354bd89472095c1fa746a", + "3488c007d45b19e04e8fdbf000f0f04d", ), ], ) -def test_parquet_reproducibility(tmpdir, table_id, expected_hash): +def test_parquet_hash_reproducibility(tmpdir, table_id, expected_hash): r"""Test reproducibility of binary PARQUET files. 
When storing the same dataframe From 6c36e0aec9b45b5ab2eeccda58ffd87c87c26301 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 14 Jun 2024 14:49:21 +0200 Subject: [PATCH 52/67] Clean up tests --- tests/test_table.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_table.py b/tests/test_table.py index 4b367660..49ae61a6 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1342,9 +1342,6 @@ def test_parquet_hash_reproducibility(tmpdir, table_id, expected_hash): random.seed(1) # ensure the same random table values are created db = audformat.testing.create_db() - # Check that the output of audfromat.utils.hash() does not change - # assert audformat.utils.hash(db[table_id].df) == expected_hash - # Write to PARQUET file and check if correct hash is stored path_wo_ext = audeer.path(tmpdir, table_id) path = f"{path_wo_ext}.parquet" From 407aa912d93bfa89fe15819c31cf02183b29f310 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 11:27:48 +0200 Subject: [PATCH 53/67] Fix minimum install of audiofile --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8411bb10..210a685f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -54,7 +54,7 @@ jobs: - name: Downgrade to minimum dependencies run: | pip install "audeer==2.0.0" - pip install "audiofile>=0.4.0" + pip install "audiofile==0.4.0" pip install "pandas==2.2.0" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" From c9b576046774bdf47fca3a7c3e3d2dc4607df5ba Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 11:34:49 +0200 Subject: [PATCH 54/67] Fix docstring of Table.load() --- audformat/core/table.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index d0621c16..bc66b943 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -444,10 +444,10 @@ def load( ): r"""Load table data from disk. - Tables can be stored as PKL and/or CSV files to disk. - If both files are present + Tables are stored as CSV, PARQUET and/or PKL files to disk. + If the PKL file exists, it will load the PKL file - as long as its modification date is newer, + as long as its modification date is the newest, otherwise it will raise an error and ask to delete one of the files. @@ -456,7 +456,7 @@ def load( Raises: RuntimeError: if table file(s) are missing - RuntimeError: if CSV file is newer than PKL file + RuntimeError: if CSV or PARQUET file is newer than PKL file """ path = audeer.path(path) From 589da4b988ae063146035fc892e72dd44a0cf0ea Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 11:36:25 +0200 Subject: [PATCH 55/67] Fix docstring of Database.load() --- audformat/core/database.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/audformat/core/database.py b/audformat/core/database.py index 0a84447a..2772f0a4 100644 --- a/audformat/core/database.py +++ b/audformat/core/database.py @@ -1383,7 +1383,7 @@ def load( r"""Load database from disk. Expects a header ``/.yaml`` - and for every table a file ``/..[csv|pkl]`` + and for every table a file ``/..[csv|parquet|pkl]`` Media files should be located under ``root``. 
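A sketch of the error condition mentioned above (the directory name is illustrative, and the exact message may differ): if a CSV table file ends up newer than its PKL counterpart, loading the table data is expected to fail::

    import time

    import audeer
    import audformat
    import audformat.testing

    db = audformat.testing.create_db()
    db_root = audeer.mkdir("./db")
    db.save(db_root, storage_format="pkl")
    time.sleep(0.1)
    db.save(db_root, storage_format="csv", update_other_formats=False)

    # The CSV files are now newer than the PKL files,
    # so loading the table data should raise a RuntimeError
    audformat.Database.load(db_root, load_data=True)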
Args: @@ -1409,7 +1409,7 @@ def load( Raises: FileNotFoundError: if the database header file cannot be found under ``root`` - RuntimeError: if a CSV table file is newer + RuntimeError: if a CSV or PARQUET table file is newer than the corresponding PKL file """ From b0ee769975a683d0aafdbc035de2b29e72b70e02 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:17:20 +0200 Subject: [PATCH 56/67] Ensure correct order in time when storing tables --- audformat/core/table.py | 17 ++++++--- tests/test_table.py | 79 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 4 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index bc66b943..995e3bfc 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -599,25 +599,34 @@ def save( path = audeer.path(path) define.TableStorageFormat._assert_has_attribute_value(storage_format) + csv_file = f"{path}.{define.TableStorageFormat.CSV}" parquet_file = f"{path}.{define.TableStorageFormat.PARQUET}" pickle_file = f"{path}.{define.TableStorageFormat.PICKLE}" - csv_file = f"{path}.{define.TableStorageFormat.CSV}" - # Make sure the CSV|PARQUET file is always written first - # as it is expected to be older by load() + # Ensure the following storage order: + # 1. PARQUET file + # 2. CSV file + # 3. PKL file + # The PKl is expected to be the oldest by load(), + # the order of PARQUET and CSV file + # is only a convention for now. if storage_format == define.TableStorageFormat.PICKLE: if update_other_formats and os.path.exists(parquet_file): self._save_parquet(parquet_file) - elif update_other_formats and os.path.exists(csv_file): + if update_other_formats and os.path.exists(csv_file): self._save_csv(csv_file) self._save_pickled(pickle_file) if storage_format == define.TableStorageFormat.PARQUET: self._save_parquet(parquet_file) + if update_other_formats and os.path.exists(csv_file): + self._save_csv(csv_file) if update_other_formats and os.path.exists(pickle_file): self._save_pickled(pickle_file) if storage_format == define.TableStorageFormat.CSV: + if update_other_formats and os.path.exists(parquet_file): + self._save_parquet(parquet_file) self._save_csv(csv_file) if update_other_formats and os.path.exists(pickle_file): self._save_pickled(pickle_file) diff --git a/tests/test_table.py b/tests/test_table.py index 49ae61a6..c2fdcadb 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1,6 +1,7 @@ import os import random import re +import time import typing import numpy as np @@ -2122,3 +2123,81 @@ def test_update(table, overwrite, others): for column_id, column in other.columns.items(): assert column.scheme == table[column_id].scheme assert column.rater == table[column_id].rater + + +@pytest.mark.parametrize("update_other_formats", [True, False]) +@pytest.mark.parametrize( + "storage_format, existing_formats", + [ + ("csv", []), + ("csv", []), + ("csv", ["pkl"]), + ("csv", ["parquet", "pkl"]), + ("pkl", ["parquet"]), + ("pkl", ["csv"]), + ("pkl", ["parquet", "csv"]), + ("parquet", ["pkl"]), + ("parquet", ["csv"]), + ("parquet", ["pkl", "csv"]), + ], +) +def test_update_other_formats( + tmpdir, + storage_format, + existing_formats, + update_other_formats, +): + r"""Tests updating of other table formats. + + When a table is stored with `audformat.Table.save()` + as CSV, PARQUET, or PKL file, + a user might select + that all other existing file representations of the table + are updated as well. + E.g. 
if a PKL file of the same table exists, + and a user saves to a CSV file + with the argument `update_other_formats=True`, + it should write the table to the CSV and PKL file. + + """ + db = audformat.testing.create_db() + + table_id = "files" + table_file = audeer.path(tmpdir, "table") + + # Create existing table files and pause for a short time + old_mtime = {} + for ext in existing_formats: + db[table_id].save( + table_file, + storage_format=ext, + update_other_formats=False, + ) + old_mtime[ext] = os.path.getmtime(f"{table_file}.{ext}") + time.sleep(0.05) + + # Store table to requested format + db[table_id].save( + table_file, + storage_format=storage_format, + update_other_formats=update_other_formats, + ) + + # Collect mtimes of existing table files + mtime = {} + formats = existing_formats + [storage_format] + for ext in formats: + mtime[ext] = os.path.getmtime(f"{table_file}.{ext}") + + # Ensure mtimes are correct + if update_other_formats: + if "pickle" in formats and "csv" in formats: + assert mtime["pickle"] > mtime["csv"] + if "pickle" in formats and "parquet" in formats: + assert mtime["pickle"] > mtime["parquet"] + if "csv" in formats and "parquet" in formats: + assert mtime["csv"] > mtime["parquet"] + else: + for ext in existing_formats: + assert mtime[ext] == old_mtime[ext] + assert mtime[storage_format] > old_mtime[ext] From 1e167c13a39666b2f64508bc36808411fc87bf1a Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:19:46 +0200 Subject: [PATCH 57/67] Simplify comment --- audformat/core/table.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 995e3bfc..094779d3 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -847,9 +847,6 @@ def _levels_and_dtypes(self) -> typing.Dict[str, str]: # The returned dictionary is used # to infer index column names and dtypes # when reading CSV files. - # This means the names and dtypes cannot be inferred - # from the index itself, - # but need to be known before. raise NotImplementedError() # pragma: no cover def _load_csv(self, path: str): From 8ad8d742ed64a1218f4e878a4daf779694447b36 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:21:18 +0200 Subject: [PATCH 58/67] Add docstring to _load_pickle() --- audformat/core/table.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/audformat/core/table.py b/audformat/core/table.py index 094779d3..20cbbc29 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -896,6 +896,14 @@ def _load_parquet(self, path: str): self._df = df def _load_pickled(self, path: str): + r"""Load table from PKL file. + + The loaded table is stored under ``self._df``. + + Args: + path: path to table, including file extension + + """ # Older versions of audformat used xz compression # which produced smaller files, # but was slower. 
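The intended ordering can be illustrated with a small sketch (the file stem "./files" is arbitrary): when one format is saved with update_other_formats=True, the other existing representations are refreshed as well, and the PKL file ends up as the newest one::

    import os

    import audformat.testing

    db = audformat.testing.create_db()
    table = db["files"]

    table.save("./files", storage_format="parquet", update_other_formats=False)
    table.save("./files", storage_format="pkl", update_other_formats=False)
    table.save("./files", storage_format="csv", update_other_formats=True)

    # PARQUET is written first, then CSV, then PKL
    assert os.path.getmtime("./files.pkl") >= os.path.getmtime("./files.csv")
    assert os.path.getmtime("./files.csv") >= os.path.getmtime("./files.parquet")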
From 7b3a55801f9f24a0475dc0f7fd8c38e8c27306fd Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:23:58 +0200 Subject: [PATCH 59/67] Fix _save_parquet() docstring --- audformat/core/table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 20cbbc29..c1a4c221 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1122,11 +1122,11 @@ def _save_parquet(self, path: str): The hash is calculated from the pyarrow schema (to track column names and data types) - and the pandas dataframes + and the pandas dataframe (to track values and order or rows), from which the PARQUET file is generated. - The hash of the PARQUET can then be read by:: + The hash of the PARQUET file can then be read by:: pyarrow.parquet.read_schema(path).metadata[b"hash"].decode() From d414fe7ee8c073f4e7b19ca0cc0586bceae8a512 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:25:44 +0200 Subject: [PATCH 60/67] Improve comment in _dataframe_hash() --- audformat/core/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index c1a4c221..d3732660 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1873,7 +1873,7 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: md5 = hashlib.md5() if max_rows is not None and len(df) > max_rows: # pragma: nocover (not yet used) df = df.sample(n=max_rows, random_state=0) - # Hash length, as we have to track if this changes + # Hash length of dataframe, as we have to track if this changes md5.update(str(len(df)).encode("utf-8")) try: md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8")) From a90eaf4d54236735281347d5ba436e5444f1c0cc Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:28:18 +0200 Subject: [PATCH 61/67] Document arguments of test_table_update... --- tests/test_table.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_table.py b/tests/test_table.py index c2fdcadb..e3975d1b 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -2159,6 +2159,14 @@ def test_update_other_formats( with the argument `update_other_formats=True`, it should write the table to the CSV and PKL file. 
+ Args: + tmpdir: tmpdir fixture + storage_format: storage format of table + existing_formats: formats the table should be stored in + before saving to ``storage_format`` + update_other_formats: if tables specified in ``existing_formats`` + should be updated when saving ``storage_format`` + """ db = audformat.testing.create_db() From 2749ef9e7794f0358a9946e54822e78c5a6663eb Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:35:36 +0200 Subject: [PATCH 62/67] Relax test for table saving order --- tests/test_table.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_table.py b/tests/test_table.py index e3975d1b..5900d7f3 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -2200,11 +2200,11 @@ def test_update_other_formats( # Ensure mtimes are correct if update_other_formats: if "pickle" in formats and "csv" in formats: - assert mtime["pickle"] > mtime["csv"] + assert mtime["pickle"] >= mtime["csv"] if "pickle" in formats and "parquet" in formats: - assert mtime["pickle"] > mtime["parquet"] + assert mtime["pickle"] >= mtime["parquet"] if "csv" in formats and "parquet" in formats: - assert mtime["csv"] > mtime["parquet"] + assert mtime["csv"] >= mtime["parquet"] else: for ext in existing_formats: assert mtime[ext] == old_mtime[ext] From 3f21e3c41ae42cf8c37d01175bc82a5ea0b5fbea Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 08:52:13 +0200 Subject: [PATCH 63/67] Update audformat/core/table.py Co-authored-by: ChristianGeng --- audformat/core/table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index d3732660..3565ab4a 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -935,7 +935,6 @@ def _pyarrow_convert_dtypes( convert_all: bool = False, ) -> pd.DataFrame: r"""Convert dtypes that are not handled by pyarrow. - This adjusts dtypes in a dataframe, that could not be set correctly when converting to the dataframe From 2912f76f38052b63ad1b127787416c66c2cd3781 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 08:54:05 +0200 Subject: [PATCH 64/67] Revert "Update audformat/core/table.py" This reverts commit 3f21e3c41ae42cf8c37d01175bc82a5ea0b5fbea. --- audformat/core/table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/audformat/core/table.py b/audformat/core/table.py index 3565ab4a..d3732660 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -935,6 +935,7 @@ def _pyarrow_convert_dtypes( convert_all: bool = False, ) -> pd.DataFrame: r"""Convert dtypes that are not handled by pyarrow. 
+ This adjusts dtypes in a dataframe, that could not be set correctly when converting to the dataframe From c4c41ff0aace45f14e4e076e2806cc6a5a0a2b50 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 16:23:37 +0200 Subject: [PATCH 65/67] Use numpy representation for hashing (#436) * Use numpy representation for hashing * Enable tests and require pandas>=1.4.1 * Use numpy<2.0 in minimum test * Skip doctests in minimum * Require pandas>=2.1.0 * Require numpy<=2.0.0 in minimum test * Remove print statements * Fix numpy<2.0.0 for minimum test * Remove max_rows argument * Simplify code --- .github/workflows/test.yml | 5 +++-- audformat/core/table.py | 26 ++++++++++---------------- pyproject.toml | 4 ++-- tests/test_table.py | 6 +++--- 4 files changed, 18 insertions(+), 23 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 210a685f..f895bbf6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,7 +4,7 @@ on: push: branches: [ main ] pull_request: - branches: [ main ] + branches: [ main, pyarrow ] jobs: build: @@ -55,7 +55,8 @@ jobs: run: | pip install "audeer==2.0.0" pip install "audiofile==0.4.0" - pip install "pandas==2.2.0" + pip install "numpy<2.0.0" + pip install "pandas==2.1.0" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' diff --git a/audformat/core/table.py b/audformat/core/table.py index d3732660..30924953 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1844,7 +1844,7 @@ def _assert_table_index( ) -def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: +def _dataframe_hash(df: pd.DataFrame) -> bytes: """Hash a dataframe. The hash value takes into account: @@ -1860,27 +1860,21 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: Args: df: dataframe - max_rows: if not ``None``, - the maximum number of rows, - taken into account for hashing Returns: MD5 hash in bytes """ - # Idea for implementation from - # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410 md5 = hashlib.md5() - if max_rows is not None and len(df) > max_rows: # pragma: nocover (not yet used) - df = df.sample(n=max_rows, random_state=0) - # Hash length of dataframe, as we have to track if this changes - md5.update(str(len(df)).encode("utf-8")) - try: - md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8")) - except TypeError: - # Use pickle if pandas cannot hash the object, - # e.g. if it contains numpy.arrays. 
- md5.update(f"{pickle.dumps(df, pickle.HIGHEST_PROTOCOL)}".encode("utf-8")) + for _, y in df.reset_index().items(): + # Convert every column to a numpy array, + # and hash its string representation + if y.dtype == "Int64": + # Enforce consistent conversion to numpy.array + # for integers across different pandas versions + # (since pandas 2.2.x, Int64 is converted to float if it contains ) + y = y.astype("float") + md5.update(bytes(str(y.to_numpy()), "utf-8")) return md5.digest() diff --git a/pyproject.toml b/pyproject.toml index b0f45140..13c329eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,14 +28,14 @@ classifiers = [ 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.9' # pandas >=2.2.0 +requires-python = '>=3.9' # pandas >=2.1.0 dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', 'iso-639', 'iso3166', 'oyaml', - 'pandas >=2.2.0', # hash values, see https://github.com/pandas-dev/pandas/issues/58999 + 'pandas >=2.1.0', # for pyarrow -> timedelta conversion 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', ] diff --git a/tests/test_table.py b/tests/test_table.py index 5900d7f3..1af6bd85 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1312,15 +1312,15 @@ def get_md5(path: str) -> str: [ ( "files", - "9caa6722e65a04ddbce1cda2238c9126", + "a66a22ee4158e0e5100f1d797155ad81", ), ( "segments", - "37c9d9dc4f937a6e97ec72a080055e49", + "f69eb4a5d19da71e5da00a9b13beb3db", ), ( "misc", - "3488c007d45b19e04e8fdbf000f0f04d", + "331f79758b195cb9b7d0e8889e830eb2", ), ], ) From 8e85168c73db7b9c5cdca69b95b592fc922dac22 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 17:12:36 +0200 Subject: [PATCH 66/67] Use test class --- tests/test_table.py | 168 ++++++++++++++++++++++++-------------------- 1 file changed, 92 insertions(+), 76 deletions(-) diff --git a/tests/test_table.py b/tests/test_table.py index 1af6bd85..2b5536de 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1212,7 +1212,7 @@ def test_map(table, map): @pytest.mark.parametrize("storage_format", ["csv", "parquet"]) -def test_hash(tmpdir, storage_format): +class TestHash: r"""Test if PARQUET file hash changes with table. 
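A standalone sketch of this column-wise hashing (toy dataframe, not the audformat implementation itself) shows the detour over numpy and the Int64 workaround::

    import hashlib

    import pandas as pd

    df = pd.DataFrame({"column": pd.array([0, pd.NA], dtype="Int64")})

    md5 = hashlib.md5()
    for _, y in df.reset_index().items():
        if y.dtype == "Int64":
            # Force the same numpy conversion across pandas versions
            y = y.astype("float")
        md5.update(bytes(str(y.to_numpy()), "utf-8"))
    print(md5.hexdigest())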
We store a MD5 sum associated with the dataframe, @@ -1229,82 +1229,98 @@ def test_hash(tmpdir, storage_format): """ - def get_md5(path: str) -> str: + def db(self, tmpdir, storage_format): + r"""Create minimal database with scheme and table.""" + self.db_root = audeer.path(tmpdir, "db") + self.storage_format = storage_format + self.table_file = audeer.path(self.db_root, f"db.table.{storage_format}") + db = audformat.Database("mydb") + db.schemes["int"] = audformat.Scheme("int") + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + return db + + def md5(self) -> str: r"""Get MD5 sum for table file.""" - ext = audeer.file_extension(path) - if ext == "csv": - md5 = audeer.md5(path) - elif ext == "parquet": - md5 = parquet.read_schema(path).metadata[b"hash"].decode() - return md5 - - db_root = audeer.path(tmpdir, "db") - db = audformat.Database("mydb") - db.schemes["int"] = audformat.Scheme("int") - index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["column"] = audformat.Column(scheme_id="int") - db["table"]["column"].set([0, 1]) - db.save(db_root, storage_format=storage_format) - - table_file = audeer.path(db_root, f"db.table.{storage_format}") - assert os.path.exists(table_file) - md5 = get_md5(table_file) - - # Replace table with identical copy - table = db["table"].copy() - db["table"] = table - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) == md5 - - # Change order of rows - index = audformat.segmented_index(["f2", "f1"], [1, 0], [2, 1]) - db["table"] = audformat.Table(index) - db["table"]["column"] = audformat.Column(scheme_id="int") - db["table"]["column"].set([1, 0]) - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 - - # Change index entry - index = audformat.segmented_index(["f1", "f1"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["column"] = audformat.Column(scheme_id="int") - db["table"]["column"].set([0, 1]) - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 - - # Change data entry - index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["column"] = audformat.Column(scheme_id="int") - db["table"]["column"].set([1, 0]) - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 - - # Change column name - index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["col"] = audformat.Column(scheme_id="int") - db["table"]["col"].set([0, 1]) - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 - - # Change order of columns - index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["col1"] = audformat.Column(scheme_id="int") - db["table"]["col1"].set([0, 1]) - db["table"]["col2"] = audformat.Column(scheme_id="int") - db["table"]["col2"].set([0, 1]) - db.save(db_root, storage_format=storage_format) - md5 = get_md5(table_file) - db["table"] = audformat.Table(index) - db["table"]["col2"] = audformat.Column(scheme_id="int") - db["table"]["col2"].set([0, 1]) - db["table"]["col1"] = audformat.Column(scheme_id="int") - db["table"]["col1"].set([0, 1]) - 
db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 + if self.storage_format == "csv": + return audeer.md5(self.table_file) + elif self.storage_format == "parquet": + return parquet.read_schema(self.table_file).metadata[b"hash"].decode() + + def test_change_index(self, tmpdir, storage_format): + r"""Change table index.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + index = audformat.segmented_index(["f1", "f1"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() != md5 + + def test_change_column_name(self, tmpdir, storage_format): + r"""Change table column name.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["col"] = audformat.Column(scheme_id="int") + db["table"]["col"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() != md5 + + def test_change_column_order(self, tmpdir, storage_format): + r"""Change order of table columns.""" + db = self.db(tmpdir, storage_format) + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["col1"] = audformat.Column(scheme_id="int") + db["table"]["col1"].set([0, 1]) + db["table"]["col2"] = audformat.Column(scheme_id="int") + db["table"]["col2"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + md5 = self.md5() + db["table"] = audformat.Table(index) + db["table"]["col2"] = audformat.Column(scheme_id="int") + db["table"]["col2"].set([0, 1]) + db["table"]["col1"] = audformat.Column(scheme_id="int") + db["table"]["col1"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() != md5 + + def test_change_row_order(self, tmpdir, storage_format): + r"""Change order of table rows.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + index = audformat.segmented_index(["f2", "f1"], [1, 0], [2, 1]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([1, 0]) + db.save(self.db_root, storage_format=storage_format) + assert self.md5() != md5 + + def test_change_values(self, tmpdir, storage_format): + r"""Change table values.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([1, 0]) + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() != md5 + + def test_copy_table(self, tmpdir, storage_format): + r"""Replace table with identical copy.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + table = db["table"].copy() + db["table"] = table + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() == md5 @pytest.mark.parametrize( From 6a9e3d10dd7697eb8e4a40312404a97755326d5e Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 17:14:50 +0200 Subject: [PATCH 67/67] CI: remove pyarrow from branch to start test --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f895bbf6..9473ffc4 100644 --- 
a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches: [ main ]
   pull_request:
-    branches: [ main, pyarrow ]
+    branches: [ main ]
 
 jobs:
   build:
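Putting the pieces together (file stem arbitrary): re-saving an unchanged table is expected to reproduce the stored hash, even though the PARQUET bytes themselves may differ between writes::

    import audformat.testing
    from pyarrow import parquet

    db = audformat.testing.create_db()

    db["files"].save("./files", storage_format="parquet")
    hash_1 = parquet.read_schema("./files.parquet").metadata[b"hash"].decode()

    db["files"].save("./files", storage_format="parquet")
    hash_2 = parquet.read_schema("./files.parquet").metadata[b"hash"].decode()

    assert hash_1 == hash_2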