From 384d99cdd9637f0dd62885b7150f8165d4505ba1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 26 Jun 2024 11:33:10 +0200 Subject: [PATCH] DOC: Table.save() store hash for parquet files (#446) * DOC: Table.save() store hash for parquet files * Provide code example * Update example * Mention that audb uses the hash * Discuss reasons why md5 sum differs --- audformat/core/table.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/audformat/core/table.py b/audformat/core/table.py index 30924953..3c9c21fe 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -586,6 +586,28 @@ def save( Existing files will be overwritten. + When using ``"parquet"`` as ``storage_format`` + a hash, + based on the content of the table, + is stored under the key ``b"hash"`` + in the metadata of the schema of the parquet file. + This provides a deterministic hash for the file, + as md5 sums of parquet files, + containing identical information, + often differ. + Reasons include factors like the library + that wrote the parquet file, + the chosen compression codec + and metadata written by the library. + + The hash can be accessed with ``pyarrow`` by:: + + pyarrow.parquet.read_schema(f"{path}.parquet").metadata[b"hash"].decode() + + The hash is used by :mod:`audb` + when publishing a database + to track changes of database files. + Args: path: file path without extension storage_format: storage format of table.