diff --git a/CHANGES.md b/CHANGES.md index ed95d8100..7b4e16078 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -3,6 +3,8 @@ ### 1.75 * Bugfix: pypandoc not rendering README correctly for PYPI * Bugfix: #744 get_info on an empty dataframe raises an exception + * Feature: Chunkstore: Removed duplication error when filtering by columns + * Feature: Chunkstore: Reduced memory footprint when reading data ### 1.74 (2019-02-28) * Bugfix: #712 Pandas deprecation warning in chunkstore serializer diff --git a/arctic/chunkstore/chunkstore.py b/arctic/chunkstore/chunkstore.py index 6d7198c48..f72f0c127 100644 --- a/arctic/chunkstore/chunkstore.py +++ b/arctic/chunkstore/chunkstore.py @@ -280,6 +280,7 @@ def read(self, symbol, chunk_range=None, filter_data=True, **kwargs): chunks[segments[0][SYMBOL]].append({DATA: chunk_data, METADATA: mdata}) skip_filter = not filter_data or chunk_range is None + kwargs['inplace'] = kwargs.get('inplace', True) if len(symbol) > 1: return {sym: deser(chunks[sym], **kwargs) if skip_filter else chunker.filter(deser(chunks[sym], **kwargs), chunk_range) for sym in symbol} diff --git a/arctic/serialization/numpy_arrays.py b/arctic/serialization/numpy_arrays.py index 60b810460..78445e3f0 100644 --- a/arctic/serialization/numpy_arrays.py +++ b/arctic/serialization/numpy_arrays.py @@ -193,7 +193,7 @@ def serialize(self, df): ret[METADATA][TYPE] = dtype return ret - def deserialize(self, data, columns=None): + def deserialize(self, data, columns=None, inplace=False): """ Deserializes SON to a DataFrame @@ -203,6 +203,8 @@ def deserialize(self, data, columns=None): columns: None, or list of strings optionally you can deserialize a subset of the data in the SON. Index columns are ALWAYS deserialized, and should not be specified + inplace: Convert and remove items from data in-place + this will modify data Returns ------- @@ -210,6 +212,8 @@ def deserialize(self, data, columns=None): """ if not data: return pd.DataFrame() + if not inplace: + data = data[:] meta = data[0][METADATA] if isinstance(data, list) else data[METADATA] index = INDEX in meta @@ -218,16 +222,19 @@ def deserialize(self, data, columns=None): if index: columns = columns[:] columns.extend(meta[INDEX]) - if len(columns) > len(set(columns)): - raise Exception("Duplicate columns specified, cannot de-serialize") + columns = list(set(columns)) if not isinstance(data, list): df = self.converter.objify(data, columns) else: - df = pd.concat([self.converter.objify(d, columns) for d in data], ignore_index=not index) + dfs = [] + while len(data): + dfs.append(self.converter.objify(data.pop(0), columns)) + df = pd.concat(dfs, ignore_index=not index) + del dfs if index: - df = df.set_index(meta[INDEX]) + df.set_index(meta[INDEX], inplace=True) if meta[TYPE] == 'series': return df[df.columns[0]] return df