reduce chunkstore memory footprint #747

Open · wants to merge 1 commit into master
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -3,6 +3,8 @@
### 1.75
* Bugfix: pypandoc not rendering README correctly for PYPI
* Bugfix: #744 get_info on an empty dataframe raises an exception
* Feature: Chunkstore: Removed duplication error when filtering by columns
* Feature: Chunkstore: Reduced memory footprint when reading data

### 1.74 (2019-02-28)
* Bugfix: #712 Pandas deprecation warning in chunkstore serializer
1 change: 1 addition & 0 deletions arctic/chunkstore/chunkstore.py
@@ -280,6 +280,7 @@ def read(self, symbol, chunk_range=None, filter_data=True, **kwargs):
chunks[segments[0][SYMBOL]].append({DATA: chunk_data, METADATA: mdata})

skip_filter = not filter_data or chunk_range is None
kwargs['inplace'] = kwargs.get('inplace', True)

if len(symbol) > 1:
return {sym: deser(chunks[sym], **kwargs) if skip_filter else chunker.filter(deser(chunks[sym], **kwargs), chunk_range) for sym in symbol}
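For context on the one-line change above: read() now defaults the serializer's inplace flag to True unless the caller overrides it, so the fetched chunk documents can be consumed while the result is built. A minimal sketch of that default handling, assuming nothing beyond what the diff shows (the helper name is illustrative, not part of arctic):

# Hypothetical helper; only the kwargs line itself comes from the diff.
def apply_read_defaults(**kwargs):
    kwargs['inplace'] = kwargs.get('inplace', True)  # the line added in read()
    return kwargs

assert apply_read_defaults() == {'inplace': True}                # new default: deserialize in place
assert apply_read_defaults(inplace=False) == {'inplace': False}  # callers can still opt out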
17 changes: 12 additions & 5 deletions arctic/serialization/numpy_arrays.py
@@ -193,7 +193,7 @@ def serialize(self, df):
ret[METADATA][TYPE] = dtype
return ret

def deserialize(self, data, columns=None):
def deserialize(self, data, columns=None, inplace=False):
"""
Deserializes SON to a DataFrame
@@ -203,13 +203,17 @@ def deserialize(self, data, columns=None):
columns: None, or list of strings
optionally you can deserialize a subset of the data in the SON. Index
columns are ALWAYS deserialized, and should not be specified
inplace: Convert and remove items from data in-place
this will modify data
Returns
-------
pandas dataframe or series
"""
if not data:
return pd.DataFrame()
if not inplace:
data = data[:]
bmoscon marked this conversation as resolved.

Collaborator:
There are some errors in the tests, so I'm thinking this will need to be tweaked a bit more.


meta = data[0][METADATA] if isinstance(data, list) else data[METADATA]
index = INDEX in meta
@@ -218,16 +222,19 @@
if index:
columns = columns[:]
columns.extend(meta[INDEX])
if len(columns) > len(set(columns)):
raise Exception("Duplicate columns specified, cannot de-serialize")
columns = list(set(columns))
Collaborator:
I'm not sure I see this as a win. It seems like the caller may have a bug if they are specifying duplicate columns; we're just hiding the error now.

Contributor Author:
The current logic is confusing when subsetting data frames with indexes. For example, suppose you have a data frame with:
index: date, security
columns: price, volume

The logic works if the user passes ['price'], but raises a duplicate-columns error when passing ['date', 'security', 'price'].

I don't see the value in the check; it should just do the right thing.
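A runnable sketch of the scenario described above (the date/security/price names come from this comment; the real deserializer and its meta[INDEX] handling are only stood in for here):

# Old behavior: explicitly requesting index columns made them appear twice once
# the deserializer extended the list with meta[INDEX], which tripped the check.
requested = ['date', 'security', 'price']
index_cols = ['date', 'security']            # stands in for meta[INDEX]

columns = requested[:]
columns.extend(index_cols)                   # ['date', 'security', 'price', 'date', 'security']
assert len(columns) > len(set(columns))      # the removed check raised an Exception here

columns = list(set(columns))                 # this PR: silently de-duplicate instead
assert sorted(columns) == ['date', 'price', 'security']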

Collaborator:
Using pandas nomenclature, the columns and the index are separate. If there is an index, you always get it back, even if you specify a subset of columns (and even if they do not include the index columns). Maybe the documentation should be improved. If, for example, you specify price and security, you'll still get date as well as price and security, so your fix would only introduce more weirdness (in my opinion).
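A quick pandas illustration of that point (the data values are made up; only the date/security/price/volume names come from this thread): selecting a subset of columns never drops the index, which mirrors how the deserializer always returns meta[INDEX].

import pandas as pd

df = pd.DataFrame(
    {'price': [1.0, 2.0], 'volume': [10, 20]},
    index=pd.MultiIndex.from_tuples(
        [('2019-03-01', 'AAPL'), ('2019-03-01', 'MSFT')],
        names=['date', 'security'],
    ),
)
print(df[['price']])   # the (date, security) index comes back even though only 'price' was requested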

Contributor Author:
We could remove index columns from columns and then check for duplicates. This keeps the nomenclature but holds the user interface to 'minimum surprise'. Or we could raise an error saying they have included index columns in the column list.

Collaborator:
The result would be the same though, no? You'd supply index columns and it won't complain. I foresee someone opening a bug complaining they only specified 1 of 3 index columns but still got all 3 back.

Contributor Author:
Sure.

Contributor Author:
Actually, in retrospect, that means breaking the API for clients. How about we keep the fuzziness for clients and simply output a warning instead?

Collaborator:
OK, let's see a warning :) But I still think that get_info should change; otherwise how would you ever know how to rid yourself of the warning?

Contributor Author:
Agree with the get_info change.

Collaborator:
So it sounds like you just need to fix the broken tests and add the log and we're all set :D


if not isinstance(data, list):
df = self.converter.objify(data, columns)
else:
df = pd.concat([self.converter.objify(d, columns) for d in data], ignore_index=not index)
dfs = []
while len(data):
dfs.append(self.converter.objify(data.pop(0), columns))
df = pd.concat(dfs, ignore_index=not index)
del dfs

if index:
df = df.set_index(meta[INDEX])
df.set_index(meta[INDEX], inplace=True)
if meta[TYPE] == 'series':
return df[df.columns[0]]
return df
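For readers skimming the diff, here is a self-contained sketch of the memory-saving pattern used in the loop above, with stand-ins for the library's objify and metadata handling (illustrative only, not the real serializer):

import pandas as pd

def deserialize_chunks(chunks, inplace=False):
    """Convert a list of raw chunk dicts into a single DataFrame.

    With inplace=True the source list is consumed as it is converted, so the
    raw chunks and the intermediate frames are never all held in memory at once.
    """
    if not chunks:
        return pd.DataFrame()
    if not inplace:
        chunks = chunks[:]                            # keep the caller's list intact
    frames = []
    while len(chunks):
        frames.append(pd.DataFrame(chunks.pop(0)))    # raw chunk is released after conversion
    df = pd.concat(frames, ignore_index=True)
    del frames
    return df

raw = [{'price': [1.0, 2.0]}, {'price': [3.0, 4.0]}]
print(deserialize_chunks(raw, inplace=True))
print(raw)   # [] -- consumed in place, matching the new default in read()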