Skip to content

Commit 036a89f

Browse files
reduce chunkstore memory footprint
1 parent 57e110b commit 036a89f

File tree

3 files changed

+15
-5
lines changed

3 files changed

+15
-5
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
### 1.75
44
* Bugfix: pypandoc not rendering README correctly for PYPI
55
* Bugfix: #744 get_info on an empty dataframe raises an exception
6+
* Feature: Chunkstore: Removed the duplicate-columns error when filtering by columns
7+
* Feature: Chunkstore: Reduced memory footprint when reading data
68

79
### 1.74 (2019-02-28)
810
* Bugfix: #712 Pandas deprecation warning in chunkstore serializer

arctic/chunkstore/chunkstore.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ def read(self, symbol, chunk_range=None, filter_data=True, **kwargs):
280280
chunks[segments[0][SYMBOL]].append({DATA: chunk_data, METADATA: mdata})
281281

282282
skip_filter = not filter_data or chunk_range is None
283+
kwargs['inplace'] = kwargs.get('inplace', True)
283284

284285
if len(symbol) > 1:
285286
return {sym: deser(chunks[sym], **kwargs) if skip_filter else chunker.filter(deser(chunks[sym], **kwargs), chunk_range) for sym in symbol}

arctic/serialization/numpy_arrays.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def serialize(self, df):
193193
ret[METADATA][TYPE] = dtype
194194
return ret
195195

196-
def deserialize(self, data, columns=None):
196+
def deserialize(self, data, columns=None, inplace=False):
197197
"""
198198
Deserializes SON to a DataFrame
199199
@@ -203,13 +203,17 @@ def deserialize(self, data, columns=None):
203203
columns: None, or list of strings
204204
optionally you can deserialize a subset of the data in the SON. Index
205205
columns are ALWAYS deserialized, and should not be specified
206+
inplace: Convert and remove items from data in-place
207+
this will modify data
206208
207209
Returns
208210
-------
209211
pandas dataframe or series
210212
"""
211213
if not data:
212214
return pd.DataFrame()
215+
if not inplace:
216+
data = data[:]
213217

214218
meta = data[0][METADATA] if isinstance(data, list) else data[METADATA]
215219
index = INDEX in meta
@@ -218,16 +222,19 @@ def deserialize(self, data, columns=None):
218222
if index:
219223
columns = columns[:]
220224
columns.extend(meta[INDEX])
221-
if len(columns) > len(set(columns)):
222-
raise Exception("Duplicate columns specified, cannot de-serialize")
225+
columns = list(set(columns))
223226

224227
if not isinstance(data, list):
225228
df = self.converter.objify(data, columns)
226229
else:
227-
df = pd.concat([self.converter.objify(d, columns) for d in data], ignore_index=not index)
230+
dfs = []
231+
while len(data):
232+
dfs.append(self.converter.objify(data.pop(0), columns))
233+
df = pd.concat(dfs, ignore_index=not index)
234+
del dfs
228235

229236
if index:
230-
df = df.set_index(meta[INDEX])
237+
df.set_index(meta[INDEX], inplace=True)
231238
if meta[TYPE] == 'series':
232239
return df[df.columns[0]]
233240
return df

0 commit comments

Comments
 (0)