Closed
Description
I ran into a problem writing my xarray DataArray to zarr; @jhamman helped me figure out the source of the error.
I had set up an xarray DataArray (from a chunked dask array):
<xarray.DataArray 'rechunk-merge-ac8a0511ce784baf57cf304ebdc4a296' (snippets: 792848, symbols: 282)>
dask.array<shape=(792848, 282), dtype=float64, chunksize=(5000, 282)>
Coordinates:
* snippets (snippets) object '0.gravatar.com||gprofiles.js||Gravatar.init' ... 'подольск-администрация.рф||wp-embed.min.js||c'
* symbols (symbols) object 'AnalyserNode.channelCount' ... 'window.sessionStorage'
Upon trying to write it:
array.to_dataset(name='data').to_zarr('test.zarr')
I would get a memory error:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-33-ae022291811c> in <module>
----> 1 array.to_dataset(name='data').to_zarr('test.zarr')
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/core/dataset.py in to_zarr(self, store, mode, synchronizer, group, encoding, compute, consolidated)
1275 return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer,
1276 group=group, encoding=encoding, compute=compute,
-> 1277 consolidated=consolidated)
1278
1279 def __unicode__(self):
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/api.py in to_zarr(dataset, store, mode, synchronizer, group, encoding, compute, consolidated)
915 writer = ArrayWriter()
916 # TODO: figure out how to properly handle unlimited_dims
--> 917 dump_to_store(dataset, zstore, writer, encoding=encoding)
918 writes = writer.sync(compute=compute)
919
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/api.py in dump_to_store(dataset, store, writer, encoder, encoding, unlimited_dims)
790
791 store.store(variables, attrs, check_encoding, writer,
--> 792 unlimited_dims=unlimited_dims)
793
794
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/zarr.py in store(self, variables, attributes, *args, **kwargs)
343 def store(self, variables, attributes, *args, **kwargs):
344 AbstractWritableDataStore.store(self, variables, attributes,
--> 345 *args, **kwargs)
346
347 def sync(self):
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/common.py in store(self, variables, attributes, check_encoding_set, writer, unlimited_dims)
259 writer = ArrayWriter()
260
--> 261 variables, attributes = self.encode(variables, attributes)
262
263 self.set_attributes(attributes)
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/common.py in encode(self, variables, attributes)
203 """
204 variables = OrderedDict([(k, self.encode_variable(v))
--> 205 for k, v in variables.items()])
206 attributes = OrderedDict([(k, self.encode_attribute(v))
207 for k, v in attributes.items()])
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/common.py in <listcomp>(.0)
203 """
204 variables = OrderedDict([(k, self.encode_variable(v))
--> 205 for k, v in variables.items()])
206 attributes = OrderedDict([(k, self.encode_attribute(v))
207 for k, v in attributes.items()])
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/zarr.py in encode_variable(self, variable)
308
309 def encode_variable(self, variable):
--> 310 variable = encode_zarr_variable(variable)
311 return variable
312
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/zarr.py in encode_zarr_variable(var, needs_copy, name)
214 # TODO: allow toggling this explicitly via dtype in encoding.
215 coder = coding.strings.EncodedStringCoder(allows_unicode=False)
--> 216 var = coder.encode(var, name=name)
217 var = coding.strings.ensure_fixed_length_bytes(var)
218
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/coding/strings.py in encode(self, variable, name)
60 safe_setitem(attrs, '_Encoding', string_encoding, name=name)
61 # TODO: figure out how to handle this in a lazy way with dask
---> 62 data = encode_string_array(data, string_encoding)
63
64 return Variable(dims, data, attrs, encoding)
~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/coding/strings.py in encode_string_array(string_array, encoding)
85 string_array = np.asarray(string_array)
86 encoded = [x.encode(encoding) for x in string_array.ravel()]
---> 87 return np.array(encoded, dtype=bytes).reshape(string_array.shape)
88
89
MemoryError:
My coordinates for 'snippets' are 800k long and include strings like
`inassets1-internationsgmbh.netdna-ssl.com||gn1MljtM.11d8186d87588f8fe848.js||["./app-new/src/InterNations/Bundle/LayoutBundle/Resources/public/frontend/js/vendor/fingerprint2.js"]/</e.prototype.getRegularPlugins/</i<`
While the index only takes up 88 MB, @jhamman noted that:
xarray is converting your list of strings into a numpy array of bytestrings. The dimensions of this new array will be len(orig_list) x len(longest_string). So, the resulting array is going to be, potentially, much larger than the 88 MB.
So, this issue is a request to support my bizarre indexing.