Skip to content

Support variable length string arrays in xarray/zarr #2724

Closed
@birdsarah

Description

@birdsarah

I ran into a problem writing my xarray to zarr; @jhamman helped me figure out the source of the error.

I had set up an xarray (from a chunked dask array):

<xarray.DataArray 'rechunk-merge-ac8a0511ce784baf57cf304ebdc4a296' (snippets: 792848, symbols: 282)> 
dask.array<shape=(792848, 282), dtype=float64, chunksize=(5000, 282)> 
Coordinates: 
 * snippets (snippets) object '0.gravatar.com||gprofiles.js||Gravatar.init' ... 'подольск-администрация.рф||wp-embed.min.js||c'
 * symbols (symbols) object 'AnalyserNode.channelCount' ... 'window.sessionStorage'

Upon trying to write

array.to_dataset(name='data').to_zarr('test.zarr')

I would get a memory error

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-33-ae022291811c> in <module>
----> 1 array.to_dataset(name='data').to_zarr('test.zarr')

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/core/dataset.py in to_zarr(self, store, mode, synchronizer, group, encoding, compute, consolidated)
   1275         return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer,
   1276                        group=group, encoding=encoding, compute=compute,
-> 1277                        consolidated=consolidated)
   1278 
   1279     def __unicode__(self):

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/api.py in to_zarr(dataset, store, mode, synchronizer, group, encoding, compute, consolidated)
    915     writer = ArrayWriter()
    916     # TODO: figure out how to properly handle unlimited_dims
--> 917     dump_to_store(dataset, zstore, writer, encoding=encoding)
    918     writes = writer.sync(compute=compute)
    919 

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/api.py in dump_to_store(dataset, store, writer, encoder, encoding, unlimited_dims)
    790 
    791     store.store(variables, attrs, check_encoding, writer,
--> 792                 unlimited_dims=unlimited_dims)
    793 
    794 

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/zarr.py in store(self, variables, attributes, *args, **kwargs)
    343     def store(self, variables, attributes, *args, **kwargs):
    344         AbstractWritableDataStore.store(self, variables, attributes,
--> 345                                         *args, **kwargs)
    346 
    347     def sync(self):

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/common.py in store(self, variables, attributes, check_encoding_set, writer, unlimited_dims)
    259             writer = ArrayWriter()
    260 
--> 261         variables, attributes = self.encode(variables, attributes)
    262 
    263         self.set_attributes(attributes)

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/common.py in encode(self, variables, attributes)
    203         """
    204         variables = OrderedDict([(k, self.encode_variable(v))
--> 205                                  for k, v in variables.items()])
    206         attributes = OrderedDict([(k, self.encode_attribute(v))
    207                                   for k, v in attributes.items()])

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/common.py in <listcomp>(.0)
    203         """
    204         variables = OrderedDict([(k, self.encode_variable(v))
--> 205                                  for k, v in variables.items()])
    206         attributes = OrderedDict([(k, self.encode_attribute(v))
    207                                   for k, v in attributes.items()])

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/zarr.py in encode_variable(self, variable)
    308 
    309     def encode_variable(self, variable):
--> 310         variable = encode_zarr_variable(variable)
    311         return variable
    312 

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/backends/zarr.py in encode_zarr_variable(var, needs_copy, name)
    214     # TODO: allow toggling this explicitly via dtype in encoding.
    215     coder = coding.strings.EncodedStringCoder(allows_unicode=False)
--> 216     var = coder.encode(var, name=name)
    217     var = coding.strings.ensure_fixed_length_bytes(var)
    218 

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/coding/strings.py in encode(self, variable, name)
     60             safe_setitem(attrs, '_Encoding', string_encoding, name=name)
     61             # TODO: figure out how to handle this in a lazy way with dask
---> 62             data = encode_string_array(data, string_encoding)
     63 
     64         return Variable(dims, data, attrs, encoding)

~/miniconda3/envs/ovscrptd/lib/python3.6/site-packages/xarray/coding/strings.py in encode_string_array(string_array, encoding)
     85     string_array = np.asarray(string_array)
     86     encoded = [x.encode(encoding) for x in string_array.ravel()]
---> 87     return np.array(encoded, dtype=bytes).reshape(string_array.shape)
     88 
     89 

MemoryError: 

My coordinates for 'snippets' are 800k long and include strings like

inassets1-internationsgmbh.netdna-ssl.com||gn1MljtM.11d8186d87588f8fe848.js||["./app-new/src/InterNations/Bundle/LayoutBundle/Resources/public/frontend/js/vendor/fingerprint2.js"]/</e.prototype.getRegularPlugins/</i<

While the index only takes up 88MB, @jhamman noted that:

xarray is converting your list of strings into a numpy array of bytestrings. The dimensions of this new array will be len(orig_list) x len(longest_string). So, the resulting array is going to be, potentially, much larger than the 88MB.

So, this issue is a request to support my bizarre indexing.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions