pyarrow pandas dataframe buffer source array is read-only exception #147

Closed
geoHeil opened this issue Dec 9, 2018 · 2 comments
Labels: bug 🐛 Something isn't working

geoHeil commented Dec 9, 2018

When a pandas DataFrame loaded from a Parquet file with the pyarrow engine is passed to pandas-profiling, I get the stack trace below. A DataFrame built from the same data via CSV profiles just fine.

--------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-4f6d7d159b1b> in <module>
----> 1 pandas_profiling.ProfileReport(aa)

/opt/conda/lib/python3.6/site-packages/pandas_profiling/__init__.py in __init__(self, df, **kwargs)
     64         sample = kwargs.get('sample', df.head())
     65 
---> 66         description_set = describe(df, **kwargs)
     67 
     68         self.html = to_html(sample,

/opt/conda/lib/python3.6/site-packages/pandas_profiling/describe.py in describe(df, bins, check_correlation, correlation_threshold, correlation_overrides, check_recoded, pool_size, **kwargs)
    413         'table': table_stats,
    414         'variables': variable_stats.T,
--> 415         'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
    416         'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    417     }

/opt/conda/lib/python3.6/site-packages/pandas_profiling/describe.py in <dictcomp>(.0)
    413         'table': table_stats,
    414         'variables': variable_stats.T,
--> 415         'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
    416         'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    417     }

/opt/conda/lib/python3.6/site-packages/pandas_profiling/base.py in get_groupby_statistic(data)
     45         return _VALUE_COUNTS_MEMO[data.name]
     46 
---> 47     value_counts_with_nan = data.value_counts(dropna=False)
     48     value_counts_without_nan = value_counts_with_nan.loc[value_counts_with_nan.index.dropna()]
     49     distinct_count_with_nan = value_counts_with_nan.count()

/opt/conda/lib/python3.6/site-packages/pandas/core/base.py in value_counts(self, normalize, sort, ascending, bins, dropna)
   1036         from pandas.core.algorithms import value_counts
   1037         result = value_counts(self, sort=sort, ascending=ascending,
-> 1038                               normalize=normalize, bins=bins, dropna=dropna)
   1039         return result
   1040 

/opt/conda/lib/python3.6/site-packages/pandas/core/algorithms.py in value_counts(values, sort, ascending, normalize, bins, dropna)
    721 
    722     if sort:
--> 723         result = result.sort_values(ascending=ascending)
    724 
    725     if normalize:

/opt/conda/lib/python3.6/site-packages/pandas/core/series.py in sort_values(self, axis, ascending, inplace, kind, na_position)
   2494             raise ValueError('invalid na_position: {!r}'.format(na_position))
   2495 
-> 2496         result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx])
   2497 
   2498         if inplace:

/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in __getitem__(self, key)
   2095         result = getitem(key)
   2096         if not is_scalar(result):
-> 2097             return promote(result)
   2098         else:
   2099             return result

/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/category.py in _shallow_copy(self, values, categories, ordered, dtype, **kwargs)
    206             dtype = self.dtype if dtype is None else dtype
    207             return super(CategoricalIndex, self)._shallow_copy(
--> 208                 values=values, dtype=dtype, **kwargs)
    209         if categories is None:
    210             categories = self.categories

/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in _shallow_copy(self, values, **kwargs)
    516         if not len(values) and 'dtype' not in kwargs:
    517             attributes['dtype'] = self.dtype
--> 518         return self._simple_new(values, **attributes)
    519 
    520     def _shallow_copy_with_infer(self, values=None, **kwargs):

/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/category.py in _simple_new(cls, values, name, categories, ordered, dtype, **kwargs)
    182 
    183         values = cls._create_categorical(cls, values, categories, ordered,
--> 184                                          dtype=dtype)
    185         result._data = values
    186         result.name = name

/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/category.py in _create_categorical(self, data, categories, ordered, dtype)
    173             if isinstance(dtype, CategoricalDtype):
    174                 # we want to silently ignore dtype='category'
--> 175                 data = data._set_dtype(dtype)
    176         return data
    177 

/opt/conda/lib/python3.6/site-packages/pandas/core/arrays/categorical.py in _set_dtype(self, dtype)
    728         """
    729         codes = _recode_for_categories(self.codes, self.categories,
--> 730                                        dtype.categories)
    731         return type(self)(codes, dtype=dtype, fastpath=True)
    732 

/opt/conda/lib/python3.6/site-packages/pandas/core/arrays/categorical.py in _recode_for_categories(codes, old_categories, new_categories)
   2461         # All null anyway, so just retain the nulls
   2462         return codes.copy()
-> 2463     indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
   2464                                    new_categories)
   2465     new_codes = take_1d(indexer, codes.copy(), fill_value=-1)

/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
   3257                                  'backfill or nearest reindexing')
   3258 
-> 3259             indexer = self._engine.get_indexer(target._ndarray_values)
   3260 
   3261         return _ensure_platform_int(indexer)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_indexer()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.lookup()

/opt/conda/lib/python3.6/site-packages/pandas/_libs/hashtable.cpython-36m-x86_64-linux-gnu.so in View.MemoryView.memoryview_cwrapper()

/opt/conda/lib/python3.6/site-packages/pandas/_libs/hashtable.cpython-36m-x86_64-linux-gnu.so in View.MemoryView.memoryview.__cinit__()

ValueError: buffer source array is read-only
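
A condensed sketch of the situation and a commonly reported workaround, assuming a Parquet/CSV pair containing the same data (the file names below are placeholders, not the reporter's actual files): copying the DataFrame gives pandas its own writable NumPy buffers instead of the read-only memory that pyarrow can hand over zero-copy.

import pandas as pd
import pandas_profiling

# Placeholder file names; any dataset with categorical/string columns applies.
df_parquet = pd.read_parquet("data.parquet", engine="pyarrow")  # Arrow-backed load
df_csv = pd.read_csv("data.csv")                                # same data via CSV

pandas_profiling.ProfileReport(df_csv)        # profiles fine
# pandas_profiling.ProfileReport(df_parquet)  # raises ValueError: buffer source array is read-only

# Workaround: force a copy so the underlying arrays are writable before profiling.
pandas_profiling.ProfileReport(df_parquet.copy())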
@sbrugman sbrugman added the bug 🐛 Something isn't working label May 29, 2019
sbrugman (Collaborator) commented

Hi @geoHeil,

Thank you for reporting this issue. Testing with a recent version of pandas-profiling works fine for me in every way I have tried. For example, the following code, which seems comparable to what you describe in your post, runs without error:

import pandas as pd
from pandas_profiling import ProfileReport


# https://github.com/pandas-profiling/pandas-profiling/issues/147
def test_issue147():
    "Data from https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata2.parquet"
    df = pd.read_parquet(r"userdata2.parquet", engine='pyarrow')
    report = ProfileReport(df, title="PyArrow with Pandas Parquet Backend")
    html = report.to_html()
    assert type(html) == str and '<p class="h2">Dataset info</p>' in html

It may be that recent updates have resolved the problem, or that your dataset contains something my test data does not. In any case, the first step would be to try the dataset again with the latest pandas-profiling. If the error persists, please let us know.
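
A minimal sketch of that retry, assuming the same userdata2.parquet file as in the test above and an upgrade via `pip install --upgrade pandas-profiling`:

import pandas as pd
from pandas_profiling import ProfileReport

# Re-run the report on the Parquet-backed frame after upgrading pandas-profiling.
df = pd.read_parquet("userdata2.parquet", engine="pyarrow")
report = ProfileReport(df, title="Retry with latest pandas-profiling")
html = report.to_html()  # should complete without the read-only buffer error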

Kind regards,

geoHeil (Author) commented May 31, 2019

Indeed. I will close it now.

@geoHeil geoHeil closed this as completed May 31, 2019
chanedwin pushed a commit to chanedwin/pandas-profiling that referenced this issue Oct 11, 2020