You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
When a PyArrow-backed pandas parquet DataFrame is used with pandas-profiling,
I get the stack trace below. However, when using a CSV-backed DataFrame with the same data, pandas-profiling works just fine.
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-9-4f6d7d159b1b> in <module>
----> 1 pandas_profiling.ProfileReport(aa)
/opt/conda/lib/python3.6/site-packages/pandas_profiling/__init__.py in __init__(self, df, **kwargs)
64 sample = kwargs.get('sample', df.head())
65
---> 66 description_set = describe(df, **kwargs)
67
68 self.html = to_html(sample,
/opt/conda/lib/python3.6/site-packages/pandas_profiling/describe.py in describe(df, bins, check_correlation, correlation_threshold, correlation_overrides, check_recoded, pool_size, **kwargs)
413 'table': table_stats,
414 'variables': variable_stats.T,
--> 415 'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
416 'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
417 }
/opt/conda/lib/python3.6/site-packages/pandas_profiling/describe.py in <dictcomp>(.0)
413 'table': table_stats,
414 'variables': variable_stats.T,
--> 415 'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
416 'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
417 }
/opt/conda/lib/python3.6/site-packages/pandas_profiling/base.py in get_groupby_statistic(data)
45 return _VALUE_COUNTS_MEMO[data.name]
46
---> 47 value_counts_with_nan = data.value_counts(dropna=False)
48 value_counts_without_nan = value_counts_with_nan.loc[value_counts_with_nan.index.dropna()]
49 distinct_count_with_nan = value_counts_with_nan.count()
/opt/conda/lib/python3.6/site-packages/pandas/core/base.py in value_counts(self, normalize, sort, ascending, bins, dropna)
1036 from pandas.core.algorithms import value_counts
1037 result = value_counts(self, sort=sort, ascending=ascending,
-> 1038 normalize=normalize, bins=bins, dropna=dropna)
1039 return result
1040
/opt/conda/lib/python3.6/site-packages/pandas/core/algorithms.py in value_counts(values, sort, ascending, normalize, bins, dropna)
721
722 if sort:
--> 723 result = result.sort_values(ascending=ascending)
724
725 if normalize:
/opt/conda/lib/python3.6/site-packages/pandas/core/series.py in sort_values(self, axis, ascending, inplace, kind, na_position)
2494 raise ValueError('invalid na_position: {!r}'.format(na_position))
2495
-> 2496 result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx])
2497
2498 if inplace:
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in __getitem__(self, key)
2095 result = getitem(key)
2096 if not is_scalar(result):
-> 2097 return promote(result)
2098 else:
2099 return result
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/category.py in _shallow_copy(self, values, categories, ordered, dtype, **kwargs)
206 dtype = self.dtype if dtype is None else dtype
207 return super(CategoricalIndex, self)._shallow_copy(
--> 208 values=values, dtype=dtype, **kwargs)
209 if categories is None:
210 categories = self.categories
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in _shallow_copy(self, values, **kwargs)
516 if not len(values) and 'dtype' not in kwargs:
517 attributes['dtype'] = self.dtype
--> 518 return self._simple_new(values, **attributes)
519
520 def _shallow_copy_with_infer(self, values=None, **kwargs):
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/category.py in _simple_new(cls, values, name, categories, ordered, dtype, **kwargs)
182
183 values = cls._create_categorical(cls, values, categories, ordered,
--> 184 dtype=dtype)
185 result._data = values
186 result.name = name
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/category.py in _create_categorical(self, data, categories, ordered, dtype)
173 if isinstance(dtype, CategoricalDtype):
174 # we want to silently ignore dtype='category'
--> 175 data = data._set_dtype(dtype)
176 return data
177
/opt/conda/lib/python3.6/site-packages/pandas/core/arrays/categorical.py in _set_dtype(self, dtype)
728 """
729 codes = _recode_for_categories(self.codes, self.categories,
--> 730 dtype.categories)
731 return type(self)(codes, dtype=dtype, fastpath=True)
732
/opt/conda/lib/python3.6/site-packages/pandas/core/arrays/categorical.py in _recode_for_categories(codes, old_categories, new_categories)
2461 # All null anyway, so just retain the nulls
2462 return codes.copy()
-> 2463 indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
2464 new_categories)
2465 new_codes = take_1d(indexer, codes.copy(), fill_value=-1)
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
3257 'backfill or nearest reindexing')
3258
-> 3259 indexer = self._engine.get_indexer(target._ndarray_values)
3260
3261 return _ensure_platform_int(indexer)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_indexer()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.lookup()
/opt/conda/lib/python3.6/site-packages/pandas/_libs/hashtable.cpython-36m-x86_64-linux-gnu.so in View.MemoryView.memoryview_cwrapper()
/opt/conda/lib/python3.6/site-packages/pandas/_libs/hashtable.cpython-36m-x86_64-linux-gnu.so in View.MemoryView.memoryview.__cinit__()
ValueError: buffer source array is read-only
The text was updated successfully, but these errors were encountered:
Thank you for reporting this issue. Testing it with a recent version of pandas-profiling works just fine for me, in every way I have tried. The following code, for example, works and seems comparable to what you describe in your post:
import pandas as pd
from pandas_profiling import ProfileReport
# https://github.com/pandas-profiling/pandas-profiling/issues/147
def test_issue147():
    """Regression test for pandas-profiling issue #147.

    Profiles a PyArrow-backed parquet DataFrame and checks that an HTML
    report is produced without raising the "buffer source array is
    read-only" ValueError described in the issue.

    Data from
    https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata2.parquet
    """
    df = pd.read_parquet(r"userdata2.parquet", engine='pyarrow')
    report = ProfileReport(df, title="PyArrow with Pandas Parquet Backend")
    html = report.to_html()
    # isinstance() is the idiomatic type check (type(x) == str rejects subclasses);
    # split asserts so a failure pinpoints which condition broke.
    assert isinstance(html, str)
    assert '<p class="h2">Dataset info</p>' in html
It could be that recent updates have solved the problem, or that your dataset contains more than I am now testing. In any case, the first step would be to take another shot at the dataset with the latest pandas-profiling. If the error persists, please let us know.
When a pyArrow backed pandas parquet Dataframe is used with pandas-profiling:
I get the stack trace below. However, when using a CSV backed data frame with the same data pandas-profiling works just fine.
The text was updated successfully, but these errors were encountered: