Describe the issue
When reading a .dta file with apply_value_formats=True and user_missing=True, the following error occurs. There is no problem if I set apply_value_formats=False and formats_as_category=False, but then trying to reattach the value labels with pyreadstat.set_value_labels() throws the same error.
To Reproduce
Steps to reproduce the behavior.
import pyreadstat

df, meta = pyreadstat.read_dta(
    "file.dta",
    apply_value_formats=True,
    formats_as_category=True,
    formats_as_ordered_category=False,
    user_missing=True,
    dates_as_pandas_datetime=True,
    extra_date_formats=["YEAR", "MMYY"],
    encoding="UTF-8",
)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File pyreadstat/pyreadstat.pyx:302, in pyreadstat.pyreadstat.read_dta()
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pyreadstat/pyfunctions.py:72, in set_value_labels(dataframe, metadata, formats_as_category, formats_as_ordered_category)
70 continue
71 else:
---> 72 df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
73 if formats_as_ordered_category:
74 categories = list(set(labels.values()))
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/dataframe.py:1420, in DataFrame.with_columns(self, *exprs, **named_exprs)
1388 def with_columns(
1389 self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
1390 ) -> Self:
1391 r"""Add columns to this DataFrame.
1392
1393 Added columns will replace existing columns with the same name.
(...) 1418 1 2 4.0 4
1419 """
-> 1420 return super().with_columns(*exprs, **named_exprs)
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/dataframe.py:206, in BaseFrame.with_columns(self, *exprs, **named_exprs)
201 compliant_exprs, kinds = self._flatten_and_extract(*exprs, **named_exprs)
202 compliant_exprs = [
203 compliant_expr.broadcast(kind) if is_scalar_like(kind) else compliant_expr
204 for compliant_expr, kind in zip_strict(compliant_exprs, kinds)
205 ]
--> 206 return self._with_compliant(self._compliant_frame.with_columns(*compliant_exprs))
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_pandas_like/dataframe.py:451, in PandasLikeDataFrame.with_columns(self, *exprs)
450 def with_columns(self, *exprs: PandasLikeExpr) -> Self:
--> 451 columns = self._evaluate_into_exprs(*exprs)
452 if not columns and len(self) == 0:
453 return self
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/dataframe.py:347, in EagerDataFrame._evaluate_into_exprs(self, *exprs)
343 def _evaluate_into_exprs(self, *exprs: EagerExprT) -> Sequence[EagerSeriesT]:
344 # NOTE: Ignore intermittent [False Negative]
345 # Argument of type "EagerExprT@EagerDataFrame" cannot be assigned to parameter "expr" of type "EagerExprT@EagerDataFrame" in function "_evaluate_into_expr"
346 # Type "EagerExprT@EagerDataFrame" is not assignable to type "EagerExprT@EagerDataFrame"
--> 347 return list(chain.from_iterable(self._evaluate_into_expr(expr) for expr in exprs))
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/dataframe.py:347, in <genexpr>(.0)
343 def _evaluate_into_exprs(self, *exprs: EagerExprT) -> Sequence[EagerSeriesT]:
344 # NOTE: Ignore intermittent [False Negative]
345 # Argument of type "EagerExprT@EagerDataFrame" cannot be assigned to parameter "expr" of type "EagerExprT@EagerDataFrame" in function "_evaluate_into_expr"
346 # Type "EagerExprT@EagerDataFrame" is not assignable to type "EagerExprT@EagerDataFrame"
--> 347 return list(chain.from_iterable(self._evaluate_into_expr(expr) for expr in exprs))
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/dataframe.py:360, in EagerDataFrame._evaluate_into_expr(self, expr)
350 """Return list of raw columns.
351
352 For eager backends we alias operations at each step.
(...) 357 Note that for PySpark / DuckDB, we are less free to liberally set aliases whenever we want.
358 """
359 aliases = expr._evaluate_aliases(self)
--> 360 result = expr(self)
361 if list(aliases) != (
362 result_aliases := [s.name for s in result]
363 ): # pragma: no cover
364 msg = f"Safety assertion failed, expected {aliases}, got {result_aliases}"
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/expr.py:234, in EagerExpr.__call__(self, df)
233 def __call__(self, df: EagerDataFrameT) -> Sequence[EagerSeriesT]:
--> 234 return self._call(df)
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/expr.py:384, in EagerExpr._reuse_series_inner(self, df, method_name, returns_scalar, scalar_kwargs, expressifiable_args)
371 kwargs = {
372 **scalar_kwargs,
373 **{
(...) 376 },
377 }
378 method = methodcaller(
379 method_name,
380 **self._reuse_series_extra_kwargs(returns_scalar=returns_scalar),
381 **kwargs,
382 )
383 out: Sequence[EagerSeriesT] = [
--> 384 series._from_scalar(method(series)) if returns_scalar else method(series)
385 for series in self(df)
386 ]
387 aliases = self._evaluate_aliases(df)
388 if [s.name for s in out] != list(aliases): # pragma: no cover
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_pandas_like/series.py:669, in PandasLikeSeries.replace_strict(self, old, new, return_dtype)
664 namespace = self.__native_namespace__()
665 other = namespace.DataFrame(
666 {self.name: old, tmp_name: namespace.Series(new, dtype=dtype)}
667 )
668 result = self._with_native(
--> 669 self.native.to_frame().merge(other, on=self.name, how="left")[tmp_name]
670 ).alias(self.name)
671 if result.is_null().sum() != self.is_null().sum():
672 msg = (
673 "replace_strict did not replace all non-null values.\n\n"
674 f"The following did not get replaced: {self.filter(~self.is_null() & result.is_null()).unique(maintain_order=False).to_list()}"
675 )
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/frame.py:10859, in DataFrame.merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
10840 @Substitution("")
10841 @Appender(_merge_doc, indents=2)
10842 def merge(
(...) 10855 validate: MergeValidate | None = None,
10856 ) -> DataFrame:
10857 from pandas.core.reshape.merge import merge
> 10859 return merge(
10860 self,
10861 right,
10862 how=how,
10863 on=on,
10864 left_on=left_on,
10865 right_on=right_on,
10866 left_index=left_index,
10867 right_index=right_index,
10868 sort=sort,
10869 suffixes=suffixes,
10870 copy=copy,
10871 indicator=indicator,
10872 validate=validate,
10873 )
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/reshape/merge.py:170, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
155 return _cross_merge(
156 left_df,
157 right_df,
(...) 167 copy=copy,
168 )
169 else:
--> 170 op = _MergeOperation(
171 left_df,
172 right_df,
173 how=how,
174 on=on,
175 left_on=left_on,
176 right_on=right_on,
177 left_index=left_index,
178 right_index=right_index,
179 sort=sort,
180 suffixes=suffixes,
181 indicator=indicator,
182 validate=validate,
183 )
184 return op.get_result(copy=copy)
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/reshape/merge.py:807, in _MergeOperation.__init__(self, left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, indicator, validate)
803 self._validate_tolerance(self.left_join_keys)
805 # validate the merge keys dtypes. We may need to coerce
806 # to avoid incompatible dtypes
--> 807 self._maybe_coerce_merge_keys()
809 # If argument passed to validate,
810 # check if columns specified as unique
811 # are in fact unique.
812 if validate is not None:
File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/reshape/merge.py:1509, in _MergeOperation._maybe_coerce_merge_keys(self)
1503 # unless we are merging non-string-like with string-like
1504 elif (
1505 inferred_left in string_types and inferred_right not in string_types
1506 ) or (
1507 inferred_right in string_types and inferred_left not in string_types
1508 ):
-> 1509 raise ValueError(msg)
1511 # datetimelikes must match exactly
1512 elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
ValueError: You are trying to merge on float64 and object columns for key '<VARNAME>'. If you wish to proceed you should use pd.concat
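As noted in the description, the alternative path (reading the file without applying value formats and then reattaching the labels with pyreadstat.set_value_labels()) ends in the same traceback. A minimal sketch of that call, with the file name as a placeholder:

import pyreadstat

# Reading without applying the value formats works fine
df, meta = pyreadstat.read_dta("file.dta", apply_value_formats=False,
                               formats_as_category=False, user_missing=True)

# Reattaching the value labels afterwards raises the same ValueError
df_labeled = pyreadstat.set_value_labels(df, meta, formats_as_category=False)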
Suggested fix:
The error is thrown by the following line in ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pyreadstat/pyfunctions.py:72, in set_value_labels(dataframe, metadata, formats_as_category, formats_as_ordered_category):
df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
To debug, I added the following print statements immediately before the failing line, in place:
print(nw.col(var_name))
print(df_copy[var_name].dtype)
print(df_copy[var_name].unique())
print(list(labels.keys())[:5])
df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
Output:
Narwhals Expr
metadata: ExprMetadata(
expansion_kind: ExpansionKind.SINGLE,
last_node: ExprKind.ELEMENTWISE,
has_windows: False,
n_orderable_ops: 0,
is_elementwise: True,
preserves_length: True,
is_scalar_like: False,
is_literal: False,
)
Float64
┌───────────────────────────────────────────┐
| Narwhals Series |
|-------------------------------------------|
|0 100.0 |
... <TRUNCATED OUTPUT>
|312 -85.0 |
|Name: <VARNAME>, Length: 313, dtype: float64|
└───────────────────────────────────────────┘
['a', 'b', 'c', 'd', 100.0]
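So the column is Float64 while the value-label keys mix strings (the user-missing tags 'a' through 'd') with floats, which means replace_strict ends up merging a float64 column against an object column inside pandas. A standalone sketch of that underlying pandas behaviour, with made-up column and label names:

import pandas as pd

# float64 column, like the variable above
left = pd.DataFrame({"var": [100.0, -85.0]})

# object column, because the keys mix user-missing tags with numeric codes
right = pd.DataFrame({
    "var": ["a", "b", "c", "d", 100.0],
    "label": ["tag a", "tag b", "tag c", "tag d", "Hundred"],
})

# Raises: ValueError: You are trying to merge on float64 and object columns for key 'var'.
left.merge(right, on="var", how="left")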
Replacing that line with the following block works for me as a stopgap:

try:
    df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
except ValueError:
    df_copy = df_copy.with_columns(nw.col(var_name).cast(nw.Object).replace_strict(labels))
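A more targeted variant might cast only when the label keys and the column dtype actually disagree, instead of catching every exception. This is just a sketch of that idea, not the library's actual fix; it assumes narwhals' String dtype and DType.is_numeric() behave as expected, and the variable names follow set_value_labels:

# Sketch only: stringify both sides when the label keys mix types
# (e.g. user-missing tags 'a'..'d' alongside numeric codes) but the
# column itself is numeric, so the underlying join never compares
# float64 against object.
key_types = {type(k) for k in labels}
if len(key_types) > 1 and df_copy[var_name].dtype.is_numeric():
    str_labels = {str(k): v for k, v in labels.items()}
    df_copy = df_copy.with_columns(
        nw.col(var_name).cast(nw.String).replace_strict(str_labels)
    )
else:
    df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))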
Expected behavior
As per the documentation, any Stata missing values should be set to pd.NA automatically by the package.
Setup Information:
How did you install pyreadstat?: pip
Platform: macOS
Python Version: 3.13.7
Python Distribution: plain python
Using Virtualenv or condaenv: No