narwhals issue with missing Stata values

Describe the issue
When reading a .dta file with `apply_value_formats=True` and `user_missing=True`, the following error occurs. There is no problem if I set `apply_value_formats=False` and `formats_as_category=False`, but then trying to reattach the value labels with `pyreadstat.set_value_labels()` throws the same error.

To Reproduce
Steps to reproduce the behavior.
```
df, meta = pyreadstat.read_dta( "file.dta", apply_value_formats=True,  formats_as_category=True, formats_as_ordered_category=False, user_missing=True, dates_as_pandas_datetime=True, extra_date_formats=["YEAR", "MMYY"], encoding="UTF-8" )

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)

File pyreadstat/pyreadstat.pyx:302, in pyreadstat.pyreadstat.read_dta()

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pyreadstat/pyfunctions.py:72, in set_value_labels(dataframe, metadata, formats_as_category, formats_as_ordered_category)
     70     continue
     71 else:
---> 72     df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
     73 if formats_as_ordered_category:
     74     categories = list(set(labels.values()))

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/dataframe.py:1420, in DataFrame.with_columns(self, *exprs, **named_exprs)
   1388 def with_columns(
   1389     self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
   1390 ) -> Self:
   1391     r"""Add columns to this DataFrame.
   1392 
   1393     Added columns will replace existing columns with the same name.
   (...)   1418         1  2  4.0    4
   1419     """
-> 1420     return super().with_columns(*exprs, **named_exprs)

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/dataframe.py:206, in BaseFrame.with_columns(self, *exprs, **named_exprs)
    201 compliant_exprs, kinds = self._flatten_and_extract(*exprs, **named_exprs)
    202 compliant_exprs = [
    203     compliant_expr.broadcast(kind) if is_scalar_like(kind) else compliant_expr
    204     for compliant_expr, kind in zip_strict(compliant_exprs, kinds)
    205 ]
--> 206 return self._with_compliant(self._compliant_frame.with_columns(*compliant_exprs))

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_pandas_like/dataframe.py:451, in PandasLikeDataFrame.with_columns(self, *exprs)
    450 def with_columns(self, *exprs: PandasLikeExpr) -> Self:
--> 451     columns = self._evaluate_into_exprs(*exprs)
    452     if not columns and len(self) == 0:
    453         return self

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/dataframe.py:347, in EagerDataFrame._evaluate_into_exprs(self, *exprs)
    343 def _evaluate_into_exprs(self, *exprs: EagerExprT) -> Sequence[EagerSeriesT]:
    344     # NOTE: Ignore intermittent [False Negative]
    345     # Argument of type "EagerExprT@EagerDataFrame" cannot be assigned to parameter "expr" of type "EagerExprT@EagerDataFrame" in function "_evaluate_into_expr"
    346     #  Type "EagerExprT@EagerDataFrame" is not assignable to type "EagerExprT@EagerDataFrame"
--> 347     return list(chain.from_iterable(self._evaluate_into_expr(expr) for expr in exprs))

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/dataframe.py:347, in <genexpr>(.0)
    343 def _evaluate_into_exprs(self, *exprs: EagerExprT) -> Sequence[EagerSeriesT]:
    344     # NOTE: Ignore intermittent [False Negative]
    345     # Argument of type "EagerExprT@EagerDataFrame" cannot be assigned to parameter "expr" of type "EagerExprT@EagerDataFrame" in function "_evaluate_into_expr"
    346     #  Type "EagerExprT@EagerDataFrame" is not assignable to type "EagerExprT@EagerDataFrame"
--> 347     return list(chain.from_iterable(self._evaluate_into_expr(expr) for expr in exprs))

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/dataframe.py:360, in EagerDataFrame._evaluate_into_expr(self, expr)
    350 """Return list of raw columns.
    351 
    352 For eager backends we alias operations at each step.
   (...)    357 Note that for PySpark / DuckDB, we are less free to liberally set aliases whenever we want.
    358 """
    359 aliases = expr._evaluate_aliases(self)
--> 360 result = expr(self)
    361 if list(aliases) != (
    362     result_aliases := [s.name for s in result]
    363 ):  # pragma: no cover
    364     msg = f"Safety assertion failed, expected {aliases}, got {result_aliases}"

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/expr.py:234, in EagerExpr.__call__(self, df)
    233 def __call__(self, df: EagerDataFrameT) -> Sequence[EagerSeriesT]:
--> 234     return self._call(df)

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/expr.py:384, in EagerExpr._reuse_series_inner(self, df, method_name, returns_scalar, scalar_kwargs, expressifiable_args)
    371 kwargs = {
    372     **scalar_kwargs,
    373     **{
   (...)    376     },
    377 }
    378 method = methodcaller(
    379     method_name,
    380     **self._reuse_series_extra_kwargs(returns_scalar=returns_scalar),
    381     **kwargs,
    382 )
    383 out: Sequence[EagerSeriesT] = [
--> 384     series._from_scalar(method(series)) if returns_scalar else method(series)
    385     for series in self(df)
    386 ]
    387 aliases = self._evaluate_aliases(df)
    388 if [s.name for s in out] != list(aliases):  # pragma: no cover

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_pandas_like/series.py:669, in PandasLikeSeries.replace_strict(self, old, new, return_dtype)
    664 namespace = self.__native_namespace__()
    665 other = namespace.DataFrame(
    666     {self.name: old, tmp_name: namespace.Series(new, dtype=dtype)}
    667 )
    668 result = self._with_native(
--> 669     self.native.to_frame().merge(other, on=self.name, how="left")[tmp_name]
    670 ).alias(self.name)
    671 if result.is_null().sum() != self.is_null().sum():
    672     msg = (
    673         "replace_strict did not replace all non-null values.\n\n"
    674         f"The following did not get replaced: {self.filter(~self.is_null() & result.is_null()).unique(maintain_order=False).to_list()}"
    675     )

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/frame.py:10859, in DataFrame.merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
  10840 @Substitution("")
  10841 @Appender(_merge_doc, indents=2)
  10842 def merge(
   (...)  10855     validate: MergeValidate | None = None,
  10856 ) -> DataFrame:
  10857     from pandas.core.reshape.merge import merge
> 10859     return merge(
  10860         self,
  10861         right,
  10862         how=how,
  10863         on=on,
  10864         left_on=left_on,
  10865         right_on=right_on,
  10866         left_index=left_index,
  10867         right_index=right_index,
  10868         sort=sort,
  10869         suffixes=suffixes,
  10870         copy=copy,
  10871         indicator=indicator,
  10872         validate=validate,
  10873     )

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/reshape/merge.py:170, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
    155     return _cross_merge(
    156         left_df,
    157         right_df,
   (...)    167         copy=copy,
    168     )
    169 else:
--> 170     op = _MergeOperation(
    171         left_df,
    172         right_df,
    173         how=how,
    174         on=on,
    175         left_on=left_on,
    176         right_on=right_on,
    177         left_index=left_index,
    178         right_index=right_index,
    179         sort=sort,
    180         suffixes=suffixes,
    181         indicator=indicator,
    182         validate=validate,
    183     )
    184     return op.get_result(copy=copy)

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/reshape/merge.py:807, in _MergeOperation.__init__(self, left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, indicator, validate)
    803 self._validate_tolerance(self.left_join_keys)
    805 # validate the merge keys dtypes. We may need to coerce
    806 # to avoid incompatible dtypes
--> 807 self._maybe_coerce_merge_keys()
    809 # If argument passed to validate,
    810 # check if columns specified as unique
    811 # are in fact unique.
    812 if validate is not None:

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/reshape/merge.py:1509, in _MergeOperation._maybe_coerce_merge_keys(self)
   1503     # unless we are merging non-string-like with string-like
   1504     elif (
   1505         inferred_left in string_types and inferred_right not in string_types
   1506     ) or (
   1507         inferred_right in string_types and inferred_left not in string_types
   1508     ):
-> 1509         raise ValueError(msg)
   1511 # datetimelikes must match exactly
   1512 elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):

ValueError: You are trying to merge on float64 and object columns for key '<VARNAME>'. If you wish to proceed you should use pd.concat
```

Suggested fix:
The following line in file `~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pyreadstat/pyfunctions.py:72`, in `set_value_labels(dataframe, metadata, formats_as_category, formats_as_ordered_category)` is throwing the error:
```
df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
```

So I edited the installed file in place and added the following debugging lines right before the line that fails:
```
print(nw.col(var_name)) 
print(df_copy[var_name].dtype) 
print(df_copy[var_name].unique()) 
print(list(labels.keys())[:5])
df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
```

Output:
```
Narwhals Expr
metadata: ExprMetadata(
  expansion_kind: ExpansionKind.SINGLE,
  last_node: ExprKind.ELEMENTWISE,
  has_windows: False,
  n_orderable_ops: 0,
  is_elementwise: True,
  preserves_length: True,
  is_scalar_like: False,
  is_literal: False,
)

Float64
┌───────────────────────────────────────────┐
|              Narwhals Series              |
|-------------------------------------------|
|0       100.0                           |
... <TRUNCATED OUTPUT>
|312      -85.0                           |
|Name: <VARNAME>, Length: 313, dtype: float64|
└───────────────────────────────────────────┘
['a', 'b', 'c', 'd', 100.0]
```

Replacing that line with the following block is currently working for me:
```
try:
    df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
except:
    df_copy = df_copy.with_columns(nw.col(var_name).cast(nw.Object).replace_strict(labels))
```

Expected behavior
As per the documentation, any Stata missing values should be set to pd.NA automatically by the package.

Setup Information:
How did you install pyreadstat?: pip
Platform: macOS
Python Version: 3.13.7
Python Distribution: plain python
Using Virtualenv or condaenv: No


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

narwhals issue with missing Stata values #316

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

narwhals issue with missing Stata values #316

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions