Skip to content

narwhals issue with missing Stata values #316

@avinnofaruk

Description

@avinnofaruk

Describe the issue
When reading a .dta file with `apply_value_formats=True` and `user_missing=True`, the following error occurs. There is no problem if I set `apply_value_formats=False` and `formats_as_category=False`, but then trying to reattach the value labels with `pyreadstat.set_value_labels()` throws the same error.

To Reproduce
Steps to reproduce the behavior.

df, meta = pyreadstat.read_dta( "file.dta", apply_value_formats=True,  formats_as_category=True, formats_as_ordered_category=False, user_missing=True, dates_as_pandas_datetime=True, extra_date_formats=["YEAR", "MMYY"], encoding="UTF-8" )

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)

File pyreadstat/pyreadstat.pyx:302, in pyreadstat.pyreadstat.read_dta()

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pyreadstat/pyfunctions.py:72, in set_value_labels(dataframe, metadata, formats_as_category, formats_as_ordered_category)
     70     continue
     71 else:
---> 72     df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
     73 if formats_as_ordered_category:
     74     categories = list(set(labels.values()))

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/dataframe.py:1420, in DataFrame.with_columns(self, *exprs, **named_exprs)
   1388 def with_columns(
   1389     self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
   1390 ) -> Self:
   1391     r"""Add columns to this DataFrame.
   1392 
   1393     Added columns will replace existing columns with the same name.
   (...)   1418         1  2  4.0    4
   1419     """
-> 1420     return super().with_columns(*exprs, **named_exprs)

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/dataframe.py:206, in BaseFrame.with_columns(self, *exprs, **named_exprs)
    201 compliant_exprs, kinds = self._flatten_and_extract(*exprs, **named_exprs)
    202 compliant_exprs = [
    203     compliant_expr.broadcast(kind) if is_scalar_like(kind) else compliant_expr
    204     for compliant_expr, kind in zip_strict(compliant_exprs, kinds)
    205 ]
--> 206 return self._with_compliant(self._compliant_frame.with_columns(*compliant_exprs))

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_pandas_like/dataframe.py:451, in PandasLikeDataFrame.with_columns(self, *exprs)
    450 def with_columns(self, *exprs: PandasLikeExpr) -> Self:
--> 451     columns = self._evaluate_into_exprs(*exprs)
    452     if not columns and len(self) == 0:
    453         return self

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/dataframe.py:347, in EagerDataFrame._evaluate_into_exprs(self, *exprs)
    343 def _evaluate_into_exprs(self, *exprs: EagerExprT) -> Sequence[EagerSeriesT]:
    344     # NOTE: Ignore intermittent [False Negative]
    345     # Argument of type "EagerExprT@EagerDataFrame" cannot be assigned to parameter "expr" of type "EagerExprT@EagerDataFrame" in function "_evaluate_into_expr"
    346     #  Type "EagerExprT@EagerDataFrame" is not assignable to type "EagerExprT@EagerDataFrame"
--> 347     return list(chain.from_iterable(self._evaluate_into_expr(expr) for expr in exprs))

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/dataframe.py:347, in <genexpr>(.0)
    343 def _evaluate_into_exprs(self, *exprs: EagerExprT) -> Sequence[EagerSeriesT]:
    344     # NOTE: Ignore intermittent [False Negative]
    345     # Argument of type "EagerExprT@EagerDataFrame" cannot be assigned to parameter "expr" of type "EagerExprT@EagerDataFrame" in function "_evaluate_into_expr"
    346     #  Type "EagerExprT@EagerDataFrame" is not assignable to type "EagerExprT@EagerDataFrame"
--> 347     return list(chain.from_iterable(self._evaluate_into_expr(expr) for expr in exprs))

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/dataframe.py:360, in EagerDataFrame._evaluate_into_expr(self, expr)
    350 """Return list of raw columns.
    351 
    352 For eager backends we alias operations at each step.
   (...)    357 Note that for PySpark / DuckDB, we are less free to liberally set aliases whenever we want.
    358 """
    359 aliases = expr._evaluate_aliases(self)
--> 360 result = expr(self)
    361 if list(aliases) != (
    362     result_aliases := [s.name for s in result]
    363 ):  # pragma: no cover
    364     msg = f"Safety assertion failed, expected {aliases}, got {result_aliases}"

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/expr.py:234, in EagerExpr.__call__(self, df)
    233 def __call__(self, df: EagerDataFrameT) -> Sequence[EagerSeriesT]:
--> 234     return self._call(df)

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_compliant/expr.py:384, in EagerExpr._reuse_series_inner(self, df, method_name, returns_scalar, scalar_kwargs, expressifiable_args)
    371 kwargs = {
    372     **scalar_kwargs,
    373     **{
   (...)    376     },
    377 }
    378 method = methodcaller(
    379     method_name,
    380     **self._reuse_series_extra_kwargs(returns_scalar=returns_scalar),
    381     **kwargs,
    382 )
    383 out: Sequence[EagerSeriesT] = [
--> 384     series._from_scalar(method(series)) if returns_scalar else method(series)
    385     for series in self(df)
    386 ]
    387 aliases = self._evaluate_aliases(df)
    388 if [s.name for s in out] != list(aliases):  # pragma: no cover

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/narwhals/_pandas_like/series.py:669, in PandasLikeSeries.replace_strict(self, old, new, return_dtype)
    664 namespace = self.__native_namespace__()
    665 other = namespace.DataFrame(
    666     {self.name: old, tmp_name: namespace.Series(new, dtype=dtype)}
    667 )
    668 result = self._with_native(
--> 669     self.native.to_frame().merge(other, on=self.name, how="left")[tmp_name]
    670 ).alias(self.name)
    671 if result.is_null().sum() != self.is_null().sum():
    672     msg = (
    673         "replace_strict did not replace all non-null values.\n\n"
    674         f"The following did not get replaced: {self.filter(~self.is_null() & result.is_null()).unique(maintain_order=False).to_list()}"
    675     )

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/frame.py:10859, in DataFrame.merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
  10840 @Substitution("")
  10841 @Appender(_merge_doc, indents=2)
  10842 def merge(
   (...)  10855     validate: MergeValidate | None = None,
  10856 ) -> DataFrame:
  10857     from pandas.core.reshape.merge import merge
> 10859     return merge(
  10860         self,
  10861         right,
  10862         how=how,
  10863         on=on,
  10864         left_on=left_on,
  10865         right_on=right_on,
  10866         left_index=left_index,
  10867         right_index=right_index,
  10868         sort=sort,
  10869         suffixes=suffixes,
  10870         copy=copy,
  10871         indicator=indicator,
  10872         validate=validate,
  10873     )

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/reshape/merge.py:170, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
    155     return _cross_merge(
    156         left_df,
    157         right_df,
   (...)    167         copy=copy,
    168     )
    169 else:
--> 170     op = _MergeOperation(
    171         left_df,
    172         right_df,
    173         how=how,
    174         on=on,
    175         left_on=left_on,
    176         right_on=right_on,
    177         left_index=left_index,
    178         right_index=right_index,
    179         sort=sort,
    180         suffixes=suffixes,
    181         indicator=indicator,
    182         validate=validate,
    183     )
    184     return op.get_result(copy=copy)

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/reshape/merge.py:807, in _MergeOperation.__init__(self, left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, indicator, validate)
    803 self._validate_tolerance(self.left_join_keys)
    805 # validate the merge keys dtypes. We may need to coerce
    806 # to avoid incompatible dtypes
--> 807 self._maybe_coerce_merge_keys()
    809 # If argument passed to validate,
    810 # check if columns specified as unique
    811 # are in fact unique.
    812 if validate is not None:

File ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pandas/core/reshape/merge.py:1509, in _MergeOperation._maybe_coerce_merge_keys(self)
   1503     # unless we are merging non-string-like with string-like
   1504     elif (
   1505         inferred_left in string_types and inferred_right not in string_types
   1506     ) or (
   1507         inferred_right in string_types and inferred_left not in string_types
   1508     ):
-> 1509         raise ValueError(msg)
   1511 # datetimelikes must match exactly
   1512 elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):

ValueError: You are trying to merge on float64 and object columns for key '<VARNAME>'. If you wish to proceed you should use pd.concat

Suggested fix:
The following line in file ~/.pyenv/versions/3.13.7/lib/python3.13/site-packages/pyreadstat/pyfunctions.py:72, in set_value_labels(dataframe, metadata, formats_as_category, formats_as_ordered_category) is throwing the error:

df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))

To inspect the inputs, I added the following print statements directly before the failing call (the last line below is the original, unchanged statement):

print(nw.col(var_name)) 
print(df_copy[var_name].dtype) 
print(df_copy[var_name].unique()) 
print(list(labels.keys())[:5])
df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))

Output:

Narwhals Expr
metadata: ExprMetadata(
  expansion_kind: ExpansionKind.SINGLE,
  last_node: ExprKind.ELEMENTWISE,
  has_windows: False,
  n_orderable_ops: 0,
  is_elementwise: True,
  preserves_length: True,
  is_scalar_like: False,
  is_literal: False,
)

Float64
┌───────────────────────────────────────────┐
|              Narwhals Series              |
|-------------------------------------------|
|0       100.0                           |
... <TRUNCATED OUTPUT>
|312      -85.0                           |
|Name: <VARNAME>, Length: 313, dtype: float64|
└───────────────────────────────────────────┘
['a', 'b', 'c', 'd', 100.0]

Replacing that line with the following block is currently working for me:

try:
    df_copy = df_copy.with_columns(nw.col(var_name).replace_strict(labels))
except:
    df_copy = df_copy.with_columns(nw.col(var_name).cast(nw.Object).replace_strict(labels))

Expected behavior
As per the documentation, any Stata missing values should be set to pd.NA automatically by the package.

Setup Information:
How did you install pyreadstat?: pip
Platform: macOS
Python Version: 3.13.7
Python Distribution: plain python
Using Virtualenv or condaenv: No

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions