[SPARK-43873][PS] Enabling FrameDescribeTests #42319
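Context for the changes below: `datetime_is_numeric=True` was the pandas 1.x opt-in that made `DataFrame.describe()` summarize datetime columns numerically; pandas 2.0 makes that the default and removes the keyword, so the updated test calls `describe()` without it and the pandas-2.0 skip decorators can be dropped. A minimal sketch of the behavior difference (the example frame is illustrative, not taken from the test file):

```python
import pandas as pd

pdf = pd.DataFrame({"ts": pd.to_datetime(["2021-11-01", "2021-12-01", "2021-12-01"])})

# pandas >= 2.0: datetime columns are always summarized numerically
# (count/mean/min/max plus percentiles) and the keyword no longer exists.
print(pdf.describe())

# pandas 1.1-1.5 needed the explicit opt-in for the same behavior:
# pdf.describe(datetime_is_numeric=True)
```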

39 changes: 9 additions & 30 deletions python/pyspark/pandas/tests/computation/test_describe.py
@@ -39,10 +39,6 @@ def df_pair(self):
         psdf = ps.from_pandas(pdf)
         return pdf, psdf

-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43556): Enable DataFrameSlowTests.test_describe for pandas 2.0.0.",
-    )
     def test_describe(self):
         pdf, psdf = self.df_pair

@@ -78,19 +74,10 @@ def test_describe(self):
             }
         )
         pdf = psdf._to_pandas()
-        # NOTE: Set `datetime_is_numeric=True` for pandas:
-        # FutureWarning: Treating datetime data as categorical rather than numeric in
-        # `.describe` is deprecated and will be removed in a future version of pandas.
-        # Specify `datetime_is_numeric=True` to silence this
-        # warning and adopt the future behavior now.
         # NOTE: Compare the result except percentiles, since we use approximate percentile
         # so the result is different from pandas.
         if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
             self.assert_eq(
                 psdf.describe().loc[["count", "mean", "min", "max"]],
-                pdf.describe(datetime_is_numeric=True)
-                .astype(str)
-                .loc[["count", "mean", "min", "max"]],
+                pdf.describe().astype(str).loc[["count", "mean", "min", "max"]],
             )
         else:
             self.assert_eq(
@@ -136,17 +123,13 @@ def test_describe(self):
         if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
             self.assert_eq(
                 psdf.describe().loc[["count", "mean", "min", "max"]],
-                pdf.describe(datetime_is_numeric=True)
-                .astype(str)
-                .loc[["count", "mean", "min", "max"]],
+                pdf.describe().astype(str).loc[["count", "mean", "min", "max"]],
             )
             psdf.A += psdf.A
             pdf.A += pdf.A
             self.assert_eq(
                 psdf.describe().loc[["count", "mean", "min", "max"]],
-                pdf.describe(datetime_is_numeric=True)
-                .astype(str)
-                .loc[["count", "mean", "min", "max"]],
+                pdf.describe().astype(str).loc[["count", "mean", "min", "max"]],
             )
         else:
             expected_result = ps.DataFrame(
@@ -187,15 +170,15 @@ def test_describe(self):
         )
         pdf = psdf._to_pandas()
         if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
-            pandas_result = pdf.describe(datetime_is_numeric=True)
+            pandas_result = pdf.describe()
             pandas_result.B = pandas_result.B.astype(str)
             self.assert_eq(
                 psdf.describe().loc[["count", "mean", "min", "max"]],
                 pandas_result.loc[["count", "mean", "min", "max"]],
             )
             psdf.A += psdf.A
             pdf.A += pdf.A
-            pandas_result = pdf.describe(datetime_is_numeric=True)
+            pandas_result = pdf.describe()
             pandas_result.B = pandas_result.B.astype(str)
             self.assert_eq(
                 psdf.describe().loc[["count", "mean", "min", "max"]],
@@ -252,7 +235,7 @@ def test_describe(self):
         )
         pdf = psdf._to_pandas()
         if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
-            pandas_result = pdf.describe(datetime_is_numeric=True)
+            pandas_result = pdf.describe()
             pandas_result.b = pandas_result.b.astype(str)
             self.assert_eq(
                 psdf.describe().loc[["count", "mean", "min", "max"]],
@@ -288,10 +271,6 @@ def test_describe(self):
         with self.assertRaisesRegex(ValueError, msg):
             psdf.describe()

-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43556): Enable DataFrameSlowTests.test_describe for pandas 2.0.0.",
-    )
     def test_describe_empty(self):
         # Empty DataFrame
         psdf = ps.DataFrame(columns=["A", "B"])
@@ -328,7 +307,7 @@ def test_describe_empty(self):
         # For timestamp type, we should convert NaT to None in pandas result
         # since pandas API on Spark doesn't support the NaT for object type.
         if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
-            pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
+            pdf_result = pdf[pdf.a != pdf.a].describe()
             self.assert_eq(
                 psdf[psdf.a != psdf.a].describe(),
                 pdf_result.where(pdf_result.notnull(), None).astype(str),
@@ -367,7 +346,7 @@ def test_describe_empty(self):
         )
         pdf = psdf._to_pandas()
         if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
-            pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
+            pdf_result = pdf[pdf.a != pdf.a].describe()
             pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str)
             self.assert_eq(
                 psdf[psdf.a != psdf.a].describe(),
@@ -417,7 +396,7 @@ def test_describe_empty(self):
         )
         pdf = psdf._to_pandas()
         if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
-            pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
+            pdf_result = pdf[pdf.a != pdf.a].describe()
             self.assert_eq(
                 psdf[psdf.a != psdf.a].describe(),
                 pdf_result.where(pdf_result.notnull(), None).astype(str),
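The NOTE kept in the test explains why the assertions compare only the `count`, `mean`, `min`, and `max` rows: pandas API on Spark computes percentiles approximately, so the `25%`/`50%`/`75%` rows can legitimately differ from pandas. A small standalone sketch of that comparison pattern (the sample data and the use of `to_pandas()`/`assert_frame_equal` are assumptions for illustration, not the test's own harness):

```python
import pandas as pd
import pyspark.pandas as ps

pdf = pd.DataFrame({"A": [1.0, 2.0, 3.0, 4.0], "B": [10.0, 20.0, 30.0, 40.0]})
psdf = ps.from_pandas(pdf)

# Percentile rows come from an approximate algorithm on Spark, so restrict the
# comparison to the statistics that are computed exactly.
rows = ["count", "mean", "min", "max"]
pd.testing.assert_frame_equal(
    psdf.describe().loc[rows].to_pandas(),
    pdf.describe().loc[rows],
)
```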