Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: Use numpy random generator for testing data #54209

Merged
merged 43 commits into from
Aug 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
36c62cd
TST: Use numpy random generator with ruff NPY002
mroeschke Jul 11, 2023
7495d1e
Fix other testing functions
mroeschke Jul 11, 2023
11e22a1
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 11, 2023
efb909b
Fix random_sample
mroeschke Jul 11, 2023
6b86a2a
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 11, 2023
6ca87a9
Fix more usage
mroeschke Jul 11, 2023
dafbb21
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 12, 2023
b2f42ad
Fix more
mroeschke Jul 12, 2023
af5829b
Replace more
mroeschke Jul 12, 2023
420d21c
address rand
mroeschke Jul 12, 2023
8170bb2
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 13, 2023
d26de4e
More fixes
mroeschke Jul 13, 2023
7f95edd
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 13, 2023
7a30da0
Fix more standard_normal
mroeschke Jul 13, 2023
c169d3c
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 13, 2023
ff12f61
Fix more
mroeschke Jul 13, 2023
cef2805
Fix
mroeschke Jul 13, 2023
590c6fa
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 14, 2023
094c39f
Address more
mroeschke Jul 14, 2023
dbea69c
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 14, 2023
a8df4c2
Fix more test
mroeschke Jul 14, 2023
baa91b5
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 17, 2023
c4a9cbb
fix more tests
mroeschke Jul 17, 2023
7e4d908
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 18, 2023
744e638
Try addressing windows tests
mroeschke Jul 18, 2023
cea301e
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 18, 2023
9e9ce3e
Address conftest, ignore asv
mroeschke Jul 18, 2023
7aa13a3
adjust once more
mroeschke Jul 18, 2023
9480d51
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 18, 2023
855fea6
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 19, 2023
71a17df
Another dtype
mroeschke Jul 19, 2023
2ad2f33
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 20, 2023
846955f
fix another unseeded default_rng
mroeschke Jul 20, 2023
73bd560
Add a rule for unseeded default_rng
mroeschke Jul 20, 2023
a90c4ca
Remove space
mroeschke Jul 20, 2023
f1b59b6
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 21, 2023
2f8d40a
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 24, 2023
ffa80ab
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 25, 2023
bf09f56
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 28, 2023
7f29304
other fixes
mroeschke Jul 28, 2023
476ad0d
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 28, 2023
fbc37b5
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 31, 2023
f035878
Merge remote-tracking branch 'upstream/main' into tst/random
mroeschke Jul 31, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,9 @@ repos:

# os.remove
|os\.remove

# Unseeded numpy default_rng
|default_rng\(\)
files: ^pandas/tests/
types_or: [python, cython, rst]
- id: unwanted-patterns-in-ea-tests
Expand Down
56 changes: 15 additions & 41 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,9 +391,9 @@ def makeNumericIndex(k: int = 10, *, name=None, dtype: Dtype | None) -> Index:
if is_unsigned_integer_dtype(dtype):
values += 2 ** (dtype.itemsize * 8 - 1)
elif dtype.kind == "f":
values = np.random.random_sample(k) - np.random.random_sample(1)
values = np.random.default_rng(2).random(k) - np.random.default_rng(2).random(1)
values.sort()
values = values * (10 ** np.random.randint(0, 9))
values = values * (10 ** np.random.default_rng(2).integers(0, 9))
else:
raise NotImplementedError(f"wrong dtype {dtype}")

Expand Down Expand Up @@ -487,7 +487,7 @@ def all_timeseries_index_generator(k: int = 10) -> Iterable[Index]:
# make series
def make_rand_series(name=None, dtype=np.float64) -> Series:
index = makeStringIndex(_N)
data = np.random.randn(_N)
data = np.random.default_rng(2).standard_normal(_N)
with np.errstate(invalid="ignore"):
data = data.astype(dtype, copy=False)
return Series(data, index=index, name=name)
Expand All @@ -510,21 +510,30 @@ def makeObjectSeries(name=None) -> Series:

def getSeriesData() -> dict[str, Series]:
index = makeStringIndex(_N)
return {c: Series(np.random.randn(_N), index=index) for c in getCols(_K)}
return {
c: Series(np.random.default_rng(i).standard_normal(_N), index=index)
for i, c in enumerate(getCols(_K))
}


def makeTimeSeries(nper=None, freq: Frequency = "B", name=None) -> Series:
if nper is None:
nper = _N
return Series(
np.random.randn(nper), index=makeDateIndex(nper, freq=freq), name=name
np.random.default_rng(2).standard_normal(nper),
index=makeDateIndex(nper, freq=freq),
name=name,
)


def makePeriodSeries(nper=None, name=None) -> Series:
if nper is None:
nper = _N
return Series(np.random.randn(nper), index=makePeriodIndex(nper), name=name)
return Series(
np.random.default_rng(2).standard_normal(nper),
index=makePeriodIndex(nper),
name=name,
)


def getTimeSeriesData(nper=None, freq: Frequency = "B") -> dict[str, Series]:
Expand Down Expand Up @@ -787,40 +796,6 @@ def makeCustomDataframe(
return DataFrame(data, index, columns, dtype=dtype)


def _create_missing_idx(nrows, ncols, density: float, random_state=None):
if random_state is None:
random_state = np.random
else:
random_state = np.random.RandomState(random_state)

# below is cribbed from scipy.sparse
size = round((1 - density) * nrows * ncols)
# generate a few more to ensure unique values
min_rows = 5
fac = 1.02
extra_size = min(size + min_rows, fac * size)

def _gen_unique_rand(rng, _extra_size):
ind = rng.rand(int(_extra_size))
return np.unique(np.floor(ind * nrows * ncols))[:size]

ind = _gen_unique_rand(random_state, extra_size)
while ind.size < size:
extra_size *= 1.05
ind = _gen_unique_rand(random_state, extra_size)

j = np.floor(ind * 1.0 / nrows).astype(int)
i = (ind - j * nrows).astype(int)
return i.tolist(), j.tolist()


def makeMissingDataframe(density: float = 0.9, random_state=None) -> DataFrame:
df = makeDataFrame()
i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state)
df.iloc[i, j] = np.nan
return df


class SubclassedSeries(Series):
_metadata = ["testattr", "name"]

Expand Down Expand Up @@ -1131,7 +1106,6 @@ def shares_memory(left, right) -> bool:
"makeFloatSeries",
"makeIntervalIndex",
"makeIntIndex",
"makeMissingDataframe",
"makeMixedDataFrame",
"makeMultiIndex",
"makeNumericIndex",
Expand Down
5 changes: 3 additions & 2 deletions pandas/_testing/_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ def rands_array(
Generate an array of byte strings.
"""
retval = (
np.random.choice(RANDS_CHARS, size=nchars * np.prod(size), replace=replace)
np.random.default_rng(2)
.choice(RANDS_CHARS, size=nchars * np.prod(size), replace=replace)
.view((np.str_, nchars))
.reshape(size)
)
Expand All @@ -31,4 +32,4 @@ def rands(nchars) -> str:
See `rands_array` if you want to create an array of random strings.

"""
return "".join(np.random.choice(RANDS_CHARS, nchars))
return "".join(np.random.default_rng(2).choice(RANDS_CHARS, nchars))
12 changes: 7 additions & 5 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,9 @@ def multiindex_dataframe_random_data(
"""DataFrame with 2 level MultiIndex with random data"""
index = lexsorted_two_level_string_multiindex
return DataFrame(
np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp")
np.random.default_rng(2).standard_normal((10, 3)),
index=index,
columns=Index(["A", "B", "C"], name="exp"),
)


Expand Down Expand Up @@ -614,7 +616,7 @@ def _create_mi_with_dt64tz_level():
"float32": tm.makeFloatIndex(100, dtype="float32"),
"float64": tm.makeFloatIndex(100, dtype="float64"),
"bool-object": tm.makeBoolIndex(10).astype(object),
"bool-dtype": Index(np.random.randn(10) < 0),
"bool-dtype": Index(np.random.default_rng(2).standard_normal(10) < 0),
"complex64": tm.makeNumericIndex(100, dtype="float64").astype("complex64"),
"complex128": tm.makeNumericIndex(100, dtype="float64").astype("complex128"),
"categorical": tm.makeCategoricalIndex(100),
Expand Down Expand Up @@ -744,7 +746,7 @@ def datetime_series() -> Series:
def _create_series(index):
"""Helper for the _series dict"""
size = len(index)
data = np.random.randn(size)
data = np.random.default_rng(2).standard_normal(size)
return Series(data, index=index, name="a", copy=False)


Expand Down Expand Up @@ -773,7 +775,7 @@ def series_with_multilevel_index() -> Series:
]
tuples = zip(*arrays)
index = MultiIndex.from_tuples(tuples)
data = np.random.randn(8)
data = np.random.default_rng(2).standard_normal(8)
ser = Series(data, index=index)
ser.iloc[3] = np.NaN
return ser
Expand Down Expand Up @@ -946,7 +948,7 @@ def rand_series_with_duplicate_datetimeindex() -> Series:
datetime(2000, 1, 5),
]

return Series(np.random.randn(len(dates)), index=dates)
return Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates)


# ----------------------------------------------------------------
Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def test_apply_reduce_to_dict():


def test_apply_differently_indexed():
df = DataFrame(np.random.randn(20, 10))
df = DataFrame(np.random.default_rng(2).standard_normal((20, 10)))

result = df.apply(Series.describe, axis=0)
expected = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns)
Expand Down Expand Up @@ -463,9 +463,9 @@ def test_apply_convert_objects():
"shiny",
"shiny",
],
"D": np.random.randn(11),
"E": np.random.randn(11),
"F": np.random.randn(11),
"D": np.random.default_rng(2).standard_normal(11),
"E": np.random.default_rng(2).standard_normal(11),
"F": np.random.default_rng(2).standard_normal(11),
}
)

Expand Down Expand Up @@ -659,7 +659,7 @@ def test_apply_category_equalness(val):
def test_infer_row_shape():
# GH 17437
# if row shape is changing, infer it
df = DataFrame(np.random.rand(10, 2))
df = DataFrame(np.random.default_rng(2).random((10, 2)))
result = df.apply(np.fft.fft, axis=0).shape
assert result == (10, 2)

Expand Down Expand Up @@ -816,7 +816,7 @@ def test_with_listlike_columns():
# GH 17348
df = DataFrame(
{
"a": Series(np.random.randn(4)),
"a": Series(np.random.default_rng(2).standard_normal(4)),
"b": ["a", "list", "of", "words"],
"ts": date_range("2016-10-01", periods=4, freq="H"),
}
Expand Down Expand Up @@ -862,7 +862,9 @@ def test_infer_output_shape_columns():
def test_infer_output_shape_listlike_columns():
# GH 16353

df = DataFrame(np.random.randn(6, 3), columns=["A", "B", "C"])
df = DataFrame(
np.random.default_rng(2).standard_normal((6, 3)), columns=["A", "B", "C"]
)

result = df.apply(lambda x: [1, 2, 3], axis=1)
expected = Series([[1, 2, 3] for t in df.itertuples()])
Expand Down Expand Up @@ -911,7 +913,9 @@ def fun(x):
def test_consistent_coerce_for_shapes(lst):
# we want column names to NOT be propagated
# just because the shape matches the input shape
df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"])
df = DataFrame(
np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
)

result = df.apply(lambda x: lst, axis=1)
expected = Series([lst for t in df.itertuples()])
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def test_series_nested_renamer(renamer):

def test_apply_dict_depr():
tsdf = DataFrame(
np.random.randn(10, 3),
np.random.default_rng(2).standard_normal((10, 3)),
columns=["A", "B", "C"],
index=date_range("1/1/2000", periods=10),
)
Expand Down Expand Up @@ -190,9 +190,9 @@ def test_apply_modify_traceback():
"shiny",
"shiny",
],
"D": np.random.randn(11),
"E": np.random.randn(11),
"F": np.random.randn(11),
"D": np.random.default_rng(2).standard_normal(11),
"E": np.random.default_rng(2).standard_normal(11),
"F": np.random.default_rng(2).standard_normal(11),
}
)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/apply/test_series_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def f(x):

@pytest.mark.parametrize("convert_dtype", [True, False])
def test_apply_convert_dtype_deprecated(convert_dtype):
ser = Series(np.random.randn(10))
ser = Series(np.random.default_rng(2).standard_normal(10))

def func(x):
return x if x > 0 else np.nan
Expand Down
Loading