Skip to content

Commit e04ea09

Browse files
committed
Merge remote-tracking branch 'upstream/develop' into develop
2 parents 7b201eb + 1c3e682 commit e04ea09

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+5229
-548
lines changed

.pylintrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# A comma-separated list of package or module names from where C extensions may
44
# be loaded. Extensions are loading into the active Python interpreter and may
55
# run arbitrary code.
6-
extension-pkg-whitelist=lxml
6+
extension-pkg-whitelist=lxml, pandas._libs.missing
77

88
# Add files or directories to the blacklist. They should be base names, not
99
# paths.

dataprep/assets/english_stopwords.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
english_stopwords = [
1+
english_stopwords = {
22
"i",
33
"me",
44
"my",
@@ -178,4 +178,4 @@
178178
"won't",
179179
"wouldn",
180180
"wouldn't",
181-
]
181+
}

dataprep/clean/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@
2323

2424
from .clean_duplication import clean_duplication
2525

26+
from .clean_currency import clean_currency, validate_currency
27+
28+
from .clean_df import clean_df
29+
30+
from .clean_text import clean_text, default_text_pipeline
31+
32+
2633
__all__ = [
2734
"clean_lat_long",
2835
"validate_lat_long",
@@ -42,4 +49,9 @@
4249
"clean_date",
4350
"validate_date",
4451
"clean_duplication",
52+
"clean_currency",
53+
"validate_currency",
54+
"clean_df",
55+
"clean_text",
56+
"default_text_pipeline",
4557
]

dataprep/clean/clean_country.py

Lines changed: 83 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
from functools import lru_cache
55
from operator import itemgetter
66
from os import path
7-
from re import error
8-
from typing import Any, Union
7+
from typing import Any, Union, Tuple, Optional
98

109
import dask
1110
import dask.dataframe as dd
@@ -26,7 +25,7 @@
2625
def clean_country(
2726
df: Union[pd.DataFrame, dd.DataFrame],
2827
column: str,
29-
input_format: str = "auto",
28+
input_format: Union[str, Tuple[str, ...]] = "auto",
3029
output_format: str = "name",
3130
fuzzy_dist: int = 0,
3231
strict: bool = False,
@@ -55,6 +54,10 @@ def clean_country(
5554
- 'alpha-3': alpha-3 code ('USA')
5655
- 'numeric': numeric code (840)
5756
57+
Can also be a tuple containing any combination of input formats,
58+
for example to clean a column containing alpha-2 and numeric
59+
codes set input_format to ('alpha-2', 'numeric').
60+
5861
(default: 'auto')
5962
output_format
6063
The desired ISO 3166 format of the country:
@@ -112,14 +115,7 @@ def clean_country(
112115
1 US United States
113116
"""
114117
# pylint: disable=too-many-arguments
115-
116-
input_formats = {"auto", "name", "official", "alpha-2", "alpha-3", "numeric"}
117118
output_formats = {"name", "official", "alpha-2", "alpha-3", "numeric"}
118-
if input_format not in input_formats:
119-
raise ValueError(
120-
f'input_format {input_format} is invalid, it needs to be one of "auto", '
121-
'"name", "official", "alpha-2", "alpha-3" or "numeric'
122-
)
123119
if output_format not in output_formats:
124120
raise ValueError(
125121
f'output_format {output_format} is invalid, it needs to be "name", '
@@ -130,6 +126,7 @@ def clean_country(
130126
"can't do fuzzy matching while strict mode is enabled, "
131127
"set strict=False for fuzzy matching or fuzzy_dist=0 for strict matching"
132128
)
129+
input_formats = _input_format_to_tuple(input_format)
133130

134131
# convert to dask
135132
df = to_dask(df)
@@ -140,7 +137,8 @@ def clean_country(
140137
# amount of different codes to produce the report
141138
df["clean_code_tup"] = df[column].map_partitions(
142139
lambda srs: [
143-
_format_country(x, input_format, output_format, fuzzy_dist, strict, errors) for x in srs
140+
_format_country(x, input_formats, output_format, fuzzy_dist, strict, errors)
141+
for x in srs
144142
],
145143
meta=object,
146144
)
@@ -168,7 +166,9 @@ def clean_country(
168166

169167

170168
def validate_country(
171-
x: Union[str, int, pd.Series], input_format: str = "auto", strict: bool = True
169+
x: Union[str, int, pd.Series],
170+
input_format: Union[str, Tuple[str, ...]] = "auto",
171+
strict: bool = True,
172172
) -> Union[bool, pd.Series]:
173173
"""
174174
Validate country names.
@@ -188,6 +188,10 @@ def validate_country(
188188
- 'alpha-3': alpha-3 code ('USA')
189189
- 'numeric': numeric code (840)
190190
191+
Can also be a tuple containing any combination of input formats,
192+
for example to validate a column containing alpha-2 and numeric
193+
codes set input_format to ('alpha-2', 'numeric').
194+
191195
(default: 'auto')
192196
strict
193197
If True, matching for input formats 'name' and 'official' are done by
@@ -207,18 +211,18 @@ def validate_country(
207211
1 False
208212
Name: country, dtype: bool
209213
"""
210-
214+
input_formats = _input_format_to_tuple(input_format)
211215
if isinstance(x, pd.Series):
212216
x = x.astype(str).str.lower().str.strip()
213-
return x.apply(_check_country, args=(input_format, strict, False))
217+
return x.apply(_check_country, args=(input_formats, strict, False))
214218

215219
x = str(x).lower().strip()
216-
return _check_country(x, input_format, strict, False)
220+
return _check_country(x, input_formats, strict, False)
217221

218222

219223
def _format_country(
220224
val: Any,
221-
input_format: str,
225+
input_formats: Tuple[str, ...],
222226
output_format: str,
223227
fuzzy_dist: int,
224228
strict: bool,
@@ -241,9 +245,13 @@ def _format_country(
241245
# could not be parsed) or "success" (a successful parse of the value).
242246

243247
country = str(val).lower().strip()
244-
result_index, status = _check_country(country, input_format, strict, True)
248+
result_index, status = _check_country(country, input_formats, strict, True)
245249

246-
if fuzzy_dist > 0 and status == "unknown" and input_format in ("auto", "name", "official"):
250+
if (
251+
fuzzy_dist > 0
252+
and status == "unknown"
253+
and ("name" in input_formats or "official" in input_formats)
254+
):
247255
result_index, status = _check_fuzzy_dist(country, fuzzy_dist)
248256

249257
if status == "null":
@@ -264,16 +272,16 @@ def _format_country(
264272

265273

266274
@lru_cache(maxsize=2 ** 20)
267-
def _check_country(country: str, input_format: str, strict: bool, clean: bool) -> Any:
275+
def _check_country(country: str, input_formats: Tuple[str, ...], strict: bool, clean: bool) -> Any:
268276
"""
269277
Finds the index of the given country in the DATA dataframe.
270278
271279
Parameters
272280
----------
273281
country
274282
string containing the country value being cleaned
275-
input_format
276-
the ISO 3166 input format of the country
283+
input_formats
284+
Tuple containing potential ISO 3166 input formats of the country
277285
strict
278286
If True, for input types "name" and "official" the function looks for a direct match
279287
in the DATA dataframe. If False, the country input is searched for a regex match.
@@ -284,19 +292,18 @@ def _check_country(country: str, input_format: str, strict: bool, clean: bool) -
284292
if country in NULL_VALUES:
285293
return (None, "null") if clean else False
286294

287-
if input_format == "auto":
288-
input_format = _get_format_from_name(country)
295+
country_format = _get_format_from_name(country)
296+
input_format = _get_format_if_allowed(country_format, input_formats)
297+
if not input_format:
298+
return (None, "unknown") if clean else False
289299

290300
if strict and input_format == "regex":
291301
for form in ("name", "official"):
292-
try:
293-
ind = DATA[
294-
DATA[form].str.contains(f"^{country}$", flags=re.IGNORECASE, na=False)
295-
].index
296-
if np.size(ind) > 0:
297-
return (ind[0], "success") if clean else True
298-
except error:
299-
return (None, "unknown") if clean else False
302+
ind = DATA[
303+
DATA[form].str.contains(f"^{re.escape(country)}$", flags=re.IGNORECASE, na=False)
304+
].index
305+
if np.size(ind) > 0:
306+
return (ind[0], "success") if clean else True
300307

301308
elif not strict and input_format in ("regex", "name", "official"):
302309
for index, country_regex in enumerate(REGEXES):
@@ -305,7 +312,9 @@ def _check_country(country: str, input_format: str, strict: bool, clean: bool) -
305312

306313
else:
307314
ind = DATA[
308-
DATA[input_format].str.contains(f"^{country}$", flags=re.IGNORECASE, na=False)
315+
DATA[input_format].str.contains(
316+
f"^{re.escape(country)}$", flags=re.IGNORECASE, na=False
317+
)
309318
].index
310319
if np.size(ind) > 0:
311320
return (ind[0], "success") if clean else True
@@ -346,3 +355,45 @@ def _get_format_from_name(name: str) -> str:
346355
return "numeric"
347356
except ValueError:
348357
return "alpha-2" if len(name) == 2 else "alpha-3" if len(name) == 3 else "regex"
358+
359+
360+
def _get_format_if_allowed(input_format: str, allowed_formats: Tuple[str, ...]) -> Optional[str]:
361+
"""
362+
Returns the input format if it's an allowed format.
363+
"regex" input_format is only returned if "name" and "official are
364+
allowed. This is because when strict = True and input_format = "regex"
365+
both the "name" and "official" columns in the DATA dataframe are checked.
366+
"""
367+
if input_format == "regex":
368+
if "name" in allowed_formats and "official" in allowed_formats:
369+
return "regex"
370+
371+
return (
372+
"name"
373+
if "name" in allowed_formats
374+
else "official"
375+
if "official" in allowed_formats
376+
else None
377+
)
378+
379+
return input_format if input_format in allowed_formats else None
380+
381+
382+
def _input_format_to_tuple(input_format: Union[str, Tuple[str, ...]]) -> Tuple[str, ...]:
383+
"""
384+
Converts a string input format to a tuple of allowed input formats and raises an error
385+
if an input format is not valid.
386+
"""
387+
input_formats = {"auto", "name", "official", "alpha-2", "alpha-3", "numeric"}
388+
if isinstance(input_format, str):
389+
if input_format == "auto":
390+
return ("name", "official", "alpha-2", "alpha-3", "numeric")
391+
input_format = (input_format,)
392+
393+
for fmt in input_format:
394+
if fmt not in input_formats:
395+
raise ValueError(
396+
f'input_format {fmt} is invalid, it needs to be one of "auto", '
397+
'"name", "official", "alpha-2", "alpha-3" or "numeric'
398+
)
399+
return input_format

0 commit comments

Comments
 (0)