Skip to content

Commit 33923fc

Browse files
committed
add clean_language function
1 parent c735cd9 commit 33923fc

File tree

3 files changed

+8239
-0
lines changed

3 files changed

+8239
-0
lines changed

dataprep/clean/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030
from .clean_text import clean_text, default_text_pipeline
3131

32+
from .clean_language import clean_language, validate_language
33+
3234

3335
__all__ = [
3436
"clean_lat_long",
@@ -54,4 +56,6 @@
5456
"clean_df",
5557
"clean_text",
5658
"default_text_pipeline",
59+
"clean_language",
60+
"validate_language",
5761
]

dataprep/clean/clean_language.py

Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
"""
2+
Clean and validate a DataFrame column containing language.
3+
"""
4+
5+
# pylint: disable=too-many-arguments, global-statement
6+
7+
from os import path
8+
from typing import Any, Union, Tuple, List, Optional, Dict
9+
from operator import itemgetter
10+
11+
import dask.dataframe as dd
12+
import numpy as np
13+
import pandas as pd
14+
15+
from ..progress_bar import ProgressBar
16+
from .utils import NULL_VALUES, to_dask
17+
from .clean_headers import clean_headers
18+
19+
DEFAULT_LANGUAGE_DATA_FILE = path.join(path.split(path.abspath(__file__))[0], "language_data.csv")
20+
21+
DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, encoding="utf-8", dtype=str)
22+
ALPHA2: Dict[str, List[int]] = {}
23+
ALPHA3: Dict[str, List[int]] = {}
24+
NAME: Dict[str, List[int]] = {}
25+
26+
27+
def clean_language(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    input_format: Union[str, Tuple[str, ...]] = "auto",
    output_format: str = "name",
    kb_path: str = "default",
    encode: Optional[str] = None,
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean language type data in a DataFrame column.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    column
        The name of the column containing data of language type.
    input_format
        The ISO 639 input format of the language.
        - 'auto': infer the input format
        - 'name': language name ('English')
        - 'alpha-2': alpha-2 code ('en')
        - 'alpha-3': alpha-3 code ('eng')

        Can also be a tuple containing any combination of input formats,
        for example to clean a column containing name and alpha-2
        codes set input_format to ('name', 'alpha-2').

        (default: 'auto')
    output_format
        The desired ISO 639 format of the language.
        - 'name': language name ('English')
        - 'alpha-2': alpha-2 code ('en')
        - 'alpha-3': alpha-3 code ('eng')

        (default: 'name')
    kb_path
        The path of a user specified knowledge base.
        In the current stage, it should be a file in the user's local
        directory following the format we propose.

        (default: 'default')
    encode
        The encoding of the knowledge base. It will be passed to `pd.read_csv`.

        (default: None)
    inplace
        If True, delete the column containing the data that was cleaned.
        Otherwise, keep the original column.

        (default: False)
    errors
        How to handle parsing errors.
        - 'coerce': invalid parsing will be set to NaN.
        - 'ignore': invalid parsing will return the input.
        - 'raise': invalid parsing will raise an exception.

        (default: 'coerce')
    progress
        If True, display a progress bar.

        (default: True)

    Examples
    --------
    Clean a column of language data.

    >>> df = pd.DataFrame({'language': ['eng', 'zh', 'japanese']})
    >>> clean_language(df, 'language')
       language language_clean
    0       eng        English
    1        zh        Chinese
    2  japanese       Japanese
    """
    # load knowledge base: (re)builds the module-level DATA table and the
    # NAME/ALPHA2/ALPHA3 first-letter lookup indices used by _check_language
    _load_kb(kb_path, encode)

    # validate arguments eagerly, before any lazy dask graph is built
    valid_output_formats = {"name", "alpha-2", "alpha-3"}
    if output_format not in valid_output_formats:
        raise ValueError(
            f'output_format {output_format} is invalid, it needs to be "name", '
            '"alpha-2" or "alpha-3"'
        )

    valid_errors = {"coerce", "ignore", "raise"}
    if errors not in valid_errors:
        raise ValueError(
            f'errors {errors} is invalid, it needs to be "coerce", ' '"ignore" or "raise"'
        )

    # normalize input_format ("auto" or a single string) to a tuple of formats
    input_formats = _convert_format_to_tuple(input_format)

    # convert to dask
    df = to_dask(df)

    # _format returns a 1-element list per cell; map_partitions keeps the
    # whole pipeline lazy, and meta=object declares the result dtype to dask
    df["clean_code_tup"] = df[column].map_partitions(
        lambda srs: [_format(x, input_formats, output_format, errors) for x in srs],
        meta=object,
    )

    # unpack element 0 of each tuple/list into a temporary column
    df = df.assign(
        _temp_=df["clean_code_tup"].map(itemgetter(0), meta=("_temp_", object)),
    )

    df = df.rename(columns={"_temp_": f"{column}_clean"})

    df = df.drop(columns=["clean_code_tup"])

    if inplace:
        # overwrite the original column with the cleaned values, then give
        # it the "_clean" suffix so the output naming is consistent
        df[column] = df[f"{column}_clean"]
        df = df.drop(columns=f"{column}_clean")
        df = df.rename(columns={column: f"{column}_clean"})

    # trigger the lazy computation, optionally showing a progress bar
    with ProgressBar(minimum=1, disable=not progress):
        df = df.compute()

    return df
147+
148+
149+
def validate_language(
150+
x: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
151+
column: str = "",
152+
input_format: Union[str, Tuple[str, ...]] = "auto",
153+
kb_path: str = "default",
154+
encode: Optional[str] = None,
155+
) -> Union[bool, pd.Series, pd.DataFrame]:
156+
"""
157+
Validate language type data in a DataFrame column. For each cell, return True or False.
158+
159+
Parameters
160+
----------
161+
x
162+
Language data to be validated. It could be a single string, or
163+
a pandas or Dask DataFrame, or a pandas or Dask Series.
164+
column
165+
The name of the column to be validated.
166+
If x is not a pandas or Dask DataFrame, it would be ignored.
167+
If x is a pandas or Dask DataFrame but `column` is not specified,
168+
then the whole dataframe will be validated.
169+
170+
(default: None)
171+
input_format
172+
The ISO 639 input format of the language.
173+
- 'auto': infer the input format
174+
- 'name': language name ('English')
175+
- 'alpha-2': alpha-2 code ('en')
176+
- 'alpha-3': alpha-3 code ('eng')
177+
178+
Can also be a tuple containing any combination of input formats,
179+
for example to clean a column containing name and alpha-2
180+
codes set input_format to ('name', 'alpha-2').
181+
182+
(default: 'auto')
183+
kb_path
184+
The path of user specified knowledge base.
185+
In current stage, it should be in the user's local directory
186+
following by the format we proposing.
187+
188+
(default: "default")
189+
encode
190+
The encoding of the knowledge base. It will be passed to `pd.read_csv`.
191+
192+
(default: None)
193+
"""
194+
# load knowledge base
195+
_load_kb(kb_path, encode)
196+
197+
input_formats = _convert_format_to_tuple(input_format)
198+
199+
if isinstance(x, (pd.Series, dd.Series)):
200+
return x.apply(_check_language, args=(input_formats, False))
201+
elif isinstance(x, (pd.DataFrame, dd.DataFrame)):
202+
if column != "":
203+
return x[column].apply(_check_language, args=(input_formats, False))
204+
else:
205+
return x.applymap(lambda val: _check_language(val, input_formats, False))
206+
return _check_language(x, input_formats, False)
207+
208+
209+
def _format(val: Any, input_formats: Tuple[str, ...], output_format: str, errors: str) -> Any:
    """
    Reformat a language string into the requested output format.

    Returns a 1-element list so the caller can unpack it uniformly:
    [formatted value], [np.nan] for null/failed parses, or [val] when
    errors == "ignore".
    """
    idx, status = _check_language(val, input_formats, True)

    if status == "null":
        return [np.nan]

    if status == "unknown":
        if errors == "raise":
            raise ValueError(f"unable to parse value {val}")
        return [val] if errors == "ignore" else [np.nan]

    formatted = DATA.loc[idx, output_format]
    if pd.isna(formatted):
        # the matched language has no entry for the requested output format
        if errors == "raise":
            raise ValueError(f"unable to parse value {val}")
        return [val] if errors == "ignore" else [np.nan]

    if output_format == "name":
        # language names are stored lowercased; present them in title case
        return [formatted.title()]
    return [formatted]
230+
231+
232+
def _check_language(val: Any, input_formats: Tuple[str, ...], clean: bool) -> Any:
    """
    Find the index of the given language string in the DATA dataframe.

    Parameters
    ----------
    val
        String containing the language value to be cleaned.
    input_formats
        Tuple containing potential ISO 639 input formats of the language.
    clean
        If True, a tuple (index, status) is returned. There are 3 status:
        - "null": val is a null value.
        - "unknown": val could not be parsed.
        - "success": a successful parse of the value.
        If False, the function returns True/False to be used by the validate function.
    """
    if val in NULL_VALUES:
        return (None, "null") if clean else False

    val = str(val).lower().strip()
    if not val:
        # BUGFIX: a whitespace-only value (e.g. "   ") is not in NULL_VALUES
        # but strips down to the empty string; treat it as null instead of
        # letting val[0] below raise an IndexError
        return (None, "null") if clean else False
    first_letter = val[0]

    # select possible formats from input_formats based on the value's length
    possible_formats: Tuple[str, ...] = ()
    if len(val) > 1 and "name" in input_formats:
        # it is a potentially valid language name
        possible_formats = ("name",) + possible_formats

    if len(val) == 3 and "alpha-3" in input_formats:
        # alpha-3 or name, and alpha-3 is preferred
        possible_formats = ("alpha-3",) + possible_formats
    elif len(val) == 2 and "alpha-2" in input_formats:
        # alpha-2 or name, and alpha-2 is preferred
        possible_formats = ("alpha-2",) + possible_formats

    # search the value, most specific format first
    format_dicts = {"name": NAME, "alpha-2": ALPHA2, "alpha-3": ALPHA3}
    for fmt in possible_formats:
        format_dict = format_dicts[fmt]
        inds = format_dict.get(
            first_letter
        )  # get the indices of values that start with the same letter
        if inds is None:  # no value starts with this letter
            continue
        df_temp = DATA.iloc[inds][fmt]  # extract these candidate values
        res = df_temp[df_temp.str.lower() == val]  # search the input value within them
        if len(res) != 0:
            return (res.index[0], "success") if clean else True

    return (None, "unknown") if clean else False
283+
284+
285+
def _load_kb(kb_path: str, encode: Optional[str] = None) -> Any:
    """
    Load the knowledge base from a specified path and rebuild the
    module-level first-letter lookup indices.
    """
    global DATA, NAME, ALPHA2, ALPHA3

    if kb_path == "default":
        DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, encoding="utf-8", dtype=str)
    else:
        DATA = pd.read_csv(kb_path, encoding=encode, dtype=str)
        DATA = clean_headers(DATA, case="kebab", report=False)  # to lowercase
        # a user-supplied knowledge base must expose all three ISO 639 columns
        for required in ("name", "alpha-2", "alpha-3"):
            if required not in DATA.columns:
                raise KeyError(
                    "knowledge base does not follow the format, "
                    'it needs to contain "name", "alpha-2", and "alpha-3"'
                )

    # Bucket the row indices by the first letter of each value, e.g.
    # {'a': [12, 36, 39], 'b': [15, 89], ...}, so lookups only scan
    # candidates sharing the query's first letter.
    NAME, ALPHA2, ALPHA3 = {}, {}, {}
    for fmt, bucket in (("name", NAME), ("alpha-2", ALPHA2), ("alpha-3", ALPHA3)):
        first_letters = DATA[fmt].str.lower().dropna().apply(lambda x: x[0])
        for letter, row_ids in DATA.groupby(first_letters).groups.items():
            bucket[letter] = list(row_ids)
313+
314+
315+
def _convert_format_to_tuple(input_format: Union[str, Tuple[str, ...]]) -> Tuple[str, ...]:
316+
"""
317+
Converts a string input format to a tuple of allowed input formats and
318+
raises an error if an input format is not valid.
319+
"""
320+
if isinstance(input_format, str):
321+
if input_format == "auto":
322+
return ("name", "alpha-2", "alpha-3")
323+
else:
324+
input_format = (input_format,)
325+
326+
valid_input_formats = {"auto", "name", "alpha-2", "alpha-3"}
327+
for fmt in input_format:
328+
if fmt not in valid_input_formats:
329+
raise ValueError(
330+
f'input_format {fmt} is invalid, it needs to be one of "auto", '
331+
'"name", "alpha-2" or "alpha-3"'
332+
)
333+
if "auto" in input_format:
334+
return ("name", "alpha-2", "alpha-3")
335+
336+
return input_format

0 commit comments

Comments
 (0)