|
| 1 | +""" |
| 2 | +Clean and validate a DataFrame column containing language. |
| 3 | +""" |
| 4 | + |
| 5 | +# pylint: disable=too-many-arguments, global-statement |
| 6 | + |
| 7 | +from os import path |
| 8 | +from typing import Any, Union, Tuple, List, Optional, Dict |
| 9 | +from operator import itemgetter |
| 10 | + |
| 11 | +import dask.dataframe as dd |
| 12 | +import numpy as np |
| 13 | +import pandas as pd |
| 14 | + |
| 15 | +from ..progress_bar import ProgressBar |
| 16 | +from .utils import NULL_VALUES, to_dask |
| 17 | +from .clean_headers import clean_headers |
| 18 | + |
# Path of the ISO 639 knowledge base CSV shipped next to this module.
DEFAULT_LANGUAGE_DATA_FILE = path.join(path.split(path.abspath(__file__))[0], "language_data.csv")

# Module-level knowledge base; reloaded by _load_kb() on every clean/validate call.
DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, encoding="utf-8", dtype=str)
# First-letter lookup tables: map a lowercase initial to the DATA row indices
# whose value in that column starts with it (populated by _load_kb()).
ALPHA2: Dict[str, List[int]] = {}
ALPHA3: Dict[str, List[int]] = {}
NAME: Dict[str, List[int]] = {}
| 25 | + |
| 26 | + |
def clean_language(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    input_format: Union[str, Tuple[str, ...]] = "auto",
    output_format: str = "name",
    kb_path: str = "default",
    encode: Optional[str] = None,
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean language type data in a DataFrame column.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    column
        The name of the column containing data of language type.
    input_format
        The ISO 639 input format of the language.
        - 'auto': infer the input format
        - 'name': language name ('English')
        - 'alpha-2': alpha-2 code ('en')
        - 'alpha-3': alpha-3 code ('eng')

        Can also be a tuple containing any combination of input formats,
        for example to clean a column containing name and alpha-2
        codes set input_format to ('name', 'alpha-2').

        (default: 'auto')
    output_format
        The desired ISO 639 format of the language.
        - 'name': language name ('English')
        - 'alpha-2': alpha-2 code ('en')
        - 'alpha-3': alpha-3 code ('eng')

        (default: 'name')
    kb_path
        The path of a user-specified knowledge base, or 'default' for the
        knowledge base bundled with this module.

        (default: 'default')
    encode
        The encoding of the knowledge base. It will be passed to `pd.read_csv`.

        (default: None)
    inplace
        If True, delete the column containing the data that was cleaned.
        Otherwise, keep the original column.

        (default: False)
    errors
        How to handle parsing errors.
        - 'coerce': invalid parsing will be set to NaN.
        - 'ignore': invalid parsing will return the input.
        - 'raise': invalid parsing will raise an exception.

        (default: 'coerce')
    progress
        If True, display a progress bar.

        (default: True)

    Examples
    --------
    Clean a column of language data.

    >>> df = pd.DataFrame({'language': ['eng', 'zh', 'japanese']})
    >>> clean_language(df, 'language')
       language language_clean
    0       eng        English
    1        zh        Chinese
    2  japanese       Japanese
    """
    # refresh the module-level knowledge base before any parsing happens
    _load_kb(kb_path, encode)

    if output_format not in ("name", "alpha-2", "alpha-3"):
        raise ValueError(
            f'output_format {output_format} is invalid, it needs to be "name", '
            '"alpha-2" or "alpha-3"'
        )

    if errors not in ("coerce", "ignore", "raise"):
        raise ValueError(
            f'errors {errors} is invalid, it needs to be "coerce", ' '"ignore" or "raise"'
        )

    input_formats = _convert_format_to_tuple(input_format)

    # processing is done lazily on a dask dataframe and materialized at the end
    df = to_dask(df)

    # each cell becomes a one-element list holding its cleaned value
    df["clean_code_tup"] = df[column].map_partitions(
        lambda srs: [_format(cell, input_formats, output_format, errors) for cell in srs],
        meta=object,
    )

    # unpack the result into a "<column>_clean" column and drop the scratch one
    df = (
        df.assign(_temp_=df["clean_code_tup"].map(itemgetter(0), meta=("_temp_", object)))
        .rename(columns={"_temp_": f"{column}_clean"})
        .drop(columns=["clean_code_tup"])
    )

    if inplace:
        # replace the original column with the cleaned values, keeping the
        # "_clean" suffix on the surviving column's name
        df[column] = df[f"{column}_clean"]
        df = df.drop(columns=f"{column}_clean").rename(columns={column: f"{column}_clean"})

    with ProgressBar(minimum=1, disable=not progress):
        df = df.compute()

    return df
| 147 | + |
| 148 | + |
def validate_language(
    x: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    column: str = "",
    input_format: Union[str, Tuple[str, ...]] = "auto",
    kb_path: str = "default",
    encode: Optional[str] = None,
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Validate language type data in a DataFrame column. For each cell, return True or False.

    Parameters
    ----------
    x
        Language data to be validated. It could be a single string, or
        a pandas or Dask DataFrame, or a pandas or Dask Series.
    column
        The name of the column to be validated.
        If x is not a pandas or Dask DataFrame, it would be ignored.
        If x is a pandas or Dask DataFrame but `column` is not specified,
        then the whole dataframe will be validated.

        (default: "")
    input_format
        The ISO 639 input format of the language.
        - 'auto': infer the input format
        - 'name': language name ('English')
        - 'alpha-2': alpha-2 code ('en')
        - 'alpha-3': alpha-3 code ('eng')

        Can also be a tuple containing any combination of input formats,
        for example to clean a column containing name and alpha-2
        codes set input_format to ('name', 'alpha-2').

        (default: 'auto')
    kb_path
        The path of a user-specified knowledge base, or 'default' for the
        knowledge base bundled with this module.

        (default: "default")
    encode
        The encoding of the knowledge base. It will be passed to `pd.read_csv`.

        (default: None)
    """
    # refresh the module-level knowledge base before validating anything
    _load_kb(kb_path, encode)

    input_formats = _convert_format_to_tuple(input_format)

    if isinstance(x, (pd.DataFrame, dd.DataFrame)):
        if column != "":
            # validate a single named column
            return x[column].apply(_check_language, args=(input_formats, False))
        # no column given: validate every cell of the dataframe
        return x.applymap(lambda cell: _check_language(cell, input_formats, False))

    if isinstance(x, (pd.Series, dd.Series)):
        return x.apply(_check_language, args=(input_formats, False))

    # scalar input: return a single boolean
    return _check_language(x, input_formats, False)
| 207 | + |
| 208 | + |
def _format(val: Any, input_formats: Tuple[str, ...], output_format: str, errors: str) -> Any:
    """
    Reformat a language string with proper output format.

    Returns a one-element list so the caller can unpack it uniformly.
    """
    result_index, status = _check_language(val, input_formats, True)

    if status == "null":
        return [np.nan]

    if status == "unknown":
        if errors == "raise":
            raise ValueError(f"unable to parse value {val}")
        return [val] if errors == "ignore" else [np.nan]

    converted = DATA.loc[result_index, output_format]
    if pd.isna(converted):
        # the matched language has no entry for the requested output format
        if errors == "raise":
            raise ValueError(f"unable to parse value {val}")
        return [val] if errors == "ignore" else [np.nan]

    # language names are title-cased; alpha codes are returned as stored
    return [converted.title()] if output_format == "name" else [converted]
| 230 | + |
| 231 | + |
def _check_language(val: Any, input_formats: Tuple[str, ...], clean: bool) -> Any:
    """
    Find the index of the given language string in the DATA dataframe.

    Parameters
    ----------
    val
        String containing the language value to be cleaned.
    input_formats
        Tuple containing potential ISO 639 input formats of the language.
    clean
        If True, a tuple (index, status) is returned. There are 3 status:
        - "null": val is a null value.
        - "unknown": val could not be parsed.
        - "success": a successful parse of the value.
        If False, the function returns True/False to be used by the validate function.
    """
    if val in NULL_VALUES:
        return (None, "null") if clean else False

    val = str(val).lower().strip()

    # BUGFIX: a whitespace-only value is not in NULL_VALUES but strips to "",
    # which previously raised IndexError on val[0] below; treat it as null,
    # consistent with how "" itself is handled.
    if not val:
        return (None, "null") if clean else False

    first_letter = val[0]

    # select possible formats from input_formats;
    possible_formats: Tuple[str, ...] = ()
    if len(val) > 1 and "name" in input_formats:
        # it is a potential valid language name
        possible_formats = ("name",) + possible_formats

    if len(val) == 3 and "alpha-3" in input_formats:
        # alpha-3 or name, and alpha-3 is preferred
        possible_formats = ("alpha-3",) + possible_formats
    elif len(val) == 2 and "alpha-2" in input_formats:
        # alpha-2 or name, and alpha-2 is preferred
        possible_formats = ("alpha-2",) + possible_formats

    # search the value, narrowing candidates by first letter for speed
    format_dicts = {"name": NAME, "alpha-2": ALPHA2, "alpha-3": ALPHA3}
    for fmt in possible_formats:
        format_dict = format_dicts[fmt]
        inds = format_dict.get(
            first_letter
        )  # get the indices of value that starts with the same letter
        if inds is None:  # no value starts with this letter
            continue
        df_temp = DATA.iloc[inds][fmt]  # extract these values
        res = df_temp[df_temp.str.lower() == val]  # search the input value within them
        if len(res) != 0:
            return (res.index[0], "success") if clean else True

    return (None, "unknown") if clean else False
| 283 | + |
| 284 | + |
def _load_kb(kb_path: str, encode: Optional[str] = None) -> Any:
    """
    Load a knowledge base into the module-level lookup tables.
    """
    global DATA, NAME, ALPHA2, ALPHA3

    if kb_path == "default":
        DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, encoding="utf-8", dtype=str)
    else:
        DATA = pd.read_csv(kb_path, encoding=encode, dtype=str)
        DATA = clean_headers(DATA, case="kebab", report=False)  # to lowercase
        # a user-supplied knowledge base must expose all three format columns
        for fmt in ("name", "alpha-2", "alpha-3"):
            if fmt not in DATA.columns:
                raise KeyError(
                    "knowledge base does not follow the format, "
                    'it needs to contain "name", "alpha-2", and "alpha-3"'
                )

    # bucket the row indices of each column by the first letter of its values,
    # e.g. {'a': [12, 36, 39], 'b': [15, 89], ...}
    NAME, ALPHA2, ALPHA3 = {}, {}, {}
    for fmt, letter_index in (("name", NAME), ("alpha-2", ALPHA2), ("alpha-3", ALPHA3)):
        initials = DATA[fmt].str.lower().dropna().apply(lambda value: value[0])
        for letter, rows in DATA.groupby(initials).groups.items():
            letter_index[letter] = list(rows)
| 313 | + |
| 314 | + |
| 315 | +def _convert_format_to_tuple(input_format: Union[str, Tuple[str, ...]]) -> Tuple[str, ...]: |
| 316 | + """ |
| 317 | + Converts a string input format to a tuple of allowed input formats and |
| 318 | + raises an error if an input format is not valid. |
| 319 | + """ |
| 320 | + if isinstance(input_format, str): |
| 321 | + if input_format == "auto": |
| 322 | + return ("name", "alpha-2", "alpha-3") |
| 323 | + else: |
| 324 | + input_format = (input_format,) |
| 325 | + |
| 326 | + valid_input_formats = {"auto", "name", "alpha-2", "alpha-3"} |
| 327 | + for fmt in input_format: |
| 328 | + if fmt not in valid_input_formats: |
| 329 | + raise ValueError( |
| 330 | + f'input_format {fmt} is invalid, it needs to be one of "auto", ' |
| 331 | + '"name", "alpha-2" or "alpha-3"' |
| 332 | + ) |
| 333 | + if "auto" in input_format: |
| 334 | + return ("name", "alpha-2", "alpha-3") |
| 335 | + |
| 336 | + return input_format |
0 commit comments