Skip to content

Commit 46e1365

Browse files
committed
test(clean) : add test for clean_language
1 parent 33923fc commit 46e1365

File tree

2 files changed

+381
-0
lines changed

2 files changed

+381
-0
lines changed
Lines changed: 377 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,377 @@
1+
"""
2+
Tests for the functions `clean_language()` and `validate_language()`.
3+
"""
4+
5+
import logging
6+
7+
from os import path
8+
9+
import numpy as np
10+
import pandas as pd
11+
import pytest
12+
13+
from ...clean import clean_language, validate_language
14+
15+
LOGGER = logging.getLogger(__name__)

# Absolute path of the alternative knowledge-base CSV, resolved relative to the
# directory containing this test module so it does not depend on the current
# working directory. It is passed as ``kb_path`` by the knowledge-base tests.
ALTERNATIVE_LANGUAGE_DATA_FILE = path.join(
    path.dirname(path.abspath(__file__)), "test_language_data.csv"
)
20+
21+
22+
@pytest.fixture(scope="module")  # type: ignore
def df_languages() -> pd.DataFrame:
    """Return a one-column frame of messy language values used across this module.

    The ``messy_language`` column mixes language names, two- and three-letter
    codes, padded and mis-cased strings, non-language strings, an integer and a
    missing value.
    """
    messy_values = [
        "eng",  # three-letter code
        "zh",  # two-letter code
        "Japanese",  # full name
        "english",  # full name, lower case
        "Zh",  # two-letter code, mixed case
        "tp",  # two letters, expected to be rejected
        "233",  # numeric string
        304,  # integer
        "dd eng",  # code preceded by extra text
        " tr ",  # two-letter code padded with whitespace
        "hello",  # ordinary word
        np.nan,  # missing value
        "NULL",  # null-like string
    ]
    return pd.DataFrame({"messy_language": messy_values})
44+
45+
46+
@pytest.fixture(scope="module")  # type: ignore
def df_multicols_languages() -> pd.DataFrame:
    """Return a two-column frame of messy language values.

    ``some_messy_language`` holds names and codes; ``other_messy_language``
    holds mostly non-language values plus one whitespace-padded code.
    """
    some_values = ["eng", "zh", "Japanese", "english", "Zh", "tp"]
    other_values = ["233", 304, " tr ", "hello", np.nan, "NULL"]
    return pd.DataFrame(
        {
            "some_messy_language": some_values,
            "other_messy_language": other_values,
        }
    )
69+
70+
71+
def test_clean_default(df_languages: pd.DataFrame) -> None:
    """Check ``clean_language`` with default arguments.

    The result is expected to be the input frame plus a
    ``messy_language_clean`` column holding the language name for recognised
    values and NaN for everything else.
    """
    cleaned = clean_language(df_languages, "messy_language")

    expected = df_languages.assign(
        messy_language_clean=[
            "English",  # "eng"
            "Chinese",  # "zh"
            "Japanese",  # "Japanese"
            "English",  # "english"
            "Chinese",  # "Zh"
            np.nan,  # "tp"
            np.nan,  # "233"
            np.nan,  # 304
            np.nan,  # "dd eng"
            "Turkish",  # " tr "
            np.nan,  # "hello"
            np.nan,  # NaN
            np.nan,  # "NULL"
        ]
    )

    assert expected.equals(cleaned)
91+
92+
93+
def test_clean_input_formats(df_languages: pd.DataFrame) -> None:
    """Check ``clean_language`` with each single ``input_format`` value.

    For every format, only the values written in that format are expected to
    be converted to a language name; the rest become NaN.
    """
    formats = ("name", "alpha-2", "alpha-3")
    cleaned = {
        fmt: clean_language(df_languages, "messy_language", input_format=fmt)
        for fmt in formats
    }

    expected = {
        "name": df_languages.assign(
            messy_language_clean=[
                np.nan,  # "eng"
                np.nan,  # "zh"
                "Japanese",  # "Japanese"
                "English",  # "english"
                np.nan,  # "Zh"
                np.nan,  # "tp"
                np.nan,  # "233"
                np.nan,  # 304
                np.nan,  # "dd eng"
                np.nan,  # " tr "
                np.nan,  # "hello"
                np.nan,  # NaN
                np.nan,  # "NULL"
            ]
        ),
        "alpha-2": df_languages.assign(
            messy_language_clean=[
                np.nan,  # "eng"
                "Chinese",  # "zh"
                np.nan,  # "Japanese"
                np.nan,  # "english"
                "Chinese",  # "Zh"
                np.nan,  # "tp"
                np.nan,  # "233"
                np.nan,  # 304
                np.nan,  # "dd eng"
                "Turkish",  # " tr "
                np.nan,  # "hello"
                np.nan,  # NaN
                np.nan,  # "NULL"
            ]
        ),
        "alpha-3": df_languages.assign(
            messy_language_clean=[
                "English",  # "eng"
                np.nan,  # "zh"
                np.nan,  # "Japanese"
                np.nan,  # "english"
                np.nan,  # "Zh"
                np.nan,  # "tp"
                np.nan,  # "233"
                np.nan,  # 304
                np.nan,  # "dd eng"
                np.nan,  # " tr "
                np.nan,  # "hello"
                np.nan,  # NaN
                np.nan,  # "NULL"
            ]
        ),
    }

    for fmt in formats:
        assert cleaned[fmt].equals(expected[fmt])
150+
151+
152+
def test_clean_input_format_tuple(df_languages: pd.DataFrame) -> None:
    """Check ``clean_language`` when ``input_format`` is a tuple of formats.

    With ``("name", "alpha-3")`` only names and three-letter codes are
    expected to be recognised; two-letter codes become NaN.
    """
    cleaned = clean_language(
        df_languages, "messy_language", input_format=("name", "alpha-3")
    )

    expected = df_languages.assign(
        messy_language_clean=[
            "English",  # "eng"
            np.nan,  # "zh"
            "Japanese",  # "Japanese"
            "English",  # "english"
            np.nan,  # "Zh"
            np.nan,  # "tp"
            np.nan,  # "233"
            np.nan,  # 304
            np.nan,  # "dd eng"
            np.nan,  # " tr "
            np.nan,  # "hello"
            np.nan,  # NaN
            np.nan,  # "NULL"
        ]
    )

    assert expected.equals(cleaned)
172+
173+
174+
def test_clean_output_format(df_languages: pd.DataFrame) -> None:
    """Check ``clean_language`` with each ``output_format`` value.

    The same rows are expected to be recognised for every format; only the
    representation written to ``messy_language_clean`` changes.
    """
    formats = ("name", "alpha-2", "alpha-3")
    cleaned = {
        fmt: clean_language(df_languages, "messy_language", output_format=fmt)
        for fmt in formats
    }

    expected = {
        "name": df_languages.assign(
            messy_language_clean=[
                "English",  # "eng"
                "Chinese",  # "zh"
                "Japanese",  # "Japanese"
                "English",  # "english"
                "Chinese",  # "Zh"
                np.nan,  # "tp"
                np.nan,  # "233"
                np.nan,  # 304
                np.nan,  # "dd eng"
                "Turkish",  # " tr "
                np.nan,  # "hello"
                np.nan,  # NaN
                np.nan,  # "NULL"
            ]
        ),
        "alpha-2": df_languages.assign(
            messy_language_clean=[
                "en",  # "eng"
                "zh",  # "zh"
                "ja",  # "Japanese"
                "en",  # "english"
                "zh",  # "Zh"
                np.nan,  # "tp"
                np.nan,  # "233"
                np.nan,  # 304
                np.nan,  # "dd eng"
                "tr",  # " tr "
                np.nan,  # "hello"
                np.nan,  # NaN
                np.nan,  # "NULL"
            ]
        ),
        "alpha-3": df_languages.assign(
            messy_language_clean=[
                "eng",  # "eng"
                "zho",  # "zh"
                "jpn",  # "Japanese"
                "eng",  # "english"
                "zho",  # "Zh"
                np.nan,  # "tp"
                np.nan,  # "233"
                np.nan,  # 304
                np.nan,  # "dd eng"
                "tur",  # " tr "
                np.nan,  # "hello"
                np.nan,  # NaN
                np.nan,  # "NULL"
            ]
        ),
    }

    for fmt in formats:
        assert cleaned[fmt].equals(expected[fmt])
231+
232+
233+
def test_clean_kb(df_languages: pd.DataFrame) -> None:
    """Check ``clean_language`` with an alternative knowledge base via ``kb_path``.

    Unlike the default run, " tr " is expected to become NaN here, presumably
    because the alternative data file lists only English, Chinese and Japanese.
    """
    cleaned = clean_language(
        df_languages, "messy_language", kb_path=ALTERNATIVE_LANGUAGE_DATA_FILE
    )

    expected = df_languages.assign(
        messy_language_clean=[
            "English",  # "eng"
            "Chinese",  # "zh"
            "Japanese",  # "Japanese"
            "English",  # "english"
            "Chinese",  # "Zh"
            np.nan,  # "tp"
            np.nan,  # "233"
            np.nan,  # 304
            np.nan,  # "dd eng"
            np.nan,  # " tr "
            np.nan,  # "hello"
            np.nan,  # NaN
            np.nan,  # "NULL"
        ]
    )

    assert expected.equals(cleaned)
255+
256+
257+
def test_validate_value() -> None:
    """Check ``validate_language`` on individual string inputs."""
    # (input value, expected validation result), checked in this order.
    cases = (
        ("english", True),
        ("zh", True),
        (" ZH ", True),
        ("tp", False),
        ("eng", True),
        ("hello", False),
        ("233", False),
        ("dd eng", False),
        ("", False),
    )
    for value, expected in cases:
        assert validate_language(value) == expected
267+
268+
269+
def test_validate_series(df_languages: pd.DataFrame) -> None:
    """Check ``validate_language`` on a Series with default arguments.

    The result is expected to be a boolean Series named after the input
    column, with one flag per input value.
    """
    validated = validate_language(df_languages["messy_language"])

    expected_flags = [
        True,  # "eng"
        True,  # "zh"
        True,  # "Japanese"
        True,  # "english"
        True,  # "Zh"
        False,  # "tp"
        False,  # "233"
        False,  # 304
        False,  # "dd eng"
        True,  # " tr "
        False,  # "hello"
        False,  # NaN
        False,  # "NULL"
    ]
    expected = pd.Series(expected_flags, name="messy_language")

    assert expected.equals(validated)
290+
291+
292+
def test_validate_input_format(df_languages: pd.DataFrame) -> None:
    """Check ``validate_language`` on a Series with ``input_format="alpha-2"``.

    Only the two-letter codes are expected to validate; names and
    three-letter codes are expected to be flagged False.
    """
    validated = validate_language(df_languages["messy_language"], input_format="alpha-2")

    expected_flags = [
        False,  # "eng"
        True,  # "zh"
        False,  # "Japanese"
        False,  # "english"
        True,  # "Zh"
        False,  # "tp"
        False,  # "233"
        False,  # 304
        False,  # "dd eng"
        True,  # " tr "
        False,  # "hello"
        False,  # NaN
        False,  # "NULL"
    ]
    expected = pd.Series(expected_flags, name="messy_language")

    assert expected.equals(validated)
313+
314+
315+
def test_validate_dataframe_col(df_multicols_languages: pd.DataFrame) -> None:
    """Check ``validate_language`` on one named column of a DataFrame.

    The result is expected to be a boolean Series named after that column.
    """
    validated = validate_language(df_multicols_languages, "some_messy_language")

    expected_flags = [
        True,  # "eng"
        True,  # "zh"
        True,  # "Japanese"
        True,  # "english"
        True,  # "Zh"
        False,  # "tp"
    ]
    expected = pd.Series(expected_flags, name="some_messy_language")

    assert expected.equals(validated)
329+
330+
331+
def test_validate_dataframe_all(df_multicols_languages: pd.DataFrame) -> None:
    """Check ``validate_language`` on a whole DataFrame with no column given.

    The result is expected to be a boolean frame with the same two columns,
    one flag per input cell.
    """
    validated = validate_language(df_multicols_languages)

    expected = pd.DataFrame(
        {
            "some_messy_language": [
                True,  # "eng"
                True,  # "zh"
                True,  # "Japanese"
                True,  # "english"
                True,  # "Zh"
                False,  # "tp"
            ],
            "other_messy_language": [
                False,  # "233"
                False,  # 304
                True,  # " tr "
                False,  # "hello"
                False,  # NaN
                False,  # "NULL"
            ],
        }
    )

    assert expected.equals(validated)
353+
354+
355+
def test_validate_kb(df_languages: pd.DataFrame) -> None:
    """Check ``validate_language`` with an alternative knowledge base via ``kb_path``.

    Unlike the default run, " tr " is expected to be flagged False here,
    presumably because the alternative data file lists only English, Chinese
    and Japanese.
    """
    validated = validate_language(
        df_languages["messy_language"], kb_path=ALTERNATIVE_LANGUAGE_DATA_FILE
    )

    expected_flags = [
        True,  # "eng"
        True,  # "zh"
        True,  # "Japanese"
        True,  # "english"
        True,  # "Zh"
        False,  # "tp"
        False,  # "233"
        False,  # 304
        False,  # "dd eng"
        False,  # " tr "
        False,  # "hello"
        False,  # NaN
        False,  # "NULL"
    ]
    expected = pd.Series(expected_flags, name="messy_language")

    assert expected.equals(validated)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
name,alpha-3,alpha-2
2+
English,eng,en
3+
Chinese,zho,zh
4+
Japanese,jpn,ja

0 commit comments

Comments
 (0)