-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_read_files.py
executable file
·67 lines (53 loc) · 2.23 KB
/
01_read_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import glob
import os
import pandas as pd
from googletrans import Translator
import pycountry
import numpy as np
# Module-level googletrans client, created once at import time and used by
# custom_translator() for every translation call.
translator = Translator()
def custom_translator(x_trans):
    """Translate ``x_trans`` with the module-level ``translator``, best effort.

    Args:
        x_trans: Value handed to ``translator.translate``.

    Returns:
        The ``.text`` attribute of the translation result, or ``x_trans``
        itself, unchanged, when the translation attempt raises.
    """
    try:
        return translator.translate(x_trans).text
    except Exception:
        # Deliberate best-effort fallback: a failed translation keeps the
        # original value. Catching ``Exception`` instead of using a bare
        # ``except`` lets KeyboardInterrupt / SystemExit propagate, so a long
        # row-by-row run can still be interrupted.
        return x_trans
def custom_country_finder(x):
    """Return the pycountry country names that can be found for ``x``.

    The lookup runs in two stages:

    1. Collect every entry of ``pycountry.countries`` whose ``name`` passes
       the ``name in x`` test (a substring match when ``x`` is a string).
    2. Only when stage 1 finds nothing, fall back to
       ``pycountry.countries.search_fuzzy(x)``.

    Args:
        x: Value to search. Presumably a free-text location string such as
            the ``user_location`` column -- confirm against the caller.

    Returns:
        A list of country names, or ``np.nan`` when the fuzzy search raises.
    """
    direct_hits = [
        country.name for country in pycountry.countries if country.name in x
    ]
    if direct_hits:
        return direct_hits

    try:
        fuzzy_hits = pycountry.countries.search_fuzzy(x)
    except Exception:
        # Best-effort: a failed fuzzy search yields NaN rather than aborting
        # the caller. ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit propagating.
        # NOTE(review): pycountry is expected to signal "no match" with
        # LookupError -- narrow this once confirmed for the installed version.
        # A geograpy3-based fallback was sketched here earlier but never
        # enabled.
        return np.nan
    return [country.name for country in fuzzy_hits]
def custom_reader(x_dir):
    """Load one CSV file and keep only the rows that carry a location hint.

    Args:
        x_dir: Source passed straight to ``pandas.read_csv``. Despite the
            name, callers pass the path of a single CSV file.

    Returns:
        A DataFrame with the columns ``id``, ``text``, ``geo``,
        ``coordinates``, ``place`` and ``user_location`` (all read with
        ``dtype='str'``), restricted to rows where at least one of ``geo``,
        ``coordinates``, ``place`` or ``user_location`` is not null.

    Side effects:
        Prints ``Done!`` and the shape of the filtered frame.
    """
    # Columns that must be non-null in at least one position for a row to be
    # kept.
    location_columns = ['geo', 'coordinates', 'place', 'user_location']
    # Only this subset is loaded. A wider set was listed here previously
    # (created_at, retweet_count, favorite_count, lang, user_* fields, ...)
    # but is not read.
    columns = ['id', 'text'] + location_columns

    # The encoding name used to be written with embedded quote characters
    # ("'utf-8'"); it is spelled canonically here.
    df = pd.read_csv(x_dir, usecols=columns, dtype='str', encoding='utf-8')

    has_location = df[location_columns].notna().any(axis=1)
    df = df.loc[has_location]

    print("Done!")
    print(df.shape)
    return df
# Gather the CSV files and load the first four entries that glob returns,
# stacking the per-file frames into a single DataFrame.
file_list = glob.glob(os.path.join('..', "data_twitter", "data_csv", "*.csv"))
df_a = pd.concat([custom_reader(csv_path) for csv_path in file_list[:4]])

# Optional enrichment steps, currently disabled:
# df_a['user_en'] = df_a['user_location'].apply(lambda x: custom_translator(x) if pd.notnull(x) else x)
# df_a['custom_country'] = df_a['user_location'].apply(lambda x: custom_country_finder(x) if pd.notnull(x) else x)

# Drop rows in which every location-related column is null, i.e. keep rows
# with at least one non-null value among them.
sel_columns = ['geo', 'coordinates', 'place', 'user_location']
df_a = df_a.dropna(subset=sel_columns, how='all')

# Write the combined result without the index column.
df_a.to_csv(
    os.path.join("..", "data_twitter", 'data_processed', "covid19_stream.csv"),
    index=False,
    encoding='utf-8-sig',
)