-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_titles.py
50 lines (44 loc) · 1.46 KB
/
add_titles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import csv
import pandas as pd
import wn
from joblib import Parallel, delayed
from tqdm import tqdm
from utils import merge_sv, title_by_url
import os
import shutil
# Relation labels compared against the values of the "rel" column.
# Presumably: links to Ukrainian Wikipedia, English Wikipedia and the
# Interlingual Index respectively -- confirm against the data source.
UK = "pwn31_to_uk_wiki"
EN = "pwn31_to_en_wiki"
ILI = "pwn31_to_ili"
def write_one(filename, key, value):
    """
    Write one row, extended with a title, to its own ';'-delimited CSV file.

    If the row contains the UK or EN relation label, the title is obtained
    with ``title_by_url(value[1])``; otherwise the first lemma of
    ``wn.synset(value[0])`` is used. The title is appended as the last field.

    :param filename: string, path of the CSV file to create (overwritten)
    :param key: int, key of the row; kept for the caller's signature, not
        used inside the function
    :param value: list, the row values; value[0] and value[1] are read
    :return: None
    """
    # newline="" is required by the csv module for files passed to a writer;
    # without it every row ends in "\r\r\n" on Windows.
    # NOTE(review): the encoding is still the platform default -- confirm it
    # matches what the merge step expects before pinning it explicitly.
    with open(filename, "w", newline="") as file_out:
        writer = csv.writer(file_out, delimiter=";")
        if (UK in value) or (EN in value):
            title = title_by_url(value[1])
        else:
            title = wn.synset(value[0]).lemmas()[0]
        writer.writerow(value + [title])
if __name__ == "__main__":
    # Read the main dataframe and keep only the relations of interest.
    pwn_friends = pd.read_csv("./data/pwn_friends.csv")
    df = pwn_friends[pwn_friends["rel"].isin([UK, EN, ILI])].sort_values(
        by=["rel"], ascending=[False]
    )
    # Descending order on "rel" puts UK before ILI before EN, so keeping the
    # first row per id_from prefers the relations in that order.
    df = df.drop_duplicates(subset="id_from").reset_index(drop=True)
    # {row index: [row values in column order]}
    dct = df.set_index(df.index).T.to_dict("list")

    # Directory for the per-row titled csv files. A previous run that failed
    # before the final clean-up leaves it behind; remove it first so that
    # os.mkdir does not raise FileExistsError and no stale files get merged.
    path_dir = os.getcwd() + "/titled/"
    if os.path.isdir(path_dir):
        shutil.rmtree(path_dir)
    os.mkdir(path_dir)

    # Parallel loop: one output file per row; the results (all None) are
    # not needed.
    Parallel(n_jobs=-1)(
        delayed(write_one)(f"{path_dir}{k}.csv", k, v)
        for k, v in tqdm(dct.items(), total=len(dct))
    )

    # Merge the per-row files into a single file and clean up.
    header = ["id_from", "id_to", "rel", "title"]
    merge_sv("titled", "data/titled_pwn", header, ";", "csv")
    shutil.rmtree(path_dir)