-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdata_parser.py
More file actions
211 lines (156 loc) · 6.9 KB
/
Copy pathdata_parser.py
File metadata and controls
211 lines (156 loc) · 6.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
from bs4 import BeautifulSoup
import pandas as pd
import glob
import plotly.express as px
import string
from pos_categories import pos_categories, pos_labels, pos_dittos
from attribute_categories import rank_categories, relationship_categories, relationship_labels
class _LazyTable:
    """Non-data descriptor that reads a tab-separated metadata table on first use.

    The table is read with ``sep='\\t'`` and ``encoding='iso-8859-1'``, indexed
    by ``index_col`` and cached on the descriptor, so importing the module does
    no file I/O. Because only ``__get__`` is defined, assigning the attribute
    on an instance simply overrides the lazily loaded table for that instance.
    """

    def __init__(self, path, index_col):
        self.path = path
        self.index_col = index_col
        self._table = None

    def __get__(self, instance, owner=None):
        # Works for both ``DataParser.attr`` and ``DataParser().attr`` access.
        if self._table is None:
            table = pd.read_csv(self.path, sep='\t', encoding='iso-8859-1')
            self._table = table.set_index(self.index_col)
        return self._table


class DataParser():
    """Builds one DataFrame of POS-tagged letter words and derives option lists from it.

    The combined table is cached as a CSV at ``path_to_csv``; when that file
    cannot be read, the table is rebuilt from the letter files and the cache
    is written again.
    """

    # Combined word/tag table; set per instance in __init__.
    df = None
    # Metadata tables, read from disk the first time they are accessed.
    db_person = _LazyTable('TCEECE/metadata/database-person.txt', 'PersonCode')
    db_letter = _LazyTable('TCEECE/metadata//database-letter.txt', 'LetterID')
    path_to_csv = 'TCEECE/data.csv'

    def __init__(self):
        """Load the cached CSV, or rebuild it from the letters when it is unreadable."""
        try:
            self.df = pd.read_csv(self.path_to_csv, index_col=False)
        except (OSError, ValueError):
            # Missing/unreadable file (OSError) or empty/malformed CSV (pandas
            # raises ValueError subclasses): regenerate and store the cache.
            self.df = self.letters_to_df()
            self.df.to_csv(self.path_to_csv, index=False)
        # Delete rows with missing data
        self.df = self.df.dropna(axis='index')
        self.pos_categories = pos_categories
        self.rank_categories = rank_categories
        self.pos_labels = pos_labels
        self.pos_dittos = pos_dittos
        self.relationship_categories = relationship_categories
        self.relationship_labels = relationship_labels

    # Transforms xml-file into a BeautifulSoup-object
    def read_tei(self, tei_file):
        """Parse the file at ``tei_file`` with the 'lxml' parser and return the soup.

        Errors from ``open`` or ``BeautifulSoup`` propagate unchanged.
        """
        with open(tei_file, 'r') as tei:
            return BeautifulSoup(tei, 'lxml')

    # Creates a Pandas dataframe from a letter specified by the path-argument
    # with letter id, words and corresponding POS-tags as the columns
    def parse_letter(self, path):
        """Return one row per tagged word of the letter stored at ``path``.

        Columns: ID, Words, Tags, Year, Sender, SenderRank, SenderSex, RelCode
        and WordCount. Everything except Words and Tags is looked up in the
        metadata tables and repeated on every row.
        """
        # Creates a BeautifulSoup-object
        soup = self.read_tei(path)
        words = []
        pos = []
        # The letter text lives in the p-tags; each whitespace-separated item
        # has the form word_TAG.
        for paragraph in soup.find_all('p'):
            for item in paragraph.text.split():
                word, _, tag = item.partition("_")
                # Skip items without a tag and items whose tag starts with
                # a punctuation character.
                if tag == '' or tag[0] in string.punctuation:
                    continue
                words.append(word)
                pos.append(tag)
        # Extracts the id of the letter from the TEI-tag
        letter_id = soup.tei.attrs['xml:id']
        letters = self.db_letter
        sender = letters.loc[letter_id, 'Sender']
        count = len(pos)
        # Combines the lists into a dict. The metadata is repeated for each word
        data = {
            'ID': [letter_id] * count,
            'Words': words,
            'Tags': pos,
            'Year': [letters.loc[letter_id, 'Year']] * count,
            'Sender': [sender] * count,
            'SenderRank': [letters.loc[letter_id, 'SenderRank']] * count,
            'SenderSex': [self.db_person.loc[sender, 'Sex']] * count,
            'RelCode': [letters.loc[letter_id, 'RelCode']] * count,
            'WordCount': [letters.loc[letter_id, 'WordCount']] * count,
        }
        # Creates a Pandas Dataframe from the dict
        return pd.DataFrame(data)

    def letters_to_df(self, path='TCEECE/tceece-letters-c7'):
        """Parse every ``*.txt`` letter in ``path`` and concatenate the results.

        Args:
            path: Folder that contains the letter files.

        Raises:
            ValueError: If the folder contains no ``*.txt`` files.
        """
        # glob returns files in arbitrary order; sort so the result is reproducible.
        all_files = sorted(glob.glob(path + "/*.txt"))
        if not all_files:
            raise ValueError('No letter files (*.txt) found in {!r}'.format(path))
        # Creates a separate dataframe from each letter and combines them
        frames = [self.parse_letter(filename) for filename in all_files]
        return pd.concat(frames, axis=0, ignore_index=True)

    def get_pos_list(self):
        """Return label/value option dicts for the distinct values of the Tags column."""
        return self.list_to_dash_option_dict(set(self.df['Tags']))

    def get_word_list(self):
        """Return label/value option dicts for the distinct lower-cased Words values."""
        return self.list_to_dash_option_dict(set(self.df['Words'].str.lower()))

    def get_rank(self):
        """Return the set of SenderRank values and the matching option dicts."""
        rank_set = set(self.df['SenderRank'])
        return rank_set, self.list_to_dash_option_dict(rank_set)

    def get_relationship(self):
        """Return the set of RelCode values and the matching option dicts."""
        rel_set = set(self.df['RelCode'])
        return rel_set, self.list_to_dash_option_dict(rel_set)

    def get_years(self):
        """Return the set of distinct values of the Year column."""
        return set(self.df['Year'])

    def list_to_dash_option_dict(self, l):
        """Return ``{'label': item, 'value': item}`` for every item of ``l``."""
        return [{'label': item, 'value': item} for item in l]

    def dict_to_dash_options_with_hover(self, d):
        """Return one option per key of ``d``; 'title' is its values joined by ', '."""
        return [{'label': k, 'value': k, 'title': ', '.join(v)} for k, v in d.items()]

    def pos_options_with_hover(self, custom, main):
        """Return options for the tags of category ``main``.

        The category is looked up among the built-in and ``custom`` POS
        categories. Each 'title' is the tag's label, a newline, and the tag's
        ditto tags joined by ', '.
        """
        tags = self.get_pos_categories(custom)[main]
        return [
            {
                'label': tag,
                'value': tag,
                'title': self.pos_labels[tag] + '\n' + ', '.join(self.pos_dittos[tag]),
            }
            for tag in tags
        ]

    @staticmethod
    def _merge_categories(base, custom):
        """Return a new dict of ``base`` updated with ``custom``.

        ``base`` itself is returned when the merge is not possible, e.g.
        when ``custom`` is None.
        """
        try:
            merged = dict(base)
            merged.update(custom)
        except (TypeError, ValueError):
            # dict.update rejects non-iterables and malformed pair sequences.
            return base
        return merged

    def get_pos_categories(self, custom):
        """Return the POS categories extended/overridden by ``custom``."""
        return self._merge_categories(self.pos_categories, custom)

    def get_rel_categories(self, custom):
        """Return the relationship categories extended/overridden by ``custom``."""
        return self._merge_categories(self.relationship_categories, custom)

    def get_rank_categories(self, custom):
        """Return the rank categories extended/overridden by ``custom``."""
        return self._merge_categories(self.rank_categories, custom)

    def include_ditto_tags_to_pos_list(self, pos_list):
        """Return the concatenated ``pos_dittos`` entries of every tag in ``pos_list``."""
        final_list = []
        for tag in pos_list:
            final_list.extend(self.pos_dittos[tag])
        return final_list

    def get_name(self, ids):
        """Join FirstName and LastName from the person table onto ``ids``.

        ``ids`` is converted with ``to_frame()`` and joined on its 'Sender'
        column; the result gets a fresh 0..n-1 index.
        """
        senders = ids.to_frame()
        person_names = self.db_person[['FirstName', 'LastName']]
        return senders.join(person_names, on='Sender').reset_index(drop=True)