language-analysis/data_parser.py at main · DSP2021-LanguageAnalysis/language-analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
from bs4 import BeautifulSoup
import pandas as pd
import glob
import plotly.express as px
import string
from pos_categories import pos_categories, pos_labels, pos_dittos
from attribute_categories import rank_categories, relationship_categories, relationship_labels

class DataParser():
    """Build and query a word-level table of the TCEECE letters.

    On construction the cached table at ``path_to_csv`` is read; if that
    fails, the table is rebuilt from the letter files and the metadata tables
    and written back to ``path_to_csv``. Rows with missing values are then
    dropped. The remaining methods derive option lists (lists of
    ``label``/``value`` dicts) and category lookups from that table.

    The metadata tables are read on first use instead of while the class body
    is executed, so importing this module does not touch the file system.
    """

    # Word-level table (see parse_letter for the columns); set by __init__.
    df = None
    # Person metadata indexed by PersonCode; loaded on first use.
    db_person = None
    # Letter metadata indexed by LetterID; loaded on first use.
    db_letter = None
    path_to_csv = 'TCEECE/data.csv'
    path_to_person_db = 'TCEECE/metadata/database-person.txt'
    path_to_letter_db = 'TCEECE/metadata//database-letter.txt'
    # Folder where the letters are located (change to the correct location)
    path_to_letters = 'TCEECE/tceece-letters-c7'

    def __init__(self):
        """Load the word table (from the cache if possible) and the category maps."""
        # get_name() needs the person table even when the cache is used.
        self._person_table()
        try:
            self.df = pd.read_csv(self.path_to_csv, index_col=False)
        except (OSError, ValueError):
            # Missing, unreadable, empty or malformed cache (pandas' parser
            # errors derive from ValueError): rebuild it from the letters.
            self.df = self.letters_to_df()
            self.df.to_csv(self.path_to_csv, index=False)
        # Delete rows with missing data
        self.df = self.df.dropna(axis='index')
        self.pos_categories = pos_categories
        self.rank_categories = rank_categories
        self.pos_labels = pos_labels
        self.pos_dittos = pos_dittos
        self.relationship_categories = relationship_categories
        self.relationship_labels = relationship_labels

    @staticmethod
    def _read_metadata(path, index_column):
        """Read a tab-separated, iso-8859-1 encoded table indexed by *index_column*."""
        table = pd.read_csv(path, sep='\t', encoding='iso-8859-1')
        return table.set_index(index_column)

    def _person_table(self):
        """Return the person metadata, reading it from disk on first use."""
        if self.db_person is None:
            self.db_person = self._read_metadata(self.path_to_person_db, 'PersonCode')
        return self.db_person

    def _letter_table(self):
        """Return the letter metadata, reading it from disk on first use."""
        if self.db_letter is None:
            self.db_letter = self._read_metadata(self.path_to_letter_db, 'LetterID')
        return self.db_letter

    @staticmethod
    def _unique_in_order(values):
        """Return the distinct items of *values* in order of first appearance."""
        return list(dict.fromkeys(values))

    @staticmethod
    def _merge_categories(base, custom):
        """Return a new dict of *base* overlaid with *custom*.

        If *custom* cannot be merged into a dict (e.g. it is None), *base*
        itself is returned.
        """
        merged = dict(base)
        try:
            merged.update(custom)
        except (TypeError, ValueError):
            return base
        return merged

    # Transforms xml-file into a BeautifulSoup-object
    def read_tei(self, tei_file):
        """Parse the file at *tei_file* with the 'lxml' parser and return the soup.

        Errors from opening or parsing the file propagate unchanged.
        """
        # NOTE(review): the file is opened with the platform's default text
        # encoding - confirm that this matches the corpus files.
        with open(tei_file, 'r') as tei:
            return BeautifulSoup(tei, 'lxml')

    # Creates a Pandas dataframe from a letter specified by the path-argument
    # with letter id, words and corresponding POS-tags as the columns
    def parse_letter(self, path):
        """Return a DataFrame with one row per tagged word of the letter at *path*.

        The columns are ID, Words, Tags, Year, Sender, SenderRank, SenderSex,
        RelCode and WordCount. Everything except Words and Tags is looked up
        in the metadata tables and repeated on every row.

        Raises:
            KeyError: if the letter id or its sender is not in the metadata.
        """
        # Creates a BeautifulSoup-object
        soup = self.read_tei(path)
        letter_db = self._letter_table()
        person_db = self._person_table()

        words = []
        tags = []
        # The letter text lives in the p-tags; each whitespace-separated item
        # has the form word_TAG.
        for paragraph in soup.find_all('p'):
            for token in paragraph.text.split():
                word, _, tag = token.partition('_')
                # Skip items without a tag and items tagged as punctuation
                if not tag or tag[0] in string.punctuation:
                    continue
                words.append(word)
                tags.append(tag)

        # Extracts the id of the letter from the TEI-tag
        letter_id = soup.tei.attrs['xml:id']
        sender = letter_db.loc[letter_id, 'Sender']
        row_count = len(tags)

        def repeated(value):
            # Per-letter values are repeated for each word
            return [value] * row_count

        data = {
            'ID': repeated(letter_id),
            'Words': words,
            'Tags': tags,
            'Year': repeated(letter_db.loc[letter_id, 'Year']),
            'Sender': repeated(sender),
            'SenderRank': repeated(letter_db.loc[letter_id, 'SenderRank']),
            'SenderSex': repeated(person_db.loc[sender, 'Sex']),
            'RelCode': repeated(letter_db.loc[letter_id, 'RelCode']),
            'WordCount': repeated(letter_db.loc[letter_id, 'WordCount']),
        }

        return pd.DataFrame(data)

    def letters_to_df(self):
        """Parse every ``*.txt`` letter in ``path_to_letters`` into one DataFrame.

        Raises:
            ValueError: if the folder contains no ``*.txt`` files.
        """
        # glob returns the files in arbitrary order; sort them so that the
        # generated table is the same on every run.
        all_files = sorted(glob.glob(self.path_to_letters + "/*.txt"))
        if not all_files:
            raise ValueError(
                'No letter files (*.txt) found in ' + self.path_to_letters)

        # Creates a separate dataframe from each letter
        frames = [self.parse_letter(filename) for filename in all_files]

        # Combines all dataframes by using the concat-method of Pandas
        return pd.concat(frames, axis=0, ignore_index=True)

    def get_pos_list(self):
        """Return one Dash option per distinct value of the Tags column."""
        tags = self._unique_in_order(self.df['Tags'])
        return self.list_to_dash_option_dict(tags)

    def get_word_list(self):
        """Return one Dash option per distinct lower-cased value of the Words column."""
        words = self._unique_in_order(self.df['Words'].str.lower())
        return self.list_to_dash_option_dict(words)

    def get_rank(self):
        """Return ``(set of SenderRank values, matching Dash options)``."""
        ranks = self.df['SenderRank']
        rank_list = self.list_to_dash_option_dict(self._unique_in_order(ranks))
        return set(ranks), rank_list

    def get_relationship(self):
        """Return ``(set of RelCode values, matching Dash options)``."""
        rel_codes = self.df['RelCode']
        rel_list = self.list_to_dash_option_dict(self._unique_in_order(rel_codes))
        return set(rel_codes), rel_list

    def get_years(self):
        """Return the set of values in the Year column."""
        return set(self.df['Year'])

    def list_to_dash_option_dict(self, l):
        """Return ``{'label': item, 'value': item}`` for every item of *l*."""
        return [{'label': item, 'value': item} for item in l]

    def dict_to_dash_options_with_hover(self, d):
        """Return one option per key of *d*; the hover title joins its values with ', '."""
        return [{'label': k, 'value': k, 'title': ', '.join(v)}
                for k, v in d.items()]

    def pos_options_with_hover(self, custom, main):
        """Return options for the tags of category *main*.

        The category is looked up in the built-in POS categories overlaid
        with *custom*. Each hover title is the tag's label, a newline and the
        tag's ditto tags joined with ', '.

        Raises:
            KeyError: if *main* is not a category, or a tag has no label or
                ditto entry.
        """
        category_tags = self.get_pos_categories(custom)[main]
        return [{'label': tag,
                 'value': tag,
                 'title': self.pos_labels[tag] + '\n' + ', '.join(self.pos_dittos[tag])}
                for tag in category_tags]

    def get_pos_categories(self, custom):
        """Return the POS categories overlaid with *custom* (built-ins only if it cannot be merged)."""
        return self._merge_categories(self.pos_categories, custom)

    def get_rel_categories(self, custom):
        """Return the relationship categories overlaid with *custom* (built-ins only if it cannot be merged)."""
        return self._merge_categories(self.relationship_categories, custom)

    def get_rank_categories(self, custom):
        """Return the rank categories overlaid with *custom* (built-ins only if it cannot be merged)."""
        return self._merge_categories(self.rank_categories, custom)

    def include_ditto_tags_to_pos_list(self, pos_list):
        """Return the concatenated ``pos_dittos`` entries of every tag in *pos_list*.

        Raises:
            KeyError: if a tag has no entry in ``pos_dittos``.
        """
        final_list = []
        for tag in pos_list:
            final_list.extend(self.pos_dittos[tag])

        return final_list

    def get_name(self, ids):
        """Return *ids* as a frame with FirstName and LastName columns added.

        *ids* is converted with ``to_frame()`` and joined to the person table
        on its ``Sender`` column, so it is presumably a Series named
        ``Sender`` holding person codes - verify against the callers. The
        result has a fresh 0..n-1 index.
        """
        person = self._person_table()
        senders = ids.to_frame()
        names = senders.join(person[['FirstName', 'LastName']], on='Sender')

        return names.reset_index(drop=True)