-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathutils.py
64 lines (54 loc) · 2.39 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from context_manager import *
import numpy as np
def checkout_prob(text, file_path='prob.tsv'):
    """Compute per-token self-information for *text* and dump it to a TSV file.

    Each output line is ``<token>\\t<self_information>``; pairs are also
    echoed to stdout for quick inspection.

    Args:
        text: input string passed to ``get_self_information``.
        file_path: destination TSV path (default ``'prob.tsv'``).
    """
    tokens, self_info = get_self_information(text)
    # Explicit encoding: the platform default may not handle non-ASCII tokens.
    with open(file_path, 'w', encoding='utf-8') as f:
        for token, info in zip(tokens, self_info):
            print(token, info)
            f.write(f"{token}\t{info}\n")
    print('Finished writing to file: ', file_path)
def read_lexical_units(article: ArxivArticle, mask_level = 'phrase'):
if mask_level == 'sent':
lexical_units = article.units[0]
assert lexical_units.unit_type == 'sent'
elif mask_level == 'phrase':
lexical_units = article.units[1]
assert lexical_units.unit_type == 'phrase'
elif mask_level == 'token':
lexical_units = article.units[2]
assert lexical_units.unit_type == 'token'
tokens = lexical_units.text[:50] + lexical_units.text[360:421]
self_info = lexical_units.self_info[:50] + lexical_units.self_info[360:421]
self_info = [x**1.2 for x in self_info]
max_score = max(self_info)
min_score = min(self_info)
mid = np.percentile(self_info, 50)
lines = []
highlighted = []
buffer = []
for token, score in zip(tokens, self_info):
normalized_score = ((score - min_score) / (max_score - min_score)) * 100
line = f"\\colorize{{{normalized_score}}}{{{token}}}"
if score > mid:
if len(buffer) > 0:
str_ = '\n'.join(buffer)
lines.append(f"\\underline{{{str_}}}")
buffer = []
highlighted.append(line)
lines.append(line)
else:
# token = f"\\sdelete{{{token}}}"
# line = f"\\colorize{{{normalized_score}}}{{{token}}}"
buffer.append(line)
return '\n'.join(lines) + '\n\n\n' + '\n'.join(highlighted)
def datasets_statistics(manager: 'ArxivContextManager', tokenizer):
    """Print corpus-level statistics for the articles held by *manager*.

    Reports the article count and the mean number of sentence-, phrase-,
    and token-level units per article (``units[0]``/``units[1]``/``units[2]``).

    Args:
        manager: object whose ``articles`` attribute is the article list.
        tokenizer: unused; kept for backward compatibility.  The original
            wrapped it in a ``num_tokens`` helper that was immediately
            shadowed by a local list of the same name and never called.
    """
    articles = manager.articles
    num_sents = [len(article.units[0].text) for article in articles]
    num_phrases = [len(article.units[1].text) for article in articles]
    # Renamed from num_tokens to stop shadowing the (now removed) dead helper.
    num_unit_tokens = [len(article.units[2].text) for article in articles]
    print('Number of articles: ', len(articles))
    print('Average number of sentences: ', np.mean(num_sents))
    print('Average number of phrases: ', np.mean(num_phrases))
    print('Average number of tokens: ', np.mean(num_unit_tokens))