# utils.py
import torch
from torch.utils.data import Dataset, DataLoader

from vocab import Vocab

# Constants
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"
BOW_TOKEN = "<bow>"
EOW_TOKEN = "<eow>"
WEIGHT_INIT_RANGE = 0.1


def load_reuters():
    # Requires the NLTK Reuters corpus; it can be fetched once with
    # nltk.download("reuters") (and, if sentence splitting is unavailable,
    # nltk.download("punkt")).
    from nltk.corpus import reuters
    text = reuters.sents()
    # Lowercase every token (optional normalization)
    text = [[word.lower() for word in sentence] for sentence in text]
    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    corpus = [vocab.convert_tokens_to_ids(sentence) for sentence in text]
    return corpus, vocab

def save_pretrained(vocab, embeds, save_path):
    """
    Save pretrained token vectors in a unified format: the first line
    records `number_of_tokens` and `embedding_dim`, followed by all
    token vectors, one token per line.
    """
    with open(save_path, "w") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for idx, token in enumerate(vocab.idx_to_token):
            vec = " ".join(["{:.4f}".format(x) for x in embeds[idx]])
            writer.write(f"{token} {vec}\n")
    print(f"Pretrained embeddings saved to: {save_path}")

def load_pretrained(load_path):
    with open(load_path, "r") as fin:
        # The first line is the header written by save_pretrained: the number
        # of tokens and the embedding dimension. Other pretrained vector
        # formats (e.g. GloVe) omit this line, so adjust if needed.
        n, d = map(int, fin.readline().split())
        tokens = []
        embeds = []
        for line in fin:
            line = line.rstrip().split(' ')
            token, embed = line[0], list(map(float, line[1:]))
            tokens.append(token)
            embeds.append(embed)
        vocab = Vocab(tokens)
        embeds = torch.tensor(embeds, dtype=torch.float)
    return vocab, embeds

def get_loader(dataset, batch_size, shuffle=True):
    # The dataset is expected to provide its own collate_fn for batching
    # (e.g. padding variable-length id sequences).
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=dataset.collate_fn,
        shuffle=shuffle
    )
    return data_loader

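# Minimal sketch of a dataset compatible with get_loader (illustrative only,
# not part of the original module). It assumes `corpus` is a list of id
# sequences, as produced by load_reuters, and that the caller passes the
# actual <pad> id from the vocabulary.
class ExampleIdDataset(Dataset):
    def __init__(self, corpus, pad_id=0):
        self.data = corpus
        self.pad_id = pad_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        # Pad each id sequence to the batch maximum and stack into a LongTensor.
        max_len = max(len(ex) for ex in examples)
        batch = [list(ex) + [self.pad_id] * (max_len - len(ex)) for ex in examples]
        return torch.tensor(batch, dtype=torch.long)
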
def init_weights(model):
    # Uniformly initialize all non-embedding parameters in
    # [-WEIGHT_INIT_RANGE, WEIGHT_INIT_RANGE]; embedding weights are left
    # to their own initialization.
    for name, param in model.named_parameters():
        if "embedding" not in name:
            torch.nn.init.uniform_(
                param, a=-WEIGHT_INIT_RANGE, b=WEIGHT_INIT_RANGE
            )

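# Hedged usage sketch tying the helpers together (assumptions: the NLTK
# Reuters corpus is available, `vocab.convert_tokens_to_ids([PAD_TOKEN])` is a
# valid way to look up the <pad> id, and "reuters_random.vec" is just an
# illustrative output path; adapt to the actual Vocab API if it differs).
if __name__ == "__main__":
    corpus, vocab = load_reuters()

    # Random embeddings, saved and reloaded as a round-trip sanity check.
    embedding_dim = 8
    embeds = torch.randn(len(vocab.idx_to_token), embedding_dim)
    save_pretrained(vocab, embeds, "reuters_random.vec")
    loaded_vocab, loaded_embeds = load_pretrained("reuters_random.vec")
    print(loaded_embeds.shape)  # expected: (number of tokens, 8), values rounded to 4 decimals

    # Batch the id sequences with the illustrative dataset defined above.
    pad_id = vocab.convert_tokens_to_ids([PAD_TOKEN])[0]
    dataset = ExampleIdDataset(corpus, pad_id=pad_id)
    loader = get_loader(dataset, batch_size=4)
    first_batch = next(iter(loader))
    print(first_batch.shape)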