train.py
# -*- coding: utf-8 -*-
from __future__ import print_function
import codecs
import numpy as np
import cPickle
from keras import Input
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.engine import Model
from keras.layers.merge import concatenate
from keras.layers import Embedding, Dense, Dropout, Conv1D, GlobalMaxPooling1D
from preprocessing import BATCH_SIZE, EMBEDDING_DIMENSION, CONTEXT_LENGTH, UNKNOWN
from preprocessing import TARGET_LENGTH, generate_arrays_from_file, ENCODING_MAP_2x2, ENCODING_MAP_1x1
from subprocess import check_output
print(u"Embedding Dimension:", EMBEDDING_DIMENSION)
print(u"Input length (each side):", CONTEXT_LENGTH)
word_to_index = cPickle.load(open(u"data/words2index.pkl"))
print(u"Vocabulary Size:", len(word_to_index))
vectors = {UNKNOWN: np.ones(EMBEDDING_DIMENSION), u'0': np.ones(EMBEDDING_DIMENSION)}
for line in codecs.open(u"../data/glove.twitter." + str(EMBEDDING_DIMENSION) + u"d.txt", encoding=u"utf-8"):
if line.strip() == "":
continue
t = line.split()
vectors[t[0]] = [float(x) for x in t[1:]]
print(u'Vectors...', len(vectors))
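# Build the embedding weight matrix: words with a GloVe vector use it, the rest are initialised from a small random normal.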
emb_weights = np.zeros((len(word_to_index), EMBEDDING_DIMENSION))
oov = 0
for w in word_to_index:
    if w in vectors:
        emb_weights[word_to_index[w]] = vectors[w]
    else:
        emb_weights[word_to_index[w]] = np.random.normal(size=(EMBEDDING_DIMENSION,), scale=0.3)
        oov += 1
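# Wrap the matrix in an outer array so it can be passed as the embedding layer's initial weights.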
emb_weights = np.array([emb_weights])
print(u'Done preparing vectors...')
print(u"OOV (no vectors):", oov)
# --------------------------------------------------------------------------------------------------------------------
print(u'Building model...')
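# Six parallel input branches: four text CNNs (context words and entity strings, width-1 and width-2 filters),
# a dense branch over the map vector, and a CNN over the target string; their outputs are concatenated below.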
embeddings = Embedding(len(word_to_index), EMBEDDING_DIMENSION, input_length=CONTEXT_LENGTH * 2, weights=emb_weights)
# shared embeddings between all language input layers
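# Context-word "pair" branch: width-2 convolution filters over the shared embeddings.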
context_words_pair = Input(shape=(CONTEXT_LENGTH * 2,))
cwp = embeddings(context_words_pair)
cwp = Conv1D(1000, 2, activation='relu', strides=1)(cwp)
cwp = GlobalMaxPooling1D()(cwp)
cwp = Dense(250)(cwp)
cwp = Dropout(0.5)(cwp)
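# Context-word "single" branch: width-1 convolution filters.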
context_words_single = Input(shape=(CONTEXT_LENGTH * 2,))
cws = embeddings(context_words_single)
cws = Conv1D(1000, 1, activation='relu', strides=1)(cws)
cws = GlobalMaxPooling1D()(cws)
cws = Dense(250)(cws)
cws = Dropout(0.5)(cws)
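# Entity-string "pair" branch: width-2 convolution filters.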
entities_strings_pair = Input(shape=(CONTEXT_LENGTH * 2,))
esp = embeddings(entities_strings_pair)
esp = Conv1D(1000, 2, activation='relu', strides=1)(esp)
esp = GlobalMaxPooling1D()(esp)
esp = Dense(250)(esp)
esp = Dropout(0.5)(esp)
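# Entity-string "single" branch: width-1 convolution filters.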
entities_strings_single = Input(shape=(CONTEXT_LENGTH * 2,))
ess = embeddings(entities_strings_single)
ess = Conv1D(1000, 1, activation='relu', strides=1)(ess)
ess = GlobalMaxPooling1D()(ess)
ess = Dense(250)(ess)
ess = Dropout(0.5)(ess)
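# Map vector branch: dense layers over the 1x1 grid encoding.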
mapvec = Input(shape=(len(ENCODING_MAP_1x1),))
l2v = Dense(5000, activation='relu', input_dim=len(ENCODING_MAP_1x1))(mapvec)
l2v = Dense(1000, activation='relu')(l2v)
l2v = Dropout(0.5)(l2v)
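# Target string branch: separate embedding layer and width-3 convolution filters.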
target_string = Input(shape=(TARGET_LENGTH,))
ts = Embedding(len(word_to_index), EMBEDDING_DIMENSION, input_length=TARGET_LENGTH, weights=emb_weights)(target_string)
ts = Conv1D(1000, 3, activation='relu')(ts)
ts = GlobalMaxPooling1D()(ts)
ts = Dropout(0.5)(ts)
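# Concatenate all branches and classify over the 2x2 grid encoding with a softmax output.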
inp = concatenate([cwp, cws, esp, ess, l2v, ts])
inp = Dense(units=len(ENCODING_MAP_2x2), activation=u'softmax')(inp)
model = Model(inputs=[context_words_pair, context_words_single, entities_strings_pair, entities_strings_single,
                      mapvec, target_string], outputs=[inp])
model.compile(loss=u'categorical_crossentropy', optimizer=u'rmsprop', metrics=[u'accuracy'])
print(u'Finished building model...')
# --------------------------------------------------------------------------------------------------------------------
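# Save weights after every epoch; stop if training accuracy fails to improve for 5 consecutive epochs.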
checkpoint = ModelCheckpoint(filepath=u"../data/weights.{epoch:02d}-{acc:.2f}.hdf5", verbose=0)
early_stop = EarlyStopping(monitor=u'acc', patience=5)
file_name = u"../data/train_wiki_uniform.txt"
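# steps_per_epoch: number of lines in the training file (counted via "wc") divided by the batch size.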
model.fit_generator(generate_arrays_from_file(file_name, word_to_index),
                    steps_per_epoch=int(check_output(["wc", file_name]).split()[0]) / BATCH_SIZE,
                    epochs=250, callbacks=[checkpoint, early_stop])