This repository was archived by the owner on Jul 26, 2025. It is now read-only.

Commit 70bd45a

add detect

1 parent 1b0faf2 commit 70bd45a

6 files changed: +274 −47 lines changed

brnn_sequence_analyzer.py

Lines changed: 57 additions & 0 deletions

@@ -390,6 +390,63 @@ def predict(sequence, input_len, analyzer, nb_predictions=80,
     print "\n"
 
 
+def detect(sequence, input_len, analyzer, mapping='m2m', sentence_length=40):
+    """
+    Scan the given sequence to detect anomalies.
+
+    Arguments:
+        sequence: {list}, the original input sequence
+        input_len: {integer}, the number of unique id classes
+        analyzer: {SequenceAnalyzer}, the sequence analyzer
+        mapping: {string}, input to output mapping.
+            'o2o': one-to-one
+            'm2m': many-to-many
+        sentence_length: {integer}, the length of each sentence.
+    """
+    # sequence length
+    length = len(sequence)
+
+    # predicted probabilities for each id;
+    # the first sentence_length ids are assumed true
+    prob = [1] * sentence_length + [0] * (length - sentence_length)
+
+    # slide a window of sentence_length ids over the sequence
+    for start_index in xrange(length - sentence_length):
+        # seed sentence
+        X = sequence[start_index : start_index + sentence_length]
+        # print "X: " + ' '.join(str(s).ljust(4) for s in X)
+
+        # y_true: the ids that actually follow each position of the seed
+        # y_true = sequence[start_index + 1 : start_index + sentence_length + 1]
+        # print "y_true: " + ' '.join(str(s).ljust(4) for s in y_true)
+        y_next_true = sequence[start_index + sentence_length]
+
+        # format input: one-hot encode the seed sentence
+        seed = np.zeros((1, sentence_length, input_len))
+        for t in range(0, sentence_length):
+            seed[0, t, X[t]] = 1
+
+        # get predictions; verbose=0 means no logging
+        predictions = analyzer.model.predict(seed, verbose=0)[0]
+
+        # y_predicted
+        y_next_pred = 0
+        if mapping == 'o2o':
+            prob[start_index + sentence_length] = predictions[y_next_true]
+            y_next_pred = np.argmax(predictions)
+        elif mapping == 'm2m':
+            # next_sentence = [np.argmax(pred) for pred in predictions]
+            # y_next_pred = next_sentence[-1]
+            # print "y_pred: " + ' '.join(str(id_).ljust(4)
+            #                             for id_ in next_sentence)
+            y_next_pred = np.argmax(predictions[-1])
+            prob[start_index + sentence_length] = predictions[-1][y_next_true]
+
+    return prob
+
+
 def train(analyzer, train_sequence, val_sequence, input_len,
           batch_size=128, nb_epoch=50, nb_iterations=4,
           sentence_length=40, step=40, mapping='m2m'):
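
Since prob holds, for every position past the first window, the model's predicted probability of the id that actually occurred, a low value marks a likely anomaly. A minimal usage sketch, not part of the commit; the threshold value and variable names are illustrative assumptions:

# Sketch: flag ids whose predicted probability falls below a cutoff.
# Assumes `sequence`, `input_len` and `analyzer` are set up as in run().
prob = detect(sequence, input_len, analyzer, mapping='m2m', sentence_length=40)
threshold = 0.01  # hypothetical cutoff, to be tuned on known-normal data
anomalies = [i for i, p in enumerate(prob) if p < threshold]
print "anomalous positions: ", anomalies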

brnn_sequence_analyzer_gen.py

Lines changed: 57 additions & 0 deletions

@@ -447,6 +447,63 @@ def train(analyzer, train_data, nb_training_samples,
     analyzer.save_model("weights-after-iteration.hdf5")
 
 
+def detect(sequence, input_len, analyzer, mapping='m2m', sentence_length=40):
+    """
+    Scan the given sequence to detect anomalies.
+
+    Arguments:
+        sequence: {list}, the original input sequence
+        input_len: {integer}, the number of unique id classes
+        analyzer: {SequenceAnalyzer}, the sequence analyzer
+        mapping: {string}, input to output mapping.
+            'o2o': one-to-one
+            'm2m': many-to-many
+        sentence_length: {integer}, the length of each sentence.
+    """
+    # sequence length
+    length = len(sequence)
+
+    # predicted probabilities for each id;
+    # the first sentence_length ids are assumed true
+    prob = [1] * sentence_length + [0] * (length - sentence_length)
+
+    # slide a window of sentence_length ids over the sequence
+    for start_index in xrange(length - sentence_length):
+        # seed sentence
+        X = sequence[start_index : start_index + sentence_length]
+        # print "X: " + ' '.join(str(s).ljust(4) for s in X)
+
+        # y_true: the ids that actually follow each position of the seed
+        # y_true = sequence[start_index + 1 : start_index + sentence_length + 1]
+        # print "y_true: " + ' '.join(str(s).ljust(4) for s in y_true)
+        y_next_true = sequence[start_index + sentence_length]
+
+        # format input: one-hot encode the seed sentence
+        seed = np.zeros((1, sentence_length, input_len))
+        for t in range(0, sentence_length):
+            seed[0, t, X[t]] = 1
+
+        # get predictions; verbose=0 means no logging
+        predictions = analyzer.model.predict(seed, verbose=0)[0]
+
+        # y_predicted
+        y_next_pred = 0
+        if mapping == 'o2o':
+            prob[start_index + sentence_length] = predictions[y_next_true]
+            y_next_pred = np.argmax(predictions)
+        elif mapping == 'm2m':
+            # next_sentence = [np.argmax(pred) for pred in predictions]
+            # y_next_pred = next_sentence[-1]
+            # print "y_pred: " + ' '.join(str(id_).ljust(4)
+            #                             for id_ in next_sentence)
+            y_next_pred = np.argmax(predictions[-1])
+            prob[start_index + sentence_length] = predictions[-1][y_next_true]
+
+    return prob
+
+
 def run(hidden_len=512, batch_size=128, nb_batch=200, nb_epoch=50,
         nb_iterations=4, lr=0.001, validation_split=0.05, nb_predictions=20,
         mapping='m2m', sentence_length=80, step=80, mode='train'):
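
The seed tensor built inside detect is a one-hot encoding of the sliding window; a tiny self-contained sketch of that encoding with toy numbers, not taken from the commit:

import numpy as np

X = [3, 1, 0, 2]   # a window of ids (sentence_length = 4)
input_len = 5      # number of unique id classes
seed = np.zeros((1, len(X), input_len))
for t in range(len(X)):
    seed[0, t, X[t]] = 1
# seed[0] is now a 4x5 matrix with exactly one 1 per row,
# the (1, sentence_length, input_len) shape that predict() expects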

rnn_sequence_analyzer.py

Lines changed: 0 additions & 1 deletion

@@ -417,7 +417,6 @@ def train(analyzer, train_sequence, val_sequence, input_len,
     analyzer.save_model("weights-after-iteration.hdf5")
 
 
-
 def detect(sequence, input_len, analyzer, mapping='m2m', sentence_length=40):
     """
     Scan the given sequence to detect anomalies.

rnn_sequence_analyzer_gen.py

Lines changed: 44 additions & 44 deletions

@@ -54,7 +54,7 @@ def __init__(self, sentence_length, input_len, hidden_len, output_len):
 
     def build(self, layer='LSTM', mapping='m2m', nb_layers=2, dropout=0.2):
         """
-        Stacked LSTM with specified dropout rate (default 0.2), built with
+        Stacked RNN with specified dropout rate (default 0.2), built with
         softmax activation, cross entropy loss and rmsprop optimizer.
 
         Arguments:
@@ -122,7 +122,9 @@ class LAYER(GRU):
 
         self.model.add(Activation('softmax'))
 
-        self.model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+        self.model.compile(loss='categorical_crossentropy',
+                           optimizer='rmsprop',
+                           metrics=['accuracy'])
 
     def save_model(self, filename):
         """
@@ -184,7 +186,7 @@ def on_epoch_end(self, epoch, logs={}):  # pylint: disable=W0102
         A method called at the end of every epoch.
 
         Arguments:
-            epoch: {integer}, the current epoch
+            epoch: {integer}, the current epoch.
             logs: {dictionary}, recording the training and validation
                 losses and accuracy of every epoch.
         """
@@ -383,6 +385,45 @@ def predict(sequence, input_len, analyzer, nb_predictions=80,
     print "\n"
 
 
+def train(analyzer, train_data, nb_training_samples,
+          val_data, nb_validation_samples,
+          nb_epoch=50, nb_iterations=4):
+    """
+    Trains the network.
+
+    Arguments:
+        analyzer: {SequenceAnalyzer}.
+        train_data: {generator}, yields training batches (X_train, y_train).
+        val_data: {generator}, yields validation batches (X_val, y_val).
+        nb_training_samples: {integer}, the number of training samples.
+        nb_validation_samples: {integer}, the number of validation samples.
+        nb_epoch: {integer}, the number of epochs per iteration.
+        nb_iterations: {integer}, the number of iterations.
+    """
+    for iteration in range(1, nb_iterations + 1):
+        print ""
+        print "------------------------ Start Training ------------------------"
+        print "Iteration: ", iteration
+        print "Number of epochs per iteration: ", nb_epoch
+
+        # history of losses and accuracy
+        history = History()
+
+        # save the model weights after each epoch
+        # if the validation loss decreases
+        checkpointer = ModelCheckpoint(filepath="weights.hdf5",
+                                       verbose=1, save_best_only=True)
+
+        # train the model with the data generator
+        analyzer.model.fit_generator(train_data,
+                                     samples_per_epoch=nb_training_samples,
+                                     nb_epoch=nb_epoch, verbose=1,
+                                     callbacks=[history, checkpointer],
+                                     validation_data=val_data,
+                                     nb_val_samples=nb_validation_samples)
+
+    analyzer.save_model("weights-after-iteration.hdf5")
+
 
 def detect(sequence, input_len, analyzer, mapping='m2m', sentence_length=40):
     """
@@ -441,47 +482,6 @@ def detect(sequence, input_len, analyzer, mapping='m2m', sentence_length=40):
     return prob
 
 
-def train(analyzer, train_data, nb_training_samples,
-          val_data, nb_validation_samples,
-          nb_epoch=50, nb_iterations=4):
-    """
-    Trains the network.
-
-    Arguments:
-        analyzer: {SequenceAnalyzer}.
-        train_data: {generator}, yields training batches (X_train, y_train).
-        val_data: {generator}, yields validation batches (X_val, y_val).
-        nb_training_samples: {integer}, the number of training samples.
-        nb_validation_samples: {integer}, the number of validation samples.
-        nb_epoch: {integer}, the number of epochs per iteration.
-        nb_iterations: {integer}, the number of iterations.
-    """
-    for iteration in range(1, nb_iterations + 1):
-        print ""
-        print "------------------------ Start Training ------------------------"
-        print "Iteration: ", iteration
-        print "Number of epochs per iteration: ", nb_epoch
-
-        # history of losses and accuracy
-        history = History()
-
-        # save the model weights after each epoch
-        # if the validation loss decreases
-        checkpointer = ModelCheckpoint(filepath="weights.hdf5",
-                                       verbose=1, save_best_only=True)
-
-        # train the model with the data generator
-        analyzer.model.fit_generator(train_data,
-                                     samples_per_epoch=nb_training_samples,
-                                     nb_epoch=nb_epoch, verbose=1,
-                                     callbacks=[history, checkpointer],
-                                     validation_data=val_data,
-                                     nb_val_samples=nb_validation_samples)
-
-    analyzer.save_model("weights-after-iteration.hdf5")
-
-
 def run(hidden_len=512, batch_size=128, nb_batch=200, nb_epoch=50,
         nb_iterations=4, lr=0.001, validation_split=0.05, nb_predictions=20,
         mapping='m2m', sentence_length=80, step=80, mode='train'):
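
fit_generator consumes generators that this commit does not show. A minimal sketch of one compatible shape, assuming one-hot inputs and a single next-id target per sample; the function name and sampling strategy are assumptions:

import numpy as np

def batch_generator(sequence, sentence_length, input_len, batch_size=128):
    # Hypothetical generator shaped for the Keras 1 fit_generator call
    # above: yields (X, y) batches indefinitely.
    while True:
        X = np.zeros((batch_size, sentence_length, input_len))
        y = np.zeros((batch_size, input_len))
        for i in range(batch_size):
            start = np.random.randint(0, len(sequence) - sentence_length)
            for t, id_ in enumerate(sequence[start:start + sentence_length]):
                X[i, t, id_] = 1
            y[i, sequence[start + sentence_length]] = 1
        yield X, y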

sequence_analyzer.py

Lines changed: 59 additions & 2 deletions

@@ -547,6 +547,63 @@ def train(analyzer, train_sequence, val_sequence, input_len,
     analyzer.save_model("weights-after-iteration.hdf5")
 
 
+def detect(sequence, input_len, analyzer, mapping='m2m', sentence_length=40):
+    """
+    Scan the given sequence to detect anomalies.
+
+    Arguments:
+        sequence: {list}, the original input sequence
+        input_len: {integer}, the number of unique id classes
+        analyzer: {SequenceAnalyzer}, the sequence analyzer
+        mapping: {string}, input to output mapping.
+            'o2o': one-to-one
+            'm2m': many-to-many
+        sentence_length: {integer}, the length of each sentence.
+    """
+    # sequence length
+    length = len(sequence)
+
+    # predicted probabilities for each id;
+    # the first sentence_length ids are assumed true
+    prob = [1] * sentence_length + [0] * (length - sentence_length)
+
+    # slide a window of sentence_length ids over the sequence
+    for start_index in xrange(length - sentence_length):
+        # seed sentence
+        X = sequence[start_index : start_index + sentence_length]
+        # print "X: " + ' '.join(str(s).ljust(4) for s in X)
+
+        # y_true: the ids that actually follow each position of the seed
+        # y_true = sequence[start_index + 1 : start_index + sentence_length + 1]
+        # print "y_true: " + ' '.join(str(s).ljust(4) for s in y_true)
+        y_next_true = sequence[start_index + sentence_length]
+
+        # format input: one-hot encode the seed sentence
+        seed = np.zeros((1, sentence_length, input_len))
+        for t in range(0, sentence_length):
+            seed[0, t, X[t]] = 1
+
+        # get predictions; verbose=0 means no logging
+        predictions = analyzer.model.predict(seed, verbose=0)[0]
+
+        # y_predicted
+        y_next_pred = 0
+        if mapping == 'o2o':
+            prob[start_index + sentence_length] = predictions[y_next_true]
+            y_next_pred = np.argmax(predictions)
+        elif mapping == 'm2m':
+            # next_sentence = [np.argmax(pred) for pred in predictions]
+            # y_next_pred = next_sentence[-1]
+            # print "y_pred: " + ' '.join(str(id_).ljust(4)
+            #                             for id_ in next_sentence)
+            y_next_pred = np.argmax(predictions[-1])
+            prob[start_index + sentence_length] = predictions[-1][y_next_true]
+
+    return prob
+
+
 def run(hidden_len=512, batch_size=128, nb_epoch=50, nb_iterations=4, lr=0.001,
         validation_split=0.05,  # pylint: disable=W0613
         nb_predictions=20, mapping='m2m', sentence_length=80, step=80,
@@ -592,10 +649,10 @@ def run(hidden_len=512, batch_size=128, nb_epoch=50, nb_iterations=4, lr=0.001,
     analyzer.build(layer='LSTM', mapping=mapping, nb_layers=2, dropout=0.2)
 
     # plot model
-    # rnn.plot_model()
+    # analyzer.plot_model()
 
     # load the previous model weights
-    # rnn.load_model("weightsf4-61.hdf5")
+    # analyzer.load_model("weightsf4-61.hdf5")
 
     # reset the learning rate
     if lr != 0.001:
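
The o2o/m2m branch in detect relies on the model's output shape differing by mapping; a self-contained sketch of the indexing it assumes, using random stand-in distributions (shapes inferred from the code, not stated in the commit):

import numpy as np

sentence_length, input_len, y_next_true = 40, 100, 7

# 'o2o': a single softmax over the next id -> shape (input_len,)
preds_o2o = np.random.dirichlet(np.ones(input_len))
print preds_o2o[y_next_true], np.argmax(preds_o2o)

# 'm2m': one softmax per timestep -> shape (sentence_length, input_len);
# the last row scores the id that follows the seed sentence
preds_m2m = np.random.dirichlet(np.ones(input_len), sentence_length)
print preds_m2m[-1][y_next_true], np.argmax(preds_m2m[-1])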

sequence_analyzer_gen.py

Lines changed: 57 additions & 0 deletions

@@ -553,6 +553,63 @@ def train(analyzer, train_data, nb_training_samples,
     analyzer.save_model("weights-after-iteration.hdf5")
 
 
+def detect(sequence, input_len, analyzer, mapping='m2m', sentence_length=40):
+    """
+    Scan the given sequence to detect anomalies.
+
+    Arguments:
+        sequence: {list}, the original input sequence
+        input_len: {integer}, the number of unique id classes
+        analyzer: {SequenceAnalyzer}, the sequence analyzer
+        mapping: {string}, input to output mapping.
+            'o2o': one-to-one
+            'm2m': many-to-many
+        sentence_length: {integer}, the length of each sentence.
+    """
+    # sequence length
+    length = len(sequence)
+
+    # predicted probabilities for each id;
+    # the first sentence_length ids are assumed true
+    prob = [1] * sentence_length + [0] * (length - sentence_length)
+
+    # slide a window of sentence_length ids over the sequence
+    for start_index in xrange(length - sentence_length):
+        # seed sentence
+        X = sequence[start_index : start_index + sentence_length]
+        # print "X: " + ' '.join(str(s).ljust(4) for s in X)
+
+        # y_true: the ids that actually follow each position of the seed
+        # y_true = sequence[start_index + 1 : start_index + sentence_length + 1]
+        # print "y_true: " + ' '.join(str(s).ljust(4) for s in y_true)
+        y_next_true = sequence[start_index + sentence_length]
+
+        # format input: one-hot encode the seed sentence
+        seed = np.zeros((1, sentence_length, input_len))
+        for t in range(0, sentence_length):
+            seed[0, t, X[t]] = 1
+
+        # get predictions; verbose=0 means no logging
+        predictions = analyzer.model.predict(seed, verbose=0)[0]
+
+        # y_predicted
+        y_next_pred = 0
+        if mapping == 'o2o':
+            prob[start_index + sentence_length] = predictions[y_next_true]
+            y_next_pred = np.argmax(predictions)
+        elif mapping == 'm2m':
+            # next_sentence = [np.argmax(pred) for pred in predictions]
+            # y_next_pred = next_sentence[-1]
+            # print "y_pred: " + ' '.join(str(id_).ljust(4)
+            #                             for id_ in next_sentence)
+            y_next_pred = np.argmax(predictions[-1])
+            prob[start_index + sentence_length] = predictions[-1][y_next_true]
+
+    return prob
+
+
 def run(hidden_len=512, batch_size=128, nb_batch=200, nb_epoch=50,
         nb_iterations=4, lr=0.001, validation_split=0.05, nb_predictions=20,
         mapping='m2m', sentence_length=80, step=80, mode='train'):
