This repository was archived by the owner on Feb 12, 2022. It is now read-only.

simple bidirectional support #27

Open
wants to merge 4 commits into master
67 changes: 58 additions & 9 deletions torchqrnn/qrnn.py
@@ -8,6 +8,43 @@
from .forget_mult import ForgetMult


class BiDirQRNNLayer(nn.Module):
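"""Bidirectional QRNN layer: runs a forward and a backward QRNNLayer over the sequence and concatenates their per-timestep outputs along the feature dimension."""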
def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, window=1, output_gate=True,
use_cuda=True):
super(BiDirQRNNLayer, self).__init__()

assert window in [1, 2], "This QRNN implementation currently only handles convolutional window of size 1 or size 2"
self.window = window
self.input_size = input_size
self.hidden_size = hidden_size if hidden_size else input_size
self.zoneout = zoneout
self.save_prev_x = save_prev_x
self.prevX = None
self.output_gate = output_gate
self.use_cuda = use_cuda

self.forward_qrnn = QRNNLayer(input_size, hidden_size=hidden_size, save_prev_x=save_prev_x, zoneout=zoneout,
window=window,
output_gate=output_gate, use_cuda=use_cuda)
self.backward_qrnn = QRNNLayer(input_size, hidden_size=hidden_size, save_prev_x=save_prev_x, zoneout=zoneout,
window=window,
output_gate=output_gate, use_cuda=use_cuda)

def forward(self, X, hidden=None):
# Run one QRNN over the sequence as given and one over the time-reversed sequence
fwd, h_fwd = self.forward_qrnn(X, hidden=hidden)
bwd, h_bwd = self.backward_qrnn(torch.flip(X, [0]), hidden=hidden)
# Flip the backward outputs back into the original time order so both directions stay aligned
# per timestep, then concatenate outputs and final hidden states along the feature dimension
return torch.cat([fwd, torch.flip(bwd, [0])], dim=-1), torch.cat([h_fwd, h_bwd], dim=-1)


def fast_tanh(x):
# Softsign-style approximation of tanh, x / (1 + |x|): cheaper than torch.tanh but not numerically identical
return x / (1 + x.abs())


def fast_sigmoid(x):
# Matching softsign-based approximation of sigmoid, rescaled and shifted into (0, 1)
return (x / 2) / (1 + x.abs()) + 0.5
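# For comparison: fast_tanh(1.0) = 0.5 while torch.tanh(1.0) ≈ 0.7616, and fast_sigmoid(1.0) = 0.75 while
# torch.sigmoid(1.0) ≈ 0.7311, so swapping these in changes the gate numerics as well as the compute cost.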


class QRNNLayer(nn.Module):
r"""Applies a single layer Quasi-Recurrent Neural Network (QRNN) to an input sequence.

@@ -29,10 +66,12 @@ class QRNNLayer(nn.Module):
- h_n (batch, hidden_size): tensor containing the hidden state for t=seq_len
"""

def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, window=1, output_gate=True, use_cuda=True):
def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, window=1, output_gate=True,
use_cuda=True):
super(QRNNLayer, self).__init__()

assert window in [1, 2], "This QRNN implementation currently only handles convolutional window of size 1 or size 2"
assert window in [1,
2], "This QRNN implementation currently only handles convolutional window of size 1 or size 2"
self.window = window
self.input_size = input_size
self.hidden_size = hidden_size if hidden_size else input_size
@@ -43,7 +82,8 @@ def __init__(self, input_size, hidden_size=None, save_prev_x=False, zoneout=0, w
self.use_cuda = use_cuda

# One large matmul with concat is faster than N small matmuls and no concat
self.linear = nn.Linear(self.window * self.input_size, 3 * self.hidden_size if self.output_gate else 2 * self.hidden_size)
self.linear = nn.Linear(self.window * self.input_size,
3 * self.hidden_size if self.output_gate else 2 * self.hidden_size)

def reset(self):
# If you are saving the previous value of x, you should call this when starting with a new state
@@ -76,8 +116,8 @@ def forward(self, X, hidden=None):
Y = Y.view(seq_len, batch_size, 2 * self.hidden_size)
Z, F = Y.chunk(2, dim=2)
###
Z = torch.nn.functional.tanh(Z)
F = torch.nn.functional.sigmoid(F)
Z = fast_tanh(Z) # torch.nn.functional.tanh(Z)
F = fast_sigmoid(F) # torch.nn.functional.sigmoid(F)

# If zoneout is specified, we perform dropout on the forget gates in F
# If an element of F is zero, that means the corresponding neuron keeps the old value
@@ -100,7 +140,7 @@ def forward(self, X, hidden=None):

# Apply (potentially optional) output gate
if self.output_gate:
H = torch.nn.functional.sigmoid(O) * C
H = fast_sigmoid(O) * C # torch.nn.functional.sigmoid(O) * C
else:
H = C

@@ -137,13 +177,21 @@ class QRNN(torch.nn.Module):
def __init__(self, input_size, hidden_size,
num_layers=1, bias=True, batch_first=False,
dropout=0, bidirectional=False, layers=None, **kwargs):
assert bidirectional == False, 'Bidirectional QRNN is not yet supported'
# assert bidirectional == False, 'Bidirectional QRNN is not yet supported'
assert batch_first == False, 'Batch first mode is not yet supported'
assert bias == True, 'Removing underlying bias is not yet supported'

super(QRNN, self).__init__()

self.layers = torch.nn.ModuleList(layers if layers else [QRNNLayer(input_size if l == 0 else hidden_size, hidden_size, **kwargs) for l in range(num_layers)])
if bidirectional:
# Each bidirectional layer outputs 2 * hidden_size features, so layers after the first take a doubled input size
self.layers = torch.nn.ModuleList(
layers if layers else [BiDirQRNNLayer(input_size if l == 0 else hidden_size * 2, hidden_size, **kwargs)
for l in range(num_layers)])
else:
self.layers = torch.nn.ModuleList(
layers if layers else [QRNNLayer(input_size if l == 0 else hidden_size, hidden_size, **kwargs)
for l in range(num_layers)])

self.input_size = input_size
self.hidden_size = hidden_size
@@ -202,6 +250,7 @@ def forward(self, input, hidden=None):
assert diff < 1e-5, 'CUDA and non-CUDA QRNN layers return different results'

from torch.autograd import gradcheck
inputs = [X,]

inputs = [X, ]
test = gradcheck(QRNNLayer(hidden_size, hidden_size).cuda(), inputs)
print(test)
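
A minimal sketch of how the new bidirectional path is meant to be driven, assuming the package is importable as torchqrnn and a CUDA device is available (the layers default to use_cuda=True); the shape comments follow from the layer construction in this diff rather than from documented behaviour:

import torch

from torchqrnn import QRNN

seq_len, batch_size, input_size, hidden_size = 7, 3, 16, 32

# Sequence-first layout, since batch_first is not supported
X = torch.randn(seq_len, batch_size, input_size).cuda()

# With bidirectional=True, every layer after the first consumes the
# 2 * hidden_size features produced by the bidirectional layer below it
qrnn = QRNN(input_size, hidden_size, num_layers=2, bidirectional=True).cuda()
output, hidden = qrnn(X)

print(output.size())  # expected: (seq_len, batch_size, 2 * hidden_size)
print(hidden.size())  # expected: (num_layers, batch_size, 2 * hidden_size)

Passing use_cuda=False through to the layers should exercise the pure-PyTorch ForgetMult path instead, mirroring the CUDA versus non-CUDA comparison in the test block above.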