[WIP] Recurrent Attention - MixtureOfGaussian1DAttention PoC #3

Closed (wants to merge 17 commits)
82 changes: 82 additions & 0 deletions examples/recurrent_attention.py
@@ -0,0 +1,82 @@
from __future__ import division, print_function

import random

import numpy as np

from keras import Input
from keras.engine import Model
from keras.layers import Dense, TimeDistributed, LSTMCell, RNN

from keras.layers.attention import MixtureOfGaussian1DAttention

# canonical example of attention for alignment
# in this example the model should learn to "parse" through an attended
# sequence and output only the relevant parts


# TODO:
# - add proper docs
# - same format as other examples
# - add encoder-decoder version for comparison of parameter efficiency
# - compare use_delta=True/False (converges faster with True)

def get_training_data(
n_samples,
n_labels,
n_timesteps_attended,
n_timesteps_labels,
):
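"""Generate a toy alignment task.
Each sample is a random label sequence; `attended` contains the same
labels (one-hot) placed at randomly chosen, increasing positions in a
longer, otherwise all-zero sequence, so the model has to read them off
in order.
# Returns
labels_one_hot: shape (n_samples, n_timesteps_labels + 1, n_labels),
with an all-zero first timestep used as the decoder start input.
attended: shape (n_samples, n_timesteps_attended, n_labels).
"""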
labels = np.random.randint(
n_labels,
size=(n_samples, n_timesteps_labels)
)
attended_time_idx = range(n_timesteps_attended)
label_time_idx = range(1, n_timesteps_labels + 1)

labels_one_hot = np.zeros((n_samples, n_timesteps_labels + 1, n_labels))
attended = np.zeros((n_samples, n_timesteps_attended, n_labels))
for i in range(n_samples):
labels_one_hot[i][label_time_idx, labels[i]] = 1
positions = sorted(random.sample(attended_time_idx, n_timesteps_labels))
attended[i][positions, labels[i]] = 1

return labels_one_hot, attended


n_samples = 10000
n_timesteps_labels = 10
n_timesteps_attended = 30
n_labels = 4

input_labels = Input((n_timesteps_labels, n_labels))
attended = Input((n_timesteps_attended, n_labels))

cell = MixtureOfGaussian1DAttention(LSTMCell(64), n_components=3)
attention_lstm = RNN(cell, return_sequences=True)

attention_lstm_output = attention_lstm(input_labels, constants=attended)
output_layer = TimeDistributed(Dense(n_labels, activation='softmax'))
output = output_layer(attention_lstm_output)

model = Model(
inputs=[input_labels, attended],
outputs=output
)

labels_data, attended_data = get_training_data(
n_samples,
n_labels,
n_timesteps_attended,
n_timesteps_labels
)
input_labels_data = labels_data[:, :-1, :]
target_labels_data = labels_data[:, 1:, :]

model.compile(optimizer='Adam', loss='categorical_crossentropy')
model.fit(
x=[input_labels_data, attended_data],
y=target_labels_data,
epochs=5
)
output_data = model.predict([input_labels_data, attended_data])
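For context, the MixtureOfGaussian1DAttention layer itself is not shown in this excerpt. The sketch below is a minimal NumPy illustration of how location-based mixture-of-Gaussian attention is typically computed (in the spirit of Graves, 2013): each mixture component places a Gaussian bump over the attended timesteps, and the resulting weights blend the attended sequence into a context vector. With use_delta=True, the positions (kappa here) would presumably be advanced by a predicted increment at each decoding step, which is likely why the TODO above notes faster convergence. The names alpha, beta, kappa and the helper function are illustrative assumptions, not the layer's actual API.

import numpy as np

def mog1d_attention_weights(alpha, beta, kappa, n_timesteps_attended):
    # alpha, beta, kappa: (batch, n_components) positive mixture weights,
    # precisions and positions (already passed through their activations).
    u = np.arange(n_timesteps_attended)                                    # (T,)
    # (batch, n_components, T): one Gaussian bump per component over positions
    phi = alpha[..., None] * np.exp(-beta[..., None] * (kappa[..., None] - u) ** 2)
    return phi.sum(axis=1)                                                 # (batch, T)

batch, T, n_labels, n_components = 2, 30, 4, 3
attended = np.random.rand(batch, T, n_labels)
alpha = np.ones((batch, n_components))
beta = np.ones((batch, n_components))
kappa = np.random.rand(batch, n_components) * T
weights = mog1d_attention_weights(alpha, beta, kappa, T)                   # (batch, T)
context = np.einsum('bt,btf->bf', weights, attended)                       # (batch, n_labels)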
2 changes: 1 addition & 1 deletion keras/backend/tensorflow_backend.py
@@ -1943,7 +1943,7 @@ def arange(start, stop=None, step=1, dtype='int32'):

"""
# Match the behavior of numpy and Theano by returning an empty sequence.
if stop is None and start < 0:
if stop is None and isinstance(start, int) and start < 0:
start = 0
result = tf.range(start, limit=stop, delta=step, name='arange')
if dtype != 'int32':
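The added isinstance guard presumably lets `start` be a symbolic tensor (e.g. a dynamic sequence length): comparing a TensorFlow tensor with 0 inside a Python `if` raises an error, so the negative-start shortcut should only apply to plain Python ints. A small, hypothetical illustration of the case this now supports (TensorFlow backend assumed):

from keras import backend as K

x = K.placeholder(shape=(None, 4))
n = K.shape(x)[0]      # symbolic scalar, not a Python int
idx = K.arange(n)      # with the guard, the `start < 0` branch is skipped for tensors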
173 changes: 173 additions & 0 deletions keras/distribution.py
@@ -0,0 +1,173 @@
from __future__ import division, print_function

from collections import OrderedDict

import numpy as np

from keras import backend as K
from keras.layers import Dense, concatenate
from keras.activations import softmax


class DistributionBase(object):
"""Defines activation for distribution parameters and (default) loss for
target given the distribution."""

def activation(self, x):
"""Activation function to apply to get parameters of distribution."""
raise NotImplementedError

def loss(self, y_true, y_pred):
"""Implementation of standard loss for this this distribution, normally
-log(pdf(y_true)) where pdf is parametrized by y_pred (the parameters).
"""
raise NotImplementedError

def pdf(self, y_true, y_pred):
"""Probability density function"""
raise NotImplementedError

@property
def num_params(self):
"""Expected size of x in activation and y_pred in loss"""
raise NotImplementedError

def get_config(self):
raise NotImplementedError


class MixtureDistributionBase(DistributionBase):
"""Base class for Mixture Distributions"""

def __init__(self, num_components):
self.num_components = num_components

@property
def mixture_weight_activation(self):
return softmax

@property
def param_type_to_size(self):
"""
# Returns
An OrderedDict of param_type (str) to size (int)
# Example
return OrderedDict([
('mixture_weight', self.num_components),
...
])
"""
raise NotImplementedError

def split_param_types(self, x):
"""Splits input tensor into the different param types. This method is
useful for applying activation and computing loss.
# Arguments
x: Tensor with shape[-1] == self.num_params
# Returns
list of Tensors, one for each param type
"""
if isinstance(x, np.ndarray):
last_dim = x.shape[-1]
else:
last_dim = x.shape[-1].value # TODO only works with tf
if last_dim != self.num_params:
raise ValueError(
'last dimension of x must be equal to the number of parameters'
' of distribution, got {}, expected {}'.format(
last_dim,
self.num_params
)
)

idx = 0
param_types = []
for size in self.param_type_to_size.values():
param_types.append(x[..., idx:idx+size])
idx += size

return param_types

@property
def num_params(self):
return sum(self.param_type_to_size.values())

def get_config(self):
return dict(num_components=self.num_components)


class ScaledExponential(object): # TODO move to advanced activations
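"""Activation returning `scale * exp(x) + epsilon`, i.e. a strictly
positive value; used below as the default activation for `sigma`."""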

def __init__(self, scale=1, epsilon=1e-3):
self.scale = scale
self.epsilon = epsilon

def __call__(self, x):
return self.scale * K.exp(x) + self.epsilon


class MixtureOfGaussian1D(MixtureDistributionBase):
"""1D Mixture of Gaussian distribution"""

def __init__(
self,
num_components,
mu_activation=None,
sigma_activation=None,
):
super(MixtureOfGaussian1D, self).__init__(num_components)
self.mu_activation = mu_activation or (lambda x: x)
self.sigma_activation = sigma_activation or ScaledExponential()

@property
def param_type_to_size(self):
return OrderedDict([
('mixture_weight', self.num_components),
('mu', self.num_components),
('sigma', self.num_components)
])

def activation(self, x):
_mixture_weights, _mu, _sigma = self.split_param_types(x)
mixture_weights = self.mixture_weight_activation(_mixture_weights)
mu = self.mu_activation(_mu)
sigma = self.sigma_activation(_sigma)

return concatenate([mixture_weights, mu, sigma], axis=-1)

def loss(self, y_true, y_pred):
"""Negative log pdf. Used logsum trick for numerical stability"""
mixture_weights, mu, sigma = self.split_param_types(y_pred)
norm = 1. / (np.sqrt(2. * np.pi) * sigma)
exponent = -(
K.square(y_true - mu) / (2. * K.square(sigma)) -
K.log(mixture_weights) -
K.log(norm)
)
return -K.logsumexp(exponent, axis=-1)

def pdf(self, y_true, y_pred):
raise NotImplementedError # TODO

def get_config(self):
raise NotImplementedError # TODO


class DistributionOutputLayer(Dense):
"""Wraps Dense layer to output distribution parameters based on passed
distribution.

# Arguments
distribution (DistributionBase): the distribution to output parameters for
"""
def __init__(self, distribution, **kwargs):
self.distribution = distribution
if 'units' in kwargs or 'activation' in kwargs:
raise ValueError(
'"units" or "activation" should not be passed as kwargs '
'as this is already specified by the passed distribution'
)
super(DistributionOutputLayer, self).__init__(
units=distribution.num_params,
activation=distribution.activation,
**kwargs
)
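A minimal sketch of how this module might be used; the layer sizes and data below are arbitrary placeholders, and this usage is an assumption rather than an example taken from the PR. Raw parameters can be split directly on plain arrays, or the network's output can be wrapped in DistributionOutputLayer and the model compiled with the distribution's loss.

import numpy as np
from keras import Input
from keras.engine import Model
from keras.layers import Dense
from keras.distribution import MixtureOfGaussian1D, DistributionOutputLayer

dist = MixtureOfGaussian1D(num_components=3)               # 9 parameters in total

# splitting raw parameters works on plain NumPy arrays as well as tensors
raw = np.random.randn(5, dist.num_params)
mixture_weights, mu, sigma = dist.split_param_types(raw)   # each of shape (5, 3)

# a model whose output is the (activated) distribution parameters
inputs = Input((16,))
hidden = Dense(32, activation='relu')(inputs)
params = DistributionOutputLayer(dist)(hidden)
model = Model(inputs=inputs, outputs=params)
model.compile(optimizer='adam', loss=dist.loss)            # scalar target per sample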