test/test_autograd.py

import contextlib
import gc
import sys
import math
import torch
import unittest
from copy import deepcopy
from collections import OrderedDict

from common import make_jacobian, TestCase, iter_tensors, get_numerical_jacobian
from torch.autograd._functions import *
from torch.autograd import Variable, Function

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle

PRECISION = 1e-4


def iter_gradients(x):
    if isinstance(x, Variable):
        if x.requires_grad:
            yield x.grad.data
    else:
        for elem in x:
            for result in iter_gradients(elem):
                yield result


def zero_gradients(i):
    for t in iter_gradients(i):
        t.zero_()


def get_analytical_jacobian(input, output):
    jacobian = make_jacobian(input, output.numel())
    grad_output = output.data.clone().zero_()
    flat_grad_output = grad_output.view(-1)

    for i in range(flat_grad_output.numel()):
        flat_grad_output.zero_()
        flat_grad_output[i] = 1
        zero_gradients(input)
        output.backward(grad_output, retain_variables=True)
        for jacobian_x, d_x in zip(jacobian, iter_gradients(input)):
            jacobian_x[:,i] = d_x

    return jacobian


@contextlib.contextmanager
def backward_engine(engine):
    _prev_engine = Variable._execution_engine
    Variable._execution_engine = engine()
    try:
        yield
    finally:
        Variable._execution_engine = _prev_engine


class TestAutograd(TestCase):

    def test_hooks(self):
        x = Variable(torch.ones(5, 5), requires_grad=True)
        y = Variable(torch.ones(5, 5) * 4, requires_grad=True)

        counter = [0]
        def bw_hook(inc, grad):
            self.assertIsInstance(grad, Variable)
            counter[0] += inc

        z = x ** 2 + x * 2 + x * y + y
        test = z.register_hook(lambda *args: bw_hook(1, *args))
        z.backward(torch.ones(5, 5), retain_variables=True)
        self.assertEqual(counter[0], 1)

        test2 = z.register_hook(lambda *args: bw_hook(2, *args))
        z.backward(torch.ones(5, 5), retain_variables=True)
        self.assertEqual(counter[0], 4)

        test2.remove()
        z.backward(torch.ones(5, 5), retain_variables=True)
        self.assertEqual(counter[0], 5)

        def bw_hook_modify(grad):
            return grad.mul(2)

        test.remove()
        z.register_hook(bw_hook_modify)
        y.grad.data.zero_()
        z.backward(torch.ones(5, 5), retain_variables=True)
        self.assertEqual(y.grad.data, (x.data + 1) * 2)

        y.register_hook(bw_hook_modify)
        y.grad.data.zero_()
        z.backward(torch.ones(5, 5))
        self.assertEqual(y.grad.data, (x.data + 1) * 4)

    def _test_backward(self):
        v_t = torch.randn(5, 5)
        x_t = torch.randn(5, 5)
        y_t = torch.rand(5, 5) + 0.1
        z_t = torch.randn(5, 5)
        grad_output = torch.randn(5, 5)
        v = Variable(v_t, requires_grad=True)
        x = Variable(x_t, requires_grad=True)
        y = Variable(y_t, requires_grad=True)
        z = Variable(z_t, requires_grad=True)

        v.backward(grad_output)
        self.assertEqual(v.grad.data, grad_output)

        a = x + (y * z) + 4 * z**2 * x / y
        a.backward(grad_output)
        x_grad = 4 * z_t.pow(2) / y_t + 1
        y_grad = z_t - 4 * x_t * z_t.pow(2) / y_t.pow(2)
        z_grad = 8 * x_t * z_t / y_t + y_t
        self.assertEqual(x.grad.data, x_grad * grad_output)
        self.assertEqual(y.grad.data, y_grad * grad_output)
        self.assertEqual(z.grad.data, z_grad * grad_output)

    def test_backward(self):
        self._test_backward()

    @unittest.skip("BasicEngine is out of date")
    def test_backward_basic_engine(self):
        with backward_engine(torch.autograd.engine.BasicEngine):
            self._test_backward()

    def test_multi_backward(self):
        x = Variable(torch.randn(5, 5), requires_grad=True)
        y = Variable(torch.randn(5, 5), requires_grad=True)

        q = Variable(torch.randn(5, 5), requires_grad=True)

        a = Variable(torch.randn(5, 5), requires_grad=True)
        b = Variable(torch.randn(5, 5), requires_grad=True)

        q2 = q * 2
        z = x + y + q2
        c = a * b + q2
        grad_z = torch.randn(5, 5)
        grad_c = torch.randn(5, 5)
        torch.autograd.backward([z, c], [grad_z, grad_c])

        self.assertEqual(x.grad.data, grad_z)
        self.assertEqual(y.grad.data, grad_z)
        self.assertEqual(a.grad.data, grad_c * b.data)
        self.assertEqual(b.grad.data, grad_c * a.data)
        self.assertEqual(q.grad.data, (grad_c + grad_z) * 2)

    def test_multi_backward_stochastic(self):
        x = Variable(torch.randn(5, 5), requires_grad=True)
        y = Variable(torch.randn(5, 5), requires_grad=True)

        z = x + y
        q = torch.normal(x)
        q.reinforce(torch.randn(5, 5))

        torch.autograd.backward([z, q], [torch.ones(5, 5), None])

    def test_multi_backward_no_grad(self):
        x = Variable(torch.randn(5, 5), requires_grad=True)
        y = Variable(torch.randn(5, 5), requires_grad=False)

        z = x + y
        q = y * 2

        torch.autograd.backward([z, q], [torch.ones(5, 5), torch.ones(5, 5)])
        self.assertEqual(x.grad.data, torch.ones(5, 5))

    def test_volatile(self):
        x = Variable(torch.ones(5, 5), requires_grad=True)
        y = Variable(torch.ones(5, 5) * 4, volatile=True)

        z = x ** 2
        self.assertFalse(z.volatile)
        self.assertTrue(z.requires_grad)
        self.assertIsNotNone(z.creator)
        z.backward(torch.ones(5, 5))
        self.assertEqual(x.grad.data, torch.ones(5, 5) * 2)

        w = z + y
        self.assertTrue(w.volatile)
        self.assertFalse(w.requires_grad)
        self.assertRaises(RuntimeError, lambda: w.backward(torch.ones(5, 5)))
        self.assertIsNone(w.creator)

    def test_indexing(self):
        x = torch.range(1, 16).resize_(4, 4)
        y = Variable(x)
        self.assertEqual(x[1], y[1].data)
        self.assertEqual(x[1, 1], y[1, 1].data[0])
        self.assertEqual(x[1:], y[1:].data)
        self.assertEqual(x[:2], y[:2].data)
        self.assertEqual(x[:2, 2], y[:2, 2].data)
        self.assertEqual(x[1:2, 2], y[1:2, 2].data)
        self.assertEqual(x[1, 2:], y[1, 2:].data)

    def test_requires_grad(self):
        x = Variable(torch.randn(5, 5))
        y = Variable(torch.randn(5, 5))
        z = Variable(torch.randn(5, 5), requires_grad=True)
        a = x + y
        self.assertFalse(a.requires_grad)
        b = a + z
        self.assertTrue(b.requires_grad)
        def error():
            raise RuntimeError
        # Make sure backward isn't called on these
        a._backward_hooks = OrderedDict()
        x._backward_hooks = OrderedDict()
        y._backward_hooks = OrderedDict()
        a._backward_hooks['test'] = error
        x._backward_hooks['test'] = error
        y._backward_hooks['test'] = error
        b.backward(torch.ones(5, 5))

    def test_inplace(self):
        x = Variable(torch.ones(5, 5), requires_grad=True)
        y = Variable(torch.ones(5, 5) * 4, requires_grad=True)

        z = x * y
        q = z + y
        w = z * y
        z.add_(2)
        # Add doesn't need it's inputs to do backward, so it shouldn't raise
        q.backward(torch.ones(5, 5), retain_variables=True)
        # Mul saves both inputs in forward, so it should raise
        self.assertRaises(RuntimeError, lambda: w.backward(torch.ones(5, 5)))

        z = x * y
        q = z * y
        r = z + y
        w = z.add_(y)
        # w is a the last expression, so this should succeed
        w.backward(torch.ones(5, 5), retain_variables=True)
        # r doesn't use the modified value in backward, so it should succeed
        r.backward(torch.ones(5, 5), retain_variables=True)
        # q uses dirty z, so it should raise
        self.assertRaises(RuntimeError, lambda: q.backward(torch.ones(5, 5)))

        x.grad.data.zero_()
        m = x / 2
        z = m + y / 8
        q = z * y
        r = z + y
        prev_version = z._version
        w = z.exp_()
        self.assertNotEqual(z._version, prev_version)
        r.backward(torch.ones(5, 5), retain_variables=True)
        self.assertEqual(x.grad.data, torch.ones(5, 5) / 2)
        w.backward(torch.ones(5, 5), retain_variables=True)
        self.assertEqual(x.grad.data, torch.Tensor(5, 5).fill_((1 + math.e) / 2))
        self.assertRaises(RuntimeError, lambda: q.backward(torch.ones(5, 5)))

        leaf = Variable(torch.ones(5, 5), requires_grad=True)
        x = leaf.clone()
        x.add_(10)
        self.assertEqual(x.data, torch.ones(5, 5) * 11)
        # x should be still usable
        y = x + 2
        y.backward(torch.ones(5, 5))
        self.assertEqual(leaf.grad.data, torch.ones(5, 5))
        z = x * y
        x.add_(2)
        self.assertRaises(RuntimeError, lambda: z.backward(torch.ones(5, 5)))

    def test_shared_storage(self):
        x = Variable(torch.ones(5, 5))
        y = x.t()
        z = x[1]
        self.assertRaises(RuntimeError, lambda: x.add_(2))
        self.assertRaises(RuntimeError, lambda: y.add_(2))
        self.assertRaises(RuntimeError, lambda: z.add_(2))

    def _test_setitem(self, size, index):
        x = Variable(torch.ones(*size), requires_grad=True)
        y = x + 2
        y_version = y._version
        y[index] = 2
        self.assertNotEqual(y._version, y_version)
        y.backward(torch.ones(*size))
        expected_grad = torch.ones(*size)
        if isinstance(index, Variable):
            index = index.data
        expected_grad[index] = 0
        self.assertEqual(x.grad.data, expected_grad)

    def _test_setitem_tensor(self, size, index):
        x = Variable(torch.ones(*size), requires_grad=True)
        y = x + 2
        y_version = y._version
        value = Variable(torch.Tensor(x[index].size()).fill_(7), requires_grad=True)
        y[index] = value
        self.assertNotEqual(y._version, y_version)
        y.backward(torch.ones(*size))
        expected_grad_input = torch.ones(*size)
        if isinstance(index, Variable):
            index = index.data
        expected_grad_input[index] = 0
        self.assertEqual(x.grad.data, expected_grad_input)
        self.assertEqual(value.grad.data, torch.ones(value.size()))

    def test_setitem(self):
        self._test_setitem((5, 5), 1)
        self._test_setitem((5,), 1)
        self._test_setitem((1,), 0)
        self._test_setitem_tensor((5, 5), 3)
        self._test_setitem_tensor((5,), 3)

    def test_setitem_mask(self):
        mask = torch.ByteTensor(5, 5).bernoulli_()
        self._test_setitem((5, 5), Variable(mask))
        self._test_setitem((5,), Variable(mask[0]))
        self._test_setitem((1,), Variable(mask[0, 0:1]))
        self._test_setitem_tensor((5, 5), Variable(mask))
        self._test_setitem_tensor((5,), Variable(mask[0]))

    def test_unused_output(self):
        x = Variable(torch.randn(10, 10), requires_grad=True)
        outputs = x.chunk(5)
        o = outputs[2]
        o = o * 4 + 2
        o.sum().backward()
        expected_grad = torch.zeros(10, 10)
        expected_grad[4:6] = 4
        self.assertEqual(x.grad.data, expected_grad)

        x.grad.data.zero_()
        grad_output = torch.randn(2, 10)
        outputs = x.chunk(5)
        outputs[0].backward(grad_output)
        expected_grad = torch.zeros(10, 10)
        expected_grad[:2] = grad_output
        self.assertEqual(x.grad.data, expected_grad)

    def test_gc_in_destructor(self):
        """
        Previously, if a Function destructor triggered a garbage collection,
        the Variable's tp_dealloc handler would get called twice leading to a
        segfault.
        """
        class CollectOnDelete(Function):
            def __del__(self):
                gc.collect()

        for i in range(10):
            Variable(torch.randn(10, 10), creator=CollectOnDelete())

    @unittest.skipIf(not torch.cuda.is_available() or torch.cuda.device_count() < 2,
            "CUDA not available or <2 GPUs detected")
    def test_unused_output_gpu(self):
        from torch.nn.parallel._functions import Broadcast
        x = Variable(torch.randn(5, 5).float().cuda(), requires_grad=True)
        outputs = Broadcast(list(range(torch.cuda.device_count())))(x)
        y = outputs[-1] * 2
        y.sum().backward()
        self.assertEqual(x.grad.data, torch.ones(5, 5) * 2)

    def test_detach(self):
        x = Variable(torch.randn(10, 10), requires_grad=True)
        y = x + 2
        y = y.detach()
        z = y * 4 + 2
        self.assertFalse(y.requires_grad)
        self.assertFalse(z.requires_grad)

        x = Variable(torch.randn(10, 10), requires_grad=True)
        y = x * 2
        y = y.detach()
        self.assertFalse(y.requires_grad)
        self.assertFalse(y.creator.requires_grad)
        z = x + y
        z.sum().backward()
        # This is an incorrect gradient, but we assume that's what the user
        # wanted. detach() is an advanced option.
        self.assertEqual(x.grad.data, torch.ones(10, 10))

    def test_type_conversions(self):
        import torch.cuda
        x = Variable(torch.randn(5, 5))
        self.assertIs(type(x.float().data), torch.FloatTensor)
        self.assertIs(type(x.int().data), torch.IntTensor)
        if torch.cuda.is_available():
            self.assertIs(type(x.float().cuda().data), torch.cuda.FloatTensor)
            self.assertIs(type(x.int().cuda().data), torch.cuda.IntTensor)
            self.assertIs(type(x.int().cuda().cpu().data), torch.IntTensor)
            if torch.cuda.device_count() > 2:
                x2 = x.float().cuda(1)
                self.assertIs(type(x2.data), torch.cuda.FloatTensor)
                self.assertIs(x2.get_device(), 1)
                x2 = x.float().cuda()
                self.assertIs(type(x2.data), torch.cuda.FloatTensor)
                self.assertIs(x2.get_device(), 0)
                x2 = x2.cuda(1)
                self.assertIs(type(x2.data), torch.cuda.FloatTensor)
                self.assertIs(x2.get_device(), 1)

    def test_return_leaf(self):
        class Identity(Function):
            def forward(self, a, b):
                return a, a + b

            def backward(self, grad_a, grad_b):
                return grad_a + grad_b, grad_b

        class Inplace(InplaceFunction):
            def forward(self, a, b):
                self.mark_dirty(a)
                return a.add_(b), b + 2

            def backward(self, grad_a, grad_b):
                return grad_a, grad_a + grad_b

        x = Variable(torch.randn(5, 5), requires_grad=True)
        y = Variable(torch.randn(5, 5), requires_grad=True)

        q, p = Identity()(x, y)
        # Make sure hooks only receive grad from usage of q, not x.
        q.register_hook(
            lambda grad: self.assertEqual(grad.data, torch.ones(5, 5)))
        (q + p + x).sum().backward()
        self.assertEqual(x.grad.data, torch.ones(5, 5) * 3)
        self.assertEqual(y.grad.data, torch.ones(5, 5))
        del q, p  # these need to be freed, or next part will raise an error

    def test_return_leaf_inplace(self):
        class Inplace(InplaceFunction):
            def forward(self, a, b):
                self.mark_dirty(a)
                return a.add_(b), b + 2

            def backward(self, grad_a, grad_b):
                return grad_a, grad_a + grad_b

        x = Variable(torch.randn(5, 5))
        y = Variable(torch.randn(5, 5), requires_grad=True)

        fn = Inplace(True)
        q, p = fn(x, y)
        self.assertIs(q, x)
        self.assertIs(q.creator, fn)
        self.assertTrue(q.requires_grad)
        q.sum().backward()
        self.assertEqual(y.grad.data, torch.ones(5, 5))

    def test_leaf_assignment(self):
        x = Variable(torch.randn(5, 5))
        y = Variable(torch.randn(5), requires_grad=True)
        z = Variable(torch.randn(5), requires_grad=True)

        x[0] = y
        x[1] = 2 * z
        self.assertTrue(x.requires_grad)
        self.assertIsNot(x.creator, None)
        x.sum().backward()
        self.assertEqual(y.grad.data, torch.ones(5))
        self.assertEqual(z.grad.data, torch.ones(5) * 2)

    def test_backward_copy(self):
      # This tests checks backward engine for a very subtle bug that appreared
      # in one of the initial versions of autograd. Gradients tensors were
      # simply stored in lists while the function waited for all its gradients
      # to be computed. However, sometimes an output was used multiple times,
      # so the gradients needed to be summed. Engine used to keep a need_copy
      # set of tensors that will need a clone upon next addition and removed
      # them from the set as soon as the clone was performed. However, this
      # could lead to incorrect results if the same gradient tensor was
      # buffered in three places in the graph:
      # 1. When accumulating gradients in one of these places it was cloned
      #    and removed from need_copy set.
      # 2. When accumulating in second place, it wasn't in the need_copy set,
      #    so the gradients were simply accumulated in-place (which already
      #    modified the grad in 3rd place)
      # 3. When accumulating in the third place, it wasn't in the need_copy set
      #    as well, so the incoming gradient was summed in-place, yielding
      #    incorrect results in all functions, except the first one.
      x = Variable(torch.ones(5, 5), requires_grad=True)
      y = Variable(torch.ones(5, 5), requires_grad=True)
      # Simulate that we're in the middle of the graph
      a = x + 2
      b = y + 2
      c = x + 2
      # This op will just return grad_output two times in backward
      add1 = a + b
      add2 = add1 + c
      # Simulate a long branch, so grad_output will get buffered.
      for i in range(4):
        a = a * 2
        b = b * 2
        c = c * 2
      branch = a + b + c
      out = add2 + branch
      # expected gradients are:
      # for x: 34 (16 from final a, 16 from final c, 2 from add2)
      # for y: 17 (16 from final b, 1 from add2)
      grad_output = torch.ones(5, 5)
      out.backward(grad_output)
      self.assertEqual(x.grad.data, torch.ones(5, 5) * 34)
      self.assertEqual(y.grad.data, torch.ones(5, 5) * 17)

    def test_functional_blas(self):
        def compare(fn, *args):
            unpacked_args = tuple(arg.data if isinstance(arg, Variable) else arg
                                    for arg in args)
            self.assertEqual(fn(*args).data, fn(*unpacked_args))

        def test_blas_add(fn, x, y, z):
            # Checks all signatures
            compare(fn, x, y, z)
            compare(fn, 0.5, x, y, z)
            compare(fn, 0.5, x, 0.25, y, z)

        def test_blas(fn, x, y):
            compare(fn, x, y)

        test_blas(torch.mm, Variable(torch.randn(2, 10)),
                Variable(torch.randn(10, 4)))
        test_blas_add(torch.addmm, Variable(torch.randn(2, 4)),
                Variable(torch.randn(2, 10)), Variable(torch.randn(10, 4)))
        test_blas(torch.bmm, Variable(torch.randn(4, 2, 10)),
                Variable(torch.randn(4, 10, 4)))
        test_blas_add(torch.addbmm, Variable(torch.randn(2, 4)),
                Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4)))
        test_blas_add(torch.baddbmm, Variable(torch.randn(4, 2, 4)),
                Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4)))
        test_blas(torch.mv, Variable(torch.randn(2, 10)),
                Variable(torch.randn(10)))
        test_blas_add(torch.addmv, Variable(torch.randn(2)),
                Variable(torch.randn(2, 10)), Variable(torch.randn(10)))
        test_blas(torch.ger, Variable(torch.randn(5)),
                Variable(torch.randn(6)))
        test_blas_add(torch.addr, Variable(torch.randn(5, 6)),
                Variable(torch.randn(5)), Variable(torch.randn(6)))

    def test_save_none_for_backward(self):
        test_case = self
        class MyFn(Function):
            def forward(self, input):
                self.save_for_backward(None, input, None)
                return input * input

            def backward(self, grad_output):
                n1, input, n2 = self.saved_tensors
                test_case.assertIsNone(n1)
                test_case.assertIsNone(n2)
                return 2 * input * grad_output

        x = Variable(torch.randn(5, 5), requires_grad=True)
        y = MyFn()(x)
        y.sum().backward()
        self.assertEqual(x.grad.data, 2 * x.data)

    def test_too_many_grads(self):
        class MyFn(Function):
            def forward(self, input):
                return input

            def backward(self, grad_output):
                return grad_output, None, None

        x = Variable(torch.randn(5, 5), requires_grad=True)
        y = MyFn()(x)
        y.sum().backward()
        self.assertEqual(x.grad.data, x.data.clone().fill_(1))

    def test_stochastic(self):
        x = Variable(torch.rand(2, 10), requires_grad=True)
        stddevs = Variable(torch.rand(2, 10) * 5, requires_grad=True)
        y = (x * 2).clamp(0, 1)
        y = y / y.sum(1).expand_as(y)
        samples_multi = y.multinomial(5)
        samples_multi_flat = y[0].multinomial(5)
        samples_bernoulli = y.bernoulli()
        samples_norm = torch.normal(y)
        samples_norm_std = torch.normal(y, stddevs)
        z = samples_multi * 2 + 4
        z = z + samples_multi_flat.unsqueeze(0).expand_as(samples_multi)
        z = torch.cat([z, z], 1)
        z = z.double()
        z = z + samples_bernoulli + samples_norm + samples_norm_std
        last_sample = torch.normal(z, 4)
        z = last_sample + 2
        self.assertFalse(z.requires_grad)

        self.assertRaises(RuntimeError, lambda: z.backward(retain_variables=True))
        samples_multi.reinforce(torch.randn(2, 5))
        self.assertRaises(RuntimeError, lambda: z.backward(retain_variables=True))
        samples_multi_flat.reinforce(torch.randn(5))
        self.assertRaises(RuntimeError, lambda: z.backward(retain_variables=True))
        samples_bernoulli.reinforce(torch.randn(2, 10))
        self.assertRaises(RuntimeError, lambda: z.backward(retain_variables=True))
        samples_norm.reinforce(torch.randn(2, 10))
        self.assertRaises(RuntimeError, lambda: z.backward(retain_variables=True))
        samples_norm_std.reinforce(torch.randn(2, 10))
        # We don't have to specify rewards w.r.t. last_sample - it doesn't
        # require gradient

        last_sample.backward(retain_variables=True)
        z.backward()

        self.assertGreater(x.grad.data.abs().sum(), 0)

    def test_stochastic_sequence(self):
        x = Variable(torch.rand(10).clamp_(0, 1), requires_grad=True)
        b = x.bernoulli()
        n1 = torch.normal(b, x)
        n2 = torch.normal(n1, 2)

        b.reinforce(torch.randn(10))
        n1.reinforce(torch.randn(10))
        n2.reinforce(torch.randn(10))

        n2.backward()

        self.assertGreater(x.grad.data.abs().sum(), 0)

    def test_stochastic_output(self):
        x = Variable(torch.rand(10), requires_grad=True)
        b = x.clone().clamp(0, 1).bernoulli()
        b.reinforce(torch.randn(10))
        b.backward()
        self.assertGreater(x.grad.data.abs().sum(), 0)

    def test_pickle(self):
        x = Variable(torch.randn(10, 10), requires_grad=True)
        y = Variable(torch.randn(10, 10), volatile=True)
        z = Variable(torch.randn(10, 10), requires_grad=False)

        def assert_strict_equal(var1, var2):
            self.assertEqual(var1.data, var2.data)
            self.assertEqual(var1.requires_grad, var2.requires_grad)
            self.assertEqual(var1.volatile, var2.volatile)

        serialized = [pickle.dumps([x, y, z], protocol=p) for p in range(3)]
        for dump in serialized:
            xc, yc, zc = pickle.loads(dump)
            assert_strict_equal(xc, x)
            assert_strict_equal(yc, y)
            assert_strict_equal(zc, z)

    def test_dep_nograd(self):
        class F1(Function):
            def forward(self, input):
                out = torch.randn(input.size())
                self.mark_non_differentiable(out)
                return input, out

            def backward(self, grad_output, ignored):
                return grad_output

        class F2(Function):
            def forward(self, input, ignored):
                return input

            def backward(self, grad_output):
                return grad_output, None

        x = Variable(torch.randn(5), requires_grad=True)
        a, b = F1()(x)
        b = b + 1  # separate F1 from F2 by another op
        self.assertTrue(a.requires_grad)
        self.assertFalse(b.requires_grad)
        c = F2()(a, b)
        c.backward(torch.ones(c.size()))
        self.assertEqual(x.grad.data, torch.ones(x.size()))


def index_variable(shape, max_indices):
    if not isinstance(shape, tuple):
        shape = (shape,)
    index = torch.rand(*shape).mul_(max_indices).floor_().long()
    return Variable(index, requires_grad=False)

def gather_variable(shape, index_dim, max_indices):
    assert len(shape) == 2
    assert index_dim < 2
    batch_dim = 1 - index_dim
    index = torch.LongTensor(*shape)
    for i in range(shape[index_dim]):
        index.select(index_dim, i).copy_(
                torch.randperm(max_indices)[:shape[batch_dim]])
    return Variable(index, requires_grad=False)


L = 20
M = 10
S = 5
function_tests = [
    (Add,           (),                 ((M, M), (M, M))                            ),
    (Sub,           (),                 ((M, M), (M, M))                            ),
    (Mul,           (),                 ((M, M), (M, M))                            ),
    (Div,           (),                 ((M, M), torch.rand(M, M) + 5e-2)           ),
    (Pow,           (),                 (torch.rand(M, M) + 1e-3, torch.rand(M, M) + 0.1)),
    (AddConstant,   (3.14,),            ((L, L),)                                   ),
    (SubConstant,   (3.14,),            ((L, L),)                                   ),
    (SubConstant,   (3.14, True),       ((L, L),),                  'from_tensor'   ),
    (MulConstant,   (3.14,),            ((L, L),)                                   ),
    (DivConstant,   (3.14, True),       (torch.rand(L, L) + 1e-1,), 'by_tensor'     ),
    (PowConstant,   (3.14,),            (torch.rand(L, L),)                         ),
    (PowConstant,   (3.14, True),       (torch.rand(L, L),),        'tensor_power'  ),
    (Transpose,     (0, 1),             (torch.rand(L, L),)                         ),
    (Transpose,     (2, 0),             (torch.rand(S, S, S),),     '3d'            ),
    (Permute,       ((0, 4, 3, 5, 1, 2),), ((1, 2, 3, 4, 5, 6),)                    ),
    (Index,         ((1, 2),),          (torch.rand(S, S, S),)                      ),
    (Index,         (slice(0, 3),),     (torch.rand(S, S, S),),     'slice'         ),
    (Index,         ((slice(0, 3), 1),),(torch.rand(S, S, S),),     'slice_index'   ),
    (View,          (S*S, S),           (torch.rand(S, S, S),)                      ),
    (Expand,        ((S, 5, S, 5),),    ((S, 1, S, 1),)                             ),
    (Exp,           (),                 (torch.rand(S, S, S),)                      ),
    (Log,           (),                 (torch.rand(S, S, S) + 1e-2,)               ),
    (Log1p,         (),                 (torch.rand(S, S, S),)                      ),
    (Tanh,          (),                 ((S, S, S),)                                ),
    (Sigmoid,       (),                 ((S, S, S),)                                ),
    (Sinh,          (),                 ((S, S, S),)                                ),
    (Cosh,          (),                 ((S, S, S),)                                ),
    (Abs,           (),                 ((S, S, S),)                                ),
    (Clamp,         (0, 1),             ((S, S, S),)                                ),
    (Sqrt,          (),                 (torch.rand(S, S, S) + 1e-4,)               ),
    (Sin,           (),                 ((S, S, S),)                                ),
    (Cos,           (),                 ((S, S, S),)                                ),
    (Tan,           (),                 (torch.randn(S, S, S).clamp(-1, 1),)        ),
    (Asin,          (),                 (torch.randn(S, S, S).clamp(-0.9, 0.9),)    ),
    (Acos,          (),                 (torch.randn(S, S, S).clamp(-0.9, 0.9),)    ),
    (Atan,          (),                 ((S, S, S),)                                ),
    (Reciprocal,    (),                 (torch.rand(S, S, S) + 0.1,)                ),
    (Cmax,          (),                 ((S, S, S), (S, S, S))                      ),
    (Cmin,          (),                 ((S, S, S), (S, S, S))                      ),
    (Round,         (),                 ((S, S, S),)                                ),
    (Sign,          (),                 ((S, S, S),)                                ),
    (Trunc,         (),                 ((S, S, S),)                                ),
    (Floor,         (),                 ((S, S, S),)                                ),
    (Ceil,          (),                 ((S, S, S),)                                ),
    (Frac,          (),                 ((S, S, S),)                                ),
    (Fmod,          (1.5,),             ((S, S, S),)                                ),
    (Lerp,          (0.2,),             ((S, S, S), (S, S, S))                      ),
    (Rsqrt,         (),                 (torch.rand(S, S, S) + 1e-2,)               ),
    (Remainder,     (1.5,),             ((S, S, S),)                                ),
    (CmaxConstant,  (0.5,),             ((S, S, S),)                                ),
    (CminConstant,  (0.5,),             ((S, S, S),)                                ),
    (Mean,          (),                 ((S, S, S),)                                ),
    (Mean,          (1,),               ((S, S, S),),               'dim'           ),
    (Sum,           (),                 ((S, S, S),)                                ),
    (Sum,           (1,),               ((S, S, S),),               'dim'           ),
    (Prod,          (),                 ((S, S, S),)                                ),
    (Prod,          (1,),               ((S, S, S),),               'dim'           ),
    (Addmm,         (),                 ((S, M), (S, S), (S, M)),                   ),
    (Addmm,         (0.1, 1),           ((S, M), (S, S), (S, M)),   'coef'          ),
    (Addbmm,        (),                 ((S, M), (S, S, S), (S, S, M)),             ),
    (Addbmm,        (0.1, 0.4),         ((S, M), (S, S, S), (S, S, M)), 'coef'      ),
    (Baddbmm,       (),                 ((S, S, M), (S, S, S), (S, S, M)),          ),
    (Baddbmm,       (0.1, 0.4),         ((S, S, M), (S, S, S), (S, S, M)), 'coef'   ),
    (Addmv,         (),                 ((S,), (S, M), (M,)),                       ),
    (Addmv,         (0.1, 0.4),         ((S,), (S, M), (M,)),       'coef'          ),
    (Addr,          (),                 ((S, M), (S,), (M,)),                       ),
    (Addr,          (0.1, 0.4),         ((S, M), (S,), (M,)),       'coef'          ),
    (Dot,           (),                 ((L,), (L,)),                               ),
    (Max,           (),                 ((S, S, S),),                               ),
    (Min,           (),                 ((S, S, S),),                               ),
    (Max,           (0,),               ((S, S, S),),               'dim'           ),
    (Min,           (0,),               ((S, S, S),),               'dim'           ),
    (Mode,          (0,),               ((S, S, S),),                               ),
    (Kthvalue,      (2, 0),             ((S, S, S),),                               ),
    (Median,        (0,),               ((S, S, S),),                               ),
    (Norm,          (1.5,),             (torch.rand(S, S, S),),     '1_5'           ),
    (Norm,          (),                 ((S, S, S),),               '2'             ),
    (Norm,          (3,),               ((S, S, S),),               '3'             ),
    (Norm,          (1.5, 0),           (torch.rand(S, S, S),),     '1_5_dim'       ),
    (Norm,          (2, 0),             ((S, S, S),),               '2_dim'         ),
    (Norm,          (3, 0),             ((S, S, S),),               '3_dim'         ),
    (Addcmul,       (),                 ((S, S), (S, S), (S, S))                    ),
    (Addcmul,       (0.6,),             ((S, S), (S, S), (S, S)),   'scale'         ),
    (Addcdiv,       (),                 ((S, S), (S, S), torch.rand(S, S) + 1e-2)   ),
    (Addcdiv,       (0.6,),             ((S, S), (S, S), torch.rand(S, S) + 1e-2), 'scale'),
    (IndexAdd,      (0,),               ((S, S), index_variable(2, S), (2, S))      ),
    # (IndexCopy,     (0,),               ((S, S), index_variable(2, S), (2, S))      ),
    (IndexFill,     (0, 2),             ((S, S), index_variable(2, S))              ),
    (IndexSelect,   (0,),               ((S, S), index_variable(2, S))              ),
    (Gather,        (0,),               ((M, S), gather_variable((S, S), 1, M))     ),
    (Gather,        (1,),               ((M, S), gather_variable((M, S//2), 0, S)), 'dim1'),
    (Scatter,       (0,),               ((M, S), gather_variable((S, S), 1, M), (S, S))),
    (Scatter,       (1,),               ((M, S), gather_variable((M, S//2), 0, S), (M, S//2)), 'dim1'),
    (Concat,        (0,),               ((1, S, S), (2, S, S), (3, S, S))           ),
    (Resize,        (S*S, S),           ((S, S, S),)                                ),
    (Diag,          (),                 ((S, S),),                  '2d'            ),
    (Diag,          (),                 ((S,),),                    '1d'            ),
    (Tril,          (),                 ((S, S),)                                   ),
    (Tril,          (2,),               ((S, S),),                  'idx'           ),
    (Triu,          (),                 ((S, S),)                                   ),
    (Triu,          (2,),               ((S, S),),                  'idx'           ),
    (Clone,         (),                 ((S, M, S),)                                ),
    (Squeeze,       (),                 ((S, 1, M, 1),)                             ),
    (Squeeze,       (1,),               ((S, 1, M, 1),),            'dim'           ),
    (Unsqueeze,     (0,),               ((S, M, S),),               '0'             ),
    (Unsqueeze,     (1,),               ((S, M, S),),               '1'             ),
    # (MaskedCopy,    (),                 ((S, S), Variable(torch.randn(S, S).gt(0), requires_grad=False), (S, S),)),
    (MaskedFill,    (10,),              ((S, S), Variable(torch.randn(S, S).gt(0), requires_grad=False))),
    (MaskedSelect,  (),                 ((S, S), Variable(torch.randn(S, S).gt(0), requires_grad=False))),
    (Sort,          (),                 ((S, M, S),)                               ),
    (Sort,          (1,),               ((S, M, S),),               'dim'           ),
    (Sort,          (1, True),          ((S, M, S),),               'dim_desc'      ),
    (Topk,          (3,),               ((S, M, S),)                               ),
    (Topk,          (3, 1),             ((S, M, S),),               'dim'           ),
    (Topk,          (3, 1, True),       ((S, M, S),),               'dim_desc'      ),
    (Topk,          (3, 1, True, True), ((S, M, S),),               'dim_desc_sort' ),
]


method_tests = [
    ('add',         (S, S, S),          ((S, S, S),)                                ),
    ('add',         (S, S, S),          (3.14,),                    'constant'      ),
    ('sub',         (S, S, S),          ((S, S, S),)                                ),
    ('sub',         (S, S, S),          (3.14,),                    'constant'      ),
    ('mul',         (S, S, S),          ((S, S, S),)                                ),
    ('mul',         (S, S, S),          (3.14,),                    'constant'      ),
    ('div',         (S, S, S),          ((S, S, S),)                                ),
    ('div',         (S, S, S),          (3.14,),                    'constant'      ),
    ('pow',         (S, S, S),          ((S, S, S),)                                ),
    ('pow',         (S, S, S),          (3.14,),                    'constant'      ),
    ('transpose',   (1, 2, 3),          (1, 2)                                      ),
    ('t',           (1, 2),             ()                                          ),
    ('view',        (S, S, S),          (S*S, S),                                   ),
    ('view_as',      (S, S, S),          ((S*S, S),)                                ),
    ('expand',      (S, 1, S),          (S, S, S)                                   ),
    ('expand',      (torch.Size([S, 1, S]),), (S, S, S),            'size'          ),
    ('exp',         (S, S, S),          ()                                          ),
    ('log',         (S, S, S),          ()                                          ),
    ('log1p',       (S, S, S),          ()                                          ),
    ('tanh',        (S, S, S),          ()                                          ),
    ('sigmoid',     (S, S, S),          ()                                          ),
    ('sinh',        (S, S, S),          ()                                          ),
    ('cosh',        (S, S, S),          ()                                          ),
    ('abs',         (S, S, S),          ()                                          ),
    ('clamp',       (S, S, S),          (0, 1)                                      ),
    ('sqrt',        (S, S, S),          ()                                          ),
    ('sin',         (S, S, S),          ()                                          ),
    ('cos',         (S, S, S),          ()                                          ),
    ('tan',         (S, S, S),          ()                                          ),
    ('asin',        (S, S, S),          ()                                          ),
    ('acos',        (S, S, S),          ()                                          ),
    ('atan',        (S, S, S),          ()                                          ),
    ('reciprocal',  (S, S, S),          ()                                          ),
    ('round',       (S, S, S),          ()                                          ),
    ('sign',        (S, S, S),          ()                                          ),
    ('trunc',       (S, S, S),          ()                                          ),
    ('floor',       (S, S, S),          ()                                          ),
    ('ceil',        (S, S, S),          ()                                          ),
    ('rsqrt',       (S, S, S),          ()                                          ),
    ('fmod',        (S, S, S),          (1.5,)                                      ),
    ('remainder',   (S, S, S),          (1.5,)                                      ),
    ('lerp',        (S, S, S),          ((S, S, S), 0.4)                            ),
    ('max',         (S, S, S),          ()                                          ),
    ('max',         (S, S, S),          ((S, S, S),),               'elementwise'   ),
    ('min',         (S, S, S),          ()                                          ),
    ('min',         (S, S, S),          ((S, S, S),),               'elementwise'   ),
    ('mean',        (S, S, S),          ()                                          ),
    ('mean',        (S, S, S),          (1,),                       'dim'           ),
    ('sum',         (S, S, S),          ()                                          ),
    ('sum',         (S, S, S),          (1,),                       'dim'           ),
    ('prod',        (S, S, S),          ()                                          ),
    ('prod',        (S, S, S),          (1,),                       'dim'           ),
    ('addmm',       (S, M),             ((S, S), (S, M)),                           ),
    ('addmm',       (S, M),             (0.2, 0.6, (S, S), (S, M)), 'coef'          ),
    ('addbmm',      (S, M),             ((S, S, S), (S, S, M)),                     ),
    ('addbmm',      (S, M),             (0.2, 0.6, (S, S, S), (S, S, M)), 'coef'    ),
    ('baddbmm',     (S, S, M),          ((S, S, S), (S, S, M)),                     ),
    ('baddbmm',     (S, S, M),          (0.2, 0.6, (S, S, S), (S, S, M)), 'coef'    ),
    ('addmv',       (S,),               ((S, M), (M,)),                             ),
    ('addmv',       (S,),               (0.2, 0.6, (S, M), (M,)),   'coef'          ),
    ('addr',        (S, M),             ((S,), (M,)),                               ),
    ('addr',        (S, M),             (0.2, 0.6, (S,), (M,)),     'coef'          ),
    ('dot',         (L,),               ((L,),),                                    ),
    ('addcmul',     (S, S),             ((S, S), (S, S))                            ),
    ('addcmul',     (S, S),             (0.5, (S, S), (S, S)),      'scale'         ),
    ('addcdiv',     (S, S),             ((S, S), (S, S))                            ),
    ('addcdiv',     (S, S),             (0.5, (S, S), (S, S)),      'scale'         ),
    ('norm',        (S, S, S),          (2,)                                        ),
    ('norm',        (S, S, S),          (2, 1),                     'dim'           ),
    ('dist',        (S, S, S),          ((S, S, S),)                                ),
    ('dist',        (S, S, S),          ((S, S, S), 4),             '4'             ),
    ('index_select', (S, S, S),         (0, index_variable(2, S))                   ),
    ('diag',        (M, M),             (),                         '2d'            ),
    ('diag',        (M,),               (),                         '1d'            ),
    ('tril',        (M, M),             ()                                          ),
    ('triu',        (M, M),             ()                                          ),
    ('clone',       (S, M, S),          ()                                          ),
    ('permute',     (1, 2, 3, 4),       (0, 2, 3, 1)                                ),
    ('select',      (S, S, S),          (1, 2)                                      ),
    ('narrow',      (S, S, S),          (1, 2, 2)                                   ),
    ('squeeze',     (S, 1, S, 1),       ()                                          ),
    ('squeeze',     (S, 1, S, 1),       (1,),                       '1_dim'         ),
    ('squeeze',     (S, 1, S, 1),       (2,),                       'not_1_dim'     ),
    ('unsqueeze',   (S, S, S),          (0,),                       'first'         ),
    ('unsqueeze',   (S, S, S),          (1,),                       'middle'        ),
    ('unsqueeze',   (S, S, S),          (3,),                       'last'          ),
    ('masked_select', (M, M),           (Variable(torch.ByteTensor(M, M).bernoulli_(), requires_grad=False),)           ),
    ('masked_fill_',  (M, M),           (Variable(torch.ByteTensor(M, M).bernoulli_(), requires_grad=False), 10)        ),
    ('masked_copy_',  (M, M),           (Variable(torch.ByteTensor(M, M).bernoulli_(), requires_grad=False), (M, M))    ),
]
# TODO: mm, bmm, mv, ger
# TODO: max, min with dim (problem with indices)
# TODO: mode, median, sort, kthvalue, topk (problem with indices)
# TODO: indexAdd, indexCopy, indexFill
# TODO: resize, resize_as (tensors only have resize_ and resize_as_)
# TODO: clamp with min/max


def create_input(call_args):
    if not isinstance(call_args, tuple):
        call_args = (call_args,)
    def map_arg(arg):
        if isinstance(arg, tuple) and not isinstance(arg[0], Variable):
            return Variable(torch.randn(*arg).double(), requires_grad=True)
        elif torch.is_tensor(arg):
            if isinstance(arg, torch.FloatTensor):
                return Variable(arg.double(), requires_grad=True)
            else:
                return Variable(arg, requires_grad=True)
        else:
            return arg
    return tuple(map_arg(arg) for arg in call_args)


def unpack_variables(args):
    if isinstance(args, Variable):
        return args.data
    elif isinstance(args, tuple):
        return tuple(unpack_variables(elem) for elem in args)
    else:
        return args


ignore_inplace = set((
    'test_DivConstant_by_tensor',
))


for test in function_tests:
    cls, constructor_args, call_args = test[:3]
    test_name = 'test_' + cls.__name__ + ('_' + test[3] if len(test) == 4 else '')
    def do_test(self, cls=cls, constructor_args=constructor_args,
            call_args=call_args, test_name=test_name):
        input = create_input(call_args)
        output = cls(*constructor_args)(*input)
        if not isinstance(output, tuple):
            output = (output,)
        for i, o in enumerate(output):
            if not o.requires_grad:
                continue
            analytical = get_analytical_jacobian(input, o)
            def fn(input):
                tmp = cls(*constructor_args)(*input)
                if not isinstance(tmp, tuple):
                    tmp = (tmp,)
                return tmp[i].data
            numerical = get_numerical_jacobian(fn, input, input)
            self.assertLessEqual(
                max(a.add(-1, n).abs().max() for a, n in zip(analytical, numerical)),
                PRECISION
            )

        if test_name not in ignore_inplace and issubclass(cls, InplaceFunction):
            inplace_input = deepcopy(input)
            inplace_input_copy = tuple(i + 0 for i in inplace_input)
            fn = cls(*constructor_args, inplace=True)
            inplace_output = fn(*inplace_input_copy)
            if not isinstance(inplace_output, tuple):
                inplace_output = (inplace_output,)
            self.assertEqual(inplace_output, output)
            # Check that gradient is the same
            for inp_i, i in zip(inplace_input, input):
                if inp_i.grad is not None:
                    inp_i.grad.data.zero_()
                if i.grad is not None:
                    i.grad.data.zero_()
            for io, o in zip(inplace_output, output):
                grad = torch.randn(*io.size()).double()
                io.backward(grad)
                o.backward(grad)
            for inp_i, i in zip(inplace_input, input):
                self.assertEqual(inp_i.grad, i.grad)

    assert not hasattr(TestAutograd, test_name), 'Two tests have the same name: ' + test_name
    setattr(TestAutograd, test_name, do_test)


EXCLUDE_FUNCTIONAL = {
    'addmm',
    'addbmm',
    'baddbmm',
    'addmv',
    'addr',
}
for test in method_tests:
    name, self_size, args = test[:3]
    test_name = 'test_' + name + ('_' + test[3] if len(test) == 4 else '')
    def do_test(self, name=name, self_size=self_size, args=args, test_name=test_name):
        def check(name):
            self_variable = create_input((self_size,))[0]
            args_variable = create_input(args)
            self_tensor = deepcopy(self_variable.data)
            args_tensor = deepcopy(unpack_variables(args_variable))
            output_variable = getattr(self_variable, name)(*args_variable)
            output_tensor = getattr(self_tensor, name)(*args_tensor)
            if not torch.is_tensor(output_tensor) and not isinstance(output_tensor, tuple):
                output_tensor = torch.DoubleTensor((output_tensor,))
            self.assertEqual(unpack_variables(output_variable), output_tensor)
            # TODO: check that both have changed after adding all inplace ops

            # functional interface tests
            if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL:
                f_args_variable = (self_variable,) + args_variable
                f_args_tensor = (self_tensor,) + args_tensor
                output_variable = getattr(torch, name)(*f_args_variable)
                output_tensor = getattr(torch, name)(*f_args_tensor)
                if not torch.is_tensor(output_tensor) and not isinstance(output_tensor, tuple):
                    output_tensor = torch.DoubleTensor((output_tensor,))
                self.assertEqual(unpack_variables(output_variable), output_tensor)

        check(name)
        inplace_name = name + '_'
        if hasattr(Variable(torch.ones(1)), inplace_name):
            try:
                check(inplace_name)
            except Exception as e:
                if not 'only supports scalar' in e.args[0]:
                    raise


    assert not hasattr(TestAutograd, test_name), 'Two tests have the same name: ' + test_name
    setattr(TestAutograd, test_name, do_test)


if __name__ == '__main__':
    unittest.main()