diff --git a/advanced_source/c_extension.rst b/advanced_source/c_extension.rst index d637dbbd816..87962a03097 100644 --- a/advanced_source/c_extension.rst +++ b/advanced_source/c_extension.rst @@ -111,7 +111,6 @@ import your extension just like a regular python file. # main.py import torch import torch.nn as nn - from torch.autograd import Variable from modules.add import MyAddModule class MyNetwork(nn.Module): @@ -123,7 +122,7 @@ import your extension just like a regular python file. return self.add(input1, input2) model = MyNetwork() - input1, input2 = Variable(torch.randn(5, 5)), Variable(torch.randn(5, 5)) + input1, input2 = torch.randn(5, 5), torch.randn(5, 5) print(model(input1, input2)) print(input1 + input2) diff --git a/advanced_source/neural_style_tutorial.py b/advanced_source/neural_style_tutorial.py index 203e4ba9b5f..d12b13861c1 100644 --- a/advanced_source/neural_style_tutorial.py +++ b/advanced_source/neural_style_tutorial.py @@ -72,7 +72,7 @@ Then, if :math:`Y` is another image *of any size*, we define the distance of style at layer :math:`L` as follow: -.. math:: +.. math:: D_S^L(X,Y) = \|G_{XL} - G_{YL}\|^2 = \sum_{k,l} (G_{XL}(k,l) - G_{YL}(k,l))^2 @@ -97,7 +97,7 @@ Ok. That's enough with maths. If you want to go deeper (how to compute the gradients) **we encourage you to read the original paper** by Leon A. Gatys and AL, where everything is much better and much clearer -explained. +explained. For our implementation in PyTorch, we already have everything we need: indeed, with PyTorch, all the gradients are automatically and @@ -120,8 +120,6 @@ - ``torch``, ``torch.nn``, ``numpy`` (indispensables packages for neural networks with PyTorch) -- ``torch.autograd.Variable`` (dynamic computation of the gradient wrt - a variable) - ``torch.optim`` (efficient gradient descents) - ``PIL``, ``PIL.Image``, ``matplotlib.pyplot`` (load and display images) @@ -135,7 +133,7 @@ import torch import torch.nn as nn -from torch.autograd import Variable +import torch.nn.functional as F import torch.optim as optim from PIL import Image @@ -154,16 +152,13 @@ # If you have a GPU on your computer, it is preferable to run the # algorithm on it, especially if you want to try larger networks (like # VGG). For this, we have ``torch.cuda.is_available()`` that returns -# ``True`` if you computer has an available GPU. Then, we can use method -# ``.cuda()`` that moves allocated proccesses associated with a module -# from the CPU to the GPU. When we want to move back this module to the -# CPU (e.g. to use numpy), we use the ``.cpu()`` method. Finally, -# ``.type(dtype)`` will be use to convert a ``torch.FloatTensor`` into -# ``torch.cuda.FloatTensor`` to feed GPU processes. -# +# ``True`` if you computer has an available GPU. Then, we can set the +# ``torch.device`` that will be used in this script. Then, we will use +# the method ``.to(device)`` that a tensor or a module to the desired +# device. When we want to move back this tensor or module to the +# CPU (e.g. to use numpy), we can use the ``.cpu()`` method. 
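# For example, a minimal sketch of the pattern used throughout this
# tutorial (``t`` and ``layer`` below are only placeholder names)::
#
#    t = torch.randn(3, 3).to(device)     # move a tensor onto the chosen device
#    layer = nn.Linear(3, 3).to(device)   # move a module's parameters and buffers
#    t_cpu = t.cpu()                      # back to the CPU, e.g. before calling .numpy()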
-use_cuda = torch.cuda.is_available() -dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") ###################################################################### @@ -185,23 +180,22 @@ # desired size of the output image -imsize = 512 if use_cuda else 128 # use small size if no gpu +imsize = 512 if torch.cuda.is_available() else 128 # use small size if no gpu loader = transforms.Compose([ - transforms.Scale(imsize), # scale imported image + transforms.Resize(imsize), # scale imported image transforms.ToTensor()]) # transform it into a torch tensor def image_loader(image_name): image = Image.open(image_name) - image = Variable(loader(image)) # fake batch dimension required to fit network's input dimensions - image = image.unsqueeze(0) - return image + image = loader(image).unsqueeze(0) + return image.to(device, torch.float) -style_img = image_loader("images/picasso.jpg").type(dtype) -content_img = image_loader("images/dancing.jpg").type(dtype) +style_img = image_loader("images/picasso.jpg") +content_img = image_loader("images/dancing.jpg") assert style_img.size() == content_img.size(), \ "we need to import style and content images of the same size" @@ -228,8 +222,8 @@ def image_loader(image_name): plt.ion() def imshow(tensor, title=None): - image = tensor.clone().cpu() # we clone the tensor to not do changes on it - image = image.view(3, imsize, imsize) # remove the fake batch dimension + image = tensor.cpu().clone() # we clone the tensor to not do changes on it + image = image.squeeze(0) # remove the fake batch dimension image = unloader(image) plt.imshow(image) if title is not None: @@ -238,10 +232,10 @@ def imshow(tensor, title=None): plt.figure() -imshow(style_img.data, title='Style Image') +imshow(style_img, title='Style Image') plt.figure() -imshow(content_img.data, title='Content Image') +imshow(content_img, title='Content Image') ###################################################################### @@ -276,24 +270,17 @@ def imshow(tensor, title=None): class ContentLoss(nn.Module): - def __init__(self, target, weight): + def __init__(self, target,): super(ContentLoss, self).__init__() # we 'detach' the target content from the tree used - self.target = target.detach() * weight # to dynamically compute the gradient: this is a stated value, # not a variable. Otherwise the forward method of the criterion # will throw an error. - self.weight = weight - self.criterion = nn.MSELoss() + self.target = target.detach() def forward(self, input): - self.loss = self.criterion(input * self.weight, self.target) - self.output = input - return self.output - - def backward(self, retain_graph=True): - self.loss.backward(retain_graph=retain_graph) - return self.loss + self.loss = F.mse_loss(input, self.target) + return input ###################################################################### @@ -319,53 +306,42 @@ def backward(self, retain_graph=True): # becomes easy to implement our module: # -class GramMatrix(nn.Module): - - def forward(self, input): - a, b, c, d = input.size() # a=batch size(=1) - # b=number of feature maps - # (c,d)=dimensions of a f. map (N=c*d) +def gram_matrix(input): + a, b, c, d = input.size() # a=batch size(=1) + # b=number of feature maps + # (c,d)=dimensions of a f. 
map (N=c*d) - features = input.view(a * b, c * d) # resise F_XL into \hat F_XL + features = input.view(a * b, c * d) # resise F_XL into \hat F_XL - G = torch.mm(features, features.t()) # compute the gram product + G = torch.mm(features, features.t()) # compute the gram product - # we 'normalize' the values of the gram matrix - # by dividing by the number of element in each feature maps. - return G.div(a * b * c * d) + # we 'normalize' the values of the gram matrix + # by dividing by the number of element in each feature maps. + return G.div(a * b * c * d) ###################################################################### # The longer is the feature maps dimension :math:`N`, the bigger are the -# values of the gram matrix. Therefore, if we don't normalize by :math:`N`, +# values of the Gram matrix. Therefore, if we don't normalize by :math:`N`, # the loss computed at the first layers (before pooling layers) will have # much more importance during the gradient descent. We dont want that, # since the most interesting style features are in the deepest layers! # # Then, the style loss module is implemented exactly the same way than the -# content loss module, but we have to add the ``gramMatrix`` as a -# parameter: +# content loss module, but it compares the difference in Gram matrices of target +# and input # class StyleLoss(nn.Module): - def __init__(self, target, weight): + def __init__(self, target_feature): super(StyleLoss, self).__init__() - self.target = target.detach() * weight - self.weight = weight - self.gram = GramMatrix() - self.criterion = nn.MSELoss() + self.target = gram_matrix(target_feature).detach() def forward(self, input): - self.output = input.clone() - self.G = self.gram(input) - self.G.mul_(self.weight) - self.loss = self.criterion(self.G, self.target) - return self.output - - def backward(self, retain_graph=True): - self.loss.backward(retain_graph=retain_graph) - return self.loss + G = gram_matrix(input) + self.loss = F.mse_loss(G, self.target) + return input ###################################################################### @@ -379,22 +355,45 @@ def backward(self, retain_graph=True): # ``Sequential`` modules: ``features`` (containing convolution and pooling # layers) and ``classifier`` (containing fully connected layers). We are # just interested by ``features``: +# Some layers have different behavior in training and in evaluation. Since we +# are using it as a feature extractor. We will use ``.eval()`` to set the +# network in evaluation mode. # -cnn = models.vgg19(pretrained=True).features +cnn = models.vgg19(pretrained=True).features.to(device).eval() + +###################################################################### +# Additionally, VGG networks are trained on images with each channel normalized +# by mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225]. We will use them +# to normalize the image before sending into the network. +# -# move it to the GPU if possible: -if use_cuda: - cnn = cnn.cuda() +cnn_normalization_mean = torch.tensor([0.485, 0.456, 0.406]).to(device) +cnn_normalization_std = torch.tensor([0.229, 0.224, 0.225]).to(device) + +# create a module to normalize input image so we can easily put it in a +# nn.Sequential +class Normalization(nn.Module): + def __init__(self, mean, std): + super(Normalization, self).__init__() + # .view the mean and std to make them [C x 1 x 1] so that they can + # directly work with image Tensor of shape [B x C x H x W]. + # B is batch size. C is number of channels. H is height and W is width. 
+ self.mean = torch.tensor(mean).view(-1, 1, 1) + self.std = torch.tensor(std).view(-1, 1, 1) + + def forward(self, img): + # normalize img + return (img - self.mean) / self.std ###################################################################### # A ``Sequential`` module contains an ordered list of child modules. For # instance, ``vgg19.features`` contains a sequence (Conv2d, ReLU, -# Maxpool2d, Conv2d, ReLU...) aligned in the right order of depth. As we +# MaxPool2d, Conv2d, ReLU...) aligned in the right order of depth. As we # said in *Content loss* section, we wand to add our style and content # loss modules as additive 'transparent' layers in our network, at desired -# depths. For that, we construct a new ``Sequential`` module, in wich we +# depths. For that, we construct a new ``Sequential`` module, in which we # are going to add modules from ``vgg19`` and our loss modules in the # right order: # @@ -403,71 +402,64 @@ def backward(self, retain_graph=True): content_layers_default = ['conv_4'] style_layers_default = ['conv_1', 'conv_2', 'conv_3', 'conv_4', 'conv_5'] - -def get_style_model_and_losses(cnn, style_img, content_img, - style_weight=1000, content_weight=1, +def get_style_model_and_losses(cnn, normalization_mean, normalization_std, + style_img, content_img, content_layers=content_layers_default, style_layers=style_layers_default): cnn = copy.deepcopy(cnn) + # normalization module + normalization = Normalization(normalization_mean, normalization_std).to(device) + # just in order to have an iterable access to or list of content/syle # losses content_losses = [] style_losses = [] - model = nn.Sequential() # the new Sequential module network - gram = GramMatrix() # we need a gram module in order to compute style targets + # assuming that cnn is a nn.Sequential, so we make a new nn.Sequential + # to put in modules that are supposed to be activated sequentially + model = nn.Sequential(normalization) - # move these modules to the GPU if possible: - if use_cuda: - model = model.cuda() - gram = gram.cuda() - - i = 1 - for layer in list(cnn): + i = 0 # increment every time we see a conv + for layer in cnn.children(): if isinstance(layer, nn.Conv2d): - name = "conv_" + str(i) - model.add_module(name, layer) - - if name in content_layers: - # add content loss: - target = model(content_img).clone() - content_loss = ContentLoss(target, content_weight) - model.add_module("content_loss_" + str(i), content_loss) - content_losses.append(content_loss) - - if name in style_layers: - # add style loss: - target_feature = model(style_img).clone() - target_feature_gram = gram(target_feature) - style_loss = StyleLoss(target_feature_gram, style_weight) - model.add_module("style_loss_" + str(i), style_loss) - style_losses.append(style_loss) - - if isinstance(layer, nn.ReLU): - name = "relu_" + str(i) - model.add_module(name, layer) - - if name in content_layers: - # add content loss: - target = model(content_img).clone() - content_loss = ContentLoss(target, content_weight) - model.add_module("content_loss_" + str(i), content_loss) - content_losses.append(content_loss) - - if name in style_layers: - # add style loss: - target_feature = model(style_img).clone() - target_feature_gram = gram(target_feature) - style_loss = StyleLoss(target_feature_gram, style_weight) - model.add_module("style_loss_" + str(i), style_loss) - style_losses.append(style_loss) - i += 1 - - if isinstance(layer, nn.MaxPool2d): - name = "pool_" + str(i) - model.add_module(name, layer) # *** + name = 'conv_{}'.format(i) + elif 
isinstance(layer, nn.ReLU): + name = 'relu_{}'.format(i) + # The in-place version doesn't play very nicely with the ContentLoss + # and StyleLoss we insert below. So we replace with out-of-place + # ones here. + layer = nn.ReLU(inplace=False) + elif isinstance(layer, nn.MaxPool2d): + name = 'pool_{}'.format(i) + elif isinstance(layer, nn.BatchNorm2d): + name = 'bn_{}'.format(i) + else: + raise RuntimeError('Unrecognized layer: {}'.format(layer.__class__.__name__)) + + model.add_module(name, layer) + + if name in content_layers: + # add content loss: + target = model(content_img).detach() + content_loss = ContentLoss(target) + model.add_module("content_loss_{}".format(i), content_loss) + content_losses.append(content_loss) + + if name in style_layers: + # add style loss: + target_feature = model(style_img).detach() + style_loss = StyleLoss(target_feature) + model.add_module("style_loss_{}".format(i), style_loss) + style_losses.append(style_loss) + + # now we trim off the layers after the last content and style losses + for i in range(len(model) - 1, -1, -1): + if isinstance(model[i], ContentLoss) or isinstance(model[i], StyleLoss): + break + + model = model[:(i + 1)] return model, style_losses, content_losses @@ -498,11 +490,11 @@ def get_style_model_and_losses(cnn, style_img, content_img, input_img = content_img.clone() # if you want to use a white noise instead uncomment the below line: -# input_img = Variable(torch.randn(content_img.data.size())).type(dtype) +# input_img = torch.randn(content_img.data.size(), device=device) # add the original input image to the figure: plt.figure() -imshow(input_img.data, title='Input Image') +imshow(input_img, title='Input Image') ###################################################################### @@ -514,21 +506,15 @@ def get_style_model_and_losses(cnn, style_img, content_img, # we will use L-BFGS algorithm to run our gradient descent. Unlike # training a network, we want to train the input image in order to # minimise the content/style losses. We would like to simply create a -# PyTorch L-BFGS optimizer, passing our image as the variable to optimize. -# But ``optim.LBFGS`` takes as first argument a list of PyTorch -# ``Variable`` that require gradient. Our input image is a ``Variable`` -# but is not a leaf of the tree that requires computation of gradients. In -# order to show that this variable requires a gradient, a possibility is -# to construct a ``Parameter`` object from the input image. Then, we just -# give a list containing this ``Parameter`` to the optimizer's -# constructor: +# PyTorch L-BFGS optimizer ``optim.LBFGS``, passing our image as the +# Tensor to optimize. We use ``.requires_grad_()`` to make sure that this +# image requires gradient. # -def get_input_param_optimizer(input_img): +def get_input_optimizer(input_img): # this line to show that input is a parameter that requires a gradient - input_param = nn.Parameter(input_img.data) - optimizer = optim.LBFGS([input_param]) - return input_param, optimizer + optimizer = optim.LBFGS([input_img.requires_grad_()]) + return optimizer ###################################################################### @@ -548,13 +534,14 @@ def get_input_param_optimizer(input_img): # the 0-1 interval. 
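# Note that ``optim.LBFGS`` re-evaluates the objective several times per
# step, so ``optimizer.step`` must be given a closure that zeroes the
# gradients, recomputes the losses, calls ``backward()`` and returns the
# total loss. Roughly, a sketch of the pattern used in the function below::
#
#    def closure():
#        optimizer.zero_grad()
#        loss = ...  # weighted sum of the style and content losses
#        loss.backward()
#        return loss
#
#    optimizer.step(closure)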
# -def run_style_transfer(cnn, content_img, style_img, input_img, num_steps=300, - style_weight=1000, content_weight=1): +def run_style_transfer(cnn, normalization_mean, normalization_std, + content_img, style_img, input_img, num_steps=300, + style_weight=1000000, content_weight=1): """Run the style transfer.""" print('Building the style transfer model..') model, style_losses, content_losses = get_style_model_and_losses(cnn, - style_img, content_img, style_weight, content_weight) - input_param, optimizer = get_input_param_optimizer(input_img) + normalization_mean, normalization_std, style_img, content_img) + optimizer = get_input_optimizer(input_img) print('Optimizing..') run = [0] @@ -562,23 +549,25 @@ def run_style_transfer(cnn, content_img, style_img, input_img, num_steps=300, def closure(): # correct the values of updated input image - input_param.data.clamp_(0, 1) + input_img.data.clamp_(0, 1) optimizer.zero_grad() - model(input_param) + model(input_img) style_score = 0 content_score = 0 for sl in style_losses: - style_score += sl.backward() + style_score += sl.loss for cl in content_losses: - content_score += cl.backward() + content_score += cl.loss + + (style_score * style_weight + content_score * content_weight).backward() run[0] += 1 if run[0] % 50 == 0: print("run {}:".format(run)) print('Style Loss : {:4f} Content Loss: {:4f}'.format( - style_score.data[0], content_score.data[0])) + style_score.item(), content_score.item())) print() return style_score + content_score @@ -586,14 +575,15 @@ def closure(): optimizer.step(closure) # a last correction... - input_param.data.clamp_(0, 1) + input_img.data.clamp_(0, 1) - return input_param.data + return input_img ###################################################################### # Finally, run the algorithm -output = run_style_transfer(cnn, content_img, style_img, input_img) +output = run_style_transfer(cnn, cnn_normalization_mean, cnn_normalization_std, + content_img, style_img, input_img) plt.figure() imshow(output, title='Output Image') diff --git a/advanced_source/numpy_extensions_tutorial.py b/advanced_source/numpy_extensions_tutorial.py index e14dec66a19..e6bca8b0dd4 100644 --- a/advanced_source/numpy_extensions_tutorial.py +++ b/advanced_source/numpy_extensions_tutorial.py @@ -17,7 +17,6 @@ import torch from torch.autograd import Function -from torch.autograd import Variable ############################################################### # Parameter-less example @@ -36,7 +35,7 @@ class BadFFTFunction(Function): def forward(self, input): - numpy_input = input.numpy() + numpy_input = input.detach().numpy() result = abs(rfft2(numpy_input)) return input.new(result) @@ -55,11 +54,11 @@ def incorrect_fft(input): ############################################################### # **Example usage of the created layer:** -input = Variable(torch.randn(8, 8), requires_grad=True) +input = torch.randn(8, 8, requires_grad=True) result = incorrect_fft(input) -print(result.data) +print(result) result.backward(torch.randn(result.size())) -print(input.grad) +print(input) ############################################################### # Parametrized example @@ -88,19 +87,19 @@ def incorrect_fft(input): class ScipyConv2dFunction(Function): @staticmethod def forward(ctx, input, filter): - result = correlate2d(input.numpy(), filter.numpy(), mode='valid') + input, filter = input.detach(), filter.detach() # detach so we can cast to NumPy + result = correlate2d(input.numpy(), filter.detach().numpy(), mode='valid') ctx.save_for_backward(input, filter) 
return input.new(result) @staticmethod def backward(ctx, grad_output): + grad_output = grad_output.detach() input, filter = ctx.saved_tensors - grad_output = grad_output.data grad_input = convolve2d(grad_output.numpy(), filter.t().numpy(), mode='full') grad_filter = convolve2d(input.numpy(), grad_output.numpy(), mode='valid') - return Variable(grad_output.new(grad_input)), \ - Variable(grad_output.new(grad_filter)) + return grad_output.new_tensor(grad_input), grad_output.new_tensor(grad_filter) class ScipyConv2d(Module): @@ -117,7 +116,7 @@ def forward(self, input): module = ScipyConv2d(3, 3) print(list(module.parameters())) -input = Variable(torch.randn(10, 10), requires_grad=True) +input = torch.randn(10, 10, requires_grad=True) output = module(input) print(output) output.backward(torch.randn(8, 8)) diff --git a/advanced_source/super_resolution_with_caffe2.py b/advanced_source/super_resolution_with_caffe2.py index 6b96121c9ea..81eede93360 100644 --- a/advanced_source/super_resolution_with_caffe2.py +++ b/advanced_source/super_resolution_with_caffe2.py @@ -23,7 +23,6 @@ import numpy as np from torch import nn -from torch.autograd import Variable import torch.utils.model_zoo as model_zoo import torch.onnx @@ -65,10 +64,10 @@ def forward(self, x): return x def _initialize_weights(self): - init.orthogonal(self.conv1.weight, init.calculate_gain('relu')) - init.orthogonal(self.conv2.weight, init.calculate_gain('relu')) - init.orthogonal(self.conv3.weight, init.calculate_gain('relu')) - init.orthogonal(self.conv4.weight) + init.orthogonal_(self.conv1.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv2.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv3.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv4.weight) # Create the super-resolution model by using the above model definition. torch_model = SuperResolutionNet(upscale_factor=3) @@ -108,7 +107,7 @@ def _initialize_weights(self): # # Input to the model -x = Variable(torch.randn(batch_size, 1, 224, 224), requires_grad=True) +x = torch.randn(batch_size, 1, 224, 224, requires_grad=True) # Export the model torch_out = torch.onnx._export(torch_model, # model being run diff --git a/beginner_source/blitz/autograd_tutorial.py b/beginner_source/blitz/autograd_tutorial.py index 776b462fc92..ae69df3168c 100644 --- a/beginner_source/blitz/autograd_tutorial.py +++ b/beginner_source/blitz/autograd_tutorial.py @@ -15,48 +15,49 @@ Let us see this in more simple terms with some examples. -Variable +Tensor -------- -``autograd.Variable`` is the central class of the package. It wraps a -Tensor, and supports nearly all of operations defined on it. Once you -finish your computation you can call ``.backward()`` and have all the -gradients computed automatically. +``torch.Tensor`` is the central class of the package. If you set its attribute +``.requires_grad`` as ``True``, it starts to track all operations on it. When +you finish your computation you can call ``.backward()`` and have all the +gradients computed automatically. The gradient for this tensor will be +accumulated into ``.grad`` attribute. -You can access the raw tensor through the ``.data`` attribute, while the -gradient w.r.t. this variable is accumulated into ``.grad``. +To stop a tensor from tracking history, you can call ``.detach()`` to detach +it from the computation history, and to prevent future computation from being +tracked. -.. 
figure:: /_static/img/Variable.png - :alt: Variable - - Variable +To prevent tracking history (and using memory), you can also wrap the code block +in ``with torch.no_grad():``. This can be particularly helpful when evaluating a +model because the model may have trainable parameters with `requires_grad=True`, +but we don't need the gradients. There’s one more class which is very important for autograd implementation - a ``Function``. -``Variable`` and ``Function`` are interconnected and build up an acyclic +``Tensor`` and ``Function`` are interconnected and build up an acyclic graph, that encodes a complete history of computation. Each variable has a ``.grad_fn`` attribute that references a ``Function`` that has created -the ``Variable`` (except for Variables created by the user - their +the ``Tensor`` (except for Tensors created by the user - their ``grad_fn is None``). If you want to compute the derivatives, you can call ``.backward()`` on -a ``Variable``. If ``Variable`` is a scalar (i.e. it holds a one element +a ``Tensor``. If ``Tensor`` is a scalar (i.e. it holds a one element data), you don’t need to specify any arguments to ``backward()``, however if it has more elements, you need to specify a ``gradient`` argument that is a tensor of matching shape. """ import torch -from torch.autograd import Variable ############################################################### -# Create a variable: -x = Variable(torch.ones(2, 2), requires_grad=True) +# Create a tensor and set requires_grad=True to track computation with it +x = torch.ones(2, 2, requires_grad=True) print(x) ############################################################### -# Do an operation of variable: +# Do an operation of tensor: y = x + 2 print(y) @@ -71,11 +72,23 @@ print(z, out) +################################################################ +# ``.requires_grad_( ... )`` changes an existing Tensor's ``requires_grad`` +# flag in-place. The input flag defaults to ``True`` if not given. +a = torch.randn(2, 2) +a = ((a * 3) / (a - 1)) +print(a.requires_grad) +a.requires_grad_(True) +print(a.requires_grad) +b = (a * a).sum() +print(b.grad_fn) + ############################################################### # Gradients # --------- -# let's backprop now -# ``out.backward()`` is equivalent to doing ``out.backward(torch.Tensor([1.0]))`` +# Let's backprop now +# Because ``out`` contains a single scalar, ``out.backward()`` is +# equivalent to ``out.backward(torch.tensor(1))``. out.backward() @@ -87,7 +100,7 @@ ############################################################### # You should have got a matrix of ``4.5``. Let’s call the ``out`` -# *Variable* “:math:`o`”. +# *Tensor* “:math:`o`”. # We have that :math:`o = \frac{1}{4}\sum_i z_i`, # :math:`z_i = 3(x_i+2)^2` and :math:`z_i\bigr\rvert_{x_i=1} = 27`. # Therefore, @@ -98,8 +111,7 @@ # You can do many crazy things with autograd! 
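# A note on the next example: there ``y`` is not a scalar, so
# ``y.backward(v)`` computes a vector-Jacobian product rather than a full
# Jacobian, and ``v`` must have the same shape as ``y``. Because that example
# only ever doubles ``x`` (so ``y = 2^k * x`` for some ``k``), the Jacobian is
# diagonal and ``x.grad`` works out to ``2^k * v``.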
-x = torch.randn(3) -x = Variable(x, requires_grad=True) +x = torch.randn(3, requires_grad=True) y = x * 2 while y.data.norm() < 1000: @@ -109,13 +121,23 @@ ############################################################### # -gradients = torch.FloatTensor([0.1, 1.0, 0.0001]) +gradients = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float) y.backward(gradients) print(x.grad) +############################################################### +# You can also stops autograd from tracking history on Tensors +# with requires_grad=True by wrapping the code block in +# ``with torch.no_grad():`` +print(x.requires_grad) +print((x ** 2).requires_grad) + +with torch.no_grad(): + print((x ** 2).requires_grad) + ############################################################### # **Read Later:** # -# Documentation of ``Variable`` and ``Function`` is at +# Documentation of ``autograd`` and ``Function`` is at # http://pytorch.org/docs/autograd diff --git a/beginner_source/blitz/cifar10_tutorial.py b/beginner_source/blitz/cifar10_tutorial.py index a7e0d67f605..74512d44499 100644 --- a/beginner_source/blitz/cifar10_tutorial.py +++ b/beginner_source/blitz/cifar10_tutorial.py @@ -111,7 +111,6 @@ def imshow(img): # Copy the neural network from the Neural Networks section before and modify it to # take 3-channel images (instead of 1-channel images as it was defined). -from torch.autograd import Variable import torch.nn as nn import torch.nn.functional as F @@ -163,9 +162,6 @@ def forward(self, x): # get the inputs inputs, labels = data - # wrap them in Variable - inputs, labels = Variable(inputs), Variable(labels) - # zero the parameter gradients optimizer.zero_grad() @@ -176,7 +172,7 @@ def forward(self, x): optimizer.step() # print statistics - running_loss += loss.data[0] + running_loss += loss.item() if i % 2000 == 1999: # print every 2000 mini-batches print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000)) @@ -207,14 +203,14 @@ def forward(self, x): ######################################################################## # Okay, now let us see what the neural network thinks these examples above are: -outputs = net(Variable(images)) +outputs = net(images) ######################################################################## # The outputs are energies for the 10 classes. # Higher the energy for a class, the more the network # thinks that the image is of the particular class. # So, let's get the index of the highest energy: -_, predicted = torch.max(outputs.data, 1) +_, predicted = torch.max(outputs, 1) print('Predicted: ', ' '.join('%5s' % classes[predicted[j]] for j in range(4))) @@ -226,12 +222,13 @@ def forward(self, x): correct = 0 total = 0 -for data in testloader: - images, labels = data - outputs = net(Variable(images)) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum() +with torch.no_grad(): + for data in testloader: + images, labels = data + outputs = net(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() print('Accuracy of the network on the 10000 test images: %d %%' % ( 100 * correct / total)) @@ -246,15 +243,16 @@ def forward(self, x): class_correct = list(0. for i in range(10)) class_total = list(0. 
for i in range(10)) -for data in testloader: - images, labels = data - outputs = net(Variable(images)) - _, predicted = torch.max(outputs.data, 1) - c = (predicted == labels).squeeze() - for i in range(4): - label = labels[i] - class_correct[label] += c[i] - class_total[label] += 1 +with torch.no_grad(): + for data in testloader: + images, labels = data + outputs = net(images) + _, predicted = torch.max(outputs, 1) + c = (predicted == labels).squeeze() + for i in range(4): + label = labels[i] + class_correct[label] += c[i].item() + class_total[label] += 1 for i in range(10): @@ -270,20 +268,32 @@ def forward(self, x): # ---------------- # Just like how you transfer a Tensor on to the GPU, you transfer the neural # net onto the GPU. -# This will recursively go over all modules and convert their parameters and -# buffers to CUDA tensors: +# +# Let's first define our device as the first visible cuda device if we have +# CUDA available: + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +# Assume that we are on a CUDA machine, then this should print a CUDA device: + +print(device) + +# The rest of this section assumes that `device` is a CUDA device. +# +# Then these methods will recursively go over all modules and convert their +# parameters and buffers to CUDA tensors: # # .. code:: python # -# net.cuda() +# net.to(device) # # # Remember that you will have to send the inputs and targets at every step # to the GPU too: # -# :: +# .. code:: python # -# inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda()) +# inputs, labels = inputs.to(device), labels.to(device) # # Why dont I notice MASSIVE speedup compared to CPU? Because your network # is realllly small. diff --git a/beginner_source/blitz/data_parallel_tutorial.py b/beginner_source/blitz/data_parallel_tutorial.py index 0174f152068..2fb8bff92f8 100644 --- a/beginner_source/blitz/data_parallel_tutorial.py +++ b/beginner_source/blitz/data_parallel_tutorial.py @@ -9,20 +9,22 @@ .. code:: python - model.gpu() + device = torch.device("cuda:0") + model.to(device) Then, you can copy all your tensors to the GPU: .. code:: python - mytensor = my_tensor.gpu() + mytensor = my_tensor.to(device) -Please note that just calling ``mytensor.gpu()`` won't copy the tensor -to the GPU. You need to assign it to a new tensor and use that tensor on the GPU. +Please note that just calling ``mytensor.to(device)`` returns a new copy of +``mytensor`` on GPU instead of rewriting ``mytensor``. You need to assign it to +a new variable and use that tensor on the GPU. -It's natural to execute your forward, backward propagations on multiple GPUs. -However, Pytorch will only use one GPU by default. You can easily run your -operations on multiple GPUs by making your model run parallelly using +It's natural to execute your forward, backward propagations on multiple GPUs. +However, Pytorch will only use one GPU by default. You can easily run your +operations on multiple GPUs by making your model run parallelly using ``DataParallel``: .. code:: python @@ -36,13 +38,12 @@ ###################################################################### # Imports and parameters # ---------------------- -# +# # Import PyTorch modules and define parameters. 
-# +# import torch import torch.nn as nn -from torch.autograd import Variable from torch.utils.data import Dataset, DataLoader # Parameters and DataLoaders @@ -53,12 +54,17 @@ data_size = 100 +###################################################################### +# Device +# +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + ###################################################################### # Dummy DataSet # ------------- -# +# # Make a dummy (random) dataset. You just need to implement the -# getitem +# getitem # class RandomDataset(Dataset): @@ -80,15 +86,15 @@ def __len__(self): ###################################################################### # Simple Model # ------------ -# -# For the demo, our model just gets an input, performs a linear operation, and +# +# For the demo, our model just gets an input, performs a linear operation, and # gives an output. However, you can use ``DataParallel`` on any model (CNN, RNN, -# Capsule Net etc.) +# Capsule Net etc.) # # We've placed a print statement inside the model to monitor the size of input -# and output tensors. +# and output tensors. # Please pay attention to what is printed at batch rank 0. -# +# class Model(nn.Module): # Our model @@ -99,7 +105,7 @@ def __init__(self, input_size, output_size): def forward(self, input): output = self.fc(input) - print(" In Model: input size", input.size(), + print("\tIn Model: input size", input.size(), "output size", output.size()) return output @@ -108,12 +114,12 @@ def forward(self, input): ###################################################################### # Create Model and DataParallel # ----------------------------- -# +# # This is the core part of the tutorial. First, we need to make a model instance -# and check if we have multiple GPUs. If we have multiple GPUs, we can wrap +# and check if we have multiple GPUs. If we have multiple GPUs, we can wrap # our model using ``nn.DataParallel``. Then we can put our model on GPUs by -# ``model.gpu()`` -# +# ``model.to(device)`` +# model = Model(input_size, output_size) if torch.cuda.device_count() > 1: @@ -121,42 +127,37 @@ def forward(self, input): # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs model = nn.DataParallel(model) -if torch.cuda.is_available(): - model.cuda() +model.to(device) ###################################################################### # Run the Model # ------------- -# +# # Now we can see the sizes of input and output tensors. -# +# for data in rand_loader: - if torch.cuda.is_available(): - input_var = Variable(data.cuda()) - else: - input_var = Variable(data) - - output = model(input_var) - print("Outside: input size", input_var.size(), + input = data.to(device) + output = model(input) + print("Outside: input size", input.size(), "output_size", output.size()) ###################################################################### # Results # ------- -# +# # When we batch 30 inputs and 30 outputs, the model gets 30 and outputs 30 as # expected. But if you have GPUs, then you can get results like this. -# +# # 2 GPUs # ~~~~~~ # # If you have 2, you will see: -# +# # .. code:: bash -# +# # # on 2 GPUs # Let's use 2 GPUs! 
# In Model: input size torch.Size([15, 5]) output size torch.Size([15, 2]) @@ -171,14 +172,14 @@ def forward(self, input): # In Model: input size torch.Size([5, 5]) output size torch.Size([5, 2]) # In Model: input size torch.Size([5, 5]) output size torch.Size([5, 2]) # Outside: input size torch.Size([10, 5]) output_size torch.Size([10, 2]) -# +# # 3 GPUs # ~~~~~~ -# +# # If you have 3 GPUs, you will see: -# +# # .. code:: bash -# +# # Let's use 3 GPUs! # In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2]) # In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2]) @@ -196,14 +197,14 @@ def forward(self, input): # In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2]) # In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2]) # Outside: input size torch.Size([10, 5]) output_size torch.Size([10, 2]) -# +# # 8 GPUs # ~~~~~~~~~~~~~~ -# +# # If you have 8, you will see: -# +# # .. code:: bash -# +# # Let's use 8 GPUs! # In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2]) # In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2]) @@ -238,17 +239,17 @@ def forward(self, input): # In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2]) # In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2]) # Outside: input size torch.Size([10, 5]) output_size torch.Size([10, 2]) -# +# ###################################################################### # Summary # ------- -# +# # DataParallel splits your data automatically and sends job orders to multiple # models on several GPUs. After each model finishes their job, DataParallel # collects and merges the results before returning it to you. -# +# # For more information, please check out # http://pytorch.org/tutorials/beginner/former\_torchies/parallelism\_tutorial.html. -# +# diff --git a/beginner_source/blitz/neural_networks_tutorial.py b/beginner_source/blitz/neural_networks_tutorial.py index 3b02ec45694..eee89f002eb 100644 --- a/beginner_source/blitz/neural_networks_tutorial.py +++ b/beginner_source/blitz/neural_networks_tutorial.py @@ -38,7 +38,6 @@ Let’s define this network: """ import torch -from torch.autograd import Variable import torch.nn as nn import torch.nn.functional as F @@ -91,11 +90,11 @@ def num_flat_features(self, x): print(params[0].size()) # conv1's .weight ######################################################################## -# The input to the forward is an ``autograd.Variable``, and so is the output. +# Let try a random 32x32 input # Note: Expected input size to this net(LeNet) is 32x32. To use this net on # MNIST dataset, please resize the images from the dataset to 32x32. -input = Variable(torch.randn(1, 1, 32, 32)) +input = torch.randn(1, 1, 32, 32) out = net(input) print(out) @@ -121,21 +120,19 @@ def num_flat_features(self, x): # Before proceeding further, let's recap all the classes you’ve seen so far. # # **Recap:** -# - ``torch.Tensor`` - A *multi-dimensional array*. -# - ``autograd.Variable`` - *Wraps a Tensor and records the history of -# operations* applied to it. Has the same API as a ``Tensor``, with -# some additions like ``backward()``. Also *holds the gradient* -# w.r.t. the tensor. +# - ``torch.Tensor`` - A *multi-dimensional array* with support for autograd +# operations like ``backward()``. Also *holds the gradient* w.r.t. the +# tensor. # - ``nn.Module`` - Neural network module. 
*Convenient way of # encapsulating parameters*, with helpers for moving them to GPU, # exporting, loading, etc. -# - ``nn.Parameter`` - A kind of Variable, that is *automatically +# - ``nn.Parameter`` - A kind of Tensor, that is *automatically # registered as a parameter when assigned as an attribute to a* # ``Module``. # - ``autograd.Function`` - Implements *forward and backward definitions -# of an autograd operation*. Every ``Variable`` operation, creates at +# of an autograd operation*. Every ``Tensor`` operation, creates at # least a single ``Function`` node, that connects to functions that -# created a ``Variable`` and *encodes its history*. +# created a ``Tensor`` and *encodes its history*. # # **At this point, we covered:** # - Defining a neural network @@ -159,7 +156,7 @@ def num_flat_features(self, x): # For example: output = net(input) -target = Variable(torch.arange(1, 11)) # a dummy target, for example +target = torch.arange(1, 11) # a dummy target, for example target = target.view(1, -1) # make it the same shape as output criterion = nn.MSELoss() @@ -179,8 +176,8 @@ def num_flat_features(self, x): # -> loss # # So, when we call ``loss.backward()``, the whole graph is differentiated -# w.r.t. the loss, and all Variables in the graph will have their -# ``.grad`` Variable accumulated with the gradient. +# w.r.t. the loss, and all Tensors in the graph that has ``requres_grad=True`` +# will have their ``.grad`` Tensor accumulated with the gradient. # # For illustration, let us follow a few steps backward: diff --git a/beginner_source/blitz/tensor_tutorial.py b/beginner_source/blitz/tensor_tutorial.py index ccf995b4f64..a64274075de 100644 --- a/beginner_source/blitz/tensor_tutorial.py +++ b/beginner_source/blitz/tensor_tutorial.py @@ -26,7 +26,7 @@ ############################################################### # Construct a 5x3 matrix, uninitialized: -x = torch.Tensor(5, 3) +x = torch.empty(5, 3) print(x) ############################################################### @@ -35,6 +35,29 @@ x = torch.rand(5, 3) print(x) +############################################################### +# Construct a matrix filled zeros and of dtype long: + +x = torch.zeros(5, 3, dtype=torch.long) +print(x) + +############################################################### +# Construct a tensor directly from data: + +x = torch.tensor([5.5, 3]) +print(x) + +############################################################### +# or create a tensor basing on existing tensor. These methods +# will reuse properties of the input tensor, e.g. dtype, unless +# new values are provided by user + +x = x.new_ones(5, 3, dtype=torch.double) # new_* methods take in sizes +print(x) + +x = torch.randn_like(x, dtype=torch.float) # override dtype! 
+print(x) # result has the same size + ############################################################### # Get its size: @@ -60,7 +83,7 @@ ############################################################### # Addition: providing an output tensor as argument -result = torch.Tensor(5, 3) +result = torch.empty(5, 3) torch.add(x, y, out=result) print(result) @@ -87,6 +110,13 @@ z = x.view(-1, 8) # the size -1 is inferred from other dimensions print(x.size(), y.size(), z.size()) +############################################################### +# If you have a one element tensor, use ``.item()`` to get the value as a +# Python number +x = torch.randn(1) +print(x) +print(x.item()) + ############################################################### # **Read later:** # @@ -113,7 +143,6 @@ ############################################################### # - b = a.numpy() print(b) @@ -143,10 +172,14 @@ # CUDA Tensors # ------------ # -# Tensors can be moved onto GPU using the ``.cuda`` method. +# Tensors can be moved onto any device using the ``.to`` method. # let us run this cell only if CUDA is available +# We will use ``torch.device`` objects to move tensors in and out of GPU if torch.cuda.is_available(): - x = x.cuda() - y = y.cuda() - x + y + device = torch.device("cuda") # a CUDA device object + y = torch.ones_like(x, device=device) # directly create a tensor on GPU + x = x.to(device) # or just use strings ``.to("cuda")`` + z = x + y + print(z) + print(z.to("cpu", torch.double)) # ``.to`` can also change dtype together! diff --git a/beginner_source/data_loading_tutorial.py b/beginner_source/data_loading_tutorial.py index 617813c1ce7..826859b1dd0 100644 --- a/beginner_source/data_loading_tutorial.py +++ b/beginner_source/data_loading_tutorial.py @@ -179,7 +179,7 @@ def __getitem__(self, idx): # Transforms # ---------- # -# One issue we can see from the above is that the samples are not of the +# One issue we can see from the above is that the samples are not of the # same size. Most neural networks expect the images of a fixed size. # Therefore, we will need to write some prepocessing code. # Let's create three transforms: @@ -192,7 +192,7 @@ def __getitem__(self, idx): # # We will write them as callable classes instead of simple functions so # that parameters of the transform need not be passed everytime it's -# called. For this, we just need to implement ``__call__`` method and +# called. For this, we just need to implement ``__call__`` method and # if required, ``__init__`` method. We can then use a transform like this: # # :: @@ -278,7 +278,7 @@ class ToTensor(object): def __call__(self, sample): image, landmarks = sample['image'], sample['landmarks'] - + # swap color axis because # numpy image: H x W x C # torch image: C X H X W @@ -324,7 +324,7 @@ def __call__(self, sample): # ----------------------------- # # Let's put this all together to create a dataset with composed -# transforms. +# transforms. # To summarize, every time this dataset is sampled: # # - An image is read from the file on the fly diff --git a/beginner_source/deep_learning_60min_blitz.rst b/beginner_source/deep_learning_60min_blitz.rst index 99522b7178b..6ea59720f96 100644 --- a/beginner_source/deep_learning_60min_blitz.rst +++ b/beginner_source/deep_learning_60min_blitz.rst @@ -10,7 +10,7 @@ Goal of this tutorial: *This tutorial assumes that you have a basic familiarity of numpy* -.. Note:: +.. Note:: Make sure you have the `torch`_ and `torchvision`_ packages installed. .. 
_torch: https://github.com/pytorch/pytorch @@ -29,9 +29,6 @@ Goal of this tutorial: .. galleryitem:: /beginner/blitz/tensor_tutorial.py :figure: /_static/img/tensor_illustration_flat.png -.. galleryitem:: /beginner/blitz/autograd_tutorial.py - :figure: /_static/img/Variable.png - .. galleryitem:: /beginner/blitz/neural_networks_tutorial.py :figure: /_static/img/mnist.png diff --git a/beginner_source/deep_learning_nlp_tutorial.rst b/beginner_source/deep_learning_nlp_tutorial.rst index ab7d4c86b68..1d65ad18dcd 100644 --- a/beginner_source/deep_learning_nlp_tutorial.rst +++ b/beginner_source/deep_learning_nlp_tutorial.rst @@ -36,7 +36,7 @@ and use them on it. .. galleryitem:: /beginner/nlp/pytorch_tutorial.py - :intro: All of deep learning is computations on tensors, which are generalizations of a matrix that can be + :intro: All of deep learning is computations on tensors, which are generalizations of a matrix that can be .. galleryitem:: /beginner/nlp/deep_learning_tutorial.py :intro: Deep learning consists of composing linearities with non-linearities in clever ways. The introduction of non-linearities allows @@ -45,10 +45,10 @@ and use them on it. :intro: Word embeddings are dense vectors of real numbers, one per word in your vocabulary. In NLP, it is almost always the case that your features are .. galleryitem:: /beginner/nlp/sequence_models_tutorial.py - :intro: At this point, we have seen various feed-forward networks. That is, there is no state maintained by the network at all. + :intro: At this point, we have seen various feed-forward networks. That is, there is no state maintained by the network at all. .. galleryitem:: /beginner/nlp/advanced_tutorial.py - :intro: Dynamic versus Static Deep Learning Toolkits. Pytorch is a *dynamic* neural network kit. + :intro: Dynamic versus Static Deep Learning Toolkits. Pytorch is a *dynamic* neural network kit. .. raw:: html diff --git a/beginner_source/examples_autograd/two_layer_net_autograd.py b/beginner_source/examples_autograd/two_layer_net_autograd.py index ce213242294..043a47f57cc 100755 --- a/beginner_source/examples_autograd/two_layer_net_autograd.py +++ b/beginner_source/examples_autograd/two_layer_net_autograd.py @@ -1,71 +1,72 @@ # -*- coding: utf-8 -*- """ -PyTorch: Variables and autograd +PyTorch: Tensors and autograd ------------------------------- A fully-connected ReLU network with one hidden layer and no biases, trained to predict y from x by minimizing squared Euclidean distance. This implementation computes the forward pass using operations on PyTorch -Variables, and uses PyTorch autograd to compute gradients. +Tensors, and uses PyTorch autograd to compute gradients. -A PyTorch Variable is a wrapper around a PyTorch Tensor, and represents a node -in a computational graph. If x is a Variable then x.data is a Tensor giving its -value, and x.grad is another Variable holding the gradient of x with respect to -some scalar value. -PyTorch Variables have the same API as PyTorch tensors: (almost) any operation -you can do on a Tensor you can also do on a Variable; the difference is that -autograd allows you to automatically compute gradients. +A PyTorch Tensor represents a node in a computational graph. If ``x`` is a +Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor +holding the gradient of ``x`` with respect to some scalar value. 
""" import torch -from torch.autograd import Variable -dtype = torch.FloatTensor -# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU +dtype = torch.float +device = torch.device("cpu") +# dtype = torch.device("cuda:0") # Uncomment this to run on GPU # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold input and outputs, and wrap them in Variables. +# Create random Tensors to hold input and outputs. # Setting requires_grad=False indicates that we do not need to compute gradients -# with respect to these Variables during the backward pass. -x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False) -y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False) +# with respect to these Tensors during the backward pass. +x = torch.randn(N, D_in, device=device, dtype=dtype) +y = torch.randn(N, D_out, device=device, dtype=dtype) -# Create random Tensors for weights, and wrap them in Variables. +# Create random Tensors for weights. # Setting requires_grad=True indicates that we want to compute gradients with -# respect to these Variables during the backward pass. -w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True) -w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True) +# respect to these Tensors during the backward pass. +w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True) +w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True) learning_rate = 1e-6 for t in range(500): - # Forward pass: compute predicted y using operations on Variables; these + # Forward pass: compute predicted y using operations on Tensors; these # are exactly the same operations we used to compute the forward pass using # Tensors, but we do not need to keep references to intermediate values since # we are not implementing the backward pass by hand. y_pred = x.mm(w1).clamp(min=0).mm(w2) - # Compute and print loss using operations on Variables. - # Now loss is a Variable of shape (1,) and loss.data is a Tensor of shape - # (1,); loss.data[0] is a scalar value holding the loss. + # Compute and print loss using operations on Tensors. + # Now loss is a Tensor of shape (1,) + # loss.item() gets the a scalar value held in the loss. loss = (y_pred - y).pow(2).sum() - print(t, loss.data[0]) + print(t, loss.item()) # Use autograd to compute the backward pass. This call will compute the - # gradient of loss with respect to all Variables with requires_grad=True. - # After this call w1.grad and w2.grad will be Variables holding the gradient + # gradient of loss with respect to all Tensors with requires_grad=True. + # After this call w1.grad and w2.grad will be Tensors holding the gradient # of the loss with respect to w1 and w2 respectively. loss.backward() - # Update weights using gradient descent; w1.data and w2.data are Tensors, - # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are - # Tensors. - w1.data -= learning_rate * w1.grad.data - w2.data -= learning_rate * w2.grad.data + # Manually update weights using gradient descent. Wrap in torch.no_grad() + # because weights have requires_grad=True, but we don't need to track this + # in autograd. + # An alternative way is to operate on weight.data and weight.grad.data. + # Recall that tensor.data gives a tensor that shares the storage with + # tensor, but doesn't track history. + # You can also use torch.optim.SGD to achieve this. 
+ with torch.no_grad(): + w1 -= learning_rate * w1.grad + w2 -= learning_rate * w2.grad - # Manually zero the gradients after updating weights - w1.grad.data.zero_() - w2.grad.data.zero_() + # Manually zero the gradients after updating weights + w1.grad.zero_() + w2.grad.zero_() diff --git a/beginner_source/examples_autograd/two_layer_net_custom_function.py b/beginner_source/examples_autograd/two_layer_net_custom_function.py index 2933f20e0aa..b951604c375 100755 --- a/beginner_source/examples_autograd/two_layer_net_custom_function.py +++ b/beginner_source/examples_autograd/two_layer_net_custom_function.py @@ -13,7 +13,6 @@ the ReLU function. """ import torch -from torch.autograd import Variable class MyReLU(torch.autograd.Function): @@ -47,41 +46,43 @@ def backward(ctx, grad_output): return grad_input -dtype = torch.FloatTensor -# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU +dtype = torch.float +device = torch.device("cpu") +# dtype = torch.device("cuda:0") # Uncomment this to run on GPU # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold input and outputs, and wrap them in Variables. -x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False) -y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False) +# Create random Tensors to hold input and outputs. +x = torch.randn(N, D_in, device=device, dtype=dtype) +y = torch.randn(N, D_out, device=device, dtype=dtype) -# Create random Tensors for weights, and wrap them in Variables. -w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True) -w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True) +# Create random Tensors for weights. +w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True) +w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True) learning_rate = 1e-6 for t in range(500): # To apply our Function, we use Function.apply method. We alias this as 'relu'. relu = MyReLU.apply - # Forward pass: compute predicted y using operations on Variables; we compute + # Forward pass: compute predicted y using operations; we compute # ReLU using our custom autograd operation. y_pred = relu(x.mm(w1)).mm(w2) # Compute and print loss loss = (y_pred - y).pow(2).sum() - print(t, loss.data[0]) + print(t, loss.item()) # Use autograd to compute the backward pass. loss.backward() # Update weights using gradient descent - w1.data -= learning_rate * w1.grad.data - w2.data -= learning_rate * w2.grad.data + with torch.no_grad(): + w1 -= learning_rate * w1.grad + w2 -= learning_rate * w2.grad - # Manually zero the gradients after updating weights - w1.grad.data.zero_() - w2.grad.data.zero_() + # Manually zero the gradients after updating weights + w1.grad.zero_() + w2.grad.zero_() diff --git a/beginner_source/examples_nn/dynamic_net.py b/beginner_source/examples_nn/dynamic_net.py index c6f515c00d7..45d848f6496 100755 --- a/beginner_source/examples_nn/dynamic_net.py +++ b/beginner_source/examples_nn/dynamic_net.py @@ -10,7 +10,6 @@ """ import random import torch -from torch.autograd import Variable class DynamicNet(torch.nn.Module): @@ -49,9 +48,9 @@ def forward(self, x): # H is hidden dimension; D_out is output dimension. 
N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold inputs and outputs, and wrap them in Variables -x = Variable(torch.randn(N, D_in)) -y = Variable(torch.randn(N, D_out), requires_grad=False) +# Create random Tensors to hold inputs and outputs +x = torch.randn(N, D_in) +y = torch.randn(N, D_out) # Construct our model by instantiating the class defined above model = DynamicNet(D_in, H, D_out) @@ -66,7 +65,7 @@ def forward(self, x): # Compute and print loss loss = criterion(y_pred, y) - print(t, loss.data[0]) + print(t, loss.item()) # Zero gradients, perform a backward pass, and update the weights. optimizer.zero_grad() diff --git a/beginner_source/examples_nn/two_layer_net_module.py b/beginner_source/examples_nn/two_layer_net_module.py index c3cb5900c1f..7a60d5a0feb 100755 --- a/beginner_source/examples_nn/two_layer_net_module.py +++ b/beginner_source/examples_nn/two_layer_net_module.py @@ -11,7 +11,6 @@ need to define your model this way. """ import torch -from torch.autograd import Variable class TwoLayerNet(torch.nn.Module): @@ -26,9 +25,9 @@ def __init__(self, D_in, H, D_out): def forward(self, x): """ - In the forward function we accept a Variable of input data and we must return - a Variable of output data. We can use Modules defined in the constructor as - well as arbitrary operators on Variables. + In the forward function we accept a Tensor of input data and we must return + a Tensor of output data. We can use Modules defined in the constructor as + well as arbitrary operators on Tensors. """ h_relu = self.linear1(x).clamp(min=0) y_pred = self.linear2(h_relu) @@ -39,9 +38,9 @@ def forward(self, x): # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold inputs and outputs, and wrap them in Variables -x = Variable(torch.randn(N, D_in)) -y = Variable(torch.randn(N, D_out), requires_grad=False) +# Create random Tensors to hold inputs and outputs +x = torch.randn(N, D_in) +y = torch.randn(N, D_out) # Construct our model by instantiating the class defined above model = TwoLayerNet(D_in, H, D_out) @@ -57,7 +56,7 @@ def forward(self, x): # Compute and print loss loss = criterion(y_pred, y) - print(t, loss.data[0]) + print(t, loss.item()) # Zero gradients, perform a backward pass, and update the weights. optimizer.zero_grad() diff --git a/beginner_source/examples_nn/two_layer_net_nn.py b/beginner_source/examples_nn/two_layer_net_nn.py index 7420ad7fea2..d7fb03d07f4 100755 --- a/beginner_source/examples_nn/two_layer_net_nn.py +++ b/beginner_source/examples_nn/two_layer_net_nn.py @@ -14,20 +14,19 @@ input and may have some trainable weights. """ import torch -from torch.autograd import Variable # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold inputs and outputs, and wrap them in Variables. -x = Variable(torch.randn(N, D_in)) -y = Variable(torch.randn(N, D_out), requires_grad=False) +# Create random Tensors to hold inputs and outputs +x = torch.randn(N, D_in) +y = torch.randn(N, D_out) # Use the nn package to define our model as a sequence of layers. nn.Sequential # is a Module which contains other Modules, and applies them in sequence to # produce its output. Each Linear Module computes output from input using a -# linear function, and holds internal Variables for its weight and bias. +# linear function, and holds internal Tensors for its weight and bias. 
model = torch.nn.Sequential( torch.nn.Linear(D_in, H), torch.nn.ReLU(), @@ -42,26 +41,27 @@ for t in range(500): # Forward pass: compute predicted y by passing x to the model. Module objects # override the __call__ operator so you can call them like functions. When - # doing so you pass a Variable of input data to the Module and it produces - # a Variable of output data. + # doing so you pass a Tensor of input data to the Module and it produces + # a Tensor of output data. y_pred = model(x) - # Compute and print loss. We pass Variables containing the predicted and true - # values of y, and the loss function returns a Variable containing the + # Compute and print loss. We pass Tensors containing the predicted and true + # values of y, and the loss function returns a Tensor containing the # loss. loss = loss_fn(y_pred, y) - print(t, loss.data[0]) + print(t, loss.item()) # Zero the gradients before running the backward pass. model.zero_grad() # Backward pass: compute gradient of the loss with respect to all the learnable # parameters of the model. Internally, the parameters of each Module are stored - # in Variables with requires_grad=True, so this call will compute gradients for + # in Tensors with requires_grad=True, so this call will compute gradients for # all learnable parameters in the model. loss.backward() - # Update the weights using gradient descent. Each parameter is a Variable, so - # we can access its data and gradients like we did before. - for param in model.parameters(): - param.data -= learning_rate * param.grad.data + # Update the weights using gradient descent. Each parameter is a Tensor, so + # we can access and gradients like we did before. + with torch.no_grad(): + for param in model.parameters(): + param -= learning_rate * param.grad diff --git a/beginner_source/examples_nn/two_layer_net_optim.py b/beginner_source/examples_nn/two_layer_net_optim.py index fd130028df7..ab41d0a3bcd 100755 --- a/beginner_source/examples_nn/two_layer_net_optim.py +++ b/beginner_source/examples_nn/two_layer_net_optim.py @@ -14,15 +14,14 @@ used for deep learning, including SGD+momentum, RMSProp, Adam, etc. """ import torch -from torch.autograd import Variable # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold inputs and outputs, and wrap them in Variables. -x = Variable(torch.randn(N, D_in)) -y = Variable(torch.randn(N, D_out), requires_grad=False) +# Create random Tensors to hold inputs and outputs +x = torch.randn(N, D_in) +y = torch.randn(N, D_out) # Use the nn package to define our model and loss function. model = torch.nn.Sequential( @@ -35,7 +34,7 @@ # Use the optim package to define an Optimizer that will update the weights of # the model for us. Here we will use Adam; the optim package contains many other # optimization algoriths. The first argument to the Adam constructor tells the -# optimizer which Variables it should update. +# optimizer which Tensors it should update. learning_rate = 1e-4 optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) for t in range(500): @@ -44,7 +43,7 @@ # Compute and print loss. 
     loss = loss_fn(y_pred, y)
-    print(t, loss.data[0])
+    print(t, loss.item())
 
     # Before the backward pass, use the optimizer object to zero all of the
     # gradients for the variables it will update (which are the learnable
diff --git a/beginner_source/examples_tensor/two_layer_net_tensor.py b/beginner_source/examples_tensor/two_layer_net_tensor.py
index d0339a49b50..cdc2e14b40a 100755
--- a/beginner_source/examples_tensor/two_layer_net_tensor.py
+++ b/beginner_source/examples_tensor/two_layer_net_tensor.py
@@ -21,20 +21,21 @@
 import torch
 
 
-dtype = torch.FloatTensor
-# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
+dtype = torch.float
+device = torch.device("cpu")
+# device = torch.device("cuda:0") # Uncomment this to run on GPU
 
 # N is batch size; D_in is input dimension;
 # H is hidden dimension; D_out is output dimension.
 N, D_in, H, D_out = 64, 1000, 100, 10
 
 # Create random input and output data
-x = torch.randn(N, D_in).type(dtype)
-y = torch.randn(N, D_out).type(dtype)
+x = torch.randn(N, D_in, device=device, dtype=dtype)
+y = torch.randn(N, D_out, device=device, dtype=dtype)
 
 # Randomly initialize weights
-w1 = torch.randn(D_in, H).type(dtype)
-w2 = torch.randn(H, D_out).type(dtype)
+w1 = torch.randn(D_in, H, device=device, dtype=dtype)
+w2 = torch.randn(H, D_out, device=device, dtype=dtype)
 
 learning_rate = 1e-6
 for t in range(500):
@@ -44,7 +45,7 @@
     y_pred = h_relu.mm(w2)
 
     # Compute and print loss
-    loss = (y_pred - y).pow(2).sum()
+    loss = (y_pred - y).pow(2).sum().item()
     print(t, loss)
 
     # Backprop to compute gradients of w1 and w2 with respect to loss
diff --git a/beginner_source/former_torchies/autograd_tutorial.py b/beginner_source/former_torchies/autograd_tutorial.py
index 7ed1feaa4cf..4bfd7ba53e9 100644
--- a/beginner_source/former_torchies/autograd_tutorial.py
+++ b/beginner_source/former_torchies/autograd_tutorial.py
@@ -9,37 +9,33 @@
 In the forward phase, the autograd tape will remember all the operations
 it executed, and in the backward phase, it will replay the operations.
 
-Variable
---------
+Tensors that track history
+--------------------------
 
-In autograd, we introduce a ``Variable`` class, which is a very thin
-wrapper around a ``Tensor``. You can access the raw tensor through the
-``.data`` attribute, and after computing the backward pass, a gradient
+In autograd, if any input ``Tensor`` of an operation has ``requires_grad=True``,
+the computation will be tracked. After computing the backward pass, a gradient
 w.r.t. this variable is accumulated into ``.grad`` attribute.
 
-.. figure:: /_static/img/Variable.png
-   :alt: Variable
-
-   Variable
-
 There’s one more class which is very important for autograd
-implementation - a ``Function``. ``Variable`` and ``Function`` are
+implementation - a ``Function``. ``Tensor`` and ``Function`` are
 interconnected and build up an acyclic graph, that encodes a complete
 history of computation. Each variable has a ``.grad_fn`` attribute that
-references a function that has created a function (except for Variables
+references the function that has created the Tensor (except for Tensors
 created by the user - these have ``None`` as ``.grad_fn``).
 
 If you want to compute the derivatives, you can call ``.backward()`` on
-a ``Variable``. If ``Variable`` is a scalar (i.e. it holds a one element
+a ``Tensor``. If ``Tensor`` is a scalar (i.e. it holds a one element
 tensor), you don’t need to specify any arguments to ``backward()``,
 however if it has more elements, you need to specify a ``grad_output``
 argument that is a tensor of matching shape.
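For example, a small sketch of that last point, using an arbitrary 3-element tensor::

    import torch

    x = torch.ones(3, requires_grad=True)
    y = x * 2                                   # y is not a scalar
    # y.backward() alone would raise an error here; pass a gradient of the
    # same shape as y to compute the vector-Jacobian product instead.
    y.backward(torch.tensor([1.0, 0.1, 0.01]))
    print(x.grad)                               # tensor([2.0000, 0.2000, 0.0200])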
""" import torch -from torch.autograd import Variable -x = Variable(torch.ones(2, 2), requires_grad=True) -print(x) # notice the "Variable containing" line + +############################################################### +# Create a tensor and set requires_grad=True to track computation with it +x = torch.ones(2, 2, requires_grad=True) +print(x) ############################################################### # @@ -73,6 +69,17 @@ print(z, out) +################################################################ +# ``.requires_grad_( ... )`` changes an existing Tensor's ``requires_grad`` +# flag in-place. The input flag defaults to ``True`` if not given. +a = torch.randn(2, 2) +a = ((a * 3) / (a - 1)) +print(a.requires_grad) +a.requires_grad_(True) +print(a.requires_grad) +b = (a * a).sum() +print(b.grad_fn) + ############################################################### # Gradients # --------- @@ -89,7 +96,7 @@ # part of the graph twice, you need to pass in ``retain_variables = True`` # during the first pass. -x = Variable(torch.ones(2, 2), requires_grad=True) +x = torch.ones(2, 2, requires_grad=True) y = x + 2 y.backward(torch.ones(2, 2), retain_graph=True) # the retain_variables flag will prevent the internal buffers from being freed @@ -111,3 +118,13 @@ y.backward(gradient) print(x.grad) + +############################################################### +# You can also stops autograd from tracking history on Tensors +# with requires_grad=True by wrapping the code block in +# ``with torch.no_grad():`` +print(x.requires_grad) +print((x ** 2).requires_grad) + +with torch.no_grad(): + print((x ** 2).requires_grad) diff --git a/beginner_source/former_torchies/nn_tutorial.py b/beginner_source/former_torchies/nn_tutorial.py index 0df7ec49625..316bf03a985 100644 --- a/beginner_source/former_torchies/nn_tutorial.py +++ b/beginner_source/former_torchies/nn_tutorial.py @@ -47,7 +47,6 @@ """ import torch -from torch.autograd import Variable import torch.nn as nn import torch.nn.functional as F @@ -115,14 +114,14 @@ def forward(self, input): # Create a mini-batch containing a single sample of random data and send the # sample through the ConvNet. -input = Variable(torch.randn(1, 1, 28, 28)) +input = torch.randn(1, 1, 28, 28) out = net(input) print(out.size()) ######################################################################## # Define a dummy target label and compute error using a loss function. -target = Variable(torch.LongTensor([3])) +target = torch.tensor([3], dtype=torch.long) loss_fn = nn.CrossEntropyLoss() # LogSoftmax + ClassNLL Loss err = loss_fn(out, target) err.backward() @@ -130,8 +129,8 @@ def forward(self, input): print(err) ######################################################################## -# The output of the ConvNet ``out`` is a ``Variable``. We compute the loss -# using that, and that results in ``err`` which is also a ``Variable``. +# The output of the ConvNet ``out`` is a ``Tensor``. We compute the loss +# using that, and that results in ``err`` which is also a ``Tensor``. # Calling ``.backward`` on ``err`` hence will propagate gradients all the # way through the ConvNet to it’s weights # @@ -152,7 +151,7 @@ def forward(self, input): # # We introduce **hooks** for this purpose. # -# You can register a function on a ``Module`` or a ``Variable``. +# You can register a function on a ``Module`` or a ``Tensor``. # The hook can be a forward hook or a backward hook. # The forward hook will be executed when a forward call is executed. 
# The backward hook will be executed in the backward phase. @@ -163,7 +162,7 @@ def forward(self, input): def printnorm(self, input, output): # input is a tuple of packed inputs - # output is a Variable. output.data is the Tensor we are interested + # output is a Tensor. output.data is the Tensor we are interested print('Inside ' + self.__class__.__name__ + ' forward') print('') print('input: ', type(input)) @@ -195,7 +194,7 @@ def printgradnorm(self, grad_input, grad_output): print('') print('grad_input size:', grad_input[0].size()) print('grad_output size:', grad_output[0].size()) - print('grad_input norm:', grad_input[0].data.norm()) + print('grad_input norm:', grad_input[0].norm()) net.conv2.register_backward_hook(printgradnorm) @@ -254,9 +253,9 @@ def forward(self, data, last_hidden): TIMESTEPS = 5 # Create some fake data -batch = Variable(torch.randn(batch_size, 50)) -hidden = Variable(torch.zeros(batch_size, 20)) -target = Variable(torch.zeros(batch_size, 10)) +batch = torch.randn(batch_size, 50) +hidden = torch.zeros(batch_size, 20) +target = torch.zeros(batch_size, 10) loss = 0 for t in range(TIMESTEPS): diff --git a/beginner_source/former_torchies/parallelism_tutorial.py b/beginner_source/former_torchies/parallelism_tutorial.py index b3eb57ea4c3..861a461d06c 100644 --- a/beginner_source/former_torchies/parallelism_tutorial.py +++ b/beginner_source/former_torchies/parallelism_tutorial.py @@ -14,6 +14,7 @@ DataParallel ------------- """ +import torch import torch.nn as nn @@ -77,13 +78,14 @@ def data_parallel(module, input, device_ids, output_device=None): # Let’s look at a small example of implementing a network where part of it # is on the CPU and part on the GPU +device = torch.device("cuda:0") class DistributedModel(nn.Module): def __init__(self): super().__init__( embedding=nn.Embedding(1000, 10), - rnn=nn.Linear(10, 10).cuda(0), + rnn=nn.Linear(10, 10).to(device), ) def forward(self, x): @@ -91,7 +93,7 @@ def forward(self, x): x = self.embedding(x) # Transfer to GPU - x = x.cuda(0) + x = x.to(device) # Compute RNN on GPU x = self.rnn(x) diff --git a/beginner_source/former_torchies/tensor_tutorial.py b/beginner_source/former_torchies/tensor_tutorial.py index 4b2004a5fe1..10a9d81fadb 100644 --- a/beginner_source/former_torchies/tensor_tutorial.py +++ b/beginner_source/former_torchies/tensor_tutorial.py @@ -10,12 +10,13 @@ """ import torch -a = torch.FloatTensor(5, 7) +a = torch.empty(5, 7, dtype=torch.float) ############################################################### -# Initialize a tensor randomized with a normal distribution with mean=0, var=1: +# Initialize a double tensor randomized with a normal distribution with mean=0, +# var=1: -a = torch.randn(5, 7) +a = torch.randn(5, 7, dtype=torch.double) print(a) print(a.size()) @@ -72,14 +73,14 @@ ############################################################### # -z = torch.Tensor(5, 2) +z = torch.empty(5, 2) z[:, 0] = 10 z[:, 1] = 100 print(z) ############################################################### # -x.index_add_(1, torch.LongTensor([4, 0]), z) +x.index_add_(1, torch.tensor([4, 0], dtype=torch.long), z) print(x) ############################################################### @@ -132,10 +133,11 @@ # let us run this cell only if CUDA is available if torch.cuda.is_available(): + # creates a LongTensor and transfers it # to GPU as torch.cuda.LongTensor - a = torch.LongTensor(10).fill_(3).cuda() + a = torch.full((10,), 3, device=torch.device("cuda")) print(type(a)) - b = a.cpu() + b = a.to(torch.device("cpu")) # transfers 
it to CPU, back to # being a torch.LongTensor diff --git a/beginner_source/former_torchies_tutorial.rst b/beginner_source/former_torchies_tutorial.rst index 751da38b83d..9489e7e7877 100644 --- a/beginner_source/former_torchies_tutorial.rst +++ b/beginner_source/former_torchies_tutorial.rst @@ -25,9 +25,6 @@ In this tutorial, you will learn the following: .. galleryitem:: /beginner/former_torchies/tensor_tutorial.py :figure: /_static/img/tensor_illustration_flat.png -.. galleryitem:: /beginner/former_torchies/autograd_tutorial.py - :figure: /_static/img/Variable.png - .. galleryitem:: /beginner/former_torchies/nn_tutorial.py :figure: /_static/img/torch-nn-vs-pytorch-nn.png diff --git a/beginner_source/nlp/advanced_tutorial.py b/beginner_source/nlp/advanced_tutorial.py index b8820d5219a..0d03593f9cb 100644 --- a/beginner_source/nlp/advanced_tutorial.py +++ b/beginner_source/nlp/advanced_tutorial.py @@ -127,21 +127,15 @@ # Helper functions to make the code more readable. -def to_scalar(var): - # returns a python float - return var.view(-1).data.tolist()[0] - - def argmax(vec): # return the argmax as a python int _, idx = torch.max(vec, 1) - return to_scalar(idx) + return idx.item() def prepare_sequence(seq, to_ix): idxs = [to_ix[w] for w in seq] - tensor = torch.LongTensor(idxs) - return autograd.Variable(tensor) + return torch.tensor(idxs, dtype=torch.long) # Compute log sum exp in a numerically stable way for the forward algorithm @@ -178,28 +172,28 @@ def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim): torch.randn(self.tagset_size, self.tagset_size)) # These two statements enforce the constraint that we never transfer - # to the start tag and we never transfer from the stop tag + # to the start tag and we never transfer from the stop tag self.transitions.data[tag_to_ix[START_TAG], :] = -10000 self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000 self.hidden = self.init_hidden() def init_hidden(self): - return (autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2)), - autograd.Variable(torch.randn(2, 1, self.hidden_dim // 2))) + return (torch.randn(2, 1, self.hidden_dim // 2), + torch.randn(2, 1, self.hidden_dim // 2)) def _forward_alg(self, feats): # Do the forward algorithm to compute the partition function - init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.) + init_alphas = torch.full((1, self.tagset_size), -10000.) # START_TAG has all of the score. init_alphas[0][self.tag_to_ix[START_TAG]] = 0. # Wrap in a variable so that we will get automatic backprop - forward_var = autograd.Variable(init_alphas) + forward_var = init_alphas # Iterate through the sentence for feat in feats: - alphas_t = [] # The forward variables at this timestep + alphas_t = [] # The forward tensors at this timestep for next_tag in range(self.tagset_size): # broadcast the emission score: it is the same regardless of # the previous tag @@ -213,7 +207,7 @@ def _forward_alg(self, feats): next_tag_var = forward_var + trans_score + emit_score # The forward variable for this tag is log-sum-exp of all the # scores. 
- alphas_t.append(log_sum_exp(next_tag_var)) + alphas_t.append(log_sum_exp(next_tag_var).view(1)) forward_var = torch.cat(alphas_t).view(1, -1) terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]] alpha = log_sum_exp(terminal_var) @@ -229,8 +223,8 @@ def _get_lstm_features(self, sentence): def _score_sentence(self, feats, tags): # Gives the score of a provided tag sequence - score = autograd.Variable(torch.Tensor([0])) - tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags]) + score = torch.zeros(1) + tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags]) for i, feat in enumerate(feats): score = score + \ self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]] @@ -241,11 +235,11 @@ def _viterbi_decode(self, feats): backpointers = [] # Initialize the viterbi variables in log space - init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.) + init_vvars = torch.full((1, self.tagset_size), -10000.) init_vvars[0][self.tag_to_ix[START_TAG]] = 0 # forward_var at step i holds the viterbi variables for step i-1 - forward_var = autograd.Variable(init_vvars) + forward_var = init_vvars for feat in feats: bptrs_t = [] # holds the backpointers for this step viterbivars_t = [] # holds the viterbi variables for this step @@ -259,7 +253,7 @@ def _viterbi_decode(self, feats): next_tag_var = forward_var + self.transitions[next_tag] best_tag_id = argmax(next_tag_var) bptrs_t.append(best_tag_id) - viterbivars_t.append(next_tag_var[0][best_tag_id]) + viterbivars_t.append(next_tag_var[0][best_tag_id].view(1)) # Now add in the emission scores, and assign forward_var to the set # of viterbi variables we just computed forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1) @@ -325,9 +319,10 @@ def forward(self, sentence): # dont confuse this with _forward_alg above. optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4) # Check predictions before training -precheck_sent = prepare_sequence(training_data[0][0], word_to_ix) -precheck_tags = torch.LongTensor([tag_to_ix[t] for t in training_data[0][1]]) -print(model(precheck_sent)) +with torch.no_grad(): + precheck_sent = prepare_sequence(training_data[0][0], word_to_ix) + precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long) + print(model(precheck_sent)) # Make sure prepare_sequence from earlier in the LSTM section is loaded for epoch in range( @@ -338,21 +333,22 @@ def forward(self, sentence): # dont confuse this with _forward_alg above. model.zero_grad() # Step 2. Get our inputs ready for the network, that is, - # turn them into Variables of word indices. + # turn them into Tensors of word indices. sentence_in = prepare_sequence(sentence, word_to_ix) - targets = torch.LongTensor([tag_to_ix[t] for t in tags]) + targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long) # Step 3. Run our forward pass. - neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets) + loss = model.neg_log_likelihood(sentence_in, targets) # Step 4. Compute the loss, gradients, and update the parameters by # calling optimizer.step() - neg_log_likelihood.backward() + loss.backward() optimizer.step() # Check predictions after training -precheck_sent = prepare_sequence(training_data[0][0], word_to_ix) -print(model(precheck_sent)) +with torch.no_grad(): + precheck_sent = prepare_sequence(training_data[0][0], word_to_ix) + print(model(precheck_sent)) # We got it! 
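The forward algorithm above leans on the numerically stable ``log_sum_exp`` helper mentioned in that tutorial. For reference, one way to write such a helper against the same 1 x tagset_size row layout (illustrative only, not necessarily the tutorial's exact code)::

    import torch

    def log_sum_exp(vec):
        # vec is 1 x tagset_size; shifting by the max keeps exp() from overflowing:
        # log(sum(exp(v))) == max(v) + log(sum(exp(v - max(v))))
        _, idx = torch.max(vec, 1)
        max_score = vec[0, idx.item()]
        return max_score + torch.log(torch.sum(torch.exp(vec - max_score)))

    print(log_sum_exp(torch.tensor([[1., 2., 3.]])))   # tensor(3.4076)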
diff --git a/beginner_source/nlp/deep_learning_tutorial.py b/beginner_source/nlp/deep_learning_tutorial.py index 11d9890d737..49d14f61691 100644 --- a/beginner_source/nlp/deep_learning_tutorial.py +++ b/beginner_source/nlp/deep_learning_tutorial.py @@ -25,7 +25,7 @@ as the *bias* term. -Pytorch and most other deep learning frameworks do things a little +PyTorch and most other deep learning frameworks do things a little differently than traditional linear algebra. It maps the rows of the input instead of the columns. That is, the :math:`i`'th row of the output below is the mapping of the :math:`i`'th row of the input under @@ -36,7 +36,6 @@ # Author: Robert Guthrie import torch -import torch.autograd as autograd import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -48,7 +47,7 @@ lin = nn.Linear(5, 3) # maps from R^5 to R^3, parameters A, b # data is 2x5. A maps from 5 to 3... can we map "data" under A? -data = autograd.Variable(torch.randn(2, 5)) +data = torch.randn(2, 5) print(lin(data)) # yes @@ -93,7 +92,7 @@ # In pytorch, most non-linearities are in torch.functional (we have it imported as F) # Note that non-linearites typically don't have parameters like affine maps do. # That is, they don't have weights that are updated during training. -data = autograd.Variable(torch.randn(2, 2)) +data = torch.randn(2, 2) print(data) print(F.relu(data)) @@ -121,7 +120,7 @@ # # Softmax is also in torch.nn.functional -data = autograd.Variable(torch.randn(5)) +data = torch.randn(5) print(data) print(F.softmax(data, dim=0)) print(F.softmax(data, dim=0).sum()) # Sums to 1 because it is a distribution! @@ -158,9 +157,9 @@ # ========================= # # So what we can compute a loss function for an instance? What do we do -# with that? We saw earlier that autograd.Variable's know how to compute -# gradients with respect to the things that were used to compute it. Well, -# since our loss is an autograd.Variable, we can compute gradients with +# with that? We saw earlier that Tensors know how to compute gradients +# with respect to the things that were used to compute it. Well, +# since our loss is an Tensor, we can compute gradients with # respect to all of the parameters used to compute it! Then we can perform # standard gradient updates. Let :math:`\theta` be our parameters, # :math:`L(\theta)` the loss function, and :math:`\eta` a positive @@ -184,21 +183,22 @@ ###################################################################### -# Creating Network Components in Pytorch +# Creating Network Components in PyTorch # ====================================== # # Before we move on to our focus on NLP, lets do an annotated example of -# building a network in Pytorch using only affine maps and +# building a network in PyTorch using only affine maps and # non-linearities. We will also see how to compute a loss function, using -# Pytorch's built in negative log likelihood, and update parameters by +# PyTorch's built in negative log likelihood, and update parameters by # backpropagation. # # All network components should inherit from nn.Module and override the # forward() method. That is about it, as far as the boilerplate is # concerned. Inheriting from nn.Module provides functionality to your # component. For example, it makes it keep track of its trainable -# parameters, you can swap it between CPU and GPU with the .cuda() or -# .cpu() functions, etc. 
+# parameters, you can swap it between CPU and GPU with the ``.to(device)`` +# method, where device can be a CPU device ``torch.device("cpu")`` or CUDA +# device ``torch.device("cuda:0")``. # # Let's write an annotated example of a network that takes in a sparse # bag-of-words representation and outputs a probability distribution over @@ -297,16 +297,18 @@ def make_target(label, label_to_ix): # Whenever you assign a component to a class variable in the __init__ function # of a module, which was done with the line # self.linear = nn.Linear(...) -# Then through some Python magic from the Pytorch devs, your module +# Then through some Python magic from the PyTorch devs, your module # (in this case, BoWClassifier) will store knowledge of the nn.Linear's parameters for param in model.parameters(): print(param) -# To run the model, pass in a BoW vector, but wrapped in an autograd.Variable -sample = data[0] -bow_vector = make_bow_vector(sample[0], word_to_ix) -log_probs = model(autograd.Variable(bow_vector)) -print(log_probs) +# To run the model, pass in a BoW vector +# Here we don't need to train, so the code is wrapped in torch.no_grad() +with torch.no_grad(): + sample = data[0] + bow_vector = make_bow_vector(sample[0], word_to_ix) + log_probs = model(bow_vector) + print(log_probs) ###################################################################### @@ -334,10 +336,11 @@ def make_target(label, label_to_ix): # # Run on test data before we train, just to see a before-and-after -for instance, label in test_data: - bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix)) - log_probs = model(bow_vec) - print(log_probs) +with torch.no_grad(): + for instance, label in test_data: + bow_vec = make_bow_vector(instance, word_to_ix) + log_probs = model(bow_vec) + print(log_probs) # Print the matrix column corresponding to "creo" print(next(model.parameters())[:, word_to_ix["creo"]]) @@ -350,17 +353,17 @@ def make_target(label, label_to_ix): # two instances. Usually, somewhere between 5 and 30 epochs is reasonable. for epoch in range(100): for instance, label in data: - # Step 1. Remember that Pytorch accumulates gradients. + # Step 1. Remember that PyTorch accumulates gradients. # We need to clear them out before each instance model.zero_grad() # Step 2. Make our BOW vector and also we must wrap the target in a - # Variable as an integer. For example, if the target is SPANISH, then + # Tensor as an integer. For example, if the target is SPANISH, then # we wrap the integer 0. The loss function then knows that the 0th # element of the log probabilities is the log probability # corresponding to SPANISH - bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix)) - target = autograd.Variable(make_target(label, label_to_ix)) + bow_vec = make_bow_vector(instance, word_to_ix) + target = make_target(label, label_to_ix) # Step 3. Run our forward pass. log_probs = model(bow_vec) @@ -371,10 +374,11 @@ def make_target(label, label_to_ix): loss.backward() optimizer.step() -for instance, label in test_data: - bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix)) - log_probs = model(bow_vec) - print(log_probs) +with torch.no_grad(): + for instance, label in test_data: + bow_vec = make_bow_vector(instance, word_to_ix) + log_probs = model(bow_vec) + print(log_probs) # Index corresponding to Spanish goes up, English goes down! 
print(next(model.parameters())[:, word_to_ix["creo"]]) @@ -385,7 +389,7 @@ def make_target(label, label_to_ix): # Spanish is much higher in the first example, and the log probability for # English is much higher in the second for the test data, as it should be. # -# Now you see how to make a Pytorch component, pass some data through it +# Now you see how to make a PyTorch component, pass some data through it # and do gradient updates. We are ready to dig deeper into what deep NLP # has to offer. # diff --git a/beginner_source/nlp/pytorch_tutorial.py b/beginner_source/nlp/pytorch_tutorial.py index 0730ea357e6..493e1199184 100644 --- a/beginner_source/nlp/pytorch_tutorial.py +++ b/beginner_source/nlp/pytorch_tutorial.py @@ -30,20 +30,20 @@ # function. # -# Create a torch.Tensor object with the given data. It is a 1D vector +# torch.tensor(data) creates a torch.Tensor object with the given data. V_data = [1., 2., 3.] -V = torch.Tensor(V_data) +V = torch.tensor(V_data) print(V) # Creates a matrix M_data = [[1., 2., 3.], [4., 5., 6]] -M = torch.Tensor(M_data) +M = torch.tensor(M_data) print(M) # Create a 3D tensor of size 2x2x2. T_data = [[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]]] -T = torch.Tensor(T_data) +T = torch.tensor(T_data) print(T) @@ -60,8 +60,10 @@ # talking about 3D tensors, I will explicitly use the term "3D tensor". # -# Index into V and get a scalar +# Index into V and get a scalar (0 dimensional tensor) print(V[0]) +# Get a Python number from it +print(V[0].item()) # Index into M and get a vector print(M[0]) @@ -93,8 +95,8 @@ # # You can operate on tensors in the ways you would expect. -x = torch.Tensor([1., 2., 3.]) -y = torch.Tensor([4., 5., 6.]) +x = torch.tensor([1., 2., 3.]) +y = torch.tensor([4., 5., 6.]) z = x + y print(z) @@ -151,8 +153,8 @@ # specification of how your data is combined to give you the output. Since # the graph totally specifies what parameters were involved with which # operations, it contains enough information to compute derivatives. This -# probably sounds vague, so lets see what is going on using the -# fundamental class of Pytorch: autograd.Variable. +# probably sounds vague, so let's see what is going on using the +# fundamental flag ``requires_grad``. # # First, think from a programmers perspective. What is stored in the # torch.Tensor objects we were creating above? Obviously the data and the @@ -162,26 +164,25 @@ # (it could have been read in from a file, it could be the result of some # other operation, etc.) # -# The Variable class keeps track of how it was created. Lets see it in -# action. +# If ``requires_grad=True``, the Tensor object keeps track of how it was +# created. Lets see it in action. # -# Variables wrap tensor objects -x = autograd.Variable(torch.Tensor([1., 2., 3]), requires_grad=True) -# You can access the data with the .data attribute -print(x.data) +# Tensor factory methods have a ``requires_grad`` flag +x = torch.tensor([1., 2., 3], requires_grad=True) -# You can also do all the same operations you did with tensors with Variables. -y = autograd.Variable(torch.Tensor([4., 5., 6]), requires_grad=True) +# With requires_grad=True, you can still do all the operations you previously +# could +y = torch.tensor([4., 5., 6], requires_grad=True) z = x + y -print(z.data) +print(z) # BUT z knows something extra. print(z.grad_fn) ###################################################################### -# So Variables know what created them. z knows that it wasn't read in from +# So Tensors know what created them. 
z knows that it wasn't read in from # a file, it wasn't the result of a multiplication or exponential or # whatever. And if you keep following z.grad_fn, you will find yourself at # x and y. @@ -240,42 +241,44 @@ # successful programmer in deep learning. # -x = torch.randn((2, 2)) -y = torch.randn((2, 2)) -z = x + y # These are Tensor types, and backprop would not be possible +x = torch.randn(2, 2) +y = torch.randn(2, 2) +# By default, user created Tensors have ``requires_grad=False`` +print(x.requires_grad, y.requires_grad) +z = x + y +# So you can't backprop through z +print(z.grad_fn) -var_x = autograd.Variable(x, requires_grad=True) -var_y = autograd.Variable(y, requires_grad=True) -# var_z contains enough information to compute gradients, as we saw above -var_z = var_x + var_y -print(var_z.grad_fn) +# ``.requires_grad_( ... )`` changes an existing Tensor's ``requires_grad`` +# flag in-place. The input flag defaults to ``True`` if not given. +x = x.requires_grad_() +y = y.requires_grad_() +# z contains enough information to compute gradients, as we saw above +z = x + y +print(z.grad_fn) +# If any input to an operation has ``requires_grad=True``, so will the output +print(z.requires_grad) -var_z_data = var_z.data # Get the wrapped Tensor object out of var_z... -# Re-wrap the tensor in a new variable -new_var_z = autograd.Variable(var_z_data) +# Now z has the computation history that relates itself to x and y +# Can we just take its values, and **detach** it from its history? +new_z = z.detach() -# ... does new_var_z have information to backprop to x and y? +# ... does new_z have information to backprop to x and y? # NO! -print(new_var_z.grad_fn) -# And how could it? We yanked the tensor out of var_z (that is -# what var_z.data is). This tensor doesn't know anything about -# how it was computed. We pass it into new_var_z, and this is all the -# information new_var_z gets. If var_z_data doesn't know how it was -# computed, theres no way new_var_z will. -# In essence, we have broken the variable away from its past history +print(new_z.grad_fn) +# And how could it? ``z.detach()`` returns a tensor that shares the same storage +# as ``z``, but with the computation history forgotten. It doesn't know anything +# about how it was computed. +# In essence, we have broken the Tensor away from its past history + +############################################################### +# You can also stops autograd from tracking history on Tensors +# with requires_grad=True by wrapping the code block in +# ``with torch.no_grad():`` +print(x.requires_grad) +print((x ** 2).requires_grad) + +with torch.no_grad(): + print((x ** 2).requires_grad) -###################################################################### -# Here is the basic, extremely important rule for computing with -# autograd.Variables (note this is more general than Pytorch. There is an -# equivalent object in every major deep learning toolkit): -# -# **If you want the error from your loss function to backpropagate to a -# component of your network, you MUST NOT break the Variable chain from -# that component to your loss Variable. If you do, the loss will have no -# idea your component exists, and its parameters can't be updated.** -# -# I say this in bold, because this error can creep up on you in very -# subtle ways (I will show some such ways below), and it will not cause -# your code to crash or complain, so you must be careful. 
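As a quick illustration of the ``detach()`` point above (the tensor values here are arbitrary)::

    import torch

    x = torch.ones(2, 2, requires_grad=True)
    z = (x * 3).sum()

    detached = z.detach()
    # detached shares z's value but has no grad_fn, so there is no path back
    # to x; calling detached.backward() would raise a RuntimeError.

    z.backward()        # the attached result still reaches x
    print(x.grad)       # a 2x2 tensor filled with 3s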
-# diff --git a/beginner_source/nlp/sequence_models_tutorial.py b/beginner_source/nlp/sequence_models_tutorial.py index aff5ef005c0..ce10f188e04 100644 --- a/beginner_source/nlp/sequence_models_tutorial.py +++ b/beginner_source/nlp/sequence_models_tutorial.py @@ -53,7 +53,6 @@ # Author: Robert Guthrie import torch -import torch.autograd as autograd import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -63,12 +62,11 @@ ###################################################################### lstm = nn.LSTM(3, 3) # Input dim is 3, output dim is 3 -inputs = [autograd.Variable(torch.randn((1, 3))) - for _ in range(5)] # make a sequence of length 5 +inputs = [torch.randn(1, 3) for _ in range(5)] # make a sequence of length 5 # initialize the hidden state. -hidden = (autograd.Variable(torch.randn(1, 1, 3)), - autograd.Variable(torch.randn((1, 1, 3)))) +hidden = (torch.randn(1, 1, 3), + torch.randn(1, 1, 3)) for i in inputs: # Step through the sequence one element at a time. # after each step, hidden contains the hidden state. @@ -84,8 +82,7 @@ # by passing it as an argument to the lstm at a later time # Add the extra 2nd dimension inputs = torch.cat(inputs).view(len(inputs), 1, -1) -hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable( - torch.randn((1, 1, 3)))) # clean out hidden state +hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3)) # clean out hidden state out, hidden = lstm(inputs, hidden) print(out) print(hidden) @@ -126,8 +123,7 @@ def prepare_sequence(seq, to_ix): idxs = [to_ix[w] for w in seq] - tensor = torch.LongTensor(idxs) - return autograd.Variable(tensor) + return torch.tensor(idxs, dtype=torch.long) training_data = [ @@ -172,8 +168,8 @@ def init_hidden(self): # Refer to the Pytorch documentation to see exactly # why they have this dimensionality. # The axes semantics are (num_layers, minibatch_size, hidden_dim) - return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)), - autograd.Variable(torch.zeros(1, 1, self.hidden_dim))) + return (torch.zeros(1, 1, self.hidden_dim), + torch.zeros(1, 1, self.hidden_dim)) def forward(self, sentence): embeds = self.word_embeddings(sentence) @@ -193,9 +189,11 @@ def forward(self, sentence): # See what the scores are before training # Note that element i,j of the output is the score for tag j for word i. -inputs = prepare_sequence(training_data[0][0], word_to_ix) -tag_scores = model(inputs) -print(tag_scores) +# Here we don't need to train, so the code is wrapped in torch.no_grad() +with torch.no_grad(): + inputs = prepare_sequence(training_data[0][0], word_to_ix) + tag_scores = model(inputs) + print(tag_scores) for epoch in range(300): # again, normally you would NOT do 300 epochs, it is toy data for sentence, tags in training_data: @@ -208,7 +206,7 @@ def forward(self, sentence): model.hidden = model.init_hidden() # Step 2. Get our inputs ready for the network, that is, turn them into - # Variables of word indices. + # Tensors of word indices. sentence_in = prepare_sequence(sentence, word_to_ix) targets = prepare_sequence(tags, tag_to_ix) @@ -222,15 +220,17 @@ def forward(self, sentence): optimizer.step() # See what the scores are after training -inputs = prepare_sequence(training_data[0][0], word_to_ix) -tag_scores = model(inputs) -# The sentence is "the dog ate the apple". i,j corresponds to score for tag j -# for word i. The predicted tag is the maximum scoring tag. 
-# Here, we can see the predicted sequence below is 0 1 2 0 1 -# since 0 is index of the maximum value of row 1, -# 1 is the index of maximum value of row 2, etc. -# Which is DET NOUN VERB DET NOUN, the correct sequence! -print(tag_scores) +with torch.no_grad(): + inputs = prepare_sequence(training_data[0][0], word_to_ix) + tag_scores = model(inputs) + + # The sentence is "the dog ate the apple". i,j corresponds to score for tag j + # for word i. The predicted tag is the maximum scoring tag. + # Here, we can see the predicted sequence below is 0 1 2 0 1 + # since 0 is index of the maximum value of row 1, + # 1 is the index of maximum value of row 2, etc. + # Which is DET NOUN VERB DET NOUN, the correct sequence! + print(tag_scores) ###################################################################### diff --git a/beginner_source/nlp/word_embeddings_tutorial.py b/beginner_source/nlp/word_embeddings_tutorial.py index 7d4485fa3ff..05083d2d7eb 100644 --- a/beginner_source/nlp/word_embeddings_tutorial.py +++ b/beginner_source/nlp/word_embeddings_tutorial.py @@ -159,7 +159,6 @@ # Author: Robert Guthrie import torch -import torch.autograd as autograd import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -170,8 +169,8 @@ word_to_ix = {"hello": 0, "world": 1} embeds = nn.Embedding(2, 5) # 2 words in vocab, 5 dimensional embeddings -lookup_tensor = torch.LongTensor([word_to_ix["hello"]]) -hello_embed = embeds(autograd.Variable(lookup_tensor)) +lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long) +hello_embed = embeds(lookup_tensor) print(hello_embed) @@ -245,8 +244,7 @@ def forward(self, inputs): # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words # into integer indices and wrap them in variables) - context_idxs = [word_to_ix[w] for w in context] - context_var = autograd.Variable(torch.LongTensor(context_idxs)) + context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long) # Step 2. Recall that torch *accumulates* gradients. Before passing in a # new instance, you need to zero out the gradients from the old @@ -255,18 +253,18 @@ def forward(self, inputs): # Step 3. Run the forward pass, getting log probabilities over next # words - log_probs = model(context_var) + log_probs = model(context_idxs) # Step 4. Compute your loss function. (Again, Torch wants the target # word wrapped in a variable) - loss = loss_function(log_probs, autograd.Variable( - torch.LongTensor([word_to_ix[target]]))) + loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long)) # Step 5. Do the backward pass and update the gradient loss.backward() optimizer.step() - total_loss += loss.data + # Get the Python number from a 1-element Tensor by calling tensor.item() + total_loss += loss.item() losses.append(total_loss) print(losses) # The loss decreased every iteration over the training data! 
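The switch from ``total_loss += loss.data`` to ``total_loss += loss.item()`` above is worth a short note: ``loss`` is a 0-dimensional Tensor attached to the current graph, while ``loss.item()`` is a plain Python float, so accumulating the float keeps no reference to autograd state. A minimal sketch of the pattern, with a made-up loss::

    import torch

    w = torch.randn(3, requires_grad=True)
    total_loss = 0.0
    for step in range(3):
        loss = ((w * step).sum()) ** 2   # stand-in for a real loss
        loss.backward()
        total_loss += loss.item()        # a float; the graph is not retained
    print(total_loss)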
@@ -338,8 +336,7 @@ def forward(self, inputs): def make_context_vector(context, word_to_ix): idxs = [word_to_ix[w] for w in context] - tensor = torch.LongTensor(idxs) - return autograd.Variable(tensor) + return torch.tensor(idxs, dtype=torch.long) make_context_vector(data[0][0], word_to_ix) # example diff --git a/beginner_source/pytorch_with_examples.rst b/beginner_source/pytorch_with_examples.rst index f5a8df11695..a7bb97a35e7 100644 --- a/beginner_source/pytorch_with_examples.rst +++ b/beginner_source/pytorch_with_examples.rst @@ -1,14 +1,14 @@ Learning PyTorch with Examples ****************************** -**Author**: `Justin Johnson `_ +**Author**: `Justin Johnson `_ This tutorial introduces the fundamental concepts of `PyTorch `__ through self-contained examples. -At its core, PyTorch provides two main features: +At its core, PyTorch provides two main features: -- An n-dimensional Tensor, similar to numpy but can run on GPUs +- An n-dimensional Tensor, similar to numpy but can run on GPUs - Automatic differentiation for building and training neural networks We will use a fully-connected ReLU network as our running example. The @@ -17,7 +17,7 @@ gradient descent to fit random data by minimizing the Euclidean distance between the network output and the true output. .. Note:: - You can browse the individual examples at the + You can browse the individual examples at the :ref:`end of this page `. .. contents:: Table of Contents @@ -72,7 +72,7 @@ and backward passes through the network: Autograd ======== -PyTorch: Variables and autograd +PyTorch: Tensors and autograd ------------------------------- In the above examples, we had to manually implement both the forward and @@ -90,18 +90,12 @@ will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients. -This sounds complicated, it's pretty simple to use in practice. We wrap -our PyTorch Tensors in **Variable** objects; a Variable represents a -node in a computational graph. If ``x`` is a Variable then ``x.data`` is -a Tensor, and ``x.grad`` is another Variable holding the gradient of -``x`` with respect to some scalar value. +This sounds complicated, it's pretty simple to use in practice. Each Tensor +represents a node in a computational graph. If ``x`` is a Tensor that has +``x.requires_grad=True`` then ``x.grad`` is another Tensor holding the +gradient of ``x`` with respect to some scalar value. -PyTorch Variables have the same API as PyTorch Tensors: (almost) any -operation that you can perform on a Tensor also works on Variables; the -difference is that using Variables defines a computational graph, -allowing you to automatically compute gradients. - -Here we use PyTorch Variables and autograd to implement our two-layer +Here we use PyTorch Tensors and autograd to implement our two-layer network; now we no longer need to manually implement the backward pass through the network: @@ -121,7 +115,7 @@ In PyTorch we can easily define our own autograd operator by defining a subclass of ``torch.autograd.Function`` and implementing the ``forward`` and ``backward`` functions. We can then use our new autograd operator by constructing an instance and calling it like a function, passing -Variables containing input data. +Tensors containing input data. In this example we define our own custom autograd function for performing the ReLU nonlinearity, and use it to implement our two-layer @@ -189,8 +183,8 @@ networks. 
In PyTorch, the ``nn`` package serves this same purpose. The ``nn`` package defines a set of **Modules**, which are roughly equivalent to -neural network layers. A Module receives input Variables and computes -output Variables, but may also hold internal state such as Variables +neural network layers. A Module receives input Tensors and computes +output Tensors, but may also hold internal state such as Tensors containing learnable parameters. The ``nn`` package also defines a set of useful loss functions that are commonly used when training neural networks. @@ -204,11 +198,11 @@ PyTorch: optim -------------- Up to this point we have updated the weights of our models by manually -mutating the ``.data`` member for Variables holding learnable -parameters. This is not a huge burden for simple optimization algorithms -like stochastic gradient descent, but in practice we often train neural -networks using more sophisticated optimizers like AdaGrad, RMSProp, -Adam, etc. +mutating the Tensors holding learnable parameters (with ``torch.no_grad()`` +or ``.data`` to avoid tracking history in autograd). This is not a huge +burden for simple optimization algorithms like stochastic gradient descent, +but in practice we often train neural networks using more sophisticated +optimizers like AdaGrad, RMSProp, Adam, etc. The ``optim`` package in PyTorch abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization @@ -226,8 +220,8 @@ PyTorch: Custom nn Modules Sometimes you will want to specify models that are more complex than a sequence of existing Modules; for these cases you can define your own Modules by subclassing ``nn.Module`` and defining a ``forward`` which -receives input Variables and produces output Variables using other -modules or other autograd operations on Variables. +receives input Tensors and produces output Tensors using other +modules or other autograd operations on Tensors. In this example we implement our two-layer network as a custom Module subclass: diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py index bfdf0685aeb..338fa44a996 100644 --- a/beginner_source/transfer_learning_tutorial.py +++ b/beginner_source/transfer_learning_tutorial.py @@ -39,7 +39,6 @@ import torch.nn as nn import torch.optim as optim from torch.optim import lr_scheduler -from torch.autograd import Variable import numpy as np import torchvision from torchvision import datasets, models, transforms @@ -98,7 +97,7 @@ dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']} class_names = image_datasets['train'].classes -use_gpu = torch.cuda.is_available() +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") ###################################################################### # Visualize a few images @@ -156,40 +155,35 @@ def train_model(model, criterion, optimizer, scheduler, num_epochs=25): for phase in ['train', 'val']: if phase == 'train': scheduler.step() - model.train(True) # Set model to training mode + model.train() # Set model to training mode else: - model.train(False) # Set model to evaluate mode + model.eval() # Set model to evaluate mode running_loss = 0.0 running_corrects = 0 # Iterate over data. 
- for data in dataloaders[phase]: - # get the inputs - inputs, labels = data - - # wrap them in Variable - if use_gpu: - inputs = Variable(inputs.cuda()) - labels = Variable(labels.cuda()) - else: - inputs, labels = Variable(inputs), Variable(labels) + for inputs, labels in dataloaders[phase]: + inputs = inputs.to(device) + labels = labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward - outputs = model(inputs) - _, preds = torch.max(outputs.data, 1) - loss = criterion(outputs, labels) + # track history if only in train + with torch.set_grad_enabled(phase == 'train'): + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + loss = criterion(outputs, labels) - # backward + optimize only if in training phase - if phase == 'train': - loss.backward() - optimizer.step() + # backward + optimize only if in training phase + if phase == 'train': + loss.backward() + optimizer.step() # statistics - running_loss += loss.data[0] * inputs.size(0) + running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) epoch_loss = running_loss / dataset_sizes[phase] @@ -228,27 +222,25 @@ def visualize_model(model, num_images=6): images_so_far = 0 fig = plt.figure() - for i, data in enumerate(dataloaders['val']): - inputs, labels = data - if use_gpu: - inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda()) - else: - inputs, labels = Variable(inputs), Variable(labels) + with torch.no_grad(): + for i, (inputs, labels) in enumerate(dataloaders['val']): + inputs = inputs.to(device) + labels = labels.to(device) - outputs = model(inputs) - _, preds = torch.max(outputs.data, 1) + outputs = model(inputs) + _, preds = torch.max(outputs, 1) - for j in range(inputs.size()[0]): - images_so_far += 1 - ax = plt.subplot(num_images//2, 2, images_so_far) - ax.axis('off') - ax.set_title('predicted: {}'.format(class_names[preds[j]])) - imshow(inputs.cpu().data[j]) + for j in range(inputs.size()[0]): + images_so_far += 1 + ax = plt.subplot(num_images//2, 2, images_so_far) + ax.axis('off') + ax.set_title('predicted: {}'.format(class_names[preds[j]])) + imshow(inputs.cpu().data[j]) - if images_so_far == num_images: - model.train(mode=was_training) - return - model.train(mode=was_training) + if images_so_far == num_images: + model.train(mode=was_training) + return + model.train(mode=was_training) ###################################################################### # Finetuning the convnet @@ -261,8 +253,7 @@ def visualize_model(model, num_images=6): num_ftrs = model_ft.fc.in_features model_ft.fc = nn.Linear(num_ftrs, 2) -if use_gpu: - model_ft = model_ft.cuda() +model_ft = model_ft.to(device) criterion = nn.CrossEntropyLoss() @@ -309,8 +300,7 @@ def visualize_model(model, num_images=6): num_ftrs = model_conv.fc.in_features model_conv.fc = nn.Linear(num_ftrs, 2) -if use_gpu: - model_conv = model_conv.cuda() +model_conv = model_conv.to(device) criterion = nn.CrossEntropyLoss() diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py index 048a0832a15..a6032edff62 100644 --- a/intermediate_source/char_rnn_classification_tutorial.py +++ b/intermediate_source/char_rnn_classification_tutorial.py @@ -181,7 +181,6 @@ def lineToTensor(line): # import torch.nn as nn -from torch.autograd import Variable class RNN(nn.Module): def __init__(self, input_size, hidden_size, output_size): @@ -201,7 +200,7 @@ def forward(self, input, hidden): return output, hidden def initHidden(self): - return 
Variable(torch.zeros(1, self.hidden_size)) + return torch.zeros(1, self.hidden_size) n_hidden = 128 rnn = RNN(n_letters, n_hidden, n_categories) @@ -214,12 +213,9 @@ def initHidden(self): # each language) and a next hidden state (which we keep for the next # step). # -# Remember that PyTorch modules operate on Variables rather than straight -# up Tensors. -# -input = Variable(letterToTensor('A')) -hidden = Variable(torch.zeros(1, n_hidden)) +input = letterToTensor('A') +hidden =torch.zeros(1, n_hidden) output, next_hidden = rnn(input, hidden) @@ -231,8 +227,8 @@ def initHidden(self): # pre-computing batches of Tensors. # -input = Variable(lineToTensor('Albert')) -hidden = Variable(torch.zeros(1, n_hidden)) +input = lineToTensor('Albert') +hidden = torch.zeros(1, n_hidden) output, next_hidden = rnn(input[0], hidden) print(output) @@ -258,8 +254,8 @@ def initHidden(self): # def categoryFromOutput(output): - top_n, top_i = output.data.topk(1) # Tensor out of Variable with .data - category_i = top_i[0][0] + top_n, top_i = output.topk(1) + category_i = top_i[0].item() return all_categories[category_i], category_i print(categoryFromOutput(output)) @@ -278,8 +274,8 @@ def randomChoice(l): def randomTrainingExample(): category = randomChoice(all_categories) line = randomChoice(category_lines[category]) - category_tensor = Variable(torch.LongTensor([all_categories.index(category)])) - line_tensor = Variable(lineToTensor(line)) + category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long) + line_tensor = lineToTensor(line) return category, line, category_tensor, line_tensor for i in range(10): @@ -332,7 +328,7 @@ def train(category_tensor, line_tensor): for p in rnn.parameters(): p.data.add_(-learning_rate, p.grad.data) - return output, loss.data[0] + return output, loss.item() ###################################################################### @@ -466,17 +462,18 @@ def evaluate(line_tensor): def predict(input_line, n_predictions=3): print('\n> %s' % input_line) - output = evaluate(Variable(lineToTensor(input_line))) - - # Get top N categories - topv, topi = output.data.topk(n_predictions, 1, True) - predictions = [] - - for i in range(n_predictions): - value = topv[0][i] - category_index = topi[0][i] - print('(%.2f) %s' % (value, all_categories[category_index])) - predictions.append([value, all_categories[category_index]]) + with torch.no_grad(): + output = evaluate(lineToTensor(input_line)) + + # Get top N categories + topv, topi = output.topk(n_predictions, 1, True) + predictions = [] + + for i in range(n_predictions): + value = topv[0][i].item() + category_index = topi[0][i].item() + print('(%.2f) %s' % (value, all_categories[category_index])) + predictions.append([value, all_categories[category_index]]) predict('Dovesky') predict('Jackson') diff --git a/intermediate_source/char_rnn_generation_tutorial.py b/intermediate_source/char_rnn_generation_tutorial.py index e9a56a03fcb..4c38f37f849 100644 --- a/intermediate_source/char_rnn_generation_tutorial.py +++ b/intermediate_source/char_rnn_generation_tutorial.py @@ -109,6 +109,11 @@ def readLines(filename): n_categories = len(all_categories) +if n_categories == 0: + raise RuntimeError('Data not found. 
Make sure that you downloaded data ' + 'from https://download.pytorch.org/tutorial/data.zip and extract it to ' + 'the current directory.') + print('# categories:', n_categories, all_categories) print(unicodeToAscii("O'Néàl")) @@ -141,7 +146,6 @@ def readLines(filename): import torch import torch.nn as nn -from torch.autograd import Variable class RNN(nn.Module): def __init__(self, input_size, hidden_size, output_size): @@ -165,7 +169,7 @@ def forward(self, category, input, hidden): return output, hidden def initHidden(self): - return Variable(torch.zeros(1, self.hidden_size)) + return torch.zeros(1, self.hidden_size) ###################################################################### @@ -244,9 +248,9 @@ def targetTensor(line): # Make category, input, and target tensors from a random category, line pair def randomTrainingExample(): category, line = randomTrainingPair() - category_tensor = Variable(categoryTensor(category)) - input_line_tensor = Variable(inputTensor(line)) - target_line_tensor = Variable(targetTensor(line)) + category_tensor = categoryTensor(category) + input_line_tensor = inputTensor(line) + target_line_tensor = targetTensor(line) return category_tensor, input_line_tensor, target_line_tensor @@ -267,22 +271,24 @@ def randomTrainingExample(): learning_rate = 0.0005 def train(category_tensor, input_line_tensor, target_line_tensor): + target_line_tensor.unsqueeze_(-1) hidden = rnn.initHidden() rnn.zero_grad() loss = 0 - for i in range(input_line_tensor.size()[0]): + for i in range(input_line_tensor.size(0)): output, hidden = rnn(category_tensor, input_line_tensor[i], hidden) - loss += criterion(output, target_line_tensor[i]) + l = criterion(output, target_line_tensor[i]) + loss += l loss.backward() for p in rnn.parameters(): p.data.add_(-learning_rate, p.grad.data) - return output, loss.data[0] / input_line_tensor.size()[0] + return output, loss.item() / input_line_tensor.size(0) ###################################################################### @@ -374,24 +380,25 @@ def timeSince(since): # Sample from a category and starting letter def sample(category, start_letter='A'): - category_tensor = Variable(categoryTensor(category)) - input = Variable(inputTensor(start_letter)) - hidden = rnn.initHidden() - - output_name = start_letter - - for i in range(max_length): - output, hidden = rnn(category_tensor, input[0], hidden) - topv, topi = output.data.topk(1) - topi = topi[0][0] - if topi == n_letters - 1: - break - else: - letter = all_letters[topi] - output_name += letter - input = Variable(inputTensor(letter)) - - return output_name + with torch.no_grad(): # no need to track history in sampling + category_tensor = categoryTensor(category) + input = inputTensor(start_letter) + hidden = rnn.initHidden() + + output_name = start_letter + + for i in range(max_length): + output, hidden = rnn(category_tensor, input[0], hidden) + topv, topi = output.topk(1) + topi = topi[0][0] + if topi == n_letters - 1: + break + else: + letter = all_letters[topi] + output_name += letter + input = inputTensor(letter) + + return output_name # Get multiple samples from one category and multiple starting letters def samples(category, start_letters='ABC'): diff --git a/intermediate_source/dist_tuto.rst b/intermediate_source/dist_tuto.rst index 137d3c56c47..2d160a9d1da 100644 --- a/intermediate_source/dist_tuto.rst +++ b/intermediate_source/dist_tuto.rst @@ -144,10 +144,10 @@ return a ``DistributedRequest`` object upon which we can choose to When using immediates we have to be careful about with our 
usage of the sent and received tensors.
 Since we do not know when the data will be communicated to the other
 process, we should not modify the sent tensor nor access the received
 tensor before ``req.wait()`` has completed.
-In other words,
+In other words,
 
 - writing to ``tensor`` after ``dist.isend()`` will result in undefined behaviour.
-- reading from ``tensor`` after ``dist.irecv()`` will result in undefined behaviour.
+- reading from ``tensor`` after ``dist.irecv()`` will result in undefined behaviour.
 
 However, after ``req.wait()`` has been executed we are guaranteed that the
 communication took place,
 
@@ -202,7 +202,7 @@ to obtain the sum of all tensors at all processes, we can use the
     """ All-Reduce example."""
     def run(rank, size):
         """ Simple point-to-point communication. """
-        group = dist.new_group([0, 1])
+        group = dist.new_group([0, 1])
         tensor = torch.ones(1)
         dist.all_reduce(tensor, op=dist.reduce_op.SUM, group=group)
         print('Rank ', rank, ' has data ', tensor[0])
 
@@ -339,26 +339,25 @@ example `__.)
 
     """ Distributed Synchronous SGD Example """
     def run(rank, size):
-        torch.manual_seed(1234)
-        train_set, bsz = partition_dataset()
-        model = Net()
-        optimizer = optim.SGD(model.parameters(),
-                              lr=0.01, momentum=0.5)
-
-        num_batches = ceil(len(train_set.dataset) / float(bsz))
-        for epoch in range(10):
-            epoch_loss = 0.0
-            for data, target in train_set:
-                data, target = Variable(data), Variable(target)
-                optimizer.zero_grad()
-                output = model(data)
-                loss = F.nll_loss(output, target)
-                epoch_loss += loss.data[0]
-                loss.backward()
-                average_gradients(model)
-                optimizer.step()
-            print('Rank ', dist.get_rank(), ', epoch ',
-                  epoch, ': ', epoch_loss / num_batches)
+        torch.manual_seed(1234)
+        train_set, bsz = partition_dataset()
+        model = Net()
+        optimizer = optim.SGD(model.parameters(),
+                              lr=0.01, momentum=0.5)
+
+        num_batches = ceil(len(train_set.dataset) / float(bsz))
+        for epoch in range(10):
+            epoch_loss = 0.0
+            for data, target in train_set:
+                optimizer.zero_grad()
+                output = model(data)
+                loss = F.nll_loss(output, target)
+                epoch_loss += loss.item()
+                loss.backward()
+                average_gradients(model)
+                optimizer.step()
+            print('Rank ', dist.get_rank(), ', epoch ',
+                  epoch, ': ', epoch_loss / num_batches)
 
 It remains to implement the ``average_gradients(model)`` function, which
 simply takes in a model and averages its gradients across the whole
 
@@ -371,7 +370,7 @@ world.
         size = float(dist.get_world_size())
         for param in model.parameters():
             dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM)
-            param.grad.data /= size
+            param.grad.data /= size
 
 *Et voilà*! We successfully implemented distributed synchronous SGD and
 could train any model on a large computer cluster.
 
@@ -480,10 +479,9 @@ modifications:
 
 0. ``init_processes(rank, size, fn, backend='tcp')`` :math:`\rightarrow`
    ``init_processes(rank, size, fn, backend='gloo')``
-1. ``model = Net()`` :math:`\rightarrow` ``model = Net().cuda(rank)``
-2. ``data, target = Variable(data), Variable(target)``
-   :math:`\rightarrow`
-   ``data, target = Variable(data.cuda(rank)), Variable(target.cuda(rank))``
+1. Use ``device = torch.device("cuda:{}".format(rank))``
+2. ``model = Net()`` :math:`\rightarrow` ``model = Net().to(device)``
+3. Use ``data, target = data.to(device), target.to(device)``
 
 With the above modifications, our model is now training on two GPUs and
 you can monitor their utilization with ``watch nvidia-smi``.
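Putting those three changes together, the synchronous SGD ``run()`` from this file would look roughly like the sketch below. This is an illustration only, not part of the patch: it assumes the ``Net``, ``partition_dataset()`` and ``average_gradients()`` helpers defined earlier in the tutorial, the tutorial's imports (``dist``, ``optim``, ``F``, ``ceil``), and one visible GPU per rank::

    """ GPU variant of the synchronous SGD example (illustrative sketch). """
    def run(rank, size):
        torch.manual_seed(1234)
        device = torch.device("cuda:{}".format(rank))  # one GPU per process
        train_set, bsz = partition_dataset()
        model = Net().to(device)
        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

        num_batches = ceil(len(train_set.dataset) / float(bsz))
        for epoch in range(10):
            epoch_loss = 0.0
            for data, target in train_set:
                # move each mini-batch to this rank's GPU before the forward pass
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                epoch_loss += loss.item()
                loss.backward()
                average_gradients(model)  # all-reduce the gradients across ranks
                optimizer.step()
            print('Rank ', dist.get_rank(), ', epoch ',
                  epoch, ': ', epoch_loss / num_batches)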
diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 1046faaa6f2..a1301c1becf 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -68,7 +68,6 @@ import torch.nn as nn import torch.optim as optim import torch.nn.functional as F -from torch.autograd import Variable import torchvision.transforms as T @@ -82,11 +81,7 @@ plt.ion() # if gpu is to be used -use_cuda = torch.cuda.is_available() -FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor -LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor -ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor -Tensor = FloatTensor +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") ###################################################################### @@ -270,7 +265,7 @@ def get_screen(): screen = np.ascontiguousarray(screen, dtype=np.float32) / 255 screen = torch.from_numpy(screen) # Resize, and add a batch dimension (BCHW) - return resize(screen).unsqueeze(0).type(Tensor) + return resize(screen).unsqueeze(0).to(device) env.reset() @@ -290,9 +285,6 @@ def get_screen(): # This cell instantiates our model and its optimizer, and defines some # utilities: # -# - ``Variable`` - this is a simple wrapper around -# ``torch.autograd.Variable`` that will automatically send the data to -# the GPU every time we construct a Variable. # - ``select_action`` - will select an action accordingly to an epsilon # greedy policy. Simply put, we'll sometimes use our model for choosing # the action, and sometimes we'll just sample one uniformly. The @@ -313,15 +305,11 @@ def get_screen(): EPS_DECAY = 200 TARGET_UPDATE = 10 -policy_net = DQN() -target_net = DQN() +policy_net = DQN().to(device) +target_net = DQN().to(device) target_net.load_state_dict(policy_net.state_dict()) target_net.eval() -if use_cuda: - policy_net.cuda() - target_net.cuda() - optimizer = optim.RMSprop(policy_net.parameters()) memory = ReplayMemory(10000) @@ -336,10 +324,10 @@ def select_action(state): math.exp(-1. 
* steps_done / EPS_DECAY) steps_done += 1 if sample > eps_threshold: - return policy_net( - Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1) + with torch.no_grad(): + return policy_net(state).max(1)[1].view(1, 1) else: - return LongTensor([[random.randrange(2)]]) + return torch.tensor([[random.randrange(2)]], device=device, dtype=torch.long) episode_durations = [] @@ -348,7 +336,7 @@ def select_action(state): def plot_durations(): plt.figure(2) plt.clf() - durations_t = torch.FloatTensor(episode_durations) + durations_t = torch.tensor(episode_durations, dtype=torch.float) plt.title('Training...') plt.xlabel('Episode') plt.ylabel('Duration') @@ -392,29 +380,26 @@ def optimize_model(): batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements - non_final_mask = ByteTensor(tuple(map(lambda s: s is not None, - batch.next_state))) - non_final_next_states = Variable(torch.cat([s for s in batch.next_state - if s is not None]), - volatile=True) - state_batch = Variable(torch.cat(batch.state)) - action_batch = Variable(torch.cat(batch.action)) - reward_batch = Variable(torch.cat(batch.reward)) + non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, + batch.next_state)), device=device, dtype=torch.uint8) + non_final_next_states = torch.cat([s for s in batch.next_state + if s is not None]) + state_batch = torch.cat(batch.state) + action_batch = torch.cat(batch.action) + reward_batch = torch.cat(batch.reward) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken state_action_values = policy_net(state_batch).gather(1, action_batch) # Compute V(s_{t+1}) for all next states. - next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor)) - next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0] + next_state_values = torch.zeros(BATCH_SIZE, device=device) + next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach() # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch - # Undo volatility (which was used to prevent unnecessary gradients) - expected_state_action_values = Variable(expected_state_action_values.data) # Compute Huber loss - loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) + loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) # Optimize the model optimizer.zero_grad() @@ -427,7 +412,7 @@ def optimize_model(): ###################################################################### # # Below, you can find the main training loop. At the beginning we reset -# the environment and initialize the ``state`` variable. Then, we sample +# the environment and initialize the ``state`` Tensor. Then, we sample # an action, execute it, observe the next screen and the reward (always # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. 
@@ -446,8 +431,8 @@ def optimize_model(): for t in count(): # Select and perform an action action = select_action(state) - _, reward, done, _ = env.step(action[0, 0]) - reward = Tensor([reward]) + _, reward, done, _ = env.step(action.item()) + reward = torch.tensor([reward], device=device) # Observe new state last_screen = current_screen @@ -474,7 +459,7 @@ def optimize_model(): target_net.load_state_dict(policy_net.state_dict()) print('Complete') -env.render(close=True) +env.render() env.close() plt.ioff() plt.show() diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py index a5b90c31a95..4ee85b8db7b 100644 --- a/intermediate_source/seq2seq_translation_tutorial.py +++ b/intermediate_source/seq2seq_translation_tutorial.py @@ -92,11 +92,10 @@ import torch import torch.nn as nn -from torch.autograd import Variable from torch import optim import torch.nn.functional as F -use_cuda = torch.cuda.is_available() +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") ###################################################################### # Loading data files @@ -350,11 +349,7 @@ def forward(self, input, hidden): return output, hidden def initHidden(self): - result = Variable(torch.zeros(1, 1, self.hidden_size)) - if use_cuda: - return result.cuda() - else: - return result + return torch.zeros(1, 1, self.hidden_size, device=device) ###################################################################### # The Decoder @@ -402,11 +397,7 @@ def forward(self, input, hidden): return output, hidden def initHidden(self): - result = Variable(torch.zeros(1, 1, self.hidden_size)) - if use_cuda: - return result.cuda() - else: - return result + return torch.zeros(1, 1, self.hidden_size, device=device) ###################################################################### # I encourage you to train and observe the results of this model, but to @@ -480,11 +471,7 @@ def forward(self, input, hidden, encoder_outputs): return output, hidden, attn_weights def initHidden(self): - result = Variable(torch.zeros(1, 1, self.hidden_size)) - if use_cuda: - return result.cuda() - else: - return result + return torch.zeros(1, 1, self.hidden_size, device=device) ###################################################################### @@ -509,20 +496,16 @@ def indexesFromSentence(lang, sentence): return [lang.word2index[word] for word in sentence.split(' ')] -def variableFromSentence(lang, sentence): +def tensorFromSentence(lang, sentence): indexes = indexesFromSentence(lang, sentence) indexes.append(EOS_token) - result = Variable(torch.LongTensor(indexes).view(-1, 1)) - if use_cuda: - return result.cuda() - else: - return result + return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1) -def variablesFromPair(pair): - input_variable = variableFromSentence(input_lang, pair[0]) - target_variable = variableFromSentence(output_lang, pair[1]) - return (input_variable, target_variable) +def tensorsFromPair(pair): + input_tensor = tensorFromSentence(input_lang, pair[0]) + target_tensor = tensorFromSentence(output_lang, pair[1]) + return (input_tensor, target_tensor) ###################################################################### @@ -555,27 +538,25 @@ def variablesFromPair(pair): teacher_forcing_ratio = 0.5 -def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH): +def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, 
criterion, max_length=MAX_LENGTH): encoder_hidden = encoder.initHidden() encoder_optimizer.zero_grad() decoder_optimizer.zero_grad() - input_length = input_variable.size()[0] - target_length = target_variable.size()[0] + input_length = input_tensor.size(0) + target_length = target_tensor.size(0) - encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size)) - encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs + encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device) loss = 0 for ei in range(input_length): encoder_output, encoder_hidden = encoder( - input_variable[ei], encoder_hidden) - encoder_outputs[ei] = encoder_output[0][0] + input_tensor[ei], encoder_hidden) + encoder_outputs[ei] = encoder_output[0, 0] - decoder_input = Variable(torch.LongTensor([[SOS_token]])) - decoder_input = decoder_input.cuda() if use_cuda else decoder_input + decoder_input = torch.tensor([[SOS_token]], device=device) decoder_hidden = encoder_hidden @@ -586,22 +567,19 @@ def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, for di in range(target_length): decoder_output, decoder_hidden, decoder_attention = decoder( decoder_input, decoder_hidden, encoder_outputs) - loss += criterion(decoder_output, target_variable[di]) - decoder_input = target_variable[di] # Teacher forcing + loss += criterion(decoder_output, target_tensor[di]) + decoder_input = target_tensor[di] # Teacher forcing else: # Without teacher forcing: use its own predictions as the next input for di in range(target_length): decoder_output, decoder_hidden, decoder_attention = decoder( decoder_input, decoder_hidden, encoder_outputs) - topv, topi = decoder_output.data.topk(1) - ni = topi[0][0] + topv, topi = decoder_output.topk(1) + decoder_input = topi.squeeze().detach() # detach from history as input - decoder_input = Variable(torch.LongTensor([[ni]])) - decoder_input = decoder_input.cuda() if use_cuda else decoder_input - - loss += criterion(decoder_output, target_variable[di]) - if ni == EOS_token: + loss += criterion(decoder_output, target_tensor[di]) + if decoder_input.item() == EOS_token: break loss.backward() @@ -609,7 +587,7 @@ def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, encoder_optimizer.step() decoder_optimizer.step() - return loss.data[0] / target_length + return loss.item() / target_length ###################################################################### @@ -655,16 +633,16 @@ def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, lear encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate) - training_pairs = [variablesFromPair(random.choice(pairs)) + training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)] criterion = nn.NLLLoss() for iter in range(1, n_iters + 1): training_pair = training_pairs[iter - 1] - input_variable = training_pair[0] - target_variable = training_pair[1] + input_tensor = training_pair[0] + target_tensor = training_pair[1] - loss = train(input_variable, target_variable, encoder, + loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion) print_loss_total += loss plot_loss_total += loss @@ -692,6 +670,7 @@ def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, lear # import matplotlib.pyplot as plt +plt.switch_backend('agg') import matplotlib.ticker as ticker import numpy as np @@ -717,42 +696,39 @@ def 
showPlot(points):
 #
 
 def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
-    input_variable = variableFromSentence(input_lang, sentence)
-    input_length = input_variable.size()[0]
-    encoder_hidden = encoder.initHidden()
+    with torch.no_grad():
+        input_tensor = tensorFromSentence(input_lang, sentence)
+        input_length = input_tensor.size()[0]
+        encoder_hidden = encoder.initHidden()
 
-    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
-    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
+        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
 
-    for ei in range(input_length):
-        encoder_output, encoder_hidden = encoder(input_variable[ei],
-                                                 encoder_hidden)
-        encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]
+        for ei in range(input_length):
+            encoder_output, encoder_hidden = encoder(input_tensor[ei],
+                                                     encoder_hidden)
+            encoder_outputs[ei] += encoder_output[0, 0]
 
-    decoder_input = Variable(torch.LongTensor([[SOS_token]]))  # SOS
-    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
+        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
 
-    decoder_hidden = encoder_hidden
+        decoder_hidden = encoder_hidden
 
-    decoded_words = []
-    decoder_attentions = torch.zeros(max_length, max_length)
-
-    for di in range(max_length):
-        decoder_output, decoder_hidden, decoder_attention = decoder(
-            decoder_input, decoder_hidden, encoder_outputs)
-        decoder_attentions[di] = decoder_attention.data
-        topv, topi = decoder_output.data.topk(1)
-        ni = topi[0][0]
-        if ni == EOS_token:
-            decoded_words.append('<EOS>')
-            break
-        else:
-            decoded_words.append(output_lang.index2word[ni])
+        decoded_words = []
+        decoder_attentions = torch.zeros(max_length, max_length)
+
+        for di in range(max_length):
+            decoder_output, decoder_hidden, decoder_attention = decoder(
+                decoder_input, decoder_hidden, encoder_outputs)
+            decoder_attentions[di] = decoder_attention.data
+            topv, topi = decoder_output.data.topk(1)
+            if topi.item() == EOS_token:
+                decoded_words.append('<EOS>')
+                break
+            else:
+                decoded_words.append(output_lang.index2word[topi.item()])
 
-        decoder_input = Variable(torch.LongTensor([[ni]]))
-        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
+            decoder_input = topi.squeeze().detach()
 
-    return decoded_words, decoder_attentions[:di + 1]
+        return decoded_words, decoder_attentions[:di + 1]
 
 
 ######################################################################
@@ -791,13 +767,8 @@ def evaluateRandomly(encoder, decoder, n=10):
 #
 
 hidden_size = 256
-encoder1 = EncoderRNN(input_lang.n_words, hidden_size)
-attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1)
-
-
-if use_cuda:
-    encoder1 = encoder1.cuda()
-    attn_decoder1 = attn_decoder1.cuda()
+encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
+attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
 
 trainIters(encoder1, attn_decoder1, 75000, print_every=5000)
 
diff --git a/intermediate_source/spatial_transformer_tutorial.py b/intermediate_source/spatial_transformer_tutorial.py
index 277ec60a9bf..0afd366bf74 100644
--- a/intermediate_source/spatial_transformer_tutorial.py
+++ b/intermediate_source/spatial_transformer_tutorial.py
@@ -34,7 +34,6 @@
 import torch.optim as optim
 import torchvision
 from torchvision import datasets, transforms
-from torch.autograd import Variable
 
 import matplotlib.pyplot as plt
 import numpy as np
 
@@ -48,7 +47,7 @@
 # standard convolutional network augmented with a spatial 
transformer # network. -use_cuda = torch.cuda.is_available() +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Training dataset train_loader = torch.utils.data.DataLoader( @@ -114,8 +113,8 @@ def __init__(self): ) # Initialize the weights/bias with identity transformation - self.fc_loc[2].weight.data.fill_(0) - self.fc_loc[2].bias.data = torch.FloatTensor([1, 0, 0, 0, 1, 0]) + self.fc_loc[2].weight.data.zero_() + self.fc_loc[2].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float)) # Spatial transformer network forward function def stn(self, x): @@ -143,9 +142,7 @@ def forward(self, x): return F.log_softmax(x, dim=1) -model = Net() -if use_cuda: - model.cuda() +model = Net().to(device) ###################################################################### # Training the model @@ -162,10 +159,8 @@ def forward(self, x): def train(epoch): model.train() for batch_idx, (data, target) in enumerate(train_loader): - if use_cuda: - data, target = data.cuda(), target.cuda() + data, target = data.to(device), target.to(device) - data, target = Variable(data), Variable(target) optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) @@ -174,32 +169,31 @@ def train(epoch): if batch_idx % 500 == 0: print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.dataset), - 100. * batch_idx / len(train_loader), loss.data[0])) + 100. * batch_idx / len(train_loader), loss.item())) # # A simple test procedure to measure STN the performances on MNIST. # def test(): - model.eval() - test_loss = 0 - correct = 0 - for data, target in test_loader: - if use_cuda: - data, target = data.cuda(), target.cuda() - data, target = Variable(data, volatile=True), Variable(target) - output = model(data) - - # sum up batch loss - test_loss += F.nll_loss(output, target, size_average=False).data[0] - # get the index of the max log-probability - pred = output.data.max(1, keepdim=True)[1] - correct += pred.eq(target.data.view_as(pred)).cpu().sum() - - test_loss /= len(test_loader.dataset) - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n' - .format(test_loss, correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) + with torch.no_grad(): + model.eval() + test_loss = 0 + correct = 0 + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + + # sum up batch loss + test_loss += F.nll_loss(output, target, size_average=False).item() + # get the index of the max log-probability + pred = output.max(1, keepdim=True)[1] + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n' + .format(test_loss, correct, len(test_loader.dataset), + 100. 
* correct / len(test_loader.dataset))) ###################################################################### # Visualizing the STN results @@ -227,29 +221,26 @@ def convert_image_np(inp): def visualize_stn(): - # Get a batch of training data - data, _ = next(iter(test_loader)) - data = Variable(data, volatile=True) - - if use_cuda: - data = data.cuda() + with torch.no_grad(): + # Get a batch of training data + data = next(iter(test_loader))[0].to(device) - input_tensor = data.cpu().data - transformed_input_tensor = model.stn(data).cpu().data + input_tensor = data.cpu() + transformed_input_tensor = model.stn(data).cpu() - in_grid = convert_image_np( - torchvision.utils.make_grid(input_tensor)) + in_grid = convert_image_np( + torchvision.utils.make_grid(input_tensor)) - out_grid = convert_image_np( - torchvision.utils.make_grid(transformed_input_tensor)) + out_grid = convert_image_np( + torchvision.utils.make_grid(transformed_input_tensor)) - # Plot the results side-by-side - f, axarr = plt.subplots(1, 2) - axarr[0].imshow(in_grid) - axarr[0].set_title('Dataset Images') + # Plot the results side-by-side + f, axarr = plt.subplots(1, 2) + axarr[0].imshow(in_grid) + axarr[0].set_title('Dataset Images') - axarr[1].imshow(out_grid) - axarr[1].set_title('Transformed Images') + axarr[1].imshow(out_grid) + axarr[1].set_title('Transformed Images') for epoch in range(1, 20 + 1):