
Commit 996b58b

paper links
1 parent ff0d5c0 commit 996b58b


70 files changed, +92 -92 lines changed
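
Every change in this commit follows the same mechanical pattern: a link of the form `https://arxiv.org/abs/<id>` becomes `https://papers.labml.ai/paper/<id>`. A minimal sketch of how such a bulk rewrite could be scripted is shown below; the `rewrite_links` helper, the `labml_nn` root default, and the `.py`/`.md` file filter are illustrative assumptions, not part of the commit itself.

```python
# Hypothetical helper (not from this commit): rewrite arXiv abstract links to
# papers.labml.ai links, matching the pattern visible in the diffs below:
#   https://arxiv.org/abs/<id>  ->  https://papers.labml.ai/paper/<id>
import re
from pathlib import Path

ARXIV_LINK = re.compile(r"https://arxiv\.org/abs/(\d{4}\.\d{4,5}(?:v\d+)?)")


def rewrite_links(root: str = "labml_nn") -> int:
    """Rewrite links in every .py and .md file under `root`; return the number of files changed."""
    changed = 0
    for path in Path(root).rglob("*"):
        if path.suffix not in {".py", ".md"}:
            continue
        text = path.read_text(encoding="utf-8")
        new_text = ARXIV_LINK.sub(r"https://papers.labml.ai/paper/\1", text)
        if new_text != text:
            path.write_text(new_text, encoding="utf-8")
            changed += 1
    return changed


if __name__ == "__main__":
    print(f"{rewrite_links()} files updated")
```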

labml_nn/capsule_networks/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 # Capsule Networks
 
 This is a [PyTorch](https://pytorch.org) implementation/tutorial of
-[Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829).
+[Dynamic Routing Between Capsules](https://papers.labml.ai/paper/1710.09829).
 
 Capsule network is a neural network architecture that embeds features
 as capsules and routes them with a voting mechanism to next layer of capsules.

labml_nn/capsule_networks/mnist.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 This is an annotated PyTorch code to classify MNIST digits with PyTorch.
 
 This paper implements the experiment described in paper
-[Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829).
+[Dynamic Routing Between Capsules](https://papers.labml.ai/paper/1710.09829).
 """
 from typing import Any

labml_nn/capsule_networks/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # [Capsule Networks](https://nn.labml.ai/capsule_networks/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation/tutorial of
-[Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829).
+[Dynamic Routing Between Capsules](https://papers.labml.ai/paper/1710.09829).
 
 Capsule network is a neural network architecture that embeds features
 as capsules and routes them with a voting mechanism to next layer of capsules.

labml_nn/gan/cycle_gan/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 # Cycle GAN
 
 This is a [PyTorch](https://pytorch.org) implementation/tutorial of the paper
-[Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://arxiv.org/abs/1703.10593).
+[Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://papers.labml.ai/paper/1703.10593).
 
 I've taken pieces of code from [eriklindernoren/PyTorch-GAN](https://github.com/eriklindernoren/PyTorch-GAN).
 It is a very good resource if you want to checkout other GAN variations too.

labml_nn/gan/cycle_gan/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # [Cycle GAN](https://nn.labml.ai/gan/cycle_gan/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation/tutorial of the paper
-[Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://arxiv.org/abs/1703.10593).
+[Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://papers.labml.ai/paper/1703.10593).

labml_nn/gan/dcgan/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 # Deep Convolutional Generative Adversarial Networks (DCGAN)
 
 This is a [PyTorch](https://pytorch.org) implementation of paper
-[Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434).
+[Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://papers.labml.ai/paper/1511.06434).
 
 This implementation is based on the [PyTorch DCGAN Tutorial](https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html).
 """

labml_nn/gan/dcgan/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # [Deep Convolutional Generative Adversarial Networks - DCGAN](https://nn.labml.ai/gan/dcgan/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation of paper
-[Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434).
+[Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://papers.labml.ai/paper/1511.06434).

labml_nn/gan/original/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 # Generative Adversarial Networks (GAN)
 
 This is an implementation of
-[Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+[Generative Adversarial Networks](https://papers.labml.ai/paper/1406.2661).
 
 The generator, $G(\pmb{z}; \theta_g)$ generates samples that match the
 distribution of data, while the discriminator, $D(\pmb{x}; \theta_g)$

labml_nn/gan/original/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # [Generative Adversarial Networks - GAN](https://nn.labml.ai/gan/original/index.html)
 
 This is an annotated implementation of
-[Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+[Generative Adversarial Networks](https://papers.labml.ai/paper/1406.2661).

labml_nn/gan/stylegan/__init__.py

Lines changed: 6 additions & 6 deletions
@@ -8,12 +8,12 @@
 # StyleGAN 2
 
 This is a [PyTorch](https://pytorch.org) implementation of the paper
-[Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
+[Analyzing and Improving the Image Quality of StyleGAN](https://papers.labml.ai/paper/1912.04958)
 which introduces **StyleGAN 2**.
 StyleGAN 2 is an improvement over **StyleGAN** from the paper
-[A Style-Based Generator Architecture for Generative Adversarial Networks](https://arxiv.org/abs/1812.04948).
+[A Style-Based Generator Architecture for Generative Adversarial Networks](https://papers.labml.ai/paper/1812.04948).
 And StyleGAN is based on **Progressive GAN** from the paper
-[Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://arxiv.org/abs/1710.10196).
+[Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://papers.labml.ai/paper/1710.10196).
 All three papers are from the same authors from [NVIDIA AI](https://twitter.com/NVIDIAAI).
 
 *Our implementation is a minimalistic StyleGAN 2 model training code.
@@ -650,7 +650,7 @@ class DownSample(nn.Module):
     The down-sample operation [smoothens](#smooth) each feature channel and
     scale $2 \times$ using bilinear interpolation.
     This is based on the paper
-    [Making Convolutional Networks Shift-Invariant Again](https://arxiv.org/abs/1904.11486).
+    [Making Convolutional Networks Shift-Invariant Again](https://papers.labml.ai/paper/1904.11486).
     """
 
     def __init__(self):
@@ -672,7 +672,7 @@ class UpSample(nn.Module):
 
     The up-sample operation scales the image up by $2 \times$ and [smoothens](#smooth) each feature channel.
     This is based on the paper
-    [Making Convolutional Networks Shift-Invariant Again](https://arxiv.org/abs/1904.11486).
+    [Making Convolutional Networks Shift-Invariant Again](https://papers.labml.ai/paper/1904.11486).
     """
 
     def __init__(self):
@@ -824,7 +824,7 @@ class GradientPenalty(nn.Module):
     ## Gradient Penalty
 
     This is the $R_1$ regularization penality from the paper
-    [Which Training Methods for GANs do actually Converge?](https://arxiv.org/abs/1801.04406).
+    [Which Training Methods for GANs do actually Converge?](https://papers.labml.ai/paper/1801.04406).
 
     $$R_1(\psi) = \frac{\gamma}{2} \mathbb{E}_{p_\mathcal{D}(x)}
     \Big[\Vert \nabla_x D_\psi(x)^2 \Vert\Big]$$
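
The hunk above quotes the $R_1$ regularization used by StyleGAN 2's `GradientPenalty` module. For reference, here is a minimal sketch of that penalty; it is an illustrative reimplementation under assumed names (`discriminator`, `x`, `gamma`), not the code from this repository.

```python
# Illustrative sketch of the R1 gradient penalty, not the repository's implementation.
# R1(psi) = gamma/2 * E_{x ~ p_D}[ ||grad_x D_psi(x)||^2 ], evaluated on real images.
import torch


def r1_penalty(discriminator: torch.nn.Module, x: torch.Tensor, gamma: float = 10.0) -> torch.Tensor:
    x = x.detach().requires_grad_(True)      # real images, with gradients w.r.t. the inputs enabled
    d_out = discriminator(x)                 # discriminator scores for the real batch
    grad, = torch.autograd.grad(d_out.sum(), x, create_graph=True)
    grad_sq = grad.reshape(grad.shape[0], -1).pow(2).sum(dim=1)   # squared gradient norm per sample
    return 0.5 * gamma * grad_sq.mean()
```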

labml_nn/gan/stylegan/readme.md

Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,10 @@
 # [StyleGAN 2](https://nn.labml.ai/gan/stylegan/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation of the paper
-[Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
+[Analyzing and Improving the Image Quality of StyleGAN](https://papers.labml.ai/paper/1912.04958)
 which introduces **StyleGAN2**.
 StyleGAN 2 is an improvement over **StyleGAN** from the paper
-[A Style-Based Generator Architecture for Generative Adversarial Networks](https://arxiv.org/abs/1812.04948).
+[A Style-Based Generator Architecture for Generative Adversarial Networks](https://papers.labml.ai/paper/1812.04948).
 And StyleGAN is based on **Progressive GAN** from the paper
-[Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://arxiv.org/abs/1710.10196).
+[Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://papers.labml.ai/paper/1710.10196).
 All three papers are from the same authors from [NVIDIA AI](https://twitter.com/NVIDIAAI).

labml_nn/gan/wasserstein/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 # Wasserstein GAN (WGAN)
 
 This is an implementation of
-[Wasserstein GAN](https://arxiv.org/abs/1701.07875).
+[Wasserstein GAN](https://papers.labml.ai/paper/1701.07875).
 
 The original GAN loss is based on Jensen-Shannon (JS) divergence
 between the real distribution $\mathbb{P}_r$ and generated distribution $\mathbb{P}_g$.

labml_nn/gan/wasserstein/gradient_penalty/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@
 # Gradient Penalty for Wasserstein GAN (WGAN-GP)
 
 This is an implementation of
-[Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028).
+[Improved Training of Wasserstein GANs](https://papers.labml.ai/paper/1704.00028).
 
 [WGAN](../index.html) suggests clipping weights to enforce Lipschitz constraint
 on the discriminator network (critic).
@@ -19,7 +19,7 @@
 1. Limiting the capacity of the discriminator
 2. Exploding and vanishing gradients (without [Batch Normalization](../../../normalization/batch_norm/index.html)).
 
-The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028)
+The paper [Improved Training of Wasserstein GANs](https://papers.labml.ai/paper/1704.00028)
 proposal a better way to improve Lipschitz constraint, a gradient penalty.
 
 $$\mathcal{L}_{GP} = \lambda \underset{\hat{x} \sim \mathbb{P}_{\hat{x}}}{\mathbb{E}}
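
The docstring above introduces the WGAN-GP penalty $\mathcal{L}_{GP}$ (the formula is cut off by the diff context). For reference, a minimal sketch of the standard formulation, which penalizes $(\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2 - 1)^2$ at random interpolations $\hat{x}$ of real and generated samples; `critic`, `real`, `fake`, and `lambda_gp` are assumed names, and this is not the repository's code.

```python
# Illustrative sketch of the standard WGAN-GP gradient penalty, not the repository's
# implementation. Assumes 4-D image batches of shape (N, C, H, W).
import torch


def gradient_penalty(critic: torch.nn.Module, real: torch.Tensor, fake: torch.Tensor,
                     lambda_gp: float = 10.0) -> torch.Tensor:
    eps = torch.rand(real.shape[0], 1, 1, 1, device=real.device)           # per-sample mixing coefficient
    x_hat = (eps * real + (1 - eps) * fake.detach()).requires_grad_(True)  # interpolated samples
    d_hat = critic(x_hat)
    grad, = torch.autograd.grad(d_hat.sum(), x_hat, create_graph=True)
    grad_norm = grad.reshape(grad.shape[0], -1).norm(2, dim=1)             # gradient norm per sample
    return lambda_gp * ((grad_norm - 1.0) ** 2).mean()
```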

labml_nn/gan/wasserstein/gradient_penalty/readme.md

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 # [Gradient Penalty for Wasserstein GAN (WGAN-GP)](https://nn.labml.ai/gan/wasserstein/gradient_penalty/index.html)
 
 This is an implementation of
-[Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028).
+[Improved Training of Wasserstein GANs](https://papers.labml.ai/paper/1704.00028).
 
 [WGAN](https://nn.labml.ai/gan/wasserstein/index.html) suggests
 clipping weights to enforce Lipschitz constraint
@@ -12,5 +12,5 @@ L1, L2 weight decay have problems:
 1. Limiting the capacity of the discriminator
 2. Exploding and vanishing gradients (without [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)).
 
-The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028)
+The paper [Improved Training of Wasserstein GANs](https://papers.labml.ai/paper/1704.00028)
 proposal a better way to improve Lipschitz constraint, a gradient penalty.

labml_nn/gan/wasserstein/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # [Wasserstein GAN - WGAN](https://nn.labml.ai/gan/wasserstein/index.html)
 
 This is an implementation of
-[Wasserstein GAN](https://arxiv.org/abs/1701.07875).
+[Wasserstein GAN](https://papers.labml.ai/paper/1701.07875).

labml_nn/graphs/gat/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 # Graph Attention Networks (GAT)
 
 This is a [PyTorch](https://pytorch.org) implementation of the paper
-[Graph Attention Networks](https://arxiv.org/abs/1710.10903).
+[Graph Attention Networks](https://papers.labml.ai/paper/1710.10903).
 
 GATs work on graph data.
 A graph consists of nodes and edges connecting nodes.

labml_nn/graphs/gat/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # [Graph Attention Networks (GAT)](https://nn.labml.ai/graphs/gat/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation of the paper
-[Graph Attention Networks](https://arxiv.org/abs/1710.10903).
+[Graph Attention Networks](https://papers.labml.ai/paper/1710.10903).
 
 GATs work on graph data.
 A graph consists of nodes and edges connecting nodes.

labml_nn/graphs/gatv2/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 ---
 # Graph Attention Networks v2 (GATv2)
 This is a [PyTorch](https://pytorch.org) implementation of the GATv2 operator from the paper
-[How Attentive are Graph Attention Networks?](https://arxiv.org/abs/2105.14491).
+[How Attentive are Graph Attention Networks?](https://papers.labml.ai/paper/2105.14491).
 
 GATv2s work on graph data similar to [GAT](../gat/index.html).
 A graph consists of nodes and edges connecting nodes.

labml_nn/graphs/gatv2/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # [Graph Attention Networks v2 (GATv2)](https://nn.labml.ai/graphs/gatv2/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation of the GATv2 operator from the paper
-[How Attentive are Graph Attention Networks?](https://arxiv.org/abs/2105.14491).
+[How Attentive are Graph Attention Networks?](https://papers.labml.ai/paper/2105.14491).
 
 GATv2s work on graph data.
 A graph consists of nodes and edges connecting nodes.

labml_nn/hypernetworks/hyper_lstm.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 # HyperNetworks - HyperLSTM
 
 We have implemented HyperLSTM introduced in paper
-[HyperNetworks](https://arxiv.org/abs/1609.09106), with annotations
+[HyperNetworks](https://papers.labml.ai/paper/1609.09106), with annotations
 using [PyTorch](https://pytorch.org).
 [This blog post](https://blog.otoro.net/2016/09/28/hyper-networks/)
 by David Ha gives a good explanation of HyperNetworks.

labml_nn/normalization/batch_channel_norm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 # Batch-Channel Normalization
 
 This is a [PyTorch](https://pytorch.org) implementation of Batch-Channel Normalization from the paper
-[Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520).
+[Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://papers.labml.ai/paper/1903.10520).
 We also have an [annotated implementation of Weight Standardization](../weight_standardization/index.html).
 
 Batch-Channel Normalization performs batch normalization followed

labml_nn/normalization/batch_norm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 # Batch Normalization
 
 This is a [PyTorch](https://pytorch.org) implementation of Batch Normalization from paper
-[Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167).
+[Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://papers.labml.ai/paper/1502.03167).
 
 ### Internal Covariate Shift

labml_nn/normalization/batch_norm/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation of Batch Normalization from paper
-[Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167).
+[Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://papers.labml.ai/paper/1502.03167).
 
 ### Internal Covariate Shift

labml_nn/normalization/group_norm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 # Group Normalization
 
 This is a [PyTorch](https://pytorch.org) implementation of
-the [Group Normalization](https://arxiv.org/abs/1803.08494) paper.
+the [Group Normalization](https://papers.labml.ai/paper/1803.08494) paper.
 
 [Batch Normalization](../batch_norm/index.html) works well for large enough batch sizes
 but not well for small batch sizes, because it normalizes over the batch.

labml_nn/normalization/group_norm/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # [Group Normalization](https://nn.labml.ai/normalization/group_norm/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation of
-the [Group Normalization](https://arxiv.org/abs/1803.08494) paper.
+the [Group Normalization](https://papers.labml.ai/paper/1803.08494) paper.
 
 [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) works well for large enough batch sizes
 but not well for small batch sizes, because it normalizes over the batch.

labml_nn/normalization/instance_norm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 # Instance Normalization
 
 This is a [PyTorch](https://pytorch.org) implementation of
-[Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
+[Instance Normalization: The Missing Ingredient for Fast Stylization](https://papers.labml.ai/paper/1607.08022).
 
 Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer).
 It is based on the observation that stylization should not depend on the contrast of the content image.

labml_nn/normalization/instance_norm/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # [Instance Normalization](https://nn.labml.ai/normalization/instance_norm/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation of
-[Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
+[Instance Normalization: The Missing Ingredient for Fast Stylization](https://papers.labml.ai/paper/1607.08022).
 
 Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer).
 It is based on the observation that stylization should not depend on the contrast of the content image.

labml_nn/normalization/layer_norm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 # Layer Normalization
 
 This is a [PyTorch](https://pytorch.org) implementation of
-[Layer Normalization](https://arxiv.org/abs/1607.06450).
+[Layer Normalization](https://papers.labml.ai/paper/1607.06450).
 
 ### Limitations of [Batch Normalization](../batch_norm/index.html)

labml_nn/normalization/layer_norm/readme.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # [Layer Normalization](https://nn.labml.ai/normalization/layer_norm/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation of
-[Layer Normalization](https://arxiv.org/abs/1607.06450).
+[Layer Normalization](https://papers.labml.ai/paper/1607.06450).
 
 ### Limitations of [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)

labml_nn/normalization/weight_standardization/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 # Weight Standardization
 
 This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper
-[Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520).
+[Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://papers.labml.ai/paper/1903.10520).
 We also have an [annotated implementation of Batch-Channel Normalization](../batch_channel_norm/index.html).
 
 Batch normalization **gives a smooth loss landscape** and
@@ -36,7 +36,7 @@
 This avoids outputs of nodes from always falling beyond the active range of the activation function
 (e.g. always negative input for a ReLU).
 
-*[Refer to the paper for proofs](https://arxiv.org/abs/1903.10520)*.
+*[Refer to the paper for proofs](https://papers.labml.ai/paper/1903.10520)*.
 
 Here is [the training code](experiment.html) for training
 a VGG network that uses weight standardization to classify CIFAR-10 data.

labml_nn/normalization/weight_standardization/readme.md

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 # [Weight Standardization](https://nn.labml.ai/normalization/weight_standardization/index.html)
 
 This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper
-[Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520).
+[Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://papers.labml.ai/paper/1903.10520).
 We also have an
 [annotated implementation of Batch-Channel Normalization](https://nn.labml.ai/normalization/batch_channel_norm/index.html).

labml_nn/optimizers/ada_belief.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 This is based from AdaBelief
 [official implementation](https://github.com/juntang-zhuang/Adabelief-Optimizer)
 of the paper
-[AdaBelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients](https://arxiv.org/abs/2010.07468).
+[AdaBelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients](https://papers.labml.ai/paper/2010.07468).
 
 This is implemented in [PyTorch](https://pytorch.org) as an extension to [RAdam](radam.html).

labml_nn/optimizers/adam.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 # Adam Optimizer
 
 This is a [PyTorch](https://pytorch.org) implementation of popular optimizer *Adam* from paper
-[Adam: A Method for Stochastic Optimization](https://arxiv.org/abs/1412.6980v9).
+[Adam: A Method for Stochastic Optimization](https://papers.labml.ai/paper/1412.6980v9).
 
 *Adam* update is,

labml_nn/optimizers/amsgrad.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 # AMSGrad
 
 This is a [PyTorch](https://pytorch.org) implementation of the paper
-[On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237).
+[On the Convergence of Adam and Beyond](https://papers.labml.ai/paper/1904.09237).
 
 We implement this as an extension to our [Adam optimizer implementation](adam.html).
 The implementation it self is really small since it's very similar to Adam.

labml_nn/optimizers/noam.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 # Noam Optimizer
 
 This is the [PyTorch](https://pytorch.org) implementation of optimizer introduced in the paper
-[Attention Is All You Need](https://arxiv.org/abs/1706.03762).
+[Attention Is All You Need](https://papers.labml.ai/paper/1706.03762).
 """
 from typing import Dict

labml_nn/optimizers/radam.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 This implementation is based on
 [the official implementation](https://github.com/LiyuanLucasLiu/RAdam)
 of the paper
-[On the Variance of the Adaptive Learning Rate and Beyond](https://arxiv.org/abs/1908.03265).
+[On the Variance of the Adaptive Learning Rate and Beyond](https://papers.labml.ai/paper/1908.03265).
 
 We have implemented it in [PyTorch](https://pytorch.org)
 as an extension to [our AMSGrad implementation](amsgrad.html)

labml_nn/recurrent_highway_networks/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
 # Recurrent Highway Networks
 
-This is a [PyTorch](https://pytorch.org) implementation of [Recurrent Highway Networks](https://arxiv.org/abs/1607.03474).
+This is a [PyTorch](https://pytorch.org) implementation of [Recurrent Highway Networks](https://papers.labml.ai/paper/1607.03474).
 """
 from typing import Optional

labml_nn/resnet/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 # Deep Residual Learning for Image Recognition (ResNet)
 
 This is a [PyTorch](https://pytorch.org) implementation of the paper
-[Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385).
+[Deep Residual Learning for Image Recognition](https://papers.labml.ai/paper/1512.03385).
 
 ResNets train layers as residual functions to overcome the
 *degradation problem*.
