diff --git a/.actions/assistant.py b/.actions/assistant.py
index 93019ee9c..66fedcf49 100644
--- a/.actions/assistant.py
+++ b/.actions/assistant.py
@@ -462,7 +462,7 @@ def _replace_images(lines: list, local_dir: str) -> list:
     @staticmethod
     def _is_ipynb_parent_dir(dir_path: str) -> bool:
-        """Determine in recursive fasion of a folder is valid notebook file or any of sub-folders is."""
+        """Determine in recursive fashion if a folder is a valid notebook file or any of its sub-folders is."""
         if AssistantCLI._find_meta(dir_path):
             return True
         sub_dirs = [d for d in glob.glob(os.path.join(dir_path, "*")) if os.path.isdir(d)]
@@ -702,7 +702,7 @@ def list_dirs(folder: str = "", include_file_ext: str = "") -> str:
         dirs += glob.glob(os.path.join(folder, "**", "*" + include_file_ext))
         if include_file_ext:
             _ignore_base_dir = lambda p: os.path.sep.join(p.split(os.path.sep)[1:])  # noqa: E731
-            # Take the notebook as a folder (notebook are on teh same level as the raw tutorial file mix)
+            # Take the notebook as a folder (notebooks are on the same level as the raw tutorial file mix)
             dirs = [os.path.splitext(_ignore_base_dir(p))[0] for p in dirs]
         else:
             dirs = [p for p in dirs if os.path.isdir(p)]
diff --git a/.github/workflows/ci_block-ipybn.yml b/.github/workflows/ci_block-ipybn.yml
index 73fd2e0c4..7de74ce2c 100644
--- a/.github/workflows/ci_block-ipybn.yml
+++ b/.github/workflows/ci_block-ipybn.yml
@@ -1,4 +1,4 @@
-name: Prevent adding/chnaging notebooks
+name: Prevent adding/changing notebooks
 # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
 on: # Trigger the workflow on PR to master
diff --git a/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py b/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py
index be0ca7e87..6ecf00440 100644
--- a/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py
+++ b/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py
@@ -735,7 +735,7 @@ def visualize_samples(data, label):
 # For instance, for BCE, PyTorch has two modules: `nn.BCELoss()`, `nn.BCEWithLogitsLoss()`.
 # While `nn.BCELoss` expects the inputs $x$ to be in the range $[0,1]$, i.e. the output of a sigmoid, `nn.BCEWithLogitsLoss` combines a sigmoid layer and the BCE loss in a single class.
 # This version is numerically more stable than using a plain Sigmoid followed by a BCE loss because of the logarithms applied in the loss function.
-# Hence, it is adviced to use loss functions applied on "logits" where possible (remember to not apply a sigmoid on the output of the model in this case!).
+# Hence, it is advised to use loss functions applied on "logits" where possible (remember to not apply a sigmoid on the output of the model in this case!).
 # For our model defined above, we therefore use the module `nn.BCEWithLogitsLoss`.
 # %%
@@ -982,7 +982,7 @@ def visualize_classification(model, data, label):
 # Finally, you are all set to start with your own PyTorch project!
 # In summary, we have looked at how we can build neural networks in PyTorch, and train and test them on data.
 # However, there is still much more to PyTorch we haven't discussed yet.
-# In the comming series of Jupyter notebooks, we will discover more and more functionalities of PyTorch, so that you also get familiar to PyTorch concepts beyond the basics.
+# In the coming series of Jupyter notebooks, we will discover more and more functionalities of PyTorch, so that you also get familiar with PyTorch concepts beyond the basics.
 # If you are already interested in learning more of PyTorch, we recommend the official [tutorial website](https://pytorch.org/tutorials/) that contains many tutorials on various topics.
 # Especially logging with Tensorboard ([tutorial
 # here](https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html))
diff --git a/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py b/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py
index ea8788ed5..cd8c470b8 100644
--- a/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py
+++ b/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py
@@ -33,7 +33,7 @@
 sns.set()
 # %% [markdown]
-# Instead of the `set_seed` function as in Tutorial 3, we can use Lightning's build-in function `L.seed_everything`.
+# Instead of the `set_seed` function as in Tutorial 3, we can use Lightning's built-in function `L.seed_everything`.
 # We will reuse the path variables `DATASET_PATH` and `CHECKPOINT_PATH` as in Tutorial 3.
 # Adjust the paths if necessary.
@@ -416,7 +416,7 @@ def var_init(model, std=0.01):
 # Actually, as $b$ is a single element per output neuron and is constant across different inputs, we set it to 0 overall.
 #
 # Next, we need to calculate the variance with which we need to initialize the weight parameters.
-# Along the calculation, we will need to following variance rule: given two independent variables, the variance of their product is $\text{Var}(X\cdot Y) = \mathbb{E}(Y)^2\text{Var}(X) + \mathbb{E}(X)^2\text{Var}(Y) + \text{Var}(X)\text{Var}(Y) = \mathbb{E}(Y^2)\mathbb{E}(X^2)-\mathbb{E}(Y)^2\mathbb{E}(X)^2$ ($X$ and $Y$ are not refering to $x$ and $y$, but any random variable).
+# Along the calculation, we will need the following variance rule: given two independent variables, the variance of their product is $\text{Var}(X\cdot Y) = \mathbb{E}(Y)^2\text{Var}(X) + \mathbb{E}(X)^2\text{Var}(Y) + \text{Var}(X)\text{Var}(Y) = \mathbb{E}(Y^2)\mathbb{E}(X^2)-\mathbb{E}(Y)^2\mathbb{E}(X)^2$ ($X$ and $Y$ are not referring to $x$ and $y$, but any random variable).
 #
 # The needed variance of the weights, $\text{Var}(w_{ij})$, is calculated as follows:
 #
diff --git a/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py b/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py
index 5d5def356..f6e1cb84b 100644
--- a/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py
+++ b/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py
@@ -372,7 +372,7 @@ def train_model(model_name, save_name=None, **kwargs):
         # Automatically loads the model with the saved hyperparameters
         model = CIFARModule.load_from_checkpoint(pretrained_filename)
     else:
-        L.seed_everything(42)  # To be reproducable
+        L.seed_everything(42)  # To be reproducible
         model = CIFARModule(model_name=model_name, **kwargs)
         trainer.fit(model, train_loader, val_loader)
         model = CIFARModule.load_from_checkpoint(
diff --git a/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py b/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py
index 753b368db..71000a366 100644
--- a/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py
+++ b/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py
@@ -203,7 +203,7 @@
 #
 # One aspect we haven't discussed yet is the scaling factor of $1/\sqrt{d_k}$.
 # This scaling factor is crucial to maintain an appropriate variance of attention values after initialization.
-# Remember that we intialize our layers with the intention of having equal variance throughout the model, and hence,
+# Remember that we initialize our layers with the intention of having equal variance throughout the model, and hence,
 # $Q$ and $K$ might also have a variance close to $1$.
 # However, performing a dot product over two vectors with a variance $\sigma$ results
 # in a scalar having $d_k$-times higher variance:
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py b/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py
index 862653362..c688ac5e2 100644
--- a/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py
+++ b/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py
@@ -703,7 +703,7 @@ def train_model(**kwargs):
 # ### Image Generation
 #
 # Another way of evaluating generative models is by sampling a few generated images.
-# Generative models need to be good at generating realistic images as this truely shows that they have modeled the true data distribution.
+# Generative models need to be good at generating realistic images as this truly shows that they have modeled the true data distribution.
 # Thus, let's sample a few images of the model below:
 # %%
diff --git a/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py b/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py
index 446229006..3f5fdaba4 100644
--- a/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py
+++ b/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py
@@ -289,7 +289,7 @@ def encode(self, imgs):
     def _get_likelihood(self, imgs, return_ll=False):
         """Given a batch of images, return the likelihood of those.
-        If return_ll is True, this function returns the log likelihood of the input. Otherwise, the ouptut metric is
+        If return_ll is True, this function returns the log likelihood of the input. Otherwise, the output metric is
         bits per dimension (scaled negative log likelihood)
         """
         z, ldj = self.encode(imgs)
@@ -352,7 +352,7 @@ def test_step(self, batch, batch_idx):
 # %% [markdown]
 # The `test_step` function differs from the training and validation step in that it makes use of importance sampling.
-# We will discuss the motiviation and details behind this after
+# We will discuss the motivation and details behind this after
 # understanding how flows model discrete images in continuous space.
 # %% [markdown]
@@ -975,7 +975,7 @@ def train_flow(flow, model_name="MNISTFlow"):
 # One disadvantage of normalizing flows is that they operate on the exact same dimensions as the input.
 # If the input is high-dimensional, so is the latent space, which requires larger computational cost to learn suitable transformations.
 # However, particularly in the image domain, many pixels contain less information in the sense
-# that we could remove them without loosing the semantical information of the image.
+# that we could remove them without losing the semantic information of the image.
 #
 # Based on this intuition, deep normalizing flows on images commonly apply a multi-scale architecture [1].
 # After the first $N$ flow transformations, we split off half of the latent dimensions and directly evaluate them on the prior.
@@ -1208,7 +1208,7 @@ def print_num_params(model):
 )
 # %% [markdown]
-# As we have intially expected, using variational dequantization improves upon standard dequantization in terms of bits per dimension.
+# As we initially expected, using variational dequantization improves upon standard dequantization in terms of bits per dimension.
 # Although the difference with 0.04bpd doesn't seem impressive first, it is a considerably step for generative models
 # (most state-of-the-art models improve upon previous models in a range of 0.02-0.1bpd on CIFAR with three times as high bpd).
 # While it takes longer to evaluate the probability of an image due to the variational dequantization,
@@ -1223,7 +1223,7 @@ def print_num_params(model):
 # We should note that the samples for variational dequantization and standard dequantization are very similar,
 # and hence we visualize here only the ones for variational dequantization and the multi-scale model.
 # However, feel free to also test out the `"simple"` model.
-# The seeds are set to obtain reproducable generations and are not cherry picked.
+# The seeds are set to obtain reproducible generations and are not cherry-picked.
 # %%
 L.seed_everything(44)
diff --git a/course_UvA-DL/10-autoregressive-image-modeling/Autoregressive_Image_Modeling.py b/course_UvA-DL/10-autoregressive-image-modeling/Autoregressive_Image_Modeling.py
index 5adb92868..c36660ade 100644
--- a/course_UvA-DL/10-autoregressive-image-modeling/Autoregressive_Image_Modeling.py
+++ b/course_UvA-DL/10-autoregressive-image-modeling/Autoregressive_Image_Modeling.py
@@ -69,7 +69,7 @@
 L.seed_everything(42)
 # Ensure that all operations are deterministic on GPU (if used) for reproducibility
-torch.backends.cudnn.determinstic = True
+torch.backends.cudnn.deterministic = True
 torch.backends.cudnn.benchmark = False
 # Fetching the device that will be used throughout this notebook
@@ -504,7 +504,7 @@ def forward(self, v_stack, h_stack):
 #
 # Using the gated convolutions, we can now build our PixelCNN model.
 # The architecture consists of multiple stacked GatedMaskedConv blocks, where we add an additional dilation factor to a few convolutions.
-# This is used to increase the receptive field of the model and allows to take a larger context into accout during generation.
+# This is used to increase the receptive field of the model and allows it to take a larger context into account during generation.
 # As a reminder, dilation on a convolution works looks as follows
 # (figure credit - [Vincent Dumoulin and Francesco Visin](https://arxiv.org/pdf/1603.07285.pdf)):
 #
diff --git a/course_UvA-DL/11-vision-transformer/Vision_Transformer.py b/course_UvA-DL/11-vision-transformer/Vision_Transformer.py
index a7c419ad5..e498b185c 100644
--- a/course_UvA-DL/11-vision-transformer/Vision_Transformer.py
+++ b/course_UvA-DL/11-vision-transformer/Vision_Transformer.py
@@ -398,7 +398,7 @@ def train_model(**kwargs):
         # Automatically loads the model with the saved hyperparameters
         model = ViT.load_from_checkpoint(pretrained_filename)
     else:
-        L.seed_everything(42)  # To be reproducable
+        L.seed_everything(42)  # To be reproducible
         model = ViT(**kwargs)
         trainer.fit(model, train_loader, val_loader)
         # Load best checkpoint after training
@@ -503,7 +503,7 @@ def train_model(**kwargs):
 # In this tutorial, we have implemented our own Vision Transformer from scratch and applied it on the task of image classification.
 # Vision Transformers work by splitting an image into a sequence of smaller patches, use those as input to a standard Transformer encoder.
 # While Vision Transformers achieved outstanding results on large-scale image recognition benchmarks such as ImageNet, they considerably underperform when being trained from scratch on small-scale datasets like CIFAR10.
-# The reason is that in contrast to CNNs, Transformers do not have the inductive biases of translation invariance and the feature hierachy (i.e. larger patterns consist of many smaller patterns).
+# The reason is that in contrast to CNNs, Transformers do not have the inductive biases of translation invariance and the feature hierarchy (i.e. larger patterns consist of many smaller patterns).
 # However, these aspects can be learned when enough data is provided, or the model has been pre-trained on other large-scale tasks.
 # Considering that Vision Transformers have just been proposed end of 2020, there is likely a lot more to come on Transformers for Computer Vision.
 #
diff --git a/course_UvA-DL/12-meta-learning/Meta_Learning.py b/course_UvA-DL/12-meta-learning/Meta_Learning.py
index bf87dd801..0e05b4433 100644
--- a/course_UvA-DL/12-meta-learning/Meta_Learning.py
+++ b/course_UvA-DL/12-meta-learning/Meta_Learning.py
@@ -248,7 +248,7 @@ def dataset_from_labels(imgs, targets, class_set, **kwargs):
 #
 # This subsection summarizes the code that is needed to create such training batches.
 # In PyTorch, we can specify the data sampling procedure by so-called `Sampler` ([documentation](https://pytorch.org/docs/stable/data.html#data-loading-order-and-sampler)).
-# Samplers are iteratable objects that return indices in the order in which the data elements should be sampled.
+# Samplers are iterable objects that return indices in the order in which the data elements should be sampled.
 # In our previous notebooks, we usually used the option `shuffle=True` in the `data.DataLoader` objects which creates a sampler returning the data indices in a random order.
 # Here, we focus on samplers that return batches of indices that correspond to support and query set batches.
 # Below, we implement such a sampler.
@@ -575,7 +575,7 @@ def train_model(model_class, train_loader, val_loader, **kwargs):
         # Automatically loads the model with the saved hyperparameters
         model = model_class.load_from_checkpoint(pretrained_filename)
     else:
-        L.seed_everything(42)  # To be reproducable
+        L.seed_everything(42)  # To be reproducible
         model = model_class(**kwargs)
         trainer.fit(model, train_loader, val_loader)
         model = model_class.load_from_checkpoint(
@@ -777,7 +777,7 @@ def plot_few_shot(acc_dict, name, color=None, ax=None):
 # %% [markdown]
 # To obtain gradients for the initial parameters $\theta$ from the optimized model $f_{\theta_i'}$, we actually need second-order gradients, i.e. gradients of gradients, as the support set gradients depend on $\theta$ as well.
-# This makes MAML computationally expensive, especially when using mulitple inner loop steps.
+# This makes MAML computationally expensive, especially when using multiple inner loop steps.
 # A simpler, yet almost equally well performing alternative is First-Order MAML (FOMAML) which only uses first-order gradients.
 # This means that the second-order gradients are ignored, and we can calculate the outer loop gradients (line 10 in algorithm 2) simply by calculating the gradients with respect to $\theta_i'$, and use those as update to $\theta$.
 # Hence, the new update rule becomes:
@@ -1049,7 +1049,7 @@ def collate_fn(item_list):
 # We use the same feature space size as for ProtoNet, but can use a higher learning rate since the outer loop gradients are accumulated over 16 batches.
 # The inner loop learning rate is set to 0.1, which is much higher than the outer loop lr because we use SGD in the inner loop instead of Adam.
 # Commonly, the learning rate for the output layer is higher than the base model is the base model is very deep or pre-trained.
-# However, for our setup, we observed no noticable impact of using a different learning rate than the base model.
+# However, for our setup, we observed no noticeable impact of using a different learning rate than the base model.
 # The number of inner loop updates is another crucial hyperparmaeter, and depends on the similarity of our training tasks.
 # Since all tasks are on images from the same dataset, we notice that a single inner loop update achieves similar performance as 3 or 5 while training considerably faster.
 # However, especially in RL and NLP, larger number of inner loop steps are often needed.
diff --git a/course_UvA-DL/13-contrastive-learning/SimCLR.py b/course_UvA-DL/13-contrastive-learning/SimCLR.py
index 1bc97bb79..2b2e966e0 100644
--- a/course_UvA-DL/13-contrastive-learning/SimCLR.py
+++ b/course_UvA-DL/13-contrastive-learning/SimCLR.py
@@ -70,7 +70,7 @@
 L.seed_everything(42)
 # Ensure that all operations are deterministic on GPU (if used) for reproducibility
-torch.backends.cudnn.determinstic = True
+torch.backends.cudnn.deterministic = True
 torch.backends.cudnn.benchmark = False
 device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
@@ -390,7 +390,7 @@ def train_simclr(batch_size, max_epochs=500, **kwargs):
         pin_memory=True,
         num_workers=NUM_WORKERS,
     )
-    L.seed_everything(42)  # To be reproducable
+    L.seed_everything(42)  # To be reproducible
     model = SimCLR(max_epochs=max_epochs, **kwargs)
     trainer.fit(model, train_loader, val_loader)
     # Load best checkpoint after training
@@ -566,7 +566,7 @@ def train_logreg(batch_size, train_feats_data, test_feats_data, model_suffix, ma
         print(f"Found pretrained model at {pretrained_filename}, loading...")
         model = LogisticRegression.load_from_checkpoint(pretrained_filename)
     else:
-        L.seed_everything(42)  # To be reproducable
+        L.seed_everything(42)  # To be reproducible
         model = LogisticRegression(**kwargs)
         trainer.fit(model, train_loader, test_loader)
         model = LogisticRegression.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
@@ -582,7 +582,7 @@ def train_logreg(batch_size, train_feats_data, test_feats_data, model_suffix, ma
 # %% [markdown]
 # Despite the training dataset of STL10 already only having 500 labeled images per class, we will perform experiments with even smaller datasets.
 # Specifically, we train a Logistic Regression model for datasets with only 10, 20, 50, 100, 200, and all 500 examples per class.
-# This gives us an intuition on how well the representations learned by contrastive learning can be transfered to a image recognition task like this classification.
+# This gives us an intuition on how well the representations learned by contrastive learning can be transferred to an image recognition task like this classification.
 # First, let's define a function to create the intended sub-datasets from the full training set:
@@ -762,7 +762,7 @@ def train_resnet(batch_size, max_epochs=100, **kwargs):
         print("Found pretrained model at %s, loading..." % pretrained_filename)
         model = ResNet.load_from_checkpoint(pretrained_filename)
     else:
-        L.seed_everything(42)  # To be reproducable
+        L.seed_everything(42)  # To be reproducible
         model = ResNet(**kwargs)
         trainer.fit(model, train_loader, test_loader)
         model = ResNet.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
diff --git a/flash_tutorials/tabular_classification/tabular_classification.py b/flash_tutorials/tabular_classification/tabular_classification.py
index a2089ca97..ab19eceb8 100644
--- a/flash_tutorials/tabular_classification/tabular_classification.py
+++ b/flash_tutorials/tabular_classification/tabular_classification.py
@@ -75,7 +75,7 @@
 # # Predicting
 # ## Load the model from a checkpoint
 #
-# `TabularClassifier.load_from_checkpoint` supports both url or local_path to a checkpoint. If provided with an url, the checkpoint will first be downloaded and laoded to re-create the model.
+# `TabularClassifier.load_from_checkpoint` supports either a URL or a local path to a checkpoint. If provided with a URL, the checkpoint will first be downloaded and loaded to re-create the model.
 # %%
 model = TabularClassifier.load_from_checkpoint(
diff --git a/lightning_examples/augmentation_kornia/augmentation.py b/lightning_examples/augmentation_kornia/augmentation.py
index 46ab9690f..8320b03ad 100644
--- a/lightning_examples/augmentation_kornia/augmentation.py
+++ b/lightning_examples/augmentation_kornia/augmentation.py
@@ -52,7 +52,7 @@ def __init__(self, apply_color_jitter: bool = False) -> None:
         self.jitter = ColorJitter(0.5, 0.5, 0.5, 0.5)
-    @torch.no_grad()  # disable gradients for effiency
+    @torch.no_grad()  # disable gradients for efficiency
     def forward(self, x: Tensor) -> Tensor:
         x_out = self.transforms(x)  # BxCxHxW
         if self._apply_color_jitter:
@@ -76,7 +76,7 @@ def forward(self, x: Tensor) -> Tensor:
 class Preprocess(nn.Module):
     """Module to perform pre-process using Kornia on torch tensors."""
-    @torch.no_grad()  # disable gradients for effiency
+    @torch.no_grad()  # disable gradients for efficiency
     def forward(self, x) -> Tensor:
         x_tmp: np.ndarray = np.array(x)  # HxWxC
         x_out: Tensor = image_to_tensor(x_tmp, keepdim=True)  # CxHxW
diff --git a/lightning_examples/barlow-twins/barlow_twins.py b/lightning_examples/barlow-twins/barlow_twins.py
index 6f34a67cc..ec45d1e33 100644
--- a/lightning_examples/barlow-twins/barlow_twins.py
+++ b/lightning_examples/barlow-twins/barlow_twins.py
@@ -315,7 +315,7 @@ def configure_optimizers(self):
 # %% [markdown]
 # ### Evaluation
 #
-# We define a callback which appends a linear layer on top of the encoder and trains the classification evaluation head in an online manner. We make sure not to backpropagate the gradients back to the encoder while tuning the linear layer. This technique was used in SimCLR as well and they showed that the final downstream classification peformance is pretty much similar to the results on online finetuning as the training progresses.
+# We define a callback which appends a linear layer on top of the encoder and trains the classification evaluation head in an online manner. We make sure not to backpropagate the gradients back to the encoder while tuning the linear layer. This technique was used in SimCLR as well and they showed that the final downstream classification performance is pretty much similar to the results on online finetuning as the training progresses.
 # %%
diff --git a/lightning_examples/reinforce-learning-DQN/dqn.py b/lightning_examples/reinforce-learning-DQN/dqn.py
index a247ce605..fbaec9777 100644
--- a/lightning_examples/reinforce-learning-DQN/dqn.py
+++ b/lightning_examples/reinforce-learning-DQN/dqn.py
@@ -113,7 +113,7 @@ def __iter__(self) -> Iterator[Tuple]:
 # %%
 class Agent:
     def __init__(self, env: gym.Env, replay_buffer: ReplayBuffer) -> None:
-        """Base Agent class handeling the interaction with the environment.
+        """Base Agent class handling the interaction with the environment.
         Args:
             env: training environment
@@ -293,7 +293,7 @@ def get_epsilon(self, start: int, end: int, frames: int) -> float:
     def training_step(self, batch: Tuple[Tensor, Tensor], nb_batch) -> OrderedDict:
         """Carries out a single step through the environment to update the replay buffer. Then calculates loss based on
-        the minibatch recieved.
+        the minibatch received.
         Args:
             batch: current mini batch of replay data