From 20fbff140a17835309f2e26e88e2fff667448c87 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sun, 4 Jul 2021 23:35:50 +0530 Subject: [PATCH] Adding an example on Compact Convolutional Transformers (#538) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * adding cct example * added more notes * added colab link * added a note on NLP tasks * copyedits * feedback round I * adding rest of the files Co-authored-by: François Chollet --- examples/vision/cct.py | 385 ++++++++++++++++++ examples/vision/img/cct/cct_22_0.png | Bin 0 -> 17920 bytes examples/vision/ipynb/cct.ipynb | 582 +++++++++++++++++++++++++++ examples/vision/md/cct.md | 495 +++++++++++++++++++++++ 4 files changed, 1462 insertions(+) create mode 100644 examples/vision/cct.py create mode 100644 examples/vision/img/cct/cct_22_0.png create mode 100644 examples/vision/ipynb/cct.ipynb create mode 100644 examples/vision/md/cct.md diff --git a/examples/vision/cct.py b/examples/vision/cct.py new file mode 100644 index 0000000000..c269a1349b --- /dev/null +++ b/examples/vision/cct.py @@ -0,0 +1,385 @@ +""" +Title: Compact Convolutional Transformers +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/06/30 +Last modified: 2021/06/30 +Description: Compact Convolutional Transformers for efficient image classification. +""" +""" +As discussed in the [Vision Transformers (ViT)](https://arxiv.org/abs/2010.11929) paper, +a Transformer-based architecture for vision typically requires a larger dataset than +usual, as well as a longer pre-training schedule. [ImageNet-1k](http://imagenet.org/) +(which has about a million images) is considered to fall under the medium-sized data regime with +respect to ViTs. This is primarily because, unlike CNNs, ViTs (or a typical +Transformer-based architecture) do not have well-informed inductive biases (such as +convolutions for processing images). This begs the question: can't we combine the +benefits of convolution and the benefits of Transformers +in a single network architecture? These benefits include parameter-efficiency, and +self-attention to process long-range and global dependencies (interactions between +different regions in an image). + +In [Escaping the Big Data Paradigm with Compact Transformers](https://arxiv.org/abs/2104.05704), +Hassani et al. present an approach for doing exactly this. They proposed the +**Compact Convolutional Transformer** (CCT) architecture. In this example, we will work on an +implementation of CCT and we will see how well it performs on the CIFAR-10 dataset. + +If you are unfamiliar with the concept of self-attention or Transformers, you can read +[this chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-11/r-3/312) +from François Chollet's book *Deep Learning with Python*. This example uses +code snippets from another example, +[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/). 
+ +This example requires TensorFlow 2.5 or higher, as well as TensorFlow Addons, which can +be installed using the following command: +""" + +"""shell +pip install -U -q tensorflow-addons +""" + +""" +## Imports +""" + +from tensorflow.keras import layers +from tensorflow import keras + +import matplotlib.pyplot as plt +import tensorflow_addons as tfa +import tensorflow as tf +import numpy as np + +""" +## Hyperparameters and constants +""" + +positional_emb = True +conv_layers = 2 +projection_dim = 128 + +num_heads = 2 +transformer_units = [ + projection_dim, + projection_dim, +] +transformer_layers = 2 +stochastic_depth_rate = 0.1 + +learning_rate = 0.001 +weight_decay = 0.0001 +batch_size = 128 +num_epochs = 30 +image_size = 32 + +""" +## Load CIFAR-10 dataset +""" + +num_classes = 10 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() + +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) + +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + +""" +## The CCT tokenizer + +The first recipe introduced by the CCT authors is the tokenizer for processing the +images. In a standard ViT, images are organized into uniform *non-overlapping* patches. +This eliminates the boundary-level information present in between different patches. This +is important for a neural network to effectively exploit the locality information. The +figure below presents an illustration of how images are organized into patches. + +![](https://i.imgur.com/IkBK9oY.png) + +We already know that convolutions are quite good at exploiting locality information. So, +based on this, the authors introduce an all-convolution mini-network to produce image +patches. +""" + + +class CCTTokenizer(layers.Layer): + def __init__( + self, + kernel_size=3, + stride=1, + padding=1, + pooling_kernel_size=3, + pooling_stride=2, + num_conv_layers=conv_layers, + num_output_channels=[64, 128], + positional_emb=positional_emb, + **kwargs, + ): + super(CCTTokenizer, self).__init__(**kwargs) + + # This is our tokenizer. + self.conv_model = keras.Sequential() + for i in range(num_conv_layers): + self.conv_model.add( + layers.Conv2D( + num_output_channels[i], + kernel_size, + stride, + padding="valid", + use_bias=False, + activation="relu", + kernel_initializer="he_normal", + ) + ) + self.conv_model.add(layers.ZeroPadding2D(padding)) + self.conv_model.add( + layers.MaxPool2D(pooling_kernel_size, pooling_stride, "same") + ) + + self.positional_emb = positional_emb + + def call(self, images): + outputs = self.conv_model(images) + # After passing the images through our mini-network the spatial dimensions + # are flattened to form sequences. + reshaped = tf.reshape( + outputs, + (-1, tf.shape(outputs)[1] * tf.shape(outputs)[2], tf.shape(outputs)[-1]), + ) + return reshaped + + def positional_embedding(self, image_size): + # Positional embeddings are optional in CCT. Here, we calculate + # the number of sequences and initialize an `Embedding` layer to + # compute the positional embeddings later. 
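+        # With the default tokenizer settings and 32x32 inputs, the two stride-2
+        # poolings leave an 8x8 feature map, so the dummy forward pass below yields
+        # a sequence length of 64 and a projection dimension of 128.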
+ if self.positional_emb: + dummy_inputs = tf.ones((1, image_size, image_size, 3)) + dummy_outputs = self.call(dummy_inputs) + sequence_length = tf.shape(dummy_outputs)[1] + projection_dim = tf.shape(dummy_outputs)[-1] + + embed_layer = layers.Embedding( + input_dim=sequence_length, output_dim=projection_dim + ) + return embed_layer, sequence_length + else: + return None + + +""" +## Stochastic depth for regularization + +[Stochastic depth](https://arxiv.org/abs/1603.09382) is a regularization technique that +randomly drops a set of layers. During inference, the layers are kept as they are. It is +very much similar to [Dropout](https://jmlr.org/papers/v15/srivastava14a.html) but only +that it operates on a block os layers rather than individual nodes present inside a +layer. In CCT, stochastic depth is used just before the residual blocks of a Transformers +encoder. +""" + +# Referred from: github.com:rwightman/pytorch-image-models. +class StochasticDepth(layers.Layer): + def __init__(self, drop_prop, **kwargs): + super(StochasticDepth, self).__init__(**kwargs) + self.drop_prob = drop_prop + + def call(self, x, training=None): + if training: + keep_prob = 1 - self.drop_prob + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +""" +## MLP for the Transformers encoder +""" + + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=tf.nn.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## Data augmentation + +In the [original paper](https://arxiv.org/abs/2104.05704), the authors use +[AutoAugment](https://arxiv.org/abs/1805.09501) to induce stronger regularization. For +this example, we will be using the standard geometric augmentations like random cropping +and flipping. +""" + +# Note the rescaling layer. These layers have pre-defined inference behavior. +data_augmentation = keras.Sequential( + [ + layers.experimental.preprocessing.Rescaling(scale=1.0 / 255), + layers.experimental.preprocessing.RandomCrop(image_size, image_size), + layers.experimental.preprocessing.RandomFlip("horizontal"), + ], + name="data_augmentation", +) + +""" +## The final CCT model + +Another recipe introduced in CCT is attention pooling or sequence pooling. In ViT, only +the feature map corresponding to the class token is pooled and is then used for the +subsequent classification task (or any other downstream task). In CCT, outputs from the +Transformers encoder are weighted and then passed on to the final task-specific layer (in +this example, we do classification). +""" + + +def create_cct_model( + image_size=image_size, + input_shape=input_shape, + num_heads=num_heads, + projection_dim=projection_dim, + transformer_units=transformer_units, +): + + inputs = layers.Input(input_shape) + + # Augment data. + augmented = data_augmentation(inputs) + + # Encode patches. + cct_tokenizer = CCTTokenizer() + encoded_patches = cct_tokenizer(augmented) + + # Apply positional embedding. + if positional_emb: + pos_embed, seq_length = cct_tokenizer.positional_embedding(image_size) + positions = tf.range(start=0, limit=seq_length, delta=1) + position_embeddings = pos_embed(positions) + encoded_patches += position_embeddings + + # Calculate Stochastic Depth probabilities. 
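+    # `np.linspace` makes the drop probability grow linearly with depth: the first
+    # Transformer block uses a rate of 0 and the last one uses `stochastic_depth_rate`.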
+ dpr = [x for x in np.linspace(0, stochastic_depth_rate, transformer_layers)] + + # Create multiple layers of the Transformer block. + for i in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-5)(encoded_patches) + + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + + # Skip connection 1. + attention_output = StochasticDepth(dpr[i])(attention_output) + x2 = layers.Add()([attention_output, encoded_patches]) + + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-5)(x2) + + # MLP. + x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + + # Skip connection 2. + x3 = StochasticDepth(dpr[i])(x3) + encoded_patches = layers.Add()([x3, x2]) + + # Apply sequence pooling. + representation = layers.LayerNormalization(epsilon=1e-5)(encoded_patches) + attention_weights = tf.nn.softmax(layers.Dense(1)(representation), axis=1) + weighted_representation = tf.matmul( + attention_weights, representation, transpose_a=True + ) + weighted_representation = tf.squeeze(weighted_representation, -2) + + # Classify outputs. + logits = layers.Dense(num_classes)(weighted_representation) + # Create the Keras model. + model = keras.Model(inputs=inputs, outputs=logits) + return model + + +""" +## Model training and evaluation +""" + + +def run_experiment(model): + optimizer = tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=0.0001) + + model.compile( + optimizer=optimizer, + loss=keras.losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=0.1 + ), + metrics=[ + keras.metrics.CategoricalAccuracy(name="accuracy"), + keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], + ) + + checkpoint_filepath = "/tmp/checkpoint" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_accuracy", + save_best_only=True, + save_weights_only=True, + ) + + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[checkpoint_callback], + ) + + model.load_weights(checkpoint_filepath) + _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + return history + + +cct_model = create_cct_model() +history = run_experiment(cct_model) + +""" +Let's now visualize the training progress of the model. +""" + +plt.plot(history.history["loss"], label="train_loss") +plt.plot(history.history["val_loss"], label="val_loss") +plt.xlabel("Epochs") +plt.ylabel("Loss") +plt.title("Train and Validation Losses Over Epochs", fontsize=14) +plt.legend() +plt.grid() +plt.show() + +""" +The CCT model we just trained has just **0.4 million** parameters, and it gets us to +~78% top-1 accuracy within 30 epochs. The plot above shows no signs of overfitting as +well. This means we can train this network for longers (perhaps with a bit more +regularization) and may obtain even better performance. This performance can further be +improved by additional recipes like cosine decay learning rate schedule, other data augmentation +techniques like [AutoAugment](https://arxiv.org/abs/1805.09501), +[MixUp](https://arxiv.org/abs/1710.09412) or +[Cutmix](https://arxiv.org/abs/1905.04899. The authors also present a number of +experiments to study how the number of convolution blocks, Transformers layers, etc. 
+affect the final performance.
+
+For comparison, a ViT model takes about **4.7 million** parameters and **100
+epochs** of training to reach a top-1 accuracy of 78.22% on the CIFAR-10 dataset. You can
+refer to
+[this notebook](https://colab.research.google.com/gist/sayakpaul/1a80d9f582b044354a1a26c5cb3d69e5/image_classification_with_vision_transformer.ipynb)
+to learn more about the experimental setup.
+
+The authors also demonstrate the performance of Compact Convolutional Transformers on
+NLP tasks, where they report competitive results.
+"""
diff --git a/examples/vision/img/cct/cct_22_0.png b/examples/vision/img/cct/cct_22_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..870a4f91e78c3f804a4c35c4dcf4fc98c3e009fb
GIT binary patch
literal 17920
[base85-encoded PNG data omitted: plot of the training and validation losses over epochs]

diff --git a/examples/vision/ipynb/cct.ipynb b/examples/vision/ipynb/cct.ipynb
new file mode 100644
index 0000000000..bbf0105bd7
--- /dev/null
+++ b/examples/vision/ipynb/cct.ipynb
@@ -0,0 +1,582 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text"
+   },
+   "source": [
+    "# Compact Convolutional Transformers\n",
+    "\n",
+    "**Author:** [Sayak Paul](https://twitter.com/RisingSayak)<br>
\n", + "**Date created:** 2021/06/30
\n", + "**Last modified:** 2021/06/30
\n", + "**Description:** Compact Convolutional Transformers for efficient image classification." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "As discussed in the [Vision Transformers (ViT)](https://arxiv.org/abs/2010.11929) paper,\n", + "a Transformer-based architecture for vision typically requires a larger dataset than\n", + "usual, as well as a longer pre-training schedule. [ImageNet-1k](http://imagenet.org/)\n", + "(which has about a million images) is considered to fall under the medium-sized data regime with\n", + "respect to ViTs. This is primarily because, unlike CNNs, ViTs (or a typical\n", + "Transformer-based architecture) do not have well-informed inductive biases (such as\n", + "convolutions for processing images). This begs the question: can't we combine the\n", + "benefits of convolution and the benefits of Transformers\n", + "in a single network architecture? These benefits include parameter-efficiency, and\n", + "self-attention to process long-range and global dependencies (interactions between\n", + "different regions in an image).\n", + "\n", + "In [Escaping the Big Data Paradigm with Compact Transformers](https://arxiv.org/abs/2104.05704),\n", + "Hassani et al. present an approach for doing exactly this. They proposed the\n", + "**Compact Convolutional Transformer** (CCT) architecture. In this example, we will work on an\n", + "implementation of CCT and we will see how well it performs on the CIFAR-10 dataset.\n", + "\n", + "If you are unfamiliar with the concept of self-attention or Transformers, you can read\n", + "[this chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-11/r-3/312)\n", + "from Fran\u00e7ois Chollet's book *Deep Learning with Python*. 
This example uses\n", + "code snippets from another example,\n", + "[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/).\n", + "\n", + "This example requires TensorFlow 2.5 or higher, as well as TensorFlow Addons, which can\n", + "be installed using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "!pip install -U -q tensorflow-addons" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "from tensorflow.keras import layers\n", + "from tensorflow import keras\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import tensorflow_addons as tfa\n", + "import tensorflow as tf\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Hyperparameters and constants" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "positional_emb = True\n", + "conv_layers = 2\n", + "projection_dim = 128\n", + "\n", + "num_heads = 2\n", + "transformer_units = [\n", + " projection_dim,\n", + " projection_dim,\n", + "]\n", + "transformer_layers = 2\n", + "stochastic_depth_rate = 0.1\n", + "\n", + "learning_rate = 0.001\n", + "weight_decay = 0.0001\n", + "batch_size = 128\n", + "num_epochs = 30\n", + "image_size = 32" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Load CIFAR-10 dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "num_classes = 10\n", + "input_shape = (32, 32, 3)\n", + "\n", + "(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()\n", + "\n", + "y_train = keras.utils.to_categorical(y_train, num_classes)\n", + "y_test = keras.utils.to_categorical(y_test, num_classes)\n", + "\n", + "print(f\"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}\")\n", + "print(f\"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## The CCT tokenizer\n", + "\n", + "The first recipe introduced by the CCT authors is the tokenizer for processing the\n", + "images. In a standard ViT, images are organized into uniform *non-overlapping* patches.\n", + "This eliminates the boundary-level information present in between different patches. This\n", + "is important for a neural network to effectively exploit the locality information. The\n", + "figure below presents an illustration of how images are organized into patches.\n", + "\n", + "![](https://i.imgur.com/IkBK9oY.png)\n", + "\n", + "We already know that convolutions are quite good at exploiting locality information. So,\n", + "based on this, the authors introduce an all-convolution mini-network to produce image\n", + "patches." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "\n", + "class CCTTokenizer(layers.Layer):\n", + " def __init__(\n", + " self,\n", + " kernel_size=3,\n", + " stride=1,\n", + " padding=1,\n", + " pooling_kernel_size=3,\n", + " pooling_stride=2,\n", + " num_conv_layers=conv_layers,\n", + " num_output_channels=[64, 128],\n", + " positional_emb=positional_emb,\n", + " **kwargs,\n", + " ):\n", + " super(CCTTokenizer, self).__init__(**kwargs)\n", + "\n", + " # This is our tokenizer.\n", + " self.conv_model = keras.Sequential()\n", + " for i in range(num_conv_layers):\n", + " self.conv_model.add(\n", + " layers.Conv2D(\n", + " num_output_channels[i],\n", + " kernel_size,\n", + " stride,\n", + " padding=\"valid\",\n", + " use_bias=False,\n", + " activation=\"relu\",\n", + " kernel_initializer=\"he_normal\",\n", + " )\n", + " )\n", + " self.conv_model.add(layers.ZeroPadding2D(padding))\n", + " self.conv_model.add(\n", + " layers.MaxPool2D(pooling_kernel_size, pooling_stride, \"same\")\n", + " )\n", + "\n", + " self.positional_emb = positional_emb\n", + "\n", + " def call(self, images):\n", + " outputs = self.conv_model(images)\n", + " # After passing the images through our mini-network the spatial dimensions\n", + " # are flattened to form sequences.\n", + " reshaped = tf.reshape(\n", + " outputs,\n", + " (-1, tf.shape(outputs)[1] * tf.shape(outputs)[2], tf.shape(outputs)[-1]),\n", + " )\n", + " return reshaped\n", + "\n", + " def positional_embedding(self, image_size):\n", + " # Positional embeddings are optional in CCT. Here, we calculate\n", + " # the number of sequences and initialize an `Embedding` layer to\n", + " # compute the positional embeddings later.\n", + " if self.positional_emb:\n", + " dummy_inputs = tf.ones((1, image_size, image_size, 3))\n", + " dummy_outputs = self.call(dummy_inputs)\n", + " sequence_length = tf.shape(dummy_outputs)[1]\n", + " projection_dim = tf.shape(dummy_outputs)[-1]\n", + "\n", + " embed_layer = layers.Embedding(\n", + " input_dim=sequence_length, output_dim=projection_dim\n", + " )\n", + " return embed_layer, sequence_length\n", + " else:\n", + " return None\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Stochastic depth for regularization\n", + "\n", + "[Stochastic depth](https://arxiv.org/abs/1603.09382) is a regularization technique that\n", + "randomly drops a set of layers. During inference, the layers are kept as they are. It is\n", + "very much similar to [Dropout](https://jmlr.org/papers/v15/srivastava14a.html) but only\n", + "that it operates on a block os layers rather than individual nodes present inside a\n", + "layer. In CCT, stochastic depth is used just before the residual blocks of a Transformers\n", + "encoder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Referred from: github.com:rwightman/pytorch-image-models.\n", + "class StochasticDepth(layers.Layer):\n", + " def __init__(self, drop_prop, **kwargs):\n", + " super(StochasticDepth, self).__init__(**kwargs)\n", + " self.drop_prob = drop_prop\n", + "\n", + " def call(self, x, training=None):\n", + " if training:\n", + " keep_prob = 1 - self.drop_prob\n", + " shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)\n", + " random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)\n", + " random_tensor = tf.floor(random_tensor)\n", + " return (x / keep_prob) * random_tensor\n", + " return x\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## MLP for the Transformers encoder" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "\n", + "def mlp(x, hidden_units, dropout_rate):\n", + " for units in hidden_units:\n", + " x = layers.Dense(units, activation=tf.nn.gelu)(x)\n", + " x = layers.Dropout(dropout_rate)(x)\n", + " return x\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Data augmentation\n", + "\n", + "In the [original paper](https://arxiv.org/abs/2104.05704), the authors use\n", + "[AutoAugment](https://arxiv.org/abs/1805.09501) to induce stronger regularization. For\n", + "this example, we will be using the standard geometric augmentations like random cropping\n", + "and flipping." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Note the rescaling layer. These layers have pre-defined inference behavior.\n", + "data_augmentation = keras.Sequential(\n", + " [\n", + " layers.experimental.preprocessing.Rescaling(scale=1.0 / 255),\n", + " layers.experimental.preprocessing.RandomCrop(image_size, image_size),\n", + " layers.experimental.preprocessing.RandomFlip(\"horizontal\"),\n", + " ],\n", + " name=\"data_augmentation\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## The final CCT model\n", + "\n", + "Another recipe introduced in CCT is attention pooling or sequence pooling. In ViT, only\n", + "the feature map corresponding to the class token is pooled and is then used for the\n", + "subsequent classification task (or any other downstream task). In CCT, outputs from the\n", + "Transformers encoder are weighted and then passed on to the final task-specific layer (in\n", + "this example, we do classification)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "\n", + "def create_cct_model(\n", + " image_size=image_size,\n", + " input_shape=input_shape,\n", + " num_heads=num_heads,\n", + " projection_dim=projection_dim,\n", + " transformer_units=transformer_units,\n", + "):\n", + "\n", + " inputs = layers.Input(input_shape)\n", + "\n", + " # Augment data.\n", + " augmented = data_augmentation(inputs)\n", + "\n", + " # Encode patches.\n", + " cct_tokenizer = CCTTokenizer()\n", + " encoded_patches = cct_tokenizer(augmented)\n", + "\n", + " # Apply positional embedding.\n", + " if positional_emb:\n", + " pos_embed, seq_length = cct_tokenizer.positional_embedding(image_size)\n", + " positions = tf.range(start=0, limit=seq_length, delta=1)\n", + " position_embeddings = pos_embed(positions)\n", + " encoded_patches += position_embeddings\n", + "\n", + " # Calculate Stochastic Depth probabilities.\n", + " dpr = [x for x in np.linspace(0, stochastic_depth_rate, transformer_layers)]\n", + "\n", + " # Create multiple layers of the Transformer block.\n", + " for i in range(transformer_layers):\n", + " # Layer normalization 1.\n", + " x1 = layers.LayerNormalization(epsilon=1e-5)(encoded_patches)\n", + "\n", + " # Create a multi-head attention layer.\n", + " attention_output = layers.MultiHeadAttention(\n", + " num_heads=num_heads, key_dim=projection_dim, dropout=0.1\n", + " )(x1, x1)\n", + "\n", + " # Skip connection 1.\n", + " attention_output = StochasticDepth(dpr[i])(attention_output)\n", + " x2 = layers.Add()([attention_output, encoded_patches])\n", + "\n", + " # Layer normalization 2.\n", + " x3 = layers.LayerNormalization(epsilon=1e-5)(x2)\n", + "\n", + " # MLP.\n", + " x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)\n", + "\n", + " # Skip connection 2.\n", + " x3 = StochasticDepth(dpr[i])(x3)\n", + " encoded_patches = layers.Add()([x3, x2])\n", + "\n", + " # Apply sequence pooling.\n", + " representation = layers.LayerNormalization(epsilon=1e-5)(encoded_patches)\n", + " attention_weights = tf.nn.softmax(layers.Dense(1)(representation), axis=1)\n", + " weighted_representation = tf.matmul(\n", + " attention_weights, representation, transpose_a=True\n", + " )\n", + " weighted_representation = tf.squeeze(weighted_representation, -2)\n", + "\n", + " # Classify outputs.\n", + " logits = layers.Dense(num_classes)(weighted_representation)\n", + " # Create the Keras model.\n", + " model = keras.Model(inputs=inputs, outputs=logits)\n", + " return model\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Model training and evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "\n", + "def run_experiment(model):\n", + " optimizer = tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=0.0001)\n", + "\n", + " model.compile(\n", + " optimizer=optimizer,\n", + " loss=keras.losses.CategoricalCrossentropy(\n", + " from_logits=True, label_smoothing=0.1\n", + " ),\n", + " metrics=[\n", + " keras.metrics.CategoricalAccuracy(name=\"accuracy\"),\n", + " keras.metrics.TopKCategoricalAccuracy(5, name=\"top-5-accuracy\"),\n", + " ],\n", + " )\n", + "\n", + " checkpoint_filepath = \"/tmp/checkpoint\"\n", + " checkpoint_callback = keras.callbacks.ModelCheckpoint(\n", + " checkpoint_filepath,\n", + " monitor=\"val_accuracy\",\n", + " save_best_only=True,\n", + " 
save_weights_only=True,\n", + " )\n", + "\n", + " history = model.fit(\n", + " x=x_train,\n", + " y=y_train,\n", + " batch_size=batch_size,\n", + " epochs=num_epochs,\n", + " validation_split=0.1,\n", + " callbacks=[checkpoint_callback],\n", + " )\n", + "\n", + " model.load_weights(checkpoint_filepath)\n", + " _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test)\n", + " print(f\"Test accuracy: {round(accuracy * 100, 2)}%\")\n", + " print(f\"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%\")\n", + "\n", + " return history\n", + "\n", + "\n", + "cct_model = create_cct_model()\n", + "history = run_experiment(cct_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "Let's now visualize the training progress of the model." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "plt.plot(history.history[\"loss\"], label=\"train_loss\")\n", + "plt.plot(history.history[\"val_loss\"], label=\"val_loss\")\n", + "plt.xlabel(\"Epochs\")\n", + "plt.ylabel(\"Loss\")\n", + "plt.title(\"Train and Validation Losses Over Epochs\", fontsize=14)\n", + "plt.legend()\n", + "plt.grid()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "The CCT model we just trained has just **0.4 million** parameters, and it gets us to\n", + "~78% top-1 accuracy within 30 epochs. The plot above shows no signs of overfitting as\n", + "well. This means we can train this network for longers (perhaps with a bit more\n", + "regularization) and may obtain even better performance. This performance can further be\n", + "improved by additional recipes like cosine decay learning rate schedule, other data augmentation\n", + "techniques like [AutoAugment](https://arxiv.org/abs/1805.09501),\n", + "[MixUp](https://arxiv.org/abs/1710.09412) or\n", + "[Cutmix](https://arxiv.org/abs/1905.04899. The authors also present a number of\n", + "experiments to study how the number of convolution blocks, Transformers layers, etc.\n", + "affect the final performance.\n", + "\n", + "For a comparison, a ViT model takes about **4.7 million** parameters and **100\n", + "epochs** of training to reach a top-1 accuracy of 78.22% on the CIFAR-10 dataset. You can\n", + "refer to\n", + "[this notebook](https://colab.research.google.com/gist/sayakpaul/1a80d9f582b044354a1a26c5cb3d69e5/image_classification_with_vision_transformer.ipynb)\n", + "to know about the experimental setup.\n", + "\n", + "The authors also demonstrate the performance of Compact Convolutional Transformers on\n", + "NLP tasks and they report competitive results there." 
+ ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "cct", + "private_outputs": false, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/vision/md/cct.md b/examples/vision/md/cct.md new file mode 100644 index 0000000000..6e84256312 --- /dev/null +++ b/examples/vision/md/cct.md @@ -0,0 +1,495 @@ + +# Compact Convolutional Transformers + +**Author:** [Sayak Paul](https://twitter.com/RisingSayak)
+**Date created:** 2021/06/30
+**Last modified:** 2021/06/30
+ + + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/vision/ipynb/cct.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/vision/cct.py) + + +**Description:** Compact Convolutional Transformers for efficient image classification. + +As discussed in the [Vision Transformers (ViT)](https://arxiv.org/abs/2010.11929) paper, +a Transformer-based architecture for vision typically requires a larger dataset than +usual, as well as a longer pre-training schedule. [ImageNet-1k](http://imagenet.org/) +(which has about a million images) is considered to fall under the medium-sized data regime with +respect to ViTs. This is primarily because, unlike CNNs, ViTs (or a typical +Transformer-based architecture) do not have well-informed inductive biases (such as +convolutions for processing images). This begs the question: can't we combine the +benefits of convolution and the benefits of Transformers +in a single network architecture? These benefits include parameter-efficiency, and +self-attention to process long-range and global dependencies (interactions between +different regions in an image). + +In [Escaping the Big Data Paradigm with Compact Transformers](https://arxiv.org/abs/2104.05704), +Hassani et al. present an approach for doing exactly this. They proposed the +**Compact Convolutional Transformer** (CCT) architecture. In this example, we will work on an +implementation of CCT and we will see how well it performs on the CIFAR-10 dataset. + +If you are unfamiliar with the concept of self-attention or Transformers, you can read +[this chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-11/r-3/312) +from François Chollet's book *Deep Learning with Python*. This example uses +code snippets from another example, +[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/). + +This example requires TensorFlow 2.5 or higher, as well as TensorFlow Addons, which can +be installed using the following command: + + +```python +!pip install -U -q tensorflow-addons +``` + +
+```
+ |████████████████████████████████| 686kB 5.3MB/s
+
+```
+</div>
+--- +## Imports + + +```python +from tensorflow.keras import layers +from tensorflow import keras + +import matplotlib.pyplot as plt +import tensorflow_addons as tfa +import tensorflow as tf +import numpy as np +``` + +--- +## Hyperparameters and constants + + +```python +positional_emb = True +conv_layers = 2 +projection_dim = 128 + +num_heads = 2 +transformer_units = [ + projection_dim, + projection_dim, +] +transformer_layers = 2 +stochastic_depth_rate = 0.1 + +learning_rate = 0.001 +weight_decay = 0.0001 +batch_size = 128 +num_epochs = 30 +image_size = 32 +``` + +--- +## Load CIFAR-10 dataset + + +```python +num_classes = 10 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() + +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) + +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") +``` + +
+``` +Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz +170500096/170498071 [==============================] - 11s 0us/step +x_train shape: (50000, 32, 32, 3) - y_train shape: (50000, 10) +x_test shape: (10000, 32, 32, 3) - y_test shape: (10000, 10) + +``` +
+--- +## The CCT tokenizer + +The first recipe introduced by the CCT authors is the tokenizer for processing the +images. In a standard ViT, images are organized into uniform *non-overlapping* patches. +This eliminates the boundary-level information present in between different patches. This +is important for a neural network to effectively exploit the locality information. The +figure below presents an illustration of how images are organized into patches. + +![](https://i.imgur.com/IkBK9oY.png) + +We already know that convolutions are quite good at exploiting locality information. So, +based on this, the authors introduce an all-convolution mini-network to produce image +patches. + + +```python + +class CCTTokenizer(layers.Layer): + def __init__( + self, + kernel_size=3, + stride=1, + padding=1, + pooling_kernel_size=3, + pooling_stride=2, + num_conv_layers=conv_layers, + num_output_channels=[64, 128], + positional_emb=positional_emb, + **kwargs, + ): + super(CCTTokenizer, self).__init__(**kwargs) + + # This is our tokenizer. + self.conv_model = keras.Sequential() + for i in range(num_conv_layers): + self.conv_model.add( + layers.Conv2D( + num_output_channels[i], + kernel_size, + stride, + padding="valid", + use_bias=False, + activation="relu", + kernel_initializer="he_normal", + ) + ) + self.conv_model.add(layers.ZeroPadding2D(padding)) + self.conv_model.add( + layers.MaxPool2D(pooling_kernel_size, pooling_stride, "same") + ) + + self.positional_emb = positional_emb + + def call(self, images): + outputs = self.conv_model(images) + # After passing the images through our mini-network the spatial dimensions + # are flattened to form sequences. + reshaped = tf.reshape( + outputs, + (-1, tf.shape(outputs)[1] * tf.shape(outputs)[2], tf.shape(outputs)[-1]), + ) + return reshaped + + def positional_embedding(self, image_size): + # Positional embeddings are optional in CCT. Here, we calculate + # the number of sequences and initialize an `Embedding` layer to + # compute the positional embeddings later. + if self.positional_emb: + dummy_inputs = tf.ones((1, image_size, image_size, 3)) + dummy_outputs = self.call(dummy_inputs) + sequence_length = tf.shape(dummy_outputs)[1] + projection_dim = tf.shape(dummy_outputs)[-1] + + embed_layer = layers.Embedding( + input_dim=sequence_length, output_dim=projection_dim + ) + return embed_layer, sequence_length + else: + return None + +``` + +--- +## Stochastic depth for regularization + +[Stochastic depth](https://arxiv.org/abs/1603.09382) is a regularization technique that +randomly drops a set of layers. During inference, the layers are kept as they are. It is +very much similar to [Dropout](https://jmlr.org/papers/v15/srivastava14a.html) but only +that it operates on a block os layers rather than individual nodes present inside a +layer. In CCT, stochastic depth is used just before the residual blocks of a Transformers +encoder. + + +```python +# Referred from: github.com:rwightman/pytorch-image-models. 
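+# During training, each sample either keeps the entire residual branch (scaled by
+# 1 / keep_prob so its expected value is unchanged) or drops it altogether; at
+# inference time (`training=False`) the branch is always kept.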
+class StochasticDepth(layers.Layer): + def __init__(self, drop_prop, **kwargs): + super(StochasticDepth, self).__init__(**kwargs) + self.drop_prob = drop_prop + + def call(self, x, training=None): + if training: + keep_prob = 1 - self.drop_prob + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + +``` + +--- +## MLP for the Transformers encoder + + +```python + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=tf.nn.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + +``` + +--- +## Data augmentation + +In the [original paper](https://arxiv.org/abs/2104.05704), the authors use +[AutoAugment](https://arxiv.org/abs/1805.09501) to induce stronger regularization. For +this example, we will be using the standard geometric augmentations like random cropping +and flipping. + + +```python +# Note the rescaling layer. These layers have pre-defined inference behavior. +data_augmentation = keras.Sequential( + [ + layers.experimental.preprocessing.Rescaling(scale=1.0 / 255), + layers.experimental.preprocessing.RandomCrop(image_size, image_size), + layers.experimental.preprocessing.RandomFlip("horizontal"), + ], + name="data_augmentation", +) +``` + +--- +## The final CCT model + +Another recipe introduced in CCT is attention pooling or sequence pooling. In ViT, only +the feature map corresponding to the class token is pooled and is then used for the +subsequent classification task (or any other downstream task). In CCT, outputs from the +Transformers encoder are weighted and then passed on to the final task-specific layer (in +this example, we do classification). + + +```python + +def create_cct_model( + image_size=image_size, + input_shape=input_shape, + num_heads=num_heads, + projection_dim=projection_dim, + transformer_units=transformer_units, +): + + inputs = layers.Input(input_shape) + + # Augment data. + augmented = data_augmentation(inputs) + + # Encode patches. + cct_tokenizer = CCTTokenizer() + encoded_patches = cct_tokenizer(augmented) + + # Apply positional embedding. + if positional_emb: + pos_embed, seq_length = cct_tokenizer.positional_embedding(image_size) + positions = tf.range(start=0, limit=seq_length, delta=1) + position_embeddings = pos_embed(positions) + encoded_patches += position_embeddings + + # Calculate Stochastic Depth probabilities. + dpr = [x for x in np.linspace(0, stochastic_depth_rate, transformer_layers)] + + # Create multiple layers of the Transformer block. + for i in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-5)(encoded_patches) + + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + + # Skip connection 1. + attention_output = StochasticDepth(dpr[i])(attention_output) + x2 = layers.Add()([attention_output, encoded_patches]) + + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-5)(x2) + + # MLP. + x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + + # Skip connection 2. + x3 = StochasticDepth(dpr[i])(x3) + encoded_patches = layers.Add()([x3, x2]) + + # Apply sequence pooling. 
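+    # A `Dense(1)` layer scores every token, the softmax over the sequence axis turns
+    # the scores into attention weights, and the weighted sum below collapses the whole
+    # sequence into a single `projection_dim`-sized vector per image.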
+ representation = layers.LayerNormalization(epsilon=1e-5)(encoded_patches) + attention_weights = tf.nn.softmax(layers.Dense(1)(representation), axis=1) + weighted_representation = tf.matmul( + attention_weights, representation, transpose_a=True + ) + weighted_representation = tf.squeeze(weighted_representation, -2) + + # Classify outputs. + logits = layers.Dense(num_classes)(weighted_representation) + # Create the Keras model. + model = keras.Model(inputs=inputs, outputs=logits) + return model + +``` + +--- +## Model training and evaluation + + +```python + +def run_experiment(model): + optimizer = tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=0.0001) + + model.compile( + optimizer=optimizer, + loss=keras.losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=0.1 + ), + metrics=[ + keras.metrics.CategoricalAccuracy(name="accuracy"), + keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], + ) + + checkpoint_filepath = "/tmp/checkpoint" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_accuracy", + save_best_only=True, + save_weights_only=True, + ) + + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[checkpoint_callback], + ) + + model.load_weights(checkpoint_filepath) + _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + return history + + +cct_model = create_cct_model() +history = run_experiment(cct_model) +``` + +
+``` +Epoch 1/30 +352/352 [==============================] - 10s 17ms/step - loss: 1.9019 - accuracy: 0.3357 - top-5-accuracy: 0.8326 - val_loss: 1.6537 - val_accuracy: 0.4596 - val_top-5-accuracy: 0.9206 +Epoch 2/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.5560 - accuracy: 0.5058 - top-5-accuracy: 0.9341 - val_loss: 1.4756 - val_accuracy: 0.5466 - val_top-5-accuracy: 0.9462 +Epoch 3/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.4379 - accuracy: 0.5646 - top-5-accuracy: 0.9527 - val_loss: 1.3775 - val_accuracy: 0.6016 - val_top-5-accuracy: 0.9622 +Epoch 4/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.3568 - accuracy: 0.6067 - top-5-accuracy: 0.9611 - val_loss: 1.3125 - val_accuracy: 0.6288 - val_top-5-accuracy: 0.9658 +Epoch 5/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.2905 - accuracy: 0.6386 - top-5-accuracy: 0.9668 - val_loss: 1.2665 - val_accuracy: 0.6506 - val_top-5-accuracy: 0.9712 +Epoch 6/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.2438 - accuracy: 0.6612 - top-5-accuracy: 0.9710 - val_loss: 1.2220 - val_accuracy: 0.6740 - val_top-5-accuracy: 0.9728 +Epoch 7/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.2150 - accuracy: 0.6753 - top-5-accuracy: 0.9743 - val_loss: 1.2013 - val_accuracy: 0.6802 - val_top-5-accuracy: 0.9772 +Epoch 8/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.1807 - accuracy: 0.6922 - top-5-accuracy: 0.9762 - val_loss: 1.2122 - val_accuracy: 0.6808 - val_top-5-accuracy: 0.9710 +Epoch 9/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.1464 - accuracy: 0.7075 - top-5-accuracy: 0.9792 - val_loss: 1.1697 - val_accuracy: 0.6974 - val_top-5-accuracy: 0.9798 +Epoch 10/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.1294 - accuracy: 0.7148 - top-5-accuracy: 0.9800 - val_loss: 1.1683 - val_accuracy: 0.6992 - val_top-5-accuracy: 0.9750 +Epoch 11/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.1030 - accuracy: 0.7258 - top-5-accuracy: 0.9818 - val_loss: 1.1785 - val_accuracy: 0.6946 - val_top-5-accuracy: 0.9770 +Epoch 12/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.0928 - accuracy: 0.7315 - top-5-accuracy: 0.9827 - val_loss: 1.0762 - val_accuracy: 0.7460 - val_top-5-accuracy: 0.9828 +Epoch 13/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.0739 - accuracy: 0.7436 - top-5-accuracy: 0.9837 - val_loss: 1.1078 - val_accuracy: 0.7296 - val_top-5-accuracy: 0.9844 +Epoch 14/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.0577 - accuracy: 0.7509 - top-5-accuracy: 0.9843 - val_loss: 1.0919 - val_accuracy: 0.7384 - val_top-5-accuracy: 0.9814 +Epoch 15/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.0436 - accuracy: 0.7570 - top-5-accuracy: 0.9849 - val_loss: 1.1271 - val_accuracy: 0.7206 - val_top-5-accuracy: 0.9804 +Epoch 16/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.0245 - accuracy: 0.7651 - top-5-accuracy: 0.9855 - val_loss: 1.0777 - val_accuracy: 0.7452 - val_top-5-accuracy: 0.9826 +Epoch 17/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.0231 - accuracy: 0.7653 - top-5-accuracy: 0.9860 - val_loss: 1.0474 - val_accuracy: 0.7608 - val_top-5-accuracy: 0.9868 +Epoch 18/30 +352/352 [==============================] - 5s 15ms/step - loss: 1.0091 - accuracy: 0.7713 - top-5-accuracy: 0.9876 - val_loss: 1.0785 - 
val_accuracy: 0.7468 - val_top-5-accuracy: 0.9808 +Epoch 19/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9959 - accuracy: 0.7800 - top-5-accuracy: 0.9880 - val_loss: 1.0574 - val_accuracy: 0.7522 - val_top-5-accuracy: 0.9830 +Epoch 20/30 +352/352 [==============================] - 5s 16ms/step - loss: 0.9902 - accuracy: 0.7792 - top-5-accuracy: 0.9883 - val_loss: 1.1174 - val_accuracy: 0.7354 - val_top-5-accuracy: 0.9834 +Epoch 21/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9855 - accuracy: 0.7830 - top-5-accuracy: 0.9883 - val_loss: 1.0374 - val_accuracy: 0.7598 - val_top-5-accuracy: 0.9850 +Epoch 22/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9750 - accuracy: 0.7890 - top-5-accuracy: 0.9898 - val_loss: 1.0547 - val_accuracy: 0.7570 - val_top-5-accuracy: 0.9824 +Epoch 23/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9696 - accuracy: 0.7903 - top-5-accuracy: 0.9898 - val_loss: 1.0271 - val_accuracy: 0.7680 - val_top-5-accuracy: 0.9856 +Epoch 24/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9634 - accuracy: 0.7957 - top-5-accuracy: 0.9890 - val_loss: 1.0197 - val_accuracy: 0.7742 - val_top-5-accuracy: 0.9864 +Epoch 25/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9513 - accuracy: 0.8004 - top-5-accuracy: 0.9898 - val_loss: 1.0614 - val_accuracy: 0.7590 - val_top-5-accuracy: 0.9826 +Epoch 26/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9498 - accuracy: 0.8014 - top-5-accuracy: 0.9897 - val_loss: 1.0088 - val_accuracy: 0.7792 - val_top-5-accuracy: 0.9858 +Epoch 27/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9393 - accuracy: 0.8040 - top-5-accuracy: 0.9904 - val_loss: 1.0632 - val_accuracy: 0.7598 - val_top-5-accuracy: 0.9808 +Epoch 28/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9390 - accuracy: 0.8063 - top-5-accuracy: 0.9901 - val_loss: 1.0624 - val_accuracy: 0.7580 - val_top-5-accuracy: 0.9808 +Epoch 29/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9421 - accuracy: 0.8045 - top-5-accuracy: 0.9901 - val_loss: 1.0095 - val_accuracy: 0.7768 - val_top-5-accuracy: 0.9870 +Epoch 30/30 +352/352 [==============================] - 5s 15ms/step - loss: 0.9234 - accuracy: 0.8108 - top-5-accuracy: 0.9915 - val_loss: 1.0183 - val_accuracy: 0.7808 - val_top-5-accuracy: 0.9838 +313/313 [==============================] - 2s 5ms/step - loss: 1.0569 - accuracy: 0.7645 - top-5-accuracy: 0.9827 +Test accuracy: 76.45% +Test top 5 accuracy: 98.27% + +``` +
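+As a quick sanity check, here is a minimal inference sketch (not part of the original
+example) that reuses the `cct_model` trained above. Because the `Rescaling` layer is part
+of the model and the random crop and flip layers behave deterministically at inference
+time, we can feed raw `[0, 255]` test images directly.
+
+
+```python
+# Predict a handful of test images and compare against the ground-truth labels.
+sample_images = x_test[:8]
+sample_logits = cct_model.predict(sample_images)
+predicted_classes = np.argmax(sample_logits, axis=-1)
+true_classes = np.argmax(y_test[:8], axis=-1)
+print("Predicted classes:", predicted_classes)
+print("True classes:     ", true_classes)
+```
+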
+Let's now visualize the training progress of the model.
+
+
+```python
+plt.plot(history.history["loss"], label="train_loss")
+plt.plot(history.history["val_loss"], label="val_loss")
+plt.xlabel("Epochs")
+plt.ylabel("Loss")
+plt.title("Train and Validation Losses Over Epochs", fontsize=14)
+plt.legend()
+plt.grid()
+plt.show()
+```
+
+
+![png](/img/examples/vision/cct/cct_22_0.png)
+
+
+The CCT model we just trained has only **0.4 million** parameters, and it gets us to
+~78% top-1 accuracy within 30 epochs. The plot above also shows no signs of overfitting.
+This means we can train this network for longer (perhaps with a bit more
+regularization) and obtain even better performance. The performance can be improved
+further with additional recipes like a cosine decay learning rate schedule (a minimal
+sketch is included at the end of this example) and other data augmentation techniques
+such as [AutoAugment](https://arxiv.org/abs/1805.09501),
+[MixUp](https://arxiv.org/abs/1710.09412) or
+[CutMix](https://arxiv.org/abs/1905.04899). The authors also present a number of
+experiments to study how the number of convolution blocks, Transformer layers, etc.
+affect the final performance.
+
+For comparison, a ViT model takes about **4.7 million** parameters and **100
+epochs** of training to reach a top-1 accuracy of 78.22% on the CIFAR-10 dataset. You can
+refer to
+[this notebook](https://colab.research.google.com/gist/sayakpaul/1a80d9f582b044354a1a26c5cb3d69e5/image_classification_with_vision_transformer.ipynb)
+to learn about the experimental setup.
+
+The authors also demonstrate the performance of Compact Convolutional Transformers on
+NLP tasks, and they report competitive results there.
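+
+Below is a minimal sketch of how a cosine decay learning rate schedule could be wired in,
+reusing the hyperparameters defined earlier. This is an illustrative assumption rather
+than the training setup used in the paper; note that
+`keras.optimizers.schedules.CosineDecay` is available as `keras.experimental.CosineDecay`
+in older TensorFlow versions.
+
+
+```python
+# Roughly one decay step per training batch over the full run
+# (using the same 90%/10% train/validation split as `run_experiment()`).
+train_samples = int(len(x_train) * 0.9)
+total_steps = (train_samples // batch_size) * num_epochs
+scheduled_lr = keras.optimizers.schedules.CosineDecay(
+    initial_learning_rate=learning_rate, decay_steps=total_steps
+)
+# `tfa.optimizers.AdamW` accepts a learning rate schedule in place of a constant.
+# The weight decay is kept constant here for simplicity.
+cosine_optimizer = tfa.optimizers.AdamW(
+    learning_rate=scheduled_lr, weight_decay=weight_decay
+)
+```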