Merge pull request #140 from HydrogenSulfate/TSM_ucf101_ft

TSM finetune on UCF101 dataset
PaddlePaddle · May 17, 2021 · 884ee02 · 884ee02
2 parents d8b9bd4 + fe89c8c
commit 884ee02
Show file tree

Hide file tree

Showing 9 changed files with 290 additions and 67 deletions.
diff --git a/configs/recognition/tsm/tsm_k400_frames.yaml b/configs/recognition/tsm/tsm_k400_frames.yaml
@@ -0,0 +1,114 @@
+MODEL: #MODEL field
+    framework: "Recognizer2D" #Mandatory, indicates the type of network, associated with 'paddlevideo/modeling/framework/'.
+    backbone: #Mandatory, indicates the type of backbone, associated with 'paddlevideo/modeling/backbones/'.
+        name: "ResNetTSM" #Mandatory, the name of the backbone.
+        pretrained: "data/ResNet50_pretrain.pdparams" #Optional, pretrained model path.
+        num_seg: 8 #same value as num_seg in the PIPELINE sample sections below.
+        depth: 50 #Optional, the depth of the backbone architecture.
+    head:
+        name: "TSMHead" #Mandatory, indicates the type of head, associated with 'paddlevideo/modeling/heads'.
+        num_classes: 400 #Optional, the number of classes to be classified.
+        in_channels: 2048 #input channels of the extracted feature.
+        drop_ratio: 0.5 #the dropout ratio.
+        std: 0.001 #std value used in parameter initialization.
+
+
+DATASET: #DATASET field
+    batch_size: 16 #Mandatory, batch size
+    num_workers: 4 #Mandatory, XXX the number of subprocesses on each GPU.
+    train:
+        format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
+        data_prefix: "" #Mandatory, train data root path
+        file_path: "data/k400_frames/train.list" #Mandatory, train data index file path
+        suffix: 'img_{:05}.jpg'
+    valid:
+        format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
+        data_prefix: "" #Mandatory, valid data root path
+        file_path: "data/k400_frames/val.list" #Mandatory, valid data index file path
+        suffix: 'img_{:05}.jpg'
+    test:
+        format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
+        data_prefix: "" #Mandatory, test data root path
+        file_path: "data/k400_frames/val.list" #Mandatory, test data index file path (same list file as valid)
+        suffix: 'img_{:05}.jpg'
+
+
+PIPELINE: #PIPELINE field
+    train: #Mandatory, indicates the pipeline to deal with the training data, associated with 'paddlevideo/loader/pipelines/'
+        decode:
+            name: "FrameDecoder"
+        sample:
+            name: "Sampler_TSM"
+            num_seg: 8
+            seg_len: 1
+            valid_mode: False
+        transform: #Mandatory, image transform operators.
+            - MultiScaleCrop_TSM:
+                target_size: 224
+            - RandomFlip:
+            - Image2Array:
+            - Normalization:
+                mean: [0.485, 0.456, 0.406]
+                std: [0.229, 0.224, 0.225]
+
+    valid: #Mandatory, indicates the pipeline to deal with the validation data, associated with 'paddlevideo/loader/pipelines/'
+        decode:
+            name: "FrameDecoder"
+        sample:
+            name: "Sampler_TSM"
+            num_seg: 8
+            seg_len: 1
+            valid_mode: True
+        transform:
+            - Scale_PV:
+                short_size: 256
+            - CenterCrop:
+                target_size: 224
+            - Image2Array:
+            - Normalization:
+                mean: [0.485, 0.456, 0.406]
+                std: [0.229, 0.224, 0.225]
+
+    test: #the pipeline to deal with the test data; same settings as valid above.
+        decode:
+            name: "FrameDecoder"
+        sample:
+            name: "Sampler_TSM"
+            num_seg: 8
+            seg_len: 1
+            valid_mode: True
+        transform:
+            - Scale_PV:
+                short_size: 256
+            - CenterCrop:
+                target_size: 224
+            - Image2Array:
+            - Normalization:
+                mean: [0.485, 0.456, 0.406]
+                std: [0.229, 0.224, 0.225]
+
+
+OPTIMIZER: #OPTIMIZER field
+    name: 'Momentum' #Mandatory, the type of optimizer, associated with 'paddlevideo/solver/'
+    momentum: 0.9
+    learning_rate: #Mandatory, the type of learning rate scheduler, associated with 'paddlevideo/solver/'
+        name: 'PiecewiseDecay'
+        boundaries: [20, 40]
+        values: [0.02, 0.002, 0.0002]  #8 cards * 16 batch size
+    weight_decay:
+        name: 'L2'
+        value: 0.0001
+    grad_clip:
+        name: 'ClipGradByGlobalNorm'
+        value: 20.0
+
+
+METRIC: #METRIC field
+    name: 'CenterCropMetric'
+
+
+model_name: "TSM"
+log_interval: 20 #Optional, the interval of the logger, default:10
+save_interval: 10
+epochs: 50 #Mandatory, total number of epochs
+log_level: "INFO" #Optional, the logger level. default: "INFO"
diff --git a/configs/recognition/tsm/tsm_k400_frames_nhwc.yaml b/configs/recognition/tsm/tsm_k400_frames_nhwc.yaml
@@ -0,0 +1,122 @@
+MODEL: #MODEL field
+    framework: "Recognizer2D" #Mandatory, indicates the type of network, associated with 'paddlevideo/modeling/framework/'.
+    backbone: #Mandatory, indicates the type of backbone, associated with 'paddlevideo/modeling/backbones/'.
+        name: "ResNetTSM" #Mandatory, the name of the backbone.
+        pretrained: "data/ResNet50_pretrain.pdparams" #Optional, pretrained model path.
+        num_seg: 8 #same value as num_seg in the PIPELINE sample sections below.
+        depth: 50 #Optional, the depth of the backbone architecture.
+        data_format: "NHWC" #same data_format value as the head below.
+    head:
+        name: "TSMHead" #Mandatory, indicates the type of head, associated with 'paddlevideo/modeling/heads'.
+        num_classes: 400 #Optional, the number of classes to be classified.
+        in_channels: 2048 #input channels of the extracted feature.
+        drop_ratio: 0.5 #the dropout ratio.
+        std: 0.001 #std value used in parameter initialization.
+        data_format: "NHWC" #same data_format value as the backbone above.
+
+
+DATASET: #DATASET field
+    batch_size: 16 #Mandatory, batch size
+    num_workers: 4 #Mandatory, XXX the number of subprocesses on each GPU.
+    train:
+        format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
+        data_prefix: "" #Mandatory, train data root path
+        file_path: "data/k400_frames/train.list" #Mandatory, train data index file path
+        suffix: 'img_{:05}.jpg'
+    valid:
+        format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
+        data_prefix: "" #Mandatory, valid data root path
+        file_path: "data/k400_frames/val.list" #Mandatory, valid data index file path
+        suffix: 'img_{:05}.jpg'
+    test:
+        format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
+        data_prefix: "" #Mandatory, test data root path
+        file_path: "data/k400_frames/val.list" #Mandatory, test data index file path (same list file as valid)
+        suffix: 'img_{:05}.jpg'
+
+
+PIPELINE: #PIPELINE field
+    train: #Mandatory, indicates the pipeline to deal with the training data, associated with 'paddlevideo/loader/pipelines/'
+        decode:
+            name: "FrameDecoder"
+        sample:
+            name: "Sampler_TSM"
+            num_seg: 8
+            seg_len: 1
+            valid_mode: False
+        transform: #Mandatory, image transform operators.
+            - MultiScaleCrop_TSM:
+                target_size: 224
+            - RandomFlip:
+            - Image2Array:
+                transpose: False
+            - Normalization:
+                mean: [0.485, 0.456, 0.406]
+                std: [0.229, 0.224, 0.225]
+                tensor_shape: [1,1,3]
+
+    valid: #Mandatory, indicates the pipeline to deal with the validation data, associated with 'paddlevideo/loader/pipelines/'
+        decode:
+            name: "FrameDecoder"
+        sample:
+            name: "Sampler_TSM"
+            num_seg: 8
+            seg_len: 1
+            valid_mode: True
+        transform:
+            - Scale_PV:
+                short_size: 256
+            - CenterCrop:
+                target_size: 224
+            - Image2Array:
+                transpose: False
+            - Normalization:
+                mean: [0.485, 0.456, 0.406]
+                std: [0.229, 0.224, 0.225]
+                tensor_shape: [1,1,3]
+
+    test: #the pipeline to deal with the test data; same settings as valid above.
+        decode:
+            name: "FrameDecoder"
+        sample:
+            name: "Sampler_TSM"
+            num_seg: 8
+            seg_len: 1
+            valid_mode: True
+        transform:
+            - Scale_PV:
+                short_size: 256
+            - CenterCrop:
+                target_size: 224
+            - Image2Array:
+                transpose: False
+            - Normalization:
+                mean: [0.485, 0.456, 0.406]
+                std: [0.229, 0.224, 0.225]
+                tensor_shape: [1,1,3]
+
+
+OPTIMIZER: #OPTIMIZER field
+    name: 'Momentum' #Mandatory, the type of optimizer, associated with 'paddlevideo/solver/'
+    momentum: 0.9
+    learning_rate: #Mandatory, the type of learning rate scheduler, associated with 'paddlevideo/solver/'
+        name: 'PiecewiseDecay'
+        boundaries: [20, 40]
+        values: [0.02, 0.002, 0.0002]  #8 cards * 16 batch size
+    weight_decay:
+        name: 'L2'
+        value: 0.0001
+    grad_clip:
+        name: 'ClipGradByGlobalNorm'
+        value: 20.0
+
+
+METRIC: #METRIC field
+    name: 'CenterCropMetric'
+
+
+model_name: "TSM"
+log_interval: 20 #Optional, the interval of the logger, default:10
+save_interval: 10
+epochs: 50 #Mandatory, total number of epochs
+log_level: "INFO" #Optional, the logger level. default: "INFO"
diff --git a/configs/recognition/tsm/tsm.yaml b/configs/recognition/tsm/tsm_ucf101_frames.yaml
@@ -2,18 +2,17 @@ MODEL: #MODEL field
     framework: "Recognizer2D" #Mandatory, indicate the type of network, associate to the 'paddlevideo/modeling/framework/' .
     backbone: #Mandatory, indicate the type of backbone, associate to the 'paddlevideo/modeling/backbones/' .
         name: "ResNetTSM" #Mandatory, The name of backbone.
-        pretrained: "data/ResNet50_pretrain.pdparams" #Optional, pretrained model path.
+        pretrained: "data/TSM_k400.pdparams" #Optional, pretrained model path.
         num_seg: 8
         depth: 50 #Optional, the depth of backbone architecture.
-        bn_wd: False
     head:
         name: "TSMHead" #Mandatory, indicate the type of head, associate to the 'paddlevideo/modeling/heads'
         num_classes: 101 #Optional, the number of classes to be classified.
         in_channels: 2048 #input channel of the extracted feature.
-        drop_ratio: 0.5 #the ratio of dropout
-        # ls_eps: 0.1 # label smoothing epsilon
+        drop_ratio: 0.8 #the ratio of dropout
         std: 0.001 #std value in params initialization
 
+
 DATASET: #DATASET field
     batch_size: 16 #Mandatory, batch size
     num_workers: 4 #Mandatory, XXX the number of subprocess on each GPU.
@@ -88,29 +87,25 @@ PIPELINE: #PIPELINE field
                 mean: [0.485, 0.456, 0.406]
                 std: [0.229, 0.224, 0.225]
 
-#MIX:
-#    name: "Mixup"
-#    alpha: 0.2
 
 OPTIMIZER: #OPTIMIZER field
     name: 'Momentum' #Mandatory, the type of optimizer, associate to the 'paddlevideo/solver/'
     momentum: 0.9
     learning_rate: #Mandatory, the type of learning rate scheduler, associate to the 'paddlevideo/solver/'
         name: 'PiecewiseDecay'
-        boundaries: [20, 40]
-        values: [0.02, 0.002, 0.0002]  #4 cards * 16 batch size
-    weight_decay:
-        name: 'L2'
-        value: 1e-4
+        boundaries: [10, 20]
+        values: [0.001, 0.0001, 0.00001]  #4 cards * 16 batch size
     grad_clip:
         name: 'ClipGradByGlobalNorm'
         value: 20.0
 
+
 METRIC:
     name: 'CenterCropMetric'
 
+
 model_name: "TSM"
 log_interval: 20 #Optional, the interval of logger, default:10
 save_interval: 10
-epochs: 50 #Mandatory, total epoch
+epochs: 25 #Mandatory, total epoch
 log_level: "INFO" #Optional, the logger level. default: "INFO"
diff --git a/configs/recognition/tsm/tsm_nhwc.yaml b/configs/recognition/tsm/tsm_ucf101_frames_nhwc.yaml
@@ -2,16 +2,16 @@ MODEL: #MODEL field
     framework: "Recognizer2D" #Mandatory, indicate the type of network, associate to the 'paddlevideo/modeling/framework/' .
     backbone: #Mandatory, indicate the type of backbone, associate to the 'paddlevideo/modeling/backbones/' .
         name: "ResNetTSM" #Mandatory, The name of backbone.
-        pretrained: "data/ResNet50_pretrain.pdparams" #Optional, pretrained model path.
+        pretrained: "data/TSM_k400.pdparams" #Optional, pretrained model path.
         num_seg: 8
         depth: 50 #Optional, the depth of backbone architecture.
         data_format: "NHWC"
     head:
         name: "TSMHead" #Mandatory, indicate the type of head, associate to the 'paddlevideo/modeling/heads'
         num_classes: 101 #Optional, the number of classes to be classified.
         in_channels: 2048 #input channel of the extracted feature.
-        drop_ratio: 0.5 #the ratio of dropout
-        std: 0.01 #std value in params initialization
+        drop_ratio: 0.8 #the ratio of dropout
+        std: 0.001 #std value in params initialization
         data_format: "NHWC"
 
 
@@ -40,16 +40,12 @@ PIPELINE: #PIPELINE field
         decode:
             name: "FrameDecoder"
         sample:
-            name: "Sampler"
+            name: "Sampler_TSM"
             num_seg: 8
             seg_len: 1
             valid_mode: False
         transform: #Mandatory, image transform operator.
-            - Scale:
-                short_size: 256
-            - MultiScaleCrop:
-                target_size: 256
-            - RandomCrop:
+            - MultiScaleCrop_TSM:
                 target_size: 224
             - RandomFlip:
             - Image2Array:
@@ -63,13 +59,12 @@ PIPELINE: #PIPELINE field
         decode:
             name: "FrameDecoder"
         sample:
-            name: "Sampler"
-            valid_mode: True
+            name: "Sampler_TSM"
             num_seg: 8
             seg_len: 1
             valid_mode: True
         transform:
-            - Scale:
+            - Scale_PV:
                 short_size: 256
             - CenterCrop:
                 target_size: 224
@@ -84,13 +79,12 @@ PIPELINE: #PIPELINE field
         decode:
             name: "FrameDecoder"
         sample:
-            name: "Sampler"
-            valid_mode: True
+            name: "Sampler_TSM"
             num_seg: 8
             seg_len: 1
             valid_mode: True
         transform:
-            - Scale:
+            - Scale_PV:
                 short_size: 256
             - CenterCrop:
                 target_size: 224
@@ -107,17 +101,19 @@ OPTIMIZER: #OPTIMIZER field
     momentum: 0.9
     learning_rate: #Mandatory, the type of learning rate scheduler, associate to the 'paddlevideo/solver/'
         name: 'PiecewiseDecay'
-        boundaries: [40, 60]
-        values: [0.01, 0.001, 0.0001]  #4 cards * 16 batch size
-    weight_decay:
-        name: 'L2'
-        value: 1e-4
+        boundaries: [10, 20]
+        values: [0.001, 0.0001, 0.00001]  #4 cards * 16 batch size
+    grad_clip:
+        name: 'ClipGradByGlobalNorm'
+        value: 20.0
+
 
 METRIC:
     name: 'CenterCropMetric'
 
+
 model_name: "TSM"
 log_interval: 20 #Optional, the interval of logger, default:10
 save_interval: 10
-epochs: 80 #Mandatory, total epoch
+epochs: 25 #Mandatory, total epoch
 log_level: "INFO" #Optional, the logger level. default: "INFO"