Skip to content

Commit

Permalink
Release PPTSN, PPTSM, TSN, TSM base tipc
Browse files Browse the repository at this point in the history
  • Loading branch information
HydrogenSulfate committed Nov 22, 2021
1 parent 47545fb commit bd290d3
Show file tree
Hide file tree
Showing 11 changed files with 273 additions and 145 deletions.
3 changes: 3 additions & 0 deletions configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ DATASET: #DATASET field
test_batch_size: 1
train:
format: "VideoDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "data/k400" #Mandatory, train data root path
file_path: "data/k400/train.list" #Mandatory, train data index file path
valid:
format: "VideoDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "data/k400" #Mandatory, valid data root path
file_path: "data/k400/val.list" #Mandatory, valid data index file path
test:
format: "VideoDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "data/k400" #Mandatory, test data root path
file_path: "data/k400/val.list" #Mandatory, test data index file path

PIPELINE: #PIPELINE field
Expand Down
3 changes: 3 additions & 0 deletions configs/recognition/pptsn/pptsn_k400_videos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@ DATASET: #DATASET field
num_workers: 4 #Mandatory, XXX the number of subprocesses on each GPU.
train:
format: "VideoDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "data/k400" #Mandatory, train data root path
file_path: "data/k400/train.list" #Mandatory, train data index file path
valid:
format: "VideoDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "data/k400" #Mandatory, valid data root path
file_path: "data/k400/val.list" #Mandatory, valid data index file path
test:
format: "VideoDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "data/k400" #Mandatory, test data root path
file_path: "data/k400/val.list" #Mandatory, test data index file path


Expand Down
6 changes: 3 additions & 3 deletions configs/recognition/tsm/tsm_k400_frames.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@ DATASET: #DATASET field
num_workers: 4 #Mandatory, XXX the number of subprocesses on each GPU.
train:
format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "" #Mandatory, train data root path
data_prefix: "data/k400/rawframes" #Mandatory, train data root path
file_path: "data/k400_frames/train.list" #Mandatory, train data index file path
suffix: 'img_{:05}.jpg'
valid:
format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "" #Mandatory, valid data root path
data_prefix: "data/k400/rawframes" #Mandatory, valid data root path
file_path: "data/k400_frames/val.list" #Mandatory, valid data index file path
suffix: 'img_{:05}.jpg'
test:
format: "FrameDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "" #Mandatory, test data root path
data_prefix: "data/k400/rawframes" #Mandatory, test data root path
file_path: "data/k400_frames/val.list" #Mandatory, test data index file path
suffix: 'img_{:05}.jpg'

Expand Down
3 changes: 3 additions & 0 deletions configs/recognition/tsm/tsm_k400_videos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ DATASET: #DATASET field
num_workers: 4 #Mandatory, XXX the number of subprocesses on each GPU.
train:
format: "VideoDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "data/k400/videos" #Mandatory, train data root path
file_path: "data/k400/train.list" #Mandatory, train data index file path
valid:
format: "VideoDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "data/k400/videos" #Mandatory, valid data root path
file_path: "data/k400/val.list" #Mandatory, valid data index file path
test:
format: "VideoDataset" #Mandatory, indicates the type of dataset, associated with 'paddlevideo/loader/dataset'
data_prefix: "data/k400/videos" #Mandatory, test data root path
file_path: "data/k400/val.list" #Mandatory, test data index file path


Expand Down
35 changes: 30 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import argparse
from paddlevideo.utils import get_config
from paddlevideo.tasks import train_model, train_model_multigrid, test_model, train_dali
from paddlevideo.utils import get_dist_info
import random

import numpy as np
import paddle

from paddlevideo.tasks import (test_model, train_dali, train_model,
train_model_multigrid)
from paddlevideo.utils import get_config, get_dist_info


def parse_args():
Expand Down Expand Up @@ -53,6 +57,16 @@ def parse_args():
'--validate',
action='store_true',
help='whether to evaluate the checkpoint during training')
parser.add_argument(
'--seed',
type=int,
default=None,
help='fixed all random seeds when the program is running')
parser.add_argument(
'--seed',
type=int,
default=None,
help='fixed all random seeds when the program is running')
parser.add_argument(
'-p',
'--profiler_options',
Expand All @@ -69,6 +83,15 @@ def main():
args = parse_args()
cfg = get_config(args.config, overrides=args.override)

# set seed if specified
seed = args.seed
if seed is not None:
assert isinstance(
seed, int), f"seed must be a integer when specified, but got {seed}"
paddle.seed(seed)
np.random.seed(seed)
random.seed(seed)

_, world_size = get_dist_info()
parallel = world_size != 1
if parallel:
Expand All @@ -79,7 +102,9 @@ def main():
elif args.train_dali:
train_dali(cfg, weights=args.weights, parallel=parallel)
elif args.multigrid:
train_model_multigrid(cfg, world_size=world_size, validate=args.validate)
train_model_multigrid(cfg,
world_size=world_size,
validate=args.validate)
else:
train_model(cfg,
weights=args.weights,
Expand Down
5 changes: 3 additions & 2 deletions paddlevideo/loader/pipelines/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,11 @@ def __call__(self, results):
frames_idx = []
if self.linspace_sample:
if 'start_idx' in results and 'end_idx' in results:
offsets = np.linspace(results['start_idx'], results['end_idx'], self.num_seg)
offsets = np.linspace(results['start_idx'], results['end_idx'],
self.num_seg)
else:
offsets = np.linspace(0, frames_len - 1, self.num_seg)
offsets = np.clip(offsets, 0, frames_len - 1).astype(np.long)
offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)
if results['format'] == 'video':
frames_idx = list(offsets)
frames_idx = [x % frames_len for x in frames_idx]
Expand Down
96 changes: 46 additions & 50 deletions paddlevideo/modeling/backbones/vit.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,15 @@
import numpy as np
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal
import paddle.nn.functional as F
from ..registry import BACKBONES
from paddle.nn.initializer import Constant

from ...utils import load_ckpt
from ..registry import BACKBONES
from ..weight_init import trunc_normal_


__all__ = ['VisionTransformer']


zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)

Expand Down Expand Up @@ -77,7 +76,7 @@ def __init__(self,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
drop=0.0):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
Expand All @@ -101,8 +100,8 @@ def __init__(self,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
attn_drop=0.0,
proj_drop=0.0):
super().__init__()

self.num_heads = num_heads
Expand Down Expand Up @@ -151,7 +150,7 @@ def __init__(self,
if isinstance(norm_layer, str):
self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
elif isinstance(norm_layer, Callable):
self.norm1 = norm_layer(dim)
self.norm1 = norm_layer(dim, epsilon=epsilon)
else:
raise TypeError(
"The norm_layer must be str or paddle.nn.layer.Layer class")
Expand Down Expand Up @@ -185,7 +184,7 @@ def __init__(self,
if isinstance(norm_layer, str):
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
elif isinstance(norm_layer, Callable):
self.norm2 = norm_layer(dim)
self.norm2 = norm_layer(dim, epsilon=epsilon)
else:
raise TypeError(
"The norm_layer must be str or paddle.nn.layer.Layer class")
Expand All @@ -205,14 +204,14 @@ def forward(self, x, B, T, W):
elif self.attention_type == 'divided_space_time':
########## Temporal ##########
xt = x[:, 1:, :]
_b, _h, _w, _t, _m = B, H, W, T, xt.shape[-1]
xt = xt.reshape([_b * _h * _w if _b > 0 else -1, _t, _m])
_, _, _, _t, _m = B, H, W, T, xt.shape[-1]
xt = xt.reshape([-1, _t, _m])

res_temporal = self.drop_path(
self.temporal_attn(self.temporal_norm1(xt)))

_b, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1]
res_temporal = res_temporal.reshape([_b, _h * _w * _t, _m])
_, _h, _w, _t, _m = B, H, W, T, res_temporal.shape[-1]
res_temporal = res_temporal.reshape([-1, _h * _w * _t, _m])

res_temporal = self.temporal_fc(res_temporal)
xt = x[:, 1:, :] + res_temporal
Expand All @@ -221,26 +220,26 @@ def forward(self, x, B, T, W):
init_cls_token = x[:, 0, :].unsqueeze(1)
cls_token = init_cls_token.tile((1, T, 1))
_b, _t, _m = cls_token.shape
cls_token = cls_token.reshape([_b * _t, _m]).unsqueeze(1)
cls_token = cls_token.reshape([-1, _m]).unsqueeze(1)

xs = xt
_b, _h, _w, _t, _m = B, H, W, T, xs.shape[-1]
xs = xs.reshape([_b, _h, _w, _t, _m]).transpose(
(0, 3, 1, 2, 4)).reshape([_b * _t if _b > 0 else -1, _h * _w, _m])
_, _h, _w, _t, _m = B, H, W, T, xs.shape[-1]
xs = xs.reshape([-1, _h, _w, _t, _m]).transpose(
(0, 3, 1, 2, 4)).reshape([-1, _h * _w, _m])
xs = paddle.concat((cls_token, xs), axis=1)
res_spatial = self.drop_path(self.attn(self.norm1(xs)))

# Taking care of CLS token
cls_token = res_spatial[:, 0, :]
_b, _t, _m = B, T, cls_token.shape[-1]
cls_token = cls_token.reshape([_b, _t, _m])
_, _t, _m = B, T, cls_token.shape[-1]
cls_token = cls_token.reshape([-1, _t, _m])
# averaging for every frame
cls_token = paddle.mean(cls_token, axis=1, keepdim=True)

res_spatial = res_spatial[:, 1:, :]
_b, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1]
res_spatial = res_spatial.reshape([_b, _t, _h, _w, _m]).transpose(
(0, 2, 3, 1, 4)).reshape([_b, _h * _w * _t, _m])
_, _t, _h, _w, _m = B, T, H, W, res_spatial.shape[-1]
res_spatial = res_spatial.reshape([-1, _t, _h, _w, _m]).transpose(
(0, 2, 3, 1, 4)).reshape([-1, _h * _w * _t, _m])

res = res_spatial
x = xt
Expand Down Expand Up @@ -282,7 +281,7 @@ def forward(self, x):
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = x.transpose((0, 2, 1, 3, 4))
x = x.reshape([B * T if B > 0 else -1, C, H, W])
x = x.reshape([-1, C, H, W])
x = self.proj(x)
W = x.shape[-1]
x = x.flatten(2).transpose((0, 2, 1))
Expand Down Expand Up @@ -316,7 +315,6 @@ def __init__(self,
self.pretrained = pretrained
self.seg_num = seg_num
self.attention_type = attention_type

self.num_features = self.embed_dim = embed_dim

self.patch_embed = PatchEmbed(img_size=img_size,
Expand Down Expand Up @@ -375,32 +373,30 @@ def init_weights(self):
zeros_(m.temporal_fc.weight)
zeros_(m.temporal_fc.bias)
i += 1

"""Second, if provide pretrained ckpt, load it"""
if isinstance(
self.pretrained, str
) and self.pretrained.strip() != "": # load pretrained weights
load_ckpt(self, self.pretrained, num_patches=self.patch_embed.num_patches,
seg_num=self.seg_num, attention_type=self.attention_type)
elif self.pretrained is None or self.pretrained.strip() == "":
pass
else:
raise NotImplementedError
load_ckpt(self,
self.pretrained,
num_patches=self.patch_embed.num_patches,
seg_num=self.seg_num,
attention_type=self.attention_type)

def _init_fn(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
if m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
ones_(m.weight)
zeros_(m.bias)

def forward_features(self, x):
B = x.shape[0]
# B = paddle.shape(x)[0]
x, T, W = self.patch_embed(x)
cls_tokens = self.cls_token.expand((x.shape[0] if B > 0 else 3 * T, -1, -1))
# B = x.shape[0]
B = paddle.shape(x)[0]
x, T, W = self.patch_embed(x) # [BT,nH*nW,F]
cls_tokens = self.cls_token.expand((B * T, -1, -1)) # [1,1,F]->[BT,1,F]
x = paddle.concat((cls_tokens, x), axis=1)
pos_interp = (x.shape[1] != self.pos_embed.shape[1])
if pos_interp:
Expand All @@ -426,14 +422,14 @@ def forward_features(self, x):

# Time Embeddings
if self.attention_type != 'space_only':
# cls_tokens = x[:B, 0, :].unsqueeze(1)
cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split(T)[0].index_select(paddle.to_tensor([0]), axis=1)
cls_tokens = x[:B, 0, :].unsqueeze(1) if B > 0 else x.split(
T)[0].index_select(paddle.to_tensor([0]), axis=1)
x = x[:, 1:]
_bt, _n, _m = x.shape
_b = B
_t = _bt // _b if _b != -1 else T
x = x.reshape([_b, _t, _n, _m]).transpose(
(0, 2, 1, 3)).reshape([_b * _n if _b > 0 else -1, _t, _m])
_, _n, _m = x.shape
# _b = B
_t = T
x = x.reshape([-1, _t, _n, _m]).transpose(
(0, 2, 1, 3)).reshape([-1, _t, _m])
# Resizing time embeddings in case they don't match
time_interp = (T != self.time_embed.shape[1])
if time_interp: # T' != T
Expand All @@ -447,9 +443,9 @@ def forward_features(self, x):
x = x + self.time_embed

x = self.time_drop(x)
_bn, _t, _m = x.shape
_b = B
x = x.reshape([_b, _n * _t, _m] if _n > 0 else [_b, W * W * T, _m])
_, _t, _m = x.shape
# _b = B
x = x.reshape([-1, W * W * T, _m])
x = paddle.concat((cls_tokens, x), axis=1)

# Attention blocks
Expand All @@ -458,14 +454,14 @@ def forward_features(self, x):

# Predictions for space-only baseline
if self.attention_type == 'space_only':
_bt, _n, _m = x.shape
_b = B
_, _n, _m = x.shape
# _b = B
_t = T
x = x.reshape([_b, _t, _n, _m])
x = x.reshape([-1, _t, _n, _m])
x = paddle.mean(x, 1) # averaging predictions for every frame

x = self.norm(x)
return x[:, 0] # [B, 1, embed_dim]
return x[:, 0] # [B, embed_dim]

def forward(self, x):
x = self.forward_features(x)
Expand Down
Loading

0 comments on commit bd290d3

Please sign in to comment.