ShiYaya · ShiYaya · Nov 12, 2021 · Nov 12, 2021 · Nov 12, 2021 · Nov 12, 2021
diff --git a/LICENSE b/LICENSE
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@ Automatic Video Captioning Evaluation Metric --- EMScore
 
 For an illustration, EMScore can be computed as:
 
-![EMScore](./images/EMScore.png)
+![EMScore](./emscore/images/EMScore.png)
 
 
 
@@ -25,12 +25,12 @@ For an illustration, EMScore can be computed as:
       x = self.ln_final(x).type(self.dtype)
 
       if local:
-      x = x @ self.text_projection
+          x = x @ self.text_projection
       else:
-      # x.shape = [batch_size, n_ctx, transformer.width]
-      # take features from the eot embedding (eot_token is the highest number in each sequence)
-      x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
-
+          # x.shape = [batch_size, n_ctx, transformer.width]
+          # take features from the eot embedding (eot_token is the highest number in each sequence)
+          x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+      
       return x
   ```
 
@@ -59,7 +59,7 @@ python demo.py
 ### VATEX-EVAL
 - download the files in the following link, and save at a storage directory  
 ```
-https://drive.google.com/drive/folders/1jAfZZKEgkMEYFF2x1mhYo39nH-TNeGm6?usp=sharing
+To comply with the anonymity policy, the download link is withheld for now and will be made public after acceptance.
 ```
 
 - run code
@@ -71,7 +71,7 @@ python VATEX-EVAL-demo.py --storage_path $storage_path --use_n_refs 1 --use_feat
 ### ActivityNet-FOIL
 - download the files in the following link, and save at a storage directory  
 ```
-https://drive.google.com/drive/folders/1oY9EJiEi_db_1GH-R33JDqfE8txffKR3?usp=sharing
+To comply with the anonymity policy, the download link is withheld for now and will be made public after acceptance.
 ```
 
 - run code

diff --git a/VATEX-EVAL-demo.py b/VATEX-EVAL-demo.py
@@ -48,20 +48,13 @@ def get_feats_dict(feat_dir_path):
     """
     Video feats prepare
     """
-    use_uniform_sample = 10
 
     if not opt.use_feat_cache:
         vids = [vid_base_path+vid+'.mp4' for vid in video_ids]
         metric = EMScorer(vid_feat_cache=[])
     else:
         vid_clip_feats_dir = os.path.join(opt.storage_path, 'VATEX-EVAL_video_feats')
         video_clip_feats_dict = get_feats_dict(vid_clip_feats_dir)
-        if use_uniform_sample:
-            for vid in video_clip_feats_dict:
-                data = video_clip_feats_dict[vid]
-                select_index = np.linspace(0, len(data)-1, use_uniform_sample)
-                select_index = [int(index) for index in select_index]
-                video_clip_feats_dict[vid] = data[select_index]
 
         vids = video_ids.tolist()
         metric = EMScorer(vid_feat_cache=video_clip_feats_dict)

diff --git a/emscore/images/EMScore.png b/emscore/images/EMScore.png
diff --git a/extract_video_embeddings.py b/extract_video_embeddings.py
@@ -16,7 +16,6 @@
 import math
 
 
-
 def encode_video(video_file, preprocess, model):
     cap = cv2.VideoCapture(video_file)
     frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -74,42 +73,15 @@ def extract_dataset_videos_embeddings(preprocess, model, opt):
         # print(vid)
 
 
-def encode_text(vid_caps, model):
-    text_input = clip.tokenize(vid_caps).cuda()
-    with torch.no_grad():
-        text_features = model.encode_text(text_input).float()
-    text_features /= text_features.norm(dim=-1, keepdim=True)
-    return text_features
-
-
-def extract_dataset_sents_embeddings(anno_path, model, opt):
-    dataset = opt.dataset
-    save_dir_path = os.path.join(opt.save_path, 'clip_caps_feats')
-
-    if not os.path.exists(save_dir_path):
-        os.makedirs(save_dir_path)
-    anno_data = json.load(open(anno_path))
-    for vid in anno_data:
-        save_vid_path = os.path.join(save_dir_path, vid+'.pt')
-        if os.path.exists(save_vid_path):
-            continue
-        vid_caps = anno_data[vid]
-        vid_caps_embedddings = encode_text(vid_caps, model).cpu().data
-        torch.save(vid_caps_embedddings, save_vid_path)
-        # print(vid)
-
 
 if __name__ == "__main__":
     parse = argparse.ArgumentParser()
-    parse.add_argument('--videos_path', type=str, default='/mnt/disk1/yyshi/dataset/vatex/val_videos')
-    parse.add_argument('--save_path', type=str, default='/mnt/disk1/yyshi/code/Vision_Language_Bert/video/syy_CLIP_Video_Representation/data/vatex_video_qe', 
+    parse.add_argument('--videos_path', type=str, default='')
+    parse.add_argument('--save_path', type=str, default='', 
         help='the path to save reformat files')
     parse.add_argument('--backbone', type=str, default='RN50')
     opt = parse.parse_args()
 
-
-    anno_data = '/mnt/disk1/yyshi/code/Vision_Language_Bert/video/syy_CLIP_Video_Representation/data/vatex/val_en_annotations.json'
-
     device = "cuda" if torch.cuda.is_available() else "cpu"
     # backbone = ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/16']
     if 'ViT-B/16' == opt.backbone:
@@ -123,4 +95,3 @@ def extract_dataset_sents_embeddings(anno_path, model, opt):
     model, preprocess = clip.load(opt.backbone, device=device)
 
     extract_dataset_videos_embeddings(preprocess, model, opt)
-    extract_dataset_sents_embeddings(anno_data, model, opt)