ControlNet · aromanusc · Feb 6, 2024 · Feb 9, 2024
diff --git a/config/celebv_hq/emotion/celebvhq_marlin_emotion_ft.yaml b/config/celebv_hq/emotion/celebvhq_marlin_emotion_ft.yaml
@@ -0,0 +1,8 @@
+model_name: "celebvhq_marlin_emotion_ft"
+backbone: "marlin_vit_base_ytf"
+dataset: "celebvhq"
+task: "emotion"
+temporal_reduction: "mean"
+learning_rate: 1.0e-4
+seq_mean_pool: true
+finetune: true
diff --git a/dataset/celebv_hq.py b/dataset/celebv_hq.py
@@ -1,7 +1,7 @@
 import os
 from abc import ABC, abstractmethod
 from itertools import islice
-from typing import Optional
+from typing import Optional, List
 
 import ffmpeg
 import numpy as np
@@ -15,12 +15,13 @@
 
 
 class CelebvHqBase(LightningDataModule, ABC):
+    emotions = ["neutral", "happy", "sadness", "anger", "fear", "surprise", "contempt", "disgust"]
 
     def __init__(self, data_root: str, split: str, task: str, data_ratio: float = 1.0, take_num: int = None):
         super().__init__()
         self.data_root = data_root
         self.split = split
-        assert task in ("appearance", "action")
+        assert task in ("appearance", "action", "emotion")
         self.task = task
         self.take_num = take_num
 
@@ -42,6 +43,16 @@ def __getitem__(self, index: int):
     def __len__(self):
         return len(self.name_list)
 
+    @classmethod
+    def parse_emotion_label(cls, emotion_annotation: dict) -> List[int]:
+        """Encode one clip's emotion annotation as a multi-hot list aligned with ``cls.emotions``."""
+        # sep_flag set: "labels" is a list of {"emotion": name} dicts; otherwise it is one emotion name.
+        sep, raw = emotion_annotation["sep_flag"], emotion_annotation["labels"]
+        labels = [0] * len(cls.emotions)  # sized from the class list so the two cannot drift apart
+        # list.index raises ValueError for a name that is not in cls.emotions.
+        for name in ([emo["emotion"] for emo in raw] if sep else [raw]):
+            labels[cls.emotions.index(name)] = 1
+        return labels
 
 # for fine-tuning
 class CelebvHq(CelebvHqBase):
@@ -61,6 +72,8 @@ def __init__(self,
 
     def __getitem__(self, index: int):
         y = self.metadata["clips"][self.name_list[index]]["attributes"][self.task]
+        if self.task == "emotion":
+            y = self.parse_emotion_label(y)
         video_path = os.path.join(self.data_root, "cropped", self.name_list[index] + ".mp4")
 
         probe = ffmpeg.probe(video_path)["streams"][0]
@@ -124,6 +137,8 @@ def __getitem__(self, index: int):
             raise ValueError(self.temporal_reduction)
 
         y = self.metadata["clips"][self.name_list[index]]["attributes"][self.task]
+        if self.task == "emotion":
+            y = CelebvHq.parse_emotion_label(y)
 
         return x, torch.tensor(y, dtype=torch.long).bool()
 

diff --git a/evaluate.py b/evaluate.py
@@ -29,6 +29,8 @@ def train_celebvhq(args, config):
         num_classes = 40
     elif task == "action":
         num_classes = 35
+    elif task == "emotion":
+        num_classes = 8
     else:
         raise ValueError(f"Unknown task {task}")
 
@@ -39,7 +41,7 @@ def train_celebvhq(args, config):
             num_classes, config["backbone"], True, args.marlin_ckpt, "multilabel", config["learning_rate"],
             args.n_gpus > 1,
         )
-
+        
         dm = CelebvHqDataModule(
             data_path, finetune, task,
             batch_size=args.batch_size,

diff --git a/model/classifier.py b/model/classifier.py
@@ -1,9 +1,8 @@
 from typing import Optional, Union, Sequence, Dict, Literal, Any
 
-from memory_profiler import profile
 from pytorch_lightning import LightningModule
 from torch import Tensor
-from torch.nn import CrossEntropyLoss, Linear, Identity, BCEWithLogitsLoss
+from torch.nn import CrossEntropyLoss, Linear, BCEWithLogitsLoss
 from torch.optim import Adam
 from torch.optim.lr_scheduler import ReduceLROnPlateau
 from torchmetrics import Accuracy, AUROC