import os
from io import BytesIO
from typing import Any, Optional

import gdown
import torch
from lita.constants import (
    DEFAULT_IM_END_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IMAGE_TOKEN,
    IMAGE_TOKEN_INDEX,
)
from lita.model.builder import load_pretrained_model
from lita.utils import load_video
from llava.conversation import SeparatorStyle, conv_templates
from llava.mm_utils import (
    KeywordsStoppingCriteria,
    get_model_name_from_path,
    tokenizer_image_token,
)

from DPF.types import ModalityToDataMapping

from .video_filter import VideoFilter

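# default_collate lives in a different module depending on the torch version, so try both import paths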
try:
    from torch.utils.data.dataloader import default_collate
except ImportError:
    from torch.utils.data import default_collate


def disable_torch_init() -> None:
    """
    Disable the redundant torch default initialization to accelerate model creation.
    """
    torch.nn.Linear.reset_parameters = lambda self: None  # type: ignore
    torch.nn.LayerNorm.reset_parameters = lambda self: None  # type: ignore


class LITAFilter(VideoFilter):
    """
    LITA inference class for captioning videos during auto-labeling.
    More info about the model: https://github.com/NVlabs/LITA
    """
    def __init__(
        self,
        weights_path: str = "./lita-vicuna-v1-3-13b-finetune",
        model_base: Optional[str] = None,
        prompt: str = "detailed_video",
        temperature: float = 0.2,
        max_new_tokens: int = 1024,
        load_4bit: bool = False,
        load_8bit: bool = False,
        device: str = "cuda:0",
        workers: int = 16,
        batch_size: int = 8,
        pbar: bool = True,
        _pbar_position: int = 0
    ):
        super().__init__(pbar, _pbar_position)
        self.model_name = get_model_name_from_path(weights_path)
        self.prompt_to_use = prompt
        prompt_templates = {
            'detailed_video': 'Describe this video and its style in a very detailed manner',
            'short_video': 'Describe this video and its style briefly'
        }

        self.num_workers = workers
        self.batch_size = batch_size
        self.device = device

        self.inp = prompt_templates[self.prompt_to_use]
        self.temperature = temperature
        self.max_new_tokens = max_new_tokens

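        # Download the pretrained LITA weights from Google Drive if they are not available locally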
        weights_url = "https://drive.google.com/drive/folders/1-P7p-tq5aXZzSoefEJx4PSFKH8jt8KWy"
        if not os.path.exists(weights_path):
            gdown.download_folder(weights_url, output=weights_path)

        disable_torch_init()

        pretrainers = load_pretrained_model(weights_path, model_base, self.model_name, load_8bit, load_4bit)
        self.tokenizer, self.model, self.processor, self.context_len = pretrainers

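        # Build the conversation prompt once; the same tokenized prompt is reused for every batch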
        self.conv_mode = "llava_v1"
        self.conv = conv_templates[self.conv_mode].copy()

        inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + self.inp
        self.conv.append_message(self.conv.roles[0], inp)
        self.conv.append_message(self.conv.roles[1], None)
        prompt = self.conv.get_prompt()
        self.input_ids = tokenizer_image_token(
            prompt,
            self.tokenizer,
            IMAGE_TOKEN_INDEX,
            return_tensors='pt'
        ).unsqueeze(0).to(self.device)
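        # Stop generation once the conversation separator token is produced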
        stop_str = self.conv.sep if self.conv.sep_style != SeparatorStyle.TWO else self.conv.sep2
        keywords = [stop_str]
        self.stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, self.input_ids)

    @property
    def result_columns(self) -> list[str]:
        return [f"caption {self.model_name} prompt {self.prompt_to_use}"]

    @property
    def dataloader_kwargs(self) -> dict[str, Any]:
        return {
            "num_workers": self.num_workers,
            "batch_size": self.batch_size,
            "drop_last": False,
        }

    def preprocess_data(
        self,
        modality2data: ModalityToDataMapping,
        metadata: dict[str, Any]
    ) -> Any:
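        # Decode the raw video bytes and preprocess the sampled frames into a half-precision tensor;
        # unsqueeze(0) adds a per-sample dimension that process_batch strips again with [:, 0]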
        key = metadata[self.key_column]
        video_file = BytesIO(modality2data['video'])
        video_file = load_video(video_file, self.processor, self.model.config.num_frames).unsqueeze(0).half()
        return key, video_file

    def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
        df_batch_labels = self._get_dict_from_schema()

        keys, video_tensors = list(zip(*batch))

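        # Stack the per-sample tensors into one batch and replicate the prompt for every sample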
        video_tensors = default_collate(video_tensors).to(self.device)  # type: ignore
        input_ids_batch = self.input_ids.repeat_interleave(video_tensors.shape[0], 0).to(self.device)  # type: ignore

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids_batch,
                images=video_tensors[:, 0],  # type: ignore
                do_sample=True,
                temperature=self.temperature,
                top_p=0.85,
                num_beams=1,
                max_new_tokens=self.max_new_tokens,
                use_cache=True
            )

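        # Decode only the newly generated tokens and cut each caption at the </s> end-of-sequence marker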
        all_outputs: list[Optional[str]] = []
        for i in range(output_ids.shape[0]):
            caption = self.tokenizer.decode(output_ids[i, self.input_ids.shape[1]:]).strip().split('</s>')[0]
            all_outputs.append(caption)
        df_batch_labels[self.schema[1]].extend(all_outputs)
        df_batch_labels[self.key_column].extend(keys)
        return df_batch_labels
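
# Example usage (a minimal sketch, not part of the filter itself): it assumes the checkpoint
# folder above and a DPF dataset processor that applies VideoFilter subclasses; the `processor`
# variable and its `apply_data_filter` call are illustrative and may differ in your setup.
#
#     datafilter = LITAFilter(weights_path="./lita-vicuna-v1-3-13b-finetune", device="cuda:0")
#     processor.apply_data_filter(datafilter)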