
Commit b468d82

Merge pull request #57 from ai-forever/dev

1.1.0

2 parents 5abeab0 + 5b00279

91 files changed: +15153 −3 lines


DPF/configs/files_config.py

Lines changed: 5 additions & 0 deletions
@@ -58,6 +58,7 @@ def from_path_and_columns(
         path: str,
         image_path_col: Optional[str] = None,
         video_path_col: Optional[str] = None,
+        audio_path_col: Optional[str] = None,
         text_col: Optional[str] = None,
     ) -> "FilesDatasetConfig":
         """
@@ -69,6 +70,8 @@ def from_path_and_columns(
            Name of column with image paths
        video_path_col: Optional[str] = None
            Name of column with video paths
+       audio_path_col: Optional[str] = None
+           Name of column with audio paths
        text_col: Optional[str] = None
            Name of column with text
@@ -82,6 +85,8 @@ def from_path_and_columns(
             datatypes.append(FileDataType(MODALITIES['image'], image_path_col))
         if video_path_col:
             datatypes.append(FileDataType(MODALITIES['video'], video_path_col))
+        if audio_path_col:
+            datatypes.append(FileDataType(MODALITIES['audio'], audio_path_col))
         if text_col:
             datatypes.append(ColumnDataType(MODALITIES['text'], text_col))
         assert len(datatypes) > 0, "At least one modality should be provided"
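With this change a files-style config can carry an audio column. A minimal sketch (the table path and column names are hypothetical, and the import assumes the config classes are re-exported from the top-level DPF package as in the project README):

from DPF import FilesDatasetConfig

config = FilesDatasetConfig.from_path_and_columns(
    "dataset/data.csv",           # hypothetical table with one row per sample
    audio_path_col="audio_path",  # new in 1.1.0: column with paths to audio files
    text_col="caption",
)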

DPF/configs/sharded_files_config.py

Lines changed: 5 additions & 0 deletions
@@ -33,6 +33,7 @@ def from_path_and_columns(
         path: str,
         image_name_col: Optional[str] = None,
         video_name_col: Optional[str] = None,
+        audio_name_col: Optional[str] = None,
         text_col: Optional[str] = None,
         datafiles_ext: str = "csv",
     ) -> "ShardedFilesDatasetConfig":
@@ -45,6 +46,8 @@ def from_path_and_columns(
            Name of column with image filenames in shard
        video_name_col: Optional[str] = None
            Name of column with video filenames in shard
+       audio_name_col: Optional[str] = None
+           Name of column with audio filenames in shard
        text_col: Optional[str] = None
            Name of column with text
        datafiles_ext: str = "csv"
@@ -60,6 +63,8 @@ def from_path_and_columns(
             datatypes.append(ShardedDataType(MODALITIES['image'], image_name_col))
         if video_name_col:
             datatypes.append(ShardedDataType(MODALITIES['video'], video_name_col))
+        if audio_name_col:
+            datatypes.append(ShardedDataType(MODALITIES['audio'], audio_name_col))
         if text_col:
             datatypes.append(ColumnDataType(MODALITIES['text'], text_col))
         assert len(datatypes) > 0, "At least one modality should be provided"
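The sharded-files layout gets the same parameter; here the column holds filenames relative to each shard folder rather than full paths. A minimal sketch (path and column names hypothetical):

from DPF import ShardedFilesDatasetConfig

config = ShardedFilesDatasetConfig.from_path_and_columns(
    "dataset/shards/",            # hypothetical directory of shard folders plus csv datafiles
    audio_name_col="audio_name",  # audio filenames inside each shard
    text_col="caption",
)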

DPF/configs/shards_config.py

Lines changed: 5 additions & 0 deletions
@@ -37,6 +37,7 @@ def from_path_and_columns(
         path: str,
         image_name_col: Optional[str] = None,
         video_name_col: Optional[str] = None,
+        audio_name_col: Optional[str] = None,
         text_col: Optional[str] = None,
         archives_ext: str = "tar",
         datafiles_ext: str = "csv",
@@ -50,6 +51,8 @@ def from_path_and_columns(
            Name of column with image filenames in shard
        video_name_col: Optional[str] = None
            Name of column with video filenames in shard
+       audio_name_col: Optional[str] = None
+           Name of column with audio filenames in shard
        text_col: Optional[str] = None
            Name of column with text
        archives_ext: str = "tar"
@@ -67,6 +70,8 @@ def from_path_and_columns(
             datatypes.append(ShardedDataType(MODALITIES['image'], image_name_col))
         if video_name_col:
             datatypes.append(ShardedDataType(MODALITIES['video'], video_name_col))
+        if audio_name_col:
+            datatypes.append(ShardedDataType(MODALITIES['audio'], audio_name_col))
         if text_col:
             datatypes.append(ColumnDataType(MODALITIES['text'], text_col))
         assert len(datatypes) > 0, "At least one modality should be provided"
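And likewise for tar-archive shards, which differ only in the archives_ext parameter. A minimal sketch (path and column names hypothetical):

from DPF import ShardsDatasetConfig

config = ShardsDatasetConfig.from_path_and_columns(
    "dataset/shards/",            # hypothetical directory of .tar shards plus .csv datafiles
    audio_name_col="audio_name",  # audio filenames inside each tar archive
    text_col="caption",
)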

DPF/filters/audios/audio_filter.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+from abc import ABC
+
+from DPF.filters.data_filter import DataFilter
+from DPF.modalities import MODALITIES, ModalityName
+
+
+class AudioFilter(DataFilter, ABC):
+    """
+    Abstract class for all audio filters.
+    """
+
+    @property
+    def modalities(self) -> list[ModalityName]:
+        return ['audio']
+
+    @property
+    def key_column(self) -> str:
+        return MODALITIES['audio'].path_column
+
+    @property
+    def metadata_columns(self) -> list[str]:
+        return []
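A concrete audio filter then only has to supply the remaining DataFilter hooks. A minimal sketch of that contract, mirroring the shape of the filters in this commit (the filter itself, its column, and its logic are hypothetical):

from typing import Any

from DPF.filters.audios.audio_filter import AudioFilter
from DPF.types import ModalityToDataMapping


class AudioSizeFilter(AudioFilter):
    """Hypothetical example: records the raw byte size of every audio."""

    @property
    def result_columns(self) -> list[str]:
        return ["audio_bytes"]

    @property
    def dataloader_kwargs(self) -> dict[str, Any]:
        return {"num_workers": 8, "batch_size": 1, "drop_last": False}

    def preprocess_data(
        self,
        modality2data: ModalityToDataMapping,
        metadata: dict[str, Any]
    ) -> Any:
        # the key comes from the audio path column defined by AudioFilter.key_column
        return metadata[self.key_column], len(modality2data['audio'])

    def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
        df_batch_labels = self._get_dict_from_schema()
        for key, size in batch:
            df_batch_labels[self.key_column].append(key)
            df_batch_labels["audio_bytes"].append(size)
        return df_batch_labels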

DPF/filters/audios/info_filter.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+from dataclasses import dataclass
+from io import BytesIO
+from typing import Any, Optional
+
+import soundfile as sf
+
+from DPF.types import ModalityToDataMapping
+
+from .audio_filter import AudioFilter
+
+
+@dataclass
+class AudioInfo:
+    key: str
+    is_correct: bool
+    duration: Optional[float]
+    sample_rate: Optional[int]
+    error: Optional[str]
+
+
+def get_audio_info(audio_bytes: bytes, data: dict[str, Any], key_column: str) -> AudioInfo:
+    """
+    Get info about audio
+    """
+    key = data[key_column]
+
+    is_correct = True
+    sample_rate, duration = None, None
+    err_str = None
+
+    try:
+        file = sf.SoundFile(BytesIO(audio_bytes))
+
+        sample_rate = file.samplerate
+        duration = len(file) / sample_rate
+    except Exception as err:
+        is_correct = False
+        err_str = str(err)
+
+    return AudioInfo(key, is_correct, duration, sample_rate, err_str)
+
+
+class AudioInfoFilter(AudioFilter):
+    """
+    Filter for gathering basic info about audios (duration, sample rate)
+
+    Parameters
+    ----------
+    workers: int = 16
+        Number of parallel dataloader workers
+    pbar: bool = True
+        Whether to show progress bar
+    """
+
+    def __init__(self, workers: int = 16, pbar: bool = True, _pbar_position: int = 0):
+        super().__init__(pbar, _pbar_position)
+        self.num_workers = workers
+
+    @property
+    def result_columns(self) -> list[str]:
+        return [
+            "is_correct", "duration", "sample_rate", "error",
+        ]
+
+    @property
+    def dataloader_kwargs(self) -> dict[str, Any]:
+        return {
+            "num_workers": self.num_workers,
+            "batch_size": 1,
+            "drop_last": False,
+        }
+
+    def preprocess_data(
+        self,
+        modality2data: ModalityToDataMapping,
+        metadata: dict[str, Any]
+    ) -> Any:
+        return get_audio_info(modality2data['audio'], metadata, self.key_column)
+
+    def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
+        df_batch_labels = self._get_dict_from_schema()
+
+        for audio_info in batch:
+            df_batch_labels[self.key_column].append(audio_info.key)
+            df_batch_labels["is_correct"].append(audio_info.is_correct)
+            df_batch_labels["duration"].append(audio_info.duration)
+            df_batch_labels["sample_rate"].append(audio_info.sample_rate)
+            df_batch_labels["error"].append(audio_info.error)
+        return df_batch_labels
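Applied through a processor, the filter appends its result_columns to the dataset's dataframe. A minimal sketch (reader and apply_data_filter usage as in the DPF README; config is assumed to be one of the audio-aware configs above):

from DPF import DatasetReader

reader = DatasetReader()
processor = reader.read_from_config(config)        # config with an audio column
processor.apply_data_filter(AudioInfoFilter(workers=8))

# rows whose audio failed to decode end up with is_correct == False
bad = processor.df[~processor.df['is_correct']]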
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+import re
+from typing import Any
+
+import torch
+from torchvision import transforms as T
+from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
+
+from DPF.filters.images.img_filter import ImageFilter
+from DPF.types import ModalityToDataMapping
+from DPF.utils import read_image_rgb_from_bytes
+
+
+class Llava34b_Filter(ImageFilter):
+    """
+    Filter that captions the input images with the llava-v1.6-34b-hf model.
+    """
+
+    def __init__(
+        self,
+        model_path: str = 'llava-hf/llava-v1.6-34b-hf',
+        workers: int = 16,
+        batch_size: int = 8,
+        prompt: str = 'detailed-long',
+        device: str = "cuda:0",
+        pbar: bool = True,
+        crop_size_x: int = 336,
+        crop_size_y: int = 336,
+        resize: int = 336,
+        _pbar_position: int = 0
+    ):
+        super().__init__(pbar, _pbar_position)
+        self.batch_size = batch_size
+        self.num_workers = workers
+        self.device = device
+        self.crop_size_x = crop_size_x
+        self.crop_size_y = crop_size_y
+        self.resize = resize
+        self.model_path = model_path
+        self.prompt_to_use = prompt
+        prompts = {
+            'detailed-long': 'Please provide a caption for this image. Speak confidently and describe everything clearly. Do not lie and describe only what you can see',
+            'pixart': 'Describe this image and its style in a very detailed manner',
+            'short': 'Describe this image very shortly in 1-2 short sentences',
+            'short-video': 'Describe this video very shortly in 1-2 short sentences. Describe what is happening in this video.'
+        }
+        self.input_ids = prompts[self.prompt_to_use]
+        print(self.input_ids)
+        self.prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\n" + f"{self.input_ids}" + "<|im_end|><|im_start|>assistant\n"
+        self.processor = LlavaNextProcessor.from_pretrained(model_path)
+        self.model = LlavaNextForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            attn_implementation="flash_attention_2",
+            device_map=self.device
+        )
+
+    @property
+    def result_columns(self) -> list[str]:
+        return [f"caption {self.model_path}"]
+
+    @property
+    def dataloader_kwargs(self) -> dict[str, Any]:
+        return {
+            "num_workers": self.num_workers,
+            "batch_size": self.batch_size,
+            "drop_last": False,
+        }
+
+    def preprocess_data(
+        self,
+        modality2data: ModalityToDataMapping,
+        metadata: dict[str, Any]
+    ) -> Any:
+        key = metadata[self.key_column]
+        pil_img = read_image_rgb_from_bytes(modality2data['image']).convert('RGB')
+        transform = T.Compose([
+            T.Resize(self.resize),
+            T.CenterCrop((self.crop_size_x, self.crop_size_y))
+        ])
+        cropped_image = transform(pil_img)
+        return key, cropped_image
+
+    def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
+        df_batch_labels = self._get_dict_from_schema()
+        keys, images = list(zip(*batch))
+        # use len(images): with drop_last=False the final batch can be smaller than batch_size
+        prompts = [self.prompt for _ in range(len(images))]
+        inputs = self.processor(prompts, list(images), return_tensors="pt").to(self.device)
+        with torch.inference_mode():
+            output_ids = self.model.generate(**inputs, max_new_tokens=512, use_cache=True)
+
+        all_outputs = []
+        for i in range(output_ids.shape[0]):
+            output = self.processor.decode(output_ids[i], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+            # drop everything up to and including the assistant tag, then the first newline
+            output = re.sub(r'.*?assistant', '', output, flags=re.DOTALL)
+            output = re.sub(r'\n', '', output, count=1)
+            all_outputs.append(output)
+
+        df_batch_labels[self.schema[1]].extend(all_outputs)
+        df_batch_labels[self.key_column].extend(keys)
+
+        return df_batch_labels
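Usage follows the same processor pattern as the other image filters; a hedged sketch (the module path is assumed, since the file name is not shown in this view, and processor is a DPF processor as above):

# module path assumed; only the class name Llava34b_Filter appears in the diff
from DPF.filters.images.llava_captioning import Llava34b_Filter

llava_filter = Llava34b_Filter(prompt='short', batch_size=8, device="cuda:0")
processor.apply_data_filter(llava_filter)
# adds a "caption llava-hf/llava-v1.6-34b-hf" column to processor.df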

DPF/filters/videos/__init__.py

Whitespace-only changes.
