ai-forever · boomb0om · Jun 10, 2024 · May 3, 2024 · May 8, 2024 · May 8, 2024
diff --git a/DPF/filters/images/llava34b_captioning_filter.py b/DPF/filters/images/llava34b_captioning_filter.py
@@ -0,0 +1,95 @@
+import re
+from typing import Any
+
+import torch
+from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
+
+from DPF.filters.images.img_filter import ImageFilter
+from DPF.types import ModalityToDataMapping
+from DPF.utils import read_image_rgb_from_bytes
+
+
+class Llava34b_Filter(ImageFilter):
+    """
+    Captions images with the llava-v1.6-34b-hf model: every image is center-cropped to
+    crop_size_x x crop_size_y and described with a fixed prompt (see result_columns).
+    """
+
+    def __init__(
+        self,
+        model_path: str = 'llava-hf/llava-v1.6-34b-hf',
+        workers: int = 16,
+        batch_size: int = 8,
+        device: str = "cuda:0",
+        pbar: bool = True,
+        crop_size_x: int = 336,
+        crop_size_y: int = 336,
+        _pbar_position: int = 0
+    ):
+        """Store the settings and load the processor and the fp16 model from model_path onto device."""
+        super().__init__(pbar, _pbar_position)
+        self.batch_size = batch_size
+        self.num_workers = workers
+        self.device = device
+        self.crop_size_x = crop_size_x
+        self.crop_size_y = crop_size_y
+        self.prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nDescribe this image and its style in a very detailed manner<|im_end|><|im_start|>assistant\n"
+        self.processor = LlavaNextProcessor.from_pretrained(model_path)
+        self.model = LlavaNextForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            attn_implementation="flash_attention_2",
+            device_map=self.device
+        )
+
+    @property
+    def result_columns(self) -> list[str]:
+        """Names of the columns produced by this filter (a single caption column)."""
+        return ["llava34b_caption"]
+
+    @property
+    def dataloader_kwargs(self) -> dict[str, Any]:
+        """Dataloader settings for this filter; drop_last=False keeps the final, possibly short, batch."""
+        return {
+            "num_workers": self.num_workers,
+            "batch_size": self.batch_size,
+            "drop_last": False,
+        }
+
+    def preprocess_data(
+        self,
+        modality2data: ModalityToDataMapping,
+        metadata: dict[str, Any]
+    ) -> Any:
+        """Read one image and return (key, center crop resized to crop_size_x x crop_size_y)."""
+        key = metadata[self.key_column]
+        pil_img = read_image_rgb_from_bytes(modality2data['image']).convert('RGB')
+        width, height = pil_img.size
+        left, top = int((width - self.crop_size_x)/2), int((height - self.crop_size_y)/2)
+        right, bottom = int((width + self.crop_size_x)/2), int((height + self.crop_size_y)/2)
+        # The resize absorbs the one-pixel shortfall that int() truncation can leave in the crop box.
+        cropped_image = pil_img.crop((left, top, right, bottom))
+        return key, cropped_image.resize((self.crop_size_x, self.crop_size_y))
+
+    def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
+        """Caption a batch of (key, image) pairs and return the filled schema dict."""
+        df_batch_labels = self._get_dict_from_schema()
+        keys, images = list(zip(*batch))
+        # One prompt per image: with drop_last=False the last batch can be smaller than batch_size.
+        prompts = [self.prompt] * len(images)
+        inputs = self.processor(prompts, list(images), return_tensors="pt").to(self.device)
+        with torch.inference_mode():
+            output_ids = self.model.generate(**inputs, max_new_tokens=512, use_cache=True)
+        all_outputs = []
+        for ids in output_ids:
+            output = self.processor.decode(
+                ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+            # count=1 strips only the echoed prompt; a later "assistant" inside the caption is kept.
+            output = re.sub(r'.*?assistant', '', output, count=1, flags=re.DOTALL)
+            # Line breaks become single spaces so the words around them are not fused together.
+            all_outputs.append(re.sub(r'\s*\n\s*', ' ', output).strip())
+
+        df_batch_labels[self.schema[1]].extend(all_outputs)
+        df_batch_labels[self.key_column].extend(keys)
+        return df_batch_labels
diff --git a/docs/filters.md b/docs/filters.md
@@ -11,6 +11,7 @@ List of implemented filters:
   - [BLIPCaptioningFilter](../DPF/filters/images/blip_captioning_filter.py) - captioning images using BLIP model
   - [CLIPLabelsFilter](../DPF/filters/images/cliplabels_filter.py) - calculate similarity of images with provided texts using CLIP model
   - [LLaVaCaptioningFilter](../DPF/filters/images/llava_captioning_filter.py) - captioning images using LLaVA models
+  - [Llava34b_Filter](../DPF/filters/images/llava34b_captioning_filter.py) - captioning images using the LLaVA-NeXT model llava-v1.6-34b-hf
   - [NSFWFilter](../DPF/filters/images/nsfw_filter.py) - NSFW images detection
   - [CRAFTFilter](../DPF/filters/images/text_detection_filter.py) - text detection on image
   - [OCRFilter](../DPF/filters/images/ocr_filter.py) - text recognition

diff --git a/examples/image_filters_example.ipynb b/examples/image_filters_example.ipynb
@@ -899,6 +899,117 @@
     "processor.df['caption liuhaotian/llava-v1.5-13b prompt pixart']"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "9e4f15f0",
+   "metadata": {},
+   "source": [
+    "## Llava34b_Filter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7c629325",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/jovyan/.mlspace/envs/env3.11/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "100%|██████████| 1/1 [00:00<00:00, 17.88it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "sys.path.append('../')\n",
+    "from DPF import ShardsDatasetConfig, DatasetReader\n",
+    "\n",
+    "config = ShardsDatasetConfig.from_path_and_columns(\n",
+    "    'example_dataset',\n",
+    "    image_name_col='image_name',\n",
+    "    text_col=\"text\"\n",
+    ")\n",
+    "\n",
+    "reader = DatasetReader()\n",
+    "processor = reader.read_from_config(config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f5cfd34d",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-05-12 12:42:04,347] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers\n",
+      "You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour\n",
+      "Loading checkpoint shards: 100%|██████████| 15/15 [00:31<00:00,  2.10s/it]\n",
+      "100%|██████████| 250/250 [1:30:04<00:00, 21.62s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from DPF.filters.images.llava34b_captioning_filter import Llava34b_Filter\n",
+    "\n",
+    "datafilter = Llava34b_Filter(\n",
+    "    workers=1, \n",
+    "    batch_size=4, \n",
+    "    device='cuda:0',\n",
+    "    crop_size_x = 336,\n",
+    "    crop_size_y = 336\n",
+    ")\n",
+    "\n",
+    "processor.apply_data_filter(datafilter)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "e471161b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0      The image depicts an older couple in a kitchen...\n",
+       "1      The image shows a close-up of two bowls of gre...\n",
+       "2      The image shows a golden retriever dog swimmin...\n",
+       "3      The image depicts a tranquil scene at what app...\n",
+       "4      The image depicts a serene landscape featuring...\n",
+       "                             ...                        \n",
+       "995    The image depicts an aerial view of a densely ...\n",
+       "996    The image depicts an impressionist painting of...\n",
+       "997    The image depicts a modern and stylish interio...\n",
+       "998    The image depicts a stylized, fantasy-themed l...\n",
+       "999    The image shows a pair of dark blue trousers w...\n",
+       "Name: fix all caption liuhaotian/llava-v1.6-34b prompt short, Length: 1000, dtype: object"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "processor.df['llava34b_caption']"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "76d24a11",
@@ -948,9 +1059,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:.mlspace-dpf_llava]",
+   "display_name": "Python [conda env:.mlspace-env3.11]",
    "language": "python",
-   "name": "conda-env-.mlspace-dpf_llava-py"
+   "name": "conda-env-.mlspace-env3.11-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -962,7 +1073,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.6"
+   "version": "3.11.8"
   }
  },
  "nbformat": 4,

diff --git a/llava_captioning.py b/llava_captioning.py
@@ -0,0 +1,114 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffc1208c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import LlavaNextConfig, LlavaNextProcessor, LlavaNextForConditionalGeneration\n",
+    "from DPF import S3Connector, DatasetReader, ShardsDatasetConfig\n",
+    "import torch\n",
+    "from PIL import Image\n",
+    "import requests\n",
+    "import csv\n",
+    "import requests\n",
+    "import os\n",
+    "from typing import Any\n",
+    "from py3langid.langid import MODEL_FILE, LanguageIdentifier\n",
+    "from DPF.filters.images.img_filter import ImageFilter\n",
+    "from DPF.types import ModalityToDataMapping\n",
+    "\n",
+    "class Llava34b_Filter(ImageFilter):\n",
+    "    \"\"\"\n",
+    "    The filter implements a description of the images supplied to the input.\n",
+    "    \"\"\"\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        model_path: str = 'llava-hf/llava-v1.6-34b-hf',\n",
+    "        workers: int = 16,\n",
+    "        batch_size: int = 16,\n",
+    "        device: str = \"cuda:0\",\n",
+    "        pbar: bool = True,\n",
+    "        _pbar_position: int = 0\n",
+    "    ):\n",
+    "        super().__init__(pbar, _pbar_position)\n",
+    "        self.batch_size = batch_size\n",
+    "        self.num_workers = workers\n",
+    "        self.device = device\n",
+    "        self.prompt = \"<|im_start|>system\\nAnswer the questions.<|im_end|><|im_start|>user\\n<image>\\nDescribe this image and its style in a very detailed manner<|im_end|><|im_start|>assistant\\n\"\n",
+    "        \n",
+    "        self.processor = LlavaNextProcessor.from_pretrained(\"llava-hf/llava-v1.6-34b-hf\")\n",
+    "        \n",
+    "        self.model = LlavaNextForConditionalGeneration.from_pretrained(\n",
+    "        \"llava-hf/llava-v1.6-34b-hf\",\n",
+    "        torch_dtype=torch.float16,\n",
+    "        low_cpu_mem_usage=True,\n",
+    "        use_flash_attention_2=True,\n",
+    "        device_map=self.device)\n",
+    "\n",
+    "    @property\n",
+    "    def result_columns(self) -> list[str]:\n",
+    "        return [\"llava34b_caption\"]\n",
+    "\n",
+    "    @property\n",
+    "    def dataloader_kwargs(self) -> dict[str, Any]:\n",
+    "        return {\n",
+    "            \"num_workers\": self.num_workers,\n",
+    "            \"batch_size\": self.batch_size,\n",
+    "            \"drop_last\": False,\n",
+    "        }\n",
+    "\n",
+    "    def preprocess_data(\n",
+    "        self,\n",
+    "        modality2data: ModalityToDataMapping,\n",
+    "        metadata: dict[str, Any]\n",
+    "    ) -> Any:\n",
+    "        key = metadata[self.key_column]\n",
+    "        pil_img = read_image_rgb_from_bytes(modality2data['image']).convert('RGB')\n",
+    "        img_tensor = self.image_processor.preprocess(pil_img, return_tensors='pt')['pixel_values'].half()\n",
+    "        return key, img_tensor\n",
+    "\n",
+    "    def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:\n",
+    "        df_batch_labels = self._get_dict_from_schema()\n",
+    "\n",
+    "        keys, image_tensors = list(zip(*batch))\n",
+    "        image_tensors = default_collate(image_tensors).to(self.device)  # type: ignore\n",
+    "\n",
+    "        input_ids_batch = self.input_ids.repeat_interleave(image_tensors.shape[0], 0).to(self.device)  # type: ignore\n",
+    "        with torch.inference_mode():\n",
+    "            output_ids = self.model.generate(\n",
+    "                input_ids_batch, images=image_tensors, do_sample=True, temperature=0.2, top_p=0.7,\n",
+    "                max_new_tokens=512, use_cache=True, stopping_criteria=[self.stopping_criteria]\n",
+    "            )\n",
+    "\n",
+    "        all_outputs = []\n",
+    "        for i in range(output_ids.shape[0]):\n",
+    "            output = self.tokenizer.decode(output_ids[i, self.input_ids.shape[1]:]).strip().split('</s>')[0]\n",
+    "            all_outputs.append(output)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:.mlspace-env3.11]",
+   "language": "python",
+   "name": "conda-env-.mlspace-env3.11-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}