
Commit 0125cbe

Merge pull request #41 from ai-forever/v1.0
docs: update filters documentation
2 parents 7dbbf81 + f5ca4ff commit 0125cbe

File tree

8 files changed: +1020, -7 lines


DPF/filters/images/hash_filters.py

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@
 
 def get_phash(pil_img: Image.Image, hash_size: int = 8, highfreq_factor: int = 4) -> str:
     img_size = hash_size * highfreq_factor
-    image_array = np.array(pil_img.resize((img_size, img_size), Image.LANCZOS))
+    image_array = np.array(pil_img.resize((img_size, img_size), Image.Resampling.LANCZOS))
 
     dct_coef = dct(dct(image_array, axis=0), axis=1)
     dct_reduced_coef = dct_coef[:hash_size, :hash_size]
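
The `Image.LANCZOS` to `Image.Resampling.LANCZOS` change follows Pillow's API: Pillow 9.1 moved the resampling filters into the `Image.Resampling` enum, and the old top-level aliases were removed in Pillow 10. For code that must run on both old and new Pillow versions, a minimal compatibility sketch (the `_LANCZOS` name is illustrative, not part of DPF):

```python
from PIL import Image

# Prefer the Image.Resampling enum (Pillow >= 9.1); fall back to the
# legacy module-level constant on older Pillow versions.
try:
    _LANCZOS = Image.Resampling.LANCZOS
except AttributeError:  # Pillow < 9.1
    _LANCZOS = Image.LANCZOS

img = Image.new("RGB", (64, 64))
resized = img.resize((32, 32), _LANCZOS)
```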

DPF/filters/images/info_filter.py

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ def get_image_info(img_bytes: bytes, data: dict[str, Any], key_column: str) -> I
 
     try:
         pil_img = Image.open(BytesIO(img_bytes))
-        pil_img.load()
+        pil_img.load()  # type: ignore
 
         arr = np.array(pil_img)

DPF/filters/videos/image_filter_adapter.py

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ def preprocess_data(
         frame = iio.imread(io.BytesIO(video_bytes), index=frame_index, plugin="pyav")
 
         buff = io.BytesIO()
-        Image.fromarray(frame).convert('RGB').save(buff, format='JPEG', quality=95)
+        Image.fromarray(frame).convert('RGB').save(buff, format='JPEG', quality=95)  # type: ignore
         modality2data['image'] = buff.getvalue()
         metadata[self.image_filter.key_column] = ''
         return key, self.image_filter.preprocess_data(modality2data, metadata)

DPF/utils/image_utils.py

Lines changed: 2 additions & 2 deletions

@@ -5,7 +5,7 @@
 
 def read_image_rgb(path: str, force_rgb: bool = True) -> Image.Image:
     pil_img = Image.open(path)
-    pil_img.load()
+    pil_img.load()  # type: ignore
     if pil_img.format == "PNG" and pil_img.mode != "RGBA":
         pil_img = pil_img.convert("RGBA")
     if force_rgb:
@@ -15,7 +15,7 @@ def read_image_rgb(path: str, force_rgb: bool = True) -> Image.Image:
 
 def read_image_rgb_from_bytes(img_bytes: bytes, force_rgb: bool = True) -> Image.Image:
     pil_img = Image.open(BytesIO(img_bytes))
-    pil_img.load()
+    pil_img.load()  # type: ignore
     if pil_img.format == "PNG" and pil_img.mode != "RGBA":
         pil_img = pil_img.convert("RGBA")
     if force_rgb:

README.md

Lines changed: 7 additions & 0 deletions

@@ -19,6 +19,10 @@ cd DataProcessingFramework
 pip install .
 ```
 
+Extra requirements: `filters`, `dev`, `llava`, `video_llava`
+
+To install extra requirements, run: `pip install .[filters]`
+
 ## Overview
 
 Framework supports following features:
@@ -31,6 +35,9 @@ Framework supports following features:
 
 DPF allows you to easily filter datasets and add new metadata.
 For example, the code below generates synthetic captions for images in shards on remote s3 storage and updates dataset metadata without downloading shards:
+
+Before running the example below, install the extra requirements: `pip install DPF[filters,llava]`
+
 ```python
 from DPF import S3Connector, DatasetReader, ShardsDatasetConfig

docs/filters.md

Lines changed: 132 additions & 1 deletion

@@ -86,5 +86,136 @@ You can find usage examples [there](../examples).
 
 ### Creating new filter
 
-TODO
+To add your filter, create a new filter class.
+If your filter uses only data from columns (e.g. the _text_ modality), inherit your class from the [ColumnFilter class](../DPF/filters/column_filter.py).
+If your filter uses data from files, inherit your class from the [DataFilter class](../DPF/filters/data_filter.py).
 
+#### Creating DataFilter
+
+To create a new datafilter, add a new file in the folder matching the modality your filter uses.
+For example, if your filter uses the _images_ modality, create a file in the [DPF/filters/images/](../DPF/filters/images) folder.
+If your filter uses the _texts_ and _images_ modalities, create a file in [DPF/filters/text2image/](../DPF/filters/text2image), and so on.
+
+Inherit your filter from the corresponding `DataFilter` class in the modality folder:
+- [DPF/filters/images/img_filter.py](../DPF/filters/images/img_filter.py) for _images_
+- [DPF/filters/text2image/t2i_filter.py](../DPF/filters/text2image/t2i_filter.py) for _texts_ and _images_
+- [DPF/filters/videos/video_filter.py](../DPF/filters/videos/video_filter.py) for _videos_
+
+Then implement the `result_columns` and `dataloader_kwargs` properties and the `preprocess_data` and `process_batch` methods:
+- `result_columns` - list of result columns that the filter adds to a DataFrame
+- `dataloader_kwargs` - parameters for the PyTorch dataloader
+- `preprocess_data` - method where data preprocessing is implemented. This method is passed to the dataloader, and preprocessing runs in multiple processes. Do not use CUDA operations in this method.
+- `process_batch` - method where a batch is processed with the model
+
+For more information, run:
+```python
+from DPF.filters import DataFilter
+help(DataFilter)
+```
+
+**Example of custom DataFilter:**
+```python
+from typing import Any
+
+from DPF.filters.images.img_filter import ImageFilter
+from DPF.types import ModalityToDataMapping
+
+class PHashFilter(ImageFilter):
+    def __init__(
+        self,
+        sim_hash_size: int = 8,
+        workers: int = 16,
+        pbar: bool = True,
+        _pbar_position: int = 0
+    ):
+        super().__init__(pbar, _pbar_position)
+        self.num_workers = workers
+        self.sim_hash_size = sim_hash_size
+
+    @property
+    def result_columns(self) -> list[str]:
+        return [f"image_phash_{self.sim_hash_size}"]
+
+    @property
+    def dataloader_kwargs(self) -> dict[str, Any]:
+        return {"num_workers": self.num_workers, "batch_size": 1, "drop_last": False}
+
+    def preprocess_data(
+        self,
+        modality2data: ModalityToDataMapping,
+        metadata: dict[str, Any]
+    ) -> Any:
+        key = metadata[self.key_column]
+        img_simhash = get_phash(
+            read_image_rgb_from_bytes(modality2data['image']),
+            hash_size=self.sim_hash_size
+        )
+        return key, img_simhash
+
+    def process_batch(self, batch: list[Any]) -> dict[str, list[Any]]:
+        df_batch_labels = self._get_dict_from_schema()
+
+        keys, img_simhashes = list(zip(*batch))
+        df_batch_labels[self.key_column].extend(keys)
+        df_batch_labels[f"image_phash_{self.sim_hash_size}"].extend(img_simhashes)
+
+        return df_batch_labels
+```
+
+This filter reads images and calculates PHash **in the dataloader**.
+The dataloader then returns PHash strings, and these strings are added to the result dataframe.
+
+#### Creating ColumnFilter
+
+To create a new columnfilter, add a new file in the folder matching the modality your filter uses.
+Inherit your class from the [ColumnFilter](../DPF/filters/column_filter.py) class.
+
+Then implement the `result_columns` and `columns_to_process` properties and the `process_sample` method:
+- `result_columns` - list of result columns that the filter adds to a DataFrame
+- `columns_to_process` - columns in the original dataframe used for processing. These columns are passed to the method.
+- `process_sample` - method that processes one sample of data
+
+For more information, run:
+```python
+from DPF.filters import ColumnFilter
+help(ColumnFilter)
+```
+
+**Example of custom ColumnFilter:**
+```python
+from typing import Any
+from py3langid.langid import MODEL_FILE, LanguageIdentifier
+from DPF.filters import ColumnFilter
+
+class LangFilter(ColumnFilter):
+    """
+    LangFilter class
+    """
+
+    def __init__(
+        self,
+        text_column_name: str = "text",
+        workers: int = 16,
+        pbar: bool = True
+    ):
+        super().__init__(workers, pbar)
+        self.lang_identifier = LanguageIdentifier.from_pickled_model(
+            MODEL_FILE, norm_probs=True
+        )
+        self.text_column_name = text_column_name
+
+    @property
+    def columns_to_process(self) -> list[str]:
+        return [self.text_column_name]
+
+    @property
+    def result_columns(self) -> list[str]:
+        return ["lang", "lang_score"]
+
+    def process_sample(self, sample: dict[str, Any]) -> list[Any]:
+        lg, score = self.lang_identifier.classify(sample[self.text_column_name])
+        return [lg, round(score, 2)]
+```
+
+This filter creates two new columns: `lang` and `lang_score`.
+It uses the text column to identify the language of each text.
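
A usage note on the `DataFilter` example above: the snippet calls `get_phash` and `read_image_rgb_from_bytes` without importing them; both live in this repository, in `DPF/filters/images/hash_filters.py` and `DPF/utils/image_utils.py` (the files touched earlier in this commit). Below is a hedged sketch of running the filter through the dataset processor, assuming the config/reader API shown in docs/processor.md and an `apply_data_filter` method on the processor; the method name, dataset path, and worker count are assumptions for illustration:

```python
from DPF import DatasetReader, ShardsDatasetConfig
# Imports the PHashFilter example above relies on:
from DPF.filters.images.hash_filters import get_phash
from DPF.utils.image_utils import read_image_rgb_from_bytes

# Assumes the PHashFilter class from the docs example above is defined here.
config = ShardsDatasetConfig.from_path_and_columns(
    'examples/example_dataset',   # example path, as in docs/processor.md
    image_name_col='image_name',
    text_col='caption'
)
processor = DatasetReader().read_from_config(config)

# Assumed method name: apply the datafilter; new columns land in processor.df
processor.apply_data_filter(PHashFilter(sim_hash_size=8, workers=4))
print(processor.df['image_phash_8'].head())
```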

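Likewise for the `ColumnFilter` example: a hypothetical application sketch, assuming an `apply_column_filter` counterpart to `apply_data_filter` (the method name is an assumption, not confirmed by this diff). Column filters read only DataFrame columns, so no file data needs to be downloaded:

```python
# Assumes LangFilter from the example above is defined and `processor`
# was created as in the previous sketch; py3langid must be installed.
processor.apply_column_filter(LangFilter(text_column_name='caption', workers=8))
print(processor.df[['lang', 'lang_score']].head())
```
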
docs/processor.md

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ Dataset processor supports following features:
 from DPF import ShardsDatasetConfig, DatasetReader
 
 config = ShardsDatasetConfig.from_path_and_columns(
-    'examples/example_dataset/',
+    'examples/example_dataset',
     image_name_col='image_name',
     text_col='caption'
 )
