WIP: multimodal support #227

Draft: wants to merge 83 commits into main from soham/pixtral-support

Commits (83)

7709e65
WIP: multimodal support
sohamparikh Apr 8, 2025
0db2bd2
rough idea for memmap
sohamparikh Apr 9, 2025
0d89f68
faster image size reading
sohamparikh Apr 15, 2025
3866a53
solidify prepare
sohamparikh Apr 21, 2025
8413983
wip
sohamparikh Apr 24, 2025
6521e41
vision model
sohamparikh Apr 24, 2025
daf586f
wip
sohamparikh Apr 24, 2025
ef4488d
wip
sohamparikh Apr 25, 2025
6d9d595
missing files
sohamparikh Apr 28, 2025
6cb8f5d
make it work, barely
sohamparikh Apr 30, 2025
5761a2d
fix
sohamparikh Apr 30, 2025
d45d600
fixes
sohamparikh May 1, 2025
74a99b8
changes
sohamparikh May 6, 2025
99ad5d9
patches and fixes
sohamparikh May 7, 2025
bcb557a
fix dependency
sohamparikh May 7, 2025
a6f5364
remove for testing
sohamparikh May 7, 2025
73b431b
mising
sohamparikh May 7, 2025
6d65676
fix
sohamparikh May 8, 2025
46aefc1
Merge branch 'main' into soham/pixtral-support
sohamparikh May 9, 2025
66e7081
fixes
sohamparikh May 9, 2025
7f86a7f
fix
sohamparikh May 12, 2025
3a8a99d
more fixes after merge
sohamparikh May 12, 2025
d16284e
conv cleanup
sohamparikh May 12, 2025
b3134aa
more conv cleanup
sohamparikh May 12, 2025
c8aa66e
images + loss-masks
sohamparikh May 13, 2025
0baae59
minor fixes
sohamparikh May 13, 2025
48855be
cleanup
sohamparikh May 13, 2025
f35e003
cleanup
sohamparikh May 13, 2025
4eb34cb
cleanup
sohamparikh May 13, 2025
ebb9e27
cleanup
sohamparikh May 13, 2025
51098ef
fix
sohamparikh May 13, 2025
60b87fa
prepare cleanup
sohamparikh May 13, 2025
f8a5532
slightly better conversion
sohamparikh May 13, 2025
490651e
cleanup, sequence parallelism
sohamparikh May 14, 2025
24e1b83
fix conv
sohamparikh May 14, 2025
0f1612a
wip fixes
sohamparikh May 14, 2025
2e48c5f
fix
sohamparikh May 14, 2025
d529d37
fix image position
sohamparikh May 17, 2025
3c22dda
cleanup
sohamparikh May 17, 2025
f0c8d83
cleanup
sohamparikh May 20, 2025
ca33ee8
cleaner, extensible multimodal config
sohamparikh May 21, 2025
f3a4a74
cleanup
sohamparikh May 21, 2025
3b955b1
fixes for pixtral
sohamparikh May 21, 2025
49daf58
model fixes
sohamparikh May 21, 2025
b5ed9f4
more cleanup
sohamparikh May 22, 2025
dc888c8
image break token in sampling
sohamparikh May 22, 2025
af3e2db
minor fixes
sohamparikh May 23, 2025
6d56be0
fix img break
sohamparikh May 24, 2025
ce91646
fixes
sohamparikh May 27, 2025
204b3e9
fix image embeddings offset
sohamparikh May 28, 2025
fd08eac
heterogeneous data fixes
sohamparikh May 29, 2025
1e3652a
convert to rgb
sohamparikh May 29, 2025
2aabf35
fix sequence parallel image patches
sohamparikh May 30, 2025
b6d4858
fixes
sohamparikh May 31, 2025
25a650b
no compile for embeddings
sohamparikh May 31, 2025
c904da5
fix sampling
sohamparikh Jun 1, 2025
7a4701c
sampling and preprocessing bugs
sohamparikh Jun 2, 2025
067f901
speed up sampling
sohamparikh Jun 2, 2025
f24325e
cap image size reduction
sohamparikh Jun 2, 2025
0f37664
fix span offset with images
sohamparikh Jun 2, 2025
ff8fecc
fix span offset with images
sohamparikh Jun 2, 2025
c663cbb
move image logic to sampled
sohamparikh Jun 3, 2025
f52f02b
cleanup
sohamparikh Jun 3, 2025
5436357
merge main
sohamparikh Jun 4, 2025
02f6d8f
cleanup
sohamparikh Jun 5, 2025
6843129
jpeg dependency
sohamparikh Jun 5, 2025
b94b1ee
install libjpeg-dev in gh actions
sohamparikh Jun 5, 2025
9e4f14f
fix sampling test
sohamparikh Jun 5, 2025
d1c804f
fix
sohamparikh Jun 6, 2025
75d64a6
fix data cache reloading
sohamparikh Jun 9, 2025
cba6986
fix tokenization
sohamparikh Jun 9, 2025
275fefa
pixtral SFT (#296)
shruthan Jun 11, 2025
605cc7f
review comments
sohamparikh Jun 11, 2025
06aa740
simplified tokenization with spans
sohamparikh Jun 12, 2025
30e3d34
Update fast_llm/data/preparator/gpt_memmap/prepare.py
sohamparikh Jun 12, 2025
c1aa709
rename
sohamparikh Jun 12, 2025
0ada42b
Merge branch 'soham/pixtral-support' of github.com:ServiceNow/Fast-LL…
sohamparikh Jun 12, 2025
4e7afd8
merge main
sohamparikh Jun 12, 2025
8e106f7
fix conversion
sohamparikh Jun 12, 2025
080dcb5
fix sequence lengths, parallel conv
sohamparikh Jun 16, 2025
f186868
minor
sohamparikh Jun 16, 2025
6b9ea2e
fix image at beginning
sohamparikh Jun 16, 2025
ad18ea1
pixtral fix conversion (#315)
RaymondLi0 Jun 20, 2025

Files changed

1 change: 1 addition & 0 deletions .github/workflows/ci.yaml
@@ -27,6 +27,7 @@ jobs:

- name: Install dependencies
run: |
sudo apt install libjpeg-dev
pip install "torch>=2.2.2"
pip install pybind11
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]"
2 changes: 2 additions & 0 deletions .github/workflows/docs.yaml
@@ -29,6 +29,7 @@ jobs:
restore-keys: |
mkdocs-material-
- run: |
sudo apt install libjpeg-dev
pip install "torch>=2.2.2"
pip install pybind11
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \
@@ -56,6 +57,7 @@ jobs:
restore-keys: |
mkdocs-material-
- run: |
sudo apt install libjpeg-dev
pip install "torch>=2.2.2"
pip install pybind11
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \
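Both workflow files gain the same libjpeg-dev install ahead of the Python dependencies, matching the "jpeg dependency" and "install libjpeg-dev in gh actions" commits. The PR does not say which library consumes it; assuming it is Pillow built against libjpeg (an assumption, not confirmed by the diff), a sanity check like the following could confirm JPEG decoding works after install:

```python
# Hypothetical post-install check that JPEG decoding is available, assuming the
# image pipeline relies on Pillow built against libjpeg (not stated in the PR).
from PIL import features

if not features.check("jpg"):
    raise RuntimeError("Pillow lacks libjpeg support; install libjpeg-dev and reinstall Pillow.")
```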
18 changes: 18 additions & 0 deletions fast_llm/data/data/gpt/data.py
@@ -32,6 +32,8 @@ class GPTBatch:
token_ids: torch.Tensor
loss_masking_spans: list[torch.Tensor] | None = None
sequence_lengths: list[torch.Tensor] | None = None
images: list[torch.Tensor] | None = None
image_positions: list[torch.Tensor] | None = None
chosen_spans: list[torch.Tensor] | None = None
rejected_spans: list[torch.Tensor] | None = None

@@ -49,12 +51,28 @@ def gpt_data_collate_fn(batch: list[GPTSample], sampling_parameters: GPTSampling
stacked_rejected_spans = [torch.from_numpy(sample.rejected_span) for sample in batch]
if not sampling_parameters.cross_document_attention:
sequence_lengths = [torch.tensor(sample.sequence_lengths) for sample in batch]
has_images = False
batch_images = []
for sample in batch:
if sample.images is not None:
batch_images.append([torch.from_numpy(image) for image in sample.images])
has_images = True
else:
batch_images.append([])
batch_image_positions = []
for sample in batch:
if sample.image_positions is not None:
batch_image_positions.append(torch.from_numpy(sample.image_positions))
else:
batch_image_positions.append([])
return GPTBatch(
token_ids=torch.from_numpy(stacked_ids),
loss_masking_spans=stacked_spans,
sequence_lengths=sequence_lengths,
chosen_spans=stacked_chosen_spans,
rejected_spans=stacked_rejected_spans,
images=batch_images if has_images else None,
image_positions=batch_image_positions if has_images else None,
)


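For reference, the image-collation behavior added to gpt_data_collate_fn can be read as the following standalone sketch. It is not the PR's code verbatim; samples here are any objects exposing the .images and .image_positions fields used in the diff.

```python
# Condensed sketch of the image-collation logic added to gpt_data_collate_fn.
# `.images` is a list of numpy arrays or None; `.image_positions` is a numpy array or None.
import torch

def collate_images(samples) -> tuple[list | None, list | None]:
    has_images = False
    batch_images, batch_image_positions = [], []
    for sample in samples:
        if sample.images is not None:
            # Images stay as ragged per-sample lists: counts and resolutions differ per sample.
            batch_images.append([torch.from_numpy(image) for image in sample.images])
            has_images = True
        else:
            batch_images.append([])
        if sample.image_positions is not None:
            batch_image_positions.append(torch.from_numpy(sample.image_positions))
        else:
            batch_image_positions.append([])
    # Text-only batches collapse both fields to None, matching GPTBatch's defaults.
    return (batch_images, batch_image_positions) if has_images else (None, None)
```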
13 changes: 12 additions & 1 deletion fast_llm/data/dataset/gpt/config.py
@@ -75,6 +75,10 @@ class GPTSamplingParameters(SamplingParameters):
use_loss_masking_spans: bool = False
use_preference_loss_spans: bool = False
cross_document_attention: bool = True
patch_size: int | None = None
max_image_size: int | None = None
image_break_token: int | None = None
image_end_token: int | None = None
# How many extra tokens to add to the sequence length.
# This is used to provide labels even for the last tokens in the sequence.
extra_tokens: int = 1
@@ -142,11 +146,18 @@ class GPTMemmapDatasetConfig(GPTIndexedDatasetConfig):
desc="Expected number of tokens in the dataset.",
hint=FieldHint.optional,
)
num_pixels: int | None = Field(
default=None,
desc="Expected number of pixels in the dataset.",
hint=FieldHint.optional,
)

def build(self) -> "GPTMemmapDataset":
from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset

return GPTMemmapDataset(str(self.path).replace("/", "__"), self.path, self.num_documents, self.num_tokens)
return GPTMemmapDataset(
str(self.path).replace("/", "__"), self.path, self.num_documents, self.num_tokens, self.num_pixels
)


@config_class(dynamic_type={GPTSampledDatasetConfig: "concatenated"})
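The new sampling parameters (patch_size, max_image_size, image_break_token, image_end_token) imply that each image consumes a token budget during sampling. One plausible accounting, assuming a Pixtral-style layout with a break token after every patch row and an end token closing the image, is sketched below; the PR's actual formula is not visible in this hunk.

```python
# Hypothetical per-image token budget under a Pixtral-style layout (assumption, not from the diff):
# one token per patch, one image-break token after each row, and an image-end token
# taking the place of the final row's break.
def image_token_count(height: int, width: int, patch_size: int) -> int:
    rows = -(-height // patch_size)   # ceil division, in case sizes are not exact multiples
    cols = -(-width // patch_size)
    return rows * cols + rows         # patches plus one break/end token per row

# Example: a 1024x1024 image with 16x16 patches reserves 64*64 + 64 = 4160 tokens.
print(image_token_count(1024, 1024, 16))  # 4160
```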
15 changes: 14 additions & 1 deletion fast_llm/data/dataset/gpt/indexed.py
@@ -34,6 +34,14 @@ def sample(self, sampling: GPTSamplingData) -> "GPTSampledIndexedDataset":
else GPTSampledIndexedDataset(self, sampling)
)

@property
@abc.abstractmethod
def has_images(self) -> bool:
"""
Whether the dataset contains images.
This is used to determine whether to use image-related fields in the sampled data.
"""


class GPTDatasetSlice[IndexedDatasetType: GPTIndexedDataset](DatasetSlice[IndexedDatasetType], GPTIndexedDataset):
"""
@@ -44,11 +52,16 @@ class GPTDatasetSlice[IndexedDatasetType: GPTIndexedDataset](DatasetSlice[Indexe

def get_document_sizes(self) -> np.ndarray:
# TODO: This can be really big.
return self._dataset.get_document_sizes()[self._begin : self._end]
doc_sizes, im_sizes = self._dataset.get_document_sizes()
return doc_sizes[self._begin : self._end], im_sizes[self._begin : self._end] if im_sizes else []

def get_document_size(self, index: int) -> int:
return self._dataset.get_document_size(self._begin + index)

@property
def has_images(self) -> bool:
return self._dataset.has_images


class GPTConcatenatedDataset[IndexedDatasetType: GPTIndexedDataset](
ConcatenatedDataset[IndexedDatasetType], GPTIndexedDataset
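
With get_document_sizes now returning a (token sizes, image sizes) pair and has_images required on indexed datasets, a minimal concrete implementation of the contract could look like the sketch below (hypothetical class and field names, not from the PR).

```python
# Illustrative in-memory dataset satisfying the updated GPTIndexedDataset contract;
# only the pieces touched by this diff are sketched.
import numpy as np

class ToyIndexedDataset:
    def __init__(self, token_sizes: np.ndarray, image_sizes: list[list[tuple[int, int]]] | None = None):
        self._token_sizes = token_sizes            # tokens per document
        self._image_sizes = image_sizes or []      # per-document list of (height, width) pairs

    @property
    def has_images(self) -> bool:
        # Mirrors the new abstract property: lets samplers skip image fields entirely.
        return len(self._image_sizes) > 0

    def get_document_sizes(self) -> tuple[np.ndarray, list]:
        # Pair of (token sizes, image sizes), the second empty for text-only data,
        # matching how GPTDatasetSlice slices and forwards both.
        return self._token_sizes, self._image_sizes

    def get_document_size(self, index: int) -> int:
        return int(self._token_sizes[index])
```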