Commit d41dd7c

Patch issues with table rec

1 parent 2375929 commit d41dd7c

8 files changed (+42, -30 lines)

ocr_app.py (+1, -3)

@@ -3,10 +3,8 @@
 
 import pypdfium2
 import streamlit as st
-from pypdfium2 import PdfiumError
 
 from surya.detection import batch_text_detection
-from surya.input.pdflines import get_page_text_lines, get_table_blocks
 from surya.layout import batch_layout_detection
 from surya.model.detection.model import load_model, load_processor
 from surya.model.layout.model import load_model as load_layout_model
@@ -24,7 +22,7 @@
 from surya.schema import OCRResult, TextDetectionResult, LayoutResult, TableResult
 from surya.settings import settings
 from surya.tables import batch_table_recognition
-from surya.postprocessing.util import rescale_bboxes, rescale_bbox
+from surya.postprocessing.util import rescale_bbox
 
 
 @st.cache_resource()

surya/model/common/adetr/decoder.py (+1, -1)

@@ -374,7 +374,7 @@ def forward(
         # Do cross-attention on encoder outputs
         cross_attn_inputs = self.cross_pre_norm(hidden_states)
         cross_attn_path = self.cross_attn_block(
-            cross_attn_inputs, position_ids, encoder_hidden_states, attention_mask, encoder_attention_mask
+            cross_attn_inputs, encoder_hidden_states, attention_mask, encoder_attention_mask, use_cache=use_cache
        )
         hidden_states = cross_attn_path + hidden_states
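The cross-attention call now passes encoder_hidden_states as the second positional argument and forwards use_cache by keyword; the old argument list, which still included position_ids, apparently no longer lined up with the block it was calling. A minimal, hypothetical sketch of why keyword arguments are the safer pattern here (the signature below is invented for illustration and is not the real surya API):

import torch

# Hypothetical stand-in for the cross-attention block; the real signature lives
# in surya/model/common/adetr/decoder.py and may differ.
def cross_attn_block(hidden_states, encoder_hidden_states, attention_mask=None,
                     encoder_attention_mask=None, use_cache=False):
    # A real block would attend from hidden_states to encoder_hidden_states here.
    return hidden_states

x = torch.zeros(1, 4, 8)
enc = torch.zeros(1, 16, 8)

# Passing the trailing arguments by keyword means a stale extra positional
# argument (like position_ids above) cannot silently shift the rest.
out = cross_attn_block(x, enc, attention_mask=None, encoder_attention_mask=None, use_cache=True)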

surya/model/table_rec/decoder.py (+4, -1)

@@ -43,7 +43,10 @@ def __init__(self, config):
 
         self.config = config
 
-    def forward(self, boxes: torch.LongTensor):
+    def forward(self, boxes: torch.LongTensor, *args):
+        # Need to keep *args for compatibility with common decoder
+        boxes = boxes.to(torch.long).clamp(0, self.config.vocab_size)
+
         boxes_unbound = boxes.to(torch.long).unbind(dim=-1)
         cx, cy, w, h, xskew, yskew = boxes_unbound[self.component_idxs["bbox"][0]:self.component_idxs["bbox"][1]]
         category = boxes_unbound[self.component_idxs["category"][0]:self.component_idxs["category"][1]][0]
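The added clamp bounds incoming box values to the decoder's vocabulary range before they are unbound and embedded, so a stray out-of-range value cannot trigger an index error inside the embedding lookup. A minimal sketch of the failure mode it guards against (sizes and names below are made up, not the model's real configuration):

import torch
import torch.nn as nn

vocab_size = 1024
embed = nn.Embedding(vocab_size, 64)

boxes = torch.tensor([[5, 2000, -3]])                  # values outside [0, vocab_size - 1]
boxes = boxes.to(torch.long).clamp(0, vocab_size - 1)  # same idea as the clamp above
vectors = embed(boxes)                                 # safe after clamping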

surya/model/table_rec/encoderdecoder.py (-1)

@@ -25,7 +25,6 @@ def __init__(
         self,
         config: Optional[PretrainedConfig] = None,
         encoder: Optional[PreTrainedModel] = None,
-        text_encoder: Optional[PreTrainedModel] = None,
         decoder: Optional[PreTrainedModel] = None,
     ):
         # initialize with config

surya/model/table_rec/model.py (+2, -3)

@@ -1,7 +1,6 @@
 from surya.model.table_rec.encoder import DonutSwinModel
-from surya.model.table_rec.config import SuryaTableRecConfig, SuryaTableRecDecoderConfig, DonutSwinTableRecConfig, \
-    SuryaTableRecTextEncoderConfig
-from surya.model.table_rec.decoder import SuryaTableRecDecoder, SuryaTableRecTextEncoder
+from surya.model.table_rec.config import SuryaTableRecConfig, SuryaTableRecDecoderConfig, DonutSwinTableRecConfig
+from surya.model.table_rec.decoder import SuryaTableRecDecoder
 from surya.model.table_rec.encoderdecoder import TableRecEncoderDecoderModel
 from surya.settings import settings

surya/model/table_rec/shaper.py (+10, -6)

@@ -1,3 +1,4 @@
+import math
 from typing import List, Dict
 import numpy as np
 import torch
@@ -120,21 +121,24 @@ def convert_bbox_to_polygon(self, box, skew_scaler=BOX_DIM // 2, skew_min=.001):
         y1 = cy - height / 2
         x2 = cx + width / 2
         y2 = cy + height / 2
-        skew_x = torch.floor((box[4] - skew_scaler) / 2)
-        skew_y = torch.floor((box[5] - skew_scaler) / 2)
+        skew_x = math.floor((box[4] - skew_scaler) / 2)
+        skew_y = math.floor((box[5] - skew_scaler) / 2)
 
         # Ensures we don't get slightly warped boxes
         # Note that the values are later scaled, so this is in 1/1024 space
-        skew_x[torch.abs(skew_x) < skew_min] = 0
-        skew_y[torch.abs(skew_y) < skew_min] = 0
+        if abs(skew_x) < skew_min:
+            skew_x = 0
+
+        if abs(skew_y) < skew_min:
+            skew_y = 0
 
         polygon = [x1 - skew_x, y1 - skew_y, x2 - skew_x, y1 + skew_y, x2 + skew_x, y2 + skew_y, x1 + skew_x,
                    y2 - skew_y]
         poly = []
         for i in range(4):
             poly.append([
-                polygon[2 * i].item(),
-                polygon[2 * i + 1].item()
+                polygon[2 * i],
+                polygon[2 * i + 1]
             ])
         return poly
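convert_bbox_to_polygon now receives plain Python numbers rather than tensors, so torch.floor and boolean-mask assignment (which only work on tensors) are replaced with math.floor and ordinary conditionals, and the .item() calls are dropped. A standalone sketch of the new skew handling with invented inputs (BOX_DIM and the shaper's component layout are simplified away; the 512 default merely stands in for BOX_DIM // 2):

import math

def skew_components(box, skew_scaler=512, skew_min=0.001):
    # box holds plain Python floats, so math.floor / abs are used instead of
    # torch.floor and tensor masking.
    skew_x = math.floor((box[4] - skew_scaler) / 2)
    skew_y = math.floor((box[5] - skew_scaler) / 2)
    if abs(skew_x) < skew_min:
        skew_x = 0
    if abs(skew_y) < skew_min:
        skew_y = 0
    return skew_x, skew_y

print(skew_components([100, 100, 50, 20, 512.4, 515.0]))  # -> (0, 1)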

surya/tables.py (+24, -14)

@@ -57,18 +57,23 @@ def pad_to_batch_size(tensor: torch.Tensor, batch_size: int) -> torch.Tensor:
 
 def inference_loop(
     model: nn.Module,
-    processor: SuryaProcessor,
     encoder_hidden_states: torch.Tensor,
     batch_input_ids: torch.Tensor,
     current_batch_size: int,
+    batch_size: int
 ):
     shaper = LabelShaper()
-    max_batch_size = batch_input_ids.shape[0]
     batch_predictions = [[] for _ in range(current_batch_size)]
     max_tokens = settings.TABLE_REC_MAX_BOXES
     decoder_position_ids = torch.ones_like(batch_input_ids[0, :, 0], dtype=torch.int64, device=model.device).cumsum(
         0) - 1
+    inference_token_count = batch_input_ids.shape[1]
 
+    if settings.TABLE_REC_STATIC_CACHE:
+        encoder_hidden_states = pad_to_batch_size(encoder_hidden_states, batch_size)
+        batch_input_ids = pad_to_batch_size(batch_input_ids, batch_size)
+
+    model.decoder.model._setup_cache(model.config, batch_size, model.device, model.dtype)
 
     with torch.inference_mode():
         token_count = 0
@@ -94,15 +99,15 @@ def inference_loop(
                 for (k, kcount, mode) in BOX_PROPERTIES:
                     k_logits = return_dict["box_property_logits"][k][j, -1, :]
                     if mode == "classification":
-                        item = torch.argmax(k_logits, dim=-1).item()
+                        item = int(torch.argmax(k_logits, dim=-1).item())
                         if k == "category":
-                            done.append(item == processor.tokenizer.eos_id or item == processor.tokenizer.pad_id)
+                            done.append(item == model.decoder.config.eos_token_id or item == model.decoder.config.pad_token_id)
                         item -= SPECIAL_TOKENS
                         box_property[k] = item
                     elif mode == "regression":
                         if k == "bbox":
                             k_logits *= BOX_DIM
-                        box_property[k] = k_logits
+                        box_property[k] = k_logits.tolist()
                 box_properties.append(box_property)
 
             all_done = all_done | torch.tensor(done, dtype=torch.bool)
@@ -111,6 +116,7 @@
                 break
 
             batch_input_ids = torch.tensor(shaper.dict_to_labels(box_properties), dtype=torch.long).to(model.device)
+            batch_input_ids = batch_input_ids.unsqueeze(1)  # Add sequence length dimension
 
             for j, (box_property, status) in enumerate(zip(box_properties, all_done)):
                 if not status:
@@ -120,7 +126,7 @@ def inference_loop(
             inference_token_count = batch_input_ids.shape[1]
 
             if settings.TABLE_REC_STATIC_CACHE:
-                batch_input_ids = pad_to_batch_size(batch_input_ids, max_batch_size)
+                batch_input_ids = pad_to_batch_size(batch_input_ids, batch_size)
     return batch_predictions
 
 
@@ -156,19 +162,14 @@ def batch_table_recognition(images: List, model: TableRecEncoderDecoderModel, pr
     batch_input_ids = model_inputs["input_ids"].to(model.device)
     batch_pixel_values = torch.tensor(np.array(batch_pixel_values), dtype=model.dtype).to(model.device)
 
-    if settings.TABLE_REC_STATIC_CACHE:
-        batch_pixel_values = pad_to_batch_size(batch_pixel_values, batch_size)
-        batch_input_ids = pad_to_batch_size(batch_input_ids, batch_size)
-
-    model.decoder.model._setup_cache(model.config, batch_size, model.device, model.dtype)
-    model.text_encoder.model._setup_cache(model.config, batch_size, model.device, model.dtype)
     shaper = LabelShaper()
 
     # We only need to process each image once
     with torch.inference_mode():
         encoder_hidden_states = model.encoder(pixel_values=batch_pixel_values).last_hidden_state
 
-    row_predictions = inference_loop(model, processor, encoder_hidden_states, batch_input_ids, current_batch_size)
+    row_predictions = inference_loop(model, encoder_hidden_states, batch_input_ids, current_batch_size, batch_size)
+
     row_query_items = []
     row_encoder_hidden_states = []
     idx_map = []
@@ -186,7 +187,16 @@
 
     row_encoder_hidden_states = torch.stack(row_encoder_hidden_states)
     row_inputs = processor(images=None, query_items=row_query_items, convert_images=False)
-    cell_predictions = inference_loop(model, processor, row_encoder_hidden_states, row_inputs["input_ids"], len(row_query_items))
+    row_input_ids = row_inputs["input_ids"].to(model.device)
+    cell_predictions = []
+    for j in tqdm(range(0, len(images), batch_size), desc="Recognizing tables"):
+        cell_batch_hidden_states = row_encoder_hidden_states[j:j+batch_size]
+        cell_batch_input_ids = row_input_ids[j:j+batch_size]
+        cell_batch_size = len(cell_batch_input_ids)
+
+        cell_predictions.extend(
+            inference_loop(model, cell_batch_hidden_states, cell_batch_input_ids, cell_batch_size, batch_size)
+        )
 
     batch_predictions = []
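Two batching fixes land here: the static-cache padding and cache setup move from batch_table_recognition into inference_loop, and the cell pass is split into batch_size chunks instead of one call over every row query at once. A standalone sketch of that chunk-pad-trim pattern under a fixed-size (static) cache; pad_to_batch_size below is only a plausible reading of the helper the diff references, and all shapes are invented:

import torch

def pad_to_batch_size(tensor: torch.Tensor, batch_size: int) -> torch.Tensor:
    # Pad dim 0 up to batch_size so a statically allocated cache always sees
    # the same batch dimension.
    if tensor.shape[0] >= batch_size:
        return tensor
    pad = torch.zeros((batch_size - tensor.shape[0], *tensor.shape[1:]), dtype=tensor.dtype)
    return torch.cat([tensor, pad], dim=0)

queries = torch.randn(10, 4, 8)  # e.g. one query per detected table row
batch_size = 4

outputs = []
for start in range(0, len(queries), batch_size):
    chunk = queries[start:start + batch_size]
    real = len(chunk)                              # the last chunk may be short
    padded = pad_to_batch_size(chunk, batch_size)  # fixed shape for the static cache
    scores = padded.sum(dim=(1, 2))                # stand-in for the real decoder call
    outputs.extend(scores[:real].tolist())         # drop the padded rows again

print(len(outputs))  # 10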

table_recognition.py (-1)

@@ -1,4 +1,3 @@
-import pypdfium2 as pdfium # Needs to be on top to avoid warning
 import os
 import argparse
 import copy
