Update to new text detection model

tarun-menta · tarun-menta · commit 30444a046596 · 2025-02-28T13:52:10.000-05:00
diff --git a/surya/common/polygon.py b/surya/common/polygon.py
@@ -86,6 +86,21 @@ def merge(self, other):
         y2 = max(self.bbox[3], other.bbox[3])
         self.polygon = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
 
+    def expand(self, x_margin: float, y_margin: float):
+        new_polygon = []
+        x_margin = x_margin * self.width
+        y_margin = y_margin * self.height
+        for idx, poly in enumerate(self.polygon):
+            if idx == 0:
+                new_polygon.append([poly[0] - x_margin, poly[1] - y_margin])
+            elif idx == 1:
+                new_polygon.append([poly[0] + x_margin, poly[1] - y_margin])
+            elif idx == 2:
+                new_polygon.append([poly[0] + x_margin, poly[1] + y_margin])
+            elif idx == 3:
+                new_polygon.append([poly[0] - x_margin, poly[1] + y_margin])
+        self.polygon = new_polygon
+
     def intersection_polygon(self, other) -> List[List[float]]:
         new_poly = []
         for i in range(4):
diff --git a/surya/detection/heatmap.py b/surya/detection/heatmap.py
@@ -122,6 +122,7 @@ def get_and_clean_boxes(textmap, processor_size, image_size, text_threshold=None
     bboxes = clean_boxes(bboxes)
     return bboxes
 
+
 def parallel_get_lines(preds, orig_sizes, include_maps=False):
     heatmap, affinity_map = preds
     heat_img, aff_img = None, None
@@ -143,18 +144,24 @@ def parallel_get_lines(preds, orig_sizes, include_maps=False):
     return result
 
 def parallel_get_boxes(preds, orig_sizes, include_maps=False):
-    heatmap, _ = preds
+    heatmap, affinity_map = preds
     heat_img, aff_img = None, None
+
     if include_maps:
         heat_img = Image.fromarray((heatmap * 255).astype(np.uint8))
+        aff_img = Image.fromarray((affinity_map * 255).astype(np.uint8))
     heatmap_size = list(reversed(heatmap.shape))
     bboxes = get_and_clean_boxes(heatmap, heatmap_size, orig_sizes)
+    for box in bboxes:
+        # Expand only non-vertical boxes; tall boxes (height >= 3x width) are skipped
+        if box.height<3*box.width:
+            box.expand(x_margin=0, y_margin=settings.DETECTOR_BOX_Y_EXPAND_MARGIN)
 
     result = TextDetectionResult(
         bboxes=bboxes,
         vertical_lines=[],
         heatmap=heat_img,
-        affinity_map=None,
+        affinity_map=aff_img,
         image_bbox=[0, 0, orig_sizes[0], orig_sizes[1]]
     )
     return result
diff --git a/surya/settings.py b/surya/settings.py
@@ -48,13 +48,14 @@ def TORCH_DEVICE_MODEL(self) -> str:
 
     # Text detection
     DETECTOR_BATCH_SIZE: Optional[int] = None # Defaults to 2 for CPU/MPS, 32 otherwise
-    DETECTOR_MODEL_CHECKPOINT: str = "s3://text_detection/2025_02_18"
+    DETECTOR_MODEL_CHECKPOINT: str = "s3://text_detection/2025_02_28"
     DETECTOR_BENCH_DATASET_NAME: str = "vikp/doclaynet_bench"
     DETECTOR_IMAGE_CHUNK_HEIGHT: int = 1400 # Height at which to slice images vertically
     DETECTOR_TEXT_THRESHOLD: float = 0.6 # Threshold for text detection (above this is considered text)
     DETECTOR_BLANK_THRESHOLD: float = 0.35 # Threshold for blank space (below this is considered blank)
     DETECTOR_POSTPROCESSING_CPU_WORKERS: int = min(8, os.cpu_count()) # Number of workers for postprocessing
     DETECTOR_MIN_PARALLEL_THRESH: int = 3 # Minimum number of images before we parallelize
+    DETECTOR_BOX_Y_EXPAND_MARGIN: float = 0.025  # Fraction of box height by which detected boxes are expanded at both the top and bottom
     COMPILE_DETECTOR: bool = False
 
     # Inline math detection