Merge pull request #270 from VikParuchuri/dev

Update layout model
VikParuchuri · Dec 30, 2024 · ac03917
2 parents 0774cef + 76754bc
commit ac03917
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 5 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.8.1"
+version = "0.8.2"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"

diff --git a/surya/layout.py b/surya/layout.py
@@ -181,7 +181,10 @@ def batch_layout_detection(images: List, model, processor, batch_size=None, top_
                                 prediction["pause_tokens"] = last_prediction["pause_tokens"]
                                 prediction["token"].fill_(model.decoder.config.pause_token_id)
                                 batch_decoder_input[j, :] = model.decoder.config.pause_token_id
-                        elif intersects_other_boxes(prediction["polygon"], [p["polygon"] for p in batch_predictions[j]], thresh=.4):
+                        elif intersects_other_boxes(
+                                prediction["polygon"],
+                                [p["polygon"] for p in batch_predictions[j]], thresh=.4
+                        ) and model.decoder.config.max_pause_tokens > 0:
                             prediction["paused"] = True
                             prediction["pause_tokens"] = 1
                             prediction["token"].fill_(model.decoder.config.pause_token_id)

diff --git a/surya/model/layout/config.py b/surya/model/layout/config.py
@@ -173,8 +173,8 @@ def __init__(
         aux_heads=0, # How many n-token-ahead heads to add
         causal=True,
         layer_norm_eps=1e-5,
-        pause_token_count=5,
-        max_pause_tokens=3,
+        pause_token_count=0,
+        max_pause_tokens=0,
         **kwargs,
     ):
         self.num_hidden_layers = num_hidden_layers

diff --git a/surya/settings.py b/surya/settings.py
@@ -65,7 +65,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
     RECOGNITION_ENCODER_BATCH_DIVISOR: int = 1 # Divisor for batch size in decoder
 
     # Layout
-    LAYOUT_MODEL_CHECKPOINT: str = "datalab-to/surya_layout0"
+    LAYOUT_MODEL_CHECKPOINT: str = "datalab-to/surya_layout"
     LAYOUT_IMAGE_SIZE: Dict = {"height": 768, "width": 768}
     LAYOUT_SLICE_MIN: Dict = {"height": 1500, "width": 1500} # When to start slicing images
     LAYOUT_SLICE_SIZE: Dict = {"height": 1200, "width": 1200} # Size of slices