-
Notifications
You must be signed in to change notification settings - Fork 967
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #75 from VikParuchuri/dev
Add reading order model
- Loading branch information
Showing
34 changed files
with
1,462 additions
and
48 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import argparse | ||
import collections | ||
import copy | ||
import json | ||
|
||
from surya.benchmark.metrics import precision_recall | ||
from surya.model.ordering.model import load_model | ||
from surya.model.ordering.processor import load_processor | ||
from surya.postprocessing.heatmap import draw_bboxes_on_image | ||
from surya.ordering import batch_ordering | ||
from surya.settings import settings | ||
from surya.benchmark.metrics import rank_accuracy | ||
import os | ||
import time | ||
from tabulate import tabulate | ||
import datasets | ||
|
||
|
||
def main():
    """Benchmark the surya reading order model and save the results.

    Command-line arguments:
        --results_dir: directory under which an ``order_bench`` folder is
            created to hold ``results.json``.
        --max: optional cap on the number of dataset rows to evaluate.

    The function loads the ordering model and processor, runs
    ``batch_ordering`` over the images and bboxes of the dataset named by
    ``settings.ORDER_BENCH_DATASET_NAME``, scores every page with
    ``rank_accuracy`` and writes the timing, the mean accuracy and the
    per-page metrics to ``results.json``.
    """
    parser = argparse.ArgumentParser(description="Benchmark surya reading order model.")
    parser.add_argument("--results_dir", type=str, help="Path to directory to save benchmark results in.", default=os.path.join(settings.RESULT_DIR, "benchmark"))
    parser.add_argument("--max", type=int, help="Maximum number of images to run benchmark on.", default=None)
    args = parser.parse_args()

    model = load_model()
    processor = load_processor()

    pathname = "order_bench"
    # These have already been shuffled randomly, so sampling from the start is fine
    split = "train"
    if args.max is not None:
        split = f"train[:{args.max}]"
    dataset = datasets.load_dataset(settings.ORDER_BENCH_DATASET_NAME, split=split)
    images = [image.convert("RGB") for image in dataset["image"]]
    bboxes = list(dataset["bboxes"])
    # Read the labels column once instead of fetching a full row per page.
    all_labels = list(dataset["labels"])

    # Only the ordering call itself is timed.
    start = time.time()
    order_predictions = batch_ordering(images, bboxes, model, processor)
    surya_time = time.time() - start

    folder_name = os.path.basename(pathname).split(".")[0]
    result_path = os.path.join(args.results_dir, folder_name)
    os.makedirs(result_path, exist_ok=True)

    page_metrics = collections.OrderedDict()
    total_accuracy = 0
    for idx, order_pred in enumerate(order_predictions):
        pred_labels = [str(box.position) for box in order_pred.bboxes]
        labels = all_labels[idx]
        accuracy = rank_accuracy(pred_labels, labels)
        total_accuracy += accuracy
        page_metrics[idx] = {
            "accuracy": accuracy,
            "box_count": len(labels)
        }

    # Guard the averages so an empty selection (e.g. --max 0) does not raise
    # ZeroDivisionError.
    page_count = len(order_predictions)
    mean_accuracy = total_accuracy / page_count if page_count else 0
    time_per_image = surya_time / len(images) if images else 0.0

    out_data = {
        "time": surya_time,
        "mean_accuracy": mean_accuracy,
        "page_metrics": page_metrics
    }

    with open(os.path.join(result_path, "results.json"), "w+") as f:
        json.dump(out_data, f, indent=4)

    print(f"Mean accuracy is {mean_accuracy:.2f}.")
    print(f"Took {time_per_image:.2f} seconds per image, and {surya_time:.1f} seconds total.")
    print("Mean accuracy is the % of correct ranking pairs.")
    print(f"Wrote results to {result_path}")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
[tool.poetry] | ||
name = "surya-ocr" | ||
version = "0.3.0" | ||
description = "OCR, layout analysis, and line detection in 90+ languages" | ||
version = "0.4.0" | ||
description = "OCR, layout, reading order, and line detection in 90+ languages" | ||
authors = ["Vik Paruchuri <[email protected]>"] | ||
readme = "README.md" | ||
license = "GPL-3.0-or-later" | ||
|
@@ -15,7 +15,8 @@ include = [ | |
"ocr_text.py", | ||
"ocr_app.py", | ||
"run_ocr_app.py", | ||
"detect_layout.py" | ||
"detect_layout.py", | ||
"reading_order.py", | ||
] | ||
|
||
[tool.poetry.dependencies] | ||
|
@@ -48,6 +49,7 @@ surya_detect = "detect_text:main" | |
surya_ocr = "ocr_text:main" | ||
surya_layout = "detect_layout:main" | ||
surya_gui = "run_ocr_app:run_app" | ||
surya_order = "reading_order:main" | ||
|
||
[build-system] | ||
requires = ["poetry-core"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import argparse | ||
import copy | ||
import json | ||
from collections import defaultdict | ||
|
||
from surya.detection import batch_text_detection | ||
from surya.input.load import load_from_folder, load_from_file | ||
from surya.layout import batch_layout_detection | ||
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor | ||
from surya.model.ordering.model import load_model | ||
from surya.model.ordering.processor import load_processor | ||
from surya.ordering import batch_ordering | ||
from surya.postprocessing.heatmap import draw_polys_on_image | ||
from surya.settings import settings | ||
import os | ||
|
||
|
||
def main():
    """Find the reading order of the layout regions in a file or folder.

    Command-line arguments:
        input_path: pdf or image file, or a folder of them.
        --results_dir: directory under which a folder named after the input
            is created to hold the output.
        --max: optional cap on the number of pages to process.
        --images: also save one ``*_order.png`` per page with the ordered
            polygons drawn on it.

    The pipeline runs text line detection, then layout detection, then
    ``batch_ordering`` on the layout bboxes. Predictions are grouped by input
    name, each box is tagged with its layout label, boxes are sorted by
    ``position`` and everything is written to ``results.json``.
    """
    parser = argparse.ArgumentParser(description="Find reading order of an input file or folder (PDFs or image).")
    parser.add_argument("input_path", type=str, help="Path to pdf or image file or folder to find reading order in.")
    parser.add_argument("--results_dir", type=str, help="Path to directory to save reading order results in.", default=os.path.join(settings.RESULT_DIR, "surya"))
    parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None)
    parser.add_argument("--images", action="store_true", help="Save images of detected layout bboxes.", default=False)
    args = parser.parse_args()

    model = load_model()
    processor = load_processor()

    layout_model = load_det_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
    layout_processor = load_det_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)

    det_model = load_det_model()
    det_processor = load_det_processor()

    if os.path.isdir(args.input_path):
        images, names = load_from_folder(args.input_path, args.max)
        # abspath strips a trailing separator; without it basename("dir/")
        # is "" and results would be written straight into results_dir.
        folder_name = os.path.basename(os.path.abspath(args.input_path))
    else:
        images, names = load_from_file(args.input_path, args.max)
        folder_name = os.path.basename(args.input_path).split(".")[0]

    line_predictions = batch_text_detection(images, det_model, det_processor)
    layout_predictions = batch_layout_detection(images, layout_model, layout_processor, line_predictions)
    # One list of layout bboxes per page, in the order layout detection returned them.
    bboxes = [[box.bbox for box in layout_pred.bboxes] for layout_pred in layout_predictions]

    order_predictions = batch_ordering(images, bboxes, model, processor)
    result_path = os.path.join(args.results_dir, folder_name)
    os.makedirs(result_path, exist_ok=True)

    if args.images:
        for idx, (image, _layout_pred, order_pred, name) in enumerate(zip(images, layout_predictions, order_predictions, names)):
            polys = [box.polygon for box in order_pred.bboxes]
            labels = [str(box.position) for box in order_pred.bboxes]
            # Draw on a copy so the loaded page image is left untouched.
            bbox_image = draw_polys_on_image(polys, copy.deepcopy(image), labels=labels, label_font_size=20)
            bbox_image.save(os.path.join(result_path, f"{name}_{idx}_order.png"))

    predictions_by_page = defaultdict(list)
    for layout_pred, pred, name, _image in zip(layout_predictions, order_predictions, names, images):
        out_pred = pred.model_dump()
        # NOTE(review): relies on order boxes being in the same sequence as
        # the layout boxes they were built from - confirm in batch_ordering.
        for bbox, layout_bbox in zip(out_pred["bboxes"], layout_pred.bboxes):
            bbox["label"] = layout_bbox.label

        # Page numbers are 1-based and counted per input name.
        out_pred["page"] = len(predictions_by_page[name]) + 1
        predictions_by_page[name].append(out_pred)

    # Sort in reading order
    for page_list in predictions_by_page.values():
        for page_preds in page_list:
            page_preds["bboxes"] = sorted(page_preds["bboxes"], key=lambda x: x["position"])

    with open(os.path.join(result_path, "results.json"), "w+", encoding="utf-8") as f:
        json.dump(predictions_by_page, f, ensure_ascii=False)

    print(f"Wrote results to {result_path}")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.