Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use ruff check to auto format code #160

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ jobs:
 - name: Set up Python 3.11
 uses: actions/setup-python@v4
 with:
-python-version: 3.11
+python-version: "3.11"
 - name: Install python dependencies
 run: |
-pip install poetry
+pip install poetry==1.8.3
 poetry install
 - name: Build package
 run: |
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
 - name: Set up Python 3.11
 uses: actions/setup-python@v4
 with:
-python-version: 3.11
+python-version: "3.11"
 - name: Install apt dependencies
 run: |
 sudo apt-get update
Expand All @@ -24,6 +24,9 @@ jobs:
 poetry install
 poetry remove torch
 poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
+- name: Run ruff lint for checking code style
+run: |
+poetry run ruff check --select I .
 - name: Run detection benchmark test
 run: |
 poetry run python benchmark/detection.py --max 2
Expand All @@ -40,6 +43,3 @@ jobs:
 run: |
 poetry run python benchmark/ordering.py --max 5
 poetry run python scripts/verify_benchmark_scores.py results/benchmark/order_bench/results.json --bench_type ordering
-
-
-
13 changes: 7 additions & 6 deletions benchmark/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,21 @@
import collections
import copy
import json
import os
import time

import datasets
from tabulate import tabulate

from surya.benchmark.bbox import get_pdf_lines
from surya.benchmark.metrics import precision_recall
from surya.benchmark.tesseract import tesseract_parallel
from surya.model.detection.model import load_model, load_processor
from surya.input.processing import open_pdf, get_page_images, convert_if_not_rgb
from surya.detection import batch_text_detection
from surya.input.processing import convert_if_not_rgb, get_page_images, open_pdf
from surya.model.detection.model import load_model, load_processor
from surya.postprocessing.heatmap import draw_polys_on_image
from surya.postprocessing.util import rescale_bbox
from surya.settings import settings
import os
import time
from tabulate import tabulate
import datasets


def main():
Expand Down
11 changes: 6 additions & 5 deletions benchmark/gcloud_label.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import argparse
import hashlib
import io
import json
import os
from collections import defaultdict

import datasets
from surya.settings import settings
from google.cloud import vision
import hashlib
import os
from tqdm import tqdm
import io

from surya.settings import settings

DATA_DIR = os.path.join(settings.BASE_DIR, settings.DATA_DIR)
RESULT_DIR = os.path.join(settings.BASE_DIR, settings.RESULT_DIR)
Expand Down Expand Up @@ -146,4 +147,4 @@ def main():


if __name__ == "__main__":
main()
main()
15 changes: 8 additions & 7 deletions benchmark/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,20 @@
import collections
import copy
import json
import os
import time

import datasets
from tabulate import tabulate

from surya.benchmark.metrics import precision_recall
from surya.detection import batch_text_detection
from surya.model.detection.model import load_model, load_processor
from surya.input.processing import open_pdf, get_page_images, convert_if_not_rgb
from surya.input.processing import convert_if_not_rgb, get_page_images, open_pdf
from surya.layout import batch_layout_detection
from surya.postprocessing.heatmap import draw_polys_on_image, draw_bboxes_on_image
from surya.model.detection.model import load_model, load_processor
from surya.postprocessing.heatmap import draw_bboxes_on_image, draw_polys_on_image
from surya.postprocessing.util import rescale_bbox
from surya.settings import settings
import os
import time
from tabulate import tabulate
import datasets


def main():
Expand Down
9 changes: 5 additions & 4 deletions benchmark/ordering.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@
import collections
import copy
import json
import os
import time

import datasets

from surya.benchmark.metrics import rank_accuracy
from surya.input.processing import convert_if_not_rgb
from surya.model.ordering.model import load_model
from surya.model.ordering.processor import load_processor
from surya.ordering import batch_ordering
from surya.settings import settings
from surya.benchmark.metrics import rank_accuracy
import os
import time
import datasets


def main():
Expand Down
3 changes: 2 additions & 1 deletion benchmark/profile.sh
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
-python -m cProfile -s time -o data/profile.pstats detect_text.py data/benchmark/nyt2.pdf --max 10
+#!/bin/bash
+python -m cProfile -s time -o data/profile.pstats detect_text.py data/benchmark/nyt2.pdf --max 10
3 changes: 1 addition & 2 deletions benchmark/pymupdf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
import os

from surya.benchmark.bbox import get_pdf_lines
from surya.input.processing import get_page_images, open_pdf
from surya.postprocessing.heatmap import draw_bboxes_on_image

from surya.input.processing import open_pdf, get_page_images
from surya.settings import settings


Expand Down
22 changes: 14 additions & 8 deletions benchmark/recognition.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
import argparse
import json
import os
import time
from collections import defaultdict

import datasets
import torch
from tabulate import tabulate

from benchmark.scoring import overlap_score
from surya.benchmark.tesseract import (
TESS_CODE_TO_LANGUAGE,
surya_lang_to_tesseract,
tesseract_ocr_parallel,
)
from surya.input.processing import convert_if_not_rgb
from surya.languages import CODE_TO_LANGUAGE
from surya.model.recognition.model import load_model as load_recognition_model
from surya.model.recognition.processor import load_processor as load_recognition_processor
from surya.model.recognition.processor import (
load_processor as load_recognition_processor,
)
from surya.ocr import run_recognition
from surya.postprocessing.text import draw_text_on_image
from surya.settings import settings
from surya.languages import CODE_TO_LANGUAGE
from surya.benchmark.tesseract import tesseract_ocr_parallel, surya_lang_to_tesseract, TESS_CODE_TO_LANGUAGE
import os
import datasets
import json
import time
from tabulate import tabulate

KEY_LANGUAGES = ["Chinese", "Spanish", "English", "Arabic", "Hindi", "Bengali", "Russian", "Japanese"]

Expand Down
2 changes: 1 addition & 1 deletion benchmark/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ def overlap_score(pred_lines: List[str], reference_lines: List[str]):
line_weights.append(line_weight)
line_scores = [line_scores[i] * line_weights[i] for i in range(len(line_scores))]

return sum(line_scores) / sum(line_weights)
return sum(line_scores) / sum(line_weights)
3 changes: 1 addition & 2 deletions benchmark/tesseract_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
import os

from surya.benchmark.tesseract import tesseract_bboxes
from surya.input.processing import get_page_images, open_pdf
from surya.postprocessing.heatmap import draw_bboxes_on_image

from surya.input.processing import open_pdf, get_page_images
from surya.settings import settings


Expand Down
3 changes: 2 additions & 1 deletion benchmark/viz.sh
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
-snakeviz data/profile.pstats
+#!/bin/bash
+snakeviz data/profile.pstats
4 changes: 2 additions & 2 deletions detect_layout.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import argparse
import copy
import json
import os
from collections import defaultdict

from surya.detection import batch_text_detection
from surya.input.load import load_from_folder, load_from_file
from surya.input.load import load_from_file, load_from_folder
from surya.layout import batch_layout_detection
from surya.model.detection.model import load_model, load_processor
from surya.postprocessing.heatmap import draw_polys_on_image
from surya.settings import settings
import os


def main():
Expand Down
9 changes: 5 additions & 4 deletions detect_text.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import argparse
import copy
import json
import os
import time
from collections import defaultdict

from surya.input.load import load_from_folder, load_from_file
from surya.model.detection.model import load_model, load_processor
from tqdm import tqdm

from surya.detection import batch_text_detection
from surya.input.load import load_from_file, load_from_folder
from surya.model.detection.model import load_model, load_processor
from surya.postprocessing.affinity import draw_lines_on_image
from surya.postprocessing.heatmap import draw_polys_on_image
from surya.settings import settings
import os
from tqdm import tqdm


def main():
Expand Down
18 changes: 10 additions & 8 deletions ocr_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,25 @@

import pypdfium2
import streamlit as st
from PIL import Image

from surya.detection import batch_text_detection
from surya.input.langs import replace_lang_with_code
from surya.languages import CODE_TO_LANGUAGE
from surya.layout import batch_layout_detection
from surya.model.detection.model import load_model, load_processor
from surya.model.ordering.model import load_model as load_order_model
from surya.model.ordering.processor import load_processor as load_order_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
from surya.model.ordering.processor import load_processor as load_order_processor
from surya.model.ordering.model import load_model as load_order_model
from surya.ocr import run_ocr
from surya.ordering import batch_ordering
from surya.postprocessing.heatmap import draw_polys_on_image
from surya.ocr import run_ocr
from surya.postprocessing.text import draw_text_on_image
from PIL import Image
from surya.languages import CODE_TO_LANGUAGE
from surya.input.langs import replace_lang_with_code
from surya.schema import OCRResult, TextDetectionResult, LayoutResult, OrderResult
from surya.schema import LayoutResult, OCRResult, OrderResult, TextDetectionResult
from surya.settings import settings


@st.cache_resource()
def load_det_cached():
checkpoint = settings.DETECTOR_MODEL_CHECKPOINT
Expand Down Expand Up @@ -181,4 +183,4 @@ def page_count(pdf_file):
st.json(pred.model_dump(), expanded=True)

with col2:
st.image(pil_image, caption="Uploaded Image", use_column_width=True)
st.image(pil_image, caption="Uploaded Image", use_column_width=True)
13 changes: 8 additions & 5 deletions ocr_text.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import os
import argparse
import json
import os
from collections import defaultdict

import torch

from surya.input.langs import replace_lang_with_code, get_unique_langs
from surya.input.load import load_from_folder, load_from_file, load_lang_file
from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
from surya.input.langs import get_unique_langs, replace_lang_with_code
from surya.input.load import load_from_file, load_from_folder, load_lang_file
from surya.model.detection.model import load_model as load_detection_model
from surya.model.detection.model import load_processor as load_detection_processor
from surya.model.recognition.model import load_model as load_recognition_model
from surya.model.recognition.processor import load_processor as load_recognition_processor
from surya.model.recognition.processor import (
load_processor as load_recognition_processor,
)
from surya.model.recognition.tokenizer import _tokenize
from surya.ocr import run_ocr
from surya.postprocessing.text import draw_text_on_image
Expand Down
Loading
Loading