Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add benchmark - ( 312.OCR ) #219

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions benchmarks/300.utilities/312.ocr/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"timeout": 60,
"memory": 256,
"languages": ["python", "nodejs"]
}
24 changes: 24 additions & 0 deletions benchmarks/300.utilities/312.ocr/init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash

# Succeed (status 0) when the name given as $1 resolves to a command in the
# current shell, fail (status 1) otherwise. Produces no output.
command_exists() {
    local candidate="$1"
    if command -v "$candidate" >/dev/null 2>&1; then
        return 0
    fi
    return 1
}

# Install the Tesseract OCR engine with the first supported package manager
# found (apt-get, yum, apk). The exit status of the install command is
# captured so that a failed installation aborts the script instead of being
# reported as a success.
if command_exists apt-get; then
    echo "Using apt package manager"
    # Refresh the package index first: slim images usually ship without one,
    # in which case a bare `apt-get install` cannot locate the package.
    apt-get update && apt-get install -y tesseract-ocr
    status=$?

elif command_exists yum; then
    echo "Using yum package manager"
    yum install -y tesseract
    status=$?

elif command_exists apk; then
    echo "Using apk package manager"
    # --no-cache fetches a fresh index for this call without storing it.
    apk add --no-cache tesseract-ocr
    status=$?

else
    # Diagnostics go to stderr so they are not mixed with normal output.
    echo "Error: No supported package manager found (apt, yum, or apk)" >&2
    exit 1
fi

if [ "$status" -ne 0 ]; then
    echo "Error: tesseract-ocr installation failed (exit status $status)" >&2
    exit "$status"
fi

echo "tesseract-ocr installation completed"
17 changes: 17 additions & 0 deletions benchmarks/300.utilities/312.ocr/input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import glob, os

def buckets_count():
    """Return the pair ``(1, 1)`` of bucket counts required by this benchmark.

    NOTE(review): presumably the pair is (input buckets, output buckets),
    matching the single input and output prefix used by generate_input --
    confirm against the benchmark framework.
    """
    counts = (1, 1)
    return counts

def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func):
    """Upload every ``*.jpg`` in *data_dir* and build the benchmark input config.

    Args:
        data_dir: Directory searched (non-recursively) for ``*.jpg`` files.
        size: Not used by this function.
        benchmarks_bucket: Bucket name stored in the returned config.
        input_paths: Sequence whose first element is recorded as the input prefix.
        output_paths: Sequence whose first element is recorded as the output prefix.
        upload_func: Callable invoked as ``upload_func(0, key, local_path)``
            once per image, where ``key`` is the path relative to *data_dir*.

    Returns:
        A dict ``{'object': {'key': ...}, 'bucket': {'bucket': ..., 'input': ...,
        'output': ...}}`` whose object key is the last uploaded image.

    Raises:
        FileNotFoundError: If *data_dir* contains no ``*.jpg`` file.
    """
    # glob returns matches in arbitrary order; sort so that the image chosen
    # as the benchmark input is the same on every run.
    image_paths = sorted(glob.glob(os.path.join(data_dir, '*.jpg')))
    if not image_paths:
        # Without this guard the key below would be unbound and the failure
        # would surface as an opaque UnboundLocalError.
        raise FileNotFoundError(f"no '*.jpg' images found in {data_dir!r}")

    key = None
    for path in image_paths:
        key = os.path.relpath(path, data_dir)
        upload_func(0, key, path)

    return {
        'object': {'key': key},
        'bucket': {
            'bucket': benchmarks_bucket,
            'input': input_paths[0],
            'output': output_paths[0],
        },
    }
53 changes: 53 additions & 0 deletions benchmarks/300.utilities/312.ocr/python/function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import datetime
import io
import os
import sys
import uuid
from urllib.parse import unquote_plus
from PIL import Image
import pytesseract

from . import storage
# Storage client obtained once at import time from the project-local
# `storage` module; handler() uses it for the download and the upload.
client = storage.storage.get_instance()

# In-memory approach: the image is decoded straight from the downloaded bytes.
def ocr_image(image_bytes):
    """Return the text pytesseract extracts from an encoded image held in memory.

    Args:
        image_bytes: Bytes-like object wrapped in ``io.BytesIO`` and opened
            with PIL.

    Returns:
        The result of ``pytesseract.image_to_string`` for the opened image.
    """
    buffer = io.BytesIO(image_bytes)
    with Image.open(buffer) as picture:
        return pytesseract.image_to_string(picture)

def handler(event):
    """Download an image, run OCR on it, upload the text and report timings.

    Args:
        event: Mapping with ``event['bucket']`` holding the keys ``'bucket'``,
            ``'input'`` and ``'output'``, and ``event['object']['key']``
            holding the (URL-quoted) name of the image to process.

    Returns:
        A dict with two entries:
        ``'result'`` -- the bucket and the key returned by the storage upload;
        ``'measurement'`` -- download/upload/compute times in microseconds and
        the download/upload sizes in bytes.
    """
    bucket_info = event.get('bucket')
    bucket = bucket_info.get('bucket')
    input_prefix = bucket_info.get('input')
    output_prefix = bucket_info.get('output')
    key = unquote_plus(event.get('object').get('key'))

    download_begin = datetime.datetime.now()
    img = client.download_stream(bucket, os.path.join(input_prefix, key))
    download_end = datetime.datetime.now()

    process_begin = datetime.datetime.now()
    ocr_result = ocr_image(img)
    process_end = datetime.datetime.now()

    upload_begin = datetime.datetime.now()
    output_key = f"{os.path.splitext(key)[0]}_ocr.txt"
    # Encode once so the reported upload size is the number of bytes actually
    # sent; the character count differs for any non-ASCII OCR output.
    payload = ocr_result.encode('utf-8')
    # NOTE(review): the payload is passed as raw bytes, as before -- confirm
    # that storage.upload_stream accepts bytes rather than a file-like object.
    key_name = client.upload_stream(bucket, os.path.join(output_prefix, output_key), payload)
    upload_end = datetime.datetime.now()

    # Dividing by a one-microsecond timedelta yields durations in microseconds.
    microsecond = datetime.timedelta(microseconds=1)
    download_time = (download_end - download_begin) / microsecond
    upload_time = (upload_end - upload_begin) / microsecond
    process_time = (process_end - process_begin) / microsecond
    return {
        'result': {
            'bucket': bucket,
            'key': key_name
        },
        'measurement': {
            'download_time': download_time,
            'download_size': len(img),
            'upload_time': upload_time,
            'upload_size': len(payload),
            'compute_time': process_time
        }
    }
2 changes: 2 additions & 0 deletions benchmarks/300.utilities/312.ocr/python/requirements.txt.3.8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Pillow==9.0.0
pytesseract==0.3.13