import glob
import os


def buckets_count():
    """Return the bucket counts requested by this benchmark.

    The tuple is ``(1, 1)``; presumably (input buckets, output buckets),
    which matches ``generate_input`` using ``input_paths[0]`` and
    ``output_paths[0]`` -- confirm against the benchmark framework.
    """
    return (1, 1)


def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func):
    """Upload every ``*.jpg`` in *data_dir* and build the invocation payload.

    Args:
        data_dir: Directory searched (non-recursively) for ``*.jpg`` files.
        size: Unused by this benchmark; kept for interface compatibility.
        benchmarks_bucket: Bucket name stored in the returned config.
        input_paths: Sequence whose first element is the input prefix.
        output_paths: Sequence whose first element is the output prefix.
        upload_func: Callable invoked as ``upload_func(0, key, path)`` for
            each image, where *key* is the path relative to *data_dir*.

    Returns:
        A dict with ``object.key`` set to the last uploaded image (in sorted
        order) and ``bucket`` holding the bucket name and I/O prefixes.

    Raises:
        FileNotFoundError: If *data_dir* contains no ``*.jpg`` file.
    """
    # glob returns matches in arbitrary order; sort so the image chosen for
    # the benchmark is the same on every run and every filesystem.
    images = sorted(glob.glob(os.path.join(data_dir, '*.jpg')))
    if not images:
        # Previously this fell through to an UnboundLocalError on `img`.
        raise FileNotFoundError(f"no '*.jpg' images found in {data_dir!r}")

    key = None
    for path in images:
        key = os.path.relpath(path, data_dir)
        upload_func(0, key, path)

    return {
        'object': {'key': key},
        'bucket': {
            'bucket': benchmarks_bucket,
            'input': input_paths[0],
            'output': output_paths[0],
        },
    }
import datetime
import io
import os
import sys
import uuid
from urllib.parse import unquote_plus
from PIL import Image
import pytesseract

from . import storage
client = storage.storage.get_instance()


# Memory-based solution
def ocr_image(image_bytes):
    """Run Tesseract OCR on an in-memory encoded image and return the text.

    Args:
        image_bytes: Bytes-like object holding the encoded image; it is
            wrapped in ``io.BytesIO`` so nothing is written to disk.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    with Image.open(io.BytesIO(image_bytes)) as image:
        return pytesseract.image_to_string(image)


def handler(event):
    """Download an image, OCR it, upload the text, and report timings.

    Args:
        event: Mapping with ``event['bucket']`` providing ``bucket``,
            ``input`` and ``output`` entries, and ``event['object']['key']``
            naming the (URL-quoted) image object under the input prefix.

    Returns:
        A dict with ``result`` (bucket and the key returned by the storage
        client) and ``measurement`` (download/upload/compute times in
        microseconds, plus download and upload sizes in bytes).
    """
    bucket_info = event.get('bucket')
    bucket = bucket_info.get('bucket')
    input_prefix = bucket_info.get('input')
    output_prefix = bucket_info.get('output')
    key = unquote_plus(event.get('object').get('key'))

    download_begin = datetime.datetime.now()
    img = client.download_stream(bucket, os.path.join(input_prefix, key))
    download_end = datetime.datetime.now()

    process_begin = datetime.datetime.now()
    ocr_result = ocr_image(img)
    process_end = datetime.datetime.now()

    upload_begin = datetime.datetime.now()
    output_key = f"{os.path.splitext(key)[0]}_ocr.txt"
    # Encode once so the reported upload size is the real byte count; the
    # previous len(ocr_result) counted characters and under-reported any
    # non-ASCII text.
    payload = ocr_result.encode('utf-8')
    # Stream-upload APIs generally require a file-like object exposing
    # read(), so hand over a BytesIO rather than raw bytes.
    # NOTE(review): confirm against storage.upload_stream for each backend.
    key_name = client.upload_stream(
        bucket, os.path.join(output_prefix, output_key), io.BytesIO(payload)
    )
    upload_end = datetime.datetime.now()

    # Dividing a timedelta by one microsecond yields a float of microseconds.
    one_us = datetime.timedelta(microseconds=1)
    download_time = (download_end - download_begin) / one_us
    upload_time = (upload_end - upload_begin) / one_us
    process_time = (process_end - process_begin) / one_us
    return {
        'result': {
            'bucket': bucket,
            'key': key_name
        },
        'measurement': {
            'download_time': download_time,
            'download_size': len(img),
            'upload_time': upload_time,
            'upload_size': len(payload),
            'compute_time': process_time
        }
    }
+Pillow==9.0.0 +pytesseract==0.3.13