diff --git a/transcription/.gitignore b/transcription/.gitignore
new file mode 100644
index 00000000..e69de29b
diff --git a/transcription/app.py b/transcription/app.py
new file mode 100644
index 00000000..bff6161e
--- /dev/null
+++ b/transcription/app.py
@@ -0,0 +1,40 @@
+from flask import Flask, request, jsonify
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForCausalLM
+import torch
+
+app = Flask(__name__)
+
+# load model and processor once during init
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
+processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
+
+@app.route("/transcribe", methods=["POST"])
+def transcribe():
+    if "image" not in request.files:
+        return jsonify({"error": "No image file provided"}), 400
+
+    image_file = request.files["image"]
+    try:
+        # open and preprocess image
+        image = Image.open(image_file).convert("RGB")
+        prompt = "<OCR>"  # Florence-2 OCR task prompt
+        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
+        generated_ids = model.generate(
+            input_ids=inputs["input_ids"],
+            pixel_values=inputs["pixel_values"],
+            max_new_tokens=1024,
+            num_beams=3,
+            do_sample=False
+        )
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+
+        return jsonify({"transcription": generated_text})
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+if __name__ == "__main__":
+    app.run(debug=True)
\ No newline at end of file
diff --git a/transcription/requirements.txt b/transcription/requirements.txt
index 03feafad..d87174ed 100644
--- a/transcription/requirements.txt
+++ b/transcription/requirements.txt
@@ -5,3 +5,5 @@ torch==2.5.1
 transformers==4.46.1
 einops
 timm
+flask
+flask-cors
\ No newline at end of file
diff --git a/transcription/testing.py b/transcription/testing.py
index f569788a..499e0d66 100644
--- a/transcription/testing.py
+++ b/transcription/testing.py
@@ -33,7 +33,7 @@ def florence():
 
     prompt = "<OCR>"
 
-    url = "../assets/kkl.jpg"
+    url = "../assets/Filled_Logbook_page-0001.jpg"
 
     image = Image.open(url).convert("RGB")
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
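
A minimal sketch of how the new /transcribe route could be exercised once the Flask app from this diff is running. It assumes the server was started locally with python app.py on Flask's default port 5000, that the requests library is installed (it is not listed in requirements.txt), and it reuses the sample image path from testing.py:

# Hedged usage sketch: POST an image to the /transcribe endpoint and print the JSON reply.
# Assumes the server is running locally on Flask's default port 5000.
import requests

with open("../assets/Filled_Logbook_page-0001.jpg", "rb") as f:
    response = requests.post("http://127.0.0.1:5000/transcribe", files={"image": f})

# Expected shape: {"transcription": "..."} on success, {"error": "..."} with status 400/500 otherwise.
print(response.status_code, response.json())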