Gen2-OCR with Differentiable Binarization #214

Open · wants to merge 2 commits into master
7 changes: 7 additions & 0 deletions gen2-ocr-3d-db/.gitignore
@@ -0,0 +1,7 @@
models-old/
.idea/
__pycache__/
codec.py
east.py
main.py
main_no_depth_dev.py
66 changes: 66 additions & 0 deletions gen2-ocr-3d-db/README.md
@@ -0,0 +1,66 @@
## [Gen2] Text Detection with Differentiable Binarization + Optical Character Recognition (OCR) Pipeline

This pipeline implements text detection with differentiable binarization, followed by optical character recognition (OCR) of the detected text. The model is taken from the [OpenCV Zoo](https://github.com/opencv/opencv_zoo) and converted to a blob that can run on OAK-D devices.
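
The conversion itself is not part of this repo; if you want to reproduce it, Luxonis' `blobconverter` package can compile a model into a `.blob`. A minimal sketch, assuming an ONNX export of the detector (the file name here is hypothetical):

```python
import blobconverter  # pip install blobconverter (not in requirements.txt)

# compile an ONNX export into a MyriadX blob; 6 shaves matches the
# "...6shave.blob" naming used in main_db.py; the model path is illustrative
blob_path = blobconverter.from_onnx(
    model="text_detection_db.onnx",
    data_type="FP16",
    shaves=6,
)
print(blob_path)
```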

The input shape of the detection network is 320x320, and the first stage returns a confidence map. Bounding boxes are then extracted from the map and forwarded to the second stage, which expects an input of 32x100 (HxW).
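
For context, the "differentiable binarization" in the name refers to how the detector was trained: the hard binarization step is approximated with a steep sigmoid so that the threshold map can be learned jointly with the probability map. A sketch of that approximation (k = 50, as in the DB paper; this is background on the model, not code this example runs):

```python
import numpy as np

def approximate_binarization(prob_map, thresh_map, k=50):
    # differentiable stand-in for the hard step "prob_map > thresh_map"
    return 1.0 / (1.0 + np.exp(-k * (prob_map - thresh_map)))
```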

![Example](imgs/example.gif)

### Important notes

* Rotated text is currently not supported, and recognition performance on it will be poor. This example will be updated once the ImageManip node is updated to correctly crop the area. For now, you can manually shift the center of the crop area to suit your scene by changing these two lines:

```python
# create rr for image manip
rr = dai.RotatedRect()
rr.center.x = cx + 15  # manual offset so the crop is centered (possibly an ImageManip bug)
rr.center.y = cy
rr.size.width = width * 1.2
rr.size.height = height  # * 1.05
rr.angle = 0
```

* Support for 3D text recognition is not yet implemented; it will be added once the ImageManip node is updated.

* The default bitmap threshold is set higher for this example. If you lower it, consider lowering `UNCLIP_RATIO` in the code as well; otherwise the crops sent to the recognition stage will be bigger than the detected area.
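
For reference, the unclip step in standard DB post-processing dilates each detected polygon by `offset = area * unclip_ratio / perimeter`, which is why a larger ratio produces larger crops. A minimal sketch of that step, mirroring the common DB reference implementation (the actual decoding here happens in `utils.get_boxes`, whose internals are not shown in this diff):

```python
import numpy as np
import pyclipper
from shapely.geometry import Polygon

def unclip(box, unclip_ratio):
    # dilate the quadrilateral by area * ratio / perimeter
    poly = Polygon(box)
    distance = poly.area * unclip_ratio / poly.length
    offset = pyclipper.PyclipperOffset()
    offset.AddPath([(int(x), int(y)) for x, y in box],
                   pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    return np.array(offset.Execute(distance))
```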

* This example also includes regex filtering and parsing of the raised amount shown in the GIF above. The regex looks for an `s` followed by six digits, likely because the recognizer's alphanumeric alphabet has no `$` symbol, so the dollar sign tends to be read as `s`. If you wish to change or remove this, search for the following code in `main_db.py`:

```python
r = re.compile('s[0-9]{6}')  # '$' tends to be recognized as 's'
raised_list = list(filter(r.match, texts))
print(raised_list)
if len(raised_list) > 0:
    raised_amount = int(raised_list[0][1:])
    raised_text = f"Raised: ${raised_amount}"
    (w, h), _ = cv2.getTextSize(raised_text, cv2.FONT_HERSHEY_DUPLEX, 0.5, 1)
    cv2.rectangle(frame, (160 - w//2 - 5, 0), (160 + w//2 + 5, h + 15), color_white, -1)
    cv2.putText(frame, raised_text, (160 - w//2, 0 + h + 10), cv2.FONT_HERSHEY_DUPLEX,
                0.5, (100, 100, 100) if raised_amount < 500000 else (255, 0, 255))
```

* Right now words are not separated when passed to the 2nd stage, so long text will likely not be recognized correctly.

## Pre-requisites

1. Purchase a DepthAI (or OAK) model (see [shop.luxonis.com](https://shop.luxonis.com/)).

2. Install the requirements:
```
python3 -m pip install -r requirements.txt
```
*Note: if you are using Windows and have problems installing the `shapely` library, consider using a Conda environment.*
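
For example (the environment name is arbitrary):

```
conda create -n depthai-ocr python=3.9
conda activate depthai-ocr
conda install -c conda-forge shapely
python3 -m pip install -r requirements.txt
```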

## Usage

```
python3 main_db.py [options]
```

Options:

* `-bt, --box_thresh`: Set the box confidence threshold. Default: *0.3*.
* `-t, --thresh`: Set the bitmap threshold. Default: *0.6*.
* `-ms, --min_size`: Set the minimum box size (area). Default: *2*.
* `-mc, --max_candidates`: Set the maximum number of returned box candidates. Default: *50*.
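
For example, to accept lower-confidence boxes and lower the bitmap threshold:

```
python3 main_db.py --box_thresh 0.2 --thresh 0.5
```

(As noted above, lowering `--thresh` may call for a smaller `UNCLIP_RATIO` in the code.)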

Binary file added gen2-ocr-3d-db/imgs/example.gif
232 changes: 232 additions & 0 deletions gen2-ocr-3d-db/main_db.py
@@ -0,0 +1,232 @@
import depthai as dai
import cv2
import time
import numpy as np
import re
from utils import get_boxes, postprocess
import argparse

# --------------- Arguments ---------------
parser = argparse.ArgumentParser()
parser.add_argument("-bt", "--box_thresh", help="set the confidence threshold of boxes", default=0.3, type=float)
parser.add_argument("-t", "--thresh", help="set the bitmap threshold", default=0.6, type=float)
parser.add_argument("-ms", "--min_size", default=2, type=int, help='set min size of box')
parser.add_argument("-mc", "--max_candidates", default=50, type=int, help='maximum number of candidate boxes')


args = parser.parse_args()

MAX_CANDIDATES = args.max_candidates
MIN_SIZE = args.min_size
BOX_THRESH = args.box_thresh
THRESH = args.thresh
UNCLIP_RATIO = 4  # use a large unclip ratio because the bitmap threshold is high

PREVIEW_W, PREVIEW_H = 320, 320
color_black, color_white = (0, 0, 0), (255, 255, 255)  # defined up front; both overlay blocks below use these

def create_pipeline():
    pipeline = dai.Pipeline()
    pipeline.setOpenVINOVersion(version=dai.OpenVINO.VERSION_2021_4)

    # ------ Create a camera ------
    cam = pipeline.createColorCamera()
    cam.setPreviewSize(PREVIEW_W, PREVIEW_H)
    cam.setInterleaved(False)
    cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
    cam.setColorOrder(dai.ColorCameraProperties.ColorOrder.RGB)
    cam.setPreviewKeepAspectRatio(True)
    cam.setFps(5)
    # ------------------------

    # ------ Image Manip (resize for detection) ------
    manip = pipeline.createImageManip()
    manip.initialConfig.setResize(PREVIEW_W, PREVIEW_H)
    manip.initialConfig.setKeepAspectRatio(False)
    manip.initialConfig.setFrameType(dai.ImgFrame.Type.BGR888p)
    # ------------------------

    cam.preview.link(manip.inputImage)

    # ------ Neural Network (text detection) ------
    nn = pipeline.createNeuralNetwork()
    nn.setBlobPath("models/text_detection_db_320x320_openvino_2021.4_6shave.blob")
    # ------------------------

    manip.out.link(nn.input)

    # ------ Image Manip (crops for recognition); image and config are fed from the host ------
    manip_recog = pipeline.createImageManip()
    manip_recog.setWaitForConfigInput(True)
    manip_img = pipeline.createXLinkIn()
    manip_img.setStreamName('manip_img')
    manip_img.out.link(manip_recog.inputImage)
    manip_cfg = pipeline.createXLinkIn()
    manip_cfg.setStreamName('manip_cfg')
    manip_cfg.out.link(manip_recog.inputConfig)
    # ------------------------

    # ------ Neural Network #2 (text recognition) ------
    nn_recog = pipeline.createNeuralNetwork()
    nn_recog.setBlobPath("models/text_recog_db_32x100_openvino_2021.4_6shave.blob")
    # ------------------------

    manip_recog.out.link(nn_recog.input)

    # ------ Out links ------
    xout_cam = pipeline.createXLinkOut()
    xout_cam.setStreamName("cam")
    cam.preview.link(xout_cam.input)
    #manip.out.link(xout_cam.input)

    xout_nn = pipeline.createXLinkOut()
    xout_nn.setStreamName("nn")
    nn.out.link(xout_nn.input)

    xout_manip_recog = pipeline.createXLinkOut()
    xout_manip_recog.setStreamName("manip_recog")
    manip_recog.out.link(xout_manip_recog.input)

    xout_nn_recog = pipeline.createXLinkOut()
    xout_nn_recog.setStreamName("nn_recog")
    nn_recog.out.link(xout_nn_recog.input)

    return pipeline


if __name__ == "__main__":

    with dai.Device() as device:

        # fps handling
        start_time = time.time()
        counter = 0
        fps = 0

        # start pipeline
        pipeline = create_pipeline()
        device.startPipeline(pipeline)
        # create queues once; fetching them anew on every iteration is unnecessary
        q_cam = device.getOutputQueue("cam", 4, False)
        q_nn = device.getOutputQueue("nn", 4, False)

        q_manip_recog = device.getOutputQueue("manip_recog", 4, False)
        q_nn_recog = device.getOutputQueue("nn_recog", 4, False)

        q_manip_cfg = device.getInputQueue("manip_cfg", 12)
        q_manip_img = device.getInputQueue("manip_img", 12)

        while True:

            # get frame
            in_cam = q_cam.get()
            frame = in_cam.getCvFrame()

            # ------ read detection ------
            in_nn = q_nn.get()
            # get output layer: a per-pixel text probability map matching the 320x320 input
            pred = np.array(in_nn.getLayerFp16("out")).reshape((PREVIEW_H, PREVIEW_W))
            # show output mask
            cv2.imshow("Mask", (pred * 255).astype(np.uint8))
            # thresholded mask and contours are computed for debugging only; they are unused below
            tv, thresh = cv2.threshold((pred * 255).astype(np.uint8), 10, 255, cv2.THRESH_BINARY)
            contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
            # decode the probability map into (N, 4, 2) box quadrilaterals and their scores
            boxes, scores = get_boxes(pred, THRESH, BOX_THRESH, MIN_SIZE, MAX_CANDIDATES, UNCLIP_RATIO)
            boxes = boxes.astype(np.int16)

            # recognition init: seed blank tiles so np.vstack below always has a base row
            texts = []
            frame_recogs = np.zeros((32, 100, 1), dtype=np.uint8)
            frame_texts = np.zeros((32, 250, 1), dtype=np.uint8)

            # loop over detections
            for idx, box in enumerate(boxes):

                # display text bb
                cv2.rectangle(frame, (box[0, 0], box[0, 1]), (box[2, 0], box[2, 1]), (255, 0, 0), 1)
                cx = (box[0, 0] + box[2, 0]) / 2
                cy = (box[0, 1] + box[2, 1]) / 2
                cv2.circle(frame, (int(cx), int(cy)), 1, (255, 0, 0), 1)
                width = np.linalg.norm(box[0] - box[1])
                height = np.linalg.norm(box[0] - box[3])
                # box angle relative to horizontal; computed but unused while rotation is unsupported
                dist = np.abs(box[0, 0] - box[1, 0])
                angle = np.arccos(dist / width)

                #print(f"{dist} / {width} => {np.rad2deg(angle)}")

                # create rr for image manip
                rr = dai.RotatedRect()
                rr.center.x = cx + 15  # manual offset so the crop is centered (possibly an ImageManip bug)
                rr.center.y = cy
                rr.size.width = width * 1.2
                rr.size.height = height  # * 1.05
                rr.angle = 0
                #rr.angle = np.rad2deg(angle)

                # send the image config to get a crop
                cfg = dai.ImageManipConfig()
                cfg.setFrameType(dai.ImgFrame.Type.GRAY8)
                cfg.setCropRotatedRect(rr, False)
                cfg.setResize(100, 32)
                # the full frame is sent only once; subsequent crops reuse the image already on device
                if idx == 0:
                    q_manip_img.send(in_cam)
                else:
                    cfg.setReusePreviousImage(True)
                q_manip_cfg.send(cfg)

                # get cropped image
                frame_recog = q_manip_recog.get()
                shape = (1, frame_recog.getHeight(), frame_recog.getWidth())
                frame_recog = frame_recog.getData().reshape(shape).transpose(1, 2, 0)
                frame_recogs = np.vstack([frame_recogs, frame_recog])

                # get 2nd nn output and decode text
                in_text = q_nn_recog.get()
                # 24 time steps x 37 classes (presumably 36 alphanumerics + a CTC blank)
                text_recog = np.array(in_text.getLayerFp16("output")).reshape(24, 1, 37)
                text_recog = postprocess(text_recog)
                texts.append(text_recog)

                # combine text frames
                frame_text = np.zeros((32, 250, 1), dtype=np.uint8)
                cv2.putText(frame_text, text_recog, (0, 26), cv2.FONT_HERSHEY_DUPLEX, 0.5, 255)
                frame_texts = np.vstack([frame_texts, frame_text])

            # show all manip crops
            cv2.imshow("recogs", np.hstack([frame_recogs, frame_texts]))


            # detect whether the 500000 goal was reached
            #print(texts)
            r = re.compile('s[0-9]{6}')  # '$' tends to be recognized as 's'
            raised_list = list(filter(r.match, texts))
            print(raised_list)
            if len(raised_list) > 0:
                raised_amount = int(raised_list[0][1:])
                print(f"PARSED AMOUNT: {raised_amount}")
                raised_text = f"Raised: ${raised_amount}"
                (w, h), _ = cv2.getTextSize(raised_text, cv2.FONT_HERSHEY_DUPLEX, 0.5, 1)
                cv2.rectangle(frame, (160 - w//2 - 5, 0), (160 + w//2 + 5, h + 15), color_white, -1)
                cv2.putText(frame, raised_text, (160 - w//2, 0 + h + 10), cv2.FONT_HERSHEY_DUPLEX,
                            0.5, (100, 100, 100) if raised_amount < 500000 else (255, 0, 255))

            # ------ Show FPS ------
            label_fps = "Fps: {:.2f}".format(fps)
            (w1, h1), _ = cv2.getTextSize(label_fps, cv2.FONT_HERSHEY_TRIPLEX, 0.4, 1)
            cv2.rectangle(frame, (0, frame.shape[0] - h1 - 6), (w1 + 2, frame.shape[0]), color_white, -1)
            cv2.putText(frame, label_fps, (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX,
                        0.4, color_black)

cv2.imshow("frame", frame)

counter += 1
if (time.time() - start_time) > 1:
fps = counter / (time.time() - start_time)

counter = 0
start_time = time.time()


if cv2.waitKey(1) == ord('q'):
break
Binary file not shown.
Binary file not shown.
5 changes: 5 additions & 0 deletions gen2-ocr-3d-db/requirements.txt
@@ -0,0 +1,5 @@
opencv-python
depthai==2.9.0.0
numpy~=1.19.5
pyclipper~=1.3.0
shapely~=1.7.1