def skew_image(image: "Image.Image", skew_angle_rad: float) -> "Image.Image":
    """Return a vertically sheared copy of *image*.

    The result has the same size as the input and is resampled with
    bicubic interpolation.

    Args:
        image: PIL image to shear.
        skew_angle_rad: Value placed in the ``d`` slot of the affine
            coefficient tuple ``(a, b, c, d, e, f)``.

    Returns:
        The transformed PIL image.
    """
    # With Image.AFFINE, output pixel (x, y) samples the source at
    # (a*x + b*y + c, d*x + e*y + f). Only ``d`` is non-trivial here, so
    # each column is shifted vertically in proportion to its x position.
    # NOTE(review): the raw radian value is used as the shear coefficient,
    # while calculate_y_change() uses tan(angle). The two only agree for
    # small angles -- confirm this is intended.
    shear_matrix = [1, 0, 0, skew_angle_rad, 1, 0]
    return image.transform(
        image.size, Image.AFFINE, shear_matrix, resample=Image.BICUBIC
    )


def calculate_y_change(original_height, skew_angle_radians):
    """Return ``original_height * tan(skew_angle_radians)``.

    The sign of the result follows the sign of the angle; a zero angle
    yields zero.
    """
    return original_height * math.tan(skew_angle_radians)
def predictTextModel(self, input):
    """Run the text model on *input* and return its first output tensor's data.

    *input* is cast to float32 and wrapped in an OpenVINO tensor. A fresh
    infer request is created from ``self.textModel``, started
    asynchronously, and waited on before the output is read.
    """
    tensor = ov.Tensor(array=input.astype(np.float32))
    request = self.textModel.create_infer_request()
    request.set_input_tensor(tensor)
    # Kick off inference, then block until it has finished.
    request.start_async()
    request.wait()
    first_output = request.output_tensors[0]
    return first_output.data
None): super().__init__(nativeId=nativeId) @@ -30,7 +36,7 @@ class TextRecognition(PredictPlugin): self.minThreshold = 0.1 self.detectModel = self.downloadModel("craft") - + self.textModel = self.downloadModel("vgg_english_g2") def downloadModel(self, model: str): pass @@ -38,7 +44,12 @@ class TextRecognition(PredictPlugin): def predictDetectModel(self, input): pass - async def detect_once(self, input: Image.Image, settings: Any, src_size, cvss) -> scrypted_sdk.ObjectsDetected: + def predictTextModel(self, input): + pass + + async def detect_once( + self, input: Image.Image, settings: Any, src_size, cvss + ) -> scrypted_sdk.ObjectsDetected: image_tensor = normalizeMeanVariance(np.array(input)) # reshape to c w h image_tensor = image_tensor.transpose([2, 0, 1]) @@ -51,9 +62,9 @@ class TextRecognition(PredictPlugin): estimate_num_chars = False ratio_h = ratio_w = 1 - text_threshold = .7 - link_threshold = .7 - low_text = .4 + text_threshold = 0.4 + link_threshold = 0.7 + low_text = 0.4 poly = False boxes_list, polys_list = [], [] @@ -64,7 +75,14 @@ class TextRecognition(PredictPlugin): # Post-processing boxes, polys, mapper = getDetBoxes( - score_text, score_link, text_threshold, link_threshold, low_text, poly, estimate_num_chars) + score_text, + score_link, + text_threshold, + link_threshold, + low_text, + poly, + estimate_num_chars, + ) if not len(boxes): continue @@ -86,16 +104,60 @@ class TextRecognition(PredictPlugin): for boxes in boxes_list: for box in boxes: tl, tr, br, bl = box - l = tl[0] - t = tl[1] - r = br[0] - b = br[1] + l = min(tl[0], bl[0]) + t = min(tl[1], tr[1]) + r = max(tr[0], br[0]) + b = max(bl[1], br[1]) pred = Prediction(0, 1, Rectangle(l, t, r, b)) preds.append(pred) - + return self.create_detection_result(preds, src_size, cvss) + async def run_detection_image( + self, image: scrypted_sdk.Image, detection_session: ObjectDetectionSession + ) -> ObjectsDetected: + ret = await super().run_detection_image(image, detection_session) + + detections 
from typing import List, Tuple
import math

# A box is (left, top, width, height).
BoundingBox = Tuple[int, int, int, int]


def union_boxes(boxes: List[BoundingBox]) -> BoundingBox:
    """Return the smallest box that encloses every box in *boxes*.

    Raises:
        ValueError: if *boxes* is empty (raised by ``min``/``max``).
    """
    left = min(box[0] for box in boxes)
    top = min(box[1] for box in boxes)
    right = max(box[0] + box[2] for box in boxes)
    bottom = max(box[1] + box[3] for box in boxes)
    return left, top, right - left, bottom - top


def are_boxes_adjacent(box1: BoundingBox, box2: BoundingBox) -> bool:
    """Return True when the two boxes look like neighbours on one text line.

    Two conditions must hold:

    * neither box starts lower than two thirds of the way down the other;
    * the horizontal gap between the facing edges is at most twice the
      taller box's height.
    """
    l1, t1, w1, h1 = box1
    l2, t2, w2, h2 = box2

    # Reject boxes whose tops are too far apart to share a line.
    line_slop = 2 / 3
    if t1 > t2 + h2 * line_slop or t2 > t1 + h1 * line_slop:
        return False

    # Allowed horizontal gap scales with the taller of the two boxes.
    threshold = max(h1, h2) * 2

    # Horizontal distance between the facing left/right edges, trying both
    # orderings (box1 left of box2, and box2 left of box1).
    distance = min(abs(l1 - (l2 + w2)), abs(l2 - (l1 + w1)))

    return distance <= threshold


def find_adjacent_groups(boxes: List[BoundingBox]) -> List[dict]:
    """Cluster *boxes* into runs of adjacent boxes and measure each run.

    Boxes are visited left to right. Each box joins the first existing
    group that contains a box adjacent to it (see ``are_boxes_adjacent``);
    otherwise it starts a new group. Groups are never merged afterwards.

    Returns:
        One dict per group, with keys:

        * ``"boxes"``: the member boxes, ordered by left edge;
        * ``"skew_angle"``: mean ``atan2(dy, dx)`` between the top-left
          corners of consecutive members, or ``0`` for a single box;
        * ``"union"``: the enclosing box of all members.

        An empty input yields an empty list.
    """
    groups: List[dict] = []

    for box in sorted(boxes, key=lambda candidate: candidate[0]):
        for group in groups:
            if any(are_boxes_adjacent(box, member) for member in group["boxes"]):
                group["boxes"].append(box)
                break
        else:
            # No existing group accepted the box.
            groups.append({"boxes": [box], "skew_angle": 0})

    for group in groups:
        members = group["boxes"]
        # Angle of the segment joining each pair of consecutive top-left corners.
        angles = [
            math.atan2(nxt[1] - cur[1], nxt[0] - cur[0])
            for cur, nxt in zip(members, members[1:])
        ]
        group["skew_angle"] = sum(angles) / len(angles) if angles else 0
        group["union"] = union_boxes(members)

    return groups
"dependencies": {