tensorflow-lite/opencv/videoanalysis: refactor to support detection snapshots

2026-05-04 21:30:30 +01:00 · 2022-09-27 20:34:52 -07:00
parent a13991183d
commit 758adf8022
7 changed files with 185 additions and 96 deletions
--- a/plugins/objectdetector/.vscode/settings.json
+++ b/plugins/objectdetector/.vscode/settings.json
@@ -1,3 +1,3 @@
 {
-    "scrypted.debugHost": "raspberrypi",
+    "scrypted.debugHost": "127.0.0.1",
 }
--- a/plugins/objectdetector/src/main.ts
+++ b/plugins/objectdetector/src/main.ts
@@ -5,7 +5,7 @@ import { alertRecommendedPlugins } from '@scrypted/common/src/alert-recommended-
 import { DenoisedDetectionEntry, DenoisedDetectionState, denoiseDetections } from './denoise';
 import { AutoenableMixinProvider } from "../../../common/src/autoenable-mixin-provider"
 import { safeParseJson } from './util';
-import fs from 'fs';
+import crypto from 'crypto';

 const polygonOverlap = require('polygon-overlap');

@@ -150,7 +150,7 @@ class ObjectDetectionMixin extends SettingsMixinDeviceBase<VideoCamera & Camera
      settings: await this.getCurrentSettings(),
    });
    this.objectsDetected(detections, true);
-    this.reportObjectDetections(detections, undefined);
+    this.reportObjectDetections(detections);
  }

  bindObjectDetection() {
@@ -170,7 +170,7 @@ class ObjectDetectionMixin extends SettingsMixinDeviceBase<VideoCamera & Camera
      if (eventData?.detectionId !== this.detectionId)
        return;
      this.objectsDetected(eventData);
-      this.reportObjectDetections(eventData, undefined);
+      this.reportObjectDetections(eventData);

      this.running = eventData.running;
    });
@@ -194,7 +194,8 @@ class ObjectDetectionMixin extends SettingsMixinDeviceBase<VideoCamera & Camera
          settings: await this.getCurrentSettings(),
        });
        this.objectsDetected(detections, true);
-        this.reportObjectDetections(detections, eventData.detectionId);
+        this.setDetection(detections, mo);
+        this.reportObjectDetections(detections);
      });
    }
  }
@@ -228,7 +229,9 @@ class ObjectDetectionMixin extends SettingsMixinDeviceBase<VideoCamera & Camera
    this.running = detection.running;

    const newOrBetterDetection = this.objectsDetected(detection);
-    this.reportObjectDetections(detection, newOrBetterDetection ? mediaObject : undefined);
+    if (newOrBetterDetection)
+      this.setDetection(detection, mediaObject);
+    this.reportObjectDetections(detection);
    // if (newOrBetterDetection) {
    //   mediaManager.convertMediaObjectToBuffer(mediaObject, 'image/jpeg')
    //     .then(jpeg => {
@@ -302,10 +305,7 @@ class ObjectDetectionMixin extends SettingsMixinDeviceBase<VideoCamera & Camera
    return this.hasMotionType ? this.detectionInterval * 1000 * 5 : this.detectionDuration * 1000;
  }

-  reportObjectDetections(detection: ObjectsDetected, detectionInput?: MediaObject) {
-    if (detectionInput)
-      this.setDetection(detection.detectionId, detectionInput);
-
+  reportObjectDetections(detection: ObjectsDetected) {
    // determine zones of the objects, if configured.
    if (detection.detections && Object.keys(this.zones).length) {
      for (const o of detection.detections) {
@@ -441,7 +441,11 @@ class ObjectDetectionMixin extends SettingsMixinDeviceBase<VideoCamera & Camera
    return newOrBetterDetection;
  }

-  setDetection(detectionId: string, detectionInput: MediaObject) {
+  setDetection(detection: ObjectsDetected, detectionInput: MediaObject) {
+    if (!detection.detectionId)
+      detection.detectionId = crypto.randomBytes(4).toString('hex');
+
+    const { detectionId } = detection;
    this.detections.set(detectionId, detectionInput);
    setTimeout(() => {
      this.detections.delete(detectionId);
@@ -465,7 +469,7 @@ class ObjectDetectionMixin extends SettingsMixinDeviceBase<VideoCamera & Camera
  async getDetectionInput(detectionId: any): Promise<MediaObject> {
    const detection = this.detections.get(detectionId);
    if (detection)
-      return;
+      return detection;
    if (this.mixinDeviceInterfaces.includes(ScryptedInterface.ObjectDetector))
      return this.mixinDevice.getDetectionInput(detectionId);
    throw new Error('Detection not found. It may have expired.');
--- a/plugins/opencv/package-lock.json
+++ b/plugins/opencv/package-lock.json
@@ -7,14 +7,13 @@
      "": {
         "name": "@scrypted/opencv",
         "version": "0.0.46",
-         "hasInstallScript": true,
         "devDependencies": {
            "@scrypted/sdk": "file:../../sdk"
         }
      },
      "../../sdk": {
         "name": "@scrypted/sdk",
-         "version": "0.0.199",
+         "version": "0.1.17",
         "dev": true,
         "license": "ISC",
         "dependencies": {
@@ -23,12 +22,13 @@
            "axios": "^0.21.4",
            "babel-loader": "^8.2.3",
            "babel-plugin-const-enum": "^1.1.0",
-            "esbuild": "^0.13.8",
+            "esbuild": "^0.15.9",
            "ncp": "^2.0.0",
            "raw-loader": "^4.0.2",
            "rimraf": "^3.0.2",
            "tmp": "^0.2.1",
-            "webpack": "^5.59.0"
+            "webpack": "^5.74.0",
+            "webpack-bundle-analyzer": "^4.5.0"
         },
         "bin": {
            "scrypted-debug": "bin/scrypted-debug.js",
@@ -44,9 +44,7 @@
            "@types/stringify-object": "^4.0.0",
            "stringify-object": "^3.3.0",
            "ts-node": "^10.4.0",
-            "typedoc": "^0.22.8",
-            "typescript-json-schema": "^0.50.1",
-            "webpack-bundle-analyzer": "^4.5.0"
+            "typedoc": "^0.23.15"
         }
      },
      "../sdk": {
@@ -68,16 +66,15 @@
            "axios": "^0.21.4",
            "babel-loader": "^8.2.3",
            "babel-plugin-const-enum": "^1.1.0",
-            "esbuild": "^0.13.8",
+            "esbuild": "^0.15.9",
            "ncp": "^2.0.0",
            "raw-loader": "^4.0.2",
            "rimraf": "^3.0.2",
            "stringify-object": "^3.3.0",
            "tmp": "^0.2.1",
            "ts-node": "^10.4.0",
-            "typedoc": "^0.22.8",
-            "typescript-json-schema": "^0.50.1",
-            "webpack": "^5.59.0",
+            "typedoc": "^0.23.15",
+            "webpack": "^5.74.0",
            "webpack-bundle-analyzer": "^4.5.0"
         }
      }
--- a/plugins/opencv/src/opencv/init.py
+++ b/plugins/opencv/src/opencv/init.py
@@ -207,7 +207,7 @@ class OpenCVPlugin(DetectPlugin):
        width = caps.get_structure(0).get_value('width')
        result, info = buf.map(Gst.MapFlags.READ)
        if not result:
-            return
+            return None, None
        try:
            mat = np.ndarray(
                (height,
@@ -223,8 +223,8 @@ class OpenCVPlugin(DetectPlugin):

        if not detections or not len(detections['detections']):
            self.detection_sleep(settings)
-            return None
-        return detections
+            return None, None
+        return detections, None

    def create_detection_session(self):
        return OpenCVDetectionSession()
--- a/plugins/tensorflow-lite/.vscode/settings.json
+++ b/plugins/tensorflow-lite/.vscode/settings.json
@@ -1,16 +1,16 @@

 {
    // docker installation
-    "scrypted.debugHost": "raspberrypi",
-    "scrypted.serverRoot": "/server",
+    // "scrypted.debugHost": "raspberrypi",
+    // "scrypted.serverRoot": "/server",

    // pi local installation
    // "scrypted.debugHost": "192.168.2.119",
    // "scrypted.serverRoot": "/home/pi/.scrypted",

    // local checkout
-    // "scrypted.debugHost": "127.0.0.1",
-    // "scrypted.serverRoot": "/Users/koush/.scrypted",
+    "scrypted.debugHost": "127.0.0.1",
+    "scrypted.serverRoot": "/Users/koush/.scrypted",

    "scrypted.pythonRemoteRoot": "${config:scrypted.serverRoot}/volume/plugin.zip",
    "python.analysis.extraPaths": [
--- a/plugins/tensorflow-lite/src/pipeline/init.py
+++ b/plugins/tensorflow-lite/src/pipeline/init.py
@@ -66,8 +66,9 @@ class GstPipelineBase:
            self.watchId = None
            self.gst = None

+
 class GstPipeline(GstPipelineBase):
-    def __init__(self, loop: AbstractEventLoop, finished: Future, appsink_name: str, user_callback, crop = False):
+    def __init__(self, loop: AbstractEventLoop, finished: Future, appsink_name: str, user_callback, crop=False):
        super().__init__(loop, finished)
        self.appsink_name = appsink_name
        self.user_callback = user_callback
@@ -115,13 +116,15 @@ class GstPipeline(GstPipelineBase):
    def get_src_size(self):
        if not self.src_size:
            videoconvert = self.gst.get_by_name('videoconvert')
-            structure = videoconvert.srcpads[0].get_current_caps().get_structure(0)
+            structure = videoconvert.srcpads[0].get_current_caps(
+            ).get_structure(0)
            _, w = structure.get_int('width')
            _, h = structure.get_int('height')
            self.src_size = (w, h)

            videoscale = self.gst.get_by_name('videoscale')
-            structure = videoscale.srcpads[0].get_current_caps().get_structure(0)
+            structure = videoscale.srcpads[0].get_current_caps(
+            ).get_structure(0)
            _, w = structure.get_int('width')
            _, h = structure.get_int('height')
            self.dst_size = (w, h)
@@ -134,7 +137,8 @@ class GstPipeline(GstPipelineBase):

            # the dimension with the higher scale value got cropped or boxed.
            # use the other dimension to figure out the crop/box amount.
-            scales = (self.dst_size[0] / self.src_size[0], self.dst_size[1] / self.src_size[1])
+            scales = (self.dst_size[0] / self.src_size[0],
+                      self.dst_size[1] / self.src_size[1])
            if self.crop:
                scale = max(scales[0], scales[1])
            else:
@@ -148,10 +152,10 @@ class GstPipeline(GstPipelineBase):
            py = math.ceil((self.dst_size[1] - dy) / 2)

            self.pad_size = (px, py)
-            
+
        return self.src_size

-    def convert_to_src_size(self, point, normalize = False):
+    def convert_to_src_size(self, point, normalize=False):
        valid = True
        px, py = self.pad_size
        x, y = point
@@ -189,49 +193,61 @@ class GstPipeline(GstPipelineBase):
                    break
                gstsample = self.gstsample
                self.gstsample = None
-                self.user_callback(gstsample, self.get_src_size(), lambda p, normalize=False: self.convert_to_src_size(p, normalize))
+                self.user_callback(gstsample, self.get_src_size(
+                ), lambda p, normalize=False: self.convert_to_src_size(p, normalize))
+

 def get_dev_board_model():
-  try:
-    model = open('/sys/firmware/devicetree/base/model').read().lower()
-    if 'mx8mq' in model:
-        return 'mx8mq'
-    if 'mt8167' in model:
-        return 'mt8167'
-  except: pass
-  return None
+    """Return 'mx8mq' or 'mt8167' when the device-tree model string contains it, else None."""
+    try:
+        # Close the handle deterministically instead of leaving it to the garbage collector.
+        with open('/sys/firmware/devicetree/base/model') as f:
+            model = f.read().lower()
+    except Exception:
+        # Best effort: an unreadable model file means "no known board"; Ctrl-C still propagates.
+        return None
+    return next((board for board in ('mx8mq', 'mt8167') if board in model), None)
+

 def create_pipeline_sink(
-                 appsink_name,
-                 appsink_size,
-                 pixel_format,
-                 crop = False):
-    SINK_ELEMENT = 'appsink name={appsink_name} emit-signals=true max-buffers=0 drop=true sync=false'.format(appsink_name=appsink_name)
+        appsink_name,
+        appsink_size,
+        pixel_format,
+        crop=False):
+    SINK_ELEMENT = 'appsink name={appsink_name} emit-signals=true max-buffers=0 drop=true sync=false'.format(
+        appsink_name=appsink_name)

-    (width, height)= appsink_size
+    (width, height) = appsink_size

-    SINK_CAPS = 'video/x-raw,format={pixel_format},width={width},height={height},pixel-aspect-ratio=1/1'
-    sink_caps = SINK_CAPS.format(width=width, height=height, pixel_format=pixel_format)
+    SINK_CAPS = 'video/x-raw,format={pixel_format}'
+    if width and height:
+        SINK_CAPS += ',width={width},height={height},pixel-aspect-ratio=1/1'
+
+    sink_caps = SINK_CAPS.format(
+        width=width, height=height, pixel_format=pixel_format)
    pipeline = " {sink_caps} ! {sink_element}".format(
        sink_caps=sink_caps,
        sink_element=SINK_ELEMENT)

    return pipeline

+
 def create_pipeline(
-                 appsink_name,
-                 appsink_size,
-                 video_input,
-                 pixel_format,
-                 crop = False,
-                 parse_only = False):
+        appsink_name,
+        appsink_size,
+        video_input,
+        pixel_format,
+        crop=False,
+        parse_only=False):
    if parse_only:
-        sink = 'appsink name={appsink_name} emit-signals=true sync=false'.format(appsink_name=appsink_name)
+        sink = 'appsink name={appsink_name} emit-signals=true sync=false'.format(
+            appsink_name=appsink_name)
        PIPELINE = """ {video_input}
            ! {sink}
        """
    else:
-        sink = create_pipeline_sink(appsink_name, appsink_size, pixel_format, crop = crop)
+        sink = create_pipeline_sink(
+            appsink_name, appsink_size, pixel_format, crop=crop)
        if crop:
            PIPELINE = """ {video_input} ! videoconvert name=videoconvert ! aspectratiocrop aspect-ratio=1/1 ! videoscale name=videoscale ! queue leaky=downstream max-size-buffers=0 
                ! {sink}
@@ -240,19 +256,21 @@ def create_pipeline(
            PIPELINE = """ {video_input} ! queue leaky=downstream max-size-buffers=0 ! videoconvert name=videoconvert ! videoscale name=videoscale
                ! {sink}
            """
-    pipeline = PIPELINE.format(video_input = video_input, sink = sink)
+    pipeline = PIPELINE.format(video_input=video_input, sink=sink)
    print('Gstreamer pipeline:\n', pipeline)
    return pipeline

+
 def run_pipeline(loop, finished,
                 user_callback,
                 appsink_name,
                 appsink_size,
                 video_input,
                 pixel_format,
-                 crop = False,
-                 parse_only = False):
-    gst = GstPipeline(loop, finished, appsink_name, user_callback, crop = crop)
-    pipeline = create_pipeline(appsink_name, appsink_size, video_input, pixel_format, crop = crop, parse_only = parse_only)
+                 crop=False,
+                 parse_only=False):
+    gst = GstPipeline(loop, finished, appsink_name, user_callback, crop=crop)
+    pipeline = create_pipeline(
+        appsink_name, appsink_size, video_input, pixel_format, crop=crop, parse_only=parse_only)
    gst.parse_launch(pipeline)
    return gst
--- a/plugins/tensorflow-lite/src/tflite/init.py
+++ b/plugins/tensorflow-lite/src/tflite/init.py
@@ -23,6 +23,7 @@ import scrypted_sdk
 from typing import Any, List, Tuple
 from gi.repository import Gst
 import asyncio
+import numpy

 from detect import DetectionSession, DetectPlugin

@@ -50,13 +51,14 @@ def parse_label_contents(contents: str):


 defaultThreshold = .4
+defaultSecondThreshold = .7

 class RawImage:
-    jpeg: scrypted_sdk.MediaObject
+    jpegMediaObject: scrypted_sdk.MediaObject

    def __init__(self, image: Image.Image):
        self.image = image
-        self.jpeg = None
+        self.jpegMediaObject = None

 MIME_TYPE = 'x-scrypted-tensorflow-lite/x-raw-image'

@@ -67,7 +69,7 @@ class TensorFlowLitePlugin(DetectPlugin, scrypted_sdk.BufferConverter):
        self.fromMimeType = MIME_TYPE
        self.toMimeType = scrypted_sdk.ScryptedMimeTypes.MediaObject.value

-        self.crop = True
+        self.crop = False

        labels_contents = scrypted_sdk.zip.open(
            'fs/coco_labels.txt').read().decode('utf8')
@@ -117,10 +119,10 @@ class TensorFlowLitePlugin(DetectPlugin, scrypted_sdk.BufferConverter):
                detection_session.image = image
            else:
                image.close()
-        data.jpeg = None
+        data.jpegMediaObject = None

    async def convert(self, data: RawImage, fromMimeType: str, toMimeType: str, options: scrypted_sdk.BufferConvertorOptions = None) -> Any:
-        mo = data.jpeg
+        mo = data.jpegMediaObject
        if not mo:
            image = data.image
            if not image:
@@ -130,8 +132,7 @@ class TensorFlowLitePlugin(DetectPlugin, scrypted_sdk.BufferConverter):
            image.save(bio, format='JPEG')
            jpegBytes = bio.getvalue()
            mo = await scrypted_sdk.mediaManager.createMediaObject(jpegBytes, 'image/jpeg')
-            data.jpeg = jpegBytes
-            data.image = None
+            data.jpegMediaObject = mo
        return mo

    def requestRestart(self):
@@ -155,6 +156,14 @@ class TensorFlowLitePlugin(DetectPlugin, scrypted_sdk.BufferConverter):
            'value': defaultThreshold,
            'placeholder': defaultThreshold,
        }
+        secondConfidence: Setting = {
+            'title': 'Second Pass Confidence',
+            'description': 'Scale, crop, and reanalyze the results from the initial detection pass to get more accurate results. This will exponentially increase complexity, so using an allow list is recommended',
+            'key': 'second_score_threshold',
+            'type': 'number',
+            'value': defaultSecondThreshold,
+            'placeholder': defaultSecondThreshold,
+        }
        decoderSetting: Setting = {
            'title': "Decoder",
            'description': "The gstreamer element used to decode the stream",
@@ -174,7 +183,9 @@ class TensorFlowLitePlugin(DetectPlugin, scrypted_sdk.BufferConverter):
            'choices': list(self.labels.values()),
            'multiple': True,
            'key': 'allowList',
-            'value': [],
+            'value': [
+                'person',
+            ],
        }
        coral: Setting = {
            'title': 'Detected Edge TPU',
@@ -184,10 +195,10 @@ class TensorFlowLitePlugin(DetectPlugin, scrypted_sdk.BufferConverter):
            'key': 'coral',
        }

-        d['settings'] = [coral, confidence, decoderSetting, allowList]
+        d['settings'] = [coral, confidence, secondConfidence, decoderSetting, allowList]
        return d

-    def create_detection_result(self, objs, size, allowList, convert_to_src_size=None):
+    def create_detection_result(self, objs, size, allowList, convert_to_src_size=None) -> ObjectsDetected:
        detections: List[ObjectDetectionResult] = []
        detection_result: ObjectsDetected = {}
        detection_result['detections'] = detections
@@ -232,25 +243,93 @@ class TensorFlowLitePlugin(DetectPlugin, scrypted_sdk.BufferConverter):
        stream = io.BytesIO(image_bytes)
        image = Image.open(stream)

-        score_threshold = self.parse_settings(settings)
+        return self.run_detection_image(image, settings, image.size)[0]  # pass the decoded image (not self); keep returning only the ObjectsDetected
+
+    def get_detection_input_size(self, src_size):
+        return (None, None)
        with self.mutex:
-            _, scale = common.set_resized_input(
-                self.interpreter, image.size, lambda size: image.resize(size, Image.ANTIALIAS))
+            return input_size(self.interpreter)
+
+    def run_detection_image(self, image: Image.Image, settings: Any, src_size, convert_to_src_size: Any = None, second_pass_crop: Tuple[float, float, float, float] = None):
+        """Detect objects in `image`; returns `(ObjectsDetected, RawImage(image))`.
+        First pass (no `second_pass_crop`): scale and center-crop to the model input; boxes scoring below
+        `second_score_threshold` are re-run on a 1.5x square crop `(l, t, r, b)` around them."""
+        score_threshold = defaultThreshold
+        second_score_threshold = None
+        if settings:
+            score_threshold = float(settings.get('score_threshold', score_threshold) or score_threshold)
+            check = settings.get('second_score_threshold', None)
+            if check:
+                second_score_threshold = float(check)
+
+        if second_pass_crop:
+            score_threshold = second_score_threshold
+
+        # No mapper (still-image path): use an identity mapping; True fills the third slot cvss forwards (assumed to be a validity flag).
+        convert_to_src_size = convert_to_src_size or (lambda point, normalize=False: (point[0], point[1], True))
+        (w, h) = input_size(self.interpreter)
+        if not second_pass_crop:
+            (iw, ih) = image.size
+            ws = w / iw
+            hs = h / ih
+            s = max(ws, hs)
+            scaled = image.resize((round(s * iw), round(s * ih)), Image.LANCZOS)  # same filter as the deprecated ANTIALIAS alias
+            ow = round((scaled.width - w) / 2)
+            oh = round((scaled.height - h) / 2)
+            input = scaled.crop((ow, oh, ow + w, oh + h))
+
+            def cvss(point, normalize=False):
+                converted = convert_to_src_size(point, normalize)
+                return ((converted[0] + ow) / s, (converted[1] + oh) / s, converted[2])
+        else:
+            (l, t, r, b) = second_pass_crop
+            cropped = image.crop(second_pass_crop)
+            (cw, ch) = cropped.size
+            input = cropped.resize((w, h), Image.LANCZOS)
+
+            def cvss(point, normalize=False):
+                converted = convert_to_src_size(point, normalize)
+                return ((converted[0] / w) * cw + l, (converted[1] / h) * ch + t, converted[2])
+        with self.mutex:
+            common.set_input(self.interpreter, input)
+            scale = (1, 1)
+            # _, scale = common.set_resized_input(
+            #     self.interpreter, cropped.size, lambda size: cropped.resize(size, Image.ANTIALIAS))
             self.interpreter.invoke()
             objs = detect.get_objects(
                 self.interpreter, score_threshold=score_threshold, image_scale=scale)

-        allowList = settings and settings.get('allowList', None)
+        # settings may be None; the threshold parsing above already tolerates that.
+        allowList = settings and settings.get('allowList', None)
+        ret = self.create_detection_result(objs, src_size, allowList, cvss)

-        return self.create_detection_result(objs, image.size, allowList)
+        if second_pass_crop or not second_score_threshold or not len(ret['detections']):
+            return ret, RawImage(image)
+        # Keep confident hits; re-run weaker ones on a 1.5x square crop and keep only what the second pass reports.
+        detections = ret['detections']
+        ret['detections'] = []
+        for detection in detections:
+            if detection['score'] >= second_score_threshold:
+                ret['detections'].append(detection)
+                continue
+            (x, y, w, h) = detection['boundingBox']
+            cx = x + w / 2
+            cy = y + h / 2
+            d = round(max(w, h) * 1.5)
+            x = round(cx - d / 2)
+            y = round(cy - d / 2)
+            x = max(0, x)
+            y = max(0, y)
+            x2 = x + d
+            y2 = y + d

-    def get_detection_input_size(self, src_size):
-        with self.mutex:
-            return input_size(self.interpreter)
+            secondPassResult, _ = self.run_detection_image(image, settings, src_size, convert_to_src_size, (x, y, x2, y2))
+            ret['detections'].extend(secondPassResult['detections'])
+
+        return ret, RawImage(image)

    def run_detection_gstsample(self, detection_session: TensorFlowLiteSession, gstsample, settings: Any, src_size, convert_to_src_size) -> Tuple[ObjectsDetected, Image.Image]:
-        score_threshold = self.parse_settings(settings)
-
+        # todo reenable this if detection images aren't needed.
        if False and loaded_py_coral:
            with self.mutex:
                gst_buffer = gstsample.get_buffer()
@@ -280,16 +359,7 @@ class TensorFlowLitePlugin(DetectPlugin, scrypted_sdk.BufferConverter):
            finally:
                gst_buffer.unmap(info)

-            with self.mutex:
-                _, scale = common.set_resized_input(
-                    self.interpreter, image.size, lambda size: image.resize(size, Image.ANTIALIAS))
-                self.interpreter.invoke()
-                objs = detect.get_objects(
-                    self.interpreter, score_threshold=score_threshold, image_scale=scale)
-
-        allowList = settings.get('allowList', None)
-
-        return self.create_detection_result(objs, src_size, allowList, convert_to_src_size), RawImage(image)
+        return self.run_detection_image(image, settings, src_size, convert_to_src_size)

    def create_detection_session(self):
        return TensorFlowLiteSession()