detect: move clip to huggingface

2026-03-12 05:23:16 +00:00 · 2026-02-27 20:49:33 -08:00
parent e66ea8e794
commit 46dd4006c7
10 changed files with 37 additions and 68 deletions
--- a/plugins/coreml/package-lock.json
+++ b/plugins/coreml/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@scrypted/coreml",
-   "version": "0.1.89",
+   "version": "0.1.90",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
      "": {
         "name": "@scrypted/coreml",
-         "version": "0.1.89",
+         "version": "0.1.90",
         "devDependencies": {
            "@scrypted/sdk": "file:../../sdk"
         }
--- a/plugins/coreml/package.json
+++ b/plugins/coreml/package.json
@@ -50,5 +50,5 @@
   "devDependencies": {
      "@scrypted/sdk": "file:../../sdk"
   },
-   "version": "0.1.89"
+   "version": "0.1.90"
 }
--- a/plugins/coreml/src/coreml/clip_embedding.py
+++ b/plugins/coreml/src/coreml/clip_embedding.py
@@ -29,21 +29,12 @@ class CoreMLClipEmbedding(ClipEmbedding):
            "vision.mlpackage/Data/com.apple.CoreML/model.mlmodel",
        ]

-    def loadModel(self, files):
-        # find the xml file in the files list
-        text_manifest = [f for f in files if f.lower().endswith('text.mlpackage/manifest.json')]
-        if not text_manifest:
-            raise ValueError("No XML model file found in the provided files list")
-        text_manifest = text_manifest[0]
-
-        vision_manifest = [f for f in files if f.lower().endswith('vision.mlpackage/manifest.json')]
-        if not vision_manifest:
-            raise ValueError("No XML model file found in the provided files list")
-        vision_manifest = vision_manifest[0]
-        
-
-        textModel = ct.models.MLModel(os.path.dirname(text_manifest))
-        visionModel = ct.models.MLModel(os.path.dirname(vision_manifest))
+    def initModel(self):
+        model_path = self.downloadHuggingFaceModelLocalFallback("clip")
+        text = os.path.join(model_path, "text.mlpackage")
+        vision = os.path.join(model_path, "vision.mlpackage")
+        textModel = ct.models.MLModel(text)
+        visionModel = ct.models.MLModel(vision)

        return textModel, visionModel

--- a/plugins/onnx/package-lock.json
+++ b/plugins/onnx/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@scrypted/onnx",
-   "version": "0.1.130",
+   "version": "0.1.131",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
      "": {
         "name": "@scrypted/onnx",
-         "version": "0.1.130",
+         "version": "0.1.131",
         "devDependencies": {
            "@scrypted/sdk": "file:../../sdk"
         }
--- a/plugins/onnx/package.json
+++ b/plugins/onnx/package.json
@@ -50,5 +50,5 @@
   "devDependencies": {
      "@scrypted/sdk": "file:../../sdk"
   },
-   "version": "0.1.130"
+   "version": "0.1.131"
 }
--- a/plugins/onnx/src/ort/clip_embedding.py
+++ b/plugins/onnx/src/ort/clip_embedding.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 import asyncio
 from typing import Any
+import os

 import numpy as np
 import onnxruntime
@@ -19,24 +20,12 @@ class ONNXClipEmbedding(ClipEmbedding):
    def __init__(self, plugin, nativeId: str):
        super().__init__(plugin=plugin, nativeId=nativeId)

-    def getFiles(self):
-        return [
-            "text.onnx",
-            "vision.onnx",
-        ]
+    def initModel(self):
+        model_path = self.downloadHuggingFaceModelLocalFallback("clip")

-    def loadModel(self, files):
        # find the xml file in the files list
-        text_onnx = [f for f in files if f.lower().endswith('text.onnx')]
-        if not text_onnx:
-            raise ValueError("No onnx model file found in the provided files list")
-        text_onnx = text_onnx[0]
-
-        vision_onnx = [f for f in files if f.lower().endswith('vision.onnx')]
-        if not vision_onnx:
-            raise ValueError("No onnx model file found in the provided files list")
-        vision_onnx = vision_onnx[0]
-        
+        text_onnx = os.path.join(model_path, 'text.onnx')
+        vision_onnx = os.path.join(model_path, 'vision.onnx')        

        compiled_models_array = []
        compiled_models = {}
--- a/plugins/openvino/package-lock.json
+++ b/plugins/openvino/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@scrypted/openvino",
-   "version": "0.1.194",
+   "version": "0.1.195",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
      "": {
         "name": "@scrypted/openvino",
-         "version": "0.1.194",
+         "version": "0.1.195",
         "devDependencies": {
            "@scrypted/sdk": "file:../../sdk"
         }
--- a/plugins/openvino/package.json
+++ b/plugins/openvino/package.json
@@ -50,5 +50,5 @@
   "devDependencies": {
      "@scrypted/sdk": "file:../../sdk"
   },
-   "version": "0.1.194"
+   "version": "0.1.195"
 }
--- a/plugins/openvino/src/ov/clip_embedding.py
+++ b/plugins/openvino/src/ov/clip_embedding.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 import asyncio
 from typing import Any
+import os

 import numpy as np
 import openvino as ov
@@ -31,17 +32,12 @@ class OpenVINOClipEmbedding(ClipEmbedding):
            f"openvino/vision{model_suffix}.bin"
        ]

-    def loadModel(self, files):
-        # find the xml file in the files list
-        text_xml = [f for f in files if f.lower().endswith(text_xml_name)]
-        if not text_xml:
-            raise ValueError("No XML model file found in the provided files list")
-        text_xml = text_xml[0]
+    def initModel(self):
+        model_path = self.downloadHuggingFaceModelLocalFallback("clip")

-        vision_xml = [f for f in files if f.lower().endswith(vision_xml_name)]
-        if not vision_xml:
-            raise ValueError("No XML model file found in the provided files list")
-        vision_xml = vision_xml[0]
+        # resolve the text and vision xml paths relative to model_path
+        text_xml = os.path.join(model_path, "text.xml")
+        vision_xml = os.path.join(model_path, "vision.xml")
        
        textModel = self.plugin.core.compile_model(text_xml, self.plugin.mode)
        model = self.plugin.core.read_model(vision_xml)
--- a/plugins/openvino/src/predict/clip.py
+++ b/plugins/openvino/src/predict/clip.py
@@ -23,45 +23,38 @@ class ClipEmbedding(PredictPlugin, scrypted_sdk.TextEmbedding, scrypted_sdk.Imag
        self.loop = asyncio.get_event_loop()
        self.minThreshold = 0.5

-        self.model = self.initModel()
+        try:
+           self.model = self.initModel()
+        except Exception as e:
+            self.print("Error initializing CLIP model:", e)
+            raise

        self.processor = None
-        print("Loading CLIP processor from local cache.")
+        self.print("Loading CLIP processor from local cache.")
        try:
            self.processor = CLIPProcessor.from_pretrained(
                hf_id,
                local_files_only=True,
            )
-            print("Loaded CLIP processor from local cache.")
+            self.print("Loaded CLIP processor from local cache.")
        except Exception:
-            print("CLIP processor not available in local cache yet.")
+            self.print("CLIP processor not available in local cache yet.")

        asyncio.ensure_future(self.refreshClipProcessor(hf_id), loop=self.loop)

    async def refreshClipProcessor(self, hf_id: str):
        try:
-            print("Refreshing CLIP processor cache (online).")
+            self.print("Refreshing CLIP processor cache (online).")
            processor = await asyncio.to_thread(
                CLIPProcessor.from_pretrained,
                hf_id,
            )
            self.processor = processor
-            print("Refreshed CLIP processor cache.")
+            self.print("Refreshed CLIP processor cache.")
        except Exception:
-            print("CLIP processor cache refresh failed.")
-
-    def getFiles(self):
-        pass
+            self.print("CLIP processor cache refresh failed.")

    def initModel(self):
-        local_files: list[str] = []
-        for file in self.getFiles():
-            remote_file = "https://huggingface.co/koushd/clip/resolve/main/" + file
-            localFile = self.downloadFile(remote_file, f"{self.id}/{file}")
-            local_files.append(localFile)
-        return self.loadModel(local_files)
-
-    def loadModel(self, files: list[str]):
        pass

    async def getImageEmbedding(self, input):