diff --git a/plugins/coreml/package-lock.json b/plugins/coreml/package-lock.json index 0faba214e..db08a6edf 100644 --- a/plugins/coreml/package-lock.json +++ b/plugins/coreml/package-lock.json @@ -13,22 +13,30 @@ }, "../../sdk": { "name": "@scrypted/sdk", - "version": "0.3.77", + "version": "0.5.22", "dev": true, "license": "ISC", "dependencies": { - "@babel/preset-typescript": "^7.26.0", + "@babel/preset-typescript": "^7.27.1", + "@rollup/plugin-commonjs": "^28.0.5", + "@rollup/plugin-json": "^6.1.0", + "@rollup/plugin-node-resolve": "^16.0.1", + "@rollup/plugin-typescript": "^12.1.2", + "@rollup/plugin-virtual": "^3.0.2", "adm-zip": "^0.5.16", - "axios": "^1.7.7", - "babel-loader": "^9.2.1", + "axios": "^1.10.0", + "babel-loader": "^10.0.0", "babel-plugin-const-enum": "^1.2.0", "ncp": "^2.0.0", + "openai": "^5.3.0", "raw-loader": "^4.0.2", "rimraf": "^6.0.1", + "rollup": "^4.43.0", "tmp": "^0.2.3", - "ts-loader": "^9.5.1", - "typescript": "^5.5.4", - "webpack": "^5.95.0", + "ts-loader": "^9.5.2", + "tslib": "^2.8.1", + "typescript": "^5.8.3", + "webpack": "^5.99.9", "webpack-bundle-analyzer": "^4.10.2" }, "bin": { @@ -41,11 +49,9 @@ "scrypted-webpack": "bin/scrypted-webpack.js" }, "devDependencies": { - "@types/node": "^22.8.1", - "@types/stringify-object": "^4.0.5", - "stringify-object": "^3.3.0", + "@types/node": "^24.0.1", "ts-node": "^10.9.2", - "typedoc": "^0.26.10" + "typedoc": "^0.28.5" } }, "../sdk": { @@ -60,23 +66,29 @@ "@scrypted/sdk": { "version": "file:../../sdk", "requires": { - "@babel/preset-typescript": "^7.26.0", - "@types/node": "^22.8.1", - "@types/stringify-object": "^4.0.5", + "@babel/preset-typescript": "^7.27.1", + "@rollup/plugin-commonjs": "^28.0.5", + "@rollup/plugin-json": "^6.1.0", + "@rollup/plugin-node-resolve": "^16.0.1", + "@rollup/plugin-typescript": "^12.1.2", + "@rollup/plugin-virtual": "^3.0.2", + "@types/node": "^24.0.1", "adm-zip": "^0.5.16", - "axios": "^1.7.7", - "babel-loader": "^9.2.1", + "axios": "^1.10.0", + 
"babel-loader": "^10.0.0", "babel-plugin-const-enum": "^1.2.0", "ncp": "^2.0.0", + "openai": "^5.3.0", "raw-loader": "^4.0.2", "rimraf": "^6.0.1", - "stringify-object": "^3.3.0", + "rollup": "^4.43.0", "tmp": "^0.2.3", - "ts-loader": "^9.5.1", + "ts-loader": "^9.5.2", "ts-node": "^10.9.2", - "typedoc": "^0.26.10", - "typescript": "^5.5.4", - "webpack": "^5.95.0", + "tslib": "^2.8.1", + "typedoc": "^0.28.5", + "typescript": "^5.8.3", + "webpack": "^5.99.9", "webpack-bundle-analyzer": "^4.10.2" } } diff --git a/plugins/coreml/src/coreml/__init__.py b/plugins/coreml/src/coreml/__init__.py index 9ef3dc4e8..84026d41c 100644 --- a/plugins/coreml/src/coreml/__init__.py +++ b/plugins/coreml/src/coreml/__init__.py @@ -15,6 +15,7 @@ from scrypted_sdk import Setting, SettingValue from common import yolo from coreml.face_recognition import CoreMLFaceRecognition from coreml.custom_detection import CoreMLCustomDetection +from coreml.clip_embedding import CoreMLClipEmbedding try: from coreml.text_recognition import CoreMLTextRecognition @@ -146,6 +147,7 @@ class CoreMLPlugin( self.faceDevice = None self.textDevice = None + self.clipDevice = None if not self.forked: asyncio.ensure_future(self.prepareRecognitionModels(), loop=self.loop) @@ -177,6 +179,19 @@ class CoreMLPlugin( }, ) + await scrypted_sdk.deviceManager.onDeviceDiscovered( + { + "nativeId": "clipembedding", + "type": scrypted_sdk.ScryptedDeviceType.Builtin.value, + "interfaces": [ + scrypted_sdk.ScryptedInterface.ClusterForkInterface.value, + scrypted_sdk.ScryptedInterface.ObjectDetection.value, + scrypted_sdk.ScryptedInterface.TextEmbedding.value, + scrypted_sdk.ScryptedInterface.ImageEmbedding.value, + ], + "name": "CoreML CLIP Embedding", + } + ) except: pass @@ -184,9 +199,12 @@ class CoreMLPlugin( if nativeId == "facerecognition": self.faceDevice = self.faceDevice or CoreMLFaceRecognition(self, nativeId) return self.faceDevice - if nativeId == "textrecognition": + elif nativeId == "textrecognition": 
class CoreMLClipEmbedding(ClipEmbedding):
    """CLIP text and image embedding device backed by CoreML ``.mlpackage`` models.

    Two models are loaded: one for text and one for vision. All predictions
    are dispatched to a single-worker thread pool so the event loop is never
    blocked by inference.

    ``self.processor`` and ``self.model`` are read here but not assigned in
    this class; presumably the ``ClipEmbedding`` base class provides them
    (``self.model`` is unpacked as the ``(text, vision)`` tuple returned by
    ``loadModel``) -- confirm against the base class.
    """

    def __init__(self, plugin, nativeId: str):
        super().__init__(plugin=plugin, nativeId=nativeId)
        # One worker: predictions are queued and executed one at a time.
        # NOTE(review): the "detect-custom" thread name prefix looks like it
        # was carried over from another device; kept unchanged here.
        self.predictExecutor = concurrent.futures.ThreadPoolExecutor(1, "detect-custom")

    def getFiles(self):
        """Return the relative paths of the files that make up both models."""
        return [
            "text.mlpackage/Manifest.json",
            "text.mlpackage/Data/com.apple.CoreML/weights/weight.bin",
            "text.mlpackage/Data/com.apple.CoreML/model.mlmodel",

            "vision.mlpackage/Manifest.json",
            "vision.mlpackage/Data/com.apple.CoreML/weights/weight.bin",
            "vision.mlpackage/Data/com.apple.CoreML/model.mlmodel",
        ]

    @staticmethod
    def _find_manifest(files, package_name: str) -> str:
        """Return the first path in *files* that is *package_name*'s manifest.

        The match is case-insensitive on the path suffix
        ``<package_name>.mlpackage/manifest.json``.

        Raises:
            ValueError: if no such path is present in *files*.
        """
        suffix = f"{package_name}.mlpackage/manifest.json"
        for path in files:
            if path.lower().endswith(suffix):
                return path
        raise ValueError(
            f"No {package_name}.mlpackage/Manifest.json found in the provided files list"
        )

    def loadModel(self, files):
        """Load the text and vision CoreML models.

        Args:
            files: paths of the model files; must include the
                ``Manifest.json`` of both ``text.mlpackage`` and
                ``vision.mlpackage``.

        Returns:
            A ``(textModel, visionModel)`` tuple of ``ct.models.MLModel``.

        Raises:
            ValueError: if either manifest is missing from *files*.
        """
        # Locate both manifests up front so a missing file fails before any
        # model is loaded (text is checked first, then vision).
        text_manifest = self._find_manifest(files, "text")
        vision_manifest = self._find_manifest(files, "vision")

        # MLModel is given the .mlpackage directory, i.e. the manifest's parent.
        textModel = ct.models.MLModel(os.path.dirname(text_manifest))
        visionModel = ct.models.MLModel(os.path.dirname(vision_manifest))

        return textModel, visionModel

    async def detect_once(self, input: Image.Image, settings: Any, src_size, cvss):
        """Compute the image embedding for *input* on the prediction thread.

        Returns:
            An ``ObjectsDetected`` dict with a single detection whose
            ``embedding`` is the float32 embedding as raw bytes, and
            ``inputDimensions`` set to *src_size*. ``settings`` and ``cvss``
            are not used by this implementation.
        """
        def predict():
            inputs = self.processor(images=input, return_tensors="np", padding="max_length", truncation=True)
            _, vision_model = self.model
            vision_predictions = vision_model.predict({'x': inputs['pixel_values']})
            # 'var_877' is the output key read from the vision model's
            # prediction dict (presumably an auto-generated name -- confirm).
            image_embeds = vision_predictions['var_877']
            # this is a hack to utilize the existing image massaging infrastructure
            embedding = bytearray(image_embeds.astype(np.float32).tobytes())
            ret: ObjectsDetected = {
                "detections": [
                    {
                        "embedding": embedding,
                    }
                ],
                "inputDimensions": src_size
            }

            return ret

        # We are inside a coroutine, so a running loop is guaranteed.
        return await asyncio.get_running_loop().run_in_executor(
            self.predictExecutor, predict
        )

    async def getTextEmbedding(self, input):
        """Compute the text embedding for *input* on the prediction thread.

        Returns:
            The float32 embedding serialized as a ``bytearray``.
        """
        def predict():
            inputs = self.processor(text=input, return_tensors="np", padding="max_length", truncation=True)
            text_model, _ = self.model
            # Both token ids and attention mask are cast to float32 before
            # being passed to the text model.
            text_predictions = text_model.predict({
                'input_ids_1': inputs['input_ids'].astype(np.float32),
                'attention_mask_1': inputs['attention_mask'].astype(np.float32),
            })
            # 'var_1050' is the output key read from the text model's
            # prediction dict (presumably an auto-generated name -- confirm).
            text_embeds = text_predictions['var_1050']
            return bytearray(text_embeds.astype(np.float32).tobytes())

        return await asyncio.get_running_loop().run_in_executor(
            self.predictExecutor, predict
        )