From 9c9e29068b1a3f410ddbae5e45cb9bc28eef1fdc Mon Sep 17 00:00:00 2001 From: Koushik Dutta Date: Fri, 3 May 2024 18:07:17 -0700 Subject: [PATCH] server: Improve plugin health check --- .../scrypted_python/scrypted_sdk/types.py | 2 +- server/package-lock.json | 4 +- server/python/plugin_remote.py | 7 +++ server/src/plugin/plugin-host.ts | 44 +++++++++++++++---- server/src/plugin/plugin-remote-worker.ts | 6 +++ 5 files changed, 52 insertions(+), 11 deletions(-) diff --git a/sdk/types/scrypted_python/scrypted_sdk/types.py b/sdk/types/scrypted_python/scrypted_sdk/types.py index e60f8d174..306b78060 100644 --- a/sdk/types/scrypted_python/scrypted_sdk/types.py +++ b/sdk/types/scrypted_python/scrypted_sdk/types.py @@ -644,7 +644,7 @@ class ObjectDetectionModel(TypedDict): class ObjectDetectionSession(TypedDict): - batch: float + batch: float # Denotes that this is the first sample in a batch of samples. settings: Any sourceId: str zones: list[ObjectDetectionZone] diff --git a/server/package-lock.json b/server/package-lock.json index 6b0270601..c414f53dd 100644 --- a/server/package-lock.json +++ b/server/package-lock.json @@ -1,12 +1,12 @@ { "name": "@scrypted/server", - "version": "0.100.0", + "version": "0.100.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@scrypted/server", - "version": "0.100.0", + "version": "0.100.1", "hasInstallScript": true, "license": "ISC", "dependencies": { diff --git a/server/python/plugin_remote.py b/server/python/plugin_remote.py index 7220e069c..9a020e73d 100644 --- a/server/python/plugin_remote.py +++ b/server/python/plugin_remote.py @@ -754,6 +754,13 @@ class PluginRemote: raise Exception(f'unknown service {name}') async def start_stats_runner(self): + pong = None + async def ping(time: int): + nonlocal pong + pong = pong or await self.peer.getParam('pong') + await pong(time) + self.peer.params['ping'] = ping + update_stats = await self.peer.getParam('updateStats') if not update_stats: print('host did not provide update_stats') diff --git a/server/src/plugin/plugin-host.ts b/server/src/plugin/plugin-host.ts index adbdcfac2..94da10daf 100644 --- a/server/src/plugin/plugin-host.ts +++ b/server/src/plugin/plugin-host.ts @@ -350,11 +350,38 @@ export class PluginHost { // the plugin is expected to send process stats every 10 seconds. // this can be used as a check for liveness. let lastStats: number; - const statsInterval = setInterval(async () => { + this.peer.params.updateStats = (stats: any) => { + lastStats = Date.now(); + this.stats = stats; + } + + let lastPong: number; + this.peer.params.pong = (time: number) => { + lastPong = time; + }; + (async () => { + try { + let pingPromise: Promise + while (!this.killed) { + await sleep(30000); + if (this.killed) + return; + pingPromise ||= await this.peer.getParam('ping'); + const ping = await pingPromise; + await ping(Date.now()); + } + } + catch (e) { + logger.log('e', 'plugin ping failed. restarting.'); + this.api.requestRestart(); + } + })(); + + const healthInterval = setInterval(async () => { const now = Date.now(); // plugin may take a while to install, so wait 10 minutes. // after that, require 1 minute checkins. - if (!lastStats) { + if (!lastStats || !lastPong) { if (now - startupTime > 10 * 60 * 1000) { const logger = await this.api.getLogger(undefined); logger.log('e', 'plugin failed to start in a timely manner. restarting.'); @@ -364,15 +391,16 @@ export class PluginHost { } if (!pluginDebug && (lastStats + 60000 < now)) { const logger = await this.api.getLogger(undefined); - logger.log('e', 'plugin is unresponsive. restarting.'); + logger.log('e', 'plugin is not reporting stats. restarting.'); + this.api.requestRestart(); + } + if (!pluginDebug && (lastPong + 60000 < now)) { + const logger = await this.api.getLogger(undefined); + logger.log('e', 'plugin is not responding to ping. restarting.'); this.api.requestRestart(); } }, 60000); - this.peer.killed.finally(() => clearInterval(statsInterval)); - this.peer.params.updateStats = (stats: any) => { - lastStats = Date.now(); - this.stats = stats; - } + this.peer.killed.finally(() => clearInterval(healthInterval)); } async createRpcIoPeer(socket: IOServerSocket, accessControls: AccessControls) { diff --git a/server/src/plugin/plugin-remote-worker.ts b/server/src/plugin/plugin-remote-worker.ts index 4d69a55a5..68df01cc9 100644 --- a/server/src/plugin/plugin-remote-worker.ts +++ b/server/src/plugin/plugin-remote-worker.ts @@ -285,6 +285,12 @@ export function startPluginRemote(mainFilename: string, pluginId: string, peerSe // start the stats updater/watchdog after installation has finished, as that may take some time. peer.getParam('updateStats').then(updateStats => startStatsUpdater(allMemoryStats, updateStats)); + let pong: (time: number) => Promise; + peer.params.ping = async (time: number) => { + pong ||= await peer.getParam('pong'); + await pong(time); + }; + const main = pluginReader('main.nodejs.js'); const script = main.toString();