feat(alertmanager): integrate with ruler (#7222)

### Summary Integrate the new implementations of the alertmanager along with changes to the ruler. This change can be broadly categorized into 3 parts: #### Frontend - The earlier `/api/v1/alerts` API was double encoding the response in JSON and sending it to the frontend. This PR fixes the JSON response object. For instance, we have gone from the response `{ "status": "success", "data": "{\"status\":\"success\",\"data\":[{\"labels\":{\"alertname\":\"[platform][consumer] consumer is above 100% memory utilization\",\"bu\":\"platform\",\"...... }` to the response `{"status":"success","data":[{"labels":{"alertname":"[Metrics] Pod CP......` - `msteams` has been changed to `msteamsv2` wherever applicable #### Ruler The following changes have been done in the ruler component: - Removal of the old alertmanager and notifier - The RuleDB methods `Create`, `Edit` and `Delete` have been made transactional - Introduction of a new `testPrepareNotifyFunc` for sending test notifications - Integration with the new alertmanager #### Alertmanager Although a huge chunk of the alertmanager changes has been merged in previous PRs (the list can be found at https://github.com/SigNoz/platform-pod/issues/404), this PR takes care of changes needed in order to incorporate it with the ruler - Addition of ruleId based matching - Support for marshalling the global configuration directly from the upstream alertmanager - Addition of orgId to the legacy alertmanager - Support for always adding defaults to both routes and receivers while creating them - Migration to create the required alertmanager tables - Migration for msteams to msteamsv2 has been added. We will start using msteamsv2 config for the new alertmanager and keep using msteams for the old one. #### Related Issues / PRs Closes https://github.com/SigNoz/platform-pod/issues/404 Closes https://github.com/SigNoz/platform-pod/issues/176
2026-02-03 08:33:26 +00:00 · 2025-03-10 01:30:42 +05:30
parent 8abba261a8
commit 1f33928bf9
56 changed files with 1855 additions and 1595 deletions
--- a/conf/example.yaml
+++ b/conf/example.yaml
@@ -70,26 +70,74 @@ sqlstore:
 ##################### APIServer #####################
 apiserver:
  timeout:
+    # Default request timeout.
    default: 60s
+    # Maximum request timeout.
    max: 600s
+    # List of routes to exclude from request timeout.
    excluded_routes:
      - /api/v1/logs/tail
      - /api/v3/logs/livetail
  logging:
+    # List of routes to exclude from request/response logging.
    excluded_routes:
      - /api/v1/health


 ##################### TelemetryStore #####################
 telemetrystore:
-  # specifies the telemetrystore provider to use.
+  # Specifies the telemetrystore provider to use.
  provider: clickhouse
-  clickhouse:
-    # The DSN to use for ClickHouse.
-    dsn: http://localhost:9000
  # Maximum number of idle connections in the connection pool.
  max_idle_conns: 50
  # Maximum number of open connections to the database.
  max_open_conns: 100
  # Maximum time to wait for a connection to be established.
-  dial_timeout: 5s
+  dial_timeout: 5s
+  clickhouse:
+    # The DSN to use for ClickHouse.
+    dsn: http://localhost:9000
+
+##################### Alertmanager #####################
+alertmanager:
+  # Specifies the alertmanager provider to use.
+  provider: legacy
+  legacy:
+    # The API URL (with prefix) of the legacy Alertmanager instance.
+    api_url: http://localhost:9093/api
+  signoz:
+    # The poll interval for periodically syncing the alertmanager with the config in the store.
+    poll_interval: 1m
+    # The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy). Used for generating relative and absolute links back to Alertmanager itself.
+    external_url: http://localhost:9093
+    # The global configuration for the alertmanager. The exhaustive list of fields can be found upstream: https://github.com/prometheus/alertmanager/blob/efa05feffd644ba4accb526e98a8c6545d26a783/config/config.go#L833
+    global:
+      # ResolveTimeout is the time after which an alert is declared resolved if it has not been updated.
+      resolve_timeout: 5m
+    route:
+      # GroupByStr is the list of labels to group alerts by.
+      group_by:
+        - alertname
+      # GroupInterval is the interval at which alerts are grouped.
+      group_interval: 1m
+      # GroupWait is the time to wait before sending alerts to receivers.
+      group_wait: 1m
+      # RepeatInterval is the interval at which alerts are repeated.
+      repeat_interval: 1h
+    alerts:
+      # Interval between garbage collection of alerts.
+      gc_interval: 30m
+    silences:
+      # Maximum number of silences, including expired silences. If negative or zero, no limit is set.
+      max: 0
+      # Maximum size of the silences in bytes. If negative or zero, no limit is set.
+      max_size_bytes: 0
+      # Interval between garbage collection and snapshotting of the silences. The snapshot will be stored in the state store.
+      maintenance_interval: 15m
+      # Retention of the silences.
+      retention: 120h
+    nflog:
+      # Interval between garbage collection and snapshotting of the notification logs. The snapshot will be stored in the state store.
+      maintenance_interval: 15m
+      # Retention of the notification logs.
+      retention: 120h