mirror of
https://github.com/SigNoz/signoz.git
synced 2026-04-27 14:10:30 +01:00
Compare commits
6 Commits
fix/recurr
...
alertmanag
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e247acabd4 | ||
|
|
f8ecc2f305 | ||
|
|
c84dc69be8 | ||
|
|
211436cf54 | ||
|
|
aaeec725b3 | ||
|
|
fabc716709 |
@@ -49,7 +49,7 @@ func PrepareTaskFunc(opts baserules.PrepareTaskOptions) (baserules.Task, error)
|
||||
rules = append(rules, tr)
|
||||
|
||||
// create ch rule task for evaluation
|
||||
task = newTask(baserules.TaskTypeCh, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(baserules.TaskTypeCh, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.OrgID)
|
||||
|
||||
} else if opts.Rule.RuleType == ruletypes.RuleTypeProm {
|
||||
|
||||
@@ -73,7 +73,7 @@ func PrepareTaskFunc(opts baserules.PrepareTaskOptions) (baserules.Task, error)
|
||||
rules = append(rules, pr)
|
||||
|
||||
// create promql rule task for evaluation
|
||||
task = newTask(baserules.TaskTypeProm, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(baserules.TaskTypeProm, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.OrgID)
|
||||
|
||||
} else if opts.Rule.RuleType == ruletypes.RuleTypeAnomaly {
|
||||
// create anomaly rule
|
||||
@@ -96,7 +96,7 @@ func PrepareTaskFunc(opts baserules.PrepareTaskOptions) (baserules.Task, error)
|
||||
rules = append(rules, ar)
|
||||
|
||||
// create anomaly rule task for evaluation
|
||||
task = newTask(baserules.TaskTypeCh, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(baserules.TaskTypeCh, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.OrgID)
|
||||
|
||||
} else {
|
||||
return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "unsupported rule type %s. Supported types: %s, %s", opts.Rule.RuleType, ruletypes.RuleTypeProm, ruletypes.RuleTypeThreshold)
|
||||
@@ -210,9 +210,9 @@ func TestNotification(opts baserules.PrepareTestRuleOptions) (int, error) {
|
||||
}
|
||||
|
||||
// newTask returns an appropriate group for the rule type
|
||||
func newTask(taskType baserules.TaskType, name string, frequency time.Duration, rules []baserules.Rule, opts *baserules.ManagerOptions, notify baserules.NotifyFunc, maintenanceStore ruletypes.MaintenanceStore, orgID valuer.UUID) baserules.Task {
|
||||
func newTask(taskType baserules.TaskType, name string, frequency time.Duration, rules []baserules.Rule, opts *baserules.ManagerOptions, notify baserules.NotifyFunc, orgID valuer.UUID) baserules.Task {
|
||||
if taskType == baserules.TaskTypeCh {
|
||||
return baserules.NewRuleTask(name, "", frequency, rules, opts, notify, maintenanceStore, orgID)
|
||||
return baserules.NewRuleTask(name, "", frequency, rules, opts, notify, orgID)
|
||||
}
|
||||
return baserules.NewPromRuleTask(name, "", frequency, rules, opts, notify, maintenanceStore, orgID)
|
||||
return baserules.NewPromRuleTask(name, "", frequency, rules, opts, notify, orgID)
|
||||
}
|
||||
|
||||
94
pkg/alertmanager/alertmanagerserver/maintenance_muter.go
Normal file
94
pkg/alertmanager/alertmanagerserver/maintenance_muter.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package alertmanagerserver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
"github.com/prometheus/common/model"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
)
|
||||
|
||||
// MaintenanceMuter implements types.Muter for maintenance windows.
|
||||
// It suppresses alerts whose ruleId label matches an active maintenance schedule.
|
||||
// Results are cached for cacheTTL to avoid a DB query on every per-alert check.
|
||||
type MaintenanceMuter struct {
|
||||
maintenanceStore ruletypes.MaintenanceStore
|
||||
orgID string
|
||||
logger *slog.Logger
|
||||
|
||||
mu sync.RWMutex
|
||||
cached []*ruletypes.PlannedMaintenance
|
||||
cacheExpiry time.Time
|
||||
}
|
||||
|
||||
// maintenanceCacheTTL is how long a fetched list of maintenance windows stays
// valid before getMaintenances queries the store again.
const maintenanceCacheTTL = 30 * time.Second
|
||||
|
||||
func NewMaintenanceMuter(store ruletypes.MaintenanceStore, orgID string, logger *slog.Logger) *MaintenanceMuter {
|
||||
return &MaintenanceMuter{
|
||||
maintenanceStore: store,
|
||||
orgID: orgID,
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
|
||||
func (m *MaintenanceMuter) Mutes(ctx context.Context, lset model.LabelSet) bool {
|
||||
ruleID := string(lset[ruletypes.AlertRuleIDLabel])
|
||||
if ruleID == "" {
|
||||
return false
|
||||
}
|
||||
now := time.Now()
|
||||
for _, mw := range m.getMaintenances(ctx) {
|
||||
if mw.ShouldSkip(ruleID, now) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (m *MaintenanceMuter) getMaintenances(ctx context.Context) []*ruletypes.PlannedMaintenance {
|
||||
m.mu.RLock()
|
||||
if time.Now().Before(m.cacheExpiry) {
|
||||
cached := m.cached
|
||||
m.mu.RUnlock()
|
||||
return cached
|
||||
}
|
||||
m.mu.RUnlock()
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
// Double-check after acquiring write lock.
|
||||
if time.Now().Before(m.cacheExpiry) {
|
||||
return m.cached
|
||||
}
|
||||
|
||||
mws, err := m.maintenanceStore.ListPlannedMaintenance(ctx, m.orgID)
|
||||
if err != nil {
|
||||
m.logger.ErrorContext(ctx, "failed to list planned maintenance windows; alerts will not be suppressed", slog.String("org_id", m.orgID))
|
||||
return m.cached // return stale (potentially empty) cache on error
|
||||
}
|
||||
m.cached = mws
|
||||
m.cacheExpiry = time.Now().Add(maintenanceCacheTTL)
|
||||
return m.cached
|
||||
}
|
||||
|
||||
// maintenanceMuteStage wraps MaintenanceMuter as a notify.Stage.
|
||||
// We implement the stage directly rather than using notify.NewMuteStage to avoid
|
||||
// a dependency on the unexported *notify.Metrics field of PipelineBuilder.
|
||||
type maintenanceMuteStage struct {
|
||||
muter *MaintenanceMuter
|
||||
}
|
||||
|
||||
func (s *maintenanceMuteStage) Exec(ctx context.Context, _ *slog.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
||||
filtered := make([]*types.Alert, 0, len(alerts))
|
||||
for _, a := range alerts {
|
||||
if !s.muter.Mutes(ctx, a.Labels) {
|
||||
filtered = append(filtered, a)
|
||||
}
|
||||
}
|
||||
return ctx, filtered, nil
|
||||
}
|
||||
120
pkg/alertmanager/alertmanagerserver/pipeline_builder.go
Normal file
120
pkg/alertmanager/alertmanagerserver/pipeline_builder.go
Normal file
@@ -0,0 +1,120 @@
|
||||
// Copyright (c) 2026 SigNoz, Inc.
|
||||
// Copyright 2015 Prometheus Team
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package alertmanagerserver
|
||||
|
||||
// pipelineBuilder is a local copy of notify.PipelineBuilder that injects
|
||||
// the maintenance mute stage immediately before the receiver stage.
|
||||
//
|
||||
// We maintain our own copy so we can control exactly where in the pipeline
|
||||
// the maintenance stage runs (between the silence stage and the receiver),
|
||||
// which is not possible by wrapping the output of the upstream builder.
|
||||
//
|
||||
// Pipeline order, mirroring upstream (notify.PipelineBuilder.New, notify.go:444) with [mms] inserted:
|
||||
//
|
||||
// GossipSettle → Inhibit → TimeActive → TimeMute → Silence → [mms] → Receiver
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/alertmanager/featurecontrol"
|
||||
"github.com/prometheus/alertmanager/inhibit"
|
||||
"github.com/prometheus/alertmanager/nflog/nflogpb"
|
||||
"github.com/prometheus/alertmanager/notify"
|
||||
"github.com/prometheus/alertmanager/silence"
|
||||
"github.com/prometheus/alertmanager/timeinterval"
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
type pipelineBuilder struct {
|
||||
metrics *notify.Metrics
|
||||
ff featurecontrol.Flagger
|
||||
muter *MaintenanceMuter
|
||||
}
|
||||
|
||||
func newPipelineBuilder(
|
||||
r prometheus.Registerer,
|
||||
ff featurecontrol.Flagger,
|
||||
muter *MaintenanceMuter,
|
||||
) *pipelineBuilder {
|
||||
return &pipelineBuilder{
|
||||
metrics: notify.NewMetrics(r, ff),
|
||||
ff: ff,
|
||||
muter: muter,
|
||||
}
|
||||
}
|
||||
|
||||
// New returns a map of receivers to Stages, mirroring notify.PipelineBuilder.New
|
||||
// but inserting a maintenanceMuteStage between the silence stage and the receiver.
|
||||
func (pb *pipelineBuilder) New(
|
||||
receivers map[string][]notify.Integration,
|
||||
wait func() time.Duration,
|
||||
inhibitor *inhibit.Inhibitor,
|
||||
silencer *silence.Silencer,
|
||||
intervener *timeinterval.Intervener,
|
||||
marker types.GroupMarker,
|
||||
notificationLog notify.NotificationLog,
|
||||
peer notify.Peer,
|
||||
) notify.RoutingStage {
|
||||
rs := make(notify.RoutingStage, len(receivers))
|
||||
|
||||
ms := notify.NewGossipSettleStage(peer)
|
||||
is := notify.NewMuteStage(inhibitor, pb.metrics)
|
||||
tas := notify.NewTimeActiveStage(intervener, marker, pb.metrics)
|
||||
tms := notify.NewTimeMuteStage(intervener, marker, pb.metrics)
|
||||
ss := notify.NewMuteStage(silencer, pb.metrics)
|
||||
|
||||
var mms *maintenanceMuteStage
|
||||
if pb.muter != nil {
|
||||
mms = &maintenanceMuteStage{muter: pb.muter}
|
||||
}
|
||||
|
||||
for name := range receivers {
|
||||
stages := notify.MultiStage{ms, is, tas, tms, ss}
|
||||
if mms != nil {
|
||||
stages = append(stages, mms)
|
||||
}
|
||||
stages = append(stages, buildReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics))
|
||||
rs[name] = stages
|
||||
}
|
||||
|
||||
pb.metrics.InitializeFor(receivers)
|
||||
return rs
|
||||
}
|
||||
|
||||
// buildReceiverStage is a copy of notify.createReceiverStage (unexported upstream).
|
||||
func buildReceiverStage(
|
||||
name string,
|
||||
integrations []notify.Integration,
|
||||
wait func() time.Duration,
|
||||
notificationLog notify.NotificationLog,
|
||||
metrics *notify.Metrics,
|
||||
) notify.Stage {
|
||||
var fs notify.FanoutStage
|
||||
for i := range integrations {
|
||||
recv := &nflogpb.Receiver{
|
||||
GroupName: name,
|
||||
Integration: integrations[i].Name(),
|
||||
Idx: uint32(integrations[i].Index()),
|
||||
}
|
||||
var s notify.MultiStage
|
||||
s = append(s, notify.NewWaitStage(wait))
|
||||
s = append(s, notify.NewDedupStage(&integrations[i], notificationLog, recv))
|
||||
s = append(s, notify.NewRetryStage(integrations[i], name, metrics))
|
||||
s = append(s, notify.NewSetNotifiesStage(notificationLog, recv))
|
||||
fs = append(fs, s)
|
||||
}
|
||||
return fs
|
||||
}
|
||||
@@ -26,14 +26,13 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager"
|
||||
"github.com/SigNoz/signoz/pkg/errors"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
)
|
||||
|
||||
var (
|
||||
// This is not a real file and will never be used. We need this placeholder to ensure maintenance runs on shutdown. See
|
||||
// https://github.com/prometheus/server/blob/3ee2cd0f1271e277295c02b6160507b4d193dde2/silence/silence.go#L435-L438
|
||||
// and https://github.com/prometheus/server/blob/3b06b97af4d146e141af92885a185891eb79a5b0/nflog/nflog.go#L362.
|
||||
snapfnoop string = "snapfnoop"
|
||||
)
|
||||
// This is not a real snapshot file and will never be used. We need this placeholder to ensure maintenance runs on shutdown.
|
||||
// See https://github.com/prometheus/alertmanager/blob/3ee2cd0f1271e277295c02b6160507b4d193dde2/silence/silence.go#L435-L438
|
||||
// and https://github.com/prometheus/alertmanager/blob/3b06b97af4d146e141af92885a185891eb79a5b0/nflog/nflog.go#L362.
|
||||
var snapfnoop string = "snapfnoop"
|
||||
|
||||
type Server struct {
|
||||
// logger is the logger for the alertmanager
|
||||
@@ -63,7 +62,7 @@ type Server struct {
|
||||
silencer *silence.Silencer
|
||||
silences *silence.Silences
|
||||
timeIntervals map[string][]timeinterval.TimeInterval
|
||||
pipelineBuilder *notify.PipelineBuilder
|
||||
pipelineBuilder *pipelineBuilder
|
||||
marker *alertmanagertypes.MemMarker
|
||||
tmpl *template.Template
|
||||
wg sync.WaitGroup
|
||||
@@ -71,7 +70,16 @@ type Server struct {
|
||||
notificationManager nfmanager.NotificationManager
|
||||
}
|
||||
|
||||
func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registerer, srvConfig Config, orgID string, stateStore alertmanagertypes.StateStore, nfManager nfmanager.NotificationManager) (*Server, error) {
|
||||
func New(
|
||||
ctx context.Context,
|
||||
logger *slog.Logger,
|
||||
registry prometheus.Registerer,
|
||||
srvConfig Config,
|
||||
orgID string,
|
||||
stateStore alertmanagertypes.StateStore,
|
||||
nfManager nfmanager.NotificationManager,
|
||||
maintenanceStore ruletypes.MaintenanceStore,
|
||||
) (*Server, error) {
|
||||
server := &Server{
|
||||
logger: logger.With(slog.String("pkg", "go.signoz.io/pkg/alertmanager/alertmanagerserver")),
|
||||
registry: registry,
|
||||
@@ -160,7 +168,6 @@ func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registere
|
||||
|
||||
return c, server.stateStore.Set(ctx, storableSilences)
|
||||
})
|
||||
|
||||
}()
|
||||
|
||||
// Start maintenance for notification logs
|
||||
@@ -196,7 +203,11 @@ func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registere
|
||||
return nil, err
|
||||
}
|
||||
|
||||
server.pipelineBuilder = notify.NewPipelineBuilder(signozRegisterer, featurecontrol.NoopFlags{})
|
||||
var muter *MaintenanceMuter
|
||||
if maintenanceStore != nil {
|
||||
muter = NewMaintenanceMuter(maintenanceStore, orgID, server.logger)
|
||||
}
|
||||
server.pipelineBuilder = newPipelineBuilder(signozRegisterer, featurecontrol.NoopFlags{}, muter)
|
||||
server.dispatcherMetrics = NewDispatcherMetrics(false, signozRegisterer)
|
||||
|
||||
return server, nil
|
||||
|
||||
@@ -90,7 +90,7 @@ func TestEndToEndAlertManagerFlow(t *testing.T) {
|
||||
stateStore := alertmanagertypestest.NewStateStore()
|
||||
registry := prometheus.NewRegistry()
|
||||
logger := slog.New(slog.DiscardHandler)
|
||||
server, err := New(context.Background(), logger, registry, srvCfg, orgID, stateStore, notificationManager)
|
||||
server, err := New(context.Background(), logger, registry, srvCfg, orgID, stateStore, notificationManager, nil)
|
||||
require.NoError(t, err)
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, orgID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@@ -25,7 +25,7 @@ import (
|
||||
|
||||
func TestServerSetConfigAndStop(t *testing.T) {
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(alertmanagertypes.GlobalConfig{}, alertmanagertypes.RouteConfig{GroupInterval: 1 * time.Minute, RepeatInterval: 1 * time.Minute, GroupWait: 1 * time.Minute}, "1")
|
||||
@@ -37,7 +37,7 @@ func TestServerSetConfigAndStop(t *testing.T) {
|
||||
|
||||
func TestServerTestReceiverTypeWebhook(t *testing.T) {
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(alertmanagertypes.GlobalConfig{}, alertmanagertypes.RouteConfig{GroupInterval: 1 * time.Minute, RepeatInterval: 1 * time.Minute, GroupWait: 1 * time.Minute}, "1")
|
||||
@@ -85,7 +85,7 @@ func TestServerPutAlerts(t *testing.T) {
|
||||
srvCfg := NewConfig()
|
||||
srvCfg.Route.GroupInterval = 1 * time.Second
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1")
|
||||
@@ -133,7 +133,7 @@ func TestServerTestAlert(t *testing.T) {
|
||||
srvCfg := NewConfig()
|
||||
srvCfg.Route.GroupInterval = 1 * time.Second
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1")
|
||||
@@ -238,7 +238,7 @@ func TestServerTestAlertContinuesOnFailure(t *testing.T) {
|
||||
srvCfg := NewConfig()
|
||||
srvCfg.Route.GroupInterval = 1 * time.Second
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1")
|
||||
|
||||
@@ -14,6 +14,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/factory"
|
||||
"github.com/SigNoz/signoz/pkg/modules/organization"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
)
|
||||
|
||||
type Service struct {
|
||||
@@ -39,16 +40,18 @@ type Service struct {
|
||||
serversMtx sync.RWMutex
|
||||
|
||||
notificationManager nfmanager.NotificationManager
|
||||
|
||||
maintenanceStore ruletypes.MaintenanceStore
|
||||
}
|
||||
|
||||
func New(
|
||||
ctx context.Context,
|
||||
settings factory.ScopedProviderSettings,
|
||||
config alertmanagerserver.Config,
|
||||
stateStore alertmanagertypes.StateStore,
|
||||
configStore alertmanagertypes.ConfigStore,
|
||||
orgGetter organization.Getter,
|
||||
nfManager nfmanager.NotificationManager,
|
||||
maintenanceStore ruletypes.MaintenanceStore,
|
||||
) *Service {
|
||||
service := &Service{
|
||||
config: config,
|
||||
@@ -59,6 +62,7 @@ func New(
|
||||
servers: make(map[string]*alertmanagerserver.Server),
|
||||
serversMtx: sync.RWMutex{},
|
||||
notificationManager: nfManager,
|
||||
maintenanceStore: maintenanceStore,
|
||||
}
|
||||
|
||||
return service
|
||||
@@ -177,7 +181,10 @@ func (service *Service) newServer(ctx context.Context, orgID string) (*alertmana
|
||||
return nil, err
|
||||
}
|
||||
|
||||
server, err := alertmanagerserver.New(ctx, service.settings.Logger(), service.settings.PrometheusRegisterer(), service.config, orgID, service.stateStore, service.notificationManager)
|
||||
server, err := alertmanagerserver.New(
|
||||
ctx, service.settings.Logger(), service.settings.PrometheusRegisterer(), service.config, orgID,
|
||||
service.stateStore, service.notificationManager, service.maintenanceStore,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/types"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/authtypes"
|
||||
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
)
|
||||
|
||||
@@ -30,35 +31,49 @@ type provider struct {
|
||||
configStore alertmanagertypes.ConfigStore
|
||||
stateStore alertmanagertypes.StateStore
|
||||
notificationManager nfmanager.NotificationManager
|
||||
maintenanceStore ruletypes.MaintenanceStore
|
||||
stopC chan struct{}
|
||||
}
|
||||
|
||||
func NewFactory(sqlstore sqlstore.SQLStore, orgGetter organization.Getter, notificationManager nfmanager.NotificationManager) factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config] {
|
||||
func NewFactory(
|
||||
sqlstore sqlstore.SQLStore,
|
||||
orgGetter organization.Getter,
|
||||
notificationManager nfmanager.NotificationManager,
|
||||
maintenanceStore ruletypes.MaintenanceStore,
|
||||
) factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config] {
|
||||
return factory.NewProviderFactory(factory.MustNewName("signoz"), func(ctx context.Context, settings factory.ProviderSettings, config alertmanager.Config) (alertmanager.Alertmanager, error) {
|
||||
return New(ctx, settings, config, sqlstore, orgGetter, notificationManager)
|
||||
return New(settings, config, sqlstore, orgGetter, notificationManager, maintenanceStore)
|
||||
})
|
||||
}
|
||||
|
||||
func New(ctx context.Context, providerSettings factory.ProviderSettings, config alertmanager.Config, sqlstore sqlstore.SQLStore, orgGetter organization.Getter, notificationManager nfmanager.NotificationManager) (*provider, error) {
|
||||
func New(
|
||||
providerSettings factory.ProviderSettings,
|
||||
config alertmanager.Config,
|
||||
sqlstore sqlstore.SQLStore,
|
||||
orgGetter organization.Getter,
|
||||
notificationManager nfmanager.NotificationManager,
|
||||
maintenanceStore ruletypes.MaintenanceStore,
|
||||
) (*provider, error) {
|
||||
settings := factory.NewScopedProviderSettings(providerSettings, "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager")
|
||||
configStore := sqlalertmanagerstore.NewConfigStore(sqlstore)
|
||||
stateStore := sqlalertmanagerstore.NewStateStore(sqlstore)
|
||||
|
||||
p := &provider{
|
||||
service: alertmanager.New(
|
||||
ctx,
|
||||
settings,
|
||||
config.Signoz.Config,
|
||||
stateStore,
|
||||
configStore,
|
||||
orgGetter,
|
||||
notificationManager,
|
||||
maintenanceStore,
|
||||
),
|
||||
settings: settings,
|
||||
config: config,
|
||||
configStore: configStore,
|
||||
stateStore: stateStore,
|
||||
notificationManager: notificationManager,
|
||||
maintenanceStore: maintenanceStore,
|
||||
stopC: make(chan struct{}),
|
||||
}
|
||||
|
||||
|
||||
@@ -32,11 +32,10 @@ import (
|
||||
)
|
||||
|
||||
type PrepareTaskOptions struct {
|
||||
Rule *ruletypes.PostableRule
|
||||
TaskName string
|
||||
RuleStore ruletypes.RuleStore
|
||||
MaintenanceStore ruletypes.MaintenanceStore
|
||||
Querier querier.Querier
|
||||
Rule *ruletypes.PostableRule
|
||||
TaskName string
|
||||
RuleStore ruletypes.RuleStore
|
||||
Querier querier.Querier
|
||||
Logger *slog.Logger
|
||||
Cache cache.Cache
|
||||
ManagerOpts *ManagerOptions
|
||||
@@ -46,10 +45,9 @@ type PrepareTaskOptions struct {
|
||||
}
|
||||
|
||||
type PrepareTestRuleOptions struct {
|
||||
Rule *ruletypes.PostableRule
|
||||
RuleStore ruletypes.RuleStore
|
||||
MaintenanceStore ruletypes.MaintenanceStore
|
||||
Querier querier.Querier
|
||||
Rule *ruletypes.PostableRule
|
||||
RuleStore ruletypes.RuleStore
|
||||
Querier querier.Querier
|
||||
Logger *slog.Logger
|
||||
Cache cache.Cache
|
||||
ManagerOpts *ManagerOptions
|
||||
@@ -167,7 +165,7 @@ func defaultPrepareTaskFunc(opts PrepareTaskOptions) (Task, error) {
|
||||
rules = append(rules, tr)
|
||||
|
||||
// create ch rule task for evaluation
|
||||
task = newTask(TaskTypeCh, opts.TaskName, taskNameSuffix, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(TaskTypeCh, opts.TaskName, taskNameSuffix, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.OrgID)
|
||||
|
||||
} else if opts.Rule.RuleType == ruletypes.RuleTypeProm {
|
||||
|
||||
@@ -191,7 +189,7 @@ func defaultPrepareTaskFunc(opts PrepareTaskOptions) (Task, error) {
|
||||
rules = append(rules, pr)
|
||||
|
||||
// create promql rule task for evaluation
|
||||
task = newTask(TaskTypeProm, opts.TaskName, taskNameSuffix, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(TaskTypeProm, opts.TaskName, taskNameSuffix, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.OrgID)
|
||||
|
||||
} else {
|
||||
return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "unsupported rule type %s. Supported types: %s, %s", opts.Rule.RuleType, ruletypes.RuleTypeProm, ruletypes.RuleTypeThreshold)
|
||||
@@ -432,9 +430,8 @@ func (m *Manager) editTask(_ context.Context, orgID valuer.UUID, rule *ruletypes
|
||||
newTask, err := m.prepareTaskFunc(PrepareTaskOptions{
|
||||
Rule: rule,
|
||||
TaskName: taskName,
|
||||
RuleStore: m.ruleStore,
|
||||
MaintenanceStore: m.maintenanceStore,
|
||||
Querier: m.opts.Querier,
|
||||
RuleStore: m.ruleStore,
|
||||
Querier: m.opts.Querier,
|
||||
Logger: m.opts.Logger,
|
||||
Cache: m.cache,
|
||||
ManagerOpts: m.opts,
|
||||
@@ -645,9 +642,8 @@ func (m *Manager) addTask(_ context.Context, orgID valuer.UUID, rule *ruletypes.
|
||||
newTask, err := m.prepareTaskFunc(PrepareTaskOptions{
|
||||
Rule: rule,
|
||||
TaskName: taskName,
|
||||
RuleStore: m.ruleStore,
|
||||
MaintenanceStore: m.maintenanceStore,
|
||||
Querier: m.opts.Querier,
|
||||
RuleStore: m.ruleStore,
|
||||
Querier: m.opts.Querier,
|
||||
Logger: m.opts.Logger,
|
||||
Cache: m.cache,
|
||||
ManagerOpts: m.opts,
|
||||
@@ -1030,9 +1026,8 @@ func (m *Manager) TestNotification(ctx context.Context, orgID valuer.UUID, ruleS
|
||||
|
||||
alertCount, err := m.prepareTestRuleFunc(PrepareTestRuleOptions{
|
||||
Rule: &parsedRule,
|
||||
RuleStore: m.ruleStore,
|
||||
MaintenanceStore: m.maintenanceStore,
|
||||
Querier: m.opts.Querier,
|
||||
RuleStore: m.ruleStore,
|
||||
Querier: m.opts.Querier,
|
||||
Logger: m.opts.Logger,
|
||||
Cache: m.cache,
|
||||
ManagerOpts: m.opts,
|
||||
|
||||
@@ -40,13 +40,12 @@ type PromRuleTask struct {
|
||||
logger *slog.Logger
|
||||
notify NotifyFunc
|
||||
|
||||
maintenanceStore ruletypes.MaintenanceStore
|
||||
orgID valuer.UUID
|
||||
orgID valuer.UUID
|
||||
}
|
||||
|
||||
// NewPromRuleTask holds rules that have promql condition
|
||||
// and evaluates the rule at a given frequency
|
||||
func NewPromRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc, maintenanceStore ruletypes.MaintenanceStore, orgID valuer.UUID) *PromRuleTask {
|
||||
func NewPromRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc, orgID valuer.UUID) *PromRuleTask {
|
||||
opts.Logger.Info("initiating a new rule group", "name", name, "frequency", frequency)
|
||||
|
||||
if frequency == 0 {
|
||||
@@ -63,10 +62,9 @@ func NewPromRuleTask(name, file string, frequency time.Duration, rules []Rule, o
|
||||
seriesInPreviousEval: make([]map[string]plabels.Labels, len(rules)),
|
||||
done: make(chan struct{}),
|
||||
terminated: make(chan struct{}),
|
||||
notify: notify,
|
||||
maintenanceStore: maintenanceStore,
|
||||
logger: opts.Logger,
|
||||
orgID: orgID,
|
||||
notify: notify,
|
||||
logger: opts.Logger,
|
||||
orgID: orgID,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -330,30 +328,12 @@ func (g *PromRuleTask) Eval(ctx context.Context, ts time.Time) {
|
||||
}()
|
||||
|
||||
g.logger.InfoContext(ctx, "promql rule task", "name", g.name, "eval_started_at", ts)
|
||||
maintenance, err := g.maintenanceStore.ListPlannedMaintenance(ctx, g.orgID.StringValue())
|
||||
if err != nil {
|
||||
g.logger.ErrorContext(ctx, "error in processing sql query", errors.Attr(err))
|
||||
}
|
||||
|
||||
for i, rule := range g.rules {
|
||||
if rule == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
shouldSkip := false
|
||||
for _, m := range maintenance {
|
||||
g.logger.InfoContext(ctx, "checking if rule should be skipped", slog.String("rule.id", rule.ID()), slog.Any("maintenance", m))
|
||||
if m.ShouldSkip(rule.ID(), ts) {
|
||||
shouldSkip = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if shouldSkip {
|
||||
g.logger.InfoContext(ctx, "rule should be skipped", slog.String("rule.id", rule.ID()))
|
||||
continue
|
||||
}
|
||||
|
||||
select {
|
||||
case <-g.done:
|
||||
return
|
||||
|
||||
@@ -38,14 +38,13 @@ type RuleTask struct {
|
||||
pause bool
|
||||
notify NotifyFunc
|
||||
|
||||
maintenanceStore ruletypes.MaintenanceStore
|
||||
orgID valuer.UUID
|
||||
orgID valuer.UUID
|
||||
}
|
||||
|
||||
const DefaultFrequency = 1 * time.Minute
|
||||
|
||||
// NewRuleTask makes a new RuleTask with the given name, options, and rules.
|
||||
func NewRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc, maintenanceStore ruletypes.MaintenanceStore, orgID valuer.UUID) *RuleTask {
|
||||
func NewRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc, orgID valuer.UUID) *RuleTask {
|
||||
|
||||
if frequency == 0 {
|
||||
frequency = DefaultFrequency
|
||||
@@ -62,9 +61,8 @@ func NewRuleTask(name, file string, frequency time.Duration, rules []Rule, opts
|
||||
logger: opts.Logger,
|
||||
done: make(chan struct{}),
|
||||
terminated: make(chan struct{}),
|
||||
notify: notify,
|
||||
maintenanceStore: maintenanceStore,
|
||||
orgID: orgID,
|
||||
notify: notify,
|
||||
orgID: orgID,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,31 +316,11 @@ func (g *RuleTask) Eval(ctx context.Context, ts time.Time) {
|
||||
|
||||
g.logger.DebugContext(ctx, "rule task eval started", "name", g.name, "start_time", ts)
|
||||
|
||||
maintenance, err := g.maintenanceStore.ListPlannedMaintenance(ctx, g.orgID.StringValue())
|
||||
|
||||
if err != nil {
|
||||
g.logger.ErrorContext(ctx, "error in processing sql query", errors.Attr(err))
|
||||
}
|
||||
|
||||
for i, rule := range g.rules {
|
||||
if rule == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
shouldSkip := false
|
||||
for _, m := range maintenance {
|
||||
g.logger.InfoContext(ctx, "checking if rule should be skipped", slog.String("rule.id", rule.ID()), slog.Any("maintenance", m))
|
||||
if m.ShouldSkip(rule.ID(), ts) {
|
||||
shouldSkip = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if shouldSkip {
|
||||
g.logger.InfoContext(ctx, "rule should be skipped", slog.String("rule.id", rule.ID()))
|
||||
continue
|
||||
}
|
||||
|
||||
select {
|
||||
case <-g.done:
|
||||
return
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
)
|
||||
|
||||
@@ -32,9 +31,9 @@ type Task interface {
|
||||
|
||||
// newTask returns an appropriate group for
|
||||
// rule type
|
||||
func newTask(taskType TaskType, name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc, maintenanceStore ruletypes.MaintenanceStore, orgID valuer.UUID) Task {
|
||||
func newTask(taskType TaskType, name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc, orgID valuer.UUID) Task {
|
||||
if taskType == TaskTypeCh {
|
||||
return NewRuleTask(name, file, frequency, rules, opts, notify, maintenanceStore, orgID)
|
||||
return NewRuleTask(name, file, frequency, rules, opts, notify, orgID)
|
||||
}
|
||||
return NewPromRuleTask(name, file, frequency, rules, opts, notify, maintenanceStore, orgID)
|
||||
return NewPromRuleTask(name, file, frequency, rules, opts, notify, orgID)
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/modules/user/impluser"
|
||||
"github.com/SigNoz/signoz/pkg/querier"
|
||||
"github.com/SigNoz/signoz/pkg/queryparser"
|
||||
"github.com/SigNoz/signoz/pkg/ruler/rulestore/sqlrulestore"
|
||||
"github.com/SigNoz/signoz/pkg/sharder"
|
||||
"github.com/SigNoz/signoz/pkg/sharder/noopsharder"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
@@ -38,7 +39,8 @@ func TestNewHandlers(t *testing.T) {
|
||||
orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlstore), sharder)
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
require.NoError(t, err)
|
||||
alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager)
|
||||
maintenanceStore := sqlrulestore.NewMaintenanceStore(sqlstore)
|
||||
alertmanager, err := signozalertmanager.New(providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager, maintenanceStore)
|
||||
require.NoError(t, err)
|
||||
tokenizer := tokenizertest.NewMockTokenizer(t)
|
||||
emailing := emailingtest.New()
|
||||
|
||||
@@ -20,6 +20,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/modules/serviceaccount/implserviceaccount"
|
||||
"github.com/SigNoz/signoz/pkg/modules/user/impluser"
|
||||
"github.com/SigNoz/signoz/pkg/queryparser"
|
||||
"github.com/SigNoz/signoz/pkg/ruler/rulestore/sqlrulestore"
|
||||
"github.com/SigNoz/signoz/pkg/sharder"
|
||||
"github.com/SigNoz/signoz/pkg/sharder/noopsharder"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
@@ -39,7 +40,8 @@ func TestNewModules(t *testing.T) {
|
||||
orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlstore), sharder)
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
require.NoError(t, err)
|
||||
alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager)
|
||||
maintenanceStore := sqlrulestore.NewMaintenanceStore(sqlstore)
|
||||
alertmanager, err := signozalertmanager.New(providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager, maintenanceStore)
|
||||
require.NoError(t, err)
|
||||
tokenizer := tokenizertest.NewMockTokenizer(t)
|
||||
emailing := emailingtest.New()
|
||||
|
||||
@@ -65,6 +65,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/tokenizer/tokenizerstore/sqltokenizerstore"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/featuretypes"
|
||||
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/version"
|
||||
"github.com/SigNoz/signoz/pkg/web"
|
||||
"github.com/SigNoz/signoz/pkg/web/noopweb"
|
||||
@@ -221,9 +222,14 @@ func NewNotificationManagerProviderFactories(routeStore alertmanagertypes.RouteS
|
||||
)
|
||||
}
|
||||
|
||||
func NewAlertmanagerProviderFactories(sqlstore sqlstore.SQLStore, orgGetter organization.Getter, nfManager nfmanager.NotificationManager) factory.NamedMap[factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config]] {
|
||||
func NewAlertmanagerProviderFactories(
|
||||
sqlstore sqlstore.SQLStore,
|
||||
orgGetter organization.Getter,
|
||||
nfManager nfmanager.NotificationManager,
|
||||
maintenanceStore ruletypes.MaintenanceStore,
|
||||
) factory.NamedMap[factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config]] {
|
||||
return factory.MustNewNamedMap(
|
||||
signozalertmanager.NewFactory(sqlstore, orgGetter, nfManager),
|
||||
signozalertmanager.NewFactory(sqlstore, orgGetter, nfManager, maintenanceStore),
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/modules/user/impluser"
|
||||
"github.com/SigNoz/signoz/pkg/sqlschema"
|
||||
"github.com/SigNoz/signoz/pkg/sqlschema/sqlschematest"
|
||||
"github.com/SigNoz/signoz/pkg/ruler/rulestore/sqlrulestore"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore/sqlstoretest"
|
||||
"github.com/SigNoz/signoz/pkg/statsreporter"
|
||||
@@ -59,9 +60,11 @@ func TestNewProviderFactories(t *testing.T) {
|
||||
})
|
||||
|
||||
assert.NotPanics(t, func() {
|
||||
orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual)), nil)
|
||||
store := sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual)
|
||||
orgGetter := implorganization.NewGetter(implorganization.NewStore(store), nil)
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
NewAlertmanagerProviderFactories(sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual), orgGetter, notificationManager)
|
||||
maintenanceStore := sqlrulestore.NewMaintenanceStore(store)
|
||||
NewAlertmanagerProviderFactories(store, orgGetter, notificationManager, maintenanceStore)
|
||||
})
|
||||
|
||||
assert.NotPanics(t, func() {
|
||||
|
||||
@@ -34,6 +34,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/querier"
|
||||
"github.com/SigNoz/signoz/pkg/queryparser"
|
||||
"github.com/SigNoz/signoz/pkg/ruler"
|
||||
sqlrulestore "github.com/SigNoz/signoz/pkg/ruler/rulestore/sqlrulestore"
|
||||
"github.com/SigNoz/signoz/pkg/sharder"
|
||||
"github.com/SigNoz/signoz/pkg/sqlmigration"
|
||||
"github.com/SigNoz/signoz/pkg/sqlmigrator"
|
||||
@@ -362,12 +363,14 @@ func New(
|
||||
return nil, err
|
||||
}
|
||||
|
||||
maintenanceStore := sqlrulestore.NewMaintenanceStore(sqlstore)
|
||||
|
||||
// Initialize alertmanager from the available alertmanager provider factories
|
||||
alertmanager, err := factory.NewProviderFromNamedMap(
|
||||
ctx,
|
||||
providerSettings,
|
||||
config.Alertmanager,
|
||||
NewAlertmanagerProviderFactories(sqlstore, orgGetter, nfManager),
|
||||
NewAlertmanagerProviderFactories(sqlstore, orgGetter, nfManager, maintenanceStore),
|
||||
config.Alertmanager.Provider,
|
||||
)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user