mirror of
https://github.com/SigNoz/signoz.git
synced 2026-05-13 05:30:30 +01:00
Compare commits
27 Commits
fix/recurr
...
mute-rules
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
58c2e8366e | ||
|
|
17f0c33fe0 | ||
|
|
c31d95f9c2 | ||
|
|
2b46e35bb0 | ||
|
|
a1e3d75296 | ||
|
|
1bbccebf16 | ||
|
|
0304c86a6e | ||
|
|
6994a3875a | ||
|
|
2d3625bc60 | ||
|
|
931988fcca | ||
|
|
f78bfa1d57 | ||
|
|
20c08b30b3 | ||
|
|
46d69f6ddb | ||
|
|
c8caa657e0 | ||
|
|
31fee32601 | ||
|
|
2362a1d6ec | ||
|
|
023e916446 | ||
|
|
270b05f94d | ||
|
|
e4e40103d5 | ||
|
|
35a498010d | ||
|
|
ed21a51671 | ||
|
|
742b0b86b4 | ||
|
|
13e1b254b5 | ||
|
|
dc579c1001 | ||
|
|
95b7186086 | ||
|
|
096ded8b0e | ||
|
|
14082ee5c7 |
@@ -4802,6 +4802,8 @@ components:
|
||||
type: string
|
||||
kind:
|
||||
$ref: '#/components/schemas/RuletypesMaintenanceKind'
|
||||
labelExpression:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
schedule:
|
||||
@@ -4829,6 +4831,8 @@ components:
|
||||
type: array
|
||||
description:
|
||||
type: string
|
||||
labelExpression:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
schedule:
|
||||
|
||||
@@ -13,7 +13,6 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/errors"
|
||||
baserules "github.com/SigNoz/signoz/pkg/query-service/rules"
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
)
|
||||
|
||||
func PrepareTaskFunc(opts baserules.PrepareTaskOptions) (baserules.Task, error) {
|
||||
@@ -49,7 +48,7 @@ func PrepareTaskFunc(opts baserules.PrepareTaskOptions) (baserules.Task, error)
|
||||
rules = append(rules, tr)
|
||||
|
||||
// create ch rule task for evaluation
|
||||
task = newTask(baserules.TaskTypeCh, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(baserules.TaskTypeCh, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc)
|
||||
|
||||
} else if opts.Rule.RuleType == ruletypes.RuleTypeProm {
|
||||
|
||||
@@ -73,7 +72,7 @@ func PrepareTaskFunc(opts baserules.PrepareTaskOptions) (baserules.Task, error)
|
||||
rules = append(rules, pr)
|
||||
|
||||
// create promql rule task for evaluation
|
||||
task = newTask(baserules.TaskTypeProm, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(baserules.TaskTypeProm, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc)
|
||||
|
||||
} else if opts.Rule.RuleType == ruletypes.RuleTypeAnomaly {
|
||||
// create anomaly rule
|
||||
@@ -96,7 +95,7 @@ func PrepareTaskFunc(opts baserules.PrepareTaskOptions) (baserules.Task, error)
|
||||
rules = append(rules, ar)
|
||||
|
||||
// create anomaly rule task for evaluation
|
||||
task = newTask(baserules.TaskTypeCh, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(baserules.TaskTypeCh, opts.TaskName, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc)
|
||||
|
||||
} else {
|
||||
return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "unsupported rule type %s. Supported types: %s, %s", opts.Rule.RuleType, ruletypes.RuleTypeProm, ruletypes.RuleTypeThreshold)
|
||||
@@ -210,9 +209,9 @@ func TestNotification(opts baserules.PrepareTestRuleOptions) (int, error) {
|
||||
}
|
||||
|
||||
// newTask returns an appropriate group for the rule type
|
||||
func newTask(taskType baserules.TaskType, name string, frequency time.Duration, rules []baserules.Rule, opts *baserules.ManagerOptions, notify baserules.NotifyFunc, maintenanceStore ruletypes.MaintenanceStore, orgID valuer.UUID) baserules.Task {
|
||||
func newTask(taskType baserules.TaskType, name string, frequency time.Duration, rules []baserules.Rule, opts *baserules.ManagerOptions, notify baserules.NotifyFunc) baserules.Task {
|
||||
if taskType == baserules.TaskTypeCh {
|
||||
return baserules.NewRuleTask(name, "", frequency, rules, opts, notify, maintenanceStore, orgID)
|
||||
return baserules.NewRuleTask(name, "", frequency, rules, opts, notify)
|
||||
}
|
||||
return baserules.NewPromRuleTask(name, "", frequency, rules, opts, notify, maintenanceStore, orgID)
|
||||
return baserules.NewPromRuleTask(name, "", frequency, rules, opts, notify)
|
||||
}
|
||||
|
||||
@@ -7042,6 +7042,10 @@ export interface RuletypesPlannedMaintenanceDTO {
|
||||
*/
|
||||
id: string;
|
||||
kind: RuletypesMaintenanceKindDTO;
|
||||
/**
|
||||
* @type string
|
||||
*/
|
||||
labelExpression?: string;
|
||||
/**
|
||||
* @type string
|
||||
*/
|
||||
@@ -7069,6 +7073,10 @@ export interface RuletypesPostablePlannedMaintenanceDTO {
|
||||
* @type string
|
||||
*/
|
||||
description?: string;
|
||||
/**
|
||||
* @type string
|
||||
*/
|
||||
labelExpression?: string;
|
||||
/**
|
||||
* @type string
|
||||
*/
|
||||
|
||||
@@ -81,6 +81,20 @@
|
||||
}
|
||||
}
|
||||
|
||||
.alert-rule-scope {
|
||||
margin-bottom: 12px;
|
||||
|
||||
.ant-radio-wrapper {
|
||||
color: var(--l1-foreground);
|
||||
}
|
||||
}
|
||||
|
||||
.alert-rule-all-warning {
|
||||
font-size: 12px;
|
||||
font-weight: 400;
|
||||
color: var(--l2-foreground);
|
||||
}
|
||||
|
||||
.formItemWithBullet {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import React, { useCallback, useEffect, useMemo, useState } from 'react';
|
||||
import { Check } from '@signozhq/icons';
|
||||
import { Check, Info } from '@signozhq/icons';
|
||||
import {
|
||||
Button,
|
||||
DatePicker,
|
||||
@@ -8,9 +8,11 @@ import {
|
||||
FormInstance,
|
||||
Input,
|
||||
Modal,
|
||||
Radio,
|
||||
Select,
|
||||
SelectProps,
|
||||
Spin,
|
||||
Tooltip,
|
||||
} from 'antd';
|
||||
import { Typography } from '@signozhq/ui/typography';
|
||||
import type { DefaultOptionType } from 'antd/es/select';
|
||||
@@ -66,13 +68,17 @@ const TIME_FORMAT = DATE_TIME_FORMATS.TIME;
|
||||
const DATE_FORMAT = DATE_TIME_FORMATS.ORDINAL_DATE;
|
||||
const ORDINAL_FORMAT = DATE_TIME_FORMATS.ORDINAL_ONLY;
|
||||
|
||||
type AlertRuleScope = 'all' | 'specific';
|
||||
|
||||
interface PlannedDowntimeFormData {
|
||||
name: string;
|
||||
startTime: dayjs.Dayjs | string;
|
||||
endTime: dayjs.Dayjs | string;
|
||||
recurrence?: RuletypesRecurrenceDTO | null;
|
||||
alertRuleScope: AlertRuleScope;
|
||||
alertRules: DefaultOptionType[];
|
||||
timezone?: string;
|
||||
labelExpression?: string;
|
||||
}
|
||||
|
||||
const customFormat = DATE_TIME_FORMATS.ORDINAL_DATETIME;
|
||||
@@ -127,6 +133,12 @@ export function PlannedDowntimeForm(
|
||||
recurrenceOptions.doesNotRepeat.value,
|
||||
);
|
||||
|
||||
const [alertRuleScope, setAlertRuleScope] = useState<AlertRuleScope>(
|
||||
initialValues.id && (initialValues.alertIds || []).length === 0
|
||||
? 'all'
|
||||
: 'specific',
|
||||
);
|
||||
|
||||
const timezoneInitialValue = !isEmpty(initialValues.schedule?.timezone)
|
||||
? (initialValues.schedule?.timezone as string)
|
||||
: undefined;
|
||||
@@ -142,10 +154,14 @@ export function PlannedDowntimeForm(
|
||||
const saveHandler = useCallback(
|
||||
async (values: PlannedDowntimeFormData) => {
|
||||
const data: RuletypesPostablePlannedMaintenanceDTO = {
|
||||
alertIds: values.alertRules
|
||||
.map((alert) => alert.value)
|
||||
.filter((alert) => alert !== undefined) as string[],
|
||||
alertIds:
|
||||
values.alertRuleScope === 'all'
|
||||
? []
|
||||
: (values.alertRules
|
||||
.map((alert) => alert.value)
|
||||
.filter((alert) => alert !== undefined) as string[]),
|
||||
name: values.name,
|
||||
labelExpression: values.labelExpression,
|
||||
schedule: {
|
||||
startTime: new Date(
|
||||
handleTimeConversion(
|
||||
@@ -259,12 +275,13 @@ export function PlannedDowntimeForm(
|
||||
};
|
||||
|
||||
const formatedInitialValues = useMemo(() => {
|
||||
const initialAlertIds = initialValues.alertIds || [];
|
||||
const isExistingSchedule = Boolean(initialValues.id);
|
||||
const formData: PlannedDowntimeFormData = {
|
||||
name: defaultTo(initialValues.name, ''),
|
||||
alertRules: getAlertOptionsFromIds(
|
||||
initialValues.alertIds || [],
|
||||
alertOptions,
|
||||
),
|
||||
alertRuleScope:
|
||||
isExistingSchedule && initialAlertIds.length === 0 ? 'all' : 'specific',
|
||||
alertRules: getAlertOptionsFromIds(initialAlertIds, alertOptions),
|
||||
endTime: getEndTime(initialValues) ? dayjs(getEndTime(initialValues)) : '',
|
||||
startTime: initialValues.schedule?.startTime
|
||||
? dayjs(initialValues.schedule?.startTime)
|
||||
@@ -281,12 +298,14 @@ export function PlannedDowntimeForm(
|
||||
),
|
||||
} as RuletypesRecurrenceDTO,
|
||||
timezone: initialValues.schedule?.timezone as string,
|
||||
labelExpression: initialValues.labelExpression || '',
|
||||
};
|
||||
return formData;
|
||||
}, [initialValues, alertOptions]);
|
||||
|
||||
useEffect(() => {
|
||||
setSelectedTags(formatedInitialValues.alertRules);
|
||||
setAlertRuleScope(formatedInitialValues.alertRuleScope);
|
||||
form.setFieldsValue({ ...formatedInitialValues });
|
||||
}, [form, formatedInitialValues, initialValues]);
|
||||
|
||||
@@ -406,6 +425,9 @@ export function PlannedDowntimeForm(
|
||||
onFinish={onFinish}
|
||||
onValuesChange={(): void => {
|
||||
setRecurrenceType(form.getFieldValue('recurrence')?.repeatType as string);
|
||||
setAlertRuleScope(
|
||||
form.getFieldValue('alertRuleScope') as AlertRuleScope,
|
||||
);
|
||||
setFormData(form.getFieldsValue());
|
||||
}}
|
||||
autoComplete="off"
|
||||
@@ -517,50 +539,99 @@ export function PlannedDowntimeForm(
|
||||
<div className="scheduleTimeInfoText">{endTimeText}</div>
|
||||
)}
|
||||
<div>
|
||||
<div className="alert-rule-form">
|
||||
<Typography style={{ marginBottom: 8 }}>Silence Alerts</Typography>
|
||||
<Typography style={{ marginBottom: 8 }} className="alert-rule-info">
|
||||
(Leave empty to silence all alerts)
|
||||
</Typography>
|
||||
</div>
|
||||
<Form.Item noStyle shouldUpdate>
|
||||
<AlertRuleTags
|
||||
closable
|
||||
selectedTags={selectedTags}
|
||||
handleClose={handleClose}
|
||||
/>
|
||||
<Typography style={{ marginBottom: 8 }}>Silence Alerts</Typography>
|
||||
<Form.Item
|
||||
name="alertRuleScope"
|
||||
initialValue="specific"
|
||||
className="alert-rule-scope"
|
||||
>
|
||||
<Radio.Group>
|
||||
<Radio value="all">All alert rules</Radio>
|
||||
<Radio value="specific">Specific alert rules</Radio>
|
||||
</Radio.Group>
|
||||
</Form.Item>
|
||||
<Form.Item name={alertRuleFormName}>
|
||||
<Select
|
||||
placeholder="Search for alerts rules or groups..."
|
||||
mode="multiple"
|
||||
status={isError ? 'error' : undefined}
|
||||
loading={isLoading}
|
||||
tagRender={noTagRenderer}
|
||||
onChange={handleChange}
|
||||
showSearch
|
||||
options={alertOptions}
|
||||
filterOption={(input, option): boolean =>
|
||||
(option?.label as string)?.toLowerCase()?.includes(input.toLowerCase())
|
||||
}
|
||||
notFoundContent={
|
||||
isLoading ? (
|
||||
<span>
|
||||
<Spin size="small" /> Loading...
|
||||
</span>
|
||||
) : (
|
||||
<span>No alert available.</span>
|
||||
)
|
||||
}
|
||||
{alertRuleScope === 'specific' && (
|
||||
<>
|
||||
<Form.Item noStyle shouldUpdate>
|
||||
<AlertRuleTags
|
||||
closable
|
||||
selectedTags={selectedTags}
|
||||
handleClose={handleClose}
|
||||
/>
|
||||
</Form.Item>
|
||||
<Form.Item
|
||||
name={alertRuleFormName}
|
||||
rules={[
|
||||
{
|
||||
validator: async (
|
||||
_rule,
|
||||
value: DefaultOptionType[] | undefined,
|
||||
): Promise<void> => {
|
||||
if (!value || value.length === 0) {
|
||||
throw new Error(
|
||||
'Select at least one alert rule, or choose "All alert rules" to silence everything.',
|
||||
);
|
||||
}
|
||||
},
|
||||
},
|
||||
]}
|
||||
>
|
||||
<Select
|
||||
placeholder="Search for alerts rules or groups..."
|
||||
mode="multiple"
|
||||
status={isError ? 'error' : undefined}
|
||||
loading={isLoading}
|
||||
tagRender={noTagRenderer}
|
||||
onChange={handleChange}
|
||||
showSearch
|
||||
options={alertOptions}
|
||||
filterOption={(input, option): boolean =>
|
||||
(option?.label as string)?.toLowerCase()?.includes(input.toLowerCase())
|
||||
}
|
||||
notFoundContent={
|
||||
isLoading ? (
|
||||
<span>
|
||||
<Spin size="small" /> Loading...
|
||||
</span>
|
||||
) : (
|
||||
<span>No alert available.</span>
|
||||
)
|
||||
}
|
||||
>
|
||||
{alertOptions?.map((option) => (
|
||||
<Select.Option key={option.value} value={option.value}>
|
||||
{option.label}
|
||||
</Select.Option>
|
||||
))}
|
||||
</Select>
|
||||
</Form.Item>
|
||||
</>
|
||||
)}
|
||||
{alertRuleScope === 'all' && (
|
||||
<Typography
|
||||
className="alert-rule-info alert-rule-all-warning"
|
||||
style={{ marginBottom: 16 }}
|
||||
>
|
||||
{alertOptions?.map((option) => (
|
||||
<Select.Option key={option.value} value={option.value}>
|
||||
{option.label}
|
||||
</Select.Option>
|
||||
))}
|
||||
</Select>
|
||||
</Form.Item>
|
||||
All alerts will be silenced for the duration of this maintenance window.
|
||||
</Typography>
|
||||
)}
|
||||
</div>
|
||||
<Form.Item
|
||||
label={
|
||||
<span>
|
||||
Label Expression
|
||||
<Tooltip title='Filter by alert labels. Examples: env == "prod", region == "us-east-1" && severity == "critical"'>
|
||||
<Info size={13} />
|
||||
</Tooltip>
|
||||
</span>
|
||||
}
|
||||
name="labelExpression"
|
||||
>
|
||||
<Input.TextArea
|
||||
placeholder='e.g. env == "prod" && region == "us-east-1"'
|
||||
autoSize={{ minRows: 2, maxRows: 4 }}
|
||||
/>
|
||||
</Form.Item>
|
||||
<Form.Item style={{ marginBottom: 0 }}>
|
||||
<ModalButtonWrapper>
|
||||
<Button
|
||||
|
||||
@@ -207,7 +207,7 @@ export function CollapseListContent({
|
||||
selectedTags={alertOptions}
|
||||
/>
|
||||
) : (
|
||||
'-'
|
||||
<Tag className="all-alerts-tag">All alert rules</Tag>
|
||||
),
|
||||
)}
|
||||
</Flex>
|
||||
|
||||
@@ -24,6 +24,7 @@ import (
|
||||
"github.com/prometheus/alertmanager/dispatch"
|
||||
"github.com/prometheus/alertmanager/notify"
|
||||
"github.com/prometheus/alertmanager/provider/mem"
|
||||
promTypes "github.com/prometheus/alertmanager/types"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/prometheus/common/promslog"
|
||||
@@ -364,7 +365,7 @@ route:
|
||||
providerSettings := createTestProviderSettings()
|
||||
logger := providerSettings.Logger
|
||||
route := dispatch.NewRoute(conf.Route, nil)
|
||||
marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
|
||||
marker := promTypes.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, 0, nil, logger, prometheus.NewRegistry(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
@@ -637,7 +638,7 @@ route:
|
||||
providerSettings := createTestProviderSettings()
|
||||
logger := providerSettings.Logger
|
||||
route := dispatch.NewRoute(conf.Route, nil)
|
||||
marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
|
||||
marker := promTypes.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, 0, nil, logger, prometheus.NewRegistry(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
@@ -896,7 +897,7 @@ route:
|
||||
providerSettings := createTestProviderSettings()
|
||||
logger := providerSettings.Logger
|
||||
route := dispatch.NewRoute(conf.Route, nil)
|
||||
marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
|
||||
marker := promTypes.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, 0, nil, logger, prometheus.NewRegistry(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
@@ -1158,7 +1159,7 @@ func newAlert(labels model.LabelSet) *alertmanagertypes.Alert {
|
||||
|
||||
func TestDispatcherRace(t *testing.T) {
|
||||
logger := promslog.NewNopLogger()
|
||||
marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
|
||||
marker := promTypes.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, 0, nil, logger, prometheus.NewRegistry(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
@@ -1194,7 +1195,7 @@ route:
|
||||
route := dispatch.NewRoute(conf.Route, nil)
|
||||
providerSettings := createTestProviderSettings()
|
||||
logger := providerSettings.Logger
|
||||
marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
|
||||
marker := promTypes.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, 0, nil, logger, prometheus.NewRegistry(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
@@ -1264,7 +1265,7 @@ route:
|
||||
|
||||
func TestDispatcher_DoMaintenance(t *testing.T) {
|
||||
r := prometheus.NewRegistry()
|
||||
marker := alertmanagertypes.NewMarker(r)
|
||||
marker := promTypes.NewMarker(r)
|
||||
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Minute, 0, nil, promslog.NewNopLogger(), prometheus.NewRegistry(), nil)
|
||||
if err != nil {
|
||||
@@ -1370,7 +1371,7 @@ route:
|
||||
providerSettings := createTestProviderSettings()
|
||||
logger := providerSettings.Logger
|
||||
route := dispatch.NewRoute(conf.Route, nil)
|
||||
marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
|
||||
marker := promTypes.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, 0, nil, logger, prometheus.NewRegistry(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
|
||||
96
pkg/alertmanager/alertmanagerserver/maintenance_muter.go
Normal file
96
pkg/alertmanager/alertmanagerserver/maintenance_muter.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package alertmanagerserver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/common/model"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
)
|
||||
|
||||
// MaintenanceMuter implements types.Muter for maintenance windows.
|
||||
// It suppresses alerts whose ruleId label matches an active maintenance schedule.
|
||||
// Results are cached for cacheTTL to avoid a DB query on every per-alert check.
|
||||
type MaintenanceMuter struct {
|
||||
maintenanceStore alertmanagertypes.MaintenanceStore
|
||||
orgID string
|
||||
logger *slog.Logger
|
||||
|
||||
mu sync.RWMutex
|
||||
cached []*alertmanagertypes.PlannedMaintenance
|
||||
cacheExpiry time.Time
|
||||
}
|
||||
|
||||
const maintenanceCacheTTL = 30 * time.Second
|
||||
|
||||
func NewMaintenanceMuter(store alertmanagertypes.MaintenanceStore, orgID string, logger *slog.Logger) *MaintenanceMuter {
|
||||
return &MaintenanceMuter{
|
||||
maintenanceStore: store,
|
||||
orgID: orgID,
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
|
||||
func (m *MaintenanceMuter) Mutes(ctx context.Context, lset model.LabelSet) bool {
|
||||
ruleID := string(lset[ruletypes.AlertRuleIDLabel])
|
||||
if ruleID == "" {
|
||||
return false
|
||||
}
|
||||
now := time.Now()
|
||||
for _, mw := range m.getMaintenances(ctx) {
|
||||
if mw.ShouldSkip(ruleID, now, lset) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// MutedBy returns the IDs of all active maintenance windows currently
|
||||
// suppressing the alert identified by lset. It is used to populate the
|
||||
// `mutedBy` field on the v2 API alert response so that maintenance-suppressed
|
||||
// alerts surface as `state=suppressed` in GetAlerts responses.
|
||||
func (m *MaintenanceMuter) MutedBy(ctx context.Context, lset model.LabelSet) []string {
|
||||
ruleID := string(lset[ruletypes.AlertRuleIDLabel])
|
||||
if ruleID == "" {
|
||||
return nil
|
||||
}
|
||||
var ids []string
|
||||
now := time.Now()
|
||||
for _, mw := range m.getMaintenances(ctx) {
|
||||
if mw.ShouldSkip(ruleID, now, lset) {
|
||||
ids = append(ids, mw.ID.String())
|
||||
}
|
||||
}
|
||||
return ids
|
||||
}
|
||||
|
||||
func (m *MaintenanceMuter) getMaintenances(ctx context.Context) []*alertmanagertypes.PlannedMaintenance {
|
||||
m.mu.RLock()
|
||||
if time.Now().Before(m.cacheExpiry) {
|
||||
cached := m.cached
|
||||
m.mu.RUnlock()
|
||||
return cached
|
||||
}
|
||||
m.mu.RUnlock()
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
// Double-check after acquiring write lock.
|
||||
if time.Now().Before(m.cacheExpiry) {
|
||||
return m.cached
|
||||
}
|
||||
|
||||
mws, err := m.maintenanceStore.ListPlannedMaintenance(ctx, m.orgID)
|
||||
if err != nil {
|
||||
m.logger.ErrorContext(ctx, "failed to list planned maintenance windows; alerts will not be suppressed", slog.String("org_id", m.orgID))
|
||||
return m.cached // return stale (potentially empty) cache on error
|
||||
}
|
||||
m.cached = mws
|
||||
m.cacheExpiry = time.Now().Add(maintenanceCacheTTL)
|
||||
return m.cached
|
||||
}
|
||||
246
pkg/alertmanager/alertmanagerserver/maintenance_muter_test.go
Normal file
246
pkg/alertmanager/alertmanagerserver/maintenance_muter_test.go
Normal file
@@ -0,0 +1,246 @@
|
||||
package alertmanagerserver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"log/slog"
|
||||
"sort"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
)
|
||||
|
||||
// fakeMaintenanceStore implements alertmanagertypes.MaintenanceStore with an in-memory list.
|
||||
// It counts ListPlannedMaintenance calls so tests can assert caching behavior.
|
||||
type fakeMaintenanceStore struct {
|
||||
mu sync.Mutex
|
||||
items []*alertmanagertypes.PlannedMaintenance
|
||||
err error
|
||||
calls int
|
||||
}
|
||||
|
||||
func (s *fakeMaintenanceStore) ListPlannedMaintenance(_ context.Context, _ string) ([]*alertmanagertypes.PlannedMaintenance, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.calls++
|
||||
if s.err != nil {
|
||||
return nil, s.err
|
||||
}
|
||||
return s.items, nil
|
||||
}
|
||||
|
||||
func (s *fakeMaintenanceStore) callCount() int {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
return s.calls
|
||||
}
|
||||
|
||||
func (s *fakeMaintenanceStore) setError(err error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.err = err
|
||||
}
|
||||
|
||||
// Remaining MaintenanceStore methods — unused by MaintenanceMuter.
|
||||
func (s *fakeMaintenanceStore) CreatePlannedMaintenance(context.Context, *alertmanagertypes.PostablePlannedMaintenance) (*alertmanagertypes.PlannedMaintenance, error) {
|
||||
return nil, nil
|
||||
}
|
||||
func (s *fakeMaintenanceStore) DeletePlannedMaintenance(context.Context, valuer.UUID) error {
|
||||
return nil
|
||||
}
|
||||
func (s *fakeMaintenanceStore) GetPlannedMaintenanceByID(context.Context, valuer.UUID) (*alertmanagertypes.PlannedMaintenance, error) {
|
||||
return nil, nil
|
||||
}
|
||||
func (s *fakeMaintenanceStore) UpdatePlannedMaintenance(context.Context, *alertmanagertypes.PostablePlannedMaintenance, valuer.UUID) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func newMuter(t *testing.T, items ...*alertmanagertypes.PlannedMaintenance) (*MaintenanceMuter, *fakeMaintenanceStore) {
|
||||
t.Helper()
|
||||
store := &fakeMaintenanceStore{items: items}
|
||||
muter := NewMaintenanceMuter(store, "org-1", slog.New(slog.DiscardHandler))
|
||||
return muter, store
|
||||
}
|
||||
|
||||
// activeFixed builds a fixed-time maintenance window that brackets now.
|
||||
// ruleIDs scope the window; an empty slice matches every rule.
|
||||
func activeFixed(ruleIDs ...string) *alertmanagertypes.PlannedMaintenance {
|
||||
now := time.Now().UTC()
|
||||
return &alertmanagertypes.PlannedMaintenance{
|
||||
ID: valuer.GenerateUUID(),
|
||||
Schedule: &alertmanagertypes.Schedule{
|
||||
Timezone: "UTC",
|
||||
StartTime: now.Add(-time.Hour),
|
||||
EndTime: now.Add(time.Hour),
|
||||
},
|
||||
RuleIDs: ruleIDs,
|
||||
}
|
||||
}
|
||||
|
||||
// futureFixed builds a fixed-time maintenance window that starts in the future.
|
||||
func futureFixed(ruleIDs ...string) *alertmanagertypes.PlannedMaintenance {
|
||||
now := time.Now().UTC()
|
||||
return &alertmanagertypes.PlannedMaintenance{
|
||||
ID: valuer.GenerateUUID(),
|
||||
Schedule: &alertmanagertypes.Schedule{
|
||||
Timezone: "UTC",
|
||||
StartTime: now.Add(time.Hour),
|
||||
EndTime: now.Add(2 * time.Hour),
|
||||
},
|
||||
RuleIDs: ruleIDs,
|
||||
}
|
||||
}
|
||||
|
||||
func labelsFor(ruleID string) model.LabelSet {
|
||||
return model.LabelSet{model.LabelName(ruletypes.AlertRuleIDLabel): model.LabelValue(ruleID)}
|
||||
}
|
||||
|
||||
func TestMutes_EmptyRuleIDLabel(t *testing.T) {
|
||||
muter, store := newMuter(t, activeFixed())
|
||||
assert.False(t, muter.Mutes(context.Background(), model.LabelSet{}))
|
||||
// Short-circuit: no store lookup needed when the label is missing.
|
||||
assert.Equal(t, 0, store.callCount())
|
||||
}
|
||||
|
||||
func TestMutes_NoMaintenanceWindows(t *testing.T) {
|
||||
muter, _ := newMuter(t)
|
||||
assert.False(t, muter.Mutes(context.Background(), labelsFor("rule-1")))
|
||||
}
|
||||
|
||||
func TestMutes_MatchingRule(t *testing.T) {
|
||||
mw := activeFixed("rule-1", "rule-2")
|
||||
muter, _ := newMuter(t, mw)
|
||||
assert.True(t, muter.Mutes(context.Background(), labelsFor("rule-1")))
|
||||
assert.True(t, muter.Mutes(context.Background(), labelsFor("rule-2")))
|
||||
}
|
||||
|
||||
func TestMutes_NonMatchingRule(t *testing.T) {
|
||||
muter, _ := newMuter(t, activeFixed("rule-1"))
|
||||
assert.False(t, muter.Mutes(context.Background(), labelsFor("rule-other")))
|
||||
}
|
||||
|
||||
func TestMutes_EmptyRuleIDsMatchesAllRules(t *testing.T) {
|
||||
// A maintenance with no RuleIDs is treated as scoping every rule.
|
||||
muter, _ := newMuter(t, activeFixed())
|
||||
assert.True(t, muter.Mutes(context.Background(), labelsFor("any-rule")))
|
||||
}
|
||||
|
||||
func TestMutes_FutureWindowDoesNotMute(t *testing.T) {
|
||||
muter, _ := newMuter(t, futureFixed("rule-1"))
|
||||
assert.False(t, muter.Mutes(context.Background(), labelsFor("rule-1")))
|
||||
}
|
||||
|
||||
func TestMutes_AnyOfMultipleWindowsMatches(t *testing.T) {
|
||||
muter, _ := newMuter(t,
|
||||
futureFixed("rule-1"),
|
||||
activeFixed("rule-1"),
|
||||
)
|
||||
assert.True(t, muter.Mutes(context.Background(), labelsFor("rule-1")))
|
||||
}
|
||||
|
||||
func TestMutedBy_EmptyRuleIDLabel(t *testing.T) {
|
||||
muter, store := newMuter(t, activeFixed())
|
||||
assert.Nil(t, muter.MutedBy(context.Background(), model.LabelSet{}))
|
||||
assert.Equal(t, 0, store.callCount())
|
||||
}
|
||||
|
||||
func TestMutedBy_NoMatches(t *testing.T) {
|
||||
muter, _ := newMuter(t, activeFixed("rule-1"), futureFixed("rule-1"))
|
||||
assert.Nil(t, muter.MutedBy(context.Background(), labelsFor("rule-other")))
|
||||
}
|
||||
|
||||
func TestMutedBy_ReturnsIDsOfAllActiveMatchingWindows(t *testing.T) {
|
||||
mw1 := activeFixed("rule-1")
|
||||
mw2 := activeFixed() // matches all rules
|
||||
mw3 := futureFixed("rule-1")
|
||||
mw4 := activeFixed("rule-other")
|
||||
|
||||
muter, _ := newMuter(t, mw1, mw2, mw3, mw4)
|
||||
ids := muter.MutedBy(context.Background(), labelsFor("rule-1"))
|
||||
|
||||
want := []string{mw1.ID.String(), mw2.ID.String()}
|
||||
sort.Strings(want)
|
||||
sort.Strings(ids)
|
||||
assert.Equal(t, want, ids)
|
||||
}
|
||||
|
||||
func TestCache_RepeatedCallsHitStoreOnce(t *testing.T) {
|
||||
muter, store := newMuter(t, activeFixed("rule-1"))
|
||||
ctx := context.Background()
|
||||
for i := 0; i < 5; i++ {
|
||||
require.True(t, muter.Mutes(ctx, labelsFor("rule-1")))
|
||||
}
|
||||
assert.Equal(t, 1, store.callCount(), "cached results should suppress further store lookups within TTL")
|
||||
}
|
||||
|
||||
func TestCache_StoreErrorReturnsStaleCache(t *testing.T) {
|
||||
mw := activeFixed("rule-1")
|
||||
muter, store := newMuter(t, mw)
|
||||
ctx := context.Background()
|
||||
|
||||
// First call populates the cache from a working store.
|
||||
require.True(t, muter.Mutes(ctx, labelsFor("rule-1")))
|
||||
require.Equal(t, 1, store.callCount())
|
||||
|
||||
// Force cache to be considered expired so the next call re-fetches.
|
||||
muter.mu.Lock()
|
||||
muter.cacheExpiry = time.Time{}
|
||||
muter.mu.Unlock()
|
||||
|
||||
// Store now errors. The muter should fall back to the previously cached value
|
||||
// (i.e. still mute rule-1) rather than returning false.
|
||||
store.setError(errors.New("boom"))
|
||||
assert.True(t, muter.Mutes(ctx, labelsFor("rule-1")),
|
||||
"on store error, muter should keep using the last known cache to avoid losing suppression")
|
||||
assert.Equal(t, 2, store.callCount())
|
||||
}
|
||||
|
||||
func TestCache_ExpiredCacheRefetchesUpdatedData(t *testing.T) {
|
||||
mw := activeFixed("rule-1")
|
||||
muter, store := newMuter(t, mw)
|
||||
ctx := context.Background()
|
||||
|
||||
require.True(t, muter.Mutes(ctx, labelsFor("rule-1")))
|
||||
require.Equal(t, 1, store.callCount())
|
||||
|
||||
// Drop the maintenance window from the store and expire the cache.
|
||||
store.mu.Lock()
|
||||
store.items = nil
|
||||
store.mu.Unlock()
|
||||
muter.mu.Lock()
|
||||
muter.cacheExpiry = time.Time{}
|
||||
muter.mu.Unlock()
|
||||
|
||||
assert.False(t, muter.Mutes(ctx, labelsFor("rule-1")))
|
||||
assert.Equal(t, 2, store.callCount())
|
||||
}
|
||||
|
||||
func TestMutes_IsConcurrencySafe(t *testing.T) {
|
||||
muter, store := newMuter(t, activeFixed("rule-1"))
|
||||
ctx := context.Background()
|
||||
|
||||
const goroutines = 32
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(goroutines)
|
||||
for i := 0; i < goroutines; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for j := 0; j < 50; j++ {
|
||||
_ = muter.Mutes(ctx, labelsFor("rule-1"))
|
||||
_ = muter.MutedBy(ctx, labelsFor("rule-1"))
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Even under contention the cache must collapse the load to a single fetch.
|
||||
assert.Equal(t, 1, store.callCount())
|
||||
}
|
||||
109
pkg/alertmanager/alertmanagerserver/pipeline_builder.go
Normal file
109
pkg/alertmanager/alertmanagerserver/pipeline_builder.go
Normal file
@@ -0,0 +1,109 @@
|
||||
// Copyright (c) 2026 SigNoz, Inc.
|
||||
// Copyright 2015 Prometheus Team
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package alertmanagerserver
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/alertmanager/featurecontrol"
|
||||
"github.com/prometheus/alertmanager/inhibit"
|
||||
"github.com/prometheus/alertmanager/nflog/nflogpb"
|
||||
"github.com/prometheus/alertmanager/notify"
|
||||
"github.com/prometheus/alertmanager/silence"
|
||||
"github.com/prometheus/alertmanager/timeinterval"
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// pipelineBuilder is a local copy of notify.PipelineBuilder that injects
|
||||
// the maintenance mute stage immediately before the receiver stage.
|
||||
//
|
||||
// We maintain our own copy so we can control exactly where in the pipeline
|
||||
// the maintenance stage runs (between the silence stage and the receiver),
|
||||
// which is not possible by wrapping the output of the upstream builder.
|
||||
//
|
||||
// Upstream pipeline order:
|
||||
// GossipSettle → Inhibit → TimeActive → TimeMute → Silence → [mms] → Receiver
|
||||
type pipelineBuilder struct {
|
||||
metrics *notify.Metrics
|
||||
ff featurecontrol.Flagger
|
||||
}
|
||||
|
||||
func newPipelineBuilder(
|
||||
r prometheus.Registerer,
|
||||
ff featurecontrol.Flagger,
|
||||
) *pipelineBuilder {
|
||||
return &pipelineBuilder{
|
||||
metrics: notify.NewMetrics(r, ff),
|
||||
ff: ff,
|
||||
}
|
||||
}
|
||||
|
||||
// New returns a map of receivers to Stages, mirroring notify.PipelineBuilder.New
|
||||
// but inserting a maintenanceMuteStage between the silence stage and the receiver.
|
||||
func (pb *pipelineBuilder) New(
|
||||
receivers map[string][]notify.Integration,
|
||||
wait func() time.Duration,
|
||||
inhibitor *inhibit.Inhibitor,
|
||||
silencer *silence.Silencer,
|
||||
intervener *timeinterval.Intervener,
|
||||
marker types.GroupMarker,
|
||||
muter *MaintenanceMuter,
|
||||
notificationLog notify.NotificationLog,
|
||||
peer notify.Peer,
|
||||
) notify.RoutingStage {
|
||||
rs := make(notify.RoutingStage, len(receivers))
|
||||
|
||||
ms := notify.NewGossipSettleStage(peer)
|
||||
is := notify.NewMuteStage(inhibitor, pb.metrics)
|
||||
tas := notify.NewTimeActiveStage(intervener, marker, pb.metrics)
|
||||
tms := notify.NewTimeMuteStage(intervener, marker, pb.metrics)
|
||||
ss := notify.NewMuteStage(silencer, pb.metrics)
|
||||
mms := notify.NewMuteStage(muter, pb.metrics)
|
||||
|
||||
for name := range receivers {
|
||||
stages := notify.MultiStage{ms, is, tas, tms, ss, mms}
|
||||
stages = append(stages, createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics))
|
||||
rs[name] = stages
|
||||
}
|
||||
|
||||
pb.metrics.InitializeFor(receivers)
|
||||
return rs
|
||||
}
|
||||
|
||||
// createReceiverStage is a copy of notify.createReceiverStage (unexported upstream).
|
||||
func createReceiverStage(
|
||||
name string,
|
||||
integrations []notify.Integration,
|
||||
wait func() time.Duration,
|
||||
notificationLog notify.NotificationLog,
|
||||
metrics *notify.Metrics,
|
||||
) notify.Stage {
|
||||
var fs notify.FanoutStage
|
||||
for i := range integrations {
|
||||
recv := &nflogpb.Receiver{
|
||||
GroupName: name,
|
||||
Integration: integrations[i].Name(),
|
||||
Idx: uint32(integrations[i].Index()),
|
||||
}
|
||||
var s notify.MultiStage
|
||||
s = append(s, notify.NewWaitStage(wait))
|
||||
s = append(s, notify.NewDedupStage(&integrations[i], notificationLog, recv))
|
||||
s = append(s, notify.NewRetryStage(integrations[i], name, metrics))
|
||||
s = append(s, notify.NewSetNotifiesStage(notificationLog, recv))
|
||||
fs = append(fs, s)
|
||||
}
|
||||
return fs
|
||||
}
|
||||
@@ -28,12 +28,10 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
)
|
||||
|
||||
var (
|
||||
// This is not a real file and will never be used. We need this placeholder to ensure maintenance runs on shutdown. See
|
||||
// https://github.com/prometheus/server/blob/3ee2cd0f1271e277295c02b6160507b4d193dde2/silence/silence.go#L435-L438
|
||||
// and https://github.com/prometheus/server/blob/3b06b97af4d146e141af92885a185891eb79a5b0/nflog/nflog.go#L362.
|
||||
snapfnoop string = "snapfnoop"
|
||||
)
|
||||
// This is not a real snapshot file and will never be used. We need this placeholder to ensure maintenance runs on shutdown.
|
||||
// See https://github.com/prometheus/alertmanager/blob/3ee2cd0f1271e277295c02b6160507b4d193dde2/silence/silence.go#L435-L438
|
||||
// and https://github.com/prometheus/alertmanager/blob/3b06b97af4d146e141af92885a185891eb79a5b0/nflog/nflog.go#L362.
|
||||
var snapfnoop string = "snapfnoop"
|
||||
|
||||
type Server struct {
|
||||
// logger is the logger for the alertmanager
|
||||
@@ -63,15 +61,25 @@ type Server struct {
|
||||
silencer *silence.Silencer
|
||||
silences *silence.Silences
|
||||
timeIntervals map[string][]timeinterval.TimeInterval
|
||||
pipelineBuilder *notify.PipelineBuilder
|
||||
marker *alertmanagertypes.MemMarker
|
||||
pipelineBuilder *pipelineBuilder
|
||||
muter *MaintenanceMuter
|
||||
marker *types.MemMarker
|
||||
tmpl *template.Template
|
||||
wg sync.WaitGroup
|
||||
stopc chan struct{}
|
||||
notificationManager nfmanager.NotificationManager
|
||||
}
|
||||
|
||||
func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registerer, srvConfig Config, orgID string, stateStore alertmanagertypes.StateStore, nfManager nfmanager.NotificationManager) (*Server, error) {
|
||||
func New(
|
||||
ctx context.Context,
|
||||
logger *slog.Logger,
|
||||
registry prometheus.Registerer,
|
||||
srvConfig Config,
|
||||
orgID string,
|
||||
stateStore alertmanagertypes.StateStore,
|
||||
nfManager nfmanager.NotificationManager,
|
||||
maintenanceStore alertmanagertypes.MaintenanceStore,
|
||||
) (*Server, error) {
|
||||
server := &Server{
|
||||
logger: logger.With(slog.String("pkg", "go.signoz.io/pkg/alertmanager/alertmanagerserver")),
|
||||
registry: registry,
|
||||
@@ -84,7 +92,7 @@ func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registere
|
||||
signozRegisterer := prometheus.WrapRegistererWithPrefix("signoz_", registry)
|
||||
signozRegisterer = prometheus.WrapRegistererWith(prometheus.Labels{"org_id": server.orgID}, signozRegisterer)
|
||||
// initialize marker
|
||||
server.marker = alertmanagertypes.NewMarker(signozRegisterer)
|
||||
server.marker = types.NewMarker(signozRegisterer)
|
||||
|
||||
// get silences for initial state
|
||||
state, err := server.stateStore.Get(ctx, server.orgID)
|
||||
@@ -160,7 +168,6 @@ func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registere
|
||||
|
||||
return c, server.stateStore.Set(ctx, storableSilences)
|
||||
})
|
||||
|
||||
}()
|
||||
|
||||
// Start maintenance for notification logs
|
||||
@@ -196,17 +203,25 @@ func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registere
|
||||
return nil, err
|
||||
}
|
||||
|
||||
server.pipelineBuilder = notify.NewPipelineBuilder(signozRegisterer, featurecontrol.NoopFlags{})
|
||||
server.muter = NewMaintenanceMuter(maintenanceStore, orgID, server.logger)
|
||||
server.pipelineBuilder = newPipelineBuilder(signozRegisterer, featurecontrol.NoopFlags{})
|
||||
server.dispatcherMetrics = NewDispatcherMetrics(false, signozRegisterer)
|
||||
|
||||
return server, nil
|
||||
}
|
||||
|
||||
func (server *Server) GetAlerts(ctx context.Context, params alertmanagertypes.GettableAlertsParams) (alertmanagertypes.GettableAlerts, error) {
|
||||
return alertmanagertypes.NewGettableAlertsFromAlertProvider(server.alerts, server.alertmanagerConfig, server.marker.Status, func(labels model.LabelSet) {
|
||||
server.inhibitor.Mutes(ctx, labels)
|
||||
server.silencer.Mutes(ctx, labels)
|
||||
}, params)
|
||||
return alertmanagertypes.NewGettableAlertsFromAlertProvider(
|
||||
server.alerts, server.alertmanagerConfig, server.marker.Status,
|
||||
func(labels model.LabelSet) {
|
||||
server.inhibitor.Mutes(ctx, labels)
|
||||
server.silencer.Mutes(ctx, labels)
|
||||
},
|
||||
func(labels model.LabelSet) []string {
|
||||
return server.muter.MutedBy(ctx, labels)
|
||||
},
|
||||
params,
|
||||
)
|
||||
}
|
||||
|
||||
func (server *Server) PutAlerts(ctx context.Context, postableAlerts alertmanagertypes.PostableAlerts) error {
|
||||
@@ -290,6 +305,7 @@ func (server *Server) SetConfig(ctx context.Context, alertmanagerConfig *alertma
|
||||
server.silencer,
|
||||
intervener,
|
||||
server.marker,
|
||||
server.muter,
|
||||
server.nflog,
|
||||
pipelinePeer,
|
||||
)
|
||||
|
||||
@@ -7,6 +7,10 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
sqlmock "github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore/sqlstoretest"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes/alertmanagertypestest"
|
||||
"github.com/prometheus/alertmanager/dispatch"
|
||||
|
||||
@@ -90,7 +94,8 @@ func TestEndToEndAlertManagerFlow(t *testing.T) {
|
||||
stateStore := alertmanagertypestest.NewStateStore()
|
||||
registry := prometheus.NewRegistry()
|
||||
logger := slog.New(slog.DiscardHandler)
|
||||
server, err := New(context.Background(), logger, registry, srvCfg, orgID, stateStore, notificationManager)
|
||||
maintenanceStore := sqlalertmanagerstore.NewMaintenanceStore(sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual))
|
||||
server, err := New(context.Background(), logger, registry, srvCfg, orgID, stateStore, notificationManager, maintenanceStore)
|
||||
require.NoError(t, err)
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, orgID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@@ -10,7 +10,11 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore/sqlstoretest"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes/alertmanagertypestest"
|
||||
"github.com/go-openapi/strfmt"
|
||||
@@ -23,9 +27,14 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func newTestMaintenanceStore() alertmanagertypes.MaintenanceStore {
|
||||
ss := sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual)
|
||||
return sqlalertmanagerstore.NewMaintenanceStore(ss)
|
||||
}
|
||||
|
||||
func TestServerSetConfigAndStop(t *testing.T) {
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager, newTestMaintenanceStore())
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(alertmanagertypes.GlobalConfig{}, alertmanagertypes.RouteConfig{GroupInterval: 1 * time.Minute, RepeatInterval: 1 * time.Minute, GroupWait: 1 * time.Minute}, "1")
|
||||
@@ -37,7 +46,7 @@ func TestServerSetConfigAndStop(t *testing.T) {
|
||||
|
||||
func TestServerTestReceiverTypeWebhook(t *testing.T) {
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager, newTestMaintenanceStore())
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(alertmanagertypes.GlobalConfig{}, alertmanagertypes.RouteConfig{GroupInterval: 1 * time.Minute, RepeatInterval: 1 * time.Minute, GroupWait: 1 * time.Minute}, "1")
|
||||
@@ -85,7 +94,7 @@ func TestServerPutAlerts(t *testing.T) {
|
||||
srvCfg := NewConfig()
|
||||
srvCfg.Route.GroupInterval = 1 * time.Second
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager, newTestMaintenanceStore())
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1")
|
||||
@@ -133,7 +142,7 @@ func TestServerTestAlert(t *testing.T) {
|
||||
srvCfg := NewConfig()
|
||||
srvCfg.Route.GroupInterval = 1 * time.Second
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager, newTestMaintenanceStore())
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1")
|
||||
@@ -238,7 +247,7 @@ func TestServerTestAlertContinuesOnFailure(t *testing.T) {
|
||||
srvCfg := NewConfig()
|
||||
srvCfg.Route.GroupInterval = 1 * time.Second
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager)
|
||||
server, err := New(context.Background(), slog.New(slog.DiscardHandler), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager, newTestMaintenanceStore())
|
||||
require.NoError(t, err)
|
||||
|
||||
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package sqlrulestore
|
||||
package sqlalertmanagerstore
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -7,8 +7,8 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/errors"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
"github.com/SigNoz/signoz/pkg/types"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/authtypes"
|
||||
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
)
|
||||
|
||||
@@ -16,12 +16,12 @@ type maintenance struct {
|
||||
sqlstore sqlstore.SQLStore
|
||||
}
|
||||
|
||||
func NewMaintenanceStore(store sqlstore.SQLStore) ruletypes.MaintenanceStore {
|
||||
func NewMaintenanceStore(store sqlstore.SQLStore) alertmanagertypes.MaintenanceStore {
|
||||
return &maintenance{sqlstore: store}
|
||||
}
|
||||
|
||||
func (r *maintenance) ListPlannedMaintenance(ctx context.Context, orgID string) ([]*ruletypes.PlannedMaintenance, error) {
|
||||
gettableMaintenancesRules := make([]*ruletypes.PlannedMaintenanceWithRules, 0)
|
||||
func (r *maintenance) ListPlannedMaintenance(ctx context.Context, orgID string) ([]*alertmanagertypes.PlannedMaintenance, error) {
|
||||
gettableMaintenancesRules := make([]*alertmanagertypes.PlannedMaintenanceWithRules, 0)
|
||||
err := r.sqlstore.
|
||||
BunDB().
|
||||
NewSelect().
|
||||
@@ -33,7 +33,7 @@ func (r *maintenance) ListPlannedMaintenance(ctx context.Context, orgID string)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
gettablePlannedMaintenance := make([]*ruletypes.PlannedMaintenance, 0)
|
||||
gettablePlannedMaintenance := make([]*alertmanagertypes.PlannedMaintenance, 0)
|
||||
for _, gettableMaintenancesRule := range gettableMaintenancesRules {
|
||||
gettablePlannedMaintenance = append(gettablePlannedMaintenance, gettableMaintenancesRule.ToPlannedMaintenance())
|
||||
}
|
||||
@@ -41,8 +41,8 @@ func (r *maintenance) ListPlannedMaintenance(ctx context.Context, orgID string)
|
||||
return gettablePlannedMaintenance, nil
|
||||
}
|
||||
|
||||
func (r *maintenance) GetPlannedMaintenanceByID(ctx context.Context, id valuer.UUID) (*ruletypes.PlannedMaintenance, error) {
|
||||
storableMaintenanceRule := new(ruletypes.PlannedMaintenanceWithRules)
|
||||
func (r *maintenance) GetPlannedMaintenanceByID(ctx context.Context, id valuer.UUID) (*alertmanagertypes.PlannedMaintenance, error) {
|
||||
storableMaintenanceRule := new(alertmanagertypes.PlannedMaintenanceWithRules)
|
||||
err := r.sqlstore.
|
||||
BunDB().
|
||||
NewSelect().
|
||||
@@ -57,13 +57,13 @@ func (r *maintenance) GetPlannedMaintenanceByID(ctx context.Context, id valuer.U
|
||||
return storableMaintenanceRule.ToPlannedMaintenance(), nil
|
||||
}
|
||||
|
||||
func (r *maintenance) CreatePlannedMaintenance(ctx context.Context, maintenance *ruletypes.PostablePlannedMaintenance) (*ruletypes.PlannedMaintenance, error) {
|
||||
func (r *maintenance) CreatePlannedMaintenance(ctx context.Context, maintenance *alertmanagertypes.PostablePlannedMaintenance) (*alertmanagertypes.PlannedMaintenance, error) {
|
||||
claims, err := authtypes.ClaimsFromContext(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
storablePlannedMaintenance := ruletypes.StorablePlannedMaintenance{
|
||||
storablePlannedMaintenance := alertmanagertypes.StorablePlannedMaintenance{
|
||||
Identifiable: types.Identifiable{
|
||||
ID: valuer.GenerateUUID(),
|
||||
},
|
||||
@@ -75,20 +75,21 @@ func (r *maintenance) CreatePlannedMaintenance(ctx context.Context, maintenance
|
||||
CreatedBy: claims.Email,
|
||||
UpdatedBy: claims.Email,
|
||||
},
|
||||
Name: maintenance.Name,
|
||||
Description: maintenance.Description,
|
||||
Schedule: maintenance.Schedule,
|
||||
OrgID: claims.OrgID,
|
||||
Name: maintenance.Name,
|
||||
Description: maintenance.Description,
|
||||
Schedule: maintenance.Schedule,
|
||||
OrgID: claims.OrgID,
|
||||
LabelExpression: maintenance.LabelExpression,
|
||||
}
|
||||
|
||||
maintenanceRules := make([]*ruletypes.StorablePlannedMaintenanceRule, 0)
|
||||
maintenanceRules := make([]*alertmanagertypes.StorablePlannedMaintenanceRule, 0)
|
||||
for _, ruleIDStr := range maintenance.AlertIds {
|
||||
ruleID, err := valuer.NewUUID(ruleIDStr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
maintenanceRules = append(maintenanceRules, &ruletypes.StorablePlannedMaintenanceRule{
|
||||
maintenanceRules = append(maintenanceRules, &alertmanagertypes.StorablePlannedMaintenanceRule{
|
||||
Identifiable: types.Identifiable{
|
||||
ID: valuer.GenerateUUID(),
|
||||
},
|
||||
@@ -113,7 +114,6 @@ func (r *maintenance) CreatePlannedMaintenance(ctx context.Context, maintenance
|
||||
NewInsert().
|
||||
Model(&maintenanceRules).
|
||||
Exec(ctx)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -125,16 +125,17 @@ func (r *maintenance) CreatePlannedMaintenance(ctx context.Context, maintenance
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &ruletypes.PlannedMaintenance{
|
||||
ID: storablePlannedMaintenance.ID,
|
||||
Name: storablePlannedMaintenance.Name,
|
||||
Description: storablePlannedMaintenance.Description,
|
||||
Schedule: storablePlannedMaintenance.Schedule,
|
||||
RuleIDs: maintenance.AlertIds,
|
||||
CreatedAt: storablePlannedMaintenance.CreatedAt,
|
||||
CreatedBy: storablePlannedMaintenance.CreatedBy,
|
||||
UpdatedAt: storablePlannedMaintenance.UpdatedAt,
|
||||
UpdatedBy: storablePlannedMaintenance.UpdatedBy,
|
||||
return &alertmanagertypes.PlannedMaintenance{
|
||||
ID: storablePlannedMaintenance.ID,
|
||||
Name: storablePlannedMaintenance.Name,
|
||||
Description: storablePlannedMaintenance.Description,
|
||||
Schedule: storablePlannedMaintenance.Schedule,
|
||||
RuleIDs: maintenance.AlertIds,
|
||||
LabelExpression: maintenance.LabelExpression,
|
||||
CreatedAt: storablePlannedMaintenance.CreatedAt,
|
||||
CreatedBy: storablePlannedMaintenance.CreatedBy,
|
||||
UpdatedAt: storablePlannedMaintenance.UpdatedAt,
|
||||
UpdatedBy: storablePlannedMaintenance.UpdatedBy,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -142,7 +143,7 @@ func (r *maintenance) DeletePlannedMaintenance(ctx context.Context, id valuer.UU
|
||||
_, err := r.sqlstore.
|
||||
BunDB().
|
||||
NewDelete().
|
||||
Model(new(ruletypes.StorablePlannedMaintenance)).
|
||||
Model(new(alertmanagertypes.StorablePlannedMaintenance)).
|
||||
Where("id = ?", id.StringValue()).
|
||||
Exec(ctx)
|
||||
if err != nil {
|
||||
@@ -152,7 +153,7 @@ func (r *maintenance) DeletePlannedMaintenance(ctx context.Context, id valuer.UU
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *maintenance) UpdatePlannedMaintenance(ctx context.Context, maintenance *ruletypes.PostablePlannedMaintenance, id valuer.UUID) error {
|
||||
func (r *maintenance) UpdatePlannedMaintenance(ctx context.Context, maintenance *alertmanagertypes.PostablePlannedMaintenance, id valuer.UUID) error {
|
||||
claims, err := authtypes.ClaimsFromContext(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -163,7 +164,7 @@ func (r *maintenance) UpdatePlannedMaintenance(ctx context.Context, maintenance
|
||||
return err
|
||||
}
|
||||
|
||||
storablePlannedMaintenance := ruletypes.StorablePlannedMaintenance{
|
||||
storablePlannedMaintenance := alertmanagertypes.StorablePlannedMaintenance{
|
||||
Identifiable: types.Identifiable{
|
||||
ID: id,
|
||||
},
|
||||
@@ -175,20 +176,21 @@ func (r *maintenance) UpdatePlannedMaintenance(ctx context.Context, maintenance
|
||||
CreatedBy: existing.CreatedBy,
|
||||
UpdatedBy: claims.Email,
|
||||
},
|
||||
Name: maintenance.Name,
|
||||
Description: maintenance.Description,
|
||||
Schedule: maintenance.Schedule,
|
||||
OrgID: claims.OrgID,
|
||||
Name: maintenance.Name,
|
||||
Description: maintenance.Description,
|
||||
Schedule: maintenance.Schedule,
|
||||
OrgID: claims.OrgID,
|
||||
LabelExpression: maintenance.LabelExpression,
|
||||
}
|
||||
|
||||
storablePlannedMaintenanceRules := make([]*ruletypes.StorablePlannedMaintenanceRule, 0)
|
||||
storablePlannedMaintenanceRules := make([]*alertmanagertypes.StorablePlannedMaintenanceRule, 0)
|
||||
for _, ruleIDStr := range maintenance.AlertIds {
|
||||
ruleID, err := valuer.NewUUID(ruleIDStr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
storablePlannedMaintenanceRules = append(storablePlannedMaintenanceRules, &ruletypes.StorablePlannedMaintenanceRule{
|
||||
storablePlannedMaintenanceRules = append(storablePlannedMaintenanceRules, &alertmanagertypes.StorablePlannedMaintenanceRule{
|
||||
Identifiable: types.Identifiable{
|
||||
ID: valuer.GenerateUUID(),
|
||||
},
|
||||
@@ -211,10 +213,9 @@ func (r *maintenance) UpdatePlannedMaintenance(ctx context.Context, maintenance
|
||||
_, err = r.sqlstore.
|
||||
BunDBCtx(ctx).
|
||||
NewDelete().
|
||||
Model(new(ruletypes.StorablePlannedMaintenanceRule)).
|
||||
Model(new(alertmanagertypes.StorablePlannedMaintenanceRule)).
|
||||
Where("planned_maintenance_id = ?", storablePlannedMaintenance.ID.StringValue()).
|
||||
Exec(ctx)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -231,7 +232,6 @@ func (r *maintenance) UpdatePlannedMaintenance(ctx context.Context, maintenance
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -39,16 +39,18 @@ type Service struct {
|
||||
serversMtx sync.RWMutex
|
||||
|
||||
notificationManager nfmanager.NotificationManager
|
||||
|
||||
maintenanceStore alertmanagertypes.MaintenanceStore
|
||||
}
|
||||
|
||||
func New(
|
||||
ctx context.Context,
|
||||
settings factory.ScopedProviderSettings,
|
||||
config alertmanagerserver.Config,
|
||||
stateStore alertmanagertypes.StateStore,
|
||||
configStore alertmanagertypes.ConfigStore,
|
||||
orgGetter organization.Getter,
|
||||
nfManager nfmanager.NotificationManager,
|
||||
maintenanceStore alertmanagertypes.MaintenanceStore,
|
||||
) *Service {
|
||||
service := &Service{
|
||||
config: config,
|
||||
@@ -59,6 +61,7 @@ func New(
|
||||
servers: make(map[string]*alertmanagerserver.Server),
|
||||
serversMtx: sync.RWMutex{},
|
||||
notificationManager: nfManager,
|
||||
maintenanceStore: maintenanceStore,
|
||||
}
|
||||
|
||||
return service
|
||||
@@ -177,7 +180,10 @@ func (service *Service) newServer(ctx context.Context, orgID string) (*alertmana
|
||||
return nil, err
|
||||
}
|
||||
|
||||
server, err := alertmanagerserver.New(ctx, service.settings.Logger(), service.settings.PrometheusRegisterer(), service.config, orgID, service.stateStore, service.notificationManager)
|
||||
server, err := alertmanagerserver.New(
|
||||
ctx, service.settings.Logger(), service.settings.PrometheusRegisterer(), service.config, orgID,
|
||||
service.stateStore, service.notificationManager, service.maintenanceStore,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -4,11 +4,8 @@ import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/common/model"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/query-service/utils/labels"
|
||||
|
||||
amConfig "github.com/prometheus/alertmanager/config"
|
||||
"github.com/prometheus/common/model"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore"
|
||||
@@ -16,6 +13,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/errors"
|
||||
"github.com/SigNoz/signoz/pkg/factory"
|
||||
"github.com/SigNoz/signoz/pkg/modules/organization"
|
||||
"github.com/SigNoz/signoz/pkg/query-service/utils/labels"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
"github.com/SigNoz/signoz/pkg/types"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
@@ -30,35 +28,49 @@ type provider struct {
|
||||
configStore alertmanagertypes.ConfigStore
|
||||
stateStore alertmanagertypes.StateStore
|
||||
notificationManager nfmanager.NotificationManager
|
||||
maintenanceStore alertmanagertypes.MaintenanceStore
|
||||
stopC chan struct{}
|
||||
}
|
||||
|
||||
func NewFactory(sqlstore sqlstore.SQLStore, orgGetter organization.Getter, notificationManager nfmanager.NotificationManager) factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config] {
|
||||
func NewFactory(
|
||||
sqlstore sqlstore.SQLStore,
|
||||
orgGetter organization.Getter,
|
||||
notificationManager nfmanager.NotificationManager,
|
||||
maintenanceStore alertmanagertypes.MaintenanceStore,
|
||||
) factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config] {
|
||||
return factory.NewProviderFactory(factory.MustNewName("signoz"), func(ctx context.Context, settings factory.ProviderSettings, config alertmanager.Config) (alertmanager.Alertmanager, error) {
|
||||
return New(ctx, settings, config, sqlstore, orgGetter, notificationManager)
|
||||
return New(settings, config, sqlstore, orgGetter, notificationManager, maintenanceStore)
|
||||
})
|
||||
}
|
||||
|
||||
func New(ctx context.Context, providerSettings factory.ProviderSettings, config alertmanager.Config, sqlstore sqlstore.SQLStore, orgGetter organization.Getter, notificationManager nfmanager.NotificationManager) (*provider, error) {
|
||||
func New(
|
||||
providerSettings factory.ProviderSettings,
|
||||
config alertmanager.Config,
|
||||
sqlstore sqlstore.SQLStore,
|
||||
orgGetter organization.Getter,
|
||||
notificationManager nfmanager.NotificationManager,
|
||||
maintenanceStore alertmanagertypes.MaintenanceStore,
|
||||
) (*provider, error) {
|
||||
settings := factory.NewScopedProviderSettings(providerSettings, "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager")
|
||||
configStore := sqlalertmanagerstore.NewConfigStore(sqlstore)
|
||||
stateStore := sqlalertmanagerstore.NewStateStore(sqlstore)
|
||||
|
||||
p := &provider{
|
||||
service: alertmanager.New(
|
||||
ctx,
|
||||
settings,
|
||||
config.Signoz.Config,
|
||||
stateStore,
|
||||
configStore,
|
||||
orgGetter,
|
||||
notificationManager,
|
||||
maintenanceStore,
|
||||
),
|
||||
settings: settings,
|
||||
config: config,
|
||||
configStore: configStore,
|
||||
stateStore: stateStore,
|
||||
notificationManager: notificationManager,
|
||||
maintenanceStore: maintenanceStore,
|
||||
stopC: make(chan struct{}),
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/http/handler"
|
||||
"github.com/SigNoz/signoz/pkg/types"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
@@ -115,13 +116,29 @@ func (provider *provider) addRulerRoutes(router *mux.Router) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := router.Handle("/api/v2/rules/{id}/mute", handler.New(provider.authzMiddleware.EditAccess(provider.rulerHandler.MuteRule), handler.OpenAPIDef{
|
||||
ID: "MuteRule",
|
||||
Tags: []string{"rules"},
|
||||
Summary: "Mute alert rule",
|
||||
Description: "This endpoint mutes an alert rule until the provided endTime by creating a fixed-window planned maintenance scoped to that rule. Unmute by deleting the returned downtime schedule.",
|
||||
Request: new(alertmanagertypes.PostableMuteRule),
|
||||
RequestContentType: "application/json",
|
||||
Response: new(alertmanagertypes.PlannedMaintenance),
|
||||
ResponseContentType: "application/json",
|
||||
SuccessStatusCode: http.StatusCreated,
|
||||
ErrorStatusCodes: []int{http.StatusBadRequest, http.StatusNotFound},
|
||||
SecuritySchemes: newSecuritySchemes(types.RoleEditor),
|
||||
})).Methods(http.MethodPost).GetError(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := router.Handle("/api/v1/downtime_schedules", handler.New(provider.authzMiddleware.ViewAccess(provider.rulerHandler.ListDowntimeSchedules), handler.OpenAPIDef{
|
||||
ID: "ListDowntimeSchedules",
|
||||
Tags: []string{"downtimeschedules"},
|
||||
Summary: "List downtime schedules",
|
||||
Description: "This endpoint lists all planned maintenance / downtime schedules",
|
||||
RequestQuery: new(ruletypes.ListPlannedMaintenanceParams),
|
||||
Response: make([]*ruletypes.PlannedMaintenance, 0),
|
||||
RequestQuery: new(alertmanagertypes.ListPlannedMaintenanceParams),
|
||||
Response: make([]*alertmanagertypes.PlannedMaintenance, 0),
|
||||
ResponseContentType: "application/json",
|
||||
SuccessStatusCode: http.StatusOK,
|
||||
SecuritySchemes: newSecuritySchemes(types.RoleViewer),
|
||||
@@ -134,7 +151,7 @@ func (provider *provider) addRulerRoutes(router *mux.Router) error {
|
||||
Tags: []string{"downtimeschedules"},
|
||||
Summary: "Get downtime schedule by ID",
|
||||
Description: "This endpoint returns a downtime schedule by ID",
|
||||
Response: new(ruletypes.PlannedMaintenance),
|
||||
Response: new(alertmanagertypes.PlannedMaintenance),
|
||||
ResponseContentType: "application/json",
|
||||
SuccessStatusCode: http.StatusOK,
|
||||
ErrorStatusCodes: []int{http.StatusNotFound},
|
||||
@@ -148,9 +165,9 @@ func (provider *provider) addRulerRoutes(router *mux.Router) error {
|
||||
Tags: []string{"downtimeschedules"},
|
||||
Summary: "Create downtime schedule",
|
||||
Description: "This endpoint creates a new planned maintenance / downtime schedule",
|
||||
Request: new(ruletypes.PostablePlannedMaintenance),
|
||||
Request: new(alertmanagertypes.PostablePlannedMaintenance),
|
||||
RequestContentType: "application/json",
|
||||
Response: new(ruletypes.PlannedMaintenance),
|
||||
Response: new(alertmanagertypes.PlannedMaintenance),
|
||||
ResponseContentType: "application/json",
|
||||
SuccessStatusCode: http.StatusCreated,
|
||||
ErrorStatusCodes: []int{http.StatusBadRequest},
|
||||
@@ -164,7 +181,7 @@ func (provider *provider) addRulerRoutes(router *mux.Router) error {
|
||||
Tags: []string{"downtimeschedules"},
|
||||
Summary: "Update downtime schedule",
|
||||
Description: "This endpoint updates a downtime schedule by ID",
|
||||
Request: new(ruletypes.PostablePlannedMaintenance),
|
||||
Request: new(alertmanagertypes.PostablePlannedMaintenance),
|
||||
RequestContentType: "application/json",
|
||||
SuccessStatusCode: http.StatusNoContent,
|
||||
ErrorStatusCodes: []int{http.StatusBadRequest, http.StatusNotFound},
|
||||
|
||||
@@ -2,6 +2,8 @@ package envprovider
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/config"
|
||||
@@ -9,7 +11,21 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// clearSignozEnv unsets all existing SIGNOZ_* env vars for the duration of the test.
|
||||
func clearSignozEnv(t *testing.T) {
|
||||
t.Helper()
|
||||
for _, kv := range os.Environ() {
|
||||
if strings.HasPrefix(kv, prefix) {
|
||||
key := strings.SplitN(kv, "=", 2)[0]
|
||||
orig, _ := os.LookupEnv(key)
|
||||
os.Unsetenv(key)
|
||||
t.Cleanup(func() { os.Setenv(key, orig) })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetWithStrings(t *testing.T) {
|
||||
clearSignozEnv(t)
|
||||
t.Setenv("SIGNOZ_K1_K2", "string")
|
||||
t.Setenv("SIGNOZ_K3__K4", "string")
|
||||
t.Setenv("SIGNOZ_K5__K6_K7__K8", "string")
|
||||
@@ -31,6 +47,7 @@ func TestGetWithStrings(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestGetWithNoPrefix(t *testing.T) {
|
||||
clearSignozEnv(t)
|
||||
t.Setenv("K1_K2", "string")
|
||||
t.Setenv("K3_K4", "string")
|
||||
expected := map[string]any{}
|
||||
@@ -43,6 +60,7 @@ func TestGetWithNoPrefix(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestGetWithGoTypes(t *testing.T) {
|
||||
clearSignozEnv(t)
|
||||
t.Setenv("SIGNOZ_BOOL", "true")
|
||||
t.Setenv("SIGNOZ_STRING", "string")
|
||||
t.Setenv("SIGNOZ_INT", "1")
|
||||
|
||||
@@ -32,30 +32,28 @@ import (
|
||||
)
|
||||
|
||||
type PrepareTaskOptions struct {
|
||||
Rule *ruletypes.PostableRule
|
||||
TaskName string
|
||||
RuleStore ruletypes.RuleStore
|
||||
MaintenanceStore ruletypes.MaintenanceStore
|
||||
Querier querier.Querier
|
||||
Logger *slog.Logger
|
||||
Cache cache.Cache
|
||||
ManagerOpts *ManagerOptions
|
||||
NotifyFunc NotifyFunc
|
||||
SQLStore sqlstore.SQLStore
|
||||
OrgID valuer.UUID
|
||||
Rule *ruletypes.PostableRule
|
||||
TaskName string
|
||||
RuleStore ruletypes.RuleStore
|
||||
Querier querier.Querier
|
||||
Logger *slog.Logger
|
||||
Cache cache.Cache
|
||||
ManagerOpts *ManagerOptions
|
||||
NotifyFunc NotifyFunc
|
||||
SQLStore sqlstore.SQLStore
|
||||
OrgID valuer.UUID
|
||||
}
|
||||
|
||||
type PrepareTestRuleOptions struct {
|
||||
Rule *ruletypes.PostableRule
|
||||
RuleStore ruletypes.RuleStore
|
||||
MaintenanceStore ruletypes.MaintenanceStore
|
||||
Querier querier.Querier
|
||||
Logger *slog.Logger
|
||||
Cache cache.Cache
|
||||
ManagerOpts *ManagerOptions
|
||||
NotifyFunc NotifyFunc
|
||||
SQLStore sqlstore.SQLStore
|
||||
OrgID valuer.UUID
|
||||
Rule *ruletypes.PostableRule
|
||||
RuleStore ruletypes.RuleStore
|
||||
Querier querier.Querier
|
||||
Logger *slog.Logger
|
||||
Cache cache.Cache
|
||||
ManagerOpts *ManagerOptions
|
||||
NotifyFunc NotifyFunc
|
||||
SQLStore sqlstore.SQLStore
|
||||
OrgID valuer.UUID
|
||||
}
|
||||
|
||||
const taskNameSuffix = "webAppEditor"
|
||||
@@ -89,7 +87,7 @@ type ManagerOptions struct {
|
||||
Alertmanager alertmanager.Alertmanager
|
||||
OrgGetter organization.Getter
|
||||
RuleStore ruletypes.RuleStore
|
||||
MaintenanceStore ruletypes.MaintenanceStore
|
||||
MaintenanceStore alertmanagertypes.MaintenanceStore
|
||||
SQLStore sqlstore.SQLStore
|
||||
QueryParser queryparser.QueryParser
|
||||
}
|
||||
@@ -103,7 +101,7 @@ type Manager struct {
|
||||
block chan struct{}
|
||||
// datastore to store alert definitions
|
||||
ruleStore ruletypes.RuleStore
|
||||
maintenanceStore ruletypes.MaintenanceStore
|
||||
maintenanceStore alertmanagertypes.MaintenanceStore
|
||||
|
||||
logger *slog.Logger
|
||||
cache cache.Cache
|
||||
@@ -134,7 +132,6 @@ func defaultOptions(o *ManagerOptions) *ManagerOptions {
|
||||
}
|
||||
|
||||
func defaultPrepareTaskFunc(opts PrepareTaskOptions) (Task, error) {
|
||||
|
||||
rules := make([]Rule, 0)
|
||||
var task Task
|
||||
|
||||
@@ -159,7 +156,6 @@ func defaultPrepareTaskFunc(opts PrepareTaskOptions) (Task, error) {
|
||||
WithMetadataStore(opts.ManagerOpts.MetadataStore),
|
||||
WithRuleStateHistoryModule(opts.ManagerOpts.RuleStateHistoryModule),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return task, err
|
||||
}
|
||||
@@ -167,7 +163,7 @@ func defaultPrepareTaskFunc(opts PrepareTaskOptions) (Task, error) {
|
||||
rules = append(rules, tr)
|
||||
|
||||
// create ch rule task for evaluation
|
||||
task = newTask(TaskTypeCh, opts.TaskName, taskNameSuffix, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(TaskTypeCh, opts.TaskName, taskNameSuffix, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc)
|
||||
|
||||
} else if opts.Rule.RuleType == ruletypes.RuleTypeProm {
|
||||
|
||||
@@ -183,7 +179,6 @@ func defaultPrepareTaskFunc(opts PrepareTaskOptions) (Task, error) {
|
||||
WithMetadataStore(opts.ManagerOpts.MetadataStore),
|
||||
WithRuleStateHistoryModule(opts.ManagerOpts.RuleStateHistoryModule),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
return task, err
|
||||
}
|
||||
@@ -191,7 +186,7 @@ func defaultPrepareTaskFunc(opts PrepareTaskOptions) (Task, error) {
|
||||
rules = append(rules, pr)
|
||||
|
||||
// create promql rule task for evaluation
|
||||
task = newTask(TaskTypeProm, opts.TaskName, taskNameSuffix, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc, opts.MaintenanceStore, opts.OrgID)
|
||||
task = newTask(TaskTypeProm, opts.TaskName, taskNameSuffix, evaluation.GetFrequency().Duration(), rules, opts.ManagerOpts, opts.NotifyFunc)
|
||||
|
||||
} else {
|
||||
return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "unsupported rule type %s. Supported types: %s, %s", opts.Rule.RuleType, ruletypes.RuleTypeProm, ruletypes.RuleTypeThreshold)
|
||||
@@ -237,10 +232,11 @@ func (m *Manager) RuleStore() ruletypes.RuleStore {
|
||||
return m.ruleStore
|
||||
}
|
||||
|
||||
func (m *Manager) MaintenanceStore() ruletypes.MaintenanceStore {
|
||||
func (m *Manager) MaintenanceStore() alertmanagertypes.MaintenanceStore {
|
||||
return m.maintenanceStore
|
||||
}
|
||||
|
||||
// TODO(jatinderjit): remove (unused)?
|
||||
func (m *Manager) Pause(b bool) {
|
||||
m.mtx.Lock()
|
||||
defer m.mtx.Unlock()
|
||||
@@ -430,19 +426,17 @@ func (m *Manager) editTask(_ context.Context, orgID valuer.UUID, rule *ruletypes
|
||||
m.logger.Debug("editing a rule task", "name", taskName)
|
||||
|
||||
newTask, err := m.prepareTaskFunc(PrepareTaskOptions{
|
||||
Rule: rule,
|
||||
TaskName: taskName,
|
||||
RuleStore: m.ruleStore,
|
||||
MaintenanceStore: m.maintenanceStore,
|
||||
Querier: m.opts.Querier,
|
||||
Logger: m.opts.Logger,
|
||||
Cache: m.cache,
|
||||
ManagerOpts: m.opts,
|
||||
NotifyFunc: m.notifyFunc,
|
||||
SQLStore: m.sqlstore,
|
||||
OrgID: orgID,
|
||||
Rule: rule,
|
||||
TaskName: taskName,
|
||||
RuleStore: m.ruleStore,
|
||||
Querier: m.opts.Querier,
|
||||
Logger: m.opts.Logger,
|
||||
Cache: m.cache,
|
||||
ManagerOpts: m.opts,
|
||||
NotifyFunc: m.notifyFunc,
|
||||
SQLStore: m.sqlstore,
|
||||
OrgID: orgID,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
m.logger.Error("loading tasks failed", errors.Attr(err))
|
||||
return errors.NewInvalidInputf(errors.CodeInvalidInput, "error preparing rule with given parameters, previous rule set restored")
|
||||
@@ -643,19 +637,17 @@ func (m *Manager) addTask(_ context.Context, orgID valuer.UUID, rule *ruletypes.
|
||||
|
||||
m.logger.Debug("adding a new rule task", "name", taskName)
|
||||
newTask, err := m.prepareTaskFunc(PrepareTaskOptions{
|
||||
Rule: rule,
|
||||
TaskName: taskName,
|
||||
RuleStore: m.ruleStore,
|
||||
MaintenanceStore: m.maintenanceStore,
|
||||
Querier: m.opts.Querier,
|
||||
Logger: m.opts.Logger,
|
||||
Cache: m.cache,
|
||||
ManagerOpts: m.opts,
|
||||
NotifyFunc: m.notifyFunc,
|
||||
SQLStore: m.sqlstore,
|
||||
OrgID: orgID,
|
||||
Rule: rule,
|
||||
TaskName: taskName,
|
||||
RuleStore: m.ruleStore,
|
||||
Querier: m.opts.Querier,
|
||||
Logger: m.opts.Logger,
|
||||
Cache: m.cache,
|
||||
ManagerOpts: m.opts,
|
||||
NotifyFunc: m.notifyFunc,
|
||||
SQLStore: m.sqlstore,
|
||||
OrgID: orgID,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
m.logger.Error("creating rule task failed", "name", taskName, errors.Attr(err))
|
||||
return errors.NewInvalidInputf(errors.CodeInvalidInput, "error loading rules, previous rule set restored")
|
||||
@@ -703,7 +695,6 @@ func (m *Manager) RuleTasks() []Task {
|
||||
// RuleTasksWithoutLock returns the list of manager's rule tasks without
|
||||
// acquiring a lock on the manager.
|
||||
func (m *Manager) RuleTasksWithoutLock() []Task {
|
||||
|
||||
rgs := make([]Task, 0, len(m.tasks))
|
||||
for _, g := range m.tasks {
|
||||
rgs = append(rgs, g)
|
||||
@@ -897,7 +888,6 @@ func (m *Manager) GetRule(ctx context.Context, id valuer.UUID) (*ruletypes.Getta
|
||||
// the task state. For example - if a stored rule is disabled, then
|
||||
// there is no task running against it.
|
||||
func (m *Manager) syncRuleStateWithTask(ctx context.Context, orgID valuer.UUID, taskName string, rule *ruletypes.PostableRule) error {
|
||||
|
||||
if rule.Disabled {
|
||||
// check if rule has any task running
|
||||
if _, ok := m.tasks[taskName]; ok {
|
||||
@@ -1029,16 +1019,15 @@ func (m *Manager) TestNotification(ctx context.Context, orgID valuer.UUID, ruleS
|
||||
}
|
||||
|
||||
alertCount, err := m.prepareTestRuleFunc(PrepareTestRuleOptions{
|
||||
Rule: &parsedRule,
|
||||
RuleStore: m.ruleStore,
|
||||
MaintenanceStore: m.maintenanceStore,
|
||||
Querier: m.opts.Querier,
|
||||
Logger: m.opts.Logger,
|
||||
Cache: m.cache,
|
||||
ManagerOpts: m.opts,
|
||||
NotifyFunc: m.testNotifyFunc,
|
||||
SQLStore: m.sqlstore,
|
||||
OrgID: orgID,
|
||||
Rule: &parsedRule,
|
||||
RuleStore: m.ruleStore,
|
||||
Querier: m.opts.Querier,
|
||||
Logger: m.opts.Logger,
|
||||
Cache: m.cache,
|
||||
ManagerOpts: m.opts,
|
||||
NotifyFunc: m.testNotifyFunc,
|
||||
SQLStore: m.sqlstore,
|
||||
OrgID: orgID,
|
||||
})
|
||||
|
||||
return alertCount, err
|
||||
|
||||
@@ -15,7 +15,6 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/types/authtypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/ctxtypes"
|
||||
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
)
|
||||
|
||||
// PromRuleTask is a promql rule executor
|
||||
@@ -39,14 +38,11 @@ type PromRuleTask struct {
|
||||
pause bool
|
||||
logger *slog.Logger
|
||||
notify NotifyFunc
|
||||
|
||||
maintenanceStore ruletypes.MaintenanceStore
|
||||
orgID valuer.UUID
|
||||
}
|
||||
|
||||
// NewPromRuleTask holds rules that have promql condition
|
||||
// and evaluates the rule at a given frequency
|
||||
func NewPromRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc, maintenanceStore ruletypes.MaintenanceStore, orgID valuer.UUID) *PromRuleTask {
|
||||
func NewPromRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc) *PromRuleTask {
|
||||
opts.Logger.Info("initiating a new rule group", "name", name, "frequency", frequency)
|
||||
|
||||
if frequency == 0 {
|
||||
@@ -63,10 +59,8 @@ func NewPromRuleTask(name, file string, frequency time.Duration, rules []Rule, o
|
||||
seriesInPreviousEval: make([]map[string]plabels.Labels, len(rules)),
|
||||
done: make(chan struct{}),
|
||||
terminated: make(chan struct{}),
|
||||
notify: notify,
|
||||
maintenanceStore: maintenanceStore,
|
||||
logger: opts.Logger,
|
||||
orgID: orgID,
|
||||
notify: notify,
|
||||
logger: opts.Logger,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -330,30 +324,12 @@ func (g *PromRuleTask) Eval(ctx context.Context, ts time.Time) {
|
||||
}()
|
||||
|
||||
g.logger.InfoContext(ctx, "promql rule task", "name", g.name, "eval_started_at", ts)
|
||||
maintenance, err := g.maintenanceStore.ListPlannedMaintenance(ctx, g.orgID.StringValue())
|
||||
if err != nil {
|
||||
g.logger.ErrorContext(ctx, "error in processing sql query", errors.Attr(err))
|
||||
}
|
||||
|
||||
for i, rule := range g.rules {
|
||||
if rule == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
shouldSkip := false
|
||||
for _, m := range maintenance {
|
||||
g.logger.InfoContext(ctx, "checking if rule should be skipped", slog.String("rule.id", rule.ID()), slog.Any("maintenance", m))
|
||||
if m.ShouldSkip(rule.ID(), ts) {
|
||||
shouldSkip = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if shouldSkip {
|
||||
g.logger.InfoContext(ctx, "rule should be skipped", slog.String("rule.id", rule.ID()))
|
||||
continue
|
||||
}
|
||||
|
||||
select {
|
||||
case <-g.done:
|
||||
return
|
||||
|
||||
@@ -2,20 +2,18 @@ package rules
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"runtime/debug"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"log/slog"
|
||||
|
||||
opentracing "github.com/opentracing/opentracing-go"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/errors"
|
||||
"github.com/SigNoz/signoz/pkg/types/authtypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/ctxtypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
)
|
||||
|
||||
// RuleTask holds a rule (with composite queries)
|
||||
@@ -37,34 +35,28 @@ type RuleTask struct {
|
||||
|
||||
pause bool
|
||||
notify NotifyFunc
|
||||
|
||||
maintenanceStore ruletypes.MaintenanceStore
|
||||
orgID valuer.UUID
|
||||
}
|
||||
|
||||
const DefaultFrequency = 1 * time.Minute
|
||||
|
||||
// NewRuleTask makes a new RuleTask with the given name, options, and rules.
|
||||
func NewRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc, maintenanceStore ruletypes.MaintenanceStore, orgID valuer.UUID) *RuleTask {
|
||||
|
||||
func NewRuleTask(name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc) *RuleTask {
|
||||
if frequency == 0 {
|
||||
frequency = DefaultFrequency
|
||||
}
|
||||
opts.Logger.Info("initiating a new rule task", "name", name, "frequency", frequency)
|
||||
|
||||
return &RuleTask{
|
||||
name: name,
|
||||
file: file,
|
||||
pause: false,
|
||||
frequency: frequency,
|
||||
rules: rules,
|
||||
opts: opts,
|
||||
logger: opts.Logger,
|
||||
done: make(chan struct{}),
|
||||
terminated: make(chan struct{}),
|
||||
notify: notify,
|
||||
maintenanceStore: maintenanceStore,
|
||||
orgID: orgID,
|
||||
name: name,
|
||||
file: file,
|
||||
pause: false,
|
||||
frequency: frequency,
|
||||
rules: rules,
|
||||
opts: opts,
|
||||
logger: opts.Logger,
|
||||
done: make(chan struct{}),
|
||||
terminated: make(chan struct{}),
|
||||
notify: notify,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,6 +64,7 @@ func NewRuleTask(name, file string, frequency time.Duration, rules []Rule, opts
|
||||
func (g *RuleTask) Name() string { return g.name }
|
||||
|
||||
// Key returns the group key
|
||||
// TODO(jatinderjit): remove (unused)?
|
||||
func (g *RuleTask) Key() string {
|
||||
return g.name + ";" + g.file
|
||||
}
|
||||
@@ -83,7 +76,7 @@ func (g *RuleTask) Type() TaskType { return TaskTypeCh }
|
||||
func (g *RuleTask) Rules() []Rule { return g.rules }
|
||||
|
||||
// Interval returns the group's interval.
|
||||
// TODO: remove (unused)?
|
||||
// TODO(jatinderjit): remove (unused)?
|
||||
func (g *RuleTask) Interval() time.Duration { return g.frequency }
|
||||
|
||||
func (g *RuleTask) Pause(b bool) {
|
||||
@@ -261,7 +254,6 @@ func nameAndLabels(rule Rule) string {
|
||||
// Rules are matched based on their name and labels. If there are duplicates, the
|
||||
// first is matched with the first, second with the second etc.
|
||||
func (g *RuleTask) CopyState(fromTask Task) error {
|
||||
|
||||
from, ok := fromTask.(*RuleTask)
|
||||
if !ok {
|
||||
return errors.NewInternalf(errors.CodeInternal, "invalid from task for copy")
|
||||
@@ -306,7 +298,6 @@ func (g *RuleTask) CopyState(fromTask Task) error {
|
||||
|
||||
// Eval runs a single evaluation cycle in which all rules are evaluated sequentially.
|
||||
func (g *RuleTask) Eval(ctx context.Context, ts time.Time) {
|
||||
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
g.logger.ErrorContext(
|
||||
@@ -318,31 +309,11 @@ func (g *RuleTask) Eval(ctx context.Context, ts time.Time) {
|
||||
|
||||
g.logger.DebugContext(ctx, "rule task eval started", "name", g.name, "start_time", ts)
|
||||
|
||||
maintenance, err := g.maintenanceStore.ListPlannedMaintenance(ctx, g.orgID.StringValue())
|
||||
|
||||
if err != nil {
|
||||
g.logger.ErrorContext(ctx, "error in processing sql query", errors.Attr(err))
|
||||
}
|
||||
|
||||
for i, rule := range g.rules {
|
||||
if rule == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
shouldSkip := false
|
||||
for _, m := range maintenance {
|
||||
g.logger.InfoContext(ctx, "checking if rule should be skipped", slog.String("rule.id", rule.ID()), slog.Any("maintenance", m))
|
||||
if m.ShouldSkip(rule.ID(), ts) {
|
||||
shouldSkip = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if shouldSkip {
|
||||
g.logger.InfoContext(ctx, "rule should be skipped", slog.String("rule.id", rule.ID()))
|
||||
continue
|
||||
}
|
||||
|
||||
select {
|
||||
case <-g.done:
|
||||
return
|
||||
@@ -382,7 +353,6 @@ func (g *RuleTask) Eval(ctx context.Context, ts time.Time) {
|
||||
}
|
||||
|
||||
rule.SendAlerts(ctx, ts, g.opts.ResendDelay, g.frequency, g.notify)
|
||||
|
||||
}(i, rule)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,9 +3,6 @@ package rules
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
)
|
||||
|
||||
type TaskType string
|
||||
@@ -32,9 +29,9 @@ type Task interface {
|
||||
|
||||
// newTask returns an appropriate group for
|
||||
// rule type
|
||||
func newTask(taskType TaskType, name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc, maintenanceStore ruletypes.MaintenanceStore, orgID valuer.UUID) Task {
|
||||
func newTask(taskType TaskType, name, file string, frequency time.Duration, rules []Rule, opts *ManagerOptions, notify NotifyFunc) Task {
|
||||
if taskType == TaskTypeCh {
|
||||
return NewRuleTask(name, file, frequency, rules, opts, notify, maintenanceStore, orgID)
|
||||
return NewRuleTask(name, file, frequency, rules, opts, notify)
|
||||
}
|
||||
return NewPromRuleTask(name, file, frequency, rules, opts, notify, maintenanceStore, orgID)
|
||||
return NewPromRuleTask(name, file, frequency, rules, opts, notify)
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ type Handler interface {
|
||||
DeleteRuleByID(http.ResponseWriter, *http.Request)
|
||||
PatchRuleByID(http.ResponseWriter, *http.Request)
|
||||
TestRule(http.ResponseWriter, *http.Request)
|
||||
MuteRule(http.ResponseWriter, *http.Request)
|
||||
|
||||
ListDowntimeSchedules(http.ResponseWriter, *http.Request)
|
||||
GetDowntimeScheduleByID(http.ResponseWriter, *http.Request)
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/factory"
|
||||
"github.com/SigNoz/signoz/pkg/statsreporter"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
)
|
||||
@@ -44,5 +45,5 @@ type Ruler interface {
|
||||
// MaintenanceStore returns the store for planned maintenance / downtime schedules.
|
||||
// TODO: expose downtime CRUD as methods on Ruler directly instead of leaking the
|
||||
// store interface. The handler should not call store methods directly.
|
||||
MaintenanceStore() ruletypes.MaintenanceStore
|
||||
MaintenanceStore() alertmanagertypes.MaintenanceStore
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/http/binding"
|
||||
"github.com/SigNoz/signoz/pkg/http/render"
|
||||
"github.com/SigNoz/signoz/pkg/ruler"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/authtypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
@@ -184,6 +185,53 @@ func (handler *handler) TestRule(rw http.ResponseWriter, req *http.Request) {
|
||||
render.Success(rw, http.StatusOK, ruletypes.GettableTestRule{AlertCount: alertCount, Message: "notification sent"})
|
||||
}
|
||||
|
||||
// MuteRule mutes an alert rule until the provided endTime by creating a
|
||||
// fixed-window planned maintenance scoped to that rule. The created
|
||||
// PlannedMaintenance is returned so the client can later unmute by deleting
|
||||
// it via DELETE /api/v1/downtime_schedules/{id}.
|
||||
func (handler *handler) MuteRule(rw http.ResponseWriter, req *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
id, err := valuer.NewUUID(mux.Vars(req)["id"])
|
||||
if err != nil {
|
||||
render.Error(rw, errors.Newf(errors.TypeInvalidInput, errors.CodeInvalidInput, "id is not a valid uuid-v7"))
|
||||
return
|
||||
}
|
||||
|
||||
rule, err := handler.ruler.GetRule(ctx, id)
|
||||
if err != nil {
|
||||
render.Error(rw, err)
|
||||
return
|
||||
}
|
||||
|
||||
payload := new(ruletypes.PostableMuteRule)
|
||||
if err := binding.JSON.BindBody(req.Body, payload); err != nil {
|
||||
render.Error(rw, err)
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
if err := payload.Validate(now); err != nil {
|
||||
render.Error(rw, err)
|
||||
return
|
||||
}
|
||||
|
||||
schedule := payload.ToPostablePlannedMaintenance(id.StringValue(), rule.AlertName, now)
|
||||
if err := schedule.Validate(); err != nil {
|
||||
render.Error(rw, err)
|
||||
return
|
||||
}
|
||||
|
||||
created, err := handler.ruler.MaintenanceStore().CreatePlannedMaintenance(ctx, schedule)
|
||||
if err != nil {
|
||||
render.Error(rw, err)
|
||||
return
|
||||
}
|
||||
|
||||
render.Success(rw, http.StatusCreated, created)
|
||||
}
|
||||
|
||||
func (handler *handler) ListDowntimeSchedules(rw http.ResponseWriter, req *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
@@ -194,7 +242,7 @@ func (handler *handler) ListDowntimeSchedules(rw http.ResponseWriter, req *http.
|
||||
return
|
||||
}
|
||||
|
||||
var params ruletypes.ListPlannedMaintenanceParams
|
||||
var params alertmanagertypes.ListPlannedMaintenanceParams
|
||||
if err := binding.Query.BindQuery(req.URL.Query(), ¶ms); err != nil {
|
||||
render.Error(rw, err)
|
||||
return
|
||||
@@ -207,7 +255,7 @@ func (handler *handler) ListDowntimeSchedules(rw http.ResponseWriter, req *http.
|
||||
}
|
||||
|
||||
if params.Active != nil {
|
||||
activeSchedules := make([]*ruletypes.PlannedMaintenance, 0)
|
||||
activeSchedules := make([]*alertmanagertypes.PlannedMaintenance, 0)
|
||||
for _, schedule := range schedules {
|
||||
now := time.Now().In(time.FixedZone(schedule.Schedule.Timezone, 0))
|
||||
if schedule.IsActive(now) == *params.Active {
|
||||
@@ -218,7 +266,7 @@ func (handler *handler) ListDowntimeSchedules(rw http.ResponseWriter, req *http.
|
||||
}
|
||||
|
||||
if params.Recurring != nil {
|
||||
recurringSchedules := make([]*ruletypes.PlannedMaintenance, 0)
|
||||
recurringSchedules := make([]*alertmanagertypes.PlannedMaintenance, 0)
|
||||
for _, schedule := range schedules {
|
||||
if schedule.IsRecurring() == *params.Recurring {
|
||||
recurringSchedules = append(recurringSchedules, schedule)
|
||||
@@ -253,7 +301,7 @@ func (handler *handler) CreateDowntimeSchedule(rw http.ResponseWriter, req *http
|
||||
ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
schedule := new(ruletypes.PostablePlannedMaintenance)
|
||||
schedule := new(alertmanagertypes.PostablePlannedMaintenance)
|
||||
if err := binding.JSON.BindBody(req.Body, schedule); err != nil {
|
||||
render.Error(rw, err)
|
||||
return
|
||||
@@ -283,7 +331,7 @@ func (handler *handler) UpdateDowntimeScheduleByID(rw http.ResponseWriter, req *
|
||||
return
|
||||
}
|
||||
|
||||
schedule := new(ruletypes.PostablePlannedMaintenance)
|
||||
schedule := new(alertmanagertypes.PostablePlannedMaintenance)
|
||||
if err := binding.JSON.BindBody(req.Body, schedule); err != nil {
|
||||
render.Error(rw, err)
|
||||
return
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore"
|
||||
"github.com/SigNoz/signoz/pkg/cache"
|
||||
"github.com/SigNoz/signoz/pkg/factory"
|
||||
"github.com/SigNoz/signoz/pkg/modules/organization"
|
||||
@@ -16,6 +17,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/ruler/rulestore/sqlrulestore"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
"github.com/SigNoz/signoz/pkg/telemetrystore"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/telemetrytypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
@@ -44,7 +46,7 @@ func NewFactory(
|
||||
) factory.ProviderFactory[ruler.Ruler, ruler.Config] {
|
||||
return factory.NewProviderFactory(factory.MustNewName("signoz"), func(ctx context.Context, providerSettings factory.ProviderSettings, config ruler.Config) (ruler.Ruler, error) {
|
||||
ruleStore := sqlrulestore.NewRuleStore(sqlstore, queryParser, providerSettings)
|
||||
maintenanceStore := sqlrulestore.NewMaintenanceStore(sqlstore)
|
||||
maintenanceStore := sqlalertmanagerstore.NewMaintenanceStore(sqlstore)
|
||||
|
||||
managerOpts := &rules.ManagerOptions{
|
||||
TelemetryStore: telemetryStore,
|
||||
@@ -129,6 +131,6 @@ func (provider *provider) TestNotification(ctx context.Context, orgID valuer.UUI
|
||||
return provider.manager.TestNotification(ctx, orgID, ruleStr)
|
||||
}
|
||||
|
||||
func (provider *provider) MaintenanceStore() ruletypes.MaintenanceStore {
|
||||
func (provider *provider) MaintenanceStore() alertmanagertypes.MaintenanceStore {
|
||||
return provider.manager.MaintenanceStore()
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager"
|
||||
"github.com/SigNoz/signoz/pkg/emailing/emailingtest"
|
||||
@@ -39,7 +40,8 @@ func TestNewHandlers(t *testing.T) {
|
||||
orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlstore), sharder)
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
require.NoError(t, err)
|
||||
alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager)
|
||||
maintenanceStore := sqlalertmanagerstore.NewMaintenanceStore(sqlstore)
|
||||
alertmanager, err := signozalertmanager.New(providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager, maintenanceStore)
|
||||
require.NoError(t, err)
|
||||
tokenizer := tokenizertest.NewMockTokenizer(t)
|
||||
emailing := emailingtest.New()
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager"
|
||||
"github.com/SigNoz/signoz/pkg/emailing/emailingtest"
|
||||
@@ -40,7 +41,8 @@ func TestNewModules(t *testing.T) {
|
||||
orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlstore), sharder)
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
require.NoError(t, err)
|
||||
alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager)
|
||||
maintenanceStore := sqlalertmanagerstore.NewMaintenanceStore(sqlstore)
|
||||
alertmanager, err := signozalertmanager.New(providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager, maintenanceStore)
|
||||
require.NoError(t, err)
|
||||
tokenizer := tokenizertest.NewMockTokenizer(t)
|
||||
emailing := emailingtest.New()
|
||||
|
||||
@@ -200,6 +200,7 @@ func NewSQLMigrationProviderFactories(
|
||||
sqlmigration.NewAddServiceAccountManagedRoleTransactionsFactory(sqlstore),
|
||||
sqlmigration.NewAddSpanMapperFactory(sqlstore, sqlschema),
|
||||
sqlmigration.NewAddLLMPricingRulesFactory(sqlstore, sqlschema),
|
||||
sqlmigration.NewAddLabelExpressionToPlannedMaintenanceFactory(sqlstore, sqlschema),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -226,9 +227,14 @@ func NewNotificationManagerProviderFactories(routeStore alertmanagertypes.RouteS
|
||||
)
|
||||
}
|
||||
|
||||
func NewAlertmanagerProviderFactories(sqlstore sqlstore.SQLStore, orgGetter organization.Getter, nfManager nfmanager.NotificationManager) factory.NamedMap[factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config]] {
|
||||
func NewAlertmanagerProviderFactories(
|
||||
sqlstore sqlstore.SQLStore,
|
||||
orgGetter organization.Getter,
|
||||
nfManager nfmanager.NotificationManager,
|
||||
maintenanceStore alertmanagertypes.MaintenanceStore,
|
||||
) factory.NamedMap[factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config]] {
|
||||
return factory.MustNewNamedMap(
|
||||
signozalertmanager.NewFactory(sqlstore, orgGetter, nfManager),
|
||||
signozalertmanager.NewFactory(sqlstore, orgGetter, nfManager, maintenanceStore),
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest"
|
||||
"github.com/SigNoz/signoz/pkg/analytics"
|
||||
"github.com/SigNoz/signoz/pkg/flagger"
|
||||
@@ -59,9 +60,11 @@ func TestNewProviderFactories(t *testing.T) {
|
||||
})
|
||||
|
||||
assert.NotPanics(t, func() {
|
||||
orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual)), nil)
|
||||
store := sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual)
|
||||
orgGetter := implorganization.NewGetter(implorganization.NewStore(store), nil)
|
||||
notificationManager := nfmanagertest.NewMock()
|
||||
NewAlertmanagerProviderFactories(sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual), orgGetter, notificationManager)
|
||||
maintenanceStore := sqlalertmanagerstore.NewMaintenanceStore(store)
|
||||
NewAlertmanagerProviderFactories(store, orgGetter, notificationManager, maintenanceStore)
|
||||
})
|
||||
|
||||
assert.NotPanics(t, func() {
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"log/slog"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager"
|
||||
"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfroutingstore/sqlroutingstore"
|
||||
"github.com/SigNoz/signoz/pkg/analytics"
|
||||
@@ -369,12 +370,14 @@ func New(
|
||||
return nil, err
|
||||
}
|
||||
|
||||
maintenanceStore := sqlalertmanagerstore.NewMaintenanceStore(sqlstore)
|
||||
|
||||
// Initialize alertmanager from the available alertmanager provider factories
|
||||
alertmanager, err := factory.NewProviderFromNamedMap(
|
||||
ctx,
|
||||
providerSettings,
|
||||
config.Alertmanager,
|
||||
NewAlertmanagerProviderFactories(sqlstore, orgGetter, nfManager),
|
||||
NewAlertmanagerProviderFactories(sqlstore, orgGetter, nfManager, maintenanceStore),
|
||||
config.Alertmanager.Provider,
|
||||
)
|
||||
if err != nil {
|
||||
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/factory"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
"github.com/SigNoz/signoz/pkg/types"
|
||||
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
|
||||
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
"github.com/uptrace/bun"
|
||||
"github.com/uptrace/bun/migrate"
|
||||
@@ -61,7 +61,7 @@ type existingMaintenance struct {
|
||||
Name string `bun:"name,type:text,notnull"`
|
||||
Description string `bun:"description,type:text"`
|
||||
AlertIDs *AlertIds `bun:"alert_ids,type:text"`
|
||||
Schedule *ruletypes.Schedule `bun:"schedule,type:text,notnull"`
|
||||
Schedule *alertmanagertypes.Schedule `bun:"schedule,type:text,notnull"`
|
||||
CreatedAt time.Time `bun:"created_at,type:datetime,notnull"`
|
||||
CreatedBy string `bun:"created_by,type:text,notnull"`
|
||||
UpdatedAt time.Time `bun:"updated_at,type:datetime,notnull"`
|
||||
@@ -75,7 +75,7 @@ type newMaintenance struct {
|
||||
types.UserAuditable
|
||||
Name string `bun:"name,type:text,notnull"`
|
||||
Description string `bun:"description,type:text"`
|
||||
Schedule *ruletypes.Schedule `bun:"schedule,type:text,notnull"`
|
||||
Schedule *alertmanagertypes.Schedule `bun:"schedule,type:text,notnull"`
|
||||
OrgID string `bun:"org_id,type:text"`
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
package sqlmigration
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/factory"
|
||||
"github.com/SigNoz/signoz/pkg/sqlschema"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
"github.com/uptrace/bun"
|
||||
"github.com/uptrace/bun/migrate"
|
||||
)
|
||||
|
||||
type addLabelExpressionToPlannedMaintenance struct {
|
||||
sqlstore sqlstore.SQLStore
|
||||
sqlschema sqlschema.SQLSchema
|
||||
}
|
||||
|
||||
func NewAddLabelExpressionToPlannedMaintenanceFactory(sqlstore sqlstore.SQLStore, sqlschema sqlschema.SQLSchema) factory.ProviderFactory[SQLMigration, Config] {
|
||||
return factory.NewProviderFactory(
|
||||
factory.MustNewName("add_label_expr_to_planned"),
|
||||
func(ctx context.Context, ps factory.ProviderSettings, c Config) (SQLMigration, error) {
|
||||
return &addLabelExpressionToPlannedMaintenance{
|
||||
sqlstore: sqlstore,
|
||||
sqlschema: sqlschema,
|
||||
}, nil
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
func (migration *addLabelExpressionToPlannedMaintenance) Register(migrations *migrate.Migrations) error {
|
||||
if err := migrations.Register(migration.Up, migration.Down); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (migration *addLabelExpressionToPlannedMaintenance) Up(ctx context.Context, db *bun.DB) error {
|
||||
tx, err := db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
_ = tx.Rollback()
|
||||
}()
|
||||
|
||||
table, _, err := migration.sqlschema.GetTable(ctx, sqlschema.TableName("planned_maintenance"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
column := &sqlschema.Column{
|
||||
Name: sqlschema.ColumnName("label_expression"),
|
||||
DataType: sqlschema.DataTypeText,
|
||||
Nullable: true,
|
||||
}
|
||||
|
||||
sqls := migration.sqlschema.Operator().AddColumn(table, nil, column, nil)
|
||||
for _, sql := range sqls {
|
||||
if _, err := tx.ExecContext(ctx, string(sql)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
func (migration *addLabelExpressionToPlannedMaintenance) Down(ctx context.Context, db *bun.DB) error {
|
||||
tx, err := db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
_ = tx.Rollback()
|
||||
}()
|
||||
|
||||
table, _, err := migration.sqlschema.GetTable(ctx, sqlschema.TableName("planned_maintenance"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
column := &sqlschema.Column{
|
||||
Name: sqlschema.ColumnName("label_expression"),
|
||||
DataType: sqlschema.DataTypeText,
|
||||
Nullable: true,
|
||||
}
|
||||
|
||||
sqls := migration.sqlschema.Operator().DropColumn(table, column)
|
||||
for _, sql := range sqls {
|
||||
if _, err := tx.ExecContext(ctx, string(sql)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return tx.Commit()
|
||||
}
|
||||
@@ -170,6 +170,7 @@ func NewGettableAlertsFromAlertProvider(
|
||||
cfg *Config,
|
||||
getAlertStatusFunc func(model.Fingerprint) types.AlertStatus,
|
||||
setAlertStatusFunc func(model.LabelSet),
|
||||
mutedByFunc func(model.LabelSet) []string,
|
||||
params GettableAlertsParams,
|
||||
) (GettableAlerts, error) {
|
||||
res := GettableAlerts{}
|
||||
@@ -219,7 +220,7 @@ func NewGettableAlertsFromAlertProvider(
|
||||
continue
|
||||
}
|
||||
|
||||
alert := v2.AlertToOpenAPIAlert(alertData, getAlertStatusFunc(alertData.Fingerprint()), receivers, nil)
|
||||
alert := v2.AlertToOpenAPIAlert(alertData, getAlertStatusFunc(alertData.Fingerprint()), receivers, mutedByFunc(alertData.Labels))
|
||||
|
||||
res = append(res, alert)
|
||||
}
|
||||
|
||||
@@ -1,17 +1,58 @@
|
||||
package ruletypes
|
||||
package alertmanagertypes
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/expr-lang/expr"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/uptrace/bun"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/errors"
|
||||
"github.com/SigNoz/signoz/pkg/types"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
"github.com/uptrace/bun"
|
||||
)
|
||||
|
||||
var ErrCodeInvalidPlannedMaintenancePayload = errors.MustNewCode("invalid_planned_maintenance_payload")
|
||||
var ErrCodeInvalidMutePayload = errors.MustNewCode("invalid_mute_payload")
|
||||
|
||||
// PostableMuteRule is the input payload for muting an alert rule. A mute is a
|
||||
// shortcut for creating a fixed (non-recurring) planned maintenance scoped to a
|
||||
// single rule, covering [now, EndTime].
|
||||
type PostableMuteRule struct {
|
||||
EndTime time.Time `json:"endTime" required:"true"`
|
||||
Description string `json:"description"`
|
||||
}
|
||||
|
||||
func (p *PostableMuteRule) Validate(now time.Time) error {
|
||||
if p.EndTime.IsZero() {
|
||||
return errors.Newf(errors.TypeInvalidInput, ErrCodeInvalidMutePayload, "missing endTime in the payload")
|
||||
}
|
||||
if !p.EndTime.After(now) {
|
||||
return errors.Newf(errors.TypeInvalidInput, ErrCodeInvalidMutePayload, "endTime must be in the future")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ToPostablePlannedMaintenance builds the underlying planned maintenance
|
||||
// payload for muting the given rule until p.EndTime.
|
||||
func (p *PostableMuteRule) ToPostablePlannedMaintenance(ruleID string, ruleName string, now time.Time) *PostablePlannedMaintenance {
|
||||
name := "Mute: " + ruleName
|
||||
if ruleName == "" {
|
||||
name = "Mute for rule " + ruleID
|
||||
}
|
||||
return &PostablePlannedMaintenance{
|
||||
Name: name,
|
||||
Description: p.Description,
|
||||
Schedule: &Schedule{
|
||||
Timezone: "UTC",
|
||||
StartTime: now.UTC(),
|
||||
EndTime: p.EndTime.UTC(),
|
||||
},
|
||||
AlertIds: []string{ruleID},
|
||||
}
|
||||
}
|
||||
|
||||
type MaintenanceStatus struct {
|
||||
valuer.String
|
||||
@@ -54,34 +95,37 @@ type StorablePlannedMaintenance struct {
|
||||
types.Identifiable
|
||||
types.TimeAuditable
|
||||
types.UserAuditable
|
||||
Name string `bun:"name,type:text,notnull"`
|
||||
Description string `bun:"description,type:text"`
|
||||
Schedule *Schedule `bun:"schedule,type:text,notnull"`
|
||||
OrgID string `bun:"org_id,type:text"`
|
||||
Name string `bun:"name,type:text,notnull"`
|
||||
Description string `bun:"description,type:text"`
|
||||
Schedule *Schedule `bun:"schedule,type:text,notnull"`
|
||||
OrgID string `bun:"org_id,type:text"`
|
||||
LabelExpression string `bun:"label_expression,type:text"`
|
||||
}
|
||||
|
||||
type PlannedMaintenance struct {
|
||||
ID valuer.UUID `json:"id" required:"true"`
|
||||
Name string `json:"name" required:"true"`
|
||||
Description string `json:"description"`
|
||||
Schedule *Schedule `json:"schedule" required:"true"`
|
||||
RuleIDs []string `json:"alertIds"`
|
||||
CreatedAt time.Time `json:"createdAt"`
|
||||
CreatedBy string `json:"createdBy"`
|
||||
UpdatedAt time.Time `json:"updatedAt"`
|
||||
UpdatedBy string `json:"updatedBy"`
|
||||
Status MaintenanceStatus `json:"status" required:"true"`
|
||||
Kind MaintenanceKind `json:"kind" required:"true"`
|
||||
ID valuer.UUID `json:"id" required:"true"`
|
||||
Name string `json:"name" required:"true"`
|
||||
Description string `json:"description"`
|
||||
Schedule *Schedule `json:"schedule" required:"true"`
|
||||
RuleIDs []string `json:"alertIds"`
|
||||
LabelExpression string `json:"labelExpression,omitempty"`
|
||||
CreatedAt time.Time `json:"createdAt"`
|
||||
CreatedBy string `json:"createdBy"`
|
||||
UpdatedAt time.Time `json:"updatedAt"`
|
||||
UpdatedBy string `json:"updatedBy"`
|
||||
Status MaintenanceStatus `json:"status" required:"true"`
|
||||
Kind MaintenanceKind `json:"kind" required:"true"`
|
||||
}
|
||||
|
||||
// PostablePlannedMaintenance is the input payload for creating or updating a
|
||||
// planned maintenance. Server-owned fields (id, timestamps, audit users,
|
||||
// derived status / kind) are deliberately not accepted from the client.
|
||||
type PostablePlannedMaintenance struct {
|
||||
Name string `json:"name" required:"true"`
|
||||
Description string `json:"description"`
|
||||
Schedule *Schedule `json:"schedule" required:"true"`
|
||||
AlertIds []string `json:"alertIds"`
|
||||
Name string `json:"name" required:"true"`
|
||||
Description string `json:"description"`
|
||||
Schedule *Schedule `json:"schedule" required:"true"`
|
||||
AlertIds []string `json:"alertIds"`
|
||||
LabelExpression string `json:"labelExpression"`
|
||||
}
|
||||
|
||||
func (p *PostablePlannedMaintenance) Validate() error {
|
||||
@@ -113,6 +157,11 @@ func (p *PostablePlannedMaintenance) Validate() error {
|
||||
return errors.Newf(errors.TypeInvalidInput, ErrCodeInvalidPlannedMaintenancePayload, "missing duration in the payload")
|
||||
}
|
||||
}
|
||||
if p.LabelExpression != "" {
|
||||
if _, err := expr.Compile(p.LabelExpression, expr.AllowUndefinedVariables(), expr.AsBool()); err != nil {
|
||||
return errors.Newf(errors.TypeInvalidInput, ErrCodeInvalidPlannedMaintenancePayload, "invalid label expression: %v", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -128,7 +177,7 @@ type PlannedMaintenanceWithRules struct {
|
||||
Rules []*StorablePlannedMaintenanceRule `bun:"rel:has-many,join:id=planned_maintenance_id"`
|
||||
}
|
||||
|
||||
func (m *PlannedMaintenance) ShouldSkip(ruleID string, now time.Time) bool {
|
||||
func (m *PlannedMaintenance) ShouldSkip(ruleID string, now time.Time, lset model.LabelSet) bool {
|
||||
// Check if the alert ID is in the maintenance window
|
||||
found := false
|
||||
if len(m.RuleIDs) > 0 {
|
||||
@@ -148,6 +197,23 @@ func (m *PlannedMaintenance) ShouldSkip(ruleID string, now time.Time) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
if !m.isScheduleActive(now) {
|
||||
return false
|
||||
}
|
||||
|
||||
// lset is empty when called from IsActive (no instance labels available);
|
||||
// skip expression filtering in that case.
|
||||
if m.LabelExpression != "" && len(lset) != 0 {
|
||||
if !evalLabelExpression(m.LabelExpression, lset) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// isScheduleActive reports whether now falls inside the maintenance window's schedule.
|
||||
func (m *PlannedMaintenance) isScheduleActive(now time.Time) bool {
|
||||
// If alert is found, we check if it should be skipped based on the schedule
|
||||
loc, err := time.LoadLocation(m.Schedule.Timezone)
|
||||
if err != nil {
|
||||
@@ -187,6 +253,25 @@ func (m *PlannedMaintenance) ShouldSkip(ruleID string, now time.Time) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// evalLabelExpression compiles and runs the expression against the provided labels.
|
||||
// Returns false on any error (safety-first: don't suppress on a bad expression).
|
||||
func evalLabelExpression(expression string, lset model.LabelSet) bool {
|
||||
env := make(map[string]interface{}, len(lset))
|
||||
for k, v := range lset {
|
||||
env[string(k)] = string(v)
|
||||
}
|
||||
program, err := expr.Compile(expression, expr.Env(env), expr.AllowUndefinedVariables())
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
output, err := expr.Run(program, env)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
result, ok := output.(bool)
|
||||
return ok && result
|
||||
}
|
||||
|
||||
// checkDaily rebases the recurrence start to today (or yesterday if needed)
|
||||
// and returns true if currentTime is within [candidate, candidate+Duration].
|
||||
func (m *PlannedMaintenance) checkDaily(currentTime time.Time, rec *Recurrence, loc *time.Location) bool {
|
||||
@@ -274,7 +359,7 @@ func (m *PlannedMaintenance) IsActive(now time.Time) bool {
|
||||
if len(m.RuleIDs) > 0 {
|
||||
ruleID = (m.RuleIDs)[0]
|
||||
}
|
||||
return m.ShouldSkip(ruleID, now)
|
||||
return m.ShouldSkip(ruleID, now, nil)
|
||||
}
|
||||
|
||||
func (m *PlannedMaintenance) IsUpcoming() bool {
|
||||
@@ -342,29 +427,31 @@ func (m PlannedMaintenance) MarshalJSON() ([]byte, error) {
|
||||
}
|
||||
|
||||
return json.Marshal(struct {
|
||||
ID valuer.UUID `json:"id" db:"id"`
|
||||
Name string `json:"name" db:"name"`
|
||||
Description string `json:"description" db:"description"`
|
||||
Schedule *Schedule `json:"schedule" db:"schedule"`
|
||||
AlertIds []string `json:"alertIds" db:"alert_ids"`
|
||||
CreatedAt time.Time `json:"createdAt" db:"created_at"`
|
||||
CreatedBy string `json:"createdBy" db:"created_by"`
|
||||
UpdatedAt time.Time `json:"updatedAt" db:"updated_at"`
|
||||
UpdatedBy string `json:"updatedBy" db:"updated_by"`
|
||||
Status MaintenanceStatus `json:"status"`
|
||||
Kind MaintenanceKind `json:"kind"`
|
||||
ID valuer.UUID `json:"id" db:"id"`
|
||||
Name string `json:"name" db:"name"`
|
||||
Description string `json:"description" db:"description"`
|
||||
Schedule *Schedule `json:"schedule" db:"schedule"`
|
||||
AlertIds []string `json:"alertIds" db:"alert_ids"`
|
||||
LabelExpression string `json:"labelExpression,omitempty" db:"label_expression"`
|
||||
CreatedAt time.Time `json:"createdAt" db:"created_at"`
|
||||
CreatedBy string `json:"createdBy" db:"created_by"`
|
||||
UpdatedAt time.Time `json:"updatedAt" db:"updated_at"`
|
||||
UpdatedBy string `json:"updatedBy" db:"updated_by"`
|
||||
Status MaintenanceStatus `json:"status"`
|
||||
Kind MaintenanceKind `json:"kind"`
|
||||
}{
|
||||
ID: m.ID,
|
||||
Name: m.Name,
|
||||
Description: m.Description,
|
||||
Schedule: m.Schedule,
|
||||
AlertIds: m.RuleIDs,
|
||||
CreatedAt: m.CreatedAt,
|
||||
CreatedBy: m.CreatedBy,
|
||||
UpdatedAt: m.UpdatedAt,
|
||||
UpdatedBy: m.UpdatedBy,
|
||||
Status: status,
|
||||
Kind: kind,
|
||||
ID: m.ID,
|
||||
Name: m.Name,
|
||||
Description: m.Description,
|
||||
Schedule: m.Schedule,
|
||||
AlertIds: m.RuleIDs,
|
||||
LabelExpression: m.LabelExpression,
|
||||
CreatedAt: m.CreatedAt,
|
||||
CreatedBy: m.CreatedBy,
|
||||
UpdatedAt: m.UpdatedAt,
|
||||
UpdatedBy: m.UpdatedBy,
|
||||
Status: status,
|
||||
Kind: kind,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -377,15 +464,16 @@ func (m *PlannedMaintenanceWithRules) ToPlannedMaintenance() *PlannedMaintenance
|
||||
}
|
||||
|
||||
return &PlannedMaintenance{
|
||||
ID: m.ID,
|
||||
Name: m.Name,
|
||||
Description: m.Description,
|
||||
Schedule: m.Schedule,
|
||||
RuleIDs: ruleIDs,
|
||||
CreatedAt: m.CreatedAt,
|
||||
UpdatedAt: m.UpdatedAt,
|
||||
CreatedBy: m.CreatedBy,
|
||||
UpdatedBy: m.UpdatedBy,
|
||||
ID: m.ID,
|
||||
Name: m.Name,
|
||||
Description: m.Description,
|
||||
Schedule: m.Schedule,
|
||||
RuleIDs: ruleIDs,
|
||||
LabelExpression: m.LabelExpression,
|
||||
CreatedAt: m.CreatedAt,
|
||||
UpdatedAt: m.UpdatedAt,
|
||||
CreatedBy: m.CreatedBy,
|
||||
UpdatedBy: m.UpdatedBy,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
package ruletypes
|
||||
package alertmanagertypes
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
"github.com/prometheus/common/model"
|
||||
)
|
||||
|
||||
func TestShouldSkipMaintenance(t *testing.T) {
|
||||
@@ -660,7 +661,7 @@ func TestShouldSkipMaintenance(t *testing.T) {
|
||||
}
|
||||
|
||||
for idx, c := range cases {
|
||||
result := c.maintenance.ShouldSkip(c.name, c.ts)
|
||||
result := c.maintenance.ShouldSkip(c.name, c.ts, model.LabelSet{})
|
||||
if result != c.skip {
|
||||
t.Errorf("skip %v, got %v, case:%d - %s", c.skip, result, idx, c.name)
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
package alertmanagertypes
|
||||
|
||||
import (
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
type MemMarker = types.MemMarker
|
||||
|
||||
func NewMarker(r prometheus.Registerer) *MemMarker {
|
||||
return types.NewMarker(r)
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package ruletypes
|
||||
package alertmanagertypes
|
||||
|
||||
import (
|
||||
"database/sql/driver"
|
||||
@@ -1,4 +1,4 @@
|
||||
package ruletypes
|
||||
package alertmanagertypes
|
||||
|
||||
import (
|
||||
"database/sql/driver"
|
||||
Reference in New Issue
Block a user