Files
signoz/pkg/modules/inframonitoring/implinframonitoring/module.go
Nikhil Mantri 89b755a6b0 feat(infra-monitoring): v2 hosts list api (#10805)
* chore: baseline setup

* chore: endpoint detail update

* chore: added logic for hosts v3 api

* fix: bug fix

* chore: disk usage

* chore: added validate function

* chore: added some unit tests

* chore: return status as a string

* chore: yarn generate api

* chore: removed isSendingK8sAgentsMetricsCode

* chore: moved funcs

* chore: added validation on order by

* chore: updated spec

* chore: nil pointer dereference fix in req.Filter

* chore: added temporalities of metrics

* chore: unified composite key function

* chore: code improvements

* chore: hostStatusNone added for clarity that this field can be left empty as well in payload

* chore: yarn generate api

* chore: return errors from getMetadata and lint fix

* chore: return errors from getMetadata and lint fix

* chore: added hostName logic

* chore: modified getMetadata query

* chore: add type for response and files rearrange

* chore: warnings added passing from queryResponse warning to host lists response struct

* chore: added better metrics existence check

* chore: added a TODO remark

* chore: added required metrics check

* chore: distributed samples table to local table change for get metadata

* chore: frontend fix

* chore: endpoint correction

* chore: endpoint modification openapi

* chore: escape backtick to prevent sql injection

* chore: rearrage

* chore: improvements

* chore: validate order by to validate function

* chore: improved description

* chore: added TODOs and made filterByStatus a part of filter struct

* chore: ignore empty string hosts in get active hosts

* feat(infra-monitoring): v2 hosts list - return counts of active & inactive hosts for custom group by attributes (#10956)

* chore: add functionality for showing active and inactive counts in custom group by

* chore: bug fix

* chore: added subquery for active and total count

* chore: ignore empty string hosts in get active hosts

* fix: sinceUnixMilli for determining active hosts compute once per request

* chore: refactor code

* chore: rename HostsList -> ListHosts

* chore: rearrangement

* chore: inframonitoring types renaming

* chore: added types package

* chore: file structure further breakdown for clarity

* chore: comments correction

* chore: removed temporalities

* chore: comments resolve

* chore: added json tag required: true

* chore: added status unauthorized

* chore: remove a defensive nil map check, the function ensure non-nil map when err nil

* chore: make sort stable in case of tiebreaker by comparing composite group by keys

* chore: regen api client for inframonitoring

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 09:30:33 +00:00

162 lines
5.6 KiB
Go

package implinframonitoring
import (
"context"
"log/slog"
"time"
"github.com/SigNoz/signoz/pkg/factory"
"github.com/SigNoz/signoz/pkg/modules/inframonitoring"
"github.com/SigNoz/signoz/pkg/querier"
"github.com/SigNoz/signoz/pkg/telemetrymetrics"
"github.com/SigNoz/signoz/pkg/telemetrystore"
"github.com/SigNoz/signoz/pkg/types/inframonitoringtypes"
qbtypes "github.com/SigNoz/signoz/pkg/types/querybuildertypes/querybuildertypesv5"
"github.com/SigNoz/signoz/pkg/types/telemetrytypes"
"github.com/SigNoz/signoz/pkg/valuer"
)
type module struct {
telemetryStore telemetrystore.TelemetryStore
telemetryMetadataStore telemetrytypes.MetadataStore
querier querier.Querier
fieldMapper qbtypes.FieldMapper
condBuilder qbtypes.ConditionBuilder
logger *slog.Logger
config inframonitoring.Config
}
// NewModule constructs the inframonitoring module with the provided dependencies.
func NewModule(
telemetryStore telemetrystore.TelemetryStore,
telemetryMetadataStore telemetrytypes.MetadataStore,
querier querier.Querier,
providerSettings factory.ProviderSettings,
cfg inframonitoring.Config,
) inframonitoring.Module {
fieldMapper := telemetrymetrics.NewFieldMapper()
condBuilder := telemetrymetrics.NewConditionBuilder(fieldMapper)
return &module{
telemetryStore: telemetryStore,
telemetryMetadataStore: telemetryMetadataStore,
querier: querier,
fieldMapper: fieldMapper,
condBuilder: condBuilder,
logger: providerSettings.Logger,
config: cfg,
}
}
func (m *module) ListHosts(ctx context.Context, orgID valuer.UUID, req *inframonitoringtypes.PostableHosts) (*inframonitoringtypes.Hosts, error) {
if err := req.Validate(); err != nil {
return nil, err
}
resp := &inframonitoringtypes.Hosts{}
// default to cpu order by
if req.OrderBy == nil {
req.OrderBy = &qbtypes.OrderBy{
Key: qbtypes.OrderByKey{
TelemetryFieldKey: telemetrytypes.TelemetryFieldKey{
Name: "cpu",
},
},
Direction: qbtypes.OrderDirectionDesc,
}
}
// default to host name group by
if len(req.GroupBy) == 0 {
req.GroupBy = []qbtypes.GroupByKey{hostNameGroupByKey}
resp.Type = inframonitoringtypes.ResponseTypeList
} else {
resp.Type = inframonitoringtypes.ResponseTypeGroupedList
}
// 1. Check which required metrics exist and get earliest retention time.
// If any required metric is missing, return early with the list of missing metrics.
// 2. If metrics exist but req.End is before the earliest reported time, return early with endTimeBeforeRetention=true.
missingMetrics, minFirstReportedUnixMilli, err := m.getMetricsExistenceAndEarliestTime(ctx, hostsTableMetricNamesList)
if err != nil {
return nil, err
}
if len(missingMetrics) > 0 {
resp.RequiredMetricsCheck = inframonitoringtypes.RequiredMetricsCheck{MissingMetrics: missingMetrics}
resp.Records = []inframonitoringtypes.HostRecord{}
resp.Total = 0
return resp, nil
}
if req.End < int64(minFirstReportedUnixMilli) {
resp.EndTimeBeforeRetention = true
resp.Records = []inframonitoringtypes.HostRecord{}
resp.Total = 0
return resp, nil
}
resp.RequiredMetricsCheck = inframonitoringtypes.RequiredMetricsCheck{MissingMetrics: []string{}}
// TOD(nikhilmantri0902): replace this separate ClickHouse query with a sub-query inside the main query builder query
// once QB supports sub-queries.
// Determine active hosts: those with metrics reported in the last 10 minutes.
// Compute the cutoff once so every downstream query/subquery agrees on what "active" means.
sinceUnixMilli := time.Now().Add(-10 * time.Minute).UTC().UnixMilli()
activeHostsMap, err := m.getActiveHosts(ctx, hostsTableMetricNamesList, hostNameAttrKey, sinceUnixMilli)
if err != nil {
return nil, err
}
// this check below modifies req.Filter by adding `AND active hosts filter` if req.FilterByStatus is set.
if m.applyHostsActiveStatusFilter(req, activeHostsMap) {
resp.Records = []inframonitoringtypes.HostRecord{}
resp.Total = 0
return resp, nil
}
metadataMap, err := m.getHostsTableMetadata(ctx, req)
if err != nil {
return nil, err
}
resp.Total = len(metadataMap)
pageGroups, err := m.getTopHostGroups(ctx, orgID, req, metadataMap)
if err != nil {
return nil, err
}
if len(pageGroups) == 0 {
resp.Records = []inframonitoringtypes.HostRecord{}
return resp, nil
}
hostsFilterExpr := ""
if req.Filter != nil {
hostsFilterExpr = req.Filter.Expression
}
fullQueryReq := buildFullQueryRequest(req.Start, req.End, hostsFilterExpr, req.GroupBy, pageGroups, m.newListHostsQuery())
queryResp, err := m.querier.QueryRange(ctx, orgID, fullQueryReq)
if err != nil {
return nil, err
}
// Compute per-group active/inactive host counts.
// When host.name is in groupBy, each row = one host, so counts are derived
// directly from activeHostsMap in buildHostRecords (no extra query needed).
// When host.name is not in groupBy, we need to run an additional query to get the counts per group for the current page,
// using the same filter expression as the main query (including user filters + page groups IN clause).
hostCounts := make(map[string]groupHostStatusCounts)
isHostNameInGroupBy := isKeyInGroupByAttrs(req.GroupBy, hostNameAttrKey)
if !isHostNameInGroupBy {
hostCounts, err = m.getPerGroupHostStatusCounts(ctx, req, hostsTableMetricNamesList, pageGroups, sinceUnixMilli)
if err != nil {
return nil, err
}
}
resp.Records = buildHostRecords(isHostNameInGroupBy, queryResp, pageGroups, req.GroupBy, metadataMap, activeHostsMap, hostCounts)
resp.Warning = queryResp.Warning
return resp, nil
}