mirror of
https://github.com/SigNoz/signoz.git
synced 2026-06-17 14:00:34 +01:00
Compare commits
2 Commits
issue-5388
...
issue-5378
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7c85c63555 | ||
|
|
f74ca236fa |
@@ -10,6 +10,7 @@ import (
|
||||
"go.opentelemetry.io/contrib/instrumentation/github.com/gorilla/mux/otelmux"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/cache/memorycache"
|
||||
"github.com/SigNoz/signoz/pkg/errors"
|
||||
|
||||
"github.com/gorilla/handlers"
|
||||
@@ -19,6 +20,7 @@ import (
|
||||
|
||||
"github.com/SigNoz/signoz/ee/query-service/app/api"
|
||||
"github.com/SigNoz/signoz/ee/query-service/usage"
|
||||
"github.com/SigNoz/signoz/pkg/cache"
|
||||
"github.com/SigNoz/signoz/pkg/http/middleware"
|
||||
"github.com/SigNoz/signoz/pkg/signoz"
|
||||
"github.com/SigNoz/signoz/pkg/web"
|
||||
@@ -57,12 +59,25 @@ type Server struct {
|
||||
|
||||
// NewServer creates and initializes Server
|
||||
func NewServer(config signoz.Config, signoz *signoz.SigNoz) (*Server, error) {
|
||||
cacheForTraceDetail, err := memorycache.New(context.TODO(), signoz.Instrumentation.ToProviderSettings(), cache.Config{
|
||||
Provider: "memory",
|
||||
Memory: cache.Memory{
|
||||
NumCounters: 10 * 10000,
|
||||
MaxCost: 1 << 27, // 128 MB
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
reader := clickhouseReader.NewReader(
|
||||
signoz.Instrumentation.Logger(),
|
||||
signoz.SQLStore,
|
||||
signoz.TelemetryStore,
|
||||
signoz.Prometheus,
|
||||
signoz.TelemetryStore.Cluster(),
|
||||
config.Querier.FluxInterval,
|
||||
cacheForTraceDetail,
|
||||
signoz.Cache,
|
||||
nil,
|
||||
)
|
||||
|
||||
@@ -119,11 +119,7 @@ func (q *querier) QueryRange(ctx context.Context, orgID valuer.UUID, req *qbtype
|
||||
queries := make(map[string]qbtypes.Query)
|
||||
steps := make(map[string]qbtypes.Step)
|
||||
|
||||
// Resolve metric metadata once per request: patches each metric-aggregation
|
||||
// query's spec in place, returns the queries whose every aggregation was
|
||||
// missing (used for preseeded empty results), and any dormant-metric
|
||||
// warning string. NotFound errors for never-seen metrics are propagated.
|
||||
missingMetricQueries, dormantMetricsWarningMsg, err := q.resolveMetricMetadata(ctx, req.CompositeQuery.Queries, req.Start, req.End)
|
||||
missingMetricQueries, metricWarnings, err := q.resolveMetricMetadata(ctx, req.CompositeQuery.Queries, req.Start, req.End)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -240,13 +236,15 @@ func (q *querier) QueryRange(ctx context.Context, orgID valuer.UUID, req *qbtype
|
||||
}
|
||||
}
|
||||
}
|
||||
if dormantMetricsWarningMsg != "" {
|
||||
if len(metricWarnings) > 0 {
|
||||
if qbResp.Warning == nil {
|
||||
qbResp.Warning = &qbtypes.QueryWarnData{}
|
||||
}
|
||||
qbResp.Warning.Warnings = append(qbResp.Warning.Warnings, qbtypes.QueryWarnDataAdditional{
|
||||
Message: dormantMetricsWarningMsg,
|
||||
})
|
||||
for _, w := range metricWarnings {
|
||||
qbResp.Warning.Warnings = append(qbResp.Warning.Warnings, qbtypes.QueryWarnDataAdditional{
|
||||
Message: w,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
return qbResp, qbErr
|
||||
@@ -302,12 +300,11 @@ func (q *querier) populateQBEvent(event *qbtypes.QBEvent, queries []qbtypes.Quer
|
||||
// - missingMetricQueries: names of queries whose every aggregation was
|
||||
// missing. Used downstream to preseed empty result placeholders so the
|
||||
// response still has an entry per requested query name.
|
||||
// - dormantWarning: a human-readable warning describing metrics that exist in
|
||||
// the store but produced no data within the query window. Empty when no
|
||||
// such metrics are present.
|
||||
// - err: NotFound when one or more referenced metrics have never been seen,
|
||||
// or Internal when a metadata fetch fails.
|
||||
func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.QueryEnvelope, start, end uint64) (missingMetricQueries []string, dormantWarning string, err error) {
|
||||
// - metricWarnings: human-readable warnings for metrics that could not be
|
||||
// resolved: never-seen metrics and dormant metrics (seen but no data in
|
||||
// the query window).
|
||||
// - err: Internal when a metadata fetch fails.
|
||||
func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.QueryEnvelope, start, end uint64) (missingMetricQueries []string, metricWarnings []string, err error) {
|
||||
metricNames := make([]string, 0)
|
||||
for idx := range queries {
|
||||
if queries[idx].Type != qbtypes.QueryTypeBuilder {
|
||||
@@ -325,13 +322,13 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
|
||||
}
|
||||
|
||||
if len(metricNames) == 0 {
|
||||
return nil, "", nil
|
||||
return nil, nil, nil
|
||||
}
|
||||
|
||||
metricTemporality, metricTypes, err := q.metadataStore.FetchTemporalityAndTypeMulti(ctx, start, end, metricNames...)
|
||||
if err != nil {
|
||||
q.logger.WarnContext(ctx, "failed to fetch metric temporality", errors.Attr(err), slog.Any("metrics", metricNames))
|
||||
return nil, "", errors.NewInternalf(errors.CodeInternal, "failed to fetch metrics temporality")
|
||||
return nil, nil, errors.NewInternalf(errors.CodeInternal, "failed to fetch metrics temporality")
|
||||
}
|
||||
q.logger.DebugContext(ctx, "fetched metric temporalities and types", slog.Any("metric_temporality", metricTemporality), slog.Any("metric_types", metricTypes))
|
||||
|
||||
@@ -363,7 +360,7 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
|
||||
}
|
||||
// Type is resolved now; validate aggregation compatibility against it.
|
||||
if err := spec.Aggregations[i].ValidateForType(); err != nil {
|
||||
return nil, "", err
|
||||
return nil, nil, err
|
||||
}
|
||||
presentAggregations = append(presentAggregations, spec.Aggregations[i])
|
||||
}
|
||||
@@ -376,7 +373,7 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
|
||||
}
|
||||
|
||||
if len(missingMetrics) == 0 {
|
||||
return missingMetricQueries, "", nil
|
||||
return missingMetricQueries, nil, nil
|
||||
}
|
||||
|
||||
isInternalMetric := func(n string) bool { return strings.HasPrefix(n, "signoz.") || strings.HasPrefix(n, "signoz_") }
|
||||
@@ -387,29 +384,33 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
|
||||
}
|
||||
}
|
||||
if len(externalMissingMetrics) == 0 {
|
||||
// this means all missing metrics are internal, and since internal metrics
|
||||
// aren't user-controlled, skip errors/warnings for them since users can't act on them
|
||||
return missingMetricQueries, "", nil
|
||||
return missingMetricQueries, nil, nil
|
||||
}
|
||||
|
||||
// Classify each missing metric: never-seen → NotFound error; seen-but-no-
|
||||
// data-in-window → dormant warning.
|
||||
// Classify each missing metric: never-seen -> warning with empty result;
|
||||
// seen-but-no-data-in-window -> dormant warning.
|
||||
lastSeenInfo, _ := q.metadataStore.FetchLastSeenInfoMulti(ctx, externalMissingMetrics...)
|
||||
nonExistentMetrics := []string{}
|
||||
var nonExistentMetrics []string
|
||||
var dormantMetrics []string
|
||||
for _, name := range externalMissingMetrics {
|
||||
if ts, ok := lastSeenInfo[name]; ok && ts > 0 {
|
||||
dormantMetrics = append(dormantMetrics, name)
|
||||
continue
|
||||
}
|
||||
nonExistentMetrics = append(nonExistentMetrics, name)
|
||||
}
|
||||
|
||||
var warnings []string
|
||||
|
||||
// Never-seen metrics: the query already gets a preseeded empty result
|
||||
// via the aggregation-dropping path above; we just attach a warning.
|
||||
if len(nonExistentMetrics) == 1 {
|
||||
return nil, "", errors.NewNotFoundf(errors.CodeNotFound, "could not find the metric %s", nonExistentMetrics[0])
|
||||
}
|
||||
if len(nonExistentMetrics) > 1 {
|
||||
return nil, "", errors.NewNotFoundf(errors.CodeNotFound, "the following metrics were not found: %s", strings.Join(nonExistentMetrics, ", "))
|
||||
warnings = append(warnings, fmt.Sprintf("metric %s has never been received. Check the metric name and instrumentation", nonExistentMetrics[0]))
|
||||
} else if len(nonExistentMetrics) > 1 {
|
||||
warnings = append(warnings, fmt.Sprintf("the following metrics have never been received. Check the metric names and instrumentation: %s", strings.Join(nonExistentMetrics, ", ")))
|
||||
}
|
||||
|
||||
// All missing metrics are dormant — assemble the warning string.
|
||||
// Dormant metrics: seen before but no data in the query window.
|
||||
lastSeenStr := func(name string) string {
|
||||
if ts, ok := lastSeenInfo[name]; ok && ts > 0 {
|
||||
ago := humanize.RelTime(time.UnixMilli(ts), time.Now(), "ago", "from now")
|
||||
@@ -417,16 +418,16 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
|
||||
}
|
||||
return name
|
||||
}
|
||||
if len(externalMissingMetrics) == 1 {
|
||||
dormantWarning = fmt.Sprintf("no data found for the metric %s in the query time range", lastSeenStr(missingMetrics[0]))
|
||||
} else {
|
||||
parts := make([]string, len(externalMissingMetrics))
|
||||
for i, m := range externalMissingMetrics {
|
||||
if len(dormantMetrics) == 1 {
|
||||
warnings = append(warnings, fmt.Sprintf("no data found for the metric %s in the query time range", lastSeenStr(dormantMetrics[0])))
|
||||
} else if len(dormantMetrics) > 1 {
|
||||
parts := make([]string, len(dormantMetrics))
|
||||
for i, m := range dormantMetrics {
|
||||
parts[i] = lastSeenStr(m)
|
||||
}
|
||||
dormantWarning = fmt.Sprintf("no data found for the following metrics in the query time range: %s", strings.Join(parts, ", "))
|
||||
warnings = append(warnings, fmt.Sprintf("no data found for the following metrics in the query time range: %s", strings.Join(parts, ", ")))
|
||||
}
|
||||
return missingMetricQueries, dormantWarning, nil
|
||||
return missingMetricQueries, warnings, nil
|
||||
}
|
||||
|
||||
func (q *querier) QueryRawStream(ctx context.Context, orgID valuer.UUID, req *qbtypes.QueryRangeRequest, client *qbtypes.RawStream) {
|
||||
|
||||
@@ -37,7 +37,7 @@ func (m *mockMetricStmtBuilder) Build(_ context.Context, _, _ uint64, _ qbtypes.
|
||||
|
||||
func TestQueryRange_MetricTypeMissing(t *testing.T) {
|
||||
// When a metric has UnspecifiedType and is not found in the metadata store,
|
||||
// the querier should return a not-found error, even if the request provides a temporality
|
||||
// the querier should return an empty result with a warning instead of an error.
|
||||
providerSettings := instrumentationtest.New().ToProviderSettings()
|
||||
metadataStore := telemetrytypestest.NewMockMetadataStore()
|
||||
|
||||
@@ -80,9 +80,19 @@ func TestQueryRange_MetricTypeMissing(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
_, err := q.QueryRange(context.Background(), valuer.GenerateUUID(), req)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "could not find the metric unknown_metric")
|
||||
resp, err := q.QueryRange(context.Background(), valuer.GenerateUUID(), req)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, resp)
|
||||
require.NotNil(t, resp.Warning)
|
||||
|
||||
found := false
|
||||
for _, w := range resp.Warning.Warnings {
|
||||
if assert.ObjectsAreEqual("metric unknown_metric has never been received. Check the metric name and instrumentation", w.Message) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
assert.True(t, found, "expected never-seen metric warning, got: %v", resp.Warning.Warnings)
|
||||
}
|
||||
|
||||
func TestQueryRange_MetricTypeFromStore(t *testing.T) {
|
||||
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
"github.com/uptrace/bun"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/prometheus"
|
||||
"github.com/SigNoz/signoz/pkg/query-service/utils/timestamp"
|
||||
"github.com/SigNoz/signoz/pkg/sqlstore"
|
||||
"github.com/SigNoz/signoz/pkg/telemetrystore"
|
||||
"github.com/SigNoz/signoz/pkg/types"
|
||||
@@ -46,6 +47,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/query-service/app/resource"
|
||||
"github.com/SigNoz/signoz/pkg/query-service/app/services"
|
||||
"github.com/SigNoz/signoz/pkg/query-service/app/traces/smart"
|
||||
"github.com/SigNoz/signoz/pkg/query-service/app/traces/tracedetail"
|
||||
"github.com/SigNoz/signoz/pkg/query-service/common"
|
||||
"github.com/SigNoz/signoz/pkg/query-service/constants"
|
||||
|
||||
@@ -159,9 +161,11 @@ type ClickHouseReader struct {
|
||||
traceResourceTableV3 string
|
||||
traceSummaryTable string
|
||||
|
||||
cache cache.Cache
|
||||
metadataDB string
|
||||
metadataTable string
|
||||
fluxIntervalForTraceDetail time.Duration
|
||||
cache cache.Cache
|
||||
cacheForTraceDetail cache.Cache
|
||||
metadataDB string
|
||||
metadataTable string
|
||||
}
|
||||
|
||||
// NewTraceReader returns a TraceReader for the database
|
||||
@@ -171,6 +175,8 @@ func NewReader(
|
||||
telemetryStore telemetrystore.TelemetryStore,
|
||||
prometheus prometheus.Prometheus,
|
||||
cluster string,
|
||||
fluxIntervalForTraceDetail time.Duration,
|
||||
cacheForTraceDetail cache.Cache,
|
||||
cache cache.Cache,
|
||||
options *Options,
|
||||
) *ClickHouseReader {
|
||||
@@ -184,43 +190,45 @@ func NewReader(
|
||||
traceLocalTableName := options.primary.TraceLocalTableNameV3
|
||||
|
||||
return &ClickHouseReader{
|
||||
db: telemetryStore.ClickhouseDB(),
|
||||
logger: logger,
|
||||
prometheus: prometheus,
|
||||
sqlDB: sqlDB,
|
||||
TraceDB: options.primary.TraceDB,
|
||||
operationsTable: options.primary.OperationsTable,
|
||||
indexTable: options.primary.IndexTable,
|
||||
errorTable: options.primary.ErrorTable,
|
||||
usageExplorerTable: options.primary.UsageExplorerTable,
|
||||
durationTable: options.primary.DurationTable,
|
||||
SpansTable: options.primary.SpansTable,
|
||||
spanAttributeTableV2: options.primary.SpanAttributeTableV2,
|
||||
spanAttributesKeysTable: options.primary.SpanAttributeKeysTable,
|
||||
dependencyGraphTable: options.primary.DependencyGraphTable,
|
||||
topLevelOperationsTable: options.primary.TopLevelOperationsTable,
|
||||
logsDB: options.primary.LogsDB,
|
||||
logsTable: options.primary.LogsTable,
|
||||
logsLocalTable: options.primary.LogsLocalTable,
|
||||
logsAttributeKeys: options.primary.LogsAttributeKeysTable,
|
||||
logsResourceKeys: options.primary.LogsResourceKeysTable,
|
||||
logsTagAttributeTableV2: options.primary.LogsTagAttributeTableV2,
|
||||
liveTailRefreshSeconds: options.primary.LiveTailRefreshSeconds,
|
||||
cluster: cluster,
|
||||
queryProgressTracker: queryprogress.NewQueryProgressTracker(logger),
|
||||
logsTableV2: options.primary.LogsTableV2,
|
||||
logsLocalTableV2: options.primary.LogsLocalTableV2,
|
||||
logsResourceTableV2: options.primary.LogsResourceTableV2,
|
||||
logsResourceLocalTableV2: options.primary.LogsResourceLocalTableV2,
|
||||
logsTableName: logsTableName,
|
||||
logsLocalTableName: logsLocalTableName,
|
||||
traceLocalTableName: traceLocalTableName,
|
||||
traceTableName: traceTableName,
|
||||
traceResourceTableV3: options.primary.TraceResourceTableV3,
|
||||
traceSummaryTable: options.primary.TraceSummaryTable,
|
||||
cache: cache,
|
||||
metadataDB: options.primary.MetadataDB,
|
||||
metadataTable: options.primary.MetadataTable,
|
||||
db: telemetryStore.ClickhouseDB(),
|
||||
logger: logger,
|
||||
prometheus: prometheus,
|
||||
sqlDB: sqlDB,
|
||||
TraceDB: options.primary.TraceDB,
|
||||
operationsTable: options.primary.OperationsTable,
|
||||
indexTable: options.primary.IndexTable,
|
||||
errorTable: options.primary.ErrorTable,
|
||||
usageExplorerTable: options.primary.UsageExplorerTable,
|
||||
durationTable: options.primary.DurationTable,
|
||||
SpansTable: options.primary.SpansTable,
|
||||
spanAttributeTableV2: options.primary.SpanAttributeTableV2,
|
||||
spanAttributesKeysTable: options.primary.SpanAttributeKeysTable,
|
||||
dependencyGraphTable: options.primary.DependencyGraphTable,
|
||||
topLevelOperationsTable: options.primary.TopLevelOperationsTable,
|
||||
logsDB: options.primary.LogsDB,
|
||||
logsTable: options.primary.LogsTable,
|
||||
logsLocalTable: options.primary.LogsLocalTable,
|
||||
logsAttributeKeys: options.primary.LogsAttributeKeysTable,
|
||||
logsResourceKeys: options.primary.LogsResourceKeysTable,
|
||||
logsTagAttributeTableV2: options.primary.LogsTagAttributeTableV2,
|
||||
liveTailRefreshSeconds: options.primary.LiveTailRefreshSeconds,
|
||||
cluster: cluster,
|
||||
queryProgressTracker: queryprogress.NewQueryProgressTracker(logger),
|
||||
logsTableV2: options.primary.LogsTableV2,
|
||||
logsLocalTableV2: options.primary.LogsLocalTableV2,
|
||||
logsResourceTableV2: options.primary.LogsResourceTableV2,
|
||||
logsResourceLocalTableV2: options.primary.LogsResourceLocalTableV2,
|
||||
logsTableName: logsTableName,
|
||||
logsLocalTableName: logsLocalTableName,
|
||||
traceLocalTableName: traceLocalTableName,
|
||||
traceTableName: traceTableName,
|
||||
traceResourceTableV3: options.primary.TraceResourceTableV3,
|
||||
traceSummaryTable: options.primary.TraceSummaryTable,
|
||||
fluxIntervalForTraceDetail: fluxIntervalForTraceDetail,
|
||||
cache: cache,
|
||||
cacheForTraceDetail: cacheForTraceDetail,
|
||||
metadataDB: options.primary.MetadataDB,
|
||||
metadataTable: options.primary.MetadataTable,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -858,6 +866,206 @@ func (r *ClickHouseReader) GetUsage(ctx context.Context, queryParams *model.GetU
|
||||
return &usageItems, nil
|
||||
}
|
||||
|
||||
func (r *ClickHouseReader) GetSpansForTrace(ctx context.Context, traceID string, traceDetailsQuery string) ([]model.SpanItemV2, *model.ApiError) {
|
||||
|
||||
ctx = ctxtypes.NewContextWithCommentVals(ctx, map[string]string{
|
||||
instrumentationtypes.TelemetrySignal: telemetrytypes.SignalTraces.StringValue(),
|
||||
instrumentationtypes.CodeNamespace: "clickhouse-reader",
|
||||
instrumentationtypes.CodeFunctionName: "GetSpansForTrace",
|
||||
})
|
||||
|
||||
var traceSummary model.TraceSummary
|
||||
summaryQuery := fmt.Sprintf("SELECT trace_id, min(start) AS start, max(end) AS end, sum(num_spans) AS num_spans FROM %s.%s WHERE trace_id=$1 GROUP BY trace_id", r.TraceDB, r.traceSummaryTable)
|
||||
err := r.db.QueryRow(ctx, summaryQuery, traceID).Scan(&traceSummary.TraceID, &traceSummary.Start, &traceSummary.End, &traceSummary.NumSpans)
|
||||
if err != nil {
|
||||
if err == sql.ErrNoRows {
|
||||
return []model.SpanItemV2{}, nil
|
||||
}
|
||||
r.logger.Error("Error in processing trace summary sql query", errorsV2.Attr(err))
|
||||
return nil, model.ExecutionError(fmt.Errorf("error in processing trace summary sql query: %w", err))
|
||||
}
|
||||
|
||||
var searchScanResponses []model.SpanItemV2
|
||||
queryStartTime := time.Now()
|
||||
err = r.db.Select(ctx, &searchScanResponses, traceDetailsQuery, traceID, strconv.FormatInt(traceSummary.Start.Unix()-1800, 10), strconv.FormatInt(traceSummary.End.Unix(), 10))
|
||||
r.logger.Info(traceDetailsQuery)
|
||||
if err != nil {
|
||||
r.logger.Error("Error in processing sql query", errorsV2.Attr(err))
|
||||
return nil, model.ExecutionError(fmt.Errorf("error in processing trace data sql query: %w", err))
|
||||
}
|
||||
r.logger.Info("trace details query took: ", "duration", time.Since(queryStartTime), "traceID", traceID)
|
||||
|
||||
return searchScanResponses, nil
|
||||
}
|
||||
|
||||
|
||||
func (r *ClickHouseReader) GetFlamegraphSpansForTraceCache(ctx context.Context, orgID valuer.UUID, traceID string) (*model.GetFlamegraphSpansForTraceCache, error) {
|
||||
cachedTraceData := new(model.GetFlamegraphSpansForTraceCache)
|
||||
err := r.cacheForTraceDetail.Get(ctx, orgID, strings.Join([]string{"getFlamegraphSpansForTrace", traceID}, "-"), cachedTraceData)
|
||||
if err != nil {
|
||||
r.logger.Debug("error in retrieving getFlamegraphSpansForTrace cache", errorsV2.Attr(err), "traceID", traceID)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if time.Since(time.UnixMilli(int64(cachedTraceData.EndTime))) < r.fluxIntervalForTraceDetail {
|
||||
r.logger.Info("the trace end time falls under the flux interval, skipping getFlamegraphSpansForTrace cache", "traceID", traceID)
|
||||
return nil, errors.Errorf("the trace end time falls under the flux interval, skipping getFlamegraphSpansForTrace cache, traceID: %s", traceID)
|
||||
}
|
||||
|
||||
r.logger.Info("cache is successfully hit, applying cache for getFlamegraphSpansForTrace", "traceID", traceID)
|
||||
return cachedTraceData, nil
|
||||
}
|
||||
|
||||
func (r *ClickHouseReader) GetFlamegraphSpansForTrace(ctx context.Context, orgID valuer.UUID, traceID string, req *model.GetFlamegraphSpansForTraceParams) (*model.GetFlamegraphSpansForTraceResponse, error) {
|
||||
trace := new(model.GetFlamegraphSpansForTraceResponse)
|
||||
var startTime, endTime, durationNano uint64
|
||||
var spanIdToSpanNodeMap = map[string]*model.FlamegraphSpan{}
|
||||
// map[traceID][level]span
|
||||
var selectedSpans = [][]*model.FlamegraphSpan{}
|
||||
var traceRoots []*model.FlamegraphSpan
|
||||
|
||||
// get the trace tree from cache!
|
||||
cachedTraceData, err := r.GetFlamegraphSpansForTraceCache(ctx, orgID, traceID)
|
||||
|
||||
if err == nil {
|
||||
startTime = cachedTraceData.StartTime
|
||||
endTime = cachedTraceData.EndTime
|
||||
durationNano = cachedTraceData.DurationNano
|
||||
selectedSpans = cachedTraceData.SelectedSpans
|
||||
traceRoots = cachedTraceData.TraceRoots
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
r.logger.Info("cache miss for getFlamegraphSpansForTrace", "traceID", traceID)
|
||||
|
||||
selectCols := "timestamp, duration_nano, span_id, trace_id, has_error, links as references, resource_string_service$$name, name, events"
|
||||
if len(req.SelectFields) > 0 {
|
||||
selectCols += ", attributes_string, attributes_number, attributes_bool, resources_string"
|
||||
}
|
||||
flamegraphQuery := fmt.Sprintf("SELECT %s FROM %s.%s WHERE trace_id=$1 and ts_bucket_start>=$2 and ts_bucket_start<=$3 ORDER BY timestamp ASC, name ASC", selectCols, r.TraceDB, r.traceTableName)
|
||||
|
||||
searchScanResponses, err := r.GetSpansForTrace(ctx, traceID, flamegraphQuery)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(searchScanResponses) == 0 {
|
||||
return trace, nil
|
||||
}
|
||||
|
||||
for _, item := range searchScanResponses {
|
||||
ref := []model.OtelSpanRef{}
|
||||
err := json.Unmarshal([]byte(item.References), &ref)
|
||||
if err != nil {
|
||||
r.logger.Error("Error unmarshalling references", errorsV2.Attr(err))
|
||||
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getFlamegraphSpansForTrace: error in unmarshalling references %s", err.Error())
|
||||
}
|
||||
|
||||
events := make([]model.Event, 0)
|
||||
for _, event := range item.Events {
|
||||
var eventMap model.Event
|
||||
err = json.Unmarshal([]byte(event), &eventMap)
|
||||
if err != nil {
|
||||
r.logger.Error("Error unmarshalling events", errorsV2.Attr(err))
|
||||
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getFlamegraphSpansForTrace: error in unmarshalling events %s", err.Error())
|
||||
}
|
||||
events = append(events, eventMap)
|
||||
}
|
||||
|
||||
jsonItem := model.FlamegraphSpan{
|
||||
SpanID: item.SpanID,
|
||||
TraceID: item.TraceID,
|
||||
ServiceName: item.ServiceName,
|
||||
Name: item.Name,
|
||||
DurationNano: item.DurationNano,
|
||||
HasError: item.HasError,
|
||||
References: ref,
|
||||
Events: events,
|
||||
Children: make([]*model.FlamegraphSpan, 0),
|
||||
}
|
||||
|
||||
if len(req.SelectFields) > 0 {
|
||||
jsonItem.SetRequestedFields(item, req.SelectFields)
|
||||
}
|
||||
|
||||
// metadata calculation
|
||||
startTimeUnixNano := uint64(item.TimeUnixNano.UnixNano())
|
||||
if startTime == 0 || startTimeUnixNano < startTime {
|
||||
startTime = startTimeUnixNano
|
||||
}
|
||||
if endTime == 0 || (startTimeUnixNano+jsonItem.DurationNano) > endTime {
|
||||
endTime = (startTimeUnixNano + jsonItem.DurationNano)
|
||||
}
|
||||
if durationNano == 0 || jsonItem.DurationNano > durationNano {
|
||||
durationNano = jsonItem.DurationNano
|
||||
}
|
||||
|
||||
jsonItem.TimeUnixNano = uint64(item.TimeUnixNano.UnixNano() / 1000000)
|
||||
spanIdToSpanNodeMap[jsonItem.SpanID] = &jsonItem
|
||||
}
|
||||
|
||||
// traverse through the map and append each node to the children array of the parent node
|
||||
// and add missing spans
|
||||
for _, spanNode := range spanIdToSpanNodeMap {
|
||||
hasParentSpanNode := false
|
||||
for _, reference := range spanNode.References {
|
||||
if reference.RefType == "CHILD_OF" && reference.SpanId != "" {
|
||||
hasParentSpanNode = true
|
||||
if parentNode, exists := spanIdToSpanNodeMap[reference.SpanId]; exists {
|
||||
parentNode.Children = append(parentNode.Children, spanNode)
|
||||
} else {
|
||||
// insert the missing spans
|
||||
missingSpan := model.FlamegraphSpan{
|
||||
SpanID: reference.SpanId,
|
||||
TraceID: spanNode.TraceID,
|
||||
ServiceName: "",
|
||||
Name: "Missing Span",
|
||||
TimeUnixNano: spanNode.TimeUnixNano,
|
||||
DurationNano: spanNode.DurationNano,
|
||||
HasError: false,
|
||||
Events: make([]model.Event, 0),
|
||||
Children: make([]*model.FlamegraphSpan, 0),
|
||||
}
|
||||
missingSpan.Children = append(missingSpan.Children, spanNode)
|
||||
spanIdToSpanNodeMap[missingSpan.SpanID] = &missingSpan
|
||||
traceRoots = append(traceRoots, &missingSpan)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !hasParentSpanNode && !tracedetail.ContainsFlamegraphSpan(traceRoots, spanNode) {
|
||||
traceRoots = append(traceRoots, spanNode)
|
||||
}
|
||||
}
|
||||
|
||||
selectedSpans = tracedetail.GetAllSpansForFlamegraph(traceRoots, spanIdToSpanNodeMap)
|
||||
|
||||
// TODO: set the trace data (model.GetFlamegraphSpansForTraceCache) in cache here
|
||||
// removed existing cache usage since it was not getting used due to this bug https://github.com/SigNoz/engineering-pod/issues/4648
|
||||
// and was causing out of memory issues https://github.com/SigNoz/engineering-pod/issues/4638
|
||||
}
|
||||
|
||||
processingPostCache := time.Now()
|
||||
selectedSpansForRequest := selectedSpans
|
||||
clientLimit := min(req.Limit, tracedetail.MaxLimitWithoutSampling)
|
||||
totalSpanCount := tracedetail.GetTotalSpanCount(selectedSpans)
|
||||
if totalSpanCount > uint64(clientLimit) {
|
||||
// using trace start and end time if boundary ts are set to zero (or not set)
|
||||
boundaryStart := max(timestamp.MilliToNano(req.BoundaryStartTS), startTime)
|
||||
boundaryEnd := timestamp.MilliToNano(req.BoundaryEndTS)
|
||||
if boundaryEnd == 0 {
|
||||
boundaryEnd = endTime
|
||||
}
|
||||
|
||||
selectedSpansForRequest = tracedetail.GetSelectedSpansForFlamegraphForRequest(req.SelectedSpanID, selectedSpans, boundaryStart, boundaryEnd)
|
||||
}
|
||||
r.logger.Debug("getFlamegraphSpansForTrace: processing post cache", "duration", time.Since(processingPostCache), "traceID", traceID, "totalSpans", totalSpanCount, "limit", clientLimit)
|
||||
|
||||
trace.Spans = selectedSpansForRequest
|
||||
trace.StartTimestampMillis = startTime / 1000000
|
||||
trace.EndTimestampMillis = endTime / 1000000
|
||||
trace.HasMore = totalSpanCount > uint64(clientLimit)
|
||||
return trace, nil
|
||||
}
|
||||
|
||||
func (r *ClickHouseReader) GetDependencyGraph(ctx context.Context, queryParams *model.GetServicesParams) (*[]model.ServiceMapDependencyResponseItem, error) {
|
||||
|
||||
ctx = ctxtypes.NewContextWithCommentVals(ctx, map[string]string{
|
||||
|
||||
@@ -534,6 +534,8 @@ func (aH *APIHandler) RegisterRoutes(router *mux.Router, am *middleware.AuthZ) {
|
||||
|
||||
router.HandleFunc("/api/v2/traces/fields", am.ViewAccess(aH.traceFields)).Methods(http.MethodGet)
|
||||
router.HandleFunc("/api/v2/traces/fields", am.EditAccess(aH.updateTraceField)).Methods(http.MethodPost)
|
||||
router.HandleFunc("/api/v2/traces/flamegraph/{traceId}", am.ViewAccess(aH.GetFlamegraphSpansForTrace)).Methods(http.MethodPost)
|
||||
|
||||
|
||||
router.HandleFunc("/api/v1/version", am.OpenAccess(aH.getVersion)).Methods(http.MethodGet)
|
||||
router.HandleFunc("/api/v1/features", am.ViewAccess(aH.getFeatureFlags)).Methods(http.MethodGet)
|
||||
@@ -1444,6 +1446,40 @@ func (aH *APIHandler) SearchTraces(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
}
|
||||
|
||||
func (aH *APIHandler) GetFlamegraphSpansForTrace(w http.ResponseWriter, r *http.Request) {
|
||||
claims, err := authtypes.ClaimsFromContext(r.Context())
|
||||
if err != nil {
|
||||
render.Error(w, err)
|
||||
return
|
||||
}
|
||||
orgID, err := valuer.NewUUID(claims.OrgID)
|
||||
if err != nil {
|
||||
render.Error(w, err)
|
||||
return
|
||||
}
|
||||
|
||||
traceID := mux.Vars(r)["traceId"]
|
||||
if traceID == "" {
|
||||
render.Error(w, errors.NewInvalidInputf(errors.CodeInvalidInput, "traceID is required"))
|
||||
return
|
||||
}
|
||||
|
||||
req := new(model.GetFlamegraphSpansForTraceParams)
|
||||
err = json.NewDecoder(r.Body).Decode(&req)
|
||||
if err != nil {
|
||||
RespondError(w, model.BadRequest(err), nil)
|
||||
return
|
||||
}
|
||||
|
||||
result, apiErr := aH.reader.GetFlamegraphSpansForTrace(r.Context(), orgID, traceID, req)
|
||||
if apiErr != nil {
|
||||
render.Error(w, apiErr)
|
||||
return
|
||||
}
|
||||
|
||||
aH.WriteJSON(w, r, result)
|
||||
}
|
||||
|
||||
func (aH *APIHandler) listErrors(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
query, err := parseListErrorsRequest(r)
|
||||
|
||||
@@ -10,7 +10,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
cmock "github.com/SigNoz/clickhouse-go-mock"
|
||||
"github.com/SigNoz/signoz/pkg/cache"
|
||||
"github.com/SigNoz/signoz/pkg/cache/cachetest"
|
||||
"github.com/SigNoz/signoz/pkg/instrumentation/instrumentationtest"
|
||||
@@ -27,6 +26,7 @@ import (
|
||||
"github.com/SigNoz/signoz/pkg/telemetrystore"
|
||||
"github.com/SigNoz/signoz/pkg/telemetrystore/telemetrystoretest"
|
||||
"github.com/SigNoz/signoz/pkg/valuer"
|
||||
cmock "github.com/SigNoz/clickhouse-go-mock"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
@@ -1409,6 +1409,8 @@ func Test_querier_Traces_runWindowBasedListQueryDesc(t *testing.T) {
|
||||
telemetryStore,
|
||||
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
|
||||
"",
|
||||
time.Duration(time.Second),
|
||||
nil,
|
||||
nil,
|
||||
options,
|
||||
)
|
||||
@@ -1633,6 +1635,8 @@ func Test_querier_Traces_runWindowBasedListQueryAsc(t *testing.T) {
|
||||
telemetryStore,
|
||||
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
|
||||
"",
|
||||
time.Duration(time.Second),
|
||||
nil,
|
||||
nil,
|
||||
options,
|
||||
)
|
||||
@@ -1932,6 +1936,8 @@ func Test_querier_Logs_runWindowBasedListQueryDesc(t *testing.T) {
|
||||
telemetryStore,
|
||||
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
|
||||
"",
|
||||
time.Duration(time.Second),
|
||||
nil,
|
||||
nil,
|
||||
options,
|
||||
)
|
||||
@@ -2158,6 +2164,8 @@ func Test_querier_Logs_runWindowBasedListQueryAsc(t *testing.T) {
|
||||
telemetryStore,
|
||||
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
|
||||
"",
|
||||
time.Duration(time.Second),
|
||||
nil,
|
||||
nil,
|
||||
options,
|
||||
)
|
||||
|
||||
@@ -1461,6 +1461,8 @@ func Test_querier_Traces_runWindowBasedListQueryDesc(t *testing.T) {
|
||||
telemetryStore,
|
||||
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
|
||||
"",
|
||||
time.Duration(time.Second),
|
||||
nil,
|
||||
nil,
|
||||
options,
|
||||
)
|
||||
@@ -1685,6 +1687,8 @@ func Test_querier_Traces_runWindowBasedListQueryAsc(t *testing.T) {
|
||||
telemetryStore,
|
||||
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
|
||||
"",
|
||||
time.Duration(time.Second),
|
||||
nil,
|
||||
nil,
|
||||
options,
|
||||
)
|
||||
@@ -1983,6 +1987,8 @@ func Test_querier_Logs_runWindowBasedListQueryDesc(t *testing.T) {
|
||||
telemetryStore,
|
||||
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
|
||||
"",
|
||||
time.Duration(time.Second),
|
||||
nil,
|
||||
nil,
|
||||
options,
|
||||
)
|
||||
@@ -2209,6 +2215,8 @@ func Test_querier_Logs_runWindowBasedListQueryAsc(t *testing.T) {
|
||||
telemetryStore,
|
||||
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
|
||||
"",
|
||||
time.Duration(time.Second),
|
||||
nil,
|
||||
nil,
|
||||
options,
|
||||
)
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"net/http"
|
||||
"slices"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/cache/memorycache"
|
||||
"github.com/SigNoz/signoz/pkg/errors"
|
||||
"github.com/SigNoz/signoz/pkg/queryparser"
|
||||
|
||||
@@ -15,6 +16,7 @@ import (
|
||||
"github.com/rs/cors"
|
||||
"github.com/soheilhy/cmux"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/cache"
|
||||
"github.com/SigNoz/signoz/pkg/http/middleware"
|
||||
"github.com/SigNoz/signoz/pkg/licensing/nooplicensing"
|
||||
"github.com/SigNoz/signoz/pkg/query-service/agentConf"
|
||||
@@ -58,12 +60,25 @@ func NewServer(config signoz.Config, signoz *signoz.SigNoz) (*Server, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
cacheForTraceDetail, err := memorycache.New(context.TODO(), signoz.Instrumentation.ToProviderSettings(), cache.Config{
|
||||
Provider: "memory",
|
||||
Memory: cache.Memory{
|
||||
NumCounters: 10 * 10000,
|
||||
MaxCost: 1 << 27, // 128 MB
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
reader := clickhouseReader.NewReader(
|
||||
signoz.Instrumentation.Logger(),
|
||||
signoz.SQLStore,
|
||||
signoz.TelemetryStore,
|
||||
signoz.Prometheus,
|
||||
signoz.TelemetryStore.Cluster(),
|
||||
config.Querier.FluxInterval,
|
||||
cacheForTraceDetail,
|
||||
signoz.Cache,
|
||||
nil,
|
||||
)
|
||||
|
||||
199
pkg/query-service/app/traces/tracedetail/flamegraph.go
Normal file
199
pkg/query-service/app/traces/tracedetail/flamegraph.go
Normal file
@@ -0,0 +1,199 @@
|
||||
package tracedetail
|
||||
|
||||
import (
|
||||
"sort"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/query-service/model"
|
||||
)
|
||||
|
||||
var (
|
||||
flamegraphSpanLevelLimit float64 = 50
|
||||
flamegraphSpanLimitPerLevel int = 100
|
||||
flamegraphSamplingBucketCount int = 50
|
||||
flamegraphTopLatencySpanCount int = 5
|
||||
|
||||
MaxLimitWithoutSampling uint = 120_000
|
||||
)
|
||||
|
||||
func ContainsFlamegraphSpan(slice []*model.FlamegraphSpan, item *model.FlamegraphSpan) bool {
|
||||
for _, v := range slice {
|
||||
if v.SpanID == item.SpanID {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func BfsTraversalForTrace(span *model.FlamegraphSpan, level int64) map[int64][]*model.FlamegraphSpan {
|
||||
bfs := map[int64][]*model.FlamegraphSpan{}
|
||||
bfs[level] = []*model.FlamegraphSpan{span}
|
||||
|
||||
for _, child := range span.Children {
|
||||
childBfsMap := BfsTraversalForTrace(child, level+1)
|
||||
for _level, nodes := range childBfsMap {
|
||||
bfs[_level] = append(bfs[_level], nodes...)
|
||||
}
|
||||
}
|
||||
span.Level = level
|
||||
span.Children = make([]*model.FlamegraphSpan, 0)
|
||||
|
||||
return bfs
|
||||
}
|
||||
|
||||
func FindIndexForSelectedSpan(spans [][]*model.FlamegraphSpan, selectedSpanId string) int {
|
||||
var selectedSpanLevel int = 0
|
||||
|
||||
for index, _spans := range spans {
|
||||
for _, span := range _spans {
|
||||
if span.SpanID == selectedSpanId {
|
||||
selectedSpanLevel = index
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return selectedSpanLevel
|
||||
}
|
||||
|
||||
// GetAllSpansForFlamegraph groups all spans as per their level
|
||||
func GetAllSpansForFlamegraph(traceRoots []*model.FlamegraphSpan, spanIdToSpanNodeMap map[string]*model.FlamegraphSpan) [][]*model.FlamegraphSpan {
|
||||
|
||||
var traceIdLevelledFlamegraph = map[string]map[int64][]*model.FlamegraphSpan{}
|
||||
selectedSpans := [][]*model.FlamegraphSpan{}
|
||||
|
||||
// sort the trace roots to add missing spans at the right order
|
||||
sort.Slice(traceRoots, func(i, j int) bool {
|
||||
if traceRoots[i].TimeUnixNano == traceRoots[j].TimeUnixNano {
|
||||
return traceRoots[i].Name < traceRoots[j].Name
|
||||
}
|
||||
return traceRoots[i].TimeUnixNano < traceRoots[j].TimeUnixNano
|
||||
})
|
||||
|
||||
for _, rootSpanID := range traceRoots {
|
||||
if rootNode, exists := spanIdToSpanNodeMap[rootSpanID.SpanID]; exists {
|
||||
bfsMapForTrace := BfsTraversalForTrace(rootNode, 0)
|
||||
traceIdLevelledFlamegraph[rootSpanID.SpanID] = bfsMapForTrace
|
||||
}
|
||||
}
|
||||
|
||||
for _, trace := range traceRoots {
|
||||
keys := make([]int64, 0, len(traceIdLevelledFlamegraph[trace.SpanID]))
|
||||
for key := range traceIdLevelledFlamegraph[trace.SpanID] {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
|
||||
sort.Slice(keys, func(i, j int) bool {
|
||||
return keys[i] < keys[j]
|
||||
})
|
||||
|
||||
for _, level := range keys {
|
||||
if ok, exists := traceIdLevelledFlamegraph[trace.SpanID][level]; exists {
|
||||
selectedSpans = append(selectedSpans, ok)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return selectedSpans
|
||||
}
|
||||
|
||||
func getLatencyAndTimestampBucketedSpans(spans []*model.FlamegraphSpan, selectedSpanID string, isSelectedSpanIDPresent bool, startTime uint64, endTime uint64) []*model.FlamegraphSpan {
|
||||
var sampledSpans []*model.FlamegraphSpan
|
||||
// sort the spans by latency for latency filtering
|
||||
sort.Slice(spans, func(i, j int) bool {
|
||||
return spans[i].DurationNano > spans[j].DurationNano
|
||||
})
|
||||
|
||||
// pick the top 5 latency spans
|
||||
for idx := range flamegraphTopLatencySpanCount {
|
||||
sampledSpans = append(sampledSpans, spans[idx])
|
||||
}
|
||||
|
||||
// always add the selectedSpan
|
||||
if isSelectedSpanIDPresent {
|
||||
idx := -1
|
||||
for _idx, span := range spans {
|
||||
if span.SpanID == selectedSpanID {
|
||||
idx = _idx
|
||||
}
|
||||
}
|
||||
if idx != -1 {
|
||||
sampledSpans = append(sampledSpans, spans[idx])
|
||||
}
|
||||
}
|
||||
|
||||
bucketSize := (endTime - startTime) / uint64(flamegraphSamplingBucketCount)
|
||||
if bucketSize == 0 {
|
||||
bucketSize = 1
|
||||
}
|
||||
|
||||
bucketedSpans := make([][]*model.FlamegraphSpan, flamegraphSamplingBucketCount)
|
||||
|
||||
for _, span := range spans {
|
||||
if span.TimeUnixNano >= startTime && span.TimeUnixNano <= endTime {
|
||||
bucketIndex := int((span.TimeUnixNano - startTime) / bucketSize)
|
||||
if bucketIndex >= 0 && bucketIndex < flamegraphSamplingBucketCount {
|
||||
bucketedSpans[bucketIndex] = append(bucketedSpans[bucketIndex], span)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for i := range bucketedSpans {
|
||||
if len(bucketedSpans[i]) > 2 {
|
||||
// Keep only the first 2 spans
|
||||
bucketedSpans[i] = bucketedSpans[i][:2]
|
||||
}
|
||||
}
|
||||
|
||||
// Flatten the bucketed spans into a single slice
|
||||
for _, bucket := range bucketedSpans {
|
||||
sampledSpans = append(sampledSpans, bucket...)
|
||||
}
|
||||
|
||||
return sampledSpans
|
||||
}
|
||||
|
||||
func GetSelectedSpansForFlamegraphForRequest(selectedSpanID string, selectedSpans [][]*model.FlamegraphSpan, startTime uint64, endTime uint64) [][]*model.FlamegraphSpan {
|
||||
var selectedSpansForRequest = make([][]*model.FlamegraphSpan, 0)
|
||||
var selectedIndex = 0
|
||||
|
||||
if selectedSpanID != "" {
|
||||
selectedIndex = FindIndexForSelectedSpan(selectedSpans, selectedSpanID)
|
||||
}
|
||||
|
||||
lowerLimit := selectedIndex - int(flamegraphSpanLevelLimit*0.4)
|
||||
upperLimit := selectedIndex + int(flamegraphSpanLevelLimit*0.6)
|
||||
|
||||
if lowerLimit < 0 {
|
||||
upperLimit = upperLimit - lowerLimit
|
||||
lowerLimit = 0
|
||||
}
|
||||
|
||||
if upperLimit > len(selectedSpans) {
|
||||
lowerLimit = lowerLimit - (upperLimit - len(selectedSpans))
|
||||
upperLimit = len(selectedSpans)
|
||||
}
|
||||
|
||||
if lowerLimit < 0 {
|
||||
lowerLimit = 0
|
||||
}
|
||||
|
||||
for i := lowerLimit; i < upperLimit; i++ {
|
||||
if len(selectedSpans[i]) > flamegraphSpanLimitPerLevel {
|
||||
_spans := getLatencyAndTimestampBucketedSpans(selectedSpans[i], selectedSpanID, i == selectedIndex, startTime, endTime)
|
||||
selectedSpansForRequest = append(selectedSpansForRequest, _spans)
|
||||
} else {
|
||||
selectedSpansForRequest = append(selectedSpansForRequest, selectedSpans[i])
|
||||
}
|
||||
}
|
||||
|
||||
return selectedSpansForRequest
|
||||
}
|
||||
|
||||
func GetTotalSpanCount(spans [][]*model.FlamegraphSpan) uint64 {
|
||||
levelCount := len(spans)
|
||||
spanCount := uint64(0)
|
||||
for i := range levelCount {
|
||||
spanCount += uint64(len(spans[i]))
|
||||
}
|
||||
return spanCount
|
||||
}
|
||||
@@ -43,6 +43,8 @@ type Reader interface {
|
||||
|
||||
// Search Interfaces
|
||||
SearchTraces(ctx context.Context, params *model.SearchTracesParams) (*[]model.SearchSpansResult, error)
|
||||
GetFlamegraphSpansForTrace(ctx context.Context, orgID valuer.UUID, traceID string, req *model.GetFlamegraphSpansForTraceParams) (*model.GetFlamegraphSpansForTraceResponse, error)
|
||||
|
||||
// Setter Interfaces
|
||||
SetTTL(ctx context.Context, orgID string, ttlParams *retentiontypes.TTLParams) (*retentiontypes.SetTTLResponseItem, *model.ApiError)
|
||||
SetTTLV2(ctx context.Context, orgID string, params *retentiontypes.CustomRetentionTTLParams) (*retentiontypes.CustomRetentionTTLResponse, error)
|
||||
|
||||
42
pkg/query-service/model/cacheable.go
Normal file
42
pkg/query-service/model/cacheable.go
Normal file
@@ -0,0 +1,42 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/types/cachetypes"
|
||||
)
|
||||
|
||||
type GetFlamegraphSpansForTraceCache struct {
|
||||
StartTime uint64 `json:"startTime"`
|
||||
EndTime uint64 `json:"endTime"`
|
||||
DurationNano uint64 `json:"durationNano"`
|
||||
SelectedSpans [][]*FlamegraphSpan `json:"selectedSpans"`
|
||||
TraceRoots []*FlamegraphSpan `json:"traceRoots"`
|
||||
}
|
||||
|
||||
func (c *GetFlamegraphSpansForTraceCache) Clone() cachetypes.Cacheable {
|
||||
return &GetFlamegraphSpansForTraceCache{
|
||||
StartTime: c.StartTime,
|
||||
EndTime: c.EndTime,
|
||||
DurationNano: c.DurationNano,
|
||||
SelectedSpans: c.SelectedSpans,
|
||||
TraceRoots: c.TraceRoots,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *GetFlamegraphSpansForTraceCache) Cost() int64 {
|
||||
const perSpanBytes = 128
|
||||
var spans int64
|
||||
for _, row := range c.SelectedSpans {
|
||||
spans += int64(len(row))
|
||||
}
|
||||
spans += int64(len(c.TraceRoots))
|
||||
return spans * perSpanBytes
|
||||
}
|
||||
|
||||
func (c *GetFlamegraphSpansForTraceCache) MarshalBinary() (data []byte, err error) {
|
||||
return json.Marshal(c)
|
||||
}
|
||||
func (c *GetFlamegraphSpansForTraceCache) UnmarshalBinary(data []byte) error {
|
||||
return json.Unmarshal(data, c)
|
||||
}
|
||||
@@ -2,6 +2,8 @@ package model
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/types/telemetrytypes"
|
||||
)
|
||||
|
||||
type InstantQueryMetricsParams struct {
|
||||
@@ -329,6 +331,14 @@ type SearchTracesParams struct {
|
||||
MaxSpansInTrace int `json:"maxSpansInTrace"`
|
||||
}
|
||||
|
||||
type GetFlamegraphSpansForTraceParams struct {
|
||||
SelectedSpanID string `json:"selectedSpanId"`
|
||||
Limit uint `json:"limit"`
|
||||
BoundaryStartTS uint64 `json:"boundaryStartTsMilli"`
|
||||
BoundaryEndTS uint64 `json:"boundarEndTsMilli"`
|
||||
SelectFields []telemetrytypes.TelemetryFieldKey `json:"selectFields"`
|
||||
}
|
||||
|
||||
type SpanFilterParams struct {
|
||||
TraceID []string `json:"traceID"`
|
||||
Status []string `json:"status"`
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/types/telemetrytypes"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/prometheus/prometheus/promql/parser"
|
||||
"github.com/prometheus/prometheus/util/stats"
|
||||
@@ -239,6 +240,13 @@ type SearchSpanDBResponseItem struct {
|
||||
Model string `ch:"model"`
|
||||
}
|
||||
|
||||
type Event struct {
|
||||
Name string `json:"name,omitempty"`
|
||||
TimeUnixNano uint64 `json:"timeUnixNano,omitempty"`
|
||||
AttributeMap map[string]interface{} `json:"attributeMap,omitempty"`
|
||||
IsError bool `json:"isError,omitempty"`
|
||||
}
|
||||
|
||||
//easyjson:json
|
||||
type SearchSpanResponseItem struct {
|
||||
TimeUnixNano uint64 `json:"timestamp"`
|
||||
@@ -259,6 +267,79 @@ type SearchSpanResponseItem struct {
|
||||
SpanKind string `json:"spanKind"`
|
||||
}
|
||||
|
||||
type Span struct {
|
||||
TimeUnixNano uint64 `json:"timestamp"`
|
||||
DurationNano uint64 `json:"durationNano"`
|
||||
SpanID string `json:"spanId"`
|
||||
RootSpanID string `json:"rootSpanId"`
|
||||
TraceID string `json:"traceId"`
|
||||
HasError bool `json:"hasError"`
|
||||
Kind int32 `json:"kind"`
|
||||
ServiceName string `json:"serviceName"`
|
||||
Name string `json:"name"`
|
||||
References []OtelSpanRef `json:"references,omitempty"`
|
||||
TagMap map[string]string `json:"tagMap"`
|
||||
Events []Event `json:"event"`
|
||||
RootName string `json:"rootName"`
|
||||
StatusMessage string `json:"statusMessage"`
|
||||
StatusCodeString string `json:"statusCodeString"`
|
||||
SpanKind string `json:"spanKind"`
|
||||
Children []*Span `json:"children"`
|
||||
|
||||
// the below two fields are for frontend to render the spans
|
||||
SubTreeNodeCount uint64 `json:"subTreeNodeCount"`
|
||||
HasChildren bool `json:"hasChildren"`
|
||||
HasSiblings bool `json:"hasSiblings"`
|
||||
Level uint64 `json:"level"`
|
||||
}
|
||||
|
||||
type FlamegraphSpan struct {
|
||||
TimeUnixNano uint64 `json:"timestamp"`
|
||||
DurationNano uint64 `json:"durationNano"`
|
||||
SpanID string `json:"spanId"`
|
||||
TraceID string `json:"traceId"`
|
||||
HasError bool `json:"hasError"`
|
||||
ServiceName string `json:"serviceName"`
|
||||
Name string `json:"name"`
|
||||
Level int64 `json:"level"`
|
||||
Events []Event `json:"event"`
|
||||
References []OtelSpanRef `json:"references,omitempty"`
|
||||
Children []*FlamegraphSpan `json:"children"`
|
||||
Attributes map[string]any `json:"attributes,omitempty"`
|
||||
Resource map[string]string `json:"resource,omitempty"`
|
||||
}
|
||||
|
||||
// SetRequestedFields extracts the requested attribute/resource fields from item into s.
|
||||
// This can eventually support missing fieldContext by checking both
|
||||
func (s *FlamegraphSpan) SetRequestedFields(item SpanItemV2, fields []telemetrytypes.TelemetryFieldKey) {
|
||||
for _, field := range fields {
|
||||
switch field.FieldContext {
|
||||
case telemetrytypes.FieldContextResource:
|
||||
if v, ok := item.Resources_string[field.Name]; ok && v != "" {
|
||||
if s.Resource == nil {
|
||||
s.Resource = make(map[string]string)
|
||||
}
|
||||
s.Resource[field.Name] = v
|
||||
}
|
||||
case telemetrytypes.FieldContextAttribute:
|
||||
if v := item.AttributeValue(field.Name); v != nil {
|
||||
if s.Attributes == nil {
|
||||
s.Attributes = make(map[string]any)
|
||||
}
|
||||
s.Attributes[field.Name] = v
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type GetFlamegraphSpansForTraceResponse struct {
|
||||
StartTimestampMillis uint64 `json:"startTimestampMillis"`
|
||||
EndTimestampMillis uint64 `json:"endTimestampMillis"`
|
||||
DurationNano uint64 `json:"durationNano"`
|
||||
Spans [][]*FlamegraphSpan `json:"spans"`
|
||||
HasMore bool `json:"hasMore"`
|
||||
}
|
||||
|
||||
type OtelSpanRef struct {
|
||||
TraceId string `json:"traceId,omitempty"`
|
||||
SpanId string `json:"spanId,omitempty"`
|
||||
|
||||
@@ -101,9 +101,29 @@ func (b *MetricQueryStatementBuilder) Build(
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var pairFallbackWarnings []string
|
||||
for _, sel := range keySelectors {
|
||||
if _, ok := keys[sel.Name]; !ok {
|
||||
keys[sel.Name] = []*telemetrytypes.TelemetryFieldKey{{
|
||||
Name: sel.Name,
|
||||
FieldContext: telemetrytypes.FieldContextAttribute,
|
||||
FieldDataType: telemetrytypes.FieldDataTypeString,
|
||||
Signal: telemetrytypes.SignalMetrics,
|
||||
}}
|
||||
pairFallbackWarnings = append(pairFallbackWarnings,
|
||||
fmt.Sprintf("key `%s` not found on metric %s", sel.Name, query.Aggregations[0].MetricName),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
start, end = querybuilder.AdjustedMetricTimeRange(start, end, uint64(query.StepInterval.Seconds()), query)
|
||||
|
||||
return b.buildPipelineStatement(ctx, start, end, query, keys, variables)
|
||||
stmt, err := b.buildPipelineStatement(ctx, start, end, query, keys, variables)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stmt.Warnings = append(stmt.Warnings, pairFallbackWarnings...)
|
||||
return stmt, nil
|
||||
}
|
||||
|
||||
func (b *MetricQueryStatementBuilder) buildPipelineStatement(
|
||||
|
||||
@@ -338,7 +338,6 @@ func isValidLabelValue(v string) bool {
|
||||
// validate runs during UnmarshalJSON (read + write path).
|
||||
// Preserves the original pre-existing checks only so that stored rules
|
||||
// continue to load without errors.
|
||||
// TODO(srikanthccv): remove this once v1 is deprecated and removed
|
||||
func (r *PostableRule) validate() error {
|
||||
var errs []error
|
||||
|
||||
@@ -367,13 +366,9 @@ func (r *PostableRule) validate() error {
|
||||
|
||||
errs = append(errs, testTemplateParsing(r)...)
|
||||
|
||||
if len(errs) > 0 {
|
||||
messages := make([]string, len(errs))
|
||||
for i, e := range errs {
|
||||
messages[i] = e.Error()
|
||||
}
|
||||
return errors.NewInvalidInputf(errors.CodeInvalidInput, "alert rule definition is not valid").
|
||||
WithAdditional(messages...)
|
||||
joined := errors.Join(errs...)
|
||||
if joined != nil {
|
||||
return errors.WrapInvalidInputf(joined, errors.CodeInvalidInput, "validation failed")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -471,13 +466,9 @@ func (r *PostableRule) Validate() error {
|
||||
|
||||
errs = append(errs, testTemplateParsing(r)...)
|
||||
|
||||
if len(errs) > 0 {
|
||||
messages := make([]string, len(errs))
|
||||
for i, e := range errs {
|
||||
messages[i] = e.Error()
|
||||
}
|
||||
return errors.NewInvalidInputf(errors.CodeInvalidInput, "alert rule is not valid").
|
||||
WithAdditional(messages...)
|
||||
joined := errors.Join(errs...)
|
||||
if joined != nil {
|
||||
return errors.WrapInvalidInputf(joined, errors.CodeInvalidInput, "validation failed")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -4,23 +4,8 @@ import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/SigNoz/signoz/pkg/errors"
|
||||
)
|
||||
|
||||
func errorContains(err error, substr string) bool {
|
||||
j := errors.AsJSON(err)
|
||||
if strings.Contains(j.Message, substr) {
|
||||
return true
|
||||
}
|
||||
for _, e := range j.Errors {
|
||||
if strings.Contains(e.Message, substr) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// validV1Builder returns a minimal valid v1 builder rule JSON.
|
||||
func validV1Builder() string {
|
||||
return `{
|
||||
@@ -509,7 +494,7 @@ func TestValidate_PostableRule_Common(t *testing.T) {
|
||||
if tt.wantErr {
|
||||
if err == nil {
|
||||
t.Errorf("expected error containing %q, got nil", tt.errSubstr)
|
||||
} else if tt.errSubstr != "" && !errorContains(err, tt.errSubstr) {
|
||||
} else if tt.errSubstr != "" && !strings.Contains(err.Error(), tt.errSubstr) {
|
||||
t.Errorf("expected error containing %q, got: %v", tt.errSubstr, err)
|
||||
}
|
||||
} else {
|
||||
@@ -702,7 +687,7 @@ func TestValidate_V1_ConditionFields(t *testing.T) {
|
||||
if tt.wantErr {
|
||||
if validateErr == nil {
|
||||
t.Errorf("expected Validate() error containing %q, got nil", tt.errSubstr)
|
||||
} else if tt.errSubstr != "" && !errorContains(validateErr, tt.errSubstr) {
|
||||
} else if tt.errSubstr != "" && !strings.Contains(validateErr.Error(), tt.errSubstr) {
|
||||
t.Errorf("expected error containing %q, got: %v", tt.errSubstr, validateErr)
|
||||
}
|
||||
} else {
|
||||
@@ -1044,7 +1029,7 @@ func TestValidate_V2Alpha1(t *testing.T) {
|
||||
if tt.wantErr {
|
||||
if err == nil {
|
||||
t.Errorf("expected error containing %q, got nil", tt.errSubstr)
|
||||
} else if tt.errSubstr != "" && !errorContains(err, tt.errSubstr) {
|
||||
} else if tt.errSubstr != "" && !strings.Contains(err.Error(), tt.errSubstr) {
|
||||
t.Errorf("expected error containing %q, got: %v", tt.errSubstr, err)
|
||||
}
|
||||
} else {
|
||||
@@ -1352,7 +1337,7 @@ func TestValidate_MultipleErrors(t *testing.T) {
|
||||
t.Fatal("expected unmarshal error for wrong version")
|
||||
}
|
||||
// The error should mention version
|
||||
if !errorContains(err, "version") {
|
||||
if !strings.Contains(err.Error(), "version") {
|
||||
t.Errorf("expected error to mention version, got: %v", err)
|
||||
}
|
||||
})
|
||||
@@ -1370,9 +1355,10 @@ func TestValidate_MultipleErrors(t *testing.T) {
|
||||
if validateErr == nil {
|
||||
t.Fatal("expected Validate() error")
|
||||
}
|
||||
errStr := validateErr.Error()
|
||||
// Should contain errors for thresholds, evaluation, notificationSettings
|
||||
for _, substr := range []string{"evaluation", "notificationSettings"} {
|
||||
if !errorContains(validateErr, substr) {
|
||||
if !strings.Contains(errStr, substr) {
|
||||
t.Errorf("expected error to mention %q, got: %v", substr, validateErr)
|
||||
}
|
||||
}
|
||||
@@ -1483,7 +1469,7 @@ func TestValidate_V2Alpha1_CumulativeEvaluation(t *testing.T) {
|
||||
if tt.wantErr {
|
||||
if err == nil {
|
||||
t.Errorf("expected error containing %q, got nil", tt.errSubstr)
|
||||
} else if !errorContains(err, tt.errSubstr) {
|
||||
} else if !strings.Contains(err.Error(), tt.errSubstr) {
|
||||
t.Errorf("expected error containing %q, got: %v", tt.errSubstr, err)
|
||||
}
|
||||
} else if err != nil {
|
||||
|
||||
@@ -614,7 +614,7 @@ def test_histogram_p90_returns_warning_outside_data_window(
|
||||
assert warnings[0]["message"].startswith(f"no data found for the metric {metric_name}")
|
||||
|
||||
|
||||
def test_non_existent_metrics_returns_404(
|
||||
def test_non_existent_metrics_returns_warning(
|
||||
signoz: types.SigNoz,
|
||||
create_user_admin: None, # pylint: disable=unused-argument
|
||||
get_token: Callable[[str, str], str],
|
||||
@@ -635,9 +635,11 @@ def test_non_existent_metrics_returns_404(
|
||||
|
||||
start_2h = int((now - timedelta(hours=2)).timestamp() * 1000)
|
||||
response = make_query_request(signoz, token, start_2h, end_ms, [query])
|
||||
assert response.status_code == HTTPStatus.NOT_FOUND
|
||||
assert response.status_code == HTTPStatus.OK
|
||||
|
||||
assert get_error_message(response.json()) == "could not find the metric whatevergoennnsgoeshere"
|
||||
data = response.json()
|
||||
warnings = get_all_warnings(data)
|
||||
assert any("whatevergoennnsgoeshere" in w and "has never been received" in w for w in warnings), f"expected never-seen metric warning, got: {warnings}"
|
||||
|
||||
|
||||
def test_non_existent_internal_metrics_returns_no_warning(
|
||||
|
||||
Reference in New Issue
Block a user