Compare commits

...

6 Commits

Author SHA1 Message Date
Nikhil Soni
168b2eaa9c feat: query full spans for smaller traces 2026-05-27 00:12:38 +05:30
Nikhil Soni
6b613f18a3 feat: add api and module for flamegraph v3 2026-05-26 20:04:20 +05:30
Nikhil Soni
1b0447181d feat: add method to enrich selected spans 2026-05-26 20:03:47 +05:30
Nikhil Soni
20edff4771 feat: add config for flamegraph 2026-05-26 19:21:33 +05:30
Nikhil Soni
2048ef3d2f chore: remove limit from request payload
It's a new api so doesn't need to be backward compatible
2026-05-26 19:06:48 +05:30
Nikhil Soni
53c551359e feat: add types for flamegraph v3 in module structure 2026-05-26 18:56:35 +05:30
9 changed files with 514 additions and 1 deletions

View File

@@ -434,6 +434,17 @@ tracedetail:
max_depth_to_auto_expand: 5
# Threshold below which all spans are returned without windowing.
max_limit_to_select_all_spans: 10000
flamegraph:
# Maximum number of BFS depth levels included in a windowed response.
max_selected_levels: 50
# Maximum spans per level before sampling is applied.
max_spans_per_level: 100
# Number of highest-latency spans always included when sampling a level.
sampling_top_latency_count: 5
# Number of timestamp buckets used for uniform sampling within a level.
sampling_bucket_count: 50
# Span-count threshold at or below which all spans are returned without windowing or sampling.
select_all_spans_limit: 100000
##################### Authz #################################
authz:

View File

@@ -48,5 +48,23 @@ func (provider *provider) addTraceDetailRoutes(router *mux.Router) error {
return err
}
if err := router.Handle("/api/v3/traces/{traceID}/flamegraph", handler.New(
provider.authzMiddleware.ViewAccess(provider.traceDetailHandler.GetFlamegraph),
handler.OpenAPIDef{
ID: "GetFlamegraph",
Tags: []string{"tracedetail"},
Summary: "Get flamegraph view for a trace",
Description: "Returns the flamegraph view of spans for a given trace ID.",
Request: new(spantypes.PostableFlamegraph),
RequestContentType: "application/json",
Response: new(spantypes.GettableFlamegraphTrace),
ResponseContentType: "application/json",
SuccessStatusCode: http.StatusOK,
ErrorStatusCodes: []int{http.StatusBadRequest, http.StatusNotFound},
SecuritySchemes: newSecuritySchemes(types.RoleViewer),
},
)).Methods(http.MethodPost).GetError(); err != nil {
return err
}
return nil
}

View File

@@ -6,7 +6,16 @@ import (
)
// Config holds the trace detail settings, one section per view.
type Config struct {
// Waterfall configures the waterfall view.
Waterfall WaterfallConfig `mapstructure:"waterfall"`
Waterfall WaterfallConfig `mapstructure:"waterfall"`
// Flamegraph configures the flamegraph view (windowing and sampling limits).
Flamegraph FlamegraphConfig `mapstructure:"flamegraph"`
}
// FlamegraphConfig bounds how much of a trace the flamegraph endpoint returns.
type FlamegraphConfig struct {
// MaxSelectedLevels is the maximum number of BFS depth levels included in a
// windowed response. Must be positive.
MaxSelectedLevels int `mapstructure:"max_selected_levels"`
// MaxSpansPerLevel is the number of spans a level may hold before sampling is
// applied to it. Must be positive.
MaxSpansPerLevel int `mapstructure:"max_spans_per_level"`
// SamplingTopLatencySpansCount is the number of highest-latency spans always kept
// when a level is sampled. Must not be negative.
SamplingTopLatencySpansCount int `mapstructure:"sampling_top_latency_count"`
// SamplingBucketCount is the number of timestamp buckets used for uniform sampling
// within a level. Must be positive.
SamplingBucketCount int `mapstructure:"sampling_bucket_count"`
// SelectAllSpansLimit is the span count at or below which the whole trace is
// returned without windowing or sampling. Must be non-zero.
SelectAllSpansLimit uint `mapstructure:"select_all_spans_limit"`
}
type WaterfallConfig struct {
@@ -29,6 +38,13 @@ func newConfig() factory.Config {
MaxDepthToAutoExpand: 5,
MaxLimitToSelectAllSpans: 10_000,
},
Flamegraph: FlamegraphConfig{
MaxSelectedLevels: 50,
MaxSpansPerLevel: 100,
SamplingTopLatencySpansCount: 5,
SamplingBucketCount: 50,
SelectAllSpansLimit: 100_000,
},
}
}
@@ -45,5 +61,25 @@ func (c Config) Validate() error {
return errors.NewInvalidInputf(errors.CodeInvalidInput,
"tracedetail.waterfall.max_limit_to_select_all_spans must be positive")
}
// Flamegraph settings. Each message names the exact configuration key (matching the
// mapstructure tags on FlamegraphConfig) so the offending value can be located.
if c.Flamegraph.MaxSelectedLevels <= 0 {
	return errors.NewInvalidInputf(errors.CodeInvalidInput,
		"tracedetail.flamegraph.max_selected_levels must be positive, got %d", c.Flamegraph.MaxSelectedLevels)
}
if c.Flamegraph.MaxSpansPerLevel <= 0 {
	return errors.NewInvalidInputf(errors.CodeInvalidInput,
		"tracedetail.flamegraph.max_spans_per_level must be positive, got %d", c.Flamegraph.MaxSpansPerLevel)
}
// Zero is allowed here: it simply disables the "always keep top latency" rule.
if c.Flamegraph.SamplingTopLatencySpansCount < 0 {
	return errors.NewInvalidInputf(errors.CodeInvalidInput,
		"tracedetail.flamegraph.sampling_top_latency_count cannot be negative, got %d", c.Flamegraph.SamplingTopLatencySpansCount)
}
if c.Flamegraph.SamplingBucketCount <= 0 {
	return errors.NewInvalidInputf(errors.CodeInvalidInput,
		"tracedetail.flamegraph.sampling_bucket_count must be positive, got %d", c.Flamegraph.SamplingBucketCount)
}
// SelectAllSpansLimit is unsigned, so zero is the only invalid value.
if c.Flamegraph.SelectAllSpansLimit == 0 {
	return errors.NewInvalidInputf(errors.CodeInvalidInput,
		"tracedetail.flamegraph.select_all_spans_limit must be positive")
}
return nil
}

View File

@@ -59,3 +59,19 @@ func (h *handler) GetWaterfallV4(rw http.ResponseWriter, r *http.Request) {
render.Success(rw, http.StatusOK, result)
}
// GetFlamegraph handles a flamegraph request for a single trace. It decodes the JSON body
// into a PostableFlamegraph, reads the trace ID from the route variables, and renders
// either the module's result with 200 OK or the error that occurred.
func (h *handler) GetFlamegraph(rw http.ResponseWriter, r *http.Request) {
	var body spantypes.PostableFlamegraph
	if err := binding.JSON.BindBody(r.Body, &body); err != nil {
		render.Error(rw, err)
		return
	}

	ctx := r.Context()
	traceID := mux.Vars(r)["traceID"]

	flamegraph, err := h.module.GetFlamegraph(ctx, traceID, &body)
	if err != nil {
		render.Error(rw, err)
		return
	}

	render.Success(rw, http.StatusOK, flamegraph)
}

View File

@@ -105,6 +105,64 @@ func (m *module) getFullWaterfall(ctx context.Context, traceID string, summary *
return spantypes.NewGettableWaterfallTrace(waterfallTrace, selectedSpans, nil, true, nil), nil
}
// GetFlamegraph returns the flamegraph for traceID. The trace summary is fetched first and
// its span count picks the strategy: a trace with at most
// config.Flamegraph.SelectAllSpansLimit spans is returned in full, a larger one is reduced
// to a window positioned by req.SelectedSpanID.
func (m *module) GetFlamegraph(ctx context.Context, traceID string, req *spantypes.PostableFlamegraph) (*spantypes.GettableFlamegraphTrace, error) {
	summary, err := m.store.GetTraceSummary(ctx, traceID)
	if err != nil {
		return nil, err
	}

	selectAllLimit := uint64(m.config.Flamegraph.SelectAllSpansLimit)
	if summary.NumSpans > selectAllLimit {
		return m.getWindowedFlamegraph(ctx, traceID, req.SelectedSpanID, summary)
	}
	return m.getFullFlamegraph(ctx, traceID, summary)
}
// getFullFlamegraph loads every span of the trace from the store and returns all of its
// levels, reporting hasMore as false. It returns ErrTraceNotFound when the store yields
// no spans.
func (m *module) getFullFlamegraph(ctx context.Context, traceID string, summary *spantypes.TraceSummary) (*spantypes.GettableFlamegraphTrace, error) {
	spans, err := m.store.GetTraceSpans(ctx, traceID, summary)
	switch {
	case err != nil:
		return nil, err
	case len(spans) == 0:
		return nil, spantypes.ErrTraceNotFound
	}

	levels := spantypes.NewFlamegraphTraceFromStorable(spans).GetAllLevels()
	startMs, endMs := summary.Start.UnixMilli(), summary.End.UnixMilli()
	return spantypes.NewGettableFlamegraphTrace(levels, startMs, endMs, false), nil
}
// getWindowedFlamegraph returns a bounded window of levels around the selected span, with
// dense levels sampled down. The tree is built from minimal spans; full span data is then
// fetched only for the spans that made it into the window. The response reports hasMore
// as true. ErrTraceNotFound is returned when the store has no spans for the trace or the
// window comes back empty.
func (m *module) getWindowedFlamegraph(ctx context.Context, traceID, selectedSpanID string, summary *spantypes.TraceSummary) (*spantypes.GettableFlamegraphTrace, error) {
	minimal, err := m.store.GetMinimalSpans(ctx, traceID, summary.Start, summary.End)
	if err != nil {
		return nil, err
	}
	if len(minimal) == 0 {
		return nil, spantypes.ErrTraceNotFound
	}

	trace := spantypes.NewFlamegraphTraceFromMinimal(minimal)
	// Drop our reference to the minimal spans; the tree is all that is needed from here on.
	minimal = nil

	window := trace.GetSelectedLevels(
		selectedSpanID,
		m.config.Flamegraph.MaxSelectedLevels,
		m.config.Flamegraph.MaxSpansPerLevel,
		m.config.Flamegraph.SamplingTopLatencySpansCount,
		m.config.Flamegraph.SamplingBucketCount,
	)
	if len(window) == 0 {
		return nil, spantypes.ErrTraceNotFound
	}

	windowIDs := spantypes.FlamegraphWindowSpanIDs(window)
	fullSpans, err := m.store.GetTraceSpansByIDs(ctx, traceID, summary.Start, summary.End, windowIDs)
	if err != nil {
		return nil, err
	}

	levels := trace.EnrichSelectedSpans(window, fullSpans)
	startMs, endMs := summary.Start.UnixMilli(), summary.End.UnixMilli()
	return spantypes.NewGettableFlamegraphTrace(levels, startMs, endMs, true), nil
}
// getWindowedWaterfall builds the waterfall tree with minimal data and then returns only a window of full spans.
func (m *module) getWindowedWaterfall(ctx context.Context, traceID, selectedSpanID string, uncollapsedSpans []string, start, end time.Time) (*spantypes.GettableWaterfallTrace, error) {
// Step 1: minimal fetch → build full tree → select visible window

View File

@@ -11,10 +11,12 @@ import (
type Handler interface {
GetWaterfall(http.ResponseWriter, *http.Request)
GetWaterfallV4(http.ResponseWriter, *http.Request)
// GetFlamegraph handles flamegraph requests for a single trace; it is registered
// for POST /api/v3/traces/{traceID}/flamegraph.
GetFlamegraph(http.ResponseWriter, *http.Request)
}
// Module defines the business logic for trace detail operations.
type Module interface {
GetWaterfall(ctx context.Context, traceID string, req *spantypes.PostableWaterfall) (*spantypes.GettableWaterfallTrace, error)
GetWaterfallV4(ctx context.Context, traceID string, selectedSpanID string, uncollapsedSpans []string, selectAllLimit uint) (*spantypes.GettableWaterfallTrace, error)
// GetFlamegraph returns the flamegraph for traceID. req carries the selected span ID,
// which positions the window when the trace is too large to be returned in full.
GetFlamegraph(ctx context.Context, traceID string, req *spantypes.PostableFlamegraph) (*spantypes.GettableFlamegraphTrace, error)
}

View File

@@ -0,0 +1,81 @@
package spantypes
import (
"maps"
"github.com/SigNoz/signoz/pkg/types/telemetrytypes"
)
// FlamegraphSpan is one span of the flamegraph response. Through Children it also serves
// as a node of the in-memory tree used to compute levels.
type FlamegraphSpan struct {
SpanID string `json:"spanId"`
// ParentSpanID is empty for a span that has no parent.
ParentSpanID string `json:"parentSpanId"`
// Timestamp is the span start time in Unix nanoseconds.
Timestamp uint64 `json:"timestamp"`
DurationNano uint64 `json:"durationNano"`
HasError bool `json:"hasError"`
ServiceName string `json:"serviceName"`
Name string `json:"name"`
// Level is the depth of the span below its root (roots are level 0); it is assigned
// while the levels are built.
Level int64 `json:"level"`
// Events is serialized under the singular key "event".
Events []Event `json:"event"`
// Attributes and Resource are omitted from the JSON output when empty.
Attributes map[string]any `json:"attributes,omitempty"`
Resource map[string]string `json:"resource,omitempty"`
Children []*FlamegraphSpan `json:"-"` // internal tree use only
}
// FlamegraphLevel groups span IDs at a single level within the selected window.
type FlamegraphLevel struct {
// Level is the depth shared by the spans in this group.
Level int64
// SpanIDs lists the spans kept for this level, after any sampling.
SpanIDs []string
}
// PostableFlamegraph is the JSON request body of the flamegraph API.
type PostableFlamegraph struct {
// SelectedSpanID is the span the level window is positioned around when the trace
// is too large to be returned in full. It may be empty.
SelectedSpanID string `json:"selectedSpanId"`
// SelectFields is accepted in the payload.
// NOTE(review): no code visible in this change reads it yet — confirm intended use.
SelectFields []telemetrytypes.TelemetryFieldKey `json:"selectFields,omitempty"`
}
// GettableFlamegraphTrace is the response for the v3 flamegraph API.
type GettableFlamegraphTrace struct {
// Spans holds one inner slice per level.
Spans [][]*FlamegraphSpan `json:"spans"`
// StartTimestampMillis and EndTimestampMillis are the trace bounds in Unix
// milliseconds.
StartTimestampMillis int64 `json:"startTimestampMillis"`
EndTimestampMillis int64 `json:"endTimestampMillis"`
// HasMore is true when the response is a window of the trace rather than all of it.
HasMore bool `json:"hasMore"`
}
// NewGettableFlamegraphTrace assembles the flamegraph response from its levels, the trace
// bounds in milliseconds, and the flag telling whether the levels are only a window.
func NewGettableFlamegraphTrace(spans [][]*FlamegraphSpan, startMs, endMs int64, hasMore bool) *GettableFlamegraphTrace {
	response := new(GettableFlamegraphTrace)
	response.Spans = spans
	response.StartTimestampMillis = startMs
	response.EndTimestampMillis = endMs
	response.HasMore = hasMore
	return response
}
// NewFlamegraphSpanFromStorable converts a stored span into a fully populated flamegraph
// span at the given level. The resource map is copied, so the result does not share it
// with s. Children is left unset.
func NewFlamegraphSpanFromStorable(s *StorableSpan, level int64) *FlamegraphSpan {
	resource := make(map[string]string, len(s.ResourcesString))
	maps.Copy(resource, s.ResourcesString)

	span := new(FlamegraphSpan)
	span.SpanID = s.SpanID
	span.ParentSpanID = s.ParentSpanID
	span.Timestamp = uint64(s.StartTime.UnixNano())
	span.DurationNano = s.DurationNano
	span.HasError = s.HasError
	span.ServiceName = s.ServiceName
	span.Name = s.Name
	span.Level = level
	span.Events = s.UnmarshalledEvents()
	span.Attributes = s.Attributes()
	span.Resource = resource
	return span
}
// FlamegraphWindowSpanIDs flattens a level window into a single slice of span IDs, keeping
// level order and the order of IDs within each level. The result is never nil.
func FlamegraphWindowSpanIDs(window []FlamegraphLevel) []string {
	var count int
	for i := range window {
		count += len(window[i].SpanIDs)
	}

	// Size the slice exactly once, then copy each level into its slot.
	flat := make([]string, count)
	offset := 0
	for i := range window {
		offset += copy(flat[offset:], window[i].SpanIDs)
	}
	return flat
}

View File

@@ -0,0 +1,279 @@
package spantypes
import (
"sort"
)
// FlamegraphTrace holds the level wise tree built from minimal spans.
type FlamegraphTrace struct {
// roots are the tree entry points: spans without a parent plus placeholders created
// for parents that are referenced but absent. Sorted by timestamp, then span ID.
roots []*FlamegraphSpan
// nodeByID indexes every span, including placeholders, by span ID.
nodeByID map[string]*FlamegraphSpan
// startTime and endTime bound the input spans in Unix nanoseconds; they are used as
// the time range when a level is sampled.
startTime uint64
endTime uint64
}
// NewFlamegraphTraceFromMinimal builds the flamegraph tree from minimal spans: every span
// is converted and indexed by ID, the overall time range is tracked, and parents are then
// linked to their children.
func NewFlamegraphTraceFromMinimal(spans []MinimalSpan) *FlamegraphTrace {
	trace := new(FlamegraphTrace)
	trace.nodeByID = make(map[string]*FlamegraphSpan, len(spans))

	for idx := 0; idx < len(spans); idx++ {
		span := spans[idx].ToFlamegraphSpan()
		trace.nodeByID[span.SpanID] = span
		trace.updateTimeRange(span.Timestamp, span.DurationNano)
	}

	trace.wireTree()
	return trace
}
// NewFlamegraphTraceFromStorable builds the flamegraph tree from full stored spans: every
// span is converted and indexed by ID, the overall time range is tracked, and parents are
// then linked to their children.
func NewFlamegraphTraceFromStorable(spans []StorableSpan) *FlamegraphTrace {
	trace := new(FlamegraphTrace)
	trace.nodeByID = make(map[string]*FlamegraphSpan, len(spans))

	for idx := 0; idx < len(spans); idx++ {
		// Level 0 is a placeholder; the real level is assigned when levels are built.
		span := NewFlamegraphSpanFromStorable(&spans[idx], 0)
		trace.nodeByID[span.SpanID] = span
		trace.updateTimeRange(span.Timestamp, span.DurationNano)
	}

	trace.wireTree()
	return trace
}
// GetAllLevels returns every level of the trace without windowing or sampling. The child
// links of all nodes are cleared afterwards, since they are only needed to build levels.
func (t *FlamegraphTrace) GetAllLevels() [][]*FlamegraphSpan {
	levels := t.buildAllLevels()
	for id := range t.nodeByID {
		t.nodeByID[id].Children = nil
	}
	return levels
}
// GetSelectedLevels returns the level window for selectedSpanID with sampling applied to
// dense levels. It always applies windowing — callers should only invoke this when the
// trace is known to exceed the select-all limit.
// Children are cleared after traversal so the tree can be GC'd.
//
// The window holds up to levelLimit consecutive levels: two fifths of them before the
// level containing the selected span and the rest from that level onwards. When the
// window would run past either end of the trace it is shifted to stay inside. If
// selectedSpanID is empty or not found, the window starts at the first level. A level
// with more than spansPerLevel spans is reduced by sampleFlamegraphLevel using
// topLatencyCount and bucketCount. A levelLimit of zero or less yields an empty result.
func (t *FlamegraphTrace) GetSelectedLevels(
	selectedSpanID string,
	levelLimit, spansPerLevel, topLatencyCount, bucketCount int,
) []FlamegraphLevel {
	allLevels := t.buildAllLevels()
	for _, node := range t.nodeByID {
		node.Children = nil
	}

	// A negative limit would otherwise produce an inverted window and a negative
	// capacity below.
	if levelLimit < 0 {
		levelLimit = 0
	}

	selectedIndex := 0
	if selectedSpanID != "" {
	search:
		for i, lvl := range allLevels {
			for _, span := range lvl {
				if span.SpanID == selectedSpanID {
					selectedIndex = i
					break search
				}
			}
		}
	}

	// Derive the upper bound from the lower one so the window always spans exactly
	// levelLimit levels. Truncating the 40% and 60% shares independently loses levels
	// whenever levelLimit is not a multiple of 5 (a limit of 1 gave an empty window).
	lowerLimit := selectedIndex - levelLimit*2/5
	upperLimit := lowerLimit + levelLimit
	if lowerLimit < 0 {
		upperLimit -= lowerLimit
		lowerLimit = 0
	}
	if upperLimit > len(allLevels) {
		lowerLimit -= upperLimit - len(allLevels)
		upperLimit = len(allLevels)
	}
	if lowerLimit < 0 {
		lowerLimit = 0
	}

	result := make([]FlamegraphLevel, 0, upperLimit-lowerLimit)
	for i := lowerLimit; i < upperLimit; i++ {
		lvl := allLevels[i]
		if len(lvl) == 0 {
			continue
		}

		sampled := lvl
		if len(lvl) > spansPerLevel {
			sampled = sampleFlamegraphLevel(lvl, selectedSpanID, i == selectedIndex,
				t.startTime, t.endTime, topLatencyCount, bucketCount)
		}
		if len(sampled) == 0 {
			continue
		}

		spanIDs := make([]string, len(sampled))
		for j, s := range sampled {
			spanIDs[j] = s.SpanID
		}
		result = append(result, FlamegraphLevel{
			// Levels of several roots are laid out one after another, so the index i is
			// not the depth; take the depth from the spans themselves.
			Level:   sampled[0].Level,
			SpanIDs: spanIDs,
		})
	}
	return result
}
// EnrichSelectedSpans turns the selected window into response spans, one inner slice per
// window level. A span ID found in fullSpans is rebuilt from the stored span at the
// level recorded in the window; one that is not falls back to the lean node from the
// tree; an ID known to neither is left out.
func (t *FlamegraphTrace) EnrichSelectedSpans(selectedSpans []FlamegraphLevel, fullSpans []StorableSpan) [][]*FlamegraphSpan {
	storedByID := make(map[string]*StorableSpan, len(fullSpans))
	for idx := range fullSpans {
		stored := &fullSpans[idx]
		storedByID[stored.SpanID] = stored
	}

	enriched := make([][]*FlamegraphSpan, 0, len(selectedSpans))
	for _, level := range selectedSpans {
		spans := make([]*FlamegraphSpan, 0, len(level.SpanIDs))
		for _, id := range level.SpanIDs {
			if stored, found := storedByID[id]; found {
				spans = append(spans, NewFlamegraphSpanFromStorable(stored, level.Level))
				continue
			}
			if lean, found := t.nodeByID[id]; found {
				spans = append(spans, lean)
			}
		}
		enriched = append(enriched, spans)
	}
	return enriched
}
// updateTimeRange widens the trace time range to cover a span that starts at timestamp
// and lasts durationNano. A startTime of zero is treated as "not set yet" and is always
// overwritten.
func (t *FlamegraphTrace) updateTimeRange(timestamp, durationNano uint64) {
	spanEnd := timestamp + durationNano
	if spanEnd > t.endTime {
		t.endTime = spanEnd
	}

	if t.startTime != 0 && timestamp >= t.startTime {
		return
	}
	t.startTime = timestamp
}
// wireTree links every indexed span to its parent and collects the roots. A span without
// a parent ID becomes a root. A span whose parent is not indexed gets a "Missing Span"
// placeholder parent, which is indexed and added to the roots so that later siblings
// attach to the same placeholder. Roots end up sorted by timestamp, then span ID.
//
// NOTE(review): spans are visited in map iteration order, so the order of Children and
// the child whose timing seeds a placeholder can differ from run to run.
func (t *FlamegraphTrace) wireTree() {
	for _, span := range t.nodeByID {
		if span.ParentSpanID == "" {
			// Placeholders inserted below may be visited by this same loop; they are
			// already in roots and must not be added twice.
			if flamegraphSpanIndex(t.roots, span.SpanID) == -1 {
				t.roots = append(t.roots, span)
			}
			continue
		}

		if parent, found := t.nodeByID[span.ParentSpanID]; found {
			parent.Children = append(parent.Children, span)
			continue
		}

		placeholder := &FlamegraphSpan{
			SpanID:       span.ParentSpanID,
			Name:         "Missing Span",
			Timestamp:    span.Timestamp,
			DurationNano: span.DurationNano,
			Children:     []*FlamegraphSpan{span},
		}
		t.nodeByID[placeholder.SpanID] = placeholder
		t.roots = append(t.roots, placeholder)
	}

	sort.Slice(t.roots, func(a, b int) bool {
		left, right := t.roots[a], t.roots[b]
		if left.Timestamp != right.Timestamp {
			return left.Timestamp < right.Timestamp
		}
		return left.SpanID < right.SpanID
	})
}
// buildAllLevels walks each root's subtree breadth-first and returns the spans grouped by
// depth. Every visited span gets its Level set to its depth below the root. The levels of
// one root are emitted in full before those of the next root, so an index into the result
// equals the depth only for the first root. The result is nil when there are no roots.
func (t *FlamegraphTrace) buildAllLevels() [][]*FlamegraphSpan {
	var levels [][]*FlamegraphSpan

	for _, root := range t.roots {
		// frontier holds all spans at the current depth, in visiting order.
		frontier := []*FlamegraphSpan{root}
		for depth := int64(0); len(frontier) > 0; depth++ {
			var next []*FlamegraphSpan
			for _, span := range frontier {
				span.Level = depth
				next = append(next, span.Children...)
			}
			levels = append(levels, frontier)
			frontier = next
		}
	}

	return levels
}
// sampleFlamegraphLevel reduces one dense level to a representative subset, returned
// without duplicates and in this order:
//
//  1. the topLatencyCount spans with the longest duration;
//  2. the selected span, when isSelectedLevel is true and the span is on this level;
//  3. per timestamp bucket, in bucket order, the two longest spans that start in it.
//
// The range [startTime, endTime] is split into bucketCount equal buckets; spans that start
// outside the range take no part in step 3. A span picked by more than one rule appears
// only once. A negative topLatencyCount is treated as zero, and a bucketCount of zero or
// less skips step 3. The input slice is not reordered.
func sampleFlamegraphLevel(
	spans []*FlamegraphSpan,
	selectedSpanID string,
	isSelectedLevel bool,
	startTime, endTime uint64,
	topLatencyCount, bucketCount int,
) []*FlamegraphSpan {
	const maxSpansPerBucket = 2

	// Sort a copy, longest first. The stable sort keeps spans of equal duration in
	// their input order, so ties are not broken arbitrarily.
	sorted := make([]*FlamegraphSpan, len(spans))
	copy(sorted, spans)
	sort.SliceStable(sorted, func(i, j int) bool {
		return sorted[i].DurationNano > sorted[j].DurationNano
	})

	var sampled []*FlamegraphSpan
	seen := make(map[string]struct{})
	// add appends span unless a span with the same ID was already sampled. Without
	// this, a top-latency or selected span would be emitted again by its bucket.
	add := func(span *FlamegraphSpan) {
		if _, dup := seen[span.SpanID]; dup {
			return
		}
		seen[span.SpanID] = struct{}{}
		sampled = append(sampled, span)
	}

	topK := topLatencyCount
	if topK < 0 {
		topK = 0
	}
	if topK > len(sorted) {
		topK = len(sorted)
	}
	for _, span := range sorted[:topK] {
		add(span)
	}

	if isSelectedLevel {
		for _, span := range sorted {
			if span.SpanID == selectedSpanID {
				add(span)
				break
			}
		}
	}

	// Without buckets there is nothing to distribute spans over (and dividing by a
	// zero bucket count would panic).
	if bucketCount <= 0 {
		return sampled
	}

	var timeRange uint64
	if endTime > startTime {
		timeRange = endTime - startTime
	}
	bucketSize := timeRange / uint64(bucketCount)
	if bucketSize == 0 {
		bucketSize = 1
	}

	buckets := make([][]*FlamegraphSpan, bucketCount)
	for _, span := range sorted {
		if span.Timestamp < startTime || span.Timestamp > endTime {
			continue
		}
		// Integer division can map the tail of the range past the last bucket.
		idx := (span.Timestamp - startTime) / bucketSize
		if idx >= uint64(bucketCount) {
			idx = uint64(bucketCount) - 1
		}
		// sorted is longest-first, so the first spans to arrive are the ones to keep.
		if len(buckets[idx]) < maxSpansPerBucket {
			buckets[idx] = append(buckets[idx], span)
		}
	}

	for _, bucket := range buckets {
		for _, span := range bucket {
			add(span)
		}
	}
	return sampled
}
// flamegraphSpanIndex returns the position of the first span in spans whose ID equals
// spanID, or -1 when there is none. Nil entries are skipped.
func flamegraphSpanIndex(spans []*FlamegraphSpan, spanID string) int {
	for idx := 0; idx < len(spans); idx++ {
		candidate := spans[idx]
		if candidate == nil || candidate.SpanID != spanID {
			continue
		}
		return idx
	}
	return -1
}

View File

@@ -156,6 +156,18 @@ func (item *MinimalSpan) ToWaterfallSpan(traceID string) *WaterfallSpan {
}
}
// ToFlamegraphSpan converts the minimal span into a lean flamegraph tree node. Only the
// identifiers, timing, error flag, and service name are carried over; Name, Events,
// Attributes, and Resource stay at their zero values. Children starts as an empty,
// non-nil slice.
func (item *MinimalSpan) ToFlamegraphSpan() *FlamegraphSpan {
	span := new(FlamegraphSpan)
	span.SpanID = item.SpanID
	span.ParentSpanID = item.ParentSpanID
	span.Timestamp = uint64(item.StartTime.UnixNano())
	span.DurationNano = item.DurationNano
	span.HasError = item.HasError
	span.ServiceName = item.ServiceName
	span.Children = []*FlamegraphSpan{}
	return span
}
// NewMissingWaterfallSpan creates a synthetic placeholder span for a parent that has no recorded data.
func NewMissingWaterfallSpan(spanID, traceID string, timeUnixNano, durationNano uint64) *WaterfallSpan {
return &WaterfallSpan{