Compare commits

..

1 Commit

Author SHA1 Message Date
Nikhil Soni
de8f3cc06e chore: remove caching spans since v2 was not using it
So we can directly introduce redis instead of relying on in-memory cache
2026-04-17 17:30:58 +05:30
4 changed files with 418 additions and 230 deletions
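Both cache-read helpers shown in the reader diff below (GetWaterfallSpansForTraceWithMetadataCache and GetFlamegraphSpansForTraceCache) follow the same pattern: look the trace up under a "<prefix>-<traceID>" key and refuse to serve the cached copy while the trace's end time still falls inside the configured flux interval, so recently ingested (possibly incomplete) traces are always re-read from ClickHouse. A minimal standalone sketch of that guard, using a hypothetical map-backed cache rather than SigNoz's cache.Cache API (which also takes a context and an orgID):

package main

import (
	"errors"
	"fmt"
	"strings"
	"time"
)

// cachedTrace is a stand-in for the cached trace-detail payload; only the
// end time matters for the flux-interval guard.
type cachedTrace struct {
	EndTime uint64 // trace end time, milliseconds since epoch
}

// mapCache is a hypothetical stand-in for the real cache provider.
type mapCache map[string]cachedTrace

func (m mapCache) Get(key string) (cachedTrace, bool) {
	v, ok := m[key]
	return v, ok
}

// getCachedTrace mirrors the guard used by the two helpers in the diff: cache
// keys are "<prefix>-<traceID>" and a cached trace is rejected while its end
// time still falls inside the flux interval.
func getCachedTrace(c mapCache, prefix, traceID string, fluxInterval time.Duration) (*cachedTrace, error) {
	key := strings.Join([]string{prefix, traceID}, "-")
	data, ok := c.Get(key)
	if !ok {
		return nil, errors.New("cache miss") // caller falls back to ClickHouse
	}
	// A trace that ended inside the flux interval may still be receiving spans,
	// so the cached copy is treated as unusable.
	if time.Since(time.UnixMilli(int64(data.EndTime))) < fluxInterval {
		return nil, errors.New("trace end time falls under the flux interval, skipping cache")
	}
	return &data, nil
}

func main() {
	c := mapCache{
		"getWaterfallSpansForTraceWithMetadata-abc123": {EndTime: uint64(time.Now().Add(-10 * time.Minute).UnixMilli())},
	}
	got, err := getCachedTrace(c, "getWaterfallSpansForTraceWithMetadata", "abc123", 5*time.Minute)
	fmt.Println(got, err) // ended well outside the flux interval, so the cached copy is served
}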

View File

@@ -10,6 +10,7 @@ import (
"go.opentelemetry.io/contrib/instrumentation/github.com/gorilla/mux/otelmux"
"go.opentelemetry.io/otel/propagation"
"github.com/SigNoz/signoz/pkg/cache/memorycache"
"github.com/SigNoz/signoz/pkg/errors"
"github.com/SigNoz/signoz/pkg/factory"
"github.com/SigNoz/signoz/pkg/queryparser"
@@ -73,12 +74,25 @@ type Server struct {
// NewServer creates and initializes Server
func NewServer(config signoz.Config, signoz *signoz.SigNoz) (*Server, error) {
cacheForTraceDetail, err := memorycache.New(context.TODO(), signoz.Instrumentation.ToProviderSettings(), cache.Config{
Provider: "memory",
Memory: cache.Memory{
NumCounters: 10 * 10000,
MaxCost: 1 << 27, // 128 MB
},
})
if err != nil {
return nil, err
}
reader := clickhouseReader.NewReader(
signoz.Instrumentation.Logger(),
signoz.SQLStore,
signoz.TelemetryStore,
signoz.Prometheus,
signoz.TelemetryStore.Cluster(),
config.Querier.FluxInterval,
cacheForTraceDetail,
signoz.Cache,
nil,
)
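The NumCounters and MaxCost knobs in this cache.Memory config match the sizing parameters of dgraph-io/ristretto, which the memory provider presumably wraps (an assumption; the provider's internals are not part of this diff). Under that assumption, the equivalent standalone configuration looks like the sketch below, with cost interpreted as bytes so 1<<27 caps the cache at 128 MiB:

package main

import (
	"fmt"

	"github.com/dgraph-io/ristretto"
)

func main() {
	// Same sizing knobs as the memorycache.New call above. In ristretto,
	// NumCounters should be roughly 10x the number of items expected to be
	// live, and MaxCost caps the total admitted cost.
	c, err := ristretto.NewCache(&ristretto.Config{
		NumCounters: 10 * 10000,
		MaxCost:     1 << 27, // 128 MiB when cost is measured in bytes
		BufferItems: 64,      // ristretto's recommended default
	})
	if err != nil {
		panic(err)
	}

	payload := []byte("serialized trace detail")
	c.Set("trace:abc123", payload, int64(len(payload))) // cost ~= payload size in bytes
	c.Wait()                                            // sets are async; wait so the Get below can see it

	if v, ok := c.Get("trace:abc123"); ok {
		fmt.Printf("%s\n", v)
	}
}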

View File

@@ -160,9 +160,11 @@ type ClickHouseReader struct {
traceResourceTableV3 string
traceSummaryTable string
cache cache.Cache
metadataDB string
metadataTable string
fluxIntervalForTraceDetail time.Duration
cache cache.Cache
cacheForTraceDetail cache.Cache
metadataDB string
metadataTable string
}
// NewTraceReader returns a TraceReader for the database
@@ -172,6 +174,8 @@ func NewReader(
telemetryStore telemetrystore.TelemetryStore,
prometheus prometheus.Prometheus,
cluster string,
fluxIntervalForTraceDetail time.Duration,
cacheForTraceDetail cache.Cache,
cache cache.Cache,
options *Options,
) *ClickHouseReader {
@@ -185,43 +189,45 @@ func NewReader(
traceLocalTableName := options.primary.TraceLocalTableNameV3
return &ClickHouseReader{
db: telemetryStore.ClickhouseDB(),
logger: logger,
prometheus: prometheus,
sqlDB: sqlDB,
TraceDB: options.primary.TraceDB,
operationsTable: options.primary.OperationsTable,
indexTable: options.primary.IndexTable,
errorTable: options.primary.ErrorTable,
usageExplorerTable: options.primary.UsageExplorerTable,
durationTable: options.primary.DurationTable,
SpansTable: options.primary.SpansTable,
spanAttributeTableV2: options.primary.SpanAttributeTableV2,
spanAttributesKeysTable: options.primary.SpanAttributeKeysTable,
dependencyGraphTable: options.primary.DependencyGraphTable,
topLevelOperationsTable: options.primary.TopLevelOperationsTable,
logsDB: options.primary.LogsDB,
logsTable: options.primary.LogsTable,
logsLocalTable: options.primary.LogsLocalTable,
logsAttributeKeys: options.primary.LogsAttributeKeysTable,
logsResourceKeys: options.primary.LogsResourceKeysTable,
logsTagAttributeTableV2: options.primary.LogsTagAttributeTableV2,
liveTailRefreshSeconds: options.primary.LiveTailRefreshSeconds,
cluster: cluster,
queryProgressTracker: queryprogress.NewQueryProgressTracker(logger),
logsTableV2: options.primary.LogsTableV2,
logsLocalTableV2: options.primary.LogsLocalTableV2,
logsResourceTableV2: options.primary.LogsResourceTableV2,
logsResourceLocalTableV2: options.primary.LogsResourceLocalTableV2,
logsTableName: logsTableName,
logsLocalTableName: logsLocalTableName,
traceLocalTableName: traceLocalTableName,
traceTableName: traceTableName,
traceResourceTableV3: options.primary.TraceResourceTableV3,
traceSummaryTable: options.primary.TraceSummaryTable,
cache: cache,
metadataDB: options.primary.MetadataDB,
metadataTable: options.primary.MetadataTable,
db: telemetryStore.ClickhouseDB(),
logger: logger,
prometheus: prometheus,
sqlDB: sqlDB,
TraceDB: options.primary.TraceDB,
operationsTable: options.primary.OperationsTable,
indexTable: options.primary.IndexTable,
errorTable: options.primary.ErrorTable,
usageExplorerTable: options.primary.UsageExplorerTable,
durationTable: options.primary.DurationTable,
SpansTable: options.primary.SpansTable,
spanAttributeTableV2: options.primary.SpanAttributeTableV2,
spanAttributesKeysTable: options.primary.SpanAttributeKeysTable,
dependencyGraphTable: options.primary.DependencyGraphTable,
topLevelOperationsTable: options.primary.TopLevelOperationsTable,
logsDB: options.primary.LogsDB,
logsTable: options.primary.LogsTable,
logsLocalTable: options.primary.LogsLocalTable,
logsAttributeKeys: options.primary.LogsAttributeKeysTable,
logsResourceKeys: options.primary.LogsResourceKeysTable,
logsTagAttributeTableV2: options.primary.LogsTagAttributeTableV2,
liveTailRefreshSeconds: options.primary.LiveTailRefreshSeconds,
cluster: cluster,
queryProgressTracker: queryprogress.NewQueryProgressTracker(logger),
logsTableV2: options.primary.LogsTableV2,
logsLocalTableV2: options.primary.LogsLocalTableV2,
logsResourceTableV2: options.primary.LogsResourceTableV2,
logsResourceLocalTableV2: options.primary.LogsResourceLocalTableV2,
logsTableName: logsTableName,
logsLocalTableName: logsLocalTableName,
traceLocalTableName: traceLocalTableName,
traceTableName: traceTableName,
traceResourceTableV3: options.primary.TraceResourceTableV3,
traceSummaryTable: options.primary.TraceSummaryTable,
fluxIntervalForTraceDetail: fluxIntervalForTraceDetail,
cache: cache,
cacheForTraceDetail: cacheForTraceDetail,
metadataDB: options.primary.MetadataDB,
metadataTable: options.primary.MetadataTable,
}
}
@@ -891,6 +897,23 @@ func (r *ClickHouseReader) GetSpansForTrace(ctx context.Context, traceID string,
return searchScanResponses, nil
}
func (r *ClickHouseReader) GetWaterfallSpansForTraceWithMetadataCache(ctx context.Context, orgID valuer.UUID, traceID string) (*model.GetWaterfallSpansForTraceWithMetadataCache, error) {
cachedTraceData := new(model.GetWaterfallSpansForTraceWithMetadataCache)
err := r.cacheForTraceDetail.Get(ctx, orgID, strings.Join([]string{"getWaterfallSpansForTraceWithMetadata", traceID}, "-"), cachedTraceData)
if err != nil {
r.logger.Debug("error in retrieving getWaterfallSpansForTraceWithMetadata cache", errorsV2.Attr(err), "traceID", traceID)
return nil, err
}
if time.Since(time.UnixMilli(int64(cachedTraceData.EndTime))) < r.fluxIntervalForTraceDetail {
r.logger.Info("the trace end time falls under the flux interval, skipping getWaterfallSpansForTraceWithMetadata cache", "traceID", traceID)
return nil, errors.Errorf("the trace end time falls under the flux interval, skipping getWaterfallSpansForTraceWithMetadata cache, traceID: %s", traceID)
}
r.logger.Info("cache is successfully hit, applying cache for getWaterfallSpansForTraceWithMetadata", "traceID", traceID)
return cachedTraceData, nil
}
func (r *ClickHouseReader) GetWaterfallSpansForTraceWithMetadata(ctx context.Context, orgID valuer.UUID, traceID string, req *model.GetWaterfallSpansForTraceWithMetadataParams) (*model.GetWaterfallSpansForTraceWithMetadataResponse, error) {
response := new(model.GetWaterfallSpansForTraceWithMetadataResponse)
var startTime, endTime, durationNano, totalErrorSpans, totalSpans uint64
@@ -900,136 +923,160 @@ func (r *ClickHouseReader) GetWaterfallSpansForTraceWithMetadata(ctx context.Con
var serviceNameIntervalMap = map[string][]tracedetail.Interval{}
var hasMissingSpans bool
searchScanResponses, err := r.GetSpansForTrace(ctx, traceID, fmt.Sprintf("SELECT DISTINCT ON (span_id) timestamp, duration_nano, span_id, trace_id, has_error, kind, resource_string_service$$name, name, links as references, attributes_string, attributes_number, attributes_bool, resources_string, events, status_message, status_code_string, kind_string FROM %s.%s WHERE trace_id=$1 and ts_bucket_start>=$2 and ts_bucket_start<=$3 ORDER BY timestamp ASC, name ASC", r.TraceDB, r.traceTableName))
cachedTraceData, err := r.GetWaterfallSpansForTraceWithMetadataCache(ctx, orgID, traceID)
if err == nil {
startTime = cachedTraceData.StartTime
endTime = cachedTraceData.EndTime
durationNano = cachedTraceData.DurationNano
spanIdToSpanNodeMap = cachedTraceData.SpanIdToSpanNodeMap
serviceNameToTotalDurationMap = cachedTraceData.ServiceNameToTotalDurationMap
traceRoots = cachedTraceData.TraceRoots
totalSpans = cachedTraceData.TotalSpans
totalErrorSpans = cachedTraceData.TotalErrorSpans
hasMissingSpans = cachedTraceData.HasMissingSpans
}
if err != nil {
return nil, err
}
if len(searchScanResponses) == 0 {
return response, nil
}
totalSpans = uint64(len(searchScanResponses))
for _, item := range searchScanResponses {
ref := []model.OtelSpanRef{}
err := json.Unmarshal([]byte(item.References), &ref)
r.logger.Info("cache miss for getWaterfallSpansForTraceWithMetadata", "traceID", traceID)
searchScanResponses, err := r.GetSpansForTrace(ctx, traceID, fmt.Sprintf("SELECT DISTINCT ON (span_id) timestamp, duration_nano, span_id, trace_id, has_error, kind, resource_string_service$$name, name, links as references, attributes_string, attributes_number, attributes_bool, resources_string, events, status_message, status_code_string, kind_string FROM %s.%s WHERE trace_id=$1 and ts_bucket_start>=$2 and ts_bucket_start<=$3 ORDER BY timestamp ASC, name ASC", r.TraceDB, r.traceTableName))
if err != nil {
r.logger.Error("getWaterfallSpansForTraceWithMetadata: error unmarshalling references", errorsV2.Attr(err), "traceID", traceID)
return nil, errorsV2.Newf(errorsV2.TypeInvalidInput, errorsV2.CodeInvalidInput, "getWaterfallSpansForTraceWithMetadata: error unmarshalling references %s", err.Error())
return nil, err
}
// merge attributes_number and attributes_bool to attributes_string
for k, v := range item.Attributes_bool {
item.Attributes_string[k] = fmt.Sprintf("%v", v)
if len(searchScanResponses) == 0 {
return response, nil
}
for k, v := range item.Attributes_number {
item.Attributes_string[k] = strconv.FormatFloat(v, 'f', -1, 64)
}
for k, v := range item.Resources_string {
item.Attributes_string[k] = v
}
events := make([]model.Event, 0)
for _, event := range item.Events {
var eventMap model.Event
err = json.Unmarshal([]byte(event), &eventMap)
totalSpans = uint64(len(searchScanResponses))
processingBeforeCache := time.Now()
for _, item := range searchScanResponses {
ref := []model.OtelSpanRef{}
err := json.Unmarshal([]byte(item.References), &ref)
if err != nil {
r.logger.Error("Error unmarshalling events", errorsV2.Attr(err))
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getWaterfallSpansForTraceWithMetadata: error in unmarshalling events %s", err.Error())
r.logger.Error("getWaterfallSpansForTraceWithMetadata: error unmarshalling references", errorsV2.Attr(err), "traceID", traceID)
return nil, errorsV2.Newf(errorsV2.TypeInvalidInput, errorsV2.CodeInvalidInput, "getWaterfallSpansForTraceWithMetadata: error unmarshalling references %s", err.Error())
}
events = append(events, eventMap)
// merge attributes_number and attributes_bool to attributes_string
for k, v := range item.Attributes_bool {
item.Attributes_string[k] = fmt.Sprintf("%v", v)
}
for k, v := range item.Attributes_number {
item.Attributes_string[k] = strconv.FormatFloat(v, 'f', -1, 64)
}
for k, v := range item.Resources_string {
item.Attributes_string[k] = v
}
events := make([]model.Event, 0)
for _, event := range item.Events {
var eventMap model.Event
err = json.Unmarshal([]byte(event), &eventMap)
if err != nil {
r.logger.Error("Error unmarshalling events", errorsV2.Attr(err))
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getWaterfallSpansForTraceWithMetadata: error in unmarshalling events %s", err.Error())
}
events = append(events, eventMap)
}
startTimeUnixNano := uint64(item.TimeUnixNano.UnixNano())
jsonItem := model.Span{
SpanID: item.SpanID,
TraceID: item.TraceID,
ServiceName: item.ServiceName,
Name: item.Name,
Kind: int32(item.Kind),
DurationNano: item.DurationNano,
HasError: item.HasError,
StatusMessage: item.StatusMessage,
StatusCodeString: item.StatusCodeString,
SpanKind: item.SpanKind,
References: ref,
Events: events,
TagMap: item.Attributes_string,
Children: make([]*model.Span, 0),
TimeUnixNano: startTimeUnixNano, // Store nanoseconds temporarily
}
// metadata calculation
if startTime == 0 || startTimeUnixNano < startTime {
startTime = startTimeUnixNano
}
if endTime == 0 || (startTimeUnixNano+jsonItem.DurationNano) > endTime {
endTime = (startTimeUnixNano + jsonItem.DurationNano)
}
if durationNano == 0 || jsonItem.DurationNano > durationNano {
durationNano = jsonItem.DurationNano
}
if jsonItem.HasError {
totalErrorSpans = totalErrorSpans + 1
}
// collect the intervals for service for execution time calculation
serviceNameIntervalMap[jsonItem.ServiceName] =
append(serviceNameIntervalMap[jsonItem.ServiceName], tracedetail.Interval{StartTime: jsonItem.TimeUnixNano, Duration: jsonItem.DurationNano, Service: jsonItem.ServiceName})
// append to the span node map
spanIdToSpanNodeMap[jsonItem.SpanID] = &jsonItem
}
startTimeUnixNano := uint64(item.TimeUnixNano.UnixNano())
// traverse through the map and append each node to the children array of the parent node
// and add the missing spans
for _, spanNode := range spanIdToSpanNodeMap {
hasParentSpanNode := false
for _, reference := range spanNode.References {
if reference.RefType == "CHILD_OF" && reference.SpanId != "" {
hasParentSpanNode = true
jsonItem := model.Span{
SpanID: item.SpanID,
TraceID: item.TraceID,
ServiceName: item.ServiceName,
Name: item.Name,
Kind: int32(item.Kind),
DurationNano: item.DurationNano,
HasError: item.HasError,
StatusMessage: item.StatusMessage,
StatusCodeString: item.StatusCodeString,
SpanKind: item.SpanKind,
References: ref,
Events: events,
TagMap: item.Attributes_string,
Children: make([]*model.Span, 0),
TimeUnixNano: startTimeUnixNano, // Store nanoseconds temporarily
}
// metadata calculation
if startTime == 0 || startTimeUnixNano < startTime {
startTime = startTimeUnixNano
}
if endTime == 0 || (startTimeUnixNano+jsonItem.DurationNano) > endTime {
endTime = (startTimeUnixNano + jsonItem.DurationNano)
}
if durationNano == 0 || jsonItem.DurationNano > durationNano {
durationNano = jsonItem.DurationNano
}
if jsonItem.HasError {
totalErrorSpans = totalErrorSpans + 1
}
// collect the intervals for service for execution time calculation
serviceNameIntervalMap[jsonItem.ServiceName] =
append(serviceNameIntervalMap[jsonItem.ServiceName], tracedetail.Interval{StartTime: jsonItem.TimeUnixNano, Duration: jsonItem.DurationNano, Service: jsonItem.ServiceName})
// append to the span node map
spanIdToSpanNodeMap[jsonItem.SpanID] = &jsonItem
}
// traverse through the map and append each node to the children array of the parent node
// and add the missing spans
for _, spanNode := range spanIdToSpanNodeMap {
hasParentSpanNode := false
for _, reference := range spanNode.References {
if reference.RefType == "CHILD_OF" && reference.SpanId != "" {
hasParentSpanNode = true
if parentNode, exists := spanIdToSpanNodeMap[reference.SpanId]; exists {
parentNode.Children = append(parentNode.Children, spanNode)
} else {
// insert the missing span
missingSpan := model.Span{
SpanID: reference.SpanId,
TraceID: spanNode.TraceID,
ServiceName: "",
Name: "Missing Span",
TimeUnixNano: spanNode.TimeUnixNano,
Kind: 0,
DurationNano: spanNode.DurationNano,
HasError: false,
StatusMessage: "",
StatusCodeString: "",
SpanKind: "",
Events: make([]model.Event, 0),
Children: make([]*model.Span, 0),
if parentNode, exists := spanIdToSpanNodeMap[reference.SpanId]; exists {
parentNode.Children = append(parentNode.Children, spanNode)
} else {
// insert the missing span
missingSpan := model.Span{
SpanID: reference.SpanId,
TraceID: spanNode.TraceID,
ServiceName: "",
Name: "Missing Span",
TimeUnixNano: spanNode.TimeUnixNano,
Kind: 0,
DurationNano: spanNode.DurationNano,
HasError: false,
StatusMessage: "",
StatusCodeString: "",
SpanKind: "",
Events: make([]model.Event, 0),
Children: make([]*model.Span, 0),
}
missingSpan.Children = append(missingSpan.Children, spanNode)
spanIdToSpanNodeMap[missingSpan.SpanID] = &missingSpan
traceRoots = append(traceRoots, &missingSpan)
hasMissingSpans = true
}
missingSpan.Children = append(missingSpan.Children, spanNode)
spanIdToSpanNodeMap[missingSpan.SpanID] = &missingSpan
traceRoots = append(traceRoots, &missingSpan)
hasMissingSpans = true
}
}
if !hasParentSpanNode && !tracedetail.ContainsWaterfallSpan(traceRoots, spanNode) {
traceRoots = append(traceRoots, spanNode)
}
}
if !hasParentSpanNode && !tracedetail.ContainsWaterfallSpan(traceRoots, spanNode) {
traceRoots = append(traceRoots, spanNode)
}
// sort the trace roots to add missing spans at the right order
sort.Slice(traceRoots, func(i, j int) bool {
if traceRoots[i].TimeUnixNano == traceRoots[j].TimeUnixNano {
return traceRoots[i].Name < traceRoots[j].Name
}
return traceRoots[i].TimeUnixNano < traceRoots[j].TimeUnixNano
})
serviceNameToTotalDurationMap = tracedetail.CalculateServiceTime(serviceNameIntervalMap)
// TODO: set the span data (model.GetWaterfallSpansForTraceWithMetadataCache) in cache here
// removed existing cache usage since it was not getting used due to this bug https://github.com/SigNoz/engineering-pod/issues/4648
// and was causing out of memory issues https://github.com/SigNoz/engineering-pod/issues/4638
r.logger.Info("getWaterfallSpansForTraceWithMetadata: processing pre cache", "duration", time.Since(processingBeforeCache), "traceID", traceID)
}
// sort the trace roots to add missing spans at the right order
sort.Slice(traceRoots, func(i, j int) bool {
if traceRoots[i].TimeUnixNano == traceRoots[j].TimeUnixNano {
return traceRoots[i].Name < traceRoots[j].Name
}
return traceRoots[i].TimeUnixNano < traceRoots[j].TimeUnixNano
})
serviceNameToTotalDurationMap = tracedetail.CalculateServiceTime(serviceNameIntervalMap)
processingPostCache := time.Now()
// When req.Limit is 0 (not set by the client), selectAllSpans is set to false
// preserving the old paged behaviour for backward compatibility
@@ -1071,6 +1118,23 @@ func (r *ClickHouseReader) GetWaterfallSpansForTraceWithMetadata(ctx context.Con
return response, nil
}
func (r *ClickHouseReader) GetFlamegraphSpansForTraceCache(ctx context.Context, orgID valuer.UUID, traceID string) (*model.GetFlamegraphSpansForTraceCache, error) {
cachedTraceData := new(model.GetFlamegraphSpansForTraceCache)
err := r.cacheForTraceDetail.Get(ctx, orgID, strings.Join([]string{"getFlamegraphSpansForTrace", traceID}, "-"), cachedTraceData)
if err != nil {
r.logger.Debug("error in retrieving getFlamegraphSpansForTrace cache", errorsV2.Attr(err), "traceID", traceID)
return nil, err
}
if time.Since(time.UnixMilli(int64(cachedTraceData.EndTime))) < r.fluxIntervalForTraceDetail {
r.logger.Info("the trace end time falls under the flux interval, skipping getFlamegraphSpansForTrace cache", "traceID", traceID)
return nil, errors.Errorf("the trace end time falls under the flux interval, skipping getFlamegraphSpansForTrace cache, traceID: %s", traceID)
}
r.logger.Info("cache is successfully hit, applying cache for getFlamegraphSpansForTrace", "traceID", traceID)
return cachedTraceData, nil
}
func (r *ClickHouseReader) GetFlamegraphSpansForTrace(ctx context.Context, orgID valuer.UUID, traceID string, req *model.GetFlamegraphSpansForTraceParams) (*model.GetFlamegraphSpansForTraceResponse, error) {
trace := new(model.GetFlamegraphSpansForTraceResponse)
var startTime, endTime, durationNano uint64
@@ -1079,95 +1143,118 @@ func (r *ClickHouseReader) GetFlamegraphSpansForTrace(ctx context.Context, orgID
var selectedSpans = [][]*model.FlamegraphSpan{}
var traceRoots []*model.FlamegraphSpan
searchScanResponses, err := r.GetSpansForTrace(ctx, traceID, fmt.Sprintf("SELECT timestamp, duration_nano, span_id, trace_id, has_error,links as references, resource_string_service$$name, name, events FROM %s.%s WHERE trace_id=$1 and ts_bucket_start>=$2 and ts_bucket_start<=$3 ORDER BY timestamp ASC, name ASC", r.TraceDB, r.traceTableName))
// get the trace tree from cache!
cachedTraceData, err := r.GetFlamegraphSpansForTraceCache(ctx, orgID, traceID)
if err == nil {
startTime = cachedTraceData.StartTime
endTime = cachedTraceData.EndTime
durationNano = cachedTraceData.DurationNano
selectedSpans = cachedTraceData.SelectedSpans
traceRoots = cachedTraceData.TraceRoots
}
if err != nil {
return nil, err
}
if len(searchScanResponses) == 0 {
return trace, nil
}
r.logger.Info("cache miss for getFlamegraphSpansForTrace", "traceID", traceID)
for _, item := range searchScanResponses {
ref := []model.OtelSpanRef{}
err := json.Unmarshal([]byte(item.References), &ref)
searchScanResponses, err := r.GetSpansForTrace(ctx, traceID, fmt.Sprintf("SELECT timestamp, duration_nano, span_id, trace_id, has_error,links as references, resource_string_service$$name, name, events FROM %s.%s WHERE trace_id=$1 and ts_bucket_start>=$2 and ts_bucket_start<=$3 ORDER BY timestamp ASC, name ASC", r.TraceDB, r.traceTableName))
if err != nil {
r.logger.Error("Error unmarshalling references", errorsV2.Attr(err))
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getFlamegraphSpansForTrace: error in unmarshalling references %s", err.Error())
return nil, err
}
if len(searchScanResponses) == 0 {
return trace, nil
}
events := make([]model.Event, 0)
for _, event := range item.Events {
var eventMap model.Event
err = json.Unmarshal([]byte(event), &eventMap)
processingBeforeCache := time.Now()
for _, item := range searchScanResponses {
ref := []model.OtelSpanRef{}
err := json.Unmarshal([]byte(item.References), &ref)
if err != nil {
r.logger.Error("Error unmarshalling events", errorsV2.Attr(err))
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getFlamegraphSpansForTrace: error in unmarshalling events %s", err.Error())
r.logger.Error("Error unmarshalling references", errorsV2.Attr(err))
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getFlamegraphSpansForTrace: error in unmarshalling references %s", err.Error())
}
events = append(events, eventMap)
events := make([]model.Event, 0)
for _, event := range item.Events {
var eventMap model.Event
err = json.Unmarshal([]byte(event), &eventMap)
if err != nil {
r.logger.Error("Error unmarshalling events", errorsV2.Attr(err))
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getFlamegraphSpansForTrace: error in unmarshalling events %s", err.Error())
}
events = append(events, eventMap)
}
jsonItem := model.FlamegraphSpan{
SpanID: item.SpanID,
TraceID: item.TraceID,
ServiceName: item.ServiceName,
Name: item.Name,
DurationNano: item.DurationNano,
HasError: item.HasError,
References: ref,
Events: events,
Children: make([]*model.FlamegraphSpan, 0),
}
// metadata calculation
startTimeUnixNano := uint64(item.TimeUnixNano.UnixNano())
if startTime == 0 || startTimeUnixNano < startTime {
startTime = startTimeUnixNano
}
if endTime == 0 || (startTimeUnixNano+jsonItem.DurationNano) > endTime {
endTime = (startTimeUnixNano + jsonItem.DurationNano)
}
if durationNano == 0 || jsonItem.DurationNano > durationNano {
durationNano = jsonItem.DurationNano
}
jsonItem.TimeUnixNano = uint64(item.TimeUnixNano.UnixNano() / 1000000)
spanIdToSpanNodeMap[jsonItem.SpanID] = &jsonItem
}
jsonItem := model.FlamegraphSpan{
SpanID: item.SpanID,
TraceID: item.TraceID,
ServiceName: item.ServiceName,
Name: item.Name,
DurationNano: item.DurationNano,
HasError: item.HasError,
References: ref,
Events: events,
Children: make([]*model.FlamegraphSpan, 0),
}
// metadata calculation
startTimeUnixNano := uint64(item.TimeUnixNano.UnixNano())
if startTime == 0 || startTimeUnixNano < startTime {
startTime = startTimeUnixNano
}
if endTime == 0 || (startTimeUnixNano+jsonItem.DurationNano) > endTime {
endTime = (startTimeUnixNano + jsonItem.DurationNano)
}
if durationNano == 0 || jsonItem.DurationNano > durationNano {
durationNano = jsonItem.DurationNano
}
jsonItem.TimeUnixNano = uint64(item.TimeUnixNano.UnixNano() / 1000000)
spanIdToSpanNodeMap[jsonItem.SpanID] = &jsonItem
}
// traverse through the map and append each node to the children array of the parent node
// and add missing spans
for _, spanNode := range spanIdToSpanNodeMap {
hasParentSpanNode := false
for _, reference := range spanNode.References {
if reference.RefType == "CHILD_OF" && reference.SpanId != "" {
hasParentSpanNode = true
if parentNode, exists := spanIdToSpanNodeMap[reference.SpanId]; exists {
parentNode.Children = append(parentNode.Children, spanNode)
} else {
// insert the missing spans
missingSpan := model.FlamegraphSpan{
SpanID: reference.SpanId,
TraceID: spanNode.TraceID,
ServiceName: "",
Name: "Missing Span",
TimeUnixNano: spanNode.TimeUnixNano,
DurationNano: spanNode.DurationNano,
HasError: false,
Events: make([]model.Event, 0),
Children: make([]*model.FlamegraphSpan, 0),
// traverse through the map and append each node to the children array of the parent node
// and add missing spans
for _, spanNode := range spanIdToSpanNodeMap {
hasParentSpanNode := false
for _, reference := range spanNode.References {
if reference.RefType == "CHILD_OF" && reference.SpanId != "" {
hasParentSpanNode = true
if parentNode, exists := spanIdToSpanNodeMap[reference.SpanId]; exists {
parentNode.Children = append(parentNode.Children, spanNode)
} else {
// insert the missing spans
missingSpan := model.FlamegraphSpan{
SpanID: reference.SpanId,
TraceID: spanNode.TraceID,
ServiceName: "",
Name: "Missing Span",
TimeUnixNano: spanNode.TimeUnixNano,
DurationNano: spanNode.DurationNano,
HasError: false,
Events: make([]model.Event, 0),
Children: make([]*model.FlamegraphSpan, 0),
}
missingSpan.Children = append(missingSpan.Children, spanNode)
spanIdToSpanNodeMap[missingSpan.SpanID] = &missingSpan
traceRoots = append(traceRoots, &missingSpan)
}
missingSpan.Children = append(missingSpan.Children, spanNode)
spanIdToSpanNodeMap[missingSpan.SpanID] = &missingSpan
traceRoots = append(traceRoots, &missingSpan)
}
}
if !hasParentSpanNode && !tracedetail.ContainsFlamegraphSpan(traceRoots, spanNode) {
traceRoots = append(traceRoots, spanNode)
}
}
if !hasParentSpanNode && !tracedetail.ContainsFlamegraphSpan(traceRoots, spanNode) {
traceRoots = append(traceRoots, spanNode)
}
}
selectedSpans = tracedetail.GetAllSpansForFlamegraph(traceRoots, spanIdToSpanNodeMap)
selectedSpans = tracedetail.GetAllSpansForFlamegraph(traceRoots, spanIdToSpanNodeMap)
// TODO: set the trace data (model.GetFlamegraphSpansForTraceCache) in cache here
// removed existing cache usage since it was not getting used due to this bug https://github.com/SigNoz/engineering-pod/issues/4648
// and was causing out of memory issues https://github.com/SigNoz/engineering-pod/issues/4638
r.logger.Info("getFlamegraphSpansForTrace: processing pre cache", "duration", time.Since(processingBeforeCache), "traceID", traceID)
}
processingPostCache := time.Now()
selectedSpansForRequest := selectedSpans
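Most of the reader hunk above is the same tree-assembly pass, done once for the waterfall view and once for the flamegraph: spans are indexed by span ID, each CHILD_OF reference attaches a span to its parent, and when the referenced parent was never ingested a synthetic "Missing Span" node is created to hold the orphan and recorded as a trace root. A compact standalone sketch of that pass, with simplified span fields that are illustrative rather than SigNoz's model types:

package main

import "fmt"

// span is a simplified stand-in for model.Span / model.FlamegraphSpan.
type span struct {
	SpanID   string
	ParentID string // empty when the span carries no CHILD_OF reference
	Name     string
	Children []*span
}

// buildTree indexes spans by ID, attaches each span to its parent, and inserts
// a placeholder "Missing Span" parent when the referenced parent is absent,
// mirroring the reconstruction done in the waterfall and flamegraph paths.
func buildTree(spans []*span) (roots []*span, hasMissing bool) {
	byID := make(map[string]*span, len(spans))
	for _, s := range spans {
		byID[s.SpanID] = s
	}
	for _, s := range spans {
		if s.ParentID == "" {
			roots = append(roots, s) // no parent reference: a genuine root
			continue
		}
		if parent, ok := byID[s.ParentID]; ok {
			parent.Children = append(parent.Children, s)
			continue
		}
		// The parent was referenced but never ingested: synthesize it so the
		// orphaned subtree still hangs off a visible root.
		missing := &span{SpanID: s.ParentID, Name: "Missing Span", Children: []*span{s}}
		byID[missing.SpanID] = missing
		roots = append(roots, missing)
		hasMissing = true
	}
	return roots, hasMissing
}

func main() {
	spans := []*span{
		{SpanID: "a", Name: "GET /orders"},
		{SpanID: "b", ParentID: "a", Name: "db query"},
		{SpanID: "c", ParentID: "zzz", Name: "orphaned span"}, // parent never ingested
	}
	roots, hasMissing := buildTree(spans)
	fmt.Println(len(roots), hasMissing) // 2 true
}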

View File

@@ -7,6 +7,7 @@ import (
"net/http"
"slices"
"github.com/SigNoz/signoz/pkg/cache/memorycache"
"github.com/SigNoz/signoz/pkg/errors"
"github.com/SigNoz/signoz/pkg/factory"
"github.com/SigNoz/signoz/pkg/queryparser"
@@ -78,12 +79,25 @@ func NewServer(config signoz.Config, signoz *signoz.SigNoz) (*Server, error) {
return nil, err
}
cacheForTraceDetail, err := memorycache.New(context.TODO(), signoz.Instrumentation.ToProviderSettings(), cache.Config{
Provider: "memory",
Memory: cache.Memory{
NumCounters: 10 * 10000,
MaxCost: 1 << 27, // 128 MB
},
})
if err != nil {
return nil, err
}
reader := clickhouseReader.NewReader(
signoz.Instrumentation.Logger(),
signoz.SQLStore,
signoz.TelemetryStore,
signoz.Prometheus,
signoz.TelemetryStore.Cluster(),
config.Querier.FluxInterval,
cacheForTraceDetail,
signoz.Cache,
nil,
)

View File

@@ -1 +1,74 @@
package model
import (
"encoding/json"
"maps"
"github.com/SigNoz/signoz/pkg/types/cachetypes"
)
type GetWaterfallSpansForTraceWithMetadataCache struct {
StartTime uint64 `json:"startTime"`
EndTime uint64 `json:"endTime"`
DurationNano uint64 `json:"durationNano"`
TotalSpans uint64 `json:"totalSpans"`
TotalErrorSpans uint64 `json:"totalErrorSpans"`
ServiceNameToTotalDurationMap map[string]uint64 `json:"serviceNameToTotalDurationMap"`
SpanIdToSpanNodeMap map[string]*Span `json:"spanIdToSpanNodeMap"`
TraceRoots []*Span `json:"traceRoots"`
HasMissingSpans bool `json:"hasMissingSpans"`
}
func (c *GetWaterfallSpansForTraceWithMetadataCache) Clone() cachetypes.Cacheable {
copyOfServiceNameToTotalDurationMap := make(map[string]uint64)
maps.Copy(copyOfServiceNameToTotalDurationMap, c.ServiceNameToTotalDurationMap)
copyOfSpanIdToSpanNodeMap := make(map[string]*Span)
maps.Copy(copyOfSpanIdToSpanNodeMap, c.SpanIdToSpanNodeMap)
copyOfTraceRoots := make([]*Span, len(c.TraceRoots))
copy(copyOfTraceRoots, c.TraceRoots)
return &GetWaterfallSpansForTraceWithMetadataCache{
StartTime: c.StartTime,
EndTime: c.EndTime,
DurationNano: c.DurationNano,
TotalSpans: c.TotalSpans,
TotalErrorSpans: c.TotalErrorSpans,
ServiceNameToTotalDurationMap: copyOfServiceNameToTotalDurationMap,
SpanIdToSpanNodeMap: copyOfSpanIdToSpanNodeMap,
TraceRoots: copyOfTraceRoots,
HasMissingSpans: c.HasMissingSpans,
}
}
func (c *GetWaterfallSpansForTraceWithMetadataCache) MarshalBinary() (data []byte, err error) {
return json.Marshal(c)
}
func (c *GetWaterfallSpansForTraceWithMetadataCache) UnmarshalBinary(data []byte) error {
return json.Unmarshal(data, c)
}
type GetFlamegraphSpansForTraceCache struct {
StartTime uint64 `json:"startTime"`
EndTime uint64 `json:"endTime"`
DurationNano uint64 `json:"durationNano"`
SelectedSpans [][]*FlamegraphSpan `json:"selectedSpans"`
TraceRoots []*FlamegraphSpan `json:"traceRoots"`
}
func (c *GetFlamegraphSpansForTraceCache) Clone() cachetypes.Cacheable {
return &GetFlamegraphSpansForTraceCache{
StartTime: c.StartTime,
EndTime: c.EndTime,
DurationNano: c.DurationNano,
SelectedSpans: c.SelectedSpans,
TraceRoots: c.TraceRoots,
}
}
func (c *GetFlamegraphSpansForTraceCache) MarshalBinary() (data []byte, err error) {
return json.Marshal(c)
}
func (c *GetFlamegraphSpansForTraceCache) UnmarshalBinary(data []byte) error {
return json.Unmarshal(data, c)
}
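The cache payload types in this last file implement encoding.BinaryMarshaler and BinaryUnmarshaler by round-tripping through JSON, plus a Clone method, presumably so a byte-oriented backend (such as the Redis provider mentioned in the commit message) can store the encoded form while the in-memory provider hands out copies rather than shared pointers. A quick round-trip sketch under those assumptions, using a trimmed-down entry type rather than the real model structs:

package main

import (
	"encoding/json"
	"fmt"
)

// entry is a trimmed-down stand-in for GetFlamegraphSpansForTraceCache.
type entry struct {
	StartTime    uint64 `json:"startTime"`
	EndTime      uint64 `json:"endTime"`
	DurationNano uint64 `json:"durationNano"`
}

// MarshalBinary / UnmarshalBinary mirror the model types above: the cache
// payload is just its JSON encoding.
func (e *entry) MarshalBinary() ([]byte, error)    { return json.Marshal(e) }
func (e *entry) UnmarshalBinary(data []byte) error { return json.Unmarshal(data, e) }

func main() {
	in := &entry{StartTime: 1_700_000_000_000, EndTime: 1_700_000_000_500, DurationNano: 500_000_000}

	raw, err := in.MarshalBinary() // what a byte-oriented backend would store
	if err != nil {
		panic(err)
	}

	out := new(entry)
	if err := out.UnmarshalBinary(raw); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", *out) // round-trips losslessly through JSON
}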