Compare commits

...

1 Commits

Author SHA1 Message Date
nityanandagohain
aa140b3456 feat: trace based filters for logs, supporting aggregations as well 2026-05-21 18:47:25 +05:30
3 changed files with 545 additions and 37 deletions

View File

@@ -199,7 +199,16 @@ func (q *builderQuery[T]) Execute(ctx context.Context) (*qbtypes.Result, error)
return q.executeWindowList(ctx)
}
stmt, err := q.stmtBuilder.Build(ctx, q.fromMS, q.toMS, q.kind, q.spec, q.variables)
fromMS, toMS := q.fromMS, q.toMS
if q.spec.Signal == telemetrytypes.SignalTraces || q.spec.Signal == telemetrytypes.SignalLogs {
var overlap bool
fromMS, toMS, overlap = q.narrowWindowByTraceID(ctx, fromMS, toMS)
if !overlap {
return emptyResultFor(q.kind, q.spec.Name), nil
}
}
stmt, err := q.stmtBuilder.Build(ctx, fromMS, toMS, q.kind, q.spec, q.variables)
if err != nil {
return nil, err
}
@@ -215,6 +224,81 @@ func (q *builderQuery[T]) Execute(ctx context.Context) (*qbtypes.Result, error)
return result, nil
}
// narrowWindowByTraceID inspects the filter for trace_id predicates and clamps
// [fromMS,toMS] to the time range stored in signoz_traces.distributed_trace_summary.
// Returns the (possibly narrowed) window and overlap=false when the trace lies
// completely outside the query window — callers should short-circuit in that case.
//
// When the trace_id is not present in trace_summary the behaviour differs by
// signal:
// - traces: trace_summary is derived from the spans table, so a missing row
// means no spans exist for that trace_id; we short-circuit to empty.
// - logs: logs can carry a trace_id even when traces are not ingested at all
// (e.g. traces disabled). We must not short-circuit; instead leave the
// window untouched and let the query run.
func (q *builderQuery[T]) narrowWindowByTraceID(ctx context.Context, fromMS, toMS uint64) (uint64, uint64, bool) {
if q.spec.Filter == nil || q.spec.Filter.Expression == "" {
return fromMS, toMS, true
}
traceIDs, found := telemetrytraces.ExtractTraceIDsFromFilter(q.spec.Filter.Expression)
if !found || len(traceIDs) == 0 {
return fromMS, toMS, true
}
finder := telemetrytraces.NewTraceTimeRangeFinder(q.telemetryStore)
traceStart, traceEnd, ok := finder.GetTraceTimeRangeMulti(ctx, traceIDs)
if !ok {
if q.spec.Signal == telemetrytypes.SignalTraces {
q.logger.DebugContext(ctx, "trace_id not found in trace_summary; short-circuiting traces query to empty",
slog.Any("trace_ids", traceIDs))
return fromMS, toMS, false
}
q.logger.DebugContext(ctx, "trace_id not found in trace_summary; leaving time range untouched for logs",
slog.Any("trace_ids", traceIDs))
return fromMS, toMS, true
}
traceStartMS := uint64(traceStart) / 1_000_000
traceEndMS := uint64(traceEnd) / 1_000_000
if traceStartMS == 0 || traceEndMS == 0 {
return fromMS, toMS, true
}
if traceStartMS > toMS || traceEndMS < fromMS {
return fromMS, toMS, false
}
if traceStartMS > fromMS {
fromMS = traceStartMS
}
if traceEndMS < toMS {
toMS = traceEndMS
}
q.logger.DebugContext(ctx, "optimized time range using trace_id lookup",
slog.String("signal", q.spec.Signal.StringValue()),
slog.Any("trace_ids", traceIDs),
slog.Uint64("start", fromMS),
slog.Uint64("end", toMS))
return fromMS, toMS, true
}
// emptyResultFor returns an empty result payload appropriate for the given kind.
func emptyResultFor(kind qbtypes.RequestType, queryName string) *qbtypes.Result {
var value any
switch kind {
case qbtypes.RequestTypeTimeSeries:
value = &qbtypes.TimeSeriesData{QueryName: queryName}
case qbtypes.RequestTypeScalar:
value = &qbtypes.ScalarData{QueryName: queryName}
default:
value = &qbtypes.RawData{QueryName: queryName}
}
return &qbtypes.Result{
Type: kind,
Value: value,
}
}
// executeWithContext executes the query with query window and step context for partial value detection.
func (q *builderQuery[T]) executeWithContext(ctx context.Context, query string, args []any) (*qbtypes.Result, error) {
ctx = ctxtypes.NewContextWithCommentVals(ctx, map[string]string{
@@ -310,42 +394,22 @@ func (q *builderQuery[T]) executeWindowList(ctx context.Context) (*qbtypes.Resul
totalBytes := uint64(0)
start := time.Now()
// Check if filter contains trace_id(s) and optimize time range if needed
if q.spec.Signal == telemetrytypes.SignalTraces &&
q.spec.Filter != nil && q.spec.Filter.Expression != "" {
traceIDs, found := telemetrytraces.ExtractTraceIDsFromFilter(q.spec.Filter.Expression)
if found && len(traceIDs) > 0 {
finder := telemetrytraces.NewTraceTimeRangeFinder(q.telemetryStore)
traceStart, traceEnd, ok := finder.GetTraceTimeRangeMulti(ctx, traceIDs)
traceStartMS := uint64(traceStart) / 1_000_000
traceEndMS := uint64(traceEnd) / 1_000_000
if !ok {
q.logger.DebugContext(ctx, "failed to get trace time range", slog.Any("trace_ids", traceIDs))
} else if traceStartMS > 0 && traceEndMS > 0 {
// no overlap — nothing to return
if uint64(traceStartMS) > toMS || uint64(traceEndMS) < fromMS {
return &qbtypes.Result{
Type: qbtypes.RequestTypeRaw,
Value: &qbtypes.RawData{
QueryName: q.spec.Name,
},
Stats: qbtypes.ExecStats{
DurationMS: uint64(time.Since(start).Milliseconds()),
},
}, nil
}
// clamp window to trace time range before bucketing
if uint64(traceStartMS) > fromMS {
fromMS = uint64(traceStartMS)
}
if uint64(traceEndMS) < toMS {
toMS = uint64(traceEndMS)
}
q.logger.DebugContext(ctx, "optimized time range for traces", slog.Any("trace_ids", traceIDs), slog.Uint64("start", fromMS), slog.Uint64("end", toMS))
}
// Check if filter contains trace_id(s) and optimize time range if needed.
// Applies to both traces (the listing this branch was built for) and logs
// (which carry trace_id and benefit from the same clamp before bucketing).
if q.spec.Signal == telemetrytypes.SignalTraces || q.spec.Signal == telemetrytypes.SignalLogs {
var overlap bool
fromMS, toMS, overlap = q.narrowWindowByTraceID(ctx, fromMS, toMS)
if !overlap {
return &qbtypes.Result{
Type: qbtypes.RequestTypeRaw,
Value: &qbtypes.RawData{
QueryName: q.spec.Name,
},
Stats: qbtypes.ExecStats{
DurationMS: uint64(time.Since(start).Milliseconds()),
},
}, nil
}
}

View File

@@ -20,6 +20,7 @@ from fixtures.querier import (
index_series_by_label,
make_query_request,
)
from fixtures.traces import TraceIdGenerator, Traces, TracesKind, TracesStatusCode
def test_logs_list(
@@ -2293,3 +2294,333 @@ def test_logs_formula_orderby_and_limit(
assert len(f3_services) == 3, f"F3: expected 3 rows after limit, got {len(f3_services)}"
assert f3_values == f4_values[:3], f"F3 values {f3_values} do not match F4[:3] values {f4_values[:3]}"
assert set(f3_services) == set(f4_services[:3]), f"F3 services {f3_services} do not match F4[:3] services {f4_services[:3]}"
def test_logs_list_filter_by_trace_id(
signoz: types.SigNoz,
create_user_admin: None, # pylint: disable=unused-argument
get_token: Callable[[str, str], str],
insert_logs: Callable[[list[Logs]], None],
insert_traces: Callable[[list[Traces]], None],
) -> None:
"""
Tests that filtering logs by trace_id uses the trace_summary lookup to
narrow the query window before scanning the logs table:
1. Returns the matching log (narrow window, single bucket).
2. Does not return duplicate logs when the query window spans multiple
exponential buckets (>1 h).
3. Returns no results when the query window does not contain the trace.
4. Logs carrying a trace_id whose trace is NOT in trace_summary (e.g.
traces disabled) are still returned — the lookup miss must not
short-circuit logs queries.
"""
target_trace_id = TraceIdGenerator.trace_id()
other_trace_id = TraceIdGenerator.trace_id()
orphan_trace_id = TraceIdGenerator.trace_id()
target_root_span_id = TraceIdGenerator.span_id()
target_child_span_id = TraceIdGenerator.span_id()
other_span_id = TraceIdGenerator.span_id()
orphan_span_id = TraceIdGenerator.span_id()
now = datetime.now(tz=UTC).replace(second=0, microsecond=0)
common_resources = {
"deployment.environment": "production",
"service.name": "logs-trace-filter-service",
"cloud.provider": "integration",
}
# Populate signoz_traces.distributed_trace_summary by inserting spans for
# the target trace_id. trace_summary records min/max of span timestamps
# (it ignores span duration), so two spans are inserted to give the trace
# a non-trivial recorded window of [now-10s, now-5s].
insert_traces(
[
Traces(
timestamp=now - timedelta(seconds=10),
duration=timedelta(seconds=1),
trace_id=target_trace_id,
span_id=target_root_span_id,
parent_span_id="",
name="root-span",
kind=TracesKind.SPAN_KIND_SERVER,
status_code=TracesStatusCode.STATUS_CODE_OK,
status_message="",
resources=common_resources,
attributes={},
),
Traces(
timestamp=now - timedelta(seconds=5),
duration=timedelta(seconds=1),
trace_id=target_trace_id,
span_id=target_child_span_id,
parent_span_id=target_root_span_id,
name="child-span",
kind=TracesKind.SPAN_KIND_CLIENT,
status_code=TracesStatusCode.STATUS_CODE_OK,
status_message="",
resources=common_resources,
attributes={},
),
]
)
# Insert logs:
# - one with the target trace_id, at a timestamp within the trace's
# recorded window (now-10s..now-5s, padded ±1s).
# - one with a different trace_id; must never appear in target_trace_id
# results.
# - one with an orphan trace_id whose trace was never ingested — used to
# verify the lookup miss does NOT short-circuit logs queries.
insert_logs(
[
Logs(
timestamp=now - timedelta(seconds=7),
resources=common_resources,
attributes={"http.method": "GET"},
body="log inside the target trace window",
severity_text="INFO",
trace_id=target_trace_id,
span_id=target_root_span_id,
),
Logs(
timestamp=now - timedelta(seconds=3),
resources=common_resources,
attributes={"http.method": "POST"},
body="log with a different trace_id",
severity_text="INFO",
trace_id=other_trace_id,
span_id=other_span_id,
),
Logs(
timestamp=now - timedelta(seconds=2),
resources=common_resources,
attributes={"http.method": "PUT"},
body="log with a trace_id absent from trace_summary",
severity_text="INFO",
trace_id=orphan_trace_id,
span_id=orphan_span_id,
),
]
)
token = get_token(USER_ADMIN_EMAIL, USER_ADMIN_PASSWORD)
def _query(start_ms: int, end_ms: int, trace_id: str) -> list:
response = make_query_request(
signoz,
token,
start_ms=start_ms,
end_ms=end_ms,
request_type="raw",
queries=[
{
"type": "builder_query",
"spec": {
"name": "A",
"signal": "logs",
"disabled": False,
"limit": 100,
"offset": 0,
"filter": {"expression": f"trace_id = '{trace_id}'"},
"order": [
{"key": {"name": "timestamp"}, "direction": "desc"},
{"key": {"name": "id"}, "direction": "desc"},
],
"having": {"expression": ""},
"aggregations": [{"expression": "count()"}],
},
}
],
)
assert response.status_code == HTTPStatus.OK
assert response.json()["status"] == "success"
return response.json()["data"]["data"]["results"][0]["rows"] or []
now_ms = int(now.timestamp() * 1000)
# --- Test 1: narrow window (single bucket, <1 h) ---
narrow_start_ms = int((now - timedelta(minutes=5)).timestamp() * 1000)
narrow_rows = _query(narrow_start_ms, now_ms, target_trace_id)
assert len(narrow_rows) == 1, f"Expected 1 log for trace_id filter (narrow window), got {len(narrow_rows)}"
assert narrow_rows[0]["data"]["trace_id"] == target_trace_id
assert narrow_rows[0]["data"]["span_id"] == target_root_span_id
# --- Test 2: wide window (>1 h, triggers multiple exponential buckets) ---
# Should still return exactly one log — no duplicates from multi-bucket scan.
wide_start_ms = int((now - timedelta(hours=12)).timestamp() * 1000)
wide_rows = _query(wide_start_ms, now_ms, target_trace_id)
assert len(wide_rows) == 1, f"Expected 1 log for trace_id filter (wide window, multi-bucket), got {len(wide_rows)} — possible duplicate-log regression"
assert wide_rows[0]["data"]["trace_id"] == target_trace_id
assert wide_rows[0]["data"]["span_id"] == target_root_span_id
# --- Test 3: window that does not contain the trace returns no results ---
past_start_ms = int((now - timedelta(hours=6)).timestamp() * 1000)
past_end_ms = int((now - timedelta(hours=2)).timestamp() * 1000)
past_rows = _query(past_start_ms, past_end_ms, target_trace_id)
assert len(past_rows) == 0, f"Expected 0 logs for trace_id filter outside time window, got {len(past_rows)}"
# --- Test 4: trace_id not present in trace_summary still returns logs ---
orphan_rows = _query(narrow_start_ms, now_ms, orphan_trace_id)
assert len(orphan_rows) == 1, f"Expected 1 log for orphan trace_id (no trace_summary entry), got {len(orphan_rows)} — logs query may have been incorrectly short-circuited"
assert orphan_rows[0]["data"]["trace_id"] == orphan_trace_id
def test_logs_aggregation_filter_by_trace_id(
signoz: types.SigNoz,
create_user_admin: None, # pylint: disable=unused-argument
get_token: Callable[[str, str], str],
insert_logs: Callable[[list[Logs]], None],
insert_traces: Callable[[list[Traces]], None],
) -> None:
"""
Tests that the trace_id time-range optimization also applies to
non-window-list (time_series / aggregation) logs queries:
1. Wide query window containing the trace returns the correct count.
2. Query window outside the trace's time range short-circuits to an
empty result.
3. A trace_id with no row in trace_summary (e.g. traces disabled) still
returns the matching logs — the lookup miss must not short-circuit
logs aggregation queries.
"""
target_trace_id = TraceIdGenerator.trace_id()
orphan_trace_id = TraceIdGenerator.trace_id()
target_root_span_id = TraceIdGenerator.span_id()
target_child_span_id = TraceIdGenerator.span_id()
orphan_span_id = TraceIdGenerator.span_id()
now = datetime.now(tz=UTC).replace(second=0, microsecond=0)
common_resources = {
"deployment.environment": "production",
"service.name": "logs-trace-agg-service",
"cloud.provider": "integration",
}
# trace_summary records min/max of span timestamps (it ignores duration),
# so insert two spans to give the trace a recorded window wide enough to
# comfortably contain the log timestamps below.
insert_traces(
[
Traces(
timestamp=now - timedelta(seconds=10),
duration=timedelta(seconds=1),
trace_id=target_trace_id,
span_id=target_root_span_id,
parent_span_id="",
name="root-span",
kind=TracesKind.SPAN_KIND_SERVER,
status_code=TracesStatusCode.STATUS_CODE_OK,
status_message="",
resources=common_resources,
attributes={},
),
Traces(
timestamp=now - timedelta(seconds=5),
duration=timedelta(seconds=1),
trace_id=target_trace_id,
span_id=target_child_span_id,
parent_span_id=target_root_span_id,
name="child-span",
kind=TracesKind.SPAN_KIND_CLIENT,
status_code=TracesStatusCode.STATUS_CODE_OK,
status_message="",
resources=common_resources,
attributes={},
),
]
)
# Two logs for the target trace_id, both inside the recorded trace window.
# One additional log carries an orphan trace_id with no row in
# trace_summary — used to verify that the lookup miss does not
# short-circuit logs aggregations.
insert_logs(
[
Logs(
timestamp=now - timedelta(seconds=9),
resources=common_resources,
attributes={},
body="log A inside trace window",
severity_text="INFO",
trace_id=target_trace_id,
span_id=target_root_span_id,
),
Logs(
timestamp=now - timedelta(seconds=6),
resources=common_resources,
attributes={},
body="log B inside trace window",
severity_text="INFO",
trace_id=target_trace_id,
span_id=target_root_span_id,
),
Logs(
timestamp=now - timedelta(seconds=2),
resources=common_resources,
attributes={},
body="log with a trace_id absent from trace_summary",
severity_text="INFO",
trace_id=orphan_trace_id,
span_id=orphan_span_id,
),
]
)
token = get_token(USER_ADMIN_EMAIL, USER_ADMIN_PASSWORD)
def _count(start_ms: int, end_ms: int, trace_id: str) -> float:
response = make_query_request(
signoz,
token,
start_ms=start_ms,
end_ms=end_ms,
request_type="time_series",
queries=[
{
"type": "builder_query",
"spec": {
"name": "A",
"signal": "logs",
"stepInterval": 60,
"disabled": False,
"filter": {"expression": f"trace_id = '{trace_id}'"},
"having": {"expression": ""},
"aggregations": [{"expression": "count()"}],
},
}
],
)
assert response.status_code == HTTPStatus.OK
assert response.json()["status"] == "success"
results = response.json()["data"]["data"]["results"]
assert len(results) == 1
aggregations = results[0].get("aggregations") or []
if not aggregations:
return 0
series = aggregations[0].get("series") or []
if not series:
return 0
return sum(v["value"] for v in series[0]["values"])
now_ms = int(now.timestamp() * 1000)
narrow_start_ms = int((now - timedelta(minutes=5)).timestamp() * 1000)
# --- Test 1: wide window (>1 h) containing the trace returns 2 logs ---
wide_start_ms = int((now - timedelta(hours=12)).timestamp() * 1000)
wide_count = _count(wide_start_ms, now_ms, target_trace_id)
assert wide_count == 2, f"Expected count=2 for trace_id aggregation (wide window), got {wide_count}"
# --- Test 2: window outside the trace short-circuits to empty ---
past_start_ms = int((now - timedelta(hours=6)).timestamp() * 1000)
past_end_ms = int((now - timedelta(hours=2)).timestamp() * 1000)
past_count = _count(past_start_ms, past_end_ms, target_trace_id)
assert past_count == 0, f"Expected count=0 for trace_id aggregation outside time window, got {past_count}"
# --- Test 3: trace_id not present in trace_summary still returns logs ---
orphan_count = _count(narrow_start_ms, now_ms, orphan_trace_id)
assert orphan_count == 1, f"Expected count=1 for orphan trace_id aggregation, got {orphan_count} — query may have been incorrectly short-circuited"

View File

@@ -2123,3 +2123,116 @@ def test_traces_list_filter_by_trace_id(
past_rows = _query(past_start_ms, past_end_ms)
assert len(past_rows) == 0, f"Expected 0 spans for trace_id filter outside time window, got {len(past_rows)}"
def test_traces_aggregation_filter_by_trace_id(
signoz: types.SigNoz,
create_user_admin: None, # pylint: disable=unused-argument
get_token: Callable[[str, str], str],
insert_traces: Callable[[list[Traces]], None],
) -> None:
"""
Tests that the trace_id time-range optimization also applies to
non-window-list (time_series / aggregation) traces queries:
1. Wide query window containing the trace returns the correct count.
2. Query window outside the trace's time range short-circuits to empty.
3. Filter referencing a trace_id with no row in trace_summary
short-circuits to empty (trace_summary is authoritative for traces).
"""
target_trace_id = TraceIdGenerator.trace_id()
target_root_span_id = TraceIdGenerator.span_id()
target_child_span_id = TraceIdGenerator.span_id()
missing_trace_id = TraceIdGenerator.trace_id()
now = datetime.now(tz=UTC).replace(second=0, microsecond=0)
common_resources = {
"deployment.environment": "production",
"service.name": "traces-agg-filter-service",
"cloud.provider": "integration",
}
insert_traces(
[
Traces(
timestamp=now - timedelta(seconds=10),
duration=timedelta(seconds=5),
trace_id=target_trace_id,
span_id=target_root_span_id,
parent_span_id="",
name="root-span",
kind=TracesKind.SPAN_KIND_SERVER,
status_code=TracesStatusCode.STATUS_CODE_OK,
status_message="",
resources=common_resources,
attributes={"http.request.method": "GET"},
),
Traces(
timestamp=now - timedelta(seconds=9),
duration=timedelta(seconds=1),
trace_id=target_trace_id,
span_id=target_child_span_id,
parent_span_id=target_root_span_id,
name="child-span",
kind=TracesKind.SPAN_KIND_CLIENT,
status_code=TracesStatusCode.STATUS_CODE_OK,
status_message="",
resources=common_resources,
attributes={},
),
]
)
token = get_token(USER_ADMIN_EMAIL, USER_ADMIN_PASSWORD)
def _count(start_ms: int, end_ms: int, trace_id: str) -> float:
response = make_query_request(
signoz,
token,
start_ms=start_ms,
end_ms=end_ms,
request_type="time_series",
queries=[
{
"type": "builder_query",
"spec": {
"name": "A",
"signal": "traces",
"stepInterval": 60,
"disabled": False,
"filter": {"expression": f"trace_id = '{trace_id}'"},
"having": {"expression": ""},
"aggregations": [{"expression": "count()"}],
},
}
],
)
assert response.status_code == HTTPStatus.OK
assert response.json()["status"] == "success"
results = response.json()["data"]["data"]["results"]
assert len(results) == 1
aggregations = results[0].get("aggregations") or []
if not aggregations:
return 0
series = aggregations[0].get("series") or []
if not series:
return 0
return sum(v["value"] for v in series[0]["values"])
now_ms = int(now.timestamp() * 1000)
# --- Test 1: wide window (>1 h) containing the trace returns both spans ---
wide_start_ms = int((now - timedelta(hours=12)).timestamp() * 1000)
wide_count = _count(wide_start_ms, now_ms, target_trace_id)
assert wide_count == 2, f"Expected count=2 for trace_id aggregation (wide window), got {wide_count}"
# --- Test 2: window outside the trace short-circuits to empty ---
past_start_ms = int((now - timedelta(hours=6)).timestamp() * 1000)
past_end_ms = int((now - timedelta(hours=2)).timestamp() * 1000)
past_count = _count(past_start_ms, past_end_ms, target_trace_id)
assert past_count == 0, f"Expected count=0 for trace_id aggregation outside time window, got {past_count}"
# --- Test 3: trace_id with no entry in trace_summary short-circuits ---
missing_start_ms = int((now - timedelta(minutes=5)).timestamp() * 1000)
missing_count = _count(missing_start_ms, now_ms, missing_trace_id)
assert missing_count == 0, f"Expected count=0 for trace_id absent from trace_summary, got {missing_count}"