diff --git a/ddtrace/opentelemetry/_span.py b/ddtrace/opentelemetry/_span.py index a08501bfcfb..af3d2962167 100644 --- a/ddtrace/opentelemetry/_span.py +++ b/ddtrace/opentelemetry/_span.py @@ -8,6 +8,7 @@ from opentelemetry.trace.span import TraceFlags from opentelemetry.trace.span import TraceState +from ddtrace import tracer as ddtracer from ddtrace.constants import ERROR_MSG from ddtrace.constants import SPAN_KIND from ddtrace.internal import core @@ -135,13 +136,27 @@ def kind(self): def get_span_context(self): # type: () -> SpanContext """Returns an OpenTelemetry SpanContext""" - ts = None - tf = TraceFlags.DEFAULT - if self._ddspan.context: - ts_str = w3c_tracestate_add_p(self._ddspan.context._tracestate, self._ddspan.span_id) - ts = TraceState.from_header([ts_str]) - if self._ddspan.context.sampling_priority and self._ddspan.context.sampling_priority > 0: - tf = TraceFlags.SAMPLED + if self._ddspan.context.sampling_priority is None: + # With the introduction of lazy sampling, spans are now sampled on serialization. With this change + # a spans trace flags could be propagated before a sampling + # decision is made. Since the default sampling decision is to unsample spans this can result + # in missing spans. To resolve this issue, a sampling decision must be made the first time + # the span context is accessed. + ddtracer.sample(self._ddspan._local_root or self._ddspan) + + if self._ddspan.context.sampling_priority is None: + tf = TraceFlags.DEFAULT + log.warning( + "Span context is missing a sampling decision, defaulting to unsampled: %s", str(self._ddspan.context) + ) + elif self._ddspan.context.sampling_priority > 0: + tf = TraceFlags.SAMPLED + else: + tf = TraceFlags.DEFAULT + + # Evaluate the tracestate header after the sampling decision has been made + ts_str = w3c_tracestate_add_p(self._ddspan.context._tracestate, self._ddspan.span_id) + ts = TraceState.from_header([ts_str]) return SpanContext(self._ddspan.trace_id, self._ddspan.span_id, False, tf, ts) diff --git a/releasenotes/notes/otel-ensure-sampling-decision-is-made-40ab760eada20b20.yaml b/releasenotes/notes/otel-ensure-sampling-decision-is-made-40ab760eada20b20.yaml new file mode 100644 index 00000000000..5c286f8a06e --- /dev/null +++ b/releasenotes/notes/otel-ensure-sampling-decision-is-made-40ab760eada20b20.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + opentelemetry: Resolves an edge case where distributed tracing headers could be generated before a sampling decision is made, + resulting in dropped spans in downstream services. diff --git a/tests/opentelemetry/test_trace.py b/tests/opentelemetry/test_trace.py index 22659664743..f05d4506e42 100644 --- a/tests/opentelemetry/test_trace.py +++ b/tests/opentelemetry/test_trace.py @@ -1,4 +1,6 @@ import opentelemetry +from opentelemetry.trace import set_span_in_context +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator import opentelemetry.version import pytest @@ -42,7 +44,7 @@ def test_otel_start_span_without_default_args(oteltracer): root = oteltracer.start_span("root-span") otel_span = oteltracer.start_span( "test-start-span", - context=opentelemetry.trace.set_span_in_context(root), + context=set_span_in_context(root), kind=opentelemetry.trace.SpanKind.CLIENT, attributes={"start_span_tag": "start_span_val"}, links=None, @@ -114,7 +116,7 @@ def test_otel_start_current_span_without_default_args(oteltracer): with oteltracer.start_as_current_span("root-span") as root: with oteltracer.start_as_current_span( "test-start-current-span-no-defualts", - context=opentelemetry.trace.set_span_in_context(root), + context=set_span_in_context(root), kind=opentelemetry.trace.SpanKind.SERVER, attributes={"start_current_span_tag": "start_cspan_val"}, links=[], @@ -135,6 +137,50 @@ def test_otel_start_current_span_without_default_args(oteltracer): otel_span.end() +def test_otel_get_span_context_sets_sampling_decision(oteltracer): + with oteltracer.start_span("otel-server") as otelspan: + # Sampling priority is not set on span creation + assert otelspan._ddspan.context.sampling_priority is None + # Ensure the sampling priority is always consistent with traceflags + span_context = otelspan.get_span_context() + # Sampling priority is evaluated when the SpanContext is first accessed + sp = otelspan._ddspan.context.sampling_priority + assert sp is not None + if sp > 0: + assert span_context.trace_flags == 1 + else: + assert span_context.trace_flags == 0 + # Ensure the sampling priority is always consistent + for _ in range(1000): + otelspan.get_span_context() + assert otelspan._ddspan.context.sampling_priority == sp + + +def test_distributed_trace_inject(oteltracer): # noqa:F811 + with oteltracer.start_as_current_span("test-otel-distributed-trace") as span: + headers = {} + TraceContextTextMapPropagator().inject(headers, set_span_in_context(span)) + sp = span.get_span_context() + assert headers["traceparent"] == f"00-{sp.trace_id:032x}-{sp.span_id:016x}-{sp.trace_flags:02x}" + assert headers["tracestate"] == sp.trace_state.to_header() + + +def test_distributed_trace_extract(oteltracer): # noqa:F811 + headers = { + "traceparent": "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01", + "tracestate": "congo=t61rcWkgMzE,dd=s:2", + } + context = TraceContextTextMapPropagator().extract(headers) + with oteltracer.start_as_current_span("test-otel-distributed-trace", context=context) as span: + sp = span.get_span_context() + assert sp.trace_id == int("0af7651916cd43dd8448eb211c80319c", 16) + assert span._ddspan.parent_id == int("b7ad6b7169203331", 16) + assert sp.trace_flags == 1 + assert sp.trace_state.get("congo") == "t61rcWkgMzE" + assert "s:2" in sp.trace_state.get("dd") + assert sp.is_remote is False + + @flaky(1735812000) @pytest.mark.parametrize( "flask_wsgi_application,flask_env_arg,flask_port,flask_command", @@ -161,10 +207,12 @@ def test_otel_start_current_span_without_default_args(oteltracer): "with_opentelemetry_instrument", ], ) -@pytest.mark.snapshot(ignores=["metrics.net.peer.port", "meta.traceparent", "meta.flask.version"]) +@pytest.mark.snapshot(ignores=["metrics.net.peer.port", "meta.traceparent", "meta.tracestate", "meta.flask.version"]) def test_distributed_trace_with_flask_app(flask_client, oteltracer): # noqa:F811 - with oteltracer.start_as_current_span("test-otel-distributed-trace"): - resp = flask_client.get("/otel") + with oteltracer.start_as_current_span("test-otel-distributed-trace") as span: + headers = {} + TraceContextTextMapPropagator().inject(headers, set_span_in_context(span)) + resp = flask_client.get("/otel", headers=headers) assert resp.text == "otel" assert resp.status_code == 200 diff --git a/tests/snapshots/tests.opentelemetry.test_trace.test_distributed_trace_with_flask_app[with_ddtrace_run].json b/tests/snapshots/tests.opentelemetry.test_trace.test_distributed_trace_with_flask_app[with_ddtrace_run].json index c05e181c129..132e962af9b 100644 --- a/tests/snapshots/tests.opentelemetry.test_trace.test_distributed_trace_with_flask_app[with_ddtrace_run].json +++ b/tests/snapshots/tests.opentelemetry.test_trace.test_distributed_trace_with_flask_app[with_ddtrace_run].json @@ -10,9 +10,9 @@ "error": 0, "meta": { "_dd.p.dm": "-0", - "_dd.p.tid": "655535db00000000", + "_dd.p.tid": "66a6956800000000", "language": "python", - "runtime-id": "b4ffa244c11343de919ca20a7e8eebcf" + "runtime-id": "0da930484bcf4b488bfd7bb9cfd9c4b6" }, "metrics": { "_dd.top_level": 1, @@ -35,11 +35,12 @@ "meta": { "_dd.base_service": "", "_dd.p.dm": "-0", - "_dd.p.tid": "655535db00000000", + "_dd.p.tid": "66a6956800000000", + "_dd.parent_id": "4f4e94b0f4d57229", "component": "flask", "flask.endpoint": "otel", "flask.url_rule": "/otel", - "flask.version": "2.1.3", + "flask.version": "1.1.4", "http.method": "GET", "http.route": "/otel", "http.status_code": "200", @@ -56,10 +57,10 @@ "_dd.top_level": 1, "_dd.tracer_kr": 1.0, "_sampling_priority_v1": 1, - "process_id": 7713 + "process_id": 31660 }, - "duration": 14813125, - "start": 1700083163141821300 + "duration": 4600000, + "start": 1722193256335520000 }, { "name": "flask.application", @@ -76,8 +77,8 @@ "flask.endpoint": "otel", "flask.url_rule": "/otel" }, - "duration": 14243208, - "start": 1700083163142061300 + "duration": 4219000, + "start": 1722193256335701000 }, { "name": "flask.try_trigger_before_first_request_functions", @@ -92,8 +93,8 @@ "_dd.base_service": "", "component": "flask" }, - "duration": 14625, - "start": 1700083163142217383 + "duration": 12000, + "start": 1722193256335827000 }, { "name": "flask.preprocess_request", @@ -108,8 +109,8 @@ "_dd.base_service": "", "component": "flask" }, - "duration": 20125, - "start": 1700083163142331717 + "duration": 13000, + "start": 1722193256335911000 }, { "name": "flask.dispatch_request", @@ -124,8 +125,8 @@ "_dd.base_service": "", "component": "flask" }, - "duration": 13437750, - "start": 1700083163142415633 + "duration": 3646000, + "start": 1722193256335970000 }, { "name": "tests.opentelemetry.flask_app.otel", @@ -140,8 +141,8 @@ "_dd.base_service": "", "component": "flask" }, - "duration": 13354542, - "start": 1700083163142480508 + "duration": 3585000, + "start": 1722193256336021000 }, { "name": "internal", @@ -155,8 +156,8 @@ "meta": { "_dd.base_service": "" }, - "duration": 52875, - "start": 1700083163155755800 + "duration": 36000, + "start": 1722193256339552000 }, { "name": "flask.process_response", @@ -171,8 +172,8 @@ "_dd.base_service": "", "component": "flask" }, - "duration": 32375, - "start": 1700083163156004175 + "duration": 17000, + "start": 1722193256339703000 }, { "name": "flask.do_teardown_request", @@ -187,8 +188,8 @@ "_dd.base_service": "", "component": "flask" }, - "duration": 31625, - "start": 1700083163156192133 + "duration": 14000, + "start": 1722193256339831000 }, { "name": "flask.do_teardown_appcontext", @@ -203,8 +204,8 @@ "_dd.base_service": "", "component": "flask" }, - "duration": 9292, - "start": 1700083163156278258 + "duration": 10000, + "start": 1722193256339897000 }, { "name": "flask.response", @@ -219,6 +220,6 @@ "_dd.base_service": "", "component": "flask" }, - "duration": 311541, - "start": 1700083163156314342 + "duration": 183000, + "start": 1722193256339931000 }]] diff --git a/tests/snapshots/tests.opentelemetry.test_trace.test_distributed_trace_with_flask_app[with_opentelemetry_instrument].json b/tests/snapshots/tests.opentelemetry.test_trace.test_distributed_trace_with_flask_app[with_opentelemetry_instrument].json index e1fbd2196d3..daf39a83b8a 100644 --- a/tests/snapshots/tests.opentelemetry.test_trace.test_distributed_trace_with_flask_app[with_opentelemetry_instrument].json +++ b/tests/snapshots/tests.opentelemetry.test_trace.test_distributed_trace_with_flask_app[with_opentelemetry_instrument].json @@ -10,23 +10,23 @@ "error": 0, "meta": { "_dd.p.dm": "-0", - "_dd.p.tid": "655535dc00000000", + "_dd.p.tid": "66a666b600000000", "language": "python", - "runtime-id": "b4ffa244c11343de919ca20a7e8eebcf" + "runtime-id": "4fe738d296c04c00aff7eb2a66389ad4" }, "metrics": { "_dd.top_level": 1, "_dd.tracer_kr": 1.0, "_sampling_priority_v1": 1, - "process_id": 7703 + "process_id": 75526 }, - "duration": 24467250, - "start": 1700083164080082009 + "duration": 3561000, + "start": 1722181302828129000 }, { "name": "server", "service": "", - "resource": "/otel", + "resource": "GET /otel", "trace_id": 0, "span_id": 2, "parent_id": 1, @@ -34,7 +34,8 @@ "error": 0, "meta": { "_dd.p.dm": "-0", - "_dd.p.tid": "655535dc00000000", + "_dd.p.tid": "66a666b600000000", + "_dd.parent_id": "ae6e2bf21738f62f", "http.flavor": "1.1", "http.host": "0.0.0.0:8001", "http.method": "GET", @@ -45,21 +46,22 @@ "http.target": "/otel", "http.user_agent": "python-requests/2.28.1", "language": "python", + "net.host.name": "0.0.0.0:8001", "net.peer.ip": "127.0.0.1", - "runtime-id": "f90fe90fc53a4388b02210639d156981", + "runtime-id": "0dacca250fe4471094dc593f4892b91f", "span.kind": "server", - "tracestate": "dd=s:1;t.dm:-0" + "tracestate": "dd=p:ae6e2bf21738f62f;s:1;t.dm:-0" }, "metrics": { "_dd.top_level": 1, "_dd.tracer_kr": 1.0, "_sampling_priority_v1": 1, "net.host.port": 8001, - "net.peer.port": 65508, - "process_id": 7721 + "net.peer.port": 62683, + "process_id": 75530 }, - "duration": 413250, - "start": 1700083164081574592 + "duration": 515000, + "start": 1722181302830460000 }, { "name": "internal", @@ -70,6 +72,6 @@ "parent_id": 2, "type": "", "error": 0, - "duration": 18250, - "start": 1700083164081867259 + "duration": 16000, + "start": 1722181302830847000 }]]