Skip to content

Commit

Permalink
fix(opentelemetry): resolve lazy sampling + distributed tracing bug (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
mabdinur committed Aug 12, 2024
1 parent 3de8627 commit d983322
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 54 deletions.
29 changes: 22 additions & 7 deletions ddtrace/opentelemetry/_span.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from opentelemetry.trace.span import TraceFlags
from opentelemetry.trace.span import TraceState

from ddtrace import tracer as ddtracer
from ddtrace.constants import ERROR_MSG
from ddtrace.constants import SPAN_KIND
from ddtrace.internal import core
Expand Down Expand Up @@ -135,13 +136,27 @@ def kind(self):
def get_span_context(self):
# type: () -> SpanContext
"""Returns an OpenTelemetry SpanContext"""
ts = None
tf = TraceFlags.DEFAULT
if self._ddspan.context:
ts_str = w3c_tracestate_add_p(self._ddspan.context._tracestate, self._ddspan.span_id)
ts = TraceState.from_header([ts_str])
if self._ddspan.context.sampling_priority and self._ddspan.context.sampling_priority > 0:
tf = TraceFlags.SAMPLED
if self._ddspan.context.sampling_priority is None:
# With the introduction of lazy sampling, spans are now sampled on serialization. With this change
# a spans trace flags could be propagated before a sampling
# decision is made. Since the default sampling decision is to unsample spans this can result
# in missing spans. To resolve this issue, a sampling decision must be made the first time
# the span context is accessed.
ddtracer.sample(self._ddspan._local_root or self._ddspan)

if self._ddspan.context.sampling_priority is None:
tf = TraceFlags.DEFAULT
log.warning(
"Span context is missing a sampling decision, defaulting to unsampled: %s", str(self._ddspan.context)
)
elif self._ddspan.context.sampling_priority > 0:
tf = TraceFlags.SAMPLED
else:
tf = TraceFlags.DEFAULT

# Evaluate the tracestate header after the sampling decision has been made
ts_str = w3c_tracestate_add_p(self._ddspan.context._tracestate, self._ddspan.span_id)
ts = TraceState.from_header([ts_str])

return SpanContext(self._ddspan.trace_id, self._ddspan.span_id, False, tf, ts)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
fixes:
- |
opentelemetry: Resolves an edge case where distributed tracing headers could be generated before a sampling decision is made,
resulting in dropped spans in downstream services.
58 changes: 53 additions & 5 deletions tests/opentelemetry/test_trace.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import opentelemetry
from opentelemetry.trace import set_span_in_context
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
import opentelemetry.version
import pytest

Expand Down Expand Up @@ -42,7 +44,7 @@ def test_otel_start_span_without_default_args(oteltracer):
root = oteltracer.start_span("root-span")
otel_span = oteltracer.start_span(
"test-start-span",
context=opentelemetry.trace.set_span_in_context(root),
context=set_span_in_context(root),
kind=opentelemetry.trace.SpanKind.CLIENT,
attributes={"start_span_tag": "start_span_val"},
links=None,
Expand Down Expand Up @@ -114,7 +116,7 @@ def test_otel_start_current_span_without_default_args(oteltracer):
with oteltracer.start_as_current_span("root-span") as root:
with oteltracer.start_as_current_span(
"test-start-current-span-no-defualts",
context=opentelemetry.trace.set_span_in_context(root),
context=set_span_in_context(root),
kind=opentelemetry.trace.SpanKind.SERVER,
attributes={"start_current_span_tag": "start_cspan_val"},
links=[],
Expand All @@ -135,6 +137,50 @@ def test_otel_start_current_span_without_default_args(oteltracer):
otel_span.end()


def test_otel_get_span_context_sets_sampling_decision(oteltracer):
with oteltracer.start_span("otel-server") as otelspan:
# Sampling priority is not set on span creation
assert otelspan._ddspan.context.sampling_priority is None
# Ensure the sampling priority is always consistent with traceflags
span_context = otelspan.get_span_context()
# Sampling priority is evaluated when the SpanContext is first accessed
sp = otelspan._ddspan.context.sampling_priority
assert sp is not None
if sp > 0:
assert span_context.trace_flags == 1
else:
assert span_context.trace_flags == 0
# Ensure the sampling priority is always consistent
for _ in range(1000):
otelspan.get_span_context()
assert otelspan._ddspan.context.sampling_priority == sp


def test_distributed_trace_inject(oteltracer): # noqa:F811
with oteltracer.start_as_current_span("test-otel-distributed-trace") as span:
headers = {}
TraceContextTextMapPropagator().inject(headers, set_span_in_context(span))
sp = span.get_span_context()
assert headers["traceparent"] == f"00-{sp.trace_id:032x}-{sp.span_id:016x}-{sp.trace_flags:02x}"
assert headers["tracestate"] == sp.trace_state.to_header()


def test_distributed_trace_extract(oteltracer): # noqa:F811
headers = {
"traceparent": "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
"tracestate": "congo=t61rcWkgMzE,dd=s:2",
}
context = TraceContextTextMapPropagator().extract(headers)
with oteltracer.start_as_current_span("test-otel-distributed-trace", context=context) as span:
sp = span.get_span_context()
assert sp.trace_id == int("0af7651916cd43dd8448eb211c80319c", 16)
assert span._ddspan.parent_id == int("b7ad6b7169203331", 16)
assert sp.trace_flags == 1
assert sp.trace_state.get("congo") == "t61rcWkgMzE"
assert "s:2" in sp.trace_state.get("dd")
assert sp.is_remote is False


@flaky(1735812000)
@pytest.mark.parametrize(
"flask_wsgi_application,flask_env_arg,flask_port,flask_command",
Expand All @@ -161,10 +207,12 @@ def test_otel_start_current_span_without_default_args(oteltracer):
"with_opentelemetry_instrument",
],
)
@pytest.mark.snapshot(ignores=["metrics.net.peer.port", "meta.traceparent", "meta.flask.version"])
@pytest.mark.snapshot(ignores=["metrics.net.peer.port", "meta.traceparent", "meta.tracestate", "meta.flask.version"])
def test_distributed_trace_with_flask_app(flask_client, oteltracer): # noqa:F811
with oteltracer.start_as_current_span("test-otel-distributed-trace"):
resp = flask_client.get("/otel")
with oteltracer.start_as_current_span("test-otel-distributed-trace") as span:
headers = {}
TraceContextTextMapPropagator().inject(headers, set_span_in_context(span))
resp = flask_client.get("/otel", headers=headers)

assert resp.text == "otel"
assert resp.status_code == 200
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
"error": 0,
"meta": {
"_dd.p.dm": "-0",
"_dd.p.tid": "655535db00000000",
"_dd.p.tid": "66a6956800000000",
"language": "python",
"runtime-id": "b4ffa244c11343de919ca20a7e8eebcf"
"runtime-id": "0da930484bcf4b488bfd7bb9cfd9c4b6"
},
"metrics": {
"_dd.top_level": 1,
Expand All @@ -35,11 +35,12 @@
"meta": {
"_dd.base_service": "",
"_dd.p.dm": "-0",
"_dd.p.tid": "655535db00000000",
"_dd.p.tid": "66a6956800000000",
"_dd.parent_id": "4f4e94b0f4d57229",
"component": "flask",
"flask.endpoint": "otel",
"flask.url_rule": "/otel",
"flask.version": "2.1.3",
"flask.version": "1.1.4",
"http.method": "GET",
"http.route": "/otel",
"http.status_code": "200",
Expand All @@ -56,10 +57,10 @@
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
"_sampling_priority_v1": 1,
"process_id": 7713
"process_id": 31660
},
"duration": 14813125,
"start": 1700083163141821300
"duration": 4600000,
"start": 1722193256335520000
},
{
"name": "flask.application",
Expand All @@ -76,8 +77,8 @@
"flask.endpoint": "otel",
"flask.url_rule": "/otel"
},
"duration": 14243208,
"start": 1700083163142061300
"duration": 4219000,
"start": 1722193256335701000
},
{
"name": "flask.try_trigger_before_first_request_functions",
Expand All @@ -92,8 +93,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 14625,
"start": 1700083163142217383
"duration": 12000,
"start": 1722193256335827000
},
{
"name": "flask.preprocess_request",
Expand All @@ -108,8 +109,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 20125,
"start": 1700083163142331717
"duration": 13000,
"start": 1722193256335911000
},
{
"name": "flask.dispatch_request",
Expand All @@ -124,8 +125,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 13437750,
"start": 1700083163142415633
"duration": 3646000,
"start": 1722193256335970000
},
{
"name": "tests.opentelemetry.flask_app.otel",
Expand All @@ -140,8 +141,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 13354542,
"start": 1700083163142480508
"duration": 3585000,
"start": 1722193256336021000
},
{
"name": "internal",
Expand All @@ -155,8 +156,8 @@
"meta": {
"_dd.base_service": ""
},
"duration": 52875,
"start": 1700083163155755800
"duration": 36000,
"start": 1722193256339552000
},
{
"name": "flask.process_response",
Expand All @@ -171,8 +172,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 32375,
"start": 1700083163156004175
"duration": 17000,
"start": 1722193256339703000
},
{
"name": "flask.do_teardown_request",
Expand All @@ -187,8 +188,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 31625,
"start": 1700083163156192133
"duration": 14000,
"start": 1722193256339831000
},
{
"name": "flask.do_teardown_appcontext",
Expand All @@ -203,8 +204,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 9292,
"start": 1700083163156278258
"duration": 10000,
"start": 1722193256339897000
},
{
"name": "flask.response",
Expand All @@ -219,6 +220,6 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 311541,
"start": 1700083163156314342
"duration": 183000,
"start": 1722193256339931000
}]]
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,32 @@
"error": 0,
"meta": {
"_dd.p.dm": "-0",
"_dd.p.tid": "655535dc00000000",
"_dd.p.tid": "66a666b600000000",
"language": "python",
"runtime-id": "b4ffa244c11343de919ca20a7e8eebcf"
"runtime-id": "4fe738d296c04c00aff7eb2a66389ad4"
},
"metrics": {
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
"_sampling_priority_v1": 1,
"process_id": 7703
"process_id": 75526
},
"duration": 24467250,
"start": 1700083164080082009
"duration": 3561000,
"start": 1722181302828129000
},
{
"name": "server",
"service": "",
"resource": "/otel",
"resource": "GET /otel",
"trace_id": 0,
"span_id": 2,
"parent_id": 1,
"type": "",
"error": 0,
"meta": {
"_dd.p.dm": "-0",
"_dd.p.tid": "655535dc00000000",
"_dd.p.tid": "66a666b600000000",
"_dd.parent_id": "ae6e2bf21738f62f",
"http.flavor": "1.1",
"http.host": "0.0.0.0:8001",
"http.method": "GET",
Expand All @@ -45,21 +46,22 @@
"http.target": "/otel",
"http.user_agent": "python-requests/2.28.1",
"language": "python",
"net.host.name": "0.0.0.0:8001",
"net.peer.ip": "127.0.0.1",
"runtime-id": "f90fe90fc53a4388b02210639d156981",
"runtime-id": "0dacca250fe4471094dc593f4892b91f",
"span.kind": "server",
"tracestate": "dd=s:1;t.dm:-0"
"tracestate": "dd=p:ae6e2bf21738f62f;s:1;t.dm:-0"
},
"metrics": {
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
"_sampling_priority_v1": 1,
"net.host.port": 8001,
"net.peer.port": 65508,
"process_id": 7721
"net.peer.port": 62683,
"process_id": 75530
},
"duration": 413250,
"start": 1700083164081574592
"duration": 515000,
"start": 1722181302830460000
},
{
"name": "internal",
Expand All @@ -70,6 +72,6 @@
"parent_id": 2,
"type": "",
"error": 0,
"duration": 18250,
"start": 1700083164081867259
"duration": 16000,
"start": 1722181302830847000
}]]

0 comments on commit d983322

Please sign in to comment.