Skip to content

Commit

Permalink
chore(llmobs): update ragas trace ml app (#11952)
Browse files Browse the repository at this point in the history
Update the ml application of any ragas evaluation span to be the same as
the span being evaluated.

#### Refresher:
- Spans are generated by datadog for RAGAS operations
- These spans need to be identified as RAGAS-specific to avoid an
infinite eval loop and tagged with 'runner.integration:ragas' for
backend purposes
- Some of these spans are auto-instrumented through our langchain
integration, meaning they can't be manually tagged

#### Previous Implementation:
- Added a `dd-ragas-` prefix to the ML application name for RAGAS spans
- This prefix was used to identify which spans came from RAGAS when
spans are processed

#### Now:
- add `_dd_ragas` prefix on ragas span's ml app for identification
purposes, except only temporarily.
- **Just before sending these spans to the backend, remove the prefix**

An alternative solution is to utilize `annotation_contexts` to set a tag
on auto-instrumented ragas spans. This is a larger refactor that i would
like to explore when we clean up the ragas evaluators (more cleanly
separate tracing setup vs actual eval logic).

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing
strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met 
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance
implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan <[email protected]>
  • Loading branch information
lievan and lievan authored Jan 22, 2025
1 parent e11a0a3 commit 57c43a7
Show file tree
Hide file tree
Showing 9 changed files with 47 additions and 34 deletions.
6 changes: 3 additions & 3 deletions ddtrace/llmobs/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@
# Used to differentiate traces of Datadog-run operations vs user-application operations.
RUNNER_IS_INTEGRATION_SPAN_TAG = "runner.integration"

# The ml app of all ragas traces have this prefix that we use to detect
# whether a span is generated from the ragas evaluation itself.
RAGAS_ML_APP_PREFIX = "dd-ragas"
# All ragas traces have this context item set so we can differentiate
# spans generated from the ragas integration vs user application spans.
IS_EVALUATION_SPAN = "_ml_obs.evaluation_span"

ANNOTATIONS_CONTEXT_ID = "annotations_context_id"
INTERNAL_CONTEXT_VARIABLE_KEYS = "_dd_context_variable_keys"
Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace

Expand Down Expand Up @@ -84,6 +85,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
with self.llmobs_service.workflow(
"dd-ragas.answer_relevancy", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_ar_workflow:
ragas_ar_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_ar_workflow)

Expand Down
6 changes: 2 additions & 4 deletions ddtrace/llmobs/_evaluators/ragas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
from typing import Tuple
from typing import Union

from ddtrace import config
from ddtrace.internal.logger import get_logger
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL
from ddtrace.internal.telemetry.constants import TELEMETRY_NAMESPACE
from ddtrace.internal.utils.version import parse_version
from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX


logger = get_logger(__name__)
Expand Down Expand Up @@ -94,9 +94,7 @@ def _get_ml_app_for_ragas_trace(span_event: dict) -> str:
if isinstance(tag, str) and tag.startswith("ml_app:"):
ml_app = tag.split(":")[1]
break
if not ml_app:
return RAGAS_ML_APP_PREFIX
return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app)
return ml_app or config._llmobs_ml_app or "unknown-ml-app"


class BaseRagasEvaluator:
Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/context_precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace

Expand Down Expand Up @@ -82,6 +83,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
with self.llmobs_service.workflow(
"dd-ragas.context_precision", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_cp_workflow:
ragas_cp_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_cp_workflow)

Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/faithfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace

Expand Down Expand Up @@ -96,6 +97,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
with self.llmobs_service.workflow(
"dd-ragas.faithfulness", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_faithfulness_workflow:
ragas_faithfulness_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(
span=ragas_faithfulness_workflow
Expand Down
35 changes: 12 additions & 23 deletions ddtrace/llmobs/_llmobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import ddtrace
Expand Down Expand Up @@ -60,6 +59,7 @@
from ddtrace.llmobs._utils import _get_session_id
from ddtrace.llmobs._utils import _get_span_name
from ddtrace.llmobs._utils import _inject_llmobs_parent_id
from ddtrace.llmobs._utils import _is_evaluation_span
from ddtrace.llmobs._utils import safe_json
from ddtrace.llmobs._utils import validate_prompt
from ddtrace.llmobs._writer import LLMObsEvalMetricWriter
Expand Down Expand Up @@ -123,23 +123,21 @@ def _on_span_finish(self, span):
def _submit_llmobs_span(self, span: Span) -> None:
"""Generate and submit an LLMObs span event to be sent to LLMObs."""
span_event = None
is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm"
is_ragas_integration_span = False
try:
span_event, is_ragas_integration_span = self._llmobs_span_event(span)
span_event = self._llmobs_span_event(span)
self._llmobs_span_writer.enqueue(span_event)
except (KeyError, TypeError):
log.error(
"Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True
)
finally:
if not span_event or not is_llm_span or is_ragas_integration_span:
if not span_event or not span._get_ctx_item(SPAN_KIND) == "llm" or _is_evaluation_span(span):
return
if self._evaluator_runner:
self._evaluator_runner.enqueue(span_event, span)

@classmethod
def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
def _llmobs_span_event(cls, span: Span) -> Dict[str, Any]:
"""Span event object structure."""
span_kind = span._get_ctx_item(SPAN_KIND)
if not span_kind:
Expand Down Expand Up @@ -186,11 +184,6 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
metrics = span._get_ctx_item(METRICS) or {}
ml_app = _get_ml_app(span)

is_ragas_integration_span = False

if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX):
is_ragas_integration_span = True

span._set_ctx_item(ML_APP, ml_app)
parent_id = str(_get_llmobs_parent_id(span) or "undefined")

Expand All @@ -210,20 +203,16 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
span._set_ctx_item(SESSION_ID, session_id)
llmobs_span_event["session_id"] = session_id

llmobs_span_event["tags"] = cls._llmobs_tags(
span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span
)
llmobs_span_event["tags"] = cls._llmobs_tags(span, ml_app, session_id)

span_links = span._get_ctx_item(SPAN_LINKS)
if isinstance(span_links, list):
llmobs_span_event["span_links"] = span_links

return llmobs_span_event, is_ragas_integration_span
return llmobs_span_event

@staticmethod
def _llmobs_tags(
span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False
) -> List[str]:
def _llmobs_tags(span: Span, ml_app: str, session_id: Optional[str] = None) -> List[str]:
tags = {
"version": config.version or "",
"env": config.env or "",
Expand All @@ -239,7 +228,7 @@ def _llmobs_tags(
tags["error_type"] = err_type
if session_id:
tags["session_id"] = session_id
if is_ragas_integration_span:
if _is_evaluation_span(span):
tags[constants.RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas"
existing_tags = span._get_ctx_item(TAGS)
if existing_tags is not None:
Expand Down Expand Up @@ -278,10 +267,6 @@ def _start_service(self) -> None:
log.debug("Error starting evaluator runner")

def _stop_service(self) -> None:
# Remove listener hooks for span events
core.reset_listeners("trace.span_start", self._on_span_start)
core.reset_listeners("trace.span_finish", self._on_span_finish)

try:
self._evaluator_runner.stop()
# flush remaining evaluation spans & evaluations
Expand All @@ -296,6 +281,10 @@ def _stop_service(self) -> None:
except ServiceStatusError:
log.debug("Error stopping LLMObs writers")

# Remove listener hooks for span events
core.reset_listeners("trace.span_start", self._on_span_start)
core.reset_listeners("trace.span_finish", self._on_span_finish)

forksafe.unregister(self._child_after_fork)

@classmethod
Expand Down
18 changes: 18 additions & 0 deletions ddtrace/llmobs/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ddtrace.llmobs._constants import GEMINI_APM_SPAN_NAME
from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._constants import LANGCHAIN_APM_SPAN_NAME
from ddtrace.llmobs._constants import ML_APP
from ddtrace.llmobs._constants import NAME
Expand Down Expand Up @@ -128,6 +129,23 @@ def _get_span_name(span: Span) -> str:
return span._get_ctx_item(NAME) or span.name


def _is_evaluation_span(span: Span) -> bool:
"""
Return whether or not a span is an evaluation span by checking the span's
nearest LLMObs span ancestor. Default to 'False'
"""
is_evaluation_span = span._get_ctx_item(IS_EVALUATION_SPAN)
if is_evaluation_span:
return is_evaluation_span
llmobs_parent = _get_nearest_llmobs_ancestor(span)
while llmobs_parent:
is_evaluation_span = llmobs_parent._get_ctx_item(IS_EVALUATION_SPAN)
if is_evaluation_span:
return is_evaluation_span
llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent)
return False


def _get_ml_app(span: Span) -> str:
"""
Return the ML app name for a given span, by checking the span's nearest LLMObs span ancestor.
Expand Down
2 changes: 1 addition & 1 deletion tests/llmobs/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ def expected_ragas_trace_tags():
"env:",
"service:tests.llmobs",
"source:integration",
"ml_app:dd-ragas-unnamed-ml-app",
"ml_app:unnamed-ml-app",
"ddtrace.version:{}".format(ddtrace.__version__),
"language:python",
"error:0",
Expand Down
8 changes: 5 additions & 3 deletions tests/llmobs/test_llmobs_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ddtrace.llmobs._constants import INPUT_PARAMETERS
from ddtrace.llmobs._constants import INPUT_PROMPT
from ddtrace.llmobs._constants import INPUT_VALUE
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._constants import METADATA
from ddtrace.llmobs._constants import METRICS
from ddtrace.llmobs._constants import MODEL_NAME
Expand All @@ -24,7 +25,6 @@
from ddtrace.llmobs._constants import OUTPUT_MESSAGES
from ddtrace.llmobs._constants import OUTPUT_VALUE
from ddtrace.llmobs._constants import PROPAGATED_PARENT_ID_KEY
from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX
from ddtrace.llmobs._constants import SESSION_ID
from ddtrace.llmobs._constants import SPAN_KIND
from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING
Expand Down Expand Up @@ -1538,8 +1538,10 @@ def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner):


def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs):
with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)):
pass
with llmobs.agent(name="test") as agent:
agent._set_ctx_item(IS_EVALUATION_SPAN, True)
with llmobs.llm(model_name="test_model"):
pass
time.sleep(0.1)
assert llmobs._instance._evaluator_runner.enqueue.call_count == 0

Expand Down

0 comments on commit 57c43a7

Please sign in to comment.