Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(llmobs): ragas evaluation framework integration #11939

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion ddtrace/llmobs/_evaluators/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
from ddtrace.internal.periodic import PeriodicService
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_NAMESPACE
from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
from ddtrace.llmobs._evaluators.sampler import EvaluatorRunnerSampler
from ddtrace.vendor.debtcollector import deprecate


logger = get_logger(__name__)
Expand All @@ -31,6 +33,9 @@ class EvaluatorRunner(PeriodicService):
2. triggers evaluator runs over buffered finished spans on each `periodic` call
"""

EVALUATORS_ENV_VAR = "DD_LLMOBS_EVALUATORS"
DEPRECATED_EVALUATORS_ENV_VAR = "_DD_LLMOBS_EVALUATORS"

def __init__(self, interval: float, llmobs_service=None, evaluators=None):
super(EvaluatorRunner, self).__init__(interval=interval)
self._lock = forksafe.RLock()
Expand All @@ -45,7 +50,16 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
if len(self.evaluators) > 0:
return

evaluator_str = os.getenv("_DD_LLMOBS_EVALUATORS")
deprecated_evaluator_str = os.getenv(self.DEPRECATED_EVALUATORS_ENV_VAR)
if deprecated_evaluator_str is not None:
deprecate(
"Using `_DD_LLMOBS_EVALUATORS` is deprecated",
message="Please use `DD_LLMOBS_EVALUATORS` instead.",
removal_version="3.2.0",
category=DDTraceDeprecationWarning,
)

evaluator_str = os.getenv(self.EVALUATORS_ENV_VAR) or deprecated_evaluator_str
if evaluator_str is None:
return

Expand Down
18 changes: 15 additions & 3 deletions ddtrace/llmobs/_evaluators/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL
from ddtrace.internal.telemetry.constants import TELEMETRY_NAMESPACE
from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
from ddtrace.vendor.debtcollector import deprecate


logger = get_logger(__name__)
Expand Down Expand Up @@ -46,7 +48,8 @@ def __repr__(self):


class EvaluatorRunnerSampler:
SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
SAMPLING_RULES_ENV_VAR = "DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
DEPRECATED_SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"

def __init__(self):
self.rules = self.parse_rules()
Expand All @@ -59,8 +62,17 @@ def sample(self, evaluator_label, span):

def parse_rules(self) -> List[EvaluatorRunnerSamplingRule]:
rules = []
sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR)
telemetry_writer.add_configuration("_DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")

deprecated_sampling_rules_str = os.getenv(self.DEPRECATED_SAMPLING_RULES_ENV_VAR)
lievan marked this conversation as resolved.
Show resolved Hide resolved
if deprecated_sampling_rules_str is not None:
deprecate(
"Using `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES` is deprecated",
message="Please use `DD_LLMOBS_EVALUATOR_SAMPLING_RULES` instead.",
removal_version="3.2.0",
category=DDTraceDeprecationWarning,
)
sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR) or deprecated_sampling_rules_str
telemetry_writer.add_configuration("DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")

def parsing_failed_because(msg, maybe_throw_this):
telemetry_writer.add_log(
Expand Down
21 changes: 21 additions & 0 deletions releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
features:
- |
LLM Observability: This introduces an integration with the `RAGAS <https://docs.ragas.io/en/stable/>`_ evaluation framework to continuously monitor
the performance of context-augmented LLM generations in production.

The integration supports evaluating LLM inferences with the following RAGAS metrics:
- `Faithfulness <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/>`_: measures if the LLM response is faithful to the provided context.
- `Answer Relevancy <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/>`_: measures how relevant the LLM response is to the user input.
- `Context Precision <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/>`_: measures how relevant the retrieved context is to the user input, i.e. the proportion of retrieved context that is actually useful for generating the response.

To learn more, see the `LLM Observability evaluations guide <https://docs.datadoghq.com/llm_observability/submit_evaluations/>`_.
deprecations:
- |
LLM Observability: The ``_DD_LLMOBS_EVALUATORS`` environment variable is deprecated and will be removed in ddtrace 3.2.0.
To migrate, replace ``_DD_LLMOBS_EVALUATORS`` with ``DD_LLMOBS_EVALUATORS``.
- |
LLM Observability: The ``_DD_LLMOBS_EVALUATOR_SAMPLING_RULES`` environment variable is deprecated and will be removed in ddtrace 3.2.0.
To migrate, replace ``_DD_LLMOBS_EVALUATOR_SAMPLING_RULES`` with ``DD_LLMOBS_EVALUATOR_SAMPLING_RULES``.
2 changes: 1 addition & 1 deletion tests/llmobs/test_llmobs_evaluator_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces


def test_evaluator_runner_unsupported_evaluator():
with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
with override_env({EvaluatorRunner.EVALUATORS_ENV_VAR: "unsupported"}):
with pytest.raises(ValueError):
EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())

Expand Down
3 changes: 2 additions & 1 deletion tests/llmobs/test_llmobs_ragas_evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
from ddtrace.span import Span
from tests.llmobs._utils import _expected_llmobs_llm_span_event
from tests.llmobs._utils import _expected_ragas_answer_relevancy_spans
Expand Down Expand Up @@ -235,7 +236,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
"PYTHONPATH": ":".join(pypath),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
"_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
"_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
EvaluatorRunner.EVALUATORS_ENV_VAR: "ragas_faithfulness",
"DD_TRACE_ENABLED": "0",
}
)
Expand Down
4 changes: 2 additions & 2 deletions tests/llmobs/test_llmobs_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1387,7 +1387,7 @@ def test_llmobs_fork_recreates_and_restarts_eval_metric_writer():

def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluator):
"""Test that forking a process correctly recreates and restarts the EvaluatorRunner."""
with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app")
original_pid = llmobs_service._instance.tracer._pid
Expand Down Expand Up @@ -1758,7 +1758,7 @@ async def test_annotation_context_async_nested(llmobs):
def test_service_enable_starts_evaluator_runner_when_evaluators_exist():
pytest.importorskip("ragas")
with override_global_config(dict(_dd_api_key="<not-a-real-api-key>", _llmobs_ml_app="<ml-app-name>")):
with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
dummy_tracer = DummyTracer()
llmobs_service.enable(_tracer=dummy_tracer)
llmobs_instance = llmobs_service._instance
Expand Down
Loading