feat(llmobs): introduce ragas eval integration (#12143)
Publicize the RAGAS integration

[RAGAS](https://docs.ragas.io/en/stable/getstarted/index.html) is an
evaluation framework for RAG (retrieval-augmented generation) applications. The integration supports
evaluating LLM inferences with three RAGAS metrics (see the setup sketch below):
- faithfulness
- answer relevancy
- context precision

Duplicate of #11939, reopened to land in ddtrace 3.0.
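As a quick illustration, a minimal setup sketch (not part of this change): it assumes a service with `ddtrace`, `ragas`, and an OpenAI key available; `ragas_faithfulness` is the label used in this PR's tests, and the `ml_app` name is made up.

```python
import os

# Comma-separated evaluator labels. "ragas_faithfulness" appears in this
# PR's tests; the other labels are assumed from the release note.
os.environ["DD_LLMOBS_EVALUATORS"] = "ragas_faithfulness"

from ddtrace.llmobs import LLMObs

# Once enabled, finished LLM spans are buffered and periodically scored by
# the configured RAGAS evaluators; results appear as evaluation metrics.
LLMObs.enable(ml_app="my-rag-app")
```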

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing
strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met 
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance
implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan <[email protected]>
Authored by lievan on Jan 31, 2025
1 parent 19f2378 · commit 7257296
Showing 6 changed files with 35 additions and 10 deletions.
4 changes: 3 additions & 1 deletion ddtrace/llmobs/_evaluators/runner.py

```diff
@@ -32,6 +32,8 @@ class EvaluatorRunner(PeriodicService):
     2. triggers evaluator runs over buffered finished spans on each `periodic` call
     """
 
+    EVALUATORS_ENV_VAR = "DD_LLMOBS_EVALUATORS"
+
     def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         super(EvaluatorRunner, self).__init__(interval=interval)
         self._lock = forksafe.RLock()
@@ -46,7 +48,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         if len(self.evaluators) > 0:
             return
 
-        evaluator_str = os.getenv("_DD_LLMOBS_EVALUATORS")
+        evaluator_str = os.getenv(self.EVALUATORS_ENV_VAR)
         if evaluator_str is None:
             return
```
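For context, a hedged sketch of how the renamed variable is consumed; the comma-separated parsing is an assumption about `EvaluatorRunner`'s behavior and is not shown in this diff:

```python
import os

# Mirrors the class constant introduced above; the comma-separated format
# is an assumption about EvaluatorRunner's parsing, not part of this hunk.
EVALUATORS_ENV_VAR = "DD_LLMOBS_EVALUATORS"
os.environ[EVALUATORS_ENV_VAR] = "ragas_faithfulness,ragas_answer_relevancy"

evaluator_str = os.getenv(EVALUATORS_ENV_VAR)
evaluator_labels = evaluator_str.split(",") if evaluator_str else []
print(evaluator_labels)  # ['ragas_faithfulness', 'ragas_answer_relevancy']
```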
5 changes: 3 additions & 2 deletions ddtrace/llmobs/_evaluators/sampler.py

```diff
@@ -46,7 +46,7 @@ def __repr__(self):
 
 
 class EvaluatorRunnerSampler:
-    SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
+    SAMPLING_RULES_ENV_VAR = "DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
 
     def __init__(self):
         self.rules = self.parse_rules()
@@ -59,8 +59,9 @@ def sample(self, evaluator_label, span):
 
     def parse_rules(self) -> List[EvaluatorRunnerSamplingRule]:
         rules = []
+
         sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR)
-        telemetry_writer.add_configuration("_DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")
+        telemetry_writer.add_configuration(self.SAMPLING_RULES_ENV_VAR, sampling_rules_str, origin="env")
 
         def parsing_failed_because(msg, maybe_throw_this):
             telemetry_writer.add_log(
```
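The renamed sampling-rules variable takes a JSON array of rules. A hedged configuration sketch; the rule keys (`sample_rate`, `evaluator_label`, `span_name`) follow the LLM Observability docs and do not appear in this diff:

```python
import json
import os

# One illustrative rule: score only 50% of spans named "augmented_generation"
# with the faithfulness evaluator. Keys and semantics are assumptions drawn
# from the docs, not from this change.
rules = [
    {
        "sample_rate": 0.5,
        "evaluator_label": "ragas_faithfulness",
        "span_name": "augmented_generation",
    }
]
os.environ["DD_LLMOBS_EVALUATOR_SAMPLING_RULES"] = json.dumps(rules)
```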
21 changes: 21 additions & 0 deletions releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml

```diff
@@ -0,0 +1,21 @@
+---
+features:
+  - |
+    LLM Observability: This introduces an integration with the `RAGAS <https://docs.ragas.io/en/stable/>`_ evaluation framework to continuously monitor
+    the performance of context-augmented LLM generations in production.
+    The integration supports evaluating LLM inferences with the following RAGAS metrics:
+
+    - `Faithfulness <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/>`_: measures if the LLM response is faithful to the provided context.
+    - `Answer Relevancy <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/>`_: measures how relevant the LLM response is to the user input.
+    - `Context Precision <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/>`_: measures how effectively the context is used in the generated response.
+
+    To learn more, see the `LLM Observability evaluations guide <https://docs.datadoghq.com/llm_observability/submit_evaluations/>`_.
+deprecations:
+  - |
+    LLM Observability: The `_DD_LLMOBS_EVALUATORS` environment variable is deprecated and will be removed in ddtrace 3.0.0.
+    As an alternative to `_DD_LLMOBS_EVALUATORS`, you can use `DD_LLMOBS_EVALUATORS` instead.
+    To migrate, replace `_DD_LLMOBS_EVALUATORS` with `DD_LLMOBS_EVALUATORS`.
+  - |
+    LLM Observability: The `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES` environment variable is deprecated and will be removed in ddtrace 3.0.0.
+    As an alternative to `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES`, you can use `DD_LLMOBS_EVALUATOR_SAMPLING_RULES` instead.
+    To migrate, replace `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES` with `DD_LLMOBS_EVALUATOR_SAMPLING_RULES`.
```
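A hedged migration sketch covering both deprecations; whether ddtrace still honors the old names during the deprecation window is not shown in this diff, so this copies values explicitly before configuration is read:

```python
import os

# Copy values from the deprecated names to the new ones, preferring an
# explicitly set new name. Run before ddtrace reads its configuration.
RENAMES = {
    "_DD_LLMOBS_EVALUATORS": "DD_LLMOBS_EVALUATORS",
    "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES": "DD_LLMOBS_EVALUATOR_SAMPLING_RULES",
}
for old, new in RENAMES.items():
    if old in os.environ and new not in os.environ:
        os.environ[new] = os.environ.pop(old)
```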
2 changes: 1 addition & 1 deletion tests/llmobs/test_llmobs_evaluator_runner.py

```diff
@@ -120,7 +120,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subprocess):
 
 
 def test_evaluator_runner_unsupported_evaluator():
-    with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
+    with override_env({EvaluatorRunner.EVALUATORS_ENV_VAR: "unsupported"}):
         with pytest.raises(ValueError):
             EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
```
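The same check, rewritten as a standalone hedged sketch using pytest's built-in `monkeypatch` instead of the repo's `override_env` helper:

```python
from unittest import mock

import pytest

from ddtrace.llmobs._evaluators.runner import EvaluatorRunner


def test_unknown_evaluator_label_rejected(monkeypatch):
    # Mirrors the diffed test: an unrecognized label in the evaluators env
    # var should make EvaluatorRunner raise ValueError at construction.
    monkeypatch.setenv(EvaluatorRunner.EVALUATORS_ENV_VAR, "unsupported")
    with pytest.raises(ValueError):
        EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
```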
5 changes: 3 additions & 2 deletions tests/llmobs/test_llmobs_ragas_evaluators.py

```diff
@@ -6,7 +6,8 @@
 from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
 from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
 from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
-from ddtrace.trace import Span
+from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
+from ddtrace.span import Span
 from tests.llmobs._utils import _expected_llmobs_llm_span_event
 from tests.llmobs._utils import _expected_ragas_answer_relevancy_spans
 from tests.llmobs._utils import _expected_ragas_context_precision_spans
@@ -235,7 +236,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs, run_python_code_in_subprocess):
             "PYTHONPATH": ":".join(pypath),
             "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
             "_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
-            "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
+            EvaluatorRunner.EVALUATORS_ENV_VAR: "ragas_faithfulness",
             "DD_TRACE_ENABLED": "0",
         }
     )
```
8 changes: 4 additions & 4 deletions tests/llmobs/test_llmobs_service.py

```diff
@@ -1384,7 +1384,7 @@ def test_llmobs_fork_recreates_and_restarts_eval_metric_writer():
 def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluator):
     """Test that forking a process correctly recreates and restarts the EvaluatorRunner."""
     pytest.importorskip("ragas")
-    with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+    with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
         with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
             llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app")
             original_pid = llmobs_service._instance.tracer._pid
@@ -1464,9 +1464,9 @@ def test_llmobs_fork_submit_evaluation(monkeypatch):
 
 def test_llmobs_fork_evaluator_runner_run(monkeypatch):
     """Test that forking a process correctly encodes new spans created in each process."""
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
+    monkeypatch.setenv("DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
     pytest.importorskip("ragas")
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
+    monkeypatch.setenv("DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
     with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
         llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app", api_key="test_api_key")
         pid = os.fork()
@@ -1757,7 +1757,7 @@ async def test_annotation_context_async_nested(llmobs):
 def test_service_enable_starts_evaluator_runner_when_evaluators_exist():
     pytest.importorskip("ragas")
     with override_global_config(dict(_dd_api_key="<not-a-real-api-key>", _llmobs_ml_app="<ml-app-name>")):
-        with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+        with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
            dummy_tracer = DummyTracer()
            llmobs_service.enable(_tracer=dummy_tracer)
            llmobs_instance = llmobs_service._instance
```
