From 7257296a482a92197fb43957c92ee3c908d61092 Mon Sep 17 00:00:00 2001
From: lievan <42917263+lievan@users.noreply.github.com>
Date: Fri, 31 Jan 2025 14:43:44 -0500
Subject: [PATCH] feat(llmobs): introduce ragas eval integration (#12143)

Publicize the RAGAS integration.

[RAGAS](https://docs.ragas.io/en/stable/getstarted/index.html) is an
evaluation framework for RAG applications. The integration supports
evaluating LLM inferences with three RAGAS metrics:
- faithfulness
- answer relevancy
- context precision

Duplicate of https://github.com/DataDog/dd-trace-py/pull/11939,
resubmitted so the change lands in ddtrace 3.0.

## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan
---
 ddtrace/llmobs/_evaluators/runner.py          |  4 +++-
 ddtrace/llmobs/_evaluators/sampler.py         |  5 +++--
 .../ragas-integration-a81b696757c0e7a5.yaml   | 21 +++++++++++++++++++
 tests/llmobs/test_llmobs_evaluator_runner.py  |  2 +-
 tests/llmobs/test_llmobs_ragas_evaluators.py  |  5 +++--
 tests/llmobs/test_llmobs_service.py           |  8 +++----
 6 files changed, 35 insertions(+), 10 deletions(-)
 create mode 100644 releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml

diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py
index 5e0ab2737f4..780293563d8 100644
--- a/ddtrace/llmobs/_evaluators/runner.py
+++ b/ddtrace/llmobs/_evaluators/runner.py
@@ -32,6 +32,8 @@ class EvaluatorRunner(PeriodicService):
     2. triggers evaluator runs over buffered finished spans on each `periodic` call
     """
 
+    EVALUATORS_ENV_VAR = "DD_LLMOBS_EVALUATORS"
+
     def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         super(EvaluatorRunner, self).__init__(interval=interval)
         self._lock = forksafe.RLock()
@@ -46,7 +48,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         if len(self.evaluators) > 0:
             return
 
-        evaluator_str = os.getenv("_DD_LLMOBS_EVALUATORS")
+        evaluator_str = os.getenv(self.EVALUATORS_ENV_VAR)
 
         if evaluator_str is None:
             return
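For deployments migrating off the private variable, here is a minimal sketch (not part of this patch) of a bridge that prefers the new public name and falls back to the deprecated one. The helper names are hypothetical; the runner itself reads only `EVALUATORS_ENV_VAR`, and treating the value as a comma-separated list of labels is an assumption based on how a single label like `"ragas_faithfulness"` is passed in this PR.

```python
import os
from typing import List
from typing import Optional


def resolve_evaluators_value() -> Optional[str]:
    # Hypothetical migration shim: prefer the new public variable, fall back
    # to the deprecated private one until it is removed in ddtrace 3.0.0.
    return os.getenv("DD_LLMOBS_EVALUATORS") or os.getenv("_DD_LLMOBS_EVALUATORS")


def parse_evaluator_labels() -> List[str]:
    # Assumes a comma-separated list of labels, e.g. "ragas_faithfulness".
    value = resolve_evaluators_value()
    if value is None:
        return []
    return [label.strip() for label in value.split(",") if label.strip()]
```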
diff --git a/ddtrace/llmobs/_evaluators/sampler.py b/ddtrace/llmobs/_evaluators/sampler.py
index 3598e90f7f3..524af217f83 100644
--- a/ddtrace/llmobs/_evaluators/sampler.py
+++ b/ddtrace/llmobs/_evaluators/sampler.py
@@ -46,7 +46,7 @@ def __repr__(self):
 
 
 class EvaluatorRunnerSampler:
-    SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
+    SAMPLING_RULES_ENV_VAR = "DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
 
     def __init__(self):
         self.rules = self.parse_rules()
@@ -59,8 +59,9 @@ def sample(self, evaluator_label, span):
 
     def parse_rules(self) -> List[EvaluatorRunnerSamplingRule]:
         rules = []
+
         sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR)
-        telemetry_writer.add_configuration("_DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")
+        telemetry_writer.add_configuration(self.SAMPLING_RULES_ENV_VAR, sampling_rules_str, origin="env")
 
         def parsing_failed_because(msg, maybe_throw_this):
             telemetry_writer.add_log(
diff --git a/releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml b/releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
new file mode 100644
index 00000000000..7963f891661
--- /dev/null
+++ b/releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
@@ -0,0 +1,21 @@
+---
+features:
+  - |
+    LLM Observability: This introduces an integration with the `RAGAS <https://docs.ragas.io/en/stable/getstarted/index.html>`_ evaluation framework to continuously monitor
+    the performance of context-augmented LLM generations in production.
+
+    The integration supports evaluating LLM inferences with the following RAGAS metrics:
+    - `Faithfulness `_: measures if the LLM response is faithful to the provided context.
+    - `Answer Relevancy `_: measures how relevant the LLM response is to the user input.
+    - `Context Precision `_: measures how effectively the context is used in the generated response.
+
+    To learn more, see the `LLM Observability evaluations guide `_.
+deprecations:
+  - |
+    LLM Observability: The `_DD_LLMOBS_EVALUATORS` environment variable is deprecated
+    and will be removed in ddtrace 3.0.0. To migrate, replace `_DD_LLMOBS_EVALUATORS`
+    with `DD_LLMOBS_EVALUATORS`.
+  - |
+    LLM Observability: The `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES` environment variable is deprecated
+    and will be removed in ddtrace 3.0.0. To migrate, replace `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES`
+    with `DD_LLMOBS_EVALUATOR_SAMPLING_RULES`.
\ No newline at end of file
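To make the release note concrete, a usage sketch follows. `LLMObs.enable` is the library's documented public entry point and the evaluator label comes from this patch; the sampling-rules JSON keys (`sample_rate`, `evaluator_label`) are an assumption about the format `parse_rules` expects, not something this diff confirms.

```python
import os

# Opt in to the RAGAS faithfulness evaluator via the env vars introduced by
# this patch. ASSUMPTION: the sampling-rules value is a JSON list of objects
# with "sample_rate" and "evaluator_label" keys.
os.environ["DD_LLMOBS_EVALUATORS"] = "ragas_faithfulness"
os.environ["DD_LLMOBS_EVALUATOR_SAMPLING_RULES"] = (
    '[{"sample_rate": 0.5, "evaluator_label": "ragas_faithfulness"}]'
)

from ddtrace.llmobs import LLMObs

# "my-rag-app" is a placeholder ml_app name; the evaluator runner starts
# alongside the LLMObs service when evaluators are configured.
LLMObs.enable(ml_app="my-rag-app")
```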
diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py
index a2c4278297c..1c941c52d83 100644
--- a/tests/llmobs/test_llmobs_evaluator_runner.py
+++ b/tests/llmobs/test_llmobs_evaluator_runner.py
@@ -120,7 +120,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subprocess):
 
 
 def test_evaluator_runner_unsupported_evaluator():
-    with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
+    with override_env({EvaluatorRunner.EVALUATORS_ENV_VAR: "unsupported"}):
         with pytest.raises(ValueError):
             EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
 
diff --git a/tests/llmobs/test_llmobs_ragas_evaluators.py b/tests/llmobs/test_llmobs_ragas_evaluators.py
index 9766c18c1e5..c46dce740c2 100644
--- a/tests/llmobs/test_llmobs_ragas_evaluators.py
+++ b/tests/llmobs/test_llmobs_ragas_evaluators.py
@@ -6,7 +6,8 @@
 from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
 from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
 from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
-from ddtrace.trace import Span
+from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
+from ddtrace.span import Span
 from tests.llmobs._utils import _expected_llmobs_llm_span_event
 from tests.llmobs._utils import _expected_ragas_answer_relevancy_spans
 from tests.llmobs._utils import _expected_ragas_context_precision_spans
@@ -235,7 +236,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs):
             "PYTHONPATH": ":".join(pypath),
             "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
             "_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
-            "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
+            EvaluatorRunner.EVALUATORS_ENV_VAR: "ragas_faithfulness",
             "DD_TRACE_ENABLED": "0",
         }
     )
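The test changes above all follow one pattern: reference the shared `EvaluatorRunner.EVALUATORS_ENV_VAR` constant instead of re-hardcoding the variable name, so another rename only touches the runner module. A minimal standalone sketch of that pattern, using pytest's `monkeypatch` rather than this suite's `override_env` helper (the expected `ValueError` for an unknown label comes from the runner test above; the test name is illustrative):

```python
from unittest import mock

import pytest

from ddtrace.llmobs._evaluators.runner import EvaluatorRunner


def test_unknown_evaluator_label_raises(monkeypatch):
    # The env var name lives in one place: the runner's class constant.
    monkeypatch.setenv(EvaluatorRunner.EVALUATORS_ENV_VAR, "unsupported")
    # Constructing the runner parses the env var and rejects unknown labels.
    with pytest.raises(ValueError):
        EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
```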
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
index ff099ae3f71..2fe3e1fbfab 100644
--- a/tests/llmobs/test_llmobs_service.py
+++ b/tests/llmobs/test_llmobs_service.py
@@ -1384,7 +1384,7 @@ def test_llmobs_fork_recreates_and_restarts_eval_metric_writer():
 def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluator):
     """Test that forking a process correctly recreates and restarts the EvaluatorRunner."""
     pytest.importorskip("ragas")
-    with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+    with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
         with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
             llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app")
             original_pid = llmobs_service._instance.tracer._pid
@@ -1464,9 +1464,9 @@ def test_llmobs_fork_submit_evaluation(monkeypatch):
 
 def test_llmobs_fork_evaluator_runner_run(monkeypatch):
     """Test that forking a process correctly encodes new spans created in each process."""
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
+    monkeypatch.setenv("DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
     pytest.importorskip("ragas")
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
+    monkeypatch.setenv("DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
     with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
         llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app", api_key="test_api_key")
         pid = os.fork()
@@ -1757,7 +1757,7 @@ async def test_annotation_context_async_nested(llmobs):
 def test_service_enable_starts_evaluator_runner_when_evaluators_exist():
     pytest.importorskip("ragas")
     with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")):
-        with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+        with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
             dummy_tracer = DummyTracer()
             llmobs_service.enable(_tracer=dummy_tracer)
             llmobs_instance = llmobs_service._instance
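Since both deprecations in the release note are pure renames, a hypothetical startup shim (not part of this patch, stdlib only) could keep an unmigrated deployment working until the private names are removed in ddtrace 3.0.0:

```python
import os
import warnings

# Hypothetical migration shim: map the deprecated private env vars from the
# release note onto their public replacements, warning as we go.
_RENAMES = {
    "_DD_LLMOBS_EVALUATORS": "DD_LLMOBS_EVALUATORS",
    "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES": "DD_LLMOBS_EVALUATOR_SAMPLING_RULES",
}

for _old, _new in _RENAMES.items():
    if _old in os.environ and _new not in os.environ:
        warnings.warn("%s is deprecated; set %s instead" % (_old, _new), DeprecationWarning)
        os.environ[_new] = os.environ[_old]
```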