Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(llmobs): ragas evaluation framework integration #11939

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion ddtrace/llmobs/_evaluators/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
from ddtrace.internal.periodic import PeriodicService
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_NAMESPACE
from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
from ddtrace.llmobs._evaluators.sampler import EvaluatorRunnerSampler
from ddtrace.vendor.debtcollector import deprecate


logger = get_logger(__name__)
Expand All @@ -31,6 +33,9 @@ class EvaluatorRunner(PeriodicService):
2. triggers evaluator runs over buffered finished spans on each `periodic` call
"""

EVALUATORS_ENV_VAR = "DD_LLMOBS_EVALUATORS"
DEPRECATED_EVALUATORS_ENV_VAR = "_DD_LLMOBS_EVALUATORS"

def __init__(self, interval: float, llmobs_service=None, evaluators=None):
super(EvaluatorRunner, self).__init__(interval=interval)
self._lock = forksafe.RLock()
Expand All @@ -45,7 +50,16 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
if len(self.evaluators) > 0:
return

evaluator_str = os.getenv("_DD_LLMOBS_EVALUATORS")
deprecated_evaluator_str = os.getenv(self.DEPRECATED_EVALUATORS_ENV_VAR)
if deprecated_evaluator_str is not None:
deprecate(
"Using `_DD_LLMOBS_EVALUATORS` is deprecated",
message="Please use `DD_LLMOBS_EVALUATORS` instead.",
removal_version="3.2.0",
category=DDTraceDeprecationWarning,
)

evaluator_str = os.getenv(self.EVALUATORS_ENV_VAR) or deprecated_evaluator_str
if evaluator_str is None:
return

Expand Down
18 changes: 15 additions & 3 deletions ddtrace/llmobs/_evaluators/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL
from ddtrace.internal.telemetry.constants import TELEMETRY_NAMESPACE
from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
from ddtrace.vendor.debtcollector import deprecate


logger = get_logger(__name__)
Expand Down Expand Up @@ -46,7 +48,8 @@ def __repr__(self):


class EvaluatorRunnerSampler:
SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
SAMPLING_RULES_ENV_VAR = "DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
DEPRECATED_SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"

def __init__(self):
self.rules = self.parse_rules()
Expand All @@ -59,8 +62,17 @@ def sample(self, evaluator_label, span):

def parse_rules(self) -> List[EvaluatorRunnerSamplingRule]:
rules = []
sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR)
telemetry_writer.add_configuration("_DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")

deprecated_sampling_rules_str = os.getenv(self.DEPRECATED_SAMPLING_RULES_ENV_VAR)
lievan marked this conversation as resolved.
Show resolved Hide resolved
if deprecated_sampling_rules_str is not None:
deprecate(
"Using `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES` is deprecated",
message="Please use `DD_LLMOBS_EVALUATOR_SAMPLING_RULES` instead.",
removal_version="3.2.0",
category=DDTraceDeprecationWarning,
)
sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR) or deprecated_sampling_rules_str
telemetry_writer.add_configuration("DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")

def parsing_failed_because(msg, maybe_throw_this):
telemetry_writer.add_log(
Expand Down
21 changes: 21 additions & 0 deletions releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
features:
- |
LLM Observability: This introduces an integration with the `RAGAS <https://docs.ragas.io/en/stable/>`_ evaluation framework to continuously monitor
the performance of context-augmented LLM generations in production.

The integration supports evaluating LLM inferences with the following RAGAS metrics:
- `Faithfulness <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/>`_: measures if the LLM response is faithful to the provided context.
- `Answer Relevancy <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/>`_: measures how relevant the LLM response is to the user input.
- `Context Precision <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/>`_: measures how relevant the retrieved context is to the user input, i.e. the proportion of retrieved context that is actually useful for generating the response.

To learn more, see the `LLM Observability evaluations guide <https://docs.datadoghq.com/llm_observability/submit_evaluations/>`_.
deprecations:
- |
LLM Observability: The ``_DD_LLMOBS_EVALUATORS`` environment variable is deprecated and will be removed in ddtrace 3.2.0.
To migrate, replace ``_DD_LLMOBS_EVALUATORS`` with ``DD_LLMOBS_EVALUATORS``.
- |
LLM Observability: The ``_DD_LLMOBS_EVALUATOR_SAMPLING_RULES`` environment variable is deprecated and will be removed in ddtrace 3.2.0.
To migrate, replace ``_DD_LLMOBS_EVALUATOR_SAMPLING_RULES`` with ``DD_LLMOBS_EVALUATOR_SAMPLING_RULES``.
2 changes: 1 addition & 1 deletion tests/llmobs/test_llmobs_evaluator_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces


def test_evaluator_runner_unsupported_evaluator():
with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
with override_env({EvaluatorRunner.EVALUATORS_ENV_VAR: "unsupported"}):
with pytest.raises(ValueError):
EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())

Expand Down
3 changes: 2 additions & 1 deletion tests/llmobs/test_llmobs_ragas_evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
from ddtrace.span import Span
from tests.llmobs._utils import _expected_llmobs_llm_span_event
from tests.llmobs._utils import _expected_ragas_answer_relevancy_spans
Expand Down Expand Up @@ -235,7 +236,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
"PYTHONPATH": ":".join(pypath),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
"_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
"_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
EvaluatorRunner.EVALUATORS_ENV_VAR: "ragas_faithfulness",
"DD_TRACE_ENABLED": "0",
}
)
Expand Down
4 changes: 2 additions & 2 deletions tests/llmobs/test_llmobs_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1387,7 +1387,7 @@ def test_llmobs_fork_recreates_and_restarts_eval_metric_writer():

def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluator):
"""Test that forking a process correctly recreates and restarts the EvaluatorRunner."""
with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app")
original_pid = llmobs_service._instance.tracer._pid
Expand Down Expand Up @@ -1758,7 +1758,7 @@ async def test_annotation_context_async_nested(llmobs):
def test_service_enable_starts_evaluator_runner_when_evaluators_exist():
pytest.importorskip("ragas")
with override_global_config(dict(_dd_api_key="<not-a-real-api-key>", _llmobs_ml_app="<ml-app-name>")):
with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
dummy_tracer = DummyTracer()
llmobs_service.enable(_tracer=dummy_tracer)
llmobs_instance = llmobs_service._instance
Expand Down
Loading