From 7257296a482a92197fb43957c92ee3c908d61092 Mon Sep 17 00:00:00 2001
From: lievan <42917263+lievan@users.noreply.github.com>
Date: Fri, 31 Jan 2025 14:43:44 -0500
Subject: [PATCH] feat(llmobs): introduce ragas eval integration (#12143)

Publicize the RAGAS integration.

[RAGAS](https://docs.ragas.io/en/stable/getstarted/index.html) is an
evaluation framework for RAG applications. The integration supports
evaluating LLM inferences with three RAGAS metrics:
- faithfulness
- answer relevancy
- context precision

Duplicate of https://github.com/DataDog/dd-trace-py/pull/11939,
resubmitted so the change lands in ddtrace 3.0.

## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan
---
 ddtrace/llmobs/_evaluators/runner.py          |  4 +++-
 ddtrace/llmobs/_evaluators/sampler.py         |  5 +++--
 .../ragas-integration-a81b696757c0e7a5.yaml   | 21 +++++++++++++++++++
 tests/llmobs/test_llmobs_evaluator_runner.py  |  2 +-
 tests/llmobs/test_llmobs_ragas_evaluators.py  |  5 +++--
 tests/llmobs/test_llmobs_service.py           |  8 +++----
 6 files changed, 35 insertions(+), 10 deletions(-)
 create mode 100644 releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml

diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py
index 5e0ab2737f4..780293563d8 100644
--- a/ddtrace/llmobs/_evaluators/runner.py
+++ b/ddtrace/llmobs/_evaluators/runner.py
@@ -32,6 +32,8 @@ class EvaluatorRunner(PeriodicService):
     2. triggers evaluator runs over buffered finished spans on each `periodic` call
     """
 
+    EVALUATORS_ENV_VAR = "DD_LLMOBS_EVALUATORS"
+
     def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         super(EvaluatorRunner, self).__init__(interval=interval)
         self._lock = forksafe.RLock()
@@ -46,7 +48,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         if len(self.evaluators) > 0:
             return
 
-        evaluator_str = os.getenv("_DD_LLMOBS_EVALUATORS")
+        evaluator_str = os.getenv(self.EVALUATORS_ENV_VAR)
 
         if evaluator_str is None:
             return
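For deployments migrating off the private variable, here is a minimal sketch (not part of this patch) of a bridge that prefers the new public name and falls back to the deprecated one. The helper names are hypothetical; the runner itself reads only `EVALUATORS_ENV_VAR`, and treating the value as a comma-separated list of labels is an assumption based on how a single label like `"ragas_faithfulness"` is passed in this PR.

```python
import os
from typing import List
from typing import Optional


def resolve_evaluators_value() -> Optional[str]:
    # Hypothetical migration shim: prefer the new public variable, fall back
    # to the deprecated private one until it is removed in ddtrace 3.0.0.
    return os.getenv("DD_LLMOBS_EVALUATORS") or os.getenv("_DD_LLMOBS_EVALUATORS")


def parse_evaluator_labels() -> List[str]:
    # Assumes a comma-separated list of labels, e.g. "ragas_faithfulness".
    value = resolve_evaluators_value()
    if value is None:
        return []
    return [label.strip() for label in value.split(",") if label.strip()]
```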
diff --git a/ddtrace/llmobs/_evaluators/sampler.py b/ddtrace/llmobs/_evaluators/sampler.py
index 3598e90f7f3..524af217f83 100644
--- a/ddtrace/llmobs/_evaluators/sampler.py
+++ b/ddtrace/llmobs/_evaluators/sampler.py
@@ -46,7 +46,7 @@ def __repr__(self):
 
 
 class EvaluatorRunnerSampler:
-    SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
+    SAMPLING_RULES_ENV_VAR = "DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
 
     def __init__(self):
         self.rules = self.parse_rules()
@@ -59,8 +59,9 @@ def sample(self, evaluator_label, span):
 
     def parse_rules(self) -> List[EvaluatorRunnerSamplingRule]:
         rules = []
+
         sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR)
-        telemetry_writer.add_configuration("_DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")
+        telemetry_writer.add_configuration(self.SAMPLING_RULES_ENV_VAR, sampling_rules_str, origin="env")
 
         def parsing_failed_because(msg, maybe_throw_this):
             telemetry_writer.add_log(
diff --git a/releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml b/releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
new file mode 100644
index 00000000000..7963f891661
--- /dev/null
+++ b/releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
@@ -0,0 +1,21 @@
+---
+features:
+  - |
+    LLM Observability: This introduces an integration with the `RAGAS <https://docs.ragas.io/en/stable/getstarted/index.html>`_ evaluation framework to continuously monitor
+    the performance of context-augmented LLM generations in production.
+
+    The integration supports evaluating LLM inferences with the following RAGAS metrics:
+    - `Faithfulness `_: measures if the LLM response is faithful to the provided context.
+    - `Answer Relevancy `_: measures how relevant the LLM response is to the user input.
+    - `Context Precision `_: measures how effectively the context is used in the generated response.
+
+    To learn more, see the `LLM Observability evaluations guide `_.
+deprecations:
+  - |
+    LLM Observability: The `_DD_LLMOBS_EVALUATORS` environment variable is deprecated
+    and will be removed in ddtrace 3.0.0. To migrate, replace `_DD_LLMOBS_EVALUATORS`
+    with `DD_LLMOBS_EVALUATORS`.
+  - |
+    LLM Observability: The `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES` environment variable is deprecated
+    and will be removed in ddtrace 3.0.0. To migrate, replace `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES`
+    with `DD_LLMOBS_EVALUATOR_SAMPLING_RULES`.
\ No newline at end of file
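To make the release note concrete, a usage sketch follows. `LLMObs.enable` is the library's documented public entry point and the evaluator label comes from this patch; the sampling-rules JSON keys (`sample_rate`, `evaluator_label`) are an assumption about the format `parse_rules` expects, not something this diff confirms.

```python
import os

# Opt in to the RAGAS faithfulness evaluator via the env vars introduced by
# this patch. ASSUMPTION: the sampling-rules value is a JSON list of objects
# with "sample_rate" and "evaluator_label" keys.
os.environ["DD_LLMOBS_EVALUATORS"] = "ragas_faithfulness"
os.environ["DD_LLMOBS_EVALUATOR_SAMPLING_RULES"] = (
    '[{"sample_rate": 0.5, "evaluator_label": "ragas_faithfulness"}]'
)

from ddtrace.llmobs import LLMObs

# "my-rag-app" is a placeholder ml_app name; the evaluator runner starts
# alongside the LLMObs service when evaluators are configured.
LLMObs.enable(ml_app="my-rag-app")
```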
diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py
index a2c4278297c..1c941c52d83 100644
--- a/tests/llmobs/test_llmobs_evaluator_runner.py
+++ b/tests/llmobs/test_llmobs_evaluator_runner.py
@@ -120,7 +120,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subprocess):
 
 
 def test_evaluator_runner_unsupported_evaluator():
-    with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
+    with override_env({EvaluatorRunner.EVALUATORS_ENV_VAR: "unsupported"}):
         with pytest.raises(ValueError):
             EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
 
diff --git a/tests/llmobs/test_llmobs_ragas_evaluators.py b/tests/llmobs/test_llmobs_ragas_evaluators.py
index 9766c18c1e5..c46dce740c2 100644
--- a/tests/llmobs/test_llmobs_ragas_evaluators.py
+++ b/tests/llmobs/test_llmobs_ragas_evaluators.py
@@ -6,7 +6,8 @@
 from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
 from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
 from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
-from ddtrace.trace import Span
+from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
+from ddtrace.span import Span
 from tests.llmobs._utils import _expected_llmobs_llm_span_event
 from tests.llmobs._utils import _expected_ragas_answer_relevancy_spans
 from tests.llmobs._utils import _expected_ragas_context_precision_spans
@@ -235,7 +236,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs):
             "PYTHONPATH": ":".join(pypath),
             "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
             "_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
-            "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
+            EvaluatorRunner.EVALUATORS_ENV_VAR: "ragas_faithfulness",
             "DD_TRACE_ENABLED": "0",
         }
     )
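The test changes above all follow one pattern: reference the shared `EvaluatorRunner.EVALUATORS_ENV_VAR` constant instead of re-hardcoding the variable name, so another rename only touches the runner module. A minimal standalone sketch of that pattern, using pytest's `monkeypatch` rather than this suite's `override_env` helper (the expected `ValueError` for an unknown label comes from the runner test above; the test name is illustrative):

```python
from unittest import mock

import pytest

from ddtrace.llmobs._evaluators.runner import EvaluatorRunner


def test_unknown_evaluator_label_raises(monkeypatch):
    # The env var name lives in one place: the runner's class constant.
    monkeypatch.setenv(EvaluatorRunner.EVALUATORS_ENV_VAR, "unsupported")
    # Constructing the runner parses the env var and rejects unknown labels.
    with pytest.raises(ValueError):
        EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
```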
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
index ff099ae3f71..2fe3e1fbfab 100644
--- a/tests/llmobs/test_llmobs_service.py
+++ b/tests/llmobs/test_llmobs_service.py
@@ -1384,7 +1384,7 @@ def test_llmobs_fork_recreates_and_restarts_eval_metric_writer():
 def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluator):
     """Test that forking a process correctly recreates and restarts the EvaluatorRunner."""
     pytest.importorskip("ragas")
-    with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+    with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
         with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
             llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app")
             original_pid = llmobs_service._instance.tracer._pid
@@ -1464,9 +1464,9 @@ def test_llmobs_fork_submit_evaluation(monkeypatch):
 
 def test_llmobs_fork_evaluator_runner_run(monkeypatch):
     """Test that forking a process correctly encodes new spans created in each process."""
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
+    monkeypatch.setenv("DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
     pytest.importorskip("ragas")
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
+    monkeypatch.setenv("DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
     with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
         llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app", api_key="test_api_key")
         pid = os.fork()
@@ -1757,7 +1757,7 @@ async def test_annotation_context_async_nested(llmobs):
 def test_service_enable_starts_evaluator_runner_when_evaluators_exist():
     pytest.importorskip("ragas")
     with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")):
-        with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+        with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
             dummy_tracer = DummyTracer()
             llmobs_service.enable(_tracer=dummy_tracer)
             llmobs_instance = llmobs_service._instance
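Since both deprecations in the release note are pure renames, a hypothetical startup shim (not part of this patch, stdlib only) could keep an unmigrated deployment working until the private names are removed in ddtrace 3.0.0:

```python
import os
import warnings

# Hypothetical migration shim: map the deprecated private env vars from the
# release note onto their public replacements, warning as we go.
_RENAMES = {
    "_DD_LLMOBS_EVALUATORS": "DD_LLMOBS_EVALUATORS",
    "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES": "DD_LLMOBS_EVALUATOR_SAMPLING_RULES",
}

for _old, _new in _RENAMES.items():
    if _old in os.environ and _new not in os.environ:
        warnings.warn("%s is deprecated; set %s instead" % (_old, _new), DeprecationWarning)
        os.environ[_new] = os.environ[_old]
```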