feat(llmobs): introduce ragas eval integration (#12143)
Publicize the RAGAS integration

[RAGAS](https://docs.ragas.io/en/stable/getstarted/index.html) is an
evaluation framework for RAG (retrieval-augmented generation) applications. The integration supports
evaluating LLM inferences with three RAGAS metrics (see the setup sketch below):
- faithfulness
- answer relevancy
- context precision

Duplicate of #11939, reopened to land in ddtrace 3.0.
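As a quick illustration, a minimal setup sketch (not part of this change): it assumes a service with `ddtrace`, `ragas`, and an OpenAI key available; `ragas_faithfulness` is the label used in this PR's tests, and the `ml_app` name is made up.

```python
import os

# Comma-separated evaluator labels. "ragas_faithfulness" appears in this
# PR's tests; the other labels are assumed from the release note.
os.environ["DD_LLMOBS_EVALUATORS"] = "ragas_faithfulness"

from ddtrace.llmobs import LLMObs

# Once enabled, finished LLM spans are buffered and periodically scored by
# the configured RAGAS evaluators; results appear as evaluation metrics.
LLMObs.enable(ml_app="my-rag-app")
```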

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing
strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met 
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance
implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan <[email protected]>
Authored by lievan on Jan 31, 2025
1 parent 19f2378 · commit 7257296
Showing 6 changed files with 35 additions and 10 deletions.
4 changes: 3 additions & 1 deletion ddtrace/llmobs/_evaluators/runner.py

```diff
@@ -32,6 +32,8 @@ class EvaluatorRunner(PeriodicService):
     2. triggers evaluator runs over buffered finished spans on each `periodic` call
     """
 
+    EVALUATORS_ENV_VAR = "DD_LLMOBS_EVALUATORS"
+
     def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         super(EvaluatorRunner, self).__init__(interval=interval)
         self._lock = forksafe.RLock()
@@ -46,7 +48,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         if len(self.evaluators) > 0:
             return
 
-        evaluator_str = os.getenv("_DD_LLMOBS_EVALUATORS")
+        evaluator_str = os.getenv(self.EVALUATORS_ENV_VAR)
         if evaluator_str is None:
             return
```
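For context, a hedged sketch of how the renamed variable is consumed; the comma-separated parsing is an assumption about `EvaluatorRunner`'s behavior and is not shown in this diff:

```python
import os

# Mirrors the class constant introduced above; the comma-separated format
# is an assumption about EvaluatorRunner's parsing, not part of this hunk.
EVALUATORS_ENV_VAR = "DD_LLMOBS_EVALUATORS"
os.environ[EVALUATORS_ENV_VAR] = "ragas_faithfulness,ragas_answer_relevancy"

evaluator_str = os.getenv(EVALUATORS_ENV_VAR)
evaluator_labels = evaluator_str.split(",") if evaluator_str else []
print(evaluator_labels)  # ['ragas_faithfulness', 'ragas_answer_relevancy']
```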
5 changes: 3 additions & 2 deletions ddtrace/llmobs/_evaluators/sampler.py

```diff
@@ -46,7 +46,7 @@ def __repr__(self):
 
 
 class EvaluatorRunnerSampler:
-    SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
+    SAMPLING_RULES_ENV_VAR = "DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
 
     def __init__(self):
         self.rules = self.parse_rules()
@@ -59,8 +59,9 @@ def sample(self, evaluator_label, span):
 
     def parse_rules(self) -> List[EvaluatorRunnerSamplingRule]:
         rules = []
+
         sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR)
-        telemetry_writer.add_configuration("_DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")
+        telemetry_writer.add_configuration(self.SAMPLING_RULES_ENV_VAR, sampling_rules_str, origin="env")
 
         def parsing_failed_because(msg, maybe_throw_this):
             telemetry_writer.add_log(
```
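The renamed sampling-rules variable takes a JSON array of rules. A hedged configuration sketch; the rule keys (`sample_rate`, `evaluator_label`, `span_name`) follow the LLM Observability docs and do not appear in this diff:

```python
import json
import os

# One illustrative rule: score only 50% of spans named "augmented_generation"
# with the faithfulness evaluator. Keys and semantics are assumptions drawn
# from the docs, not from this change.
rules = [
    {
        "sample_rate": 0.5,
        "evaluator_label": "ragas_faithfulness",
        "span_name": "augmented_generation",
    }
]
os.environ["DD_LLMOBS_EVALUATOR_SAMPLING_RULES"] = json.dumps(rules)
```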
21 changes: 21 additions & 0 deletions releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml

```diff
@@ -0,0 +1,21 @@
+---
+features:
+  - |
+    LLM Observability: This introduces an integration with the `RAGAS <https://docs.ragas.io/en/stable/>`_ evaluation framework to continuously monitor
+    the performance of context-augmented LLM generations in production.
+    The integration supports evaluating LLM inferences with the following RAGAS metrics:
+
+    - `Faithfulness <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/>`_: measures if the LLM response is faithful to the provided context.
+    - `Answer Relevancy <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/>`_: measures how relevant the LLM response is to the user input.
+    - `Context Precision <https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/>`_: measures how effectively the context is used in the generated response.
+
+    To learn more, see the `LLM Observability evaluations guide <https://docs.datadoghq.com/llm_observability/submit_evaluations/>`_.
+deprecations:
+  - |
+    LLM Observability: The `_DD_LLMOBS_EVALUATORS` environment variable is deprecated and will be removed in ddtrace 3.0.0.
+    As an alternative to `_DD_LLMOBS_EVALUATORS`, you can use `DD_LLMOBS_EVALUATORS` instead.
+    To migrate, replace `_DD_LLMOBS_EVALUATORS` with `DD_LLMOBS_EVALUATORS`.
+  - |
+    LLM Observability: The `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES` environment variable is deprecated and will be removed in ddtrace 3.0.0.
+    As an alternative to `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES`, you can use `DD_LLMOBS_EVALUATOR_SAMPLING_RULES` instead.
+    To migrate, replace `_DD_LLMOBS_EVALUATOR_SAMPLING_RULES` with `DD_LLMOBS_EVALUATOR_SAMPLING_RULES`.
```
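A hedged migration sketch covering both deprecations; whether ddtrace still honors the old names during the deprecation window is not shown in this diff, so this copies values explicitly before configuration is read:

```python
import os

# Copy values from the deprecated names to the new ones, preferring an
# explicitly set new name. Run before ddtrace reads its configuration.
RENAMES = {
    "_DD_LLMOBS_EVALUATORS": "DD_LLMOBS_EVALUATORS",
    "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES": "DD_LLMOBS_EVALUATOR_SAMPLING_RULES",
}
for old, new in RENAMES.items():
    if old in os.environ and new not in os.environ:
        os.environ[new] = os.environ.pop(old)
```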
2 changes: 1 addition & 1 deletion tests/llmobs/test_llmobs_evaluator_runner.py

```diff
@@ -120,7 +120,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subprocess):
 
 
 def test_evaluator_runner_unsupported_evaluator():
-    with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
+    with override_env({EvaluatorRunner.EVALUATORS_ENV_VAR: "unsupported"}):
         with pytest.raises(ValueError):
             EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
```
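The same check, rewritten as a standalone hedged sketch using pytest's built-in `monkeypatch` instead of the repo's `override_env` helper:

```python
from unittest import mock

import pytest

from ddtrace.llmobs._evaluators.runner import EvaluatorRunner


def test_unknown_evaluator_label_rejected(monkeypatch):
    # Mirrors the diffed test: an unrecognized label in the evaluators env
    # var should make EvaluatorRunner raise ValueError at construction.
    monkeypatch.setenv(EvaluatorRunner.EVALUATORS_ENV_VAR, "unsupported")
    with pytest.raises(ValueError):
        EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
```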
5 changes: 3 additions & 2 deletions tests/llmobs/test_llmobs_ragas_evaluators.py

```diff
@@ -6,7 +6,8 @@
 from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
 from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
 from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
-from ddtrace.trace import Span
+from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
+from ddtrace.span import Span
 from tests.llmobs._utils import _expected_llmobs_llm_span_event
 from tests.llmobs._utils import _expected_ragas_answer_relevancy_spans
 from tests.llmobs._utils import _expected_ragas_context_precision_spans
@@ -235,7 +236,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs, run_python_code_in_subprocess):
             "PYTHONPATH": ":".join(pypath),
             "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
             "_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
-            "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
+            EvaluatorRunner.EVALUATORS_ENV_VAR: "ragas_faithfulness",
             "DD_TRACE_ENABLED": "0",
         }
     )
```
8 changes: 4 additions & 4 deletions tests/llmobs/test_llmobs_service.py

```diff
@@ -1384,7 +1384,7 @@ def test_llmobs_fork_recreates_and_restarts_eval_metric_writer():
 def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluator):
     """Test that forking a process correctly recreates and restarts the EvaluatorRunner."""
     pytest.importorskip("ragas")
-    with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+    with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
         with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
             llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app")
             original_pid = llmobs_service._instance.tracer._pid
@@ -1464,9 +1464,9 @@ def test_llmobs_fork_submit_evaluation(monkeypatch):
 
 def test_llmobs_fork_evaluator_runner_run(monkeypatch):
     """Test that forking a process correctly encodes new spans created in each process."""
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
+    monkeypatch.setenv("DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
     pytest.importorskip("ragas")
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
+    monkeypatch.setenv("DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
     with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
         llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app", api_key="test_api_key")
         pid = os.fork()
@@ -1757,7 +1757,7 @@ async def test_annotation_context_async_nested(llmobs):
 def test_service_enable_starts_evaluator_runner_when_evaluators_exist():
     pytest.importorskip("ragas")
     with override_global_config(dict(_dd_api_key="<not-a-real-api-key>", _llmobs_ml_app="<ml-app-name>")):
-        with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+        with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
            dummy_tracer = DummyTracer()
            llmobs_service.enable(_tracer=dummy_tracer)
            llmobs_instance = llmobs_service._instance
```
