Config Loading from YAML & README Update #778

Open · wants to merge 3 commits into base: main
24 changes: 24 additions & 0 deletions README.md
@@ -194,6 +194,30 @@ print(answer_relevancy_metric.score)
print(answer_relevancy_metric.reason)
```

## Configurable Evaluation Metrics with YAML

With DeepEval, you can specify and manage your evaluation metrics in a YAML configuration file, so you can adjust metric parameters without modifying your code.

Refer to the `config.yaml` file for guidance on configuring your metrics.

```python
from deepeval import evaluate
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    # Replace this with the actual output from your LLM application
    actual_output="We offer a 30-day full refund at no extra costs.",
    # Replace this with the expected output from your RAG generator
    expected_output="You are eligible for a 30 day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."],
)
# Metrics are defined in config.yaml and loaded at evaluation time
evaluate([test_case], config_path="config.yaml")
```

Note that some metrics are for RAG pipelines, while others are for fine-tuning. Make sure to use our docs to pick the right one for your use case.
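
If you prefer to drive everything from Python, the `MetricsLoader` introduced in this PR can also be used directly; the sketch below assumes the `config.yaml` shown later in this diff and reuses the test case from the snippet above.

```python
from deepeval.metrics import MetricsLoader
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra costs.",
    expected_output="You are eligible for a 30 day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."],
)

# Build metric objects from the YAML file, then run each one against the test case
loader = MetricsLoader(config_path="config.yaml")
results = loader.evaluate(test_case)  # {metric_name: measure() output, or an error entry}
for name, outcome in results.items():
    print(name, outcome)
```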

## Evaluating a Dataset / Test Cases in Bulk
29 changes: 29 additions & 0 deletions config.yaml
@@ -0,0 +1,29 @@
metrics:
  answer_relevancy:
    threshold: 0.7
    model: gpt-4-turbo
> **@nicoeiris11** commented on May 27, 2024:
> Isn't deepeval using gpt4o by default?
    include_reason: true
  faithfulness:
    threshold: 0.7
    model: gpt-4-turbo
    include_reason: true
  contextual_precision:
    threshold: 0.7
    model: gpt-4-turbo
    include_reason: true
  contextual_recall:
    threshold: 0.7
    model: gpt-4-turbo
    include_reason: true
  contextual_relevancy:
    threshold: 0.7
    model: gpt-4-turbo
    include_reason: true
  geval:
    name: geval
    threshold: 0.5
    model: gpt-4-turbo
    criteria: "Coherence - determine if the actual output is coherent with the input."
    evaluation_params:
      - input
      - actual_output
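
For the `geval` entry above, the loader turns each string in `evaluation_params` into the matching `LLMTestCaseParams` member and passes the remaining keys straight to `GEval`; roughly equivalent to the hand-written version below (a sketch, not part of the diff).

```python
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# "input" -> LLMTestCaseParams.INPUT, "actual_output" -> LLMTestCaseParams.ACTUAL_OUTPUT
evaluation_params = [
    getattr(LLMTestCaseParams, name.upper()) for name in ["input", "actual_output"]
]

coherence_metric = GEval(
    name="geval",
    threshold=0.5,
    model="gpt-4-turbo",
    criteria="Coherence - determine if the actual output is coherent with the input.",
    evaluation_params=evaluation_params,
)
```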
14 changes: 12 additions & 2 deletions deepeval/evaluate.py
@@ -11,7 +11,7 @@
    should_use_cache,
)
from deepeval.telemetry import capture_evaluation_run
from deepeval.metrics import BaseMetric
from deepeval.metrics import BaseMetric, MetricsLoader
from deepeval.metrics.indicator import (
    measure_metrics_with_indicator,
)
@@ -418,15 +418,25 @@ def assert_test(

def evaluate(
    test_cases: List[Union[LLMTestCase, ConversationalTestCase]],
    metrics: List[BaseMetric],
    metrics: Optional[List[BaseMetric]] = None,
    hyperparameters: Optional[Dict[str, Union[str, int, float]]] = None,
    run_async: bool = True,
    show_indicator: bool = True,
    print_results: bool = True,
    write_cache: bool = True,
    use_cache: bool = False,
    ignore_errors: bool = False,
    config_path: Optional[str] = None
):
    if not metrics and not config_path:
        raise ValueError("Either metrics or a config path must be provided")
    if not metrics:
        # No metric objects passed in: build them from the YAML config instead
        print("Loading metrics from config.")
        metrics_loader = MetricsLoader(config_path=config_path)
        metrics = metrics_loader.get_metrics_list()

    if hyperparameters is not None:
        if (
            hyperparameters.get("model") is None
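With the new signature, both call styles below should remain valid (a sketch, assuming a `test_case` built as in the README example):

```python
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric

# Explicit metrics, as before; config_path is only used when no metrics are passed
evaluate([test_case], metrics=[AnswerRelevancyMetric(threshold=0.7)])

# Metrics resolved from config.yaml via MetricsLoader
evaluate([test_case], config_path="config.yaml")

# Passing neither raises:
# ValueError: Either metrics or a config path must be provided
```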
1 change: 1 addition & 0 deletions deepeval/metrics/__init__.py
@@ -12,6 +12,7 @@
from .contextual_precision.contextual_precision import ContextualPrecisionMetric
from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric

from .loader import MetricsLoader
# from .ragas_metric import (
# RagasMetric,
# RAGASAnswerRelevancyMetric,
127 changes: 127 additions & 0 deletions deepeval/metrics/loader.py
@@ -0,0 +1,127 @@
from deepeval.test_case import LLMTestCaseParams, LLMTestCase
from .registry import metric_class_mapping
from .utils import ConfigLoader
from .base_metric import BaseMetric
from typing import List

class MetricsLoader:
    def __init__(self, config_path=None, metrics=None):
        """
        Initialize a MetricsLoader instance.

        Args:
            config_path (str, optional): Path to a YAML config file.
            metrics (dict, optional): Pre-built metric objects to evaluate, keyed by metric name.

        Raises:
            ValueError: If neither config_path nor metrics is provided.
        """
        if config_path is None and metrics is None:
            raise ValueError("Either config_path or metrics must be provided")

        self.config_loader = None

        if config_path is not None:
            self.config_loader = ConfigLoader(config_path)
            self.metrics = self.initialize_metrics()
        else:
            self.metrics = metrics

    def initialize_metrics(self) -> dict:
        """
        Initialize metrics from config file.

        Initializes metrics for evaluation based on the configuration
        provided in the config file. The configuration is expected to be a dictionary
        where the keys are the names of the metrics and the values are dictionaries
        containing the configuration for the metric.

        Returns:
            dict: A dictionary containing the initialized metrics `{metric_name: metric object}`.
        """
        metrics_config = self.config_loader.get_metrics_config()
        metrics = {}
        for metric_name, config in metrics_config.items():
            # Map evaluation_params from config to LLMTestCaseParams
            evaluation_params = config.pop("evaluation_params", [])
            if not isinstance(evaluation_params, list):
                raise ValueError(
                    f"Invalid configuration for metric '{metric_name}'. "
                    f"'evaluation_params' must be a list. Check the metric registry for valid configuration."
                )
            # For handling multiple evaluation_params provided for some metrics (e.g. geval)
            mapped_params = []
            for param in evaluation_params:
                try:
                    # Convert the string param to the corresponding LLMTestCaseParams enum
                    mapped_param = getattr(LLMTestCaseParams, param.upper(), None)
                    if mapped_param is None:
                        raise ValueError(
                            f"Invalid evaluation param '{param}' for metric '{metric_name}'. "
                            f"Check the LLMTestCaseParams enum for valid values."
                        )
                    mapped_params.append(mapped_param)
                except AttributeError:
                    raise ValueError(
                        f"Invalid evaluation param '{param}' for metric '{metric_name}'. "
                        f"Check the LLMTestCaseParams enum for valid values."
                    )
            if mapped_params:
                config["evaluation_params"] = mapped_params
            if metric_name in metric_class_mapping:
                MetricClass = metric_class_mapping[metric_name]
                try:
                    metrics[metric_name] = MetricClass(**config)
                except TypeError:
                    raise ValueError(
                        f"Invalid configuration for metric '{metric_name}'. "
                        f"Check the metric registry for valid configuration."
                    )
            else:
                raise ValueError(
                    f"No metric class found for '{metric_name}'. Check the metric registry."
                )

        return metrics

    def evaluate(self, test_case: LLMTestCase) -> dict:
        """
        Evaluates the given test case using all the metrics in the metrics dictionary.

        Returns:
            dict: A dictionary mapping each metric name to the output of `metric.measure()`,
            or to `{'error': ..., 'success': False}` if the metric raised an exception.
        """
        results = {}
        for metric_name, metric in self.metrics.items():
            try:
                result = metric.measure(test_case)
                results[metric_name] = result
            except Exception as e:
                results[metric_name] = {
                    'error': str(e),
                    'success': False,
                }
        return results

    def get_metrics_list(self) -> List[BaseMetric]:
        """
        Retrieves the metric objects held by this MetricsLoader instance.

        Returns:
            list: A list of metric objects.
        """
        return list(self.metrics.values())
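
For completeness, the second construction path takes pre-built metric objects keyed by name instead of a config file; a minimal sketch (assuming the `__init__` validation above accepts a metrics-only call):

```python
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, MetricsLoader

loader = MetricsLoader(
    metrics={
        "answer_relevancy": AnswerRelevancyMetric(threshold=0.7),
        "faithfulness": FaithfulnessMetric(threshold=0.7),
    }
)
print(loader.get_metrics_list())  # -> [AnswerRelevancyMetric(...), FaithfulnessMetric(...)]
```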
27 changes: 27 additions & 0 deletions deepeval/metrics/registry.py
@@ -0,0 +1,27 @@
# Import metric classes from their respective modules
from .answer_relevancy.answer_relevancy import AnswerRelevancyMetric
from .faithfulness.faithfulness import FaithfulnessMetric
from .contextual_recall.contextual_recall import ContextualRecallMetric
from .contextual_relevancy.contextual_relevancy import ContextualRelevancyMetric
from .contextual_precision.contextual_precision import ContextualPrecisionMetric
from .g_eval.g_eval import GEval
from .bias.bias import BiasMetric
from .toxicity.toxicity import ToxicityMetric
from .hallucination.hallucination import HallucinationMetric
from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric
from .summarization.summarization import SummarizationMetric

# Define a dictionary mapping from metric names to metric classes
metric_class_mapping = {
    'answer_relevancy': AnswerRelevancyMetric,
    'faithfulness': FaithfulnessMetric,
    'contextual_recall': ContextualRecallMetric,
    'contextual_relevancy': ContextualRelevancyMetric,
    'contextual_precision': ContextualPrecisionMetric,
    'geval': GEval,
    'bias': BiasMetric,
    'toxicity': ToxicityMetric,
    'hallucination': HallucinationMetric,
    'knowledge_retention': KnowledgeRetentionMetric,
    'summarization': SummarizationMetric,
}
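
Since the registry is a plain dictionary, the loader only has to look up the YAML key and instantiate the class with the remaining keys as keyword arguments; roughly (illustrative sketch):

```python
from deepeval.metrics.registry import metric_class_mapping

config = {"threshold": 0.7, "model": "gpt-4-turbo", "include_reason": True}
MetricClass = metric_class_mapping["answer_relevancy"]  # -> AnswerRelevancyMetric
metric = MetricClass(**config)
```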
10 changes: 10 additions & 0 deletions deepeval/metrics/utils.py
@@ -1,4 +1,5 @@
import json
import yaml
from typing import Any, Optional, List, Union, Tuple
from deepeval.models import GPTModel, DeepEvalBaseLLM

@@ -79,3 +80,12 @@ def initialize_model(
        return model, False
    # Otherwise (the model is a string or None), we initialize a GPTModel and use as a native model
    return GPTModel(model=model), True


class ConfigLoader:
    def __init__(self, config_path):
        with open(config_path, 'r') as file:
            self.config = yaml.load(file, Loader=yaml.FullLoader)

    def get_metrics_config(self):
        return self.config.get('metrics', {})
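
Reading the example `config.yaml` with this helper yields the nested dictionary that `MetricsLoader` iterates over; a minimal sketch:

```python
from deepeval.metrics.utils import ConfigLoader

loader = ConfigLoader("config.yaml")
metrics_config = loader.get_metrics_config()
# e.g. {'answer_relevancy': {'threshold': 0.7, 'model': 'gpt-4-turbo', 'include_reason': True}, ...}
for name, kwargs in metrics_config.items():
    print(name, kwargs)
```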