Config Loading from YAML & README Update #778

Open · wants to merge 3 commits into base: main
24 changes: 24 additions & 0 deletions README.md
@@ -194,6 +194,30 @@ print(answer_relevancy_metric.score)
print(answer_relevancy_metric.reason)
```

## Configurable Evaluation Metrics with YAML

With DeepEval, you can specify and manage your evaluation metrics in a YAML configuration file, so you can adjust metric parameters without modifying your code.

Refer to the `config.yaml` file for guidance on configuring your metrics.

```python
from deepeval import evaluate
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    # Replace this with the actual output from your LLM application
    actual_output="We offer a 30-day full refund at no extra costs.",
    # Replace this with the expected output from your RAG generator
    expected_output="You are eligible for a 30 day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."],
)
# Metrics are defined in config.yaml and loaded at evaluation time
evaluate([test_case], config_path="config.yaml")
```

Note that some metrics are for RAG pipelines, while others are for fine-tuning. Make sure to use our docs to pick the right one for your use case.
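
If you prefer to drive everything from Python, the `MetricsLoader` introduced in this PR can also be used directly; the sketch below assumes the `config.yaml` shown later in this diff and reuses the test case from the snippet above.

```python
from deepeval.metrics import MetricsLoader
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra costs.",
    expected_output="You are eligible for a 30 day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."],
)

# Build metric objects from the YAML file, then run each one against the test case
loader = MetricsLoader(config_path="config.yaml")
results = loader.evaluate(test_case)  # {metric_name: measure() output, or an error entry}
for name, outcome in results.items():
    print(name, outcome)
```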

## Evaluating a Dataset / Test Cases in Bulk
29 changes: 29 additions & 0 deletions config.yaml
@@ -0,0 +1,29 @@
metrics:
  answer_relevancy:
    threshold: 0.7
    model: gpt-4-turbo
> **@nicoeiris11** commented on May 27, 2024:
> Isn't deepeval using gpt4o by default?
    include_reason: true
  faithfulness:
    threshold: 0.7
    model: gpt-4-turbo
    include_reason: true
  contextual_precision:
    threshold: 0.7
    model: gpt-4-turbo
    include_reason: true
  contextual_recall:
    threshold: 0.7
    model: gpt-4-turbo
    include_reason: true
  contextual_relevancy:
    threshold: 0.7
    model: gpt-4-turbo
    include_reason: true
  geval:
    name: geval
    threshold: 0.5
    model: gpt-4-turbo
    criteria: "Coherence - determine if the actual output is coherent with the input."
    evaluation_params:
      - input
      - actual_output
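
For the `geval` entry above, the loader turns each string in `evaluation_params` into the matching `LLMTestCaseParams` member and passes the remaining keys straight to `GEval`; roughly equivalent to the hand-written version below (a sketch, not part of the diff).

```python
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# "input" -> LLMTestCaseParams.INPUT, "actual_output" -> LLMTestCaseParams.ACTUAL_OUTPUT
evaluation_params = [
    getattr(LLMTestCaseParams, name.upper()) for name in ["input", "actual_output"]
]

coherence_metric = GEval(
    name="geval",
    threshold=0.5,
    model="gpt-4-turbo",
    criteria="Coherence - determine if the actual output is coherent with the input.",
    evaluation_params=evaluation_params,
)
```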
14 changes: 12 additions & 2 deletions deepeval/evaluate.py
@@ -11,7 +11,7 @@
    should_use_cache,
)
from deepeval.telemetry import capture_evaluation_run
from deepeval.metrics import BaseMetric
from deepeval.metrics import BaseMetric, MetricsLoader
from deepeval.metrics.indicator import (
    measure_metrics_with_indicator,
)
@@ -418,15 +418,25 @@ def assert_test(

def evaluate(
    test_cases: List[Union[LLMTestCase, ConversationalTestCase]],
    metrics: List[BaseMetric],
    metrics: Optional[List[BaseMetric]] = None,
    hyperparameters: Optional[Dict[str, Union[str, int, float]]] = None,
    run_async: bool = True,
    show_indicator: bool = True,
    print_results: bool = True,
    write_cache: bool = True,
    use_cache: bool = False,
    ignore_errors: bool = False,
    config_path: Optional[str] = None
):
    if not metrics and not config_path:
        raise ValueError("Either metrics or a config path must be provided")
    if not metrics:
        # No metric objects passed in: build them from the YAML config instead
        print("Loading metrics from config.")
        metrics_loader = MetricsLoader(config_path=config_path)
        metrics = metrics_loader.get_metrics_list()

    if hyperparameters is not None:
        if (
            hyperparameters.get("model") is None
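With the new signature, both call styles below should remain valid (a sketch, assuming a `test_case` built as in the README example):

```python
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric

# Explicit metrics, as before; config_path is only used when no metrics are passed
evaluate([test_case], metrics=[AnswerRelevancyMetric(threshold=0.7)])

# Metrics resolved from config.yaml via MetricsLoader
evaluate([test_case], config_path="config.yaml")

# Passing neither raises:
# ValueError: Either metrics or a config path must be provided
```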
1 change: 1 addition & 0 deletions deepeval/metrics/__init__.py
@@ -12,6 +12,7 @@
from .contextual_precision.contextual_precision import ContextualPrecisionMetric
from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric

from .loader import MetricsLoader
# from .ragas_metric import (
# RagasMetric,
# RAGASAnswerRelevancyMetric,
127 changes: 127 additions & 0 deletions deepeval/metrics/loader.py
@@ -0,0 +1,127 @@
from deepeval.test_case import LLMTestCaseParams, LLMTestCase
from .registry import metric_class_mapping
from .utils import ConfigLoader
from .base_metric import BaseMetric
from typing import List

class MetricsLoader:
    def __init__(self, config_path=None, metrics=None):
        """
        Initialize a MetricsLoader instance.

        Args:
            config_path (str, optional): Path to a YAML config file.
            metrics (dict, optional): Pre-built metric objects to evaluate, keyed by metric name.

        Raises:
            ValueError: If neither config_path nor metrics is provided.
        """
        if config_path is None and metrics is None:
            raise ValueError("Either config_path or metrics must be provided")

        self.config_loader = None

        if config_path is not None:
            self.config_loader = ConfigLoader(config_path)
            self.metrics = self.initialize_metrics()
        else:
            self.metrics = metrics

    def initialize_metrics(self) -> dict:
        """
        Initialize metrics from config file.

        Initializes metrics for evaluation based on the configuration
        provided in the config file. The configuration is expected to be a dictionary
        where the keys are the names of the metrics and the values are dictionaries
        containing the configuration for the metric.

        Returns:
            dict: A dictionary containing the initialized metrics `{metric_name: metric object}`.
        """
        metrics_config = self.config_loader.get_metrics_config()
        metrics = {}
        for metric_name, config in metrics_config.items():
            # Map evaluation_params from config to LLMTestCaseParams
            evaluation_params = config.pop("evaluation_params", [])
            if not isinstance(evaluation_params, list):
                raise ValueError(
                    f"Invalid configuration for metric '{metric_name}'. "
                    f"'evaluation_params' must be a list. Check the metric registry for valid configuration."
                )
            # For handling multiple evaluation_params provided for some metrics (e.g. geval)
            mapped_params = []
            for param in evaluation_params:
                try:
                    # Convert the string param to the corresponding LLMTestCaseParams enum
                    mapped_param = getattr(LLMTestCaseParams, param.upper(), None)
                    if mapped_param is None:
                        raise ValueError(
                            f"Invalid evaluation param '{param}' for metric '{metric_name}'. "
                            f"Check the LLMTestCaseParams enum for valid values."
                        )
                    mapped_params.append(mapped_param)
                except AttributeError:
                    raise ValueError(
                        f"Invalid evaluation param '{param}' for metric '{metric_name}'. "
                        f"Check the LLMTestCaseParams enum for valid values."
                    )
            if mapped_params:
                config["evaluation_params"] = mapped_params
            if metric_name in metric_class_mapping:
                MetricClass = metric_class_mapping[metric_name]
                try:
                    metrics[metric_name] = MetricClass(**config)
                except TypeError:
                    raise ValueError(
                        f"Invalid configuration for metric '{metric_name}'. "
                        f"Check the metric registry for valid configuration."
                    )
            else:
                raise ValueError(
                    f"No metric class found for '{metric_name}'. Check the metric registry."
                )

        return metrics

    def evaluate(self, test_case: LLMTestCase) -> dict:
        """
        Evaluates the given test case using all the metrics in the metrics dictionary.

        Returns:
            dict: A dictionary mapping each metric name to the output of `metric.measure()`,
            or to `{'error': ..., 'success': False}` if the metric raised an exception.
        """
        results = {}
        for metric_name, metric in self.metrics.items():
            try:
                result = metric.measure(test_case)
                results[metric_name] = result
            except Exception as e:
                results[metric_name] = {
                    'error': str(e),
                    'success': False,
                }
        return results

    def get_metrics_list(self) -> List[BaseMetric]:
        """
        Retrieves the metric objects held by this MetricsLoader instance.

        Returns:
            list: A list of metric objects.
        """
        return list(self.metrics.values())
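
For completeness, the second construction path takes pre-built metric objects keyed by name instead of a config file; a minimal sketch (assuming the `__init__` validation above accepts a metrics-only call):

```python
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, MetricsLoader

loader = MetricsLoader(
    metrics={
        "answer_relevancy": AnswerRelevancyMetric(threshold=0.7),
        "faithfulness": FaithfulnessMetric(threshold=0.7),
    }
)
print(loader.get_metrics_list())  # -> [AnswerRelevancyMetric(...), FaithfulnessMetric(...)]
```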
27 changes: 27 additions & 0 deletions deepeval/metrics/registry.py
@@ -0,0 +1,27 @@
# Import metric classes from their respective modules
from .answer_relevancy.answer_relevancy import AnswerRelevancyMetric
from .faithfulness.faithfulness import FaithfulnessMetric
from .contextual_recall.contextual_recall import ContextualRecallMetric
from .contextual_relevancy.contextual_relevancy import ContextualRelevancyMetric
from .contextual_precision.contextual_precision import ContextualPrecisionMetric
from .g_eval.g_eval import GEval
from .bias.bias import BiasMetric
from .toxicity.toxicity import ToxicityMetric
from .hallucination.hallucination import HallucinationMetric
from .knowledge_retention.knowledge_retention import KnowledgeRetentionMetric
from .summarization.summarization import SummarizationMetric

# Define a dictionary mapping from metric names to metric classes
metric_class_mapping = {
    'answer_relevancy': AnswerRelevancyMetric,
    'faithfulness': FaithfulnessMetric,
    'contextual_recall': ContextualRecallMetric,
    'contextual_relevancy': ContextualRelevancyMetric,
    'contextual_precision': ContextualPrecisionMetric,
    'geval': GEval,
    'bias': BiasMetric,
    'toxicity': ToxicityMetric,
    'hallucination': HallucinationMetric,
    'knowledge_retention': KnowledgeRetentionMetric,
    'summarization': SummarizationMetric,
}
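
Since the registry is a plain dictionary, the loader only has to look up the YAML key and instantiate the class with the remaining keys as keyword arguments; roughly (illustrative sketch):

```python
from deepeval.metrics.registry import metric_class_mapping

config = {"threshold": 0.7, "model": "gpt-4-turbo", "include_reason": True}
MetricClass = metric_class_mapping["answer_relevancy"]  # -> AnswerRelevancyMetric
metric = MetricClass(**config)
```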
10 changes: 10 additions & 0 deletions deepeval/metrics/utils.py
@@ -1,4 +1,5 @@
import json
import yaml
from typing import Any, Optional, List, Union, Tuple
from deepeval.models import GPTModel, DeepEvalBaseLLM

@@ -79,3 +80,12 @@ def initialize_model(
        return model, False
    # Otherwise (the model is a string or None), we initialize a GPTModel and use as a native model
    return GPTModel(model=model), True


class ConfigLoader:
    def __init__(self, config_path):
        with open(config_path, 'r') as file:
            self.config = yaml.load(file, Loader=yaml.FullLoader)

    def get_metrics_config(self):
        return self.config.get('metrics', {})
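
Reading the example `config.yaml` with this helper yields the nested dictionary that `MetricsLoader` iterates over; a minimal sketch:

```python
from deepeval.metrics.utils import ConfigLoader

loader = ConfigLoader("config.yaml")
metrics_config = loader.get_metrics_config()
# e.g. {'answer_relevancy': {'threshold': 0.7, 'model': 'gpt-4-turbo', 'include_reason': True}, ...}
for name, kwargs in metrics_config.items():
    print(name, kwargs)
```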