From e2a27a72088187459a33a58213cd6514d8fbcc19 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 11 Nov 2024 12:02:47 +0100 Subject: [PATCH 01/57] Add swiss legal evals as new community tasks --- community_tasks/swiss_legal_evals.py | 402 +++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 community_tasks/swiss_legal_evals.py diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py new file mode 100644 index 000000000..5b03b8406 --- /dev/null +++ b/community_tasks/swiss_legal_evals.py @@ -0,0 +1,402 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# ruff: noqa: F405, F403, F401 +""" +This module contains task configurations and prompt functions for evaluating +LLM models on Swiss legal datasets. Each task is defined using the +`LightevalTaskConfig` class with its respective prompt function. The tasks +cover a variety of benchmarks, including: translation of laws, court decisions +and press releases. 
+ +Author: Joel Niklaus +""" +from lighteval.logging.hierarchical_logger import hlog_warn +import nltk + +nltk.download("punkt_tab") + + +import numpy as np +import re +from dataclasses import dataclass + + +from transformers import AutoTokenizer, AutoModelForSequenceClassification +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.metrics_sample import JudgeLLMMixEval, BertScore +from lighteval.metrics.utils.metric_utils import ( + MetricCategory, + MetricUseCase, + SampleLevelMetric, + SampleLevelMetricGrouping, +) +from lighteval.metrics.imports.bert_scorer import BERTScorer +from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip +from lighteval.tasks.extended.mix_eval.judge_prompts import ( + flow_judge_for_freeform_template, + gpt_judge_for_closeended_freeform, +) + + +# CUSTOM METRICS +def process_judge_response(x): + search = re.search(r"\s(\d)\s", x) + return int(search.group(1)) if search else 0 + + +def process_judge_response_freeform_gpt(x): + search = re.search(r"\[\[(\d.\d)\]\]", x) + answer = float(search.group(1) if search else 0) + return answer + + +def freeform_flow_judge(): + return SampleLevelMetricGrouping( + metric_name=["llm_judge_mixeval_flow"], + higher_is_better={"judge_score_flow": True}, + category=MetricCategory.LLM_AS_JUDGE, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=JudgeLLMMixEval( + judge_model_name="flowaicom/Flow-Judge-v0.1", + template=flow_judge_for_freeform_template, + process_judge_response=process_judge_response, + judge_backend="vllm", + short_judge_name="flow", + ).compute, + corpus_level_fn={ + "judge_score_flow": np.mean, + }, + ) + + +def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): + return SampleLevelMetricGrouping( + metric_name=[f"llm_judge_mixeval_{judge_model_name}"], + higher_is_better={"judge_score_{judge_model_name}": True}, + category=MetricCategory.LLM_AS_JUDGE, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=JudgeLLMMixEval( + judge_model_name=judge_model_name, + template=gpt_judge_for_closeended_freeform, + process_judge_response=process_judge_response_freeform_gpt, + judge_backend="openai", + short_judge_name=judge_model_name, + ).compute, + corpus_level_fn={ + f"judge_score_{judge_model_name}": np.mean, + }, + ) + + +def bert_score(model_type: str = "xlm-roberta-large"): + score = BertScore( + normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip + ) + score.bert_scorer = BERTScorer( + # We could download the files from here and set the baseline_path ourselves: + # https://github.com/Tiiiger/bert_score/tree/master/bert_score/rescale_baseline + model_type=model_type, + lang=None, # Needs to be set if rescale_with_baseline is True + rescale_with_baseline=False, + baseline_path=None, + ) + return SampleLevelMetricGrouping( + metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], + sample_level_fn=score.compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.SUMMARIZATION, + corpus_level_fn={ + "BERTScore-P": np.mean, + "BERTScore-R": np.mean, + "BERTScore-F": np.mean, + }, + higher_is_better={ + "BERTScore-P": True, + "BERTScore-R": True, + "BERTScore-F": True, + }, + ) + + +class BLEURT: + def __init__(self, model_size: str = "tiny", seq_len: int = 512): + """Creates a BLEURT scorer based on the model size (tiny, base, large) and sequence length (128, 512).""" + assert model_size in [ + "tiny", + "base", + 
"large", + ], "Model size must be either tiny, base, or large" + assert seq_len in [128, 512], "Sequence length must be either 128 or 512" + + self.tokenizer = AutoTokenizer.from_pretrained( + f"Elron/bleurt-{model_size}-{seq_len}" + ) + self.model = AutoModelForSequenceClassification.from_pretrained( + f"Elron/bleurt-{model_size}-{seq_len}" + ) + self.max_length = seq_len + self.model.eval() + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + """Uses the stored BLEURT scorer to compute the score on the current sample. + + Args: + golds (list[str]): Reference targets + predictions (list[str]): Predicted strings + + Returns: + float: Score over the current sample's items. + """ + if len(predictions) == 1: + predictions = predictions * len(golds) + inputs = self.tokenizer( + golds, + predictions, + return_tensors="pt", + padding=True, + truncation=True, + max_length=self.max_length, + ) + if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): + hlog_warn( + f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring" + ) + scores = self.model(**inputs)[0].squeeze() + return scores.item() + + +def bleurt(model_size: str = "tiny", seq_len: int = 512): + return SampleLevelMetric( + metric_name="bleurt", + sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +# EVALS WITH SUBSET +# This is how you create a subset task (like MMLU), which has several subset +# each being its own evaluation task. + + +def create_translation_pairs(langs_list: list) -> list[tuple]: + """ + Create all possible translation pairs from a given list of languages. + + Args: + langs_list (list): A list of languages. + + Returns: + lang_pairs_list (list): A list of tuples representing a translation pair. 
+ """ + lang_pairs_list = [] + for i, lang1 in enumerate(langs_list): + for lang2 in langs_list[i + 1 :]: + lang_pairs_list.append((lang1, lang2)) + lang_pairs_list.append((lang2, lang1)) + return lang_pairs_list + + +@dataclass +class LevelConfig: + name: str + text_col_name: str + prompt_prefix: str + metadata_cols: list[str] + + +@dataclass +class DatasetConfig: + name: str + hf_repo: str + languages: list[str] + subsets: dict[str, LevelConfig] + + def __post_init__(self): + self.translation_pairs = create_translation_pairs(self.languages) + + +SwissDecisionSummaryTranslations = DatasetConfig( + name="sdst", + hf_repo="joelniklaus/SwissDecisionSummaryTranslations", + languages=["de", "fr", "it"], + subsets={ + "bge_level": LevelConfig( + name="bge_level", + text_col_name="bgeText", + prompt_prefix="Consider the following summary of a Swiss leading court decision", + metadata_cols=["bge"], + ), + "regeste_level": LevelConfig( + name="regeste_level", + text_col_name="regesteText", + prompt_prefix="Consider the following paragraph of a summary of a Swiss leading court decision", + metadata_cols=["bge"], + ), + "text_level": LevelConfig( + name="text_level", + text_col_name="text", + prompt_prefix="Consider the following sentence of a summary of a Swiss leading court decision", + metadata_cols=["bge"], + ), + }, +) + + +SwissLawTranslations = DatasetConfig( + name="slt", + hf_repo="joelniklaus/SwissLawTranslations", + languages=["de", "fr", "it", "rm", "en"], + subsets={ + "law_level": LevelConfig( + name="law_level", + text_col_name="lawText", + prompt_prefix="Consider the following Swiss federal law", + metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + ), + "article_level": LevelConfig( + name="article_level", + text_col_name="articleText", + prompt_prefix="Consider the following Swiss federal law article", + metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + ), + "paragraph_level": LevelConfig( + name="paragraph_level", + text_col_name="paragraphText", + prompt_prefix="Consider the following Swiss federal law paragraph", + metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + ), + }, +) + +SwissSupremeCourtPressReleaseTranslations = DatasetConfig( + name="sscprt", + hf_repo="joelniklaus/SwissSupremeCourtPressReleaseTranslations", + languages=["de", "fr", "it"], + subsets={ + "press_release": LevelConfig( + name="press_release", + text_col_name="text", + prompt_prefix="Consider the following Swiss Supreme Court press release", + metadata_cols=["filename"], + ) + }, +) + + +def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str): + """ + Create a prompt function for a given level configuration. 
+ """ + text_col = level_config.text_col_name + src_text_col = f"{src_lang}_{text_col}" + target_text_col = f"{target_lang}_{text_col}" + + def prompt_fn(line: dict, task_name: str = None): + custom_query = f"{level_config.prompt_prefix}: {line[src_text_col]}\nTranslate from {src_lang} to {target_lang}.\nTranslation: " + + return Doc( + task_name=task_name, + query=custom_query, + choices=[str(line[target_text_col])], + gold_index=0, + specific={ + **{col: line[col] for col in level_config.metadata_cols}, + "question": custom_query, + }, + ) + + return prompt_fn + + +class TranslationTask(LightevalTaskConfig): + def __init__( + self, + dataset_config: DatasetConfig, + level_name: str, + src_lang: str, + target_lang: str, + ): + super().__init__( + name=f"{dataset_config.name}-{level_name}:{src_lang}-{target_lang}", + suite=["community"], + prompt_function=create_prompt_fn( + dataset_config.subsets[level_name], src_lang, target_lang + ), + hf_repo=dataset_config.hf_repo, + hf_subset=level_name, + hf_filter=None, + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], # ["validation", "test"], + few_shots_split="validation", + few_shots_select=None, + generation_size=10, + metric=[ + Metrics.bleu, + Metrics.bleu_1, + Metrics.bleu_4, + Metrics.chrf, + Metrics.ter, + bert_score(model_type="xlm-roberta-large"), + bleurt(model_size="tiny", seq_len=512), + freeform_gpt_judge(judge_model_name="gpt-4o"), + # freeform_flow_judge(), # TODO: Needs to be tested on GPU machine + # TODO: add prometheus eval + ], + stop_sequence=["\n"], + trust_dataset=True, + ) + + +# STORE YOUR EVALS + +# list of all the subsets to use for this eval +DATASETS = [ + SwissDecisionSummaryTranslations, + SwissLawTranslations, + SwissSupremeCourtPressReleaseTranslations, +] + +TASKS_TABLE = [ + TranslationTask( + dataset_config=dataset, + level_name=subset, + src_lang=src_lang, + target_lang=target_lang, + ) + for dataset in DATASETS + for subset in dataset.subsets + for src_lang, target_lang in dataset.translation_pairs +] + + +# MODULE LOGIC +# You should not need to touch this +# Convert to dict for lighteval +if __name__ == "__main__": + print(t.name for t in TASKS_TABLE) + print(len(TASKS_TABLE)) From aa409c831f7a882619ba89b55e76c254bdbea0c2 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 12 Nov 2024 10:34:14 +0100 Subject: [PATCH 02/57] Removed nltk and numpy dependencies. 
--- community_tasks/swiss_legal_evals.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 5b03b8406..9b0d8d45a 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -31,12 +31,8 @@ Author: Joel Niklaus """ from lighteval.logging.hierarchical_logger import hlog_warn -import nltk -nltk.download("punkt_tab") - - -import numpy as np +import statistics import re from dataclasses import dataclass @@ -86,7 +82,7 @@ def freeform_flow_judge(): short_judge_name="flow", ).compute, corpus_level_fn={ - "judge_score_flow": np.mean, + "judge_score_flow": statistics.mean, }, ) @@ -105,7 +101,7 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): short_judge_name=judge_model_name, ).compute, corpus_level_fn={ - f"judge_score_{judge_model_name}": np.mean, + f"judge_score_{judge_model_name}": statistics.mean, }, ) @@ -128,9 +124,9 @@ def bert_score(model_type: str = "xlm-roberta-large"): category=MetricCategory.GENERATIVE, use_case=MetricUseCase.SUMMARIZATION, corpus_level_fn={ - "BERTScore-P": np.mean, - "BERTScore-R": np.mean, - "BERTScore-F": np.mean, + "BERTScore-P": statistics.mean, + "BERTScore-R": statistics.mean, + "BERTScore-F": statistics.mean, }, higher_is_better={ "BERTScore-P": True, @@ -193,7 +189,7 @@ def bleurt(model_size: str = "tiny", seq_len: int = 512): sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=np.mean, + corpus_level_fn=statistics.mean, higher_is_better=True, ) From a8ee2a5cd37e82fdd8d5dfb12b46ffb7a2a3f0c5 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 12 Nov 2024 10:37:11 +0100 Subject: [PATCH 03/57] Added short dataset descriptions. --- community_tasks/swiss_legal_evals.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 9b0d8d45a..549391ce2 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -236,6 +236,7 @@ def __post_init__(self): self.translation_pairs = create_translation_pairs(self.languages) +# Translation of Swiss Federal Supreme Court Decision Summaries on three levels: the entire decision, the regeste level and the text level. SwissDecisionSummaryTranslations = DatasetConfig( name="sdst", hf_repo="joelniklaus/SwissDecisionSummaryTranslations", @@ -262,7 +263,7 @@ def __post_init__(self): }, ) - +# Translation of Swiss Federal Laws on three levels: the entire law, the article level and the paragraph level. SwissLawTranslations = DatasetConfig( name="slt", hf_repo="joelniklaus/SwissLawTranslations", @@ -289,6 +290,7 @@ def __post_init__(self): }, ) +# Translation of Swiss Federal Supreme Court Press Releases on one level: the entire press release. SwissSupremeCourtPressReleaseTranslations = DatasetConfig( name="sscprt", hf_repo="joelniklaus/SwissSupremeCourtPressReleaseTranslations", From c7f703807acef2edf234e999f817bd9f7c07c418 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 13 Nov 2024 16:00:32 +0100 Subject: [PATCH 04/57] Removed open judge models and added COMET and METEOR. 
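The COMET and METEOR metrics introduced below wrap the `unbabel-comet` and `nltk` libraries. The following standalone sketch (not part of the diff) shows the underlying library calls the new classes rely on, so they can be smoke-tested in isolation; the German/English sentences are invented and only the COMET checkpoint name is taken from the patch.

    # Standalone sketch of the library calls behind the new COMET and METEOR
    # metrics. Assumes `unbabel-comet` and `nltk` are installed; the example
    # sentences are invented.
    import nltk
    from comet import download_model, load_from_checkpoint
    from nltk import word_tokenize
    from nltk.translate.meteor_score import single_meteor_score

    nltk.download("punkt_tab")
    nltk.download("wordnet")

    src = "Der Vertrag ist nichtig."   # source segment
    hyp = "The contract is void."      # model translation
    ref = "A contract is void."        # golden translation

    # COMET is a neural metric that also conditions on the source sentence.
    comet_model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
    comet_out = comet_model.predict(
        [{"src": src, "mt": hyp, "ref": ref}], batch_size=1, gpus=0
    )
    print("COMET system score:", comet_out["system_score"])

    # METEOR is a surface metric over tokenized hypothesis/reference pairs.
    print("METEOR:", single_meteor_score(word_tokenize(ref), word_tokenize(hyp)))

Keeping this check on CPU (`gpus=0`) mirrors the CPU accelerator chosen for the COMET metric in the diff, where MPS reportedly caused errors.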
--- community_tasks/swiss_legal_evals.py | 168 +++++++++++++++++++-------- 1 file changed, 122 insertions(+), 46 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 549391ce2..54b01c4fb 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -30,14 +30,20 @@ Author: Joel Niklaus """ -from lighteval.logging.hierarchical_logger import hlog_warn import statistics -import re from dataclasses import dataclass +from packaging import version +import importlib.metadata as importlib_metadata + +import nltk +from nltk import word_tokenize +from nltk.translate import meteor_score +from comet import download_model, load_from_checkpoint from transformers import AutoTokenizer, AutoModelForSequenceClassification +from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc from lighteval.metrics.metrics import Metrics @@ -50,46 +56,19 @@ ) from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip +from lighteval.tasks.extended.mix_eval.main import process_judge_response_freeform_gpt from lighteval.tasks.extended.mix_eval.judge_prompts import ( - flow_judge_for_freeform_template, gpt_judge_for_closeended_freeform, ) # CUSTOM METRICS -def process_judge_response(x): - search = re.search(r"\s(\d)\s", x) - return int(search.group(1)) if search else 0 - - -def process_judge_response_freeform_gpt(x): - search = re.search(r"\[\[(\d.\d)\]\]", x) - answer = float(search.group(1) if search else 0) - return answer - - -def freeform_flow_judge(): - return SampleLevelMetricGrouping( - metric_name=["llm_judge_mixeval_flow"], - higher_is_better={"judge_score_flow": True}, - category=MetricCategory.LLM_AS_JUDGE, - use_case=MetricUseCase.SUMMARIZATION, - sample_level_fn=JudgeLLMMixEval( - judge_model_name="flowaicom/Flow-Judge-v0.1", - template=flow_judge_for_freeform_template, - process_judge_response=process_judge_response, - judge_backend="vllm", - short_judge_name="flow", - ).compute, - corpus_level_fn={ - "judge_score_flow": statistics.mean, - }, - ) +# TODO: adjust prompt def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): return SampleLevelMetricGrouping( - metric_name=[f"llm_judge_mixeval_{judge_model_name}"], + metric_name=[f"llm_judge_{judge_model_name}"], higher_is_better={"judge_score_{judge_model_name}": True}, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.SUMMARIZATION, @@ -106,7 +85,7 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): ) -def bert_score(model_type: str = "xlm-roberta-large"): +def get_bert_score(model_type: str = "xlm-roberta-large"): score = BertScore( normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip ) @@ -136,6 +115,9 @@ def bert_score(model_type: str = "xlm-roberta-large"): ) +bert_score = get_bert_score(model_type="xlm-roberta-large") + + class BLEURT: def __init__(self, model_size: str = "tiny", seq_len: int = 512): """Creates a BLEURT scorer based on the model size (tiny, base, large) and sequence length (128, 512).""" @@ -183,16 +165,105 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return scores.item() -def bleurt(model_size: str = "tiny", seq_len: int = 512): - return SampleLevelMetric( - metric_name="bleurt", - sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, - 
category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, - ) +bleurt = SampleLevelMetric( + metric_name="bleurt", + sample_level_fn=BLEURT(model_size="tiny", seq_len=512).compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, +) + + +class COMET: + def __init__( + self, + model_name: str = "Unbabel/wmt22-comet-da", + batch_size: int = 1, + gpus: int = 1, + accelerator: str = "cpu", # "mps" leads to errors + ): + model_path = download_model(model_name) + self.model = load_from_checkpoint(model_path) + self.batch_size = batch_size + self.gpus = gpus + self.accelerator = accelerator + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + data = [ + {"src": src, "mt": pred, "ref": gold} + for src, pred, gold in zip( + [kwargs["formatted_doc"].specific["source"]] * len(predictions), + predictions, + golds, + ) + ] + model_output = self.model.predict( + data, + batch_size=self.batch_size, + gpus=self.gpus, + accelerator=self.accelerator, + ) + # model_output["scores"] contains the sentence level scores + return model_output["system_score"] + + +comet = SampleLevelMetric( + metric_name="comet", + sample_level_fn=COMET(accelerator="cpu").compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, +) + +class METEOR: + def __init__(self, alpha=0.9, beta=3, gamma=0.5): + self.alpha = alpha + self.beta = beta + self.gamma = gamma + + NLTK_VERSION = version.parse(importlib_metadata.version("nltk")) + assert NLTK_VERSION >= version.Version("3.9.0"), "NLTK version must be >= 3.9.0" + nltk.download("punkt_tab") + nltk.download("wordnet") + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + if isinstance(golds[0], list): # multiple references + scores = [ + meteor_score.meteor_score( + [word_tokenize(ref) for ref in refs], + word_tokenize(pred), + alpha=self.alpha, + beta=self.beta, + gamma=self.gamma, + ) + for refs, pred in zip(golds, predictions) + ] + else: + scores = [ + meteor_score.single_meteor_score( + word_tokenize(ref), + word_tokenize(pred), + alpha=self.alpha, + beta=self.beta, + gamma=self.gamma, + ) + for ref, pred in zip(golds, predictions) + ] + + return statistics.mean(scores) + + +meteor = SampleLevelMetric( + metric_name="meteor", + sample_level_fn=METEOR().compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, +) # EVALS WITH SUBSET # This is how you create a subset task (like MMLU), which has several subset @@ -315,6 +386,7 @@ def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str) target_text_col = f"{target_lang}_{text_col}" def prompt_fn(line: dict, task_name: str = None): + # TODO: replace this with the prompt template custom_query = f"{level_config.prompt_prefix}: {line[src_text_col]}\nTranslate from {src_lang} to {target_lang}.\nTranslation: " return Doc( @@ -325,6 +397,7 @@ def prompt_fn(line: dict, task_name: str = None): specific={ **{col: line[col] for col in level_config.metadata_cols}, "question": custom_query, + "source": line[src_text_col], }, ) @@ -359,11 +432,14 @@ def __init__( Metrics.bleu_4, Metrics.chrf, Metrics.ter, - bert_score(model_type="xlm-roberta-large"), - bleurt(model_size="tiny", seq_len=512), + bert_score, + bleurt, + comet, + 
meteor, freeform_gpt_judge(judge_model_name="gpt-4o"), - # freeform_flow_judge(), # TODO: Needs to be tested on GPU machine - # TODO: add prometheus eval + # Additionally we could consider adding the following open source judge models: + # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 + # However, these are only fine-tuned on English data and we need multilingual support. ], stop_sequence=["\n"], trust_dataset=True, From 5d41ce0a4f646e65419787155e545fc443c99fc0 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 20 Nov 2024 11:39:33 +0100 Subject: [PATCH 05/57] Ran pre-commit hooks. --- community_tasks/swiss_legal_evals.py | 47 +++++++++++----------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 54b01c4fb..5b47e1d64 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -23,43 +23,42 @@ # ruff: noqa: F405, F403, F401 """ This module contains task configurations and prompt functions for evaluating -LLM models on Swiss legal datasets. Each task is defined using the -`LightevalTaskConfig` class with its respective prompt function. The tasks -cover a variety of benchmarks, including: translation of laws, court decisions +LLM models on Swiss legal datasets. Each task is defined using the +`LightevalTaskConfig` class with its respective prompt function. The tasks +cover a variety of benchmarks, including: translation of laws, court decisions and press releases. Author: Joel Niklaus """ +import importlib.metadata as importlib_metadata import statistics from dataclasses import dataclass -from packaging import version -import importlib.metadata as importlib_metadata import nltk +from comet import download_model, load_from_checkpoint from nltk import word_tokenize from nltk.translate import meteor_score +from packaging import version +from transformers import AutoModelForSequenceClassification, AutoTokenizer -from comet import download_model, load_from_checkpoint - -from transformers import AutoTokenizer, AutoModelForSequenceClassification from lighteval.logging.hierarchical_logger import hlog_warn -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc +from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics -from lighteval.metrics.metrics_sample import JudgeLLMMixEval, BertScore +from lighteval.metrics.metrics_sample import BertScore, JudgeLLMMixEval +from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip from lighteval.metrics.utils.metric_utils import ( MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping, ) -from lighteval.metrics.imports.bert_scorer import BERTScorer -from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip -from lighteval.tasks.extended.mix_eval.main import process_judge_response_freeform_gpt from lighteval.tasks.extended.mix_eval.judge_prompts import ( gpt_judge_for_closeended_freeform, ) +from lighteval.tasks.extended.mix_eval.main import process_judge_response_freeform_gpt +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc # CUSTOM METRICS @@ -86,9 +85,7 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): def get_bert_score(model_type: str = "xlm-roberta-large"): - score = BertScore( - normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip - ) + score = 
BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( # We could download the files from here and set the baseline_path ourselves: # https://github.com/Tiiiger/bert_score/tree/master/bert_score/rescale_baseline @@ -128,12 +125,8 @@ def __init__(self, model_size: str = "tiny", seq_len: int = 512): ], "Model size must be either tiny, base, or large" assert seq_len in [128, 512], "Sequence length must be either 128 or 512" - self.tokenizer = AutoTokenizer.from_pretrained( - f"Elron/bleurt-{model_size}-{seq_len}" - ) - self.model = AutoModelForSequenceClassification.from_pretrained( - f"Elron/bleurt-{model_size}-{seq_len}" - ) + self.tokenizer = AutoTokenizer.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") + self.model = AutoModelForSequenceClassification.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") self.max_length = seq_len self.model.eval() @@ -158,9 +151,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: max_length=self.max_length, ) if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): - hlog_warn( - f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring" - ) + hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") scores = self.model(**inputs)[0].squeeze() return scores.item() @@ -415,9 +406,7 @@ def __init__( super().__init__( name=f"{dataset_config.name}-{level_name}:{src_lang}-{target_lang}", suite=["community"], - prompt_function=create_prompt_fn( - dataset_config.subsets[level_name], src_lang, target_lang - ), + prompt_function=create_prompt_fn(dataset_config.subsets[level_name], src_lang, target_lang), hf_repo=dataset_config.hf_repo, hf_subset=level_name, hf_filter=None, From 81941254b99bc3a57f4a28f3784d4591fe945fda Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 20 Nov 2024 11:52:58 +0100 Subject: [PATCH 06/57] Changed prompt template. 
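The queries produced after this change follow Template A from the linked discussion: a source-language tag, the source text, then the target-language tag left open for the model to complete. A minimal illustration (the source sentence is invented):

    # Illustration of the query string built by prompt_fn after this change.
    src_lang, target_lang = "de", "fr"
    src_text = "Der Vertrag ist nichtig."  # hypothetical source segment

    custom_query = f"{src_lang.upper()}: {src_text}\n{target_lang.upper()}:"
    print(custom_query)
    # DE: Der Vertrag ist nichtig.
    # FR: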
--- community_tasks/swiss_legal_evals.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 5b47e1d64..890b1b98d 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -283,7 +283,6 @@ def create_translation_pairs(langs_list: list) -> list[tuple]: class LevelConfig: name: str text_col_name: str - prompt_prefix: str metadata_cols: list[str] @@ -307,19 +306,16 @@ def __post_init__(self): "bge_level": LevelConfig( name="bge_level", text_col_name="bgeText", - prompt_prefix="Consider the following summary of a Swiss leading court decision", metadata_cols=["bge"], ), "regeste_level": LevelConfig( name="regeste_level", text_col_name="regesteText", - prompt_prefix="Consider the following paragraph of a summary of a Swiss leading court decision", metadata_cols=["bge"], ), "text_level": LevelConfig( name="text_level", text_col_name="text", - prompt_prefix="Consider the following sentence of a summary of a Swiss leading court decision", metadata_cols=["bge"], ), }, @@ -334,19 +330,16 @@ def __post_init__(self): "law_level": LevelConfig( name="law_level", text_col_name="lawText", - prompt_prefix="Consider the following Swiss federal law", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], ), "article_level": LevelConfig( name="article_level", text_col_name="articleText", - prompt_prefix="Consider the following Swiss federal law article", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], ), "paragraph_level": LevelConfig( name="paragraph_level", text_col_name="paragraphText", - prompt_prefix="Consider the following Swiss federal law paragraph", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], ), }, @@ -361,7 +354,6 @@ def __post_init__(self): "press_release": LevelConfig( name="press_release", text_col_name="text", - prompt_prefix="Consider the following Swiss Supreme Court press release", metadata_cols=["filename"], ) }, @@ -377,8 +369,8 @@ def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str) target_text_col = f"{target_lang}_{text_col}" def prompt_fn(line: dict, task_name: str = None): - # TODO: replace this with the prompt template - custom_query = f"{level_config.prompt_prefix}: {line[src_text_col]}\nTranslate from {src_lang} to {target_lang}.\nTranslation: " + # Following Template A from https://github.com/huggingface/lighteval/pull/389#issuecomment-2471580177 + custom_query = f"{src_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}:" return Doc( task_name=task_name, From c58ae447be1efd449d5498eaf7b4138c34799412 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 21 Nov 2024 15:30:54 +0100 Subject: [PATCH 07/57] Added legal translation specific judge prompt. 
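The prompt below instructs the judge to finish its reply with a bracketed score such as "The correctness score: [[0.7]]", which is what the response parser extracts. A small sketch of that contract, reusing the `[[score]]` regex from the first patch in this series; the judge reply is invented:

    # Sketch of the judge-output contract: the reply must end with "[[<score>]]".
    # The regex mirrors process_judge_response_freeform_gpt from the first patch;
    # the reply text is invented.
    import re

    judge_reply = (
        "The translation is faithful overall but uses a less established term "
        "than the golden translation. The correctness score: [[0.7]]"
    )

    search = re.search(r"\[\[(\d.\d)\]\]", judge_reply)
    score = float(search.group(1)) if search else 0.0
    print(score)  # 0.7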
--- community_tasks/swiss_legal_evals.py | 95 ++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 890b1b98d..62113d1b2 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -53,16 +53,101 @@ SampleLevelMetric, SampleLevelMetricGrouping, ) -from lighteval.tasks.extended.mix_eval.judge_prompts import ( - gpt_judge_for_closeended_freeform, -) from lighteval.tasks.extended.mix_eval.main import process_judge_response_freeform_gpt from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc # CUSTOM METRICS -# TODO: adjust prompt + + +def swiss_legal_translation_judge(question, options, answer, gold): + return [ + { + "role": "system", + "content": "Act as a Judge specializing in the evaluation of translations of Swiss legal documents. Your task is to assess the accuracy, clarity, and fidelity of the model's translation to the golden translation, while considering the nuances of legal language.", + }, + { + "role": "user", + "content": f"""You will be provided with a source text, its golden translation, and the model's translation. Your task is to judge how correct the model's translation is based on the golden translation, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's translation conforms to or contradicts the golden translation, and then give the correctness score. The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples. + +Example 1: +Source Text: +Art. 20 +A contract is void if its terms are impossible, unlawful or immoral. However, where the defect pertains only to certain terms of a contract, those terms alone are void unless there is cause to assume that the contract would not have been concluded without them. + +Golden Translation: +Il contratto che ha per oggetto una cosa impossibile o contraria alle leggi od ai buoni costumi è nullo. Se il contratto è viziato solo in alcune parti, queste soltanto sono nulle, ove non si debba ammettere che senza la parte nulla esso non sarebbe stato conchiuso. + +Model’s Translation: +Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse. + +Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso,” an older and more formal term, while the model opts for “concluso,” which is modern. Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi,” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] + +Example 2: +Source Text: +Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). 
+Die Erhebung und die Aufbewahrung von Aufzeichnungen der AFV stellen einen Eingriff in die Grundrechte der Betroffenen dar, insbesondere in das Recht auf Privatsphäre, das den Anspruch auf informationelle Selbstbestimmung miteinschliesst (E. 3.1). Für die AFV besteht im Kanton Thurgau keine hinreichend bestimmte gesetzliche Grundlage. Der mit der Überwachung verbundene Eingriff in die Privatsphäre verstösst daher gegen Art. 13 Abs. 2 i.V.m. Art. 36 Abs. 1 BV (E. 3.2 und 3.3). +Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3). + +Golden Translation: +Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3). + +Model’s Translation: +Art. 13 al. 2, Art. 36 al. 1 Cst.; Art. 141 al. 2 CPP; Admissibilité des enregistrements de la poursuite automatisée des véhicules et de la surveillance de la circulation (AFV). +La collecte et la conservation des enregistrements de l'AFV constituent une ingérence dans les droits fondamentaux des personnes concernées, en particulier dans le droit à la vie privée, qui inclut le droit à l'autodétermination informationnelle (consid. 3.1). Dans le canton de Thurgovie, il n'existe pas de base légale suffisamment déterminée pour l'AFV. Par conséquent, l'ingérence dans la vie privée associée à la surveillance viole donc les art. 13 al. 2 combiné avec l'art. 36 al. 1 Cst (consid. 3.2 et 3.3). +Si la police relève des infractions pénales dans le cadre de ses activités de contrôle préventif, elle enquête conformément aux art. 306 et suivants CPP. La question de savoir si les preuves illégalement recueillies peuvent être utilisées dans la procédure pénale est examinée conformément à l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Admissibilité dans le cas présent niée (consid. 4.3). + +Your Judgment: The model’s translation mostly aligns with the golden translation but diverges when it comes to accuracy and fidelity to Swiss legal terminology. For instance, the term “exploitabilité” which is closer to the Swiss provision is replaced in the model’s translation with “admissibilité”. 
Similarly, “ingérence” is used instead of “atteinte”, although “atteinte” is commonly used in Swiss law to discuss a violation of fundamental rights. Also, the term "recherche automatisée de véhicules et surveillance du trafic (RVS)" used by the golden translation is more established than "poursuite automatisée des véhicules et de la surveillance de la circulation (AFV)" in the model’s translation. The model’s translation is almost complete, but omits a critical point in one sentence: that the evidence was unlawfully obtained due to lack of a sufficiently clear legal basis. This omission impacts the completeness. The correctness score: [[0.7]] + +Example 3: +Source Text: +Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise + +Le Tribunal fédéral rejette le recours déposé par un collectionneur contre l'arrêt de la Cour de justice genevoise par lequel celle-ci confirmait que Yoko Ono est propriétaire de la montre qu'elle avait offerte à John Lennon en 1980, deux mois avant qu'il ne soit assassiné. Le collectionneur, qui a remis la montre à une maison de vente aux enchères genevoise en 2014 afin d'en faire estimer la valeur, a quant à lui revendiqué la propriété de ladite montre. + +En 1980, Yoko Ono a acquis à New York une montre de marque Patek Philippe. Elle y a fait graver au dos l'inscription « (JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C » et l'a offerte à son époux, John Lennon, le 9 octobre 1980 pour son 40e anniversaire. Le 8 décembre 1980, John Lennon a été assassiné à New York. La montre a été répertoriée dans l'inventaire successoral et conservée dans une pièce de l'appartement de Yoko Ono à New York. Par la suite, la montre s'est retrouvée aux mains d'un homme qui avait été le chauffeur privé de Yoko Ono de 1995 à 2006. Un autre possesseur intermédiaire l'a remise à une maison de vente aux enchères allemande, où elle a été acquise par un collectionneur en 2014. Ce dernier l'a remise la même année à une maison de vente aux enchères genevoise afin d'en faire estimer la valeur, ce dont a été informée Yoko Ono. Cette dernière n'avait jusqu'alors pas eu conscience du fait que la montre n'était plus en sa possession. En 2018, le collectionneur a formé à Genève une action visant à constater sa qualité de propriétaire, action à laquelle Yoko Ono s'est opposée. En 2022, le tribunal de première instance genevois a constaté que Yoko Ono était la seule et unique propriétaire de la montre, ce que la Cour de justice du canton de Genève, statuant sur appel du collectionneur, a confirmé en 2023. + +Le Tribunal fédéral rejette le recours déposé par le collectionneur contre cet arrêt. Il n'est tout d'abord pas contesté que la propriété de la montre a été acquise par succession par Yoko Ono après le décès de John Lennon. C'est en outre sans arbitraire que la Cour de justice genevoise a retenu que la montre avait été volée par l'ancien chauffeur et que, à l'inverse, aucun élément ne permettait de démontrer que Yoko Ono aurait eu l'intention de faire donation au chauffeur d'une chose si particulière que la montre, gravée d'une inscription, qu'elle avait offerte à John Lennon deux mois avant son décès. 
Dès lors qu'il s'agit d'une chose volée, le collectionneur, aujourd'hui recourant, ne pouvait pas acquérir la propriété de la montre par un mode originaire d'acquisition lorsqu'il l'a achetée en Allemagne en 2014 ; selon le droit allemand applicable en la matière, cela vaut indépendamment du fait que l'acquéreur était ou non de bonne foi quant à l'origine de la chose. + +Golden Translation: +Yoko Ono ist Eigentümerin der Uhr von John Lennon – +Beschwerde von Sammler gegen Genfer Urteil abgewiesen + +Das Bundesgericht weist die Beschwerde eines Sammlers gegen das Urteil des Genfer Kantonsgerichts ab, mit dem Yoko Ono als Eigentümerin der Uhr bestätigt wurde, die sie John Lennon 1980 zwei Monate vor seiner Ermordung geschenkt hat. Der Sammler hatte die Uhr 2014 zur Schätzung bei einem Auktionshaus in Genf eingereicht und seinerseits Eigentümerschaft an der Uhr geltend gemacht. +Yoko Ono hatte 1980 in New York eine Uhr der Marke Patek Philippe gekauft. Sie liess auf der Rückseite die Gravur "(JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C" anbringen und schenkte sie ihrem Ehemann John Lennon am 9. Oktober 1980 zum 40. Geburtstag. Am 8. Dezember 1980 wurde John Lennon in New York ermordet. Die Uhr wurde ins Erbschaftsinventar aufgenommen und in einem Zimmer der Wohnung von Yoko Ono in New York aufbewahrt. Sie gelangte von dort in die Hände eines Mannes, der von 1995 bis 2006 Privatchauffeur von Yoko Ono gewesen war. Ein weiterer Zwischenbesitzer brachte die Uhr in ein deutsches Auktionshaus, wo sie 2014 von einem Sammler erworben wurde. Dieser reichte die Uhr im gleichen Jahr bei einem Auktionshaus in Genf zur Schätzung ihres Wertes ein. Davon erfuhr Yoko Ono, die bis dahin keine Kenntnis davon gehabt hatte, dass sich die Uhr nicht mehr in ihrem Besitz befand. Der Sammler erhob 2018 in Genf eine Klage auf Feststellung seiner Eigentümerschaft, der sich Yoko Ono widersetzte. Das erstinstanzliche Genfer Gericht stellte 2022 fest, dass Yoko Ono die alleinige Eigentümerin der Uhr sei, was auf Berufung des Sammlers 2023 vom Kantonsgericht des Kantons Genf bestätigt wurde. + +Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war. + + +Model’s Translation: +Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. + +Im Jahr 1980 hat Yoko Ono in New York eine Uhr von der Marke Patek Philippe gekauft. 
Sie hat auf der Rückseite der Uhr die Inschrift "JUST LIKE STARTING OVER LOVE YOKO 10·9·1980 N.Y.C." gemalt und sie an ihren Ehemann, John Lennon, am 9. Oktober 1980 als Geburtstag gegeben. Am 8. Dezember 1980 wurde Lennon ermordet in New York. Die Uhr wurde in den Erbeverzeichnis des Lennon-Mädchens aufbewahrt und in einem Zimmer in Yoko Onos New Yorker Wohnung. + +Danach wurde die Uhr von einem Mann, der als Wagenkellner von Yoko Ono zwischen 1995 und 2006 gearbeitet hat, in die Hände gebracht. Ein weiterer mittlerer Eigentümer hat sie 2014 in eine deutsche Auktion verkauft, wo sie einem Käufer in Zürich übergeben wurde. Dieser Käufer hat die Uhr 2014 in eine Geneßische Auktion verkauft, um sie zu valieren, und Yoko Ono war von diesem Moment an nicht mehr bewusst, dass die Uhr nicht mehr in ihrer Hand war. + +Im Jahr 2018 hat der Käufer in Zürich eine Anzeige gegen Yoko Ono eröffnet, um die Qualität des Eigentums der Uhr zu bestätigen, was Yoko Ono ansprach. Im Jahr 2022 hat der Zivilgericht in Zürich bestätigt, dass Yoko Ono die einzige und letzte Eigentümerin der Uhr ist, was der Zivilgerichtshof im Jahr 2023 bestätigt hat. + +Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat. + +Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] + +Judge the below case, give the brief reasoning process and the correctness score. + + +Source: +{question} + +Golden Translation: +{gold} + +Model's Translation: +{answer} + +Your Judgment:""", + }, + ] def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): @@ -73,7 +158,7 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLMMixEval( judge_model_name=judge_model_name, - template=gpt_judge_for_closeended_freeform, + template=swiss_legal_translation_judge, process_judge_response=process_judge_response_freeform_gpt, judge_backend="openai", short_judge_name=judge_model_name, From ff3705f94ae1085035c79b1baeec50ae00c816de Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 21 Nov 2024 15:41:59 +0100 Subject: [PATCH 08/57] Improved judge prompt. --- community_tasks/swiss_legal_evals.py | 43 ++++++++++++++-------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 62113d1b2..85245aab3 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -73,55 +73,56 @@ def swiss_legal_translation_judge(question, options, answer, gold): Example 1: Source Text: -Art. 
20 -A contract is void if its terms are impossible, unlawful or immoral. However, where the defect pertains only to certain terms of a contract, those terms alone are void unless there is cause to assume that the contract would not have been concluded without them. +```A contract is void if its terms are impossible, unlawful or immoral. However, where the defect pertains only to certain terms of a contract, those terms alone are void unless there is cause to assume that the contract would not have been concluded without them.``` Golden Translation: -Il contratto che ha per oggetto una cosa impossibile o contraria alle leggi od ai buoni costumi è nullo. Se il contratto è viziato solo in alcune parti, queste soltanto sono nulle, ove non si debba ammettere che senza la parte nulla esso non sarebbe stato conchiuso. +```Il contratto che ha per oggetto una cosa impossibile o contraria alle leggi od ai buoni costumi è nullo. Se il contratto è viziato solo in alcune parti, queste soltanto sono nulle, ove non si debba ammettere che senza la parte nulla esso non sarebbe stato conchiuso.``` Model’s Translation: -Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse. +```Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse.``` Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso,” an older and more formal term, while the model opts for “concluso,” which is modern. Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi,” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] Example 2: Source Text: -Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). +```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). Die Erhebung und die Aufbewahrung von Aufzeichnungen der AFV stellen einen Eingriff in die Grundrechte der Betroffenen dar, insbesondere in das Recht auf Privatsphäre, das den Anspruch auf informationelle Selbstbestimmung miteinschliesst (E. 3.1). Für die AFV besteht im Kanton Thurgau keine hinreichend bestimmte gesetzliche Grundlage. Der mit der Überwachung verbundene Eingriff in die Privatsphäre verstösst daher gegen Art. 13 Abs. 2 i.V.m. Art. 36 Abs. 1 BV (E. 3.2 und 3.3). -Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3). 
+Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3).``` Golden Translation: -Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3). +```Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). +La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). +La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3).``` Model’s Translation: -Art. 13 al. 2, Art. 36 al. 1 Cst.; Art. 141 al. 2 CPP; Admissibilité des enregistrements de la poursuite automatisée des véhicules et de la surveillance de la circulation (AFV). +```Art. 13 al. 2, Art. 36 al. 1 Cst.; Art. 141 al. 2 CPP; Admissibilité des enregistrements de la poursuite automatisée des véhicules et de la surveillance de la circulation (AFV). La collecte et la conservation des enregistrements de l'AFV constituent une ingérence dans les droits fondamentaux des personnes concernées, en particulier dans le droit à la vie privée, qui inclut le droit à l'autodétermination informationnelle (consid. 3.1). Dans le canton de Thurgovie, il n'existe pas de base légale suffisamment déterminée pour l'AFV. Par conséquent, l'ingérence dans la vie privée associée à la surveillance viole donc les art. 13 al. 
2 combiné avec l'art. 36 al. 1 Cst (consid. 3.2 et 3.3). -Si la police relève des infractions pénales dans le cadre de ses activités de contrôle préventif, elle enquête conformément aux art. 306 et suivants CPP. La question de savoir si les preuves illégalement recueillies peuvent être utilisées dans la procédure pénale est examinée conformément à l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Admissibilité dans le cas présent niée (consid. 4.3). +Si la police relève des infractions pénales dans le cadre de ses activités de contrôle préventif, elle enquête conformément aux art. 306 et suivants CPP. La question de savoir si les preuves illégalement recueillies peuvent être utilisées dans la procédure pénale est examinée conformément à l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Admissibilité dans le cas présent niée (consid. 4.3).``` Your Judgment: The model’s translation mostly aligns with the golden translation but diverges when it comes to accuracy and fidelity to Swiss legal terminology. For instance, the term “exploitabilité” which is closer to the Swiss provision is replaced in the model’s translation with “admissibilité”. Similarly, “ingérence” is used instead of “atteinte”, although “atteinte” is commonly used in Swiss law to discuss a violation of fundamental rights. Also, the term "recherche automatisée de véhicules et surveillance du trafic (RVS)" used by the golden translation is more established than "poursuite automatisée des véhicules et de la surveillance de la circulation (AFV)" in the model’s translation. The model’s translation is almost complete, but omits a critical point in one sentence: that the evidence was unlawfully obtained due to lack of a sufficiently clear legal basis. This omission impacts the completeness. The correctness score: [[0.7]] Example 3: Source Text: -Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise +```Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise Le Tribunal fédéral rejette le recours déposé par un collectionneur contre l'arrêt de la Cour de justice genevoise par lequel celle-ci confirmait que Yoko Ono est propriétaire de la montre qu'elle avait offerte à John Lennon en 1980, deux mois avant qu'il ne soit assassiné. Le collectionneur, qui a remis la montre à une maison de vente aux enchères genevoise en 2014 afin d'en faire estimer la valeur, a quant à lui revendiqué la propriété de ladite montre. En 1980, Yoko Ono a acquis à New York une montre de marque Patek Philippe. Elle y a fait graver au dos l'inscription « (JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C » et l'a offerte à son époux, John Lennon, le 9 octobre 1980 pour son 40e anniversaire. Le 8 décembre 1980, John Lennon a été assassiné à New York. La montre a été répertoriée dans l'inventaire successoral et conservée dans une pièce de l'appartement de Yoko Ono à New York. Par la suite, la montre s'est retrouvée aux mains d'un homme qui avait été le chauffeur privé de Yoko Ono de 1995 à 2006. Un autre possesseur intermédiaire l'a remise à une maison de vente aux enchères allemande, où elle a été acquise par un collectionneur en 2014. Ce dernier l'a remise la même année à une maison de vente aux enchères genevoise afin d'en faire estimer la valeur, ce dont a été informée Yoko Ono. 
Cette dernière n'avait jusqu'alors pas eu conscience du fait que la montre n'était plus en sa possession. En 2018, le collectionneur a formé à Genève une action visant à constater sa qualité de propriétaire, action à laquelle Yoko Ono s'est opposée. En 2022, le tribunal de première instance genevois a constaté que Yoko Ono était la seule et unique propriétaire de la montre, ce que la Cour de justice du canton de Genève, statuant sur appel du collectionneur, a confirmé en 2023. -Le Tribunal fédéral rejette le recours déposé par le collectionneur contre cet arrêt. Il n'est tout d'abord pas contesté que la propriété de la montre a été acquise par succession par Yoko Ono après le décès de John Lennon. C'est en outre sans arbitraire que la Cour de justice genevoise a retenu que la montre avait été volée par l'ancien chauffeur et que, à l'inverse, aucun élément ne permettait de démontrer que Yoko Ono aurait eu l'intention de faire donation au chauffeur d'une chose si particulière que la montre, gravée d'une inscription, qu'elle avait offerte à John Lennon deux mois avant son décès. Dès lors qu'il s'agit d'une chose volée, le collectionneur, aujourd'hui recourant, ne pouvait pas acquérir la propriété de la montre par un mode originaire d'acquisition lorsqu'il l'a achetée en Allemagne en 2014 ; selon le droit allemand applicable en la matière, cela vaut indépendamment du fait que l'acquéreur était ou non de bonne foi quant à l'origine de la chose. +Le Tribunal fédéral rejette le recours déposé par le collectionneur contre cet arrêt. Il n'est tout d'abord pas contesté que la propriété de la montre a été acquise par succession par Yoko Ono après le décès de John Lennon. C'est en outre sans arbitraire que la Cour de justice genevoise a retenu que la montre avait été volée par l'ancien chauffeur et que, à l'inverse, aucun élément ne permettait de démontrer que Yoko Ono aurait eu l'intention de faire donation au chauffeur d'une chose si particulière que la montre, gravée d'une inscription, qu'elle avait offerte à John Lennon deux mois avant son décès. Dès lors qu'il s'agit d'une chose volée, le collectionneur, aujourd'hui recourant, ne pouvait pas acquérir la propriété de la montre par un mode originaire d'acquisition lorsqu'il l'a achetée en Allemagne en 2014 ; selon le droit allemand applicable en la matière, cela vaut indépendamment du fait que l'acquéreur était ou non de bonne foi quant à l'origine de la chose.``` Golden Translation: -Yoko Ono ist Eigentümerin der Uhr von John Lennon – -Beschwerde von Sammler gegen Genfer Urteil abgewiesen +```Yoko Ono ist Eigentümerin der Uhr von John Lennon – Beschwerde von Sammler gegen Genfer Urteil abgewiesen Das Bundesgericht weist die Beschwerde eines Sammlers gegen das Urteil des Genfer Kantonsgerichts ab, mit dem Yoko Ono als Eigentümerin der Uhr bestätigt wurde, die sie John Lennon 1980 zwei Monate vor seiner Ermordung geschenkt hat. Der Sammler hatte die Uhr 2014 zur Schätzung bei einem Auktionshaus in Genf eingereicht und seinerseits Eigentümerschaft an der Uhr geltend gemacht. + Yoko Ono hatte 1980 in New York eine Uhr der Marke Patek Philippe gekauft. Sie liess auf der Rückseite die Gravur "(JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C" anbringen und schenkte sie ihrem Ehemann John Lennon am 9. Oktober 1980 zum 40. Geburtstag. Am 8. Dezember 1980 wurde John Lennon in New York ermordet. Die Uhr wurde ins Erbschaftsinventar aufgenommen und in einem Zimmer der Wohnung von Yoko Ono in New York aufbewahrt. 
Sie gelangte von dort in die Hände eines Mannes, der von 1995 bis 2006 Privatchauffeur von Yoko Ono gewesen war. Ein weiterer Zwischenbesitzer brachte die Uhr in ein deutsches Auktionshaus, wo sie 2014 von einem Sammler erworben wurde. Dieser reichte die Uhr im gleichen Jahr bei einem Auktionshaus in Genf zur Schätzung ihres Wertes ein. Davon erfuhr Yoko Ono, die bis dahin keine Kenntnis davon gehabt hatte, dass sich die Uhr nicht mehr in ihrem Besitz befand. Der Sammler erhob 2018 in Genf eine Klage auf Feststellung seiner Eigentümerschaft, der sich Yoko Ono widersetzte. Das erstinstanzliche Genfer Gericht stellte 2022 fest, dass Yoko Ono die alleinige Eigentümerin der Uhr sei, was auf Berufung des Sammlers 2023 vom Kantonsgericht des Kantons Genf bestätigt wurde. -Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war. +Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war.``` Model’s Translation: -Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. +```Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. Im Jahr 1980 hat Yoko Ono in New York eine Uhr von der Marke Patek Philippe gekauft. 
Sie hat auf der Rückseite der Uhr die Inschrift "JUST LIKE STARTING OVER LOVE YOKO 10·9·1980 N.Y.C." gemalt und sie an ihren Ehemann, John Lennon, am 9. Oktober 1980 als Geburtstag gegeben. Am 8. Dezember 1980 wurde Lennon ermordet in New York. Die Uhr wurde in den Erbeverzeichnis des Lennon-Mädchens aufbewahrt und in einem Zimmer in Yoko Onos New Yorker Wohnung. @@ -129,21 +130,21 @@ def swiss_legal_translation_judge(question, options, answer, gold): Im Jahr 2018 hat der Käufer in Zürich eine Anzeige gegen Yoko Ono eröffnet, um die Qualität des Eigentums der Uhr zu bestätigen, was Yoko Ono ansprach. Im Jahr 2022 hat der Zivilgericht in Zürich bestätigt, dass Yoko Ono die einzige und letzte Eigentümerin der Uhr ist, was der Zivilgerichtshof im Jahr 2023 bestätigt hat. -Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat. +Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat.``` -Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] +Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] Judge the below case, give the brief reasoning process and the correctness score. -Source: -{question} +Source Text: +```{question}``` Golden Translation: -{gold} +```{gold}``` Model's Translation: -{answer} +```{answer}``` Your Judgment:""", }, From 091ec1134f523e7f253441ae48455aaff9ccd443 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 21 Nov 2024 16:56:22 +0100 Subject: [PATCH 09/57] Changed metric selection. 
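For context on the diff below: a single `name` string is now derived from the judge model and reused for `metric_name`, `higher_is_better`, and `corpus_level_fn`, which also repairs the old `higher_is_better` key that was a plain string missing its f-prefix (the literal text "judge_score_{judge_model_name}"). A minimal sketch of that naming pattern, not part of the patch and using only the standard library:

```python
# Illustrative sketch (not part of this patch): keep the three metric keys in
# sync by deriving them from one name.
import statistics


def build_metric_keys(judge_model_name: str = "gpt-4o") -> dict:
    name = f"swiss_legal_translation_judge_{judge_model_name}"
    return {
        "metric_name": [name],
        "higher_is_better": {name: True},
        "corpus_level_fn": {name: statistics.mean},
    }


keys = build_metric_keys("gpt-4o")
assert set(keys["higher_is_better"]) == set(keys["metric_name"])
print(keys["metric_name"])  # ['swiss_legal_translation_judge_gpt-4o']
```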
--- community_tasks/swiss_legal_evals.py | 83 ++++++++++++++++++---------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 85245aab3..8958d3192 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -151,12 +151,13 @@ def swiss_legal_translation_judge(question, options, answer, gold): ] -def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): +def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): + name = f"swiss_legal_translation_judge_{judge_model_name}" return SampleLevelMetricGrouping( - metric_name=[f"llm_judge_{judge_model_name}"], - higher_is_better={"judge_score_{judge_model_name}": True}, + metric_name=[name], + higher_is_better={name: True}, category=MetricCategory.LLM_AS_JUDGE, - use_case=MetricUseCase.SUMMARIZATION, + use_case=MetricUseCase.TRANSLATION, sample_level_fn=JudgeLLMMixEval( judge_model_name=judge_model_name, template=swiss_legal_translation_judge, @@ -165,11 +166,14 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): short_judge_name=judge_model_name, ).compute, corpus_level_fn={ - f"judge_score_{judge_model_name}": statistics.mean, + name: statistics.mean, }, ) +swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") + + def get_bert_score(model_type: str = "xlm-roberta-large"): score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( @@ -184,7 +188,7 @@ def get_bert_score(model_type: str = "xlm-roberta-large"): metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], sample_level_fn=score.compute, category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.SUMMARIZATION, + use_case=MetricUseCase.TRANSLATION, corpus_level_fn={ "BERTScore-P": statistics.mean, "BERTScore-R": statistics.mean, @@ -242,14 +246,20 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return scores.item() -bleurt = SampleLevelMetric( - metric_name="bleurt", - sample_level_fn=BLEURT(model_size="tiny", seq_len=512).compute, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, -) +def get_bleurt(model_size: str = "tiny", seq_len: int = 512): + return SampleLevelMetric( + metric_name=f"bleurt_{model_size}", + sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, + ) + + +# bleurt_tiny = get_bleurt(model_size="tiny", seq_len=512) +# bleurt_base = get_bleurt(model_size="base", seq_len=512) +bleurt_large = get_bleurt(model_size="large", seq_len=512) class COMET: @@ -285,14 +295,31 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return model_output["system_score"] -comet = SampleLevelMetric( - metric_name="comet", - sample_level_fn=COMET(accelerator="cpu").compute, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, -) +def get_comet( + model_name: str = "Unbabel/wmt22-comet-da", + batch_size: int = 1, + gpus: int = 1, + accelerator: str = "cpu", +): + return SampleLevelMetric( + metric_name=model_name.split("/")[-1], + sample_level_fn=COMET( + model_name=model_name, + batch_size=batch_size, + gpus=gpus, + accelerator=accelerator, + ).compute, + 
category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, + ) + + +# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. +comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=1, gpus=1, accelerator="cpu") +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=1, gpus=1, accelerator="cpu") +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=1, gpus=1, accelerator="cpu") class METEOR: @@ -495,15 +522,15 @@ def __init__( generation_size=10, metric=[ Metrics.bleu, - Metrics.bleu_1, - Metrics.bleu_4, + # Metrics.bleu_4, Metrics.chrf, Metrics.ter, - bert_score, - bleurt, - comet, meteor, - freeform_gpt_judge(judge_model_name="gpt-4o"), + bert_score, + bleurt_large, + comet_wmt22_da, + xcomet_xxl, + swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. From 5a479564f0ea3d4a294df1cab4015f2e66258de1 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 22 Nov 2024 13:59:25 +0100 Subject: [PATCH 10/57] Made generation_size dependent on the config. --- community_tasks/swiss_legal_evals.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 8958d3192..d0dc733a6 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -397,6 +397,7 @@ class LevelConfig: name: str text_col_name: str metadata_cols: list[str] + generation_size: int @dataclass @@ -420,16 +421,19 @@ def __post_init__(self): name="bge_level", text_col_name="bgeText", metadata_cols=["bge"], + generation_size=2048, ), "regeste_level": LevelConfig( name="regeste_level", text_col_name="regesteText", metadata_cols=["bge"], + generation_size=512, ), "text_level": LevelConfig( name="text_level", text_col_name="text", metadata_cols=["bge"], + generation_size=256, ), }, ) @@ -444,16 +448,19 @@ def __post_init__(self): name="law_level", text_col_name="lawText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + generation_size=16384, ), "article_level": LevelConfig( name="article_level", text_col_name="articleText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + generation_size=1024, ), "paragraph_level": LevelConfig( name="paragraph_level", text_col_name="paragraphText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + generation_size=256, ), }, ) @@ -468,6 +475,7 @@ def __post_init__(self): name="press_release", text_col_name="text", metadata_cols=["filename"], + generation_size=1024, ) }, ) @@ -508,10 +516,11 @@ def __init__( src_lang: str, target_lang: str, ): + level_config = dataset_config.subsets[level_name] super().__init__( name=f"{dataset_config.name}-{level_name}:{src_lang}-{target_lang}", suite=["community"], - prompt_function=create_prompt_fn(dataset_config.subsets[level_name], src_lang, target_lang), + prompt_function=create_prompt_fn(level_config, src_lang, target_lang), hf_repo=dataset_config.hf_repo, hf_subset=level_name, hf_filter=None, @@ -519,7 +528,7 @@ def __init__( evaluation_splits=["test"], # ["validation", "test"], few_shots_split="validation", few_shots_select=None, - 
generation_size=10, + generation_size=level_config.generation_size, metric=[ Metrics.bleu, # Metrics.bleu_4, From 6bf7fa2463070f7c73cfbc2de0d5087953577952 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 22 Nov 2024 14:13:53 +0100 Subject: [PATCH 11/57] Fixed error in config. --- community_tasks/swiss_legal_evals.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index d0dc733a6..ae3a46e7d 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -452,13 +452,13 @@ def __post_init__(self): ), "article_level": LevelConfig( name="article_level", - text_col_name="articleText", + text_col_name="artText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], generation_size=1024, ), "paragraph_level": LevelConfig( name="paragraph_level", - text_col_name="paragraphText", + text_col_name="parText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], generation_size=256, ), From 6cf1c2acd69a6fc35983276e3ab81fe27dc59f7f Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 22 Nov 2024 14:53:07 +0100 Subject: [PATCH 12/57] Fixed error in config. --- community_tasks/swiss_legal_evals.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index ae3a46e7d..c7242df74 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -447,19 +447,19 @@ def __post_init__(self): "law_level": LevelConfig( name="law_level", text_col_name="lawText", - metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + metadata_cols=["rsNr"], generation_size=16384, ), "article_level": LevelConfig( name="article_level", text_col_name="artText", - metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + metadata_cols=["rsNr"], generation_size=1024, ), "paragraph_level": LevelConfig( name="paragraph_level", text_col_name="parText", - metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + metadata_cols=["rsNr"], generation_size=256, ), }, From b5488017248605bee9f096f2efb688cae124eb5b Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 22 Nov 2024 14:58:48 +0100 Subject: [PATCH 13/57] Added support for multiple devices. 
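For context on the diff below: a module-level device is picked once from the available hardware and passed into the heavy metrics, which explicitly reject Apple's MPS backend. A minimal sketch of that logic, not part of the patch, assuming only a recent `torch` install:

```python
# Illustrative sketch (not part of this patch) of the device handling added below.
import torch


def pick_device() -> str:
    # Mirrors the module-level selection: prefer CUDA, then MPS, else CPU.
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def require_non_mps(metric_name: str, device: str) -> str:
    # BERTScore, BLEURT and COMET are wired to refuse MPS in this patch.
    if device == "mps":
        raise ValueError(f"MPS is not supported for {metric_name}")
    return device


print(f"selected device: {pick_device()}")
try:
    require_non_mps("BLEURT", "mps")
except ValueError as err:
    print(err)  # MPS is not supported for BLEURT
```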
--- community_tasks/swiss_legal_evals.py | 73 ++++++++++++++++++---------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index c7242df74..7ca161737 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -36,6 +36,7 @@ from dataclasses import dataclass import nltk +import torch from comet import download_model, load_from_checkpoint from nltk import word_tokenize from nltk.translate import meteor_score @@ -58,6 +59,8 @@ from lighteval.tasks.requests import Doc +device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu") + # CUSTOM METRICS @@ -171,10 +174,10 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): ) -swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") - - -def get_bert_score(model_type: str = "xlm-roberta-large"): +def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): + if device == "mps": + raise ValueError("MPS is not supported for BERTScore") + print(f"Loading BERTScore with model_type={model_type}, and device={device}...") score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( # We could download the files from here and set the baseline_path ourselves: @@ -183,6 +186,7 @@ def get_bert_score(model_type: str = "xlm-roberta-large"): lang=None, # Needs to be set if rescale_with_baseline is True rescale_with_baseline=False, baseline_path=None, + device=device, ) return SampleLevelMetricGrouping( metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], @@ -202,11 +206,13 @@ def get_bert_score(model_type: str = "xlm-roberta-large"): ) -bert_score = get_bert_score(model_type="xlm-roberta-large") - - class BLEURT: - def __init__(self, model_size: str = "tiny", seq_len: int = 512): + def __init__( + self, + model_size: str = "tiny", + seq_len: int = 512, + device: str = "cpu", + ): """Creates a BLEURT scorer based on the model size (tiny, base, large) and sequence length (128, 512).""" assert model_size in [ "tiny", @@ -214,11 +220,14 @@ def __init__(self, model_size: str = "tiny", seq_len: int = 512): "large", ], "Model size must be either tiny, base, or large" assert seq_len in [128, 512], "Sequence length must be either 128 or 512" + if device == "mps": + raise ValueError("MPS is not supported for BLEURT") self.tokenizer = AutoTokenizer.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") self.model = AutoModelForSequenceClassification.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") - self.max_length = seq_len + self.model = self.model.to(device) self.model.eval() + self.max_length = seq_len def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: """Uses the stored BLEURT scorer to compute the score on the current sample. 
@@ -246,10 +255,11 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return scores.item() -def get_bleurt(model_size: str = "tiny", seq_len: int = 512): +def get_bleurt(model_size: str = "tiny", seq_len: int = 512, device: str = "cpu"): + print(f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, and device={device}...") return SampleLevelMetric( metric_name=f"bleurt_{model_size}", - sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, + sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len, device=device).compute, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, corpus_level_fn=statistics.mean, @@ -257,19 +267,16 @@ def get_bleurt(model_size: str = "tiny", seq_len: int = 512): ) -# bleurt_tiny = get_bleurt(model_size="tiny", seq_len=512) -# bleurt_base = get_bleurt(model_size="base", seq_len=512) -bleurt_large = get_bleurt(model_size="large", seq_len=512) - - class COMET: def __init__( self, model_name: str = "Unbabel/wmt22-comet-da", batch_size: int = 1, gpus: int = 1, - accelerator: str = "cpu", # "mps" leads to errors + accelerator: str = "cpu", ): + if accelerator == "mps": + raise ValueError("MPS is not supported for COMET") model_path = download_model(model_name) self.model = load_from_checkpoint(model_path) self.batch_size = batch_size @@ -299,15 +306,16 @@ def get_comet( model_name: str = "Unbabel/wmt22-comet-da", batch_size: int = 1, gpus: int = 1, - accelerator: str = "cpu", + device: str = "cpu", ): + print(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") return SampleLevelMetric( metric_name=model_name.split("/")[-1], sample_level_fn=COMET( model_name=model_name, batch_size=batch_size, gpus=gpus, - accelerator=accelerator, + accelerator=device, ).compute, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, @@ -316,12 +324,6 @@ def get_comet( ) -# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. -comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=1, gpus=1, accelerator="cpu") -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=1, gpus=1, accelerator="cpu") -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=1, gpus=1, accelerator="cpu") - - class METEOR: def __init__(self, alpha=0.9, beta=3, gamma=0.5): self.alpha = alpha @@ -508,6 +510,25 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn +bert_score = get_bert_score(model_type="xlm-roberta-large", device="cpu") + +# Only take the largest version +bleurt_large = get_bleurt(model_size="large", seq_len=512, device="cpu") + +# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. 
+comet_wmt22_da = get_comet( + model_name="Unbabel/wmt22-comet-da", + batch_size=1, + gpus=1, + device="cpu", +) +xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=1, gpus=1, device="cpu") +# XXL is likely too large for local evaluation +# xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=1, gpus=1, device="cpu") + +swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") + + class TranslationTask(LightevalTaskConfig): def __init__( self, @@ -538,7 +559,7 @@ def __init__( bert_score, bleurt_large, comet_wmt22_da, - xcomet_xxl, + xcomet_xl, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From ee2a83c0245b5e8ee4715b8263c64034b5230f7e Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 25 Nov 2024 13:31:56 +0100 Subject: [PATCH 14/57] Fixed some bugs for evaluation on GPUs. --- community_tasks/swiss_legal_evals.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 7ca161737..05814a526 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -59,7 +59,7 @@ from lighteval.tasks.requests import Doc -device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu") +device = "cuda" if torch.cuda.is_available() else "cpu" # CUSTOM METRICS @@ -249,6 +249,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: truncation=True, max_length=self.max_length, ) + inputs = {k: v.to(self.model.device) for k, v in inputs.items()} if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") scores = self.model(**inputs)[0].squeeze() @@ -510,21 +511,20 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn -bert_score = get_bert_score(model_type="xlm-roberta-large", device="cpu") +bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) # Only take the largest version -bleurt_large = get_bleurt(model_size="large", seq_len=512, device="cpu") +bleurt_large = get_bleurt(model_size="large", seq_len=512, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. 
comet_wmt22_da = get_comet( model_name="Unbabel/wmt22-comet-da", - batch_size=1, + batch_size=32, gpus=1, - device="cpu", + device=device, ) -xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=1, gpus=1, device="cpu") -# XXL is likely too large for local evaluation -# xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=1, gpus=1, device="cpu") +xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=8, gpus=1, device=device) +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -560,6 +560,7 @@ def __init__( bleurt_large, comet_wmt22_da, xcomet_xl, + xcomet_xxl, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From 36b7e943613931782573a8ca5ea04ce8836da6ab Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 26 Nov 2024 11:02:30 +0100 Subject: [PATCH 15/57] Added batch inference for heavy metrics and multiplied each score by 100. --- community_tasks/swiss_legal_evals.py | 178 ++++++++++++++------------- 1 file changed, 90 insertions(+), 88 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 05814a526..205fad36b 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -161,16 +161,17 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): higher_is_better={name: True}, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.TRANSLATION, - sample_level_fn=JudgeLLMMixEval( - judge_model_name=judge_model_name, - template=swiss_legal_translation_judge, - process_judge_response=process_judge_response_freeform_gpt, - judge_backend="openai", - short_judge_name=judge_model_name, - ).compute, - corpus_level_fn={ - name: statistics.mean, - }, + sample_level_fn=lambda *args, **kwargs: [ + {k: v * 100 if k == f"judge_score_{judge_model_name}" else v for k, v in score_dict.items()} + for score_dict in JudgeLLMMixEval( + judge_model_name=judge_model_name, + template=swiss_legal_translation_judge, + process_judge_response=process_judge_response_freeform_gpt, + judge_backend="openai", + short_judge_name=judge_model_name, + ).compute(*args, **kwargs) + ], + corpus_level_fn={name: statistics.mean}, ) @@ -188,21 +189,22 @@ def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): baseline_path=None, device=device, ) + return SampleLevelMetricGrouping( metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], - sample_level_fn=score.compute, + higher_is_better={ + "BERTScore-P": True, + "BERTScore-R": True, + "BERTScore-F": True, + }, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, + sample_level_fn=lambda *args, **kwargs: {k: v * 100 for k, v in score.compute(*args, **kwargs).items()}, corpus_level_fn={ "BERTScore-P": statistics.mean, "BERTScore-R": statistics.mean, "BERTScore-F": statistics.mean, }, - higher_is_better={ - "BERTScore-P": True, - "BERTScore-R": True, - "BERTScore-F": True, - }, ) @@ -211,6 +213,7 @@ def __init__( self, model_size: str = "tiny", seq_len: int = 512, + batch_size: int = 32, device: str = "cpu", ): """Creates a BLEURT scorer based on the model size (tiny, base, large) and sequence length (128, 512).""" @@ -223,48 +226,52 @@ def __init__( if device == "mps": raise ValueError("MPS is not supported for BLEURT") 
+ self.metric_name = f"bleurt_{model_size}" self.tokenizer = AutoTokenizer.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") self.model = AutoModelForSequenceClassification.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") self.model = self.model.to(device) self.model.eval() self.max_length = seq_len + self.batch_size = batch_size - def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: - """Uses the stored BLEURT scorer to compute the score on the current sample. - - Args: - golds (list[str]): Reference targets - predictions (list[str]): Predicted strings - - Returns: - float: Score over the current sample's items. - """ - if len(predictions) == 1: - predictions = predictions * len(golds) - inputs = self.tokenizer( - golds, - predictions, - return_tensors="pt", - padding=True, - truncation=True, - max_length=self.max_length, - ) - inputs = {k: v.to(self.model.device) for k, v in inputs.items()} - if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): - hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") - scores = self.model(**inputs)[0].squeeze() - return scores.item() - - -def get_bleurt(model_size: str = "tiny", seq_len: int = 512, device: str = "cpu"): - print(f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, and device={device}...") - return SampleLevelMetric( - metric_name=f"bleurt_{model_size}", - sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len, device=device).compute, - category=MetricCategory.GENERATIVE, + def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] + predictions = [response[0].result[0] for response in responses] + + all_scores = [] + for i in range(0, len(golds), self.batch_size): + batch_golds = golds[i : i + self.batch_size] + batch_predictions = predictions[i : i + self.batch_size] + + inputs = self.tokenizer( + batch_golds, + batch_predictions, + return_tensors="pt", + padding=True, + truncation=True, + max_length=self.max_length, + ) + inputs = {k: v.to(self.model.device) for k, v in inputs.items()} + if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): + hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") + with torch.no_grad(): + all_scores.extend(self.model(**inputs)[0].squeeze().tolist()) + + return [{self.metric_name: score * 100} for score in all_scores] + + +def get_bleurt(model_size: str = "tiny", seq_len: int = 512, batch_size: int = 32, device: str = "cpu"): + print( + f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, batch_size={batch_size}, and device={device}..." 
+ ) + name = f"bleurt_{model_size}" + return SampleLevelMetricGrouping( + metric_name=[name], + higher_is_better={name: True}, + category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, + sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len, batch_size=batch_size, device=device).compute, + corpus_level_fn={name: statistics.mean}, ) @@ -272,56 +279,55 @@ class COMET: def __init__( self, model_name: str = "Unbabel/wmt22-comet-da", - batch_size: int = 1, + batch_size: int = 8, gpus: int = 1, accelerator: str = "cpu", ): if accelerator == "mps": raise ValueError("MPS is not supported for COMET") - model_path = download_model(model_name) - self.model = load_from_checkpoint(model_path) + + self.metric_name = model_name.split("/")[-1] + self.model = load_from_checkpoint(download_model(model_name)) self.batch_size = batch_size self.gpus = gpus self.accelerator = accelerator - def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: - data = [ - {"src": src, "mt": pred, "ref": gold} - for src, pred, gold in zip( - [kwargs["formatted_doc"].specific["source"]] * len(predictions), - predictions, - golds, - ) - ] + def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] + predictions = [response[0].result[0] for response in responses] + sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] + + data = [{"src": src, "mt": pred, "ref": gold} for src, pred, gold in zip(sources, predictions, golds)] model_output = self.model.predict( data, batch_size=self.batch_size, gpus=self.gpus, accelerator=self.accelerator, ) - # model_output["scores"] contains the sentence level scores - return model_output["system_score"] + + return [{self.metric_name: score * 100} for score in model_output["scores"]] def get_comet( model_name: str = "Unbabel/wmt22-comet-da", - batch_size: int = 1, + batch_size: int = 8, gpus: int = 1, device: str = "cpu", ): print(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") - return SampleLevelMetric( - metric_name=model_name.split("/")[-1], + name = model_name.split("/")[-1] + return SampleLevelMetricGrouping( + metric_name=[name], + higher_is_better={name: True}, + category=MetricCategory.LLM_AS_JUDGE, + use_case=MetricUseCase.TRANSLATION, sample_level_fn=COMET( model_name=model_name, batch_size=batch_size, gpus=gpus, accelerator=device, ).compute, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, + corpus_level_fn={name: statistics.mean}, ) @@ -360,18 +366,19 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: for ref, pred in zip(golds, predictions) ] - return statistics.mean(scores) + return statistics.mean(scores) * 100 meteor = SampleLevelMetric( metric_name="meteor", - sample_level_fn=METEOR().compute, + higher_is_better=True, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, + sample_level_fn=METEOR().compute, corpus_level_fn=statistics.mean, - higher_is_better=True, ) + # EVALS WITH SUBSET # This is how you create a subset task (like MMLU), which has several subset # each being its own evaluation task. 
@@ -517,14 +524,9 @@ def prompt_fn(line: dict, task_name: str = None): bleurt_large = get_bleurt(model_size="large", seq_len=512, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. -comet_wmt22_da = get_comet( - model_name="Unbabel/wmt22-comet-da", - batch_size=32, - gpus=1, - device=device, -) -xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=8, gpus=1, device=device) -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) +comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=32, gpus=1, device=device) +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=8, gpus=1, device=device) +# xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -546,9 +548,9 @@ def __init__( hf_subset=level_name, hf_filter=None, hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], # ["validation", "test"], + evaluation_splits=["test"], few_shots_split="validation", - few_shots_select=None, + few_shots_select=None, # TODO: add few-shot selection generation_size=level_config.generation_size, metric=[ Metrics.bleu, @@ -559,14 +561,14 @@ def __init__( bert_score, bleurt_large, comet_wmt22_da, - xcomet_xl, - xcomet_xxl, + # xcomet_xl, + # xcomet_xxl, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. ], - stop_sequence=["\n"], + stop_sequence=["\n"], # TODO: Debug why this is not working for litellm inference trust_dataset=True, ) From 5ba218f8f1f08f073fff0a9d4fdd641ff7c35431 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 26 Nov 2024 14:04:00 +0100 Subject: [PATCH 16/57] Added few shot examples and did some refactoring. 
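For context on the diff below: the prompt function now builds queries in the "Template A" layout, with a trailing space after the target-language tag so few-shot examples and the model continuation line up. A minimal sketch, not part of the patch:

```python
# Illustrative sketch (not part of this patch) of the custom_query layout used below.
def build_query(src_lang: str, target_lang: str, src_text: str) -> str:
    return f"{src_lang.upper()}: {src_text}\n{target_lang.upper()}: "


print(build_query("de", "fr", "Die Beschwerde wird abgewiesen."))
# DE: Die Beschwerde wird abgewiesen.
# FR:
```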
--- community_tasks/swiss_legal_evals.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 205fad36b..032880771 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -501,7 +501,7 @@ def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str) def prompt_fn(line: dict, task_name: str = None): # Following Template A from https://github.com/huggingface/lighteval/pull/389#issuecomment-2471580177 - custom_query = f"{src_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}:" + custom_query = f"{src_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}: " return Doc( task_name=task_name, @@ -536,21 +536,21 @@ def __init__( self, dataset_config: DatasetConfig, level_name: str, - src_lang: str, + source_lang: str, target_lang: str, ): level_config = dataset_config.subsets[level_name] super().__init__( - name=f"{dataset_config.name}-{level_name}:{src_lang}-{target_lang}", + name=f"{dataset_config.name}-{level_name}:{source_lang}-{target_lang}", suite=["community"], - prompt_function=create_prompt_fn(level_config, src_lang, target_lang), + prompt_function=create_prompt_fn(level_config, source_lang, target_lang), hf_repo=dataset_config.hf_repo, hf_subset=level_name, hf_filter=None, hf_avail_splits=["train", "validation", "test"], evaluation_splits=["test"], few_shots_split="validation", - few_shots_select=None, # TODO: add few-shot selection + few_shots_select="sequential", generation_size=level_config.generation_size, metric=[ Metrics.bleu, @@ -560,16 +560,19 @@ def __init__( meteor, bert_score, bleurt_large, - comet_wmt22_da, + comet_wmt22_da, # TODO: debug why this is not saved in the details # xcomet_xl, # xcomet_xxl, - swiss_legal_translation_judge_gpt_4o, + swiss_legal_translation_judge_gpt_4o, # TODO: debug why this is not showing up in the results # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. ], - stop_sequence=["\n"], # TODO: Debug why this is not working for litellm inference + stop_sequence=[".\n"], # just "\n" leads to problems for anthropic models trust_dataset=True, + # Remove the target language in the beginning if it exists: e.g., FR: {translation} + # Is only applied to the generative metrics, but also there seems not to be invoked, maybe not passed through? + # output_regex=f"(?:{target_lang.upper()}:\s*?)?(.*)", ) @@ -586,12 +589,12 @@ def __init__( TranslationTask( dataset_config=dataset, level_name=subset, - src_lang=src_lang, + source_lang=source_lang, target_lang=target_lang, ) for dataset in DATASETS for subset in dataset.subsets - for src_lang, target_lang in dataset.translation_pairs + for source_lang, target_lang in dataset.translation_pairs ] From 576b847b02e1f44a72f3c497315da7a04c9a2025 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 26 Nov 2024 14:48:21 +0100 Subject: [PATCH 17/57] Switched to an own judge class. 
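For context on the diff below: the new judge class scores a whole batch at once and returns one dict per sample, rescaling the judge score to 0-100 and keeping the free-text judgment next to it. A minimal sketch of that result shape, not part of the patch:

```python
# Illustrative sketch (not part of this patch) of the per-sample results the
# new judge class emits.
def pack_judge_results(short_judge_name: str, scores: list[float], judgments: list[str]) -> list[dict]:
    return [
        {
            f"judge_score_{short_judge_name}": score * 100,
            f"judgement_{short_judge_name}": judgment,
        }
        for score, judgment in zip(scores, judgments)
    ]


print(pack_judge_results("gpt-4o", [0.7, 0.2], ["minor terminology drift", "major omissions"]))
```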
--- community_tasks/swiss_legal_evals.py | 37 +++++++++++++++++++--------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 032880771..22d3421fe 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -46,7 +46,7 @@ from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics -from lighteval.metrics.metrics_sample import BertScore, JudgeLLMMixEval +from lighteval.metrics.metrics_sample import BertScore, JudgeLLM from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip from lighteval.metrics.utils.metric_utils import ( MetricCategory, @@ -154,6 +154,24 @@ def swiss_legal_translation_judge(question, options, answer, gold): ] +class JudgeSwissLegalTranslation(JudgeLLM): + def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + """ + Compute the score of a generative task using a llm as a judge. + """ + questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] + options = [formatted_doc.choices for formatted_doc in formatted_docs] + golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] + predictions = [response[0].result[0] for response in responses] + + scores, _, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) + # Exclude the messages (user prompt) because they are too long + return [ + {f"judge_score_{self.short_judge_name}": score * 100, f"judgement_{self.short_judge_name}": judgment} + for score, judgment in zip(scores, judgements) + ] + + def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): name = f"swiss_legal_translation_judge_{judge_model_name}" return SampleLevelMetricGrouping( @@ -161,16 +179,13 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): higher_is_better={name: True}, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.TRANSLATION, - sample_level_fn=lambda *args, **kwargs: [ - {k: v * 100 if k == f"judge_score_{judge_model_name}" else v for k, v in score_dict.items()} - for score_dict in JudgeLLMMixEval( - judge_model_name=judge_model_name, - template=swiss_legal_translation_judge, - process_judge_response=process_judge_response_freeform_gpt, - judge_backend="openai", - short_judge_name=judge_model_name, - ).compute(*args, **kwargs) - ], + sample_level_fn=JudgeSwissLegalTranslation( + judge_model_name=judge_model_name, + template=swiss_legal_translation_judge, + process_judge_response=process_judge_response_freeform_gpt, + judge_backend="openai", + short_judge_name=judge_model_name, + ).compute, corpus_level_fn={name: statistics.mean}, ) From 41bb59aeb3698f84398231a680be9d5b8c29dcd8 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 26 Nov 2024 16:48:39 +0100 Subject: [PATCH 18/57] Fixed issue with judge metric not showing up in results. 
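One plausible reading of the fix below, sketched here rather than taken from the patch: the sample-level dict keys now reuse the short judge name that is also declared as the metric name, so an aggregation step keyed on metric names actually finds the scores. A toy illustration using only the standard library:

```python
# Illustrative sketch (not part of this patch): scores stored under a key that
# differs from the declared metric name never reach the aggregated results.
import statistics


def aggregate(metric_names: list[str], per_sample: list[dict]) -> dict:
    return {
        name: statistics.mean(sample[name] for sample in per_sample)
        for name in metric_names
        if all(name in sample for sample in per_sample)
    }


declared = ["slt_judge_gpt-4o"]
mismatched = [{"judge_score_gpt-4o": 70.0}, {"judge_score_gpt-4o": 20.0}]
matching = [{"slt_judge_gpt-4o": 70.0}, {"slt_judge_gpt-4o": 20.0}]
print(aggregate(declared, mismatched))  # {} -- the judge score never surfaces
print(aggregate(declared, matching))    # {'slt_judge_gpt-4o': 45.0}
```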
--- community_tasks/swiss_legal_evals.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 22d3421fe..70aa1e4b8 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -167,13 +167,13 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D scores, _, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) # Exclude the messages (user prompt) because they are too long return [ - {f"judge_score_{self.short_judge_name}": score * 100, f"judgement_{self.short_judge_name}": judgment} + {self.short_judge_name: score * 100, f"{self.short_judge_name}_judgment": judgment} for score, judgment in zip(scores, judgements) ] def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): - name = f"swiss_legal_translation_judge_{judge_model_name}" + name = f"slt_judge_{judge_model_name}" return SampleLevelMetricGrouping( metric_name=[name], higher_is_better={name: True}, @@ -184,7 +184,7 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): template=swiss_legal_translation_judge, process_judge_response=process_judge_response_freeform_gpt, judge_backend="openai", - short_judge_name=judge_model_name, + short_judge_name=name, ).compute, corpus_level_fn={name: statistics.mean}, ) @@ -536,12 +536,12 @@ def prompt_fn(line: dict, task_name: str = None): bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) # Only take the largest version -bleurt_large = get_bleurt(model_size="large", seq_len=512, device=device) +bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=64, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. -comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=32, gpus=1, device=device) -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=8, gpus=1, device=device) -# xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) +comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) +xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=16, gpus=1, device=device) +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -573,12 +573,12 @@ def __init__( Metrics.chrf, Metrics.ter, meteor, - bert_score, + bert_score, # TODO: think about allowing parallelization as well if slow bleurt_large, - comet_wmt22_da, # TODO: debug why this is not saved in the details - # xcomet_xl, - # xcomet_xxl, - swiss_legal_translation_judge_gpt_4o, # TODO: debug why this is not showing up in the results + comet_wmt22_da, + xcomet_xl, + xcomet_xxl, + swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. From d82cd91ac4b1b32f8b9c295e417311cad1027c3c Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 27 Nov 2024 12:15:45 +0000 Subject: [PATCH 19/57] Fixed issue with evaluation on GPUs. 
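For context on the diff below: the BLEURT logits are now explicitly moved to the CPU before being converted to a Python list; `.cpu()` is a no-op for tensors that already live on the CPU, so the same chain works with and without a GPU. A minimal sketch, not part of the patch, assuming only `torch`:

```python
# Illustrative sketch (not part of this patch) of the squeeze -> cpu -> tolist
# chain used for BLEURT scores below.
import torch

with torch.no_grad():
    scores = torch.tensor([[0.5], [0.25]])
    if torch.cuda.is_available():
        scores = scores.to("cuda")
    print(scores.squeeze().cpu().tolist())  # [0.5, 0.25]
```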
--- community_tasks/swiss_legal_evals.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 70aa1e4b8..f370c9b14 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -270,7 +270,7 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") with torch.no_grad(): - all_scores.extend(self.model(**inputs)[0].squeeze().tolist()) + all_scores.extend(self.model(**inputs)[0].squeeze().cpu().tolist()) return [{self.metric_name: score * 100} for score in all_scores] @@ -539,8 +539,8 @@ def prompt_fn(line: dict, task_name: str = None): bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=64, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. -comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) -xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=16, gpus=1, device=device) +# comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=16, gpus=1, device=device) xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -568,15 +568,14 @@ def __init__( few_shots_select="sequential", generation_size=level_config.generation_size, metric=[ - Metrics.bleu, - # Metrics.bleu_4, + Metrics.bleu, # Metrics.bleu_4, Metrics.chrf, Metrics.ter, meteor, bert_score, # TODO: think about allowing parallelization as well if slow bleurt_large, - comet_wmt22_da, - xcomet_xl, + # comet_wmt22_da, + # xcomet_xl, xcomet_xxl, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: From 1b13d9fcd011c12e2907648158838b42cb74dc00 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 27 Nov 2024 17:13:52 +0000 Subject: [PATCH 20/57] Speed up metric computation on GPUs. --- community_tasks/swiss_legal_evals.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f370c9b14..f86c563c3 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -61,6 +61,15 @@ device = "cuda" if torch.cuda.is_available() else "cpu" +# Try to optimize CUDA operations +if device == "cuda": + torch.backends.cudnn.benchmark = True # Enable cudnn auto-tuner + torch.backends.cuda.matmul.allow_tf32 = True # Enable TF32 for faster matrix multiplications + # Enable tensor cores if available + if torch.cuda.get_device_capability()[0] >= 7: + # This will speed up GPU inference, e.g., for COMET and BLEURT + torch.set_float32_matmul_precision('medium') + # CUSTOM METRICS From df0f3f02f0aca35cbc01e20af3d239d7f4c586c3 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 27 Nov 2024 17:15:40 +0000 Subject: [PATCH 21/57] Added more logging. 
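For context on the diff below: the heavy metrics now announce how many samples they are about to score through lighteval's hierarchical logger (the same `hlog` the diff imports), so long GPU scoring passes are visible in the run output. A minimal sketch of that pattern, not part of the patch, assuming lighteval is installed:

```python
# Illustrative sketch (not part of this patch) of the progress logging added below.
from lighteval.logging.hierarchical_logger import hlog


def announce_scoring(metric_name: str, n_samples: int) -> None:
    hlog(f"Scoring {n_samples} samples with {metric_name}...")


announce_scoring("bleurt_large", 128)
```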
--- community_tasks/swiss_legal_evals.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f86c563c3..4cea5ba94 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -43,7 +43,7 @@ from packaging import version from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog_warn +from lighteval.logging.hierarchical_logger import hlog_warn, hlog from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics from lighteval.metrics.metrics_sample import BertScore, JudgeLLM @@ -165,9 +165,7 @@ def swiss_legal_translation_judge(question, options, answer, gold): class JudgeSwissLegalTranslation(JudgeLLM): def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: - """ - Compute the score of a generative task using a llm as a judge. - """ + hlog(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] @@ -202,7 +200,7 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") - print(f"Loading BERTScore with model_type={model_type}, and device={device}...") + hlog(f"Loading BERTScore with model_type={model_type}, and device={device}...") score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( # We could download the files from here and set the baseline_path ourselves: @@ -259,6 +257,7 @@ def __init__( self.batch_size = batch_size def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] @@ -285,7 +284,7 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D def get_bleurt(model_size: str = "tiny", seq_len: int = 512, batch_size: int = 32, device: str = "cpu"): - print( + hlog( f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, batch_size={batch_size}, and device={device}..." 
) name = f"bleurt_{model_size}" @@ -317,6 +316,7 @@ def __init__( self.accelerator = accelerator def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] @@ -337,8 +337,8 @@ def get_comet( batch_size: int = 8, gpus: int = 1, device: str = "cpu", -): - print(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") +): + hlog(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") name = model_name.split("/")[-1] return SampleLevelMetricGrouping( metric_name=[name], @@ -541,16 +541,16 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn - +# INFO: Batch sizes are optimized for an 80GB NVIDIA A100 GPU bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) # Only take the largest version -bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=64, device=device) +bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. # comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=16, gpus=1, device=device) -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -583,9 +583,7 @@ def __init__( meteor, bert_score, # TODO: think about allowing parallelization as well if slow bleurt_large, - # comet_wmt22_da, - # xcomet_xl, - xcomet_xxl, + xcomet_xxl, # xcomet_xl, comet_wmt22_da swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From 980c25712eeff0276922a1f805fe941c3a669ce9 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 28 Nov 2024 14:53:47 +0100 Subject: [PATCH 22/57] Switched to sample level scores for faster evaluation. 
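For context on the diff below: every heavy metric now returns one score per sample, scaled to 0-100, and the corpus-level number is simply the mean of the per-sample values. A toy stand-in (not part of the patch, and not a real metric) that shows the shape of that contract:

```python
# Illustrative sketch (not part of this patch): a toy per-sample metric standing
# in for BLEURT/COMET, plus the mean-based corpus aggregation.
import statistics


def toy_sample_level_metric(golds: list[str], predictions: list[str]) -> list[dict]:
    results = []
    for gold, pred in zip(golds, predictions):
        gold_words, pred_words = set(gold.split()), set(pred.split())
        overlap = len(gold_words & pred_words) / max(len(gold_words), 1)
        results.append({"toy_metric": overlap * 100})
    return results


per_sample = toy_sample_level_metric(
    ["le recours est rejeté", "die Beschwerde wird abgewiesen"],
    ["le recours est rejeté", "die Klage wird gutgeheissen"],
)
corpus_level = {"toy_metric": statistics.mean(r["toy_metric"] for r in per_sample)}
print(per_sample)    # [{'toy_metric': 100.0}, {'toy_metric': 50.0}]
print(corpus_level)  # {'toy_metric': 75.0}
```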
--- community_tasks/swiss_legal_evals.py | 190 +++++++++++++++++++++++---- 1 file changed, 163 insertions(+), 27 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 4cea5ba94..42f515746 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -40,10 +40,12 @@ from comet import download_model, load_from_checkpoint from nltk import word_tokenize from nltk.translate import meteor_score +from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu +from nltk.translate.chrf_score import sentence_chrf from packaging import version from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog_warn, hlog +from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics from lighteval.metrics.metrics_sample import BertScore, JudgeLLM @@ -64,11 +66,12 @@ # Try to optimize CUDA operations if device == "cuda": torch.backends.cudnn.benchmark = True # Enable cudnn auto-tuner - torch.backends.cuda.matmul.allow_tf32 = True # Enable TF32 for faster matrix multiplications + # Enable TF32 for faster matrix multiplications + torch.backends.cuda.matmul.allow_tf32 = True # Enable tensor cores if available if torch.cuda.get_device_capability()[0] >= 7: # This will speed up GPU inference, e.g., for COMET and BLEURT - torch.set_float32_matmul_precision('medium') + torch.set_float32_matmul_precision("medium") # CUSTOM METRICS @@ -164,7 +167,13 @@ def swiss_legal_translation_judge(question, options, answer, gold): class JudgeSwissLegalTranslation(JudgeLLM): - def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + def compute( + self, + sample_ids: list[str], + responses: list, + formatted_docs: list[Doc], + **kwargs, + ) -> dict[str, float]: hlog(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] @@ -174,7 +183,10 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D scores, _, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) # Exclude the messages (user prompt) because they are too long return [ - {self.short_judge_name: score * 100, f"{self.short_judge_name}_judgment": judgment} + { + self.short_judge_name: score * 100, + f"{self.short_judge_name}_judgment": judgment, + } for score, judgment in zip(scores, judgements) ] @@ -197,6 +209,9 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): ) +swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") + + def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") @@ -230,6 +245,10 @@ def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): ) +# INFO: Batch sizes are optimized for an 80GB NVIDIA A100 GPU +bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) + + class BLEURT: def __init__( self, @@ -256,7 +275,13 @@ def __init__( self.max_length = seq_len self.batch_size = batch_size - def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> 
dict[str, float]: + def compute( + self, + sample_ids: list[str], + responses: list, + formatted_docs: list[Doc], + **kwargs, + ) -> dict[str, float]: hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] @@ -283,7 +308,12 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D return [{self.metric_name: score * 100} for score in all_scores] -def get_bleurt(model_size: str = "tiny", seq_len: int = 512, batch_size: int = 32, device: str = "cpu"): +def get_bleurt( + model_size: str = "tiny", + seq_len: int = 512, + batch_size: int = 32, + device: str = "cpu", +): hlog( f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, batch_size={batch_size}, and device={device}..." ) @@ -298,6 +328,10 @@ def get_bleurt(model_size: str = "tiny", seq_len: int = 512, batch_size: int = 3 ) +# Only take the largest version +bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) + + class COMET: def __init__( self, @@ -315,7 +349,13 @@ def __init__( self.gpus = gpus self.accelerator = accelerator - def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + def compute( + self, + sample_ids: list[str], + responses: list, + formatted_docs: list[Doc], + **kwargs, + ) -> dict[str, float]: hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] @@ -337,7 +377,7 @@ def get_comet( batch_size: int = 8, gpus: int = 1, device: str = "cpu", -): +): hlog(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") name = model_name.split("/")[-1] return SampleLevelMetricGrouping( @@ -355,6 +395,12 @@ def get_comet( ) +# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. +# comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) + + class METEOR: def __init__(self, alpha=0.9, beta=3, gamma=0.5): self.alpha = alpha @@ -403,6 +449,110 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: ) +class BLEU: + def __init__( + self, + weights=(0.25, 0.25, 0.25, 0.25), + smoothing_function=None, + auto_reweigh=False, + ): + """ + Initialize BLEU scorer with specified n-gram weights. + Default weights are for BLEU-4 (equal weights for 1-4 grams). + + Args: + weights: Tuple of weights for unigrams through 4-grams + smoothing_function: Optional smoothing function for BLEU computation + auto_reweigh: Whether to automatically reweigh the scores based on reference length + """ + self.weights = weights + self.smoothing_function = smoothing_function or SmoothingFunction().method1 + self.auto_reweigh = auto_reweigh + + # Ensure NLTK data is downloaded + nltk.download("punkt") + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + """ + Compute BLEU score for a list of predictions against their references. 
+ + Args: + golds: List of reference strings + predictions: List of prediction strings + + Returns: + Mean BLEU score scaled to 0-100 + """ + scores = [] + for ref, pred in zip(golds, predictions): + # Tokenize the reference and prediction + reference = [word_tokenize(ref)] + hypothesis = word_tokenize(pred) + + # Calculate BLEU score for this pair + score = sentence_bleu( + references=reference, + hypothesis=hypothesis, + weights=self.weights, + smoothing_function=self.smoothing_function, + auto_reweigh=self.auto_reweigh, + ) + scores.append(score) + + return statistics.mean(scores) * 100 + + +bleu = SampleLevelMetric( + metric_name="bleu", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=BLEU().compute, + corpus_level_fn=statistics.mean, +) + + +class CHRF: + def __init__(self, beta: float = 3.0, max_len: int = 6, min_len: int = 1): + """ + Initialize chrF scorer with specified parameters. + beta: Weight of recall vs precision (default: 3.0) + max_len: Maximum n-gram order (default: 6) + min_len: Minimum n-gram order (default: 1) + """ + self.beta = beta + self.max_len = max_len + self.min_len = min_len + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + """ + Compute chrF score for a list of predictions against their references. + """ + scores = [] + for ref, pred in zip(golds, predictions): + score = sentence_chrf( + ref, + pred, + min_len=self.min_len, + max_len=self.max_len, + beta=self.beta, + ignore_whitespace=True, + ) + scores.append(score) + + return statistics.mean(scores) * 100 + + +chrf = SampleLevelMetric( + metric_name="chrf", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=CHRF().compute, + corpus_level_fn=statistics.mean, +) + + # EVALS WITH SUBSET # This is how you create a subset task (like MMLU), which has several subset # each being its own evaluation task. @@ -541,19 +691,6 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn -# INFO: Batch sizes are optimized for an 80GB NVIDIA A100 GPU -bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) - -# Only take the largest version -bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) - -# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. 
-# comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) - -swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") - class TranslationTask(LightevalTaskConfig): def __init__( @@ -577,13 +714,12 @@ def __init__( few_shots_select="sequential", generation_size=level_config.generation_size, metric=[ - Metrics.bleu, # Metrics.bleu_4, - Metrics.chrf, - Metrics.ter, + bleu, # Use sample level BLEU for faster evaluation + chrf, # Use sample level chrF for faster evaluation meteor, - bert_score, # TODO: think about allowing parallelization as well if slow + bert_score, bleurt_large, - xcomet_xxl, # xcomet_xl, comet_wmt22_da + xcomet_xxl, # Just use one, disregarding xcomet_xl, comet_wmt22_da swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From 9a60dc0faf8aca6706a7c110bd8898564aed3640 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 29 Nov 2024 17:42:38 +0100 Subject: [PATCH 23/57] Added rescale_with_baseline for BERTScore for better differentiation. --- community_tasks/swiss_legal_evals.py | 37 +++++++++++++++++++++------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 42f515746..41f017ba4 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -32,10 +32,12 @@ """ import importlib.metadata as importlib_metadata +import os import statistics from dataclasses import dataclass import nltk +import requests import torch from comet import download_model, load_from_checkpoint from nltk import word_tokenize @@ -212,21 +214,42 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") -def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): +def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") - hlog(f"Loading BERTScore with model_type={model_type}, and device={device}...") + hlog( + f"Loading BERTScore with lang={language}, num_layers={num_layers}, model_type={model_type}, and device={device}..." 
+ ) score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( # We could download the files from here and set the baseline_path ourselves: # https://github.com/Tiiiger/bert_score/tree/master/bert_score/rescale_baseline model_type=model_type, - lang=None, # Needs to be set if rescale_with_baseline is True - rescale_with_baseline=False, + lang=language, # Needs to be set if rescale_with_baseline is True + num_layers=num_layers, # Needs to be set if rescale_with_baseline is True + rescale_with_baseline=True, baseline_path=None, device=device, ) + if language == "rm": + language = "it" + hlog_warn("There is no BERTScore baseline file for Rumantsch, using Italian instead.") + + # Create directory structure if it doesn't exist + os.makedirs(os.path.dirname(score.bert_scorer.baseline_path), exist_ok=True) + + # Download the baseline file if it doesn't exist + if not os.path.exists(score.bert_scorer.baseline_path): + raw_url = f"https://raw.githubusercontent.com/Tiiiger/bert_score/master/bert_score/rescale_baseline/{language}/{model_type}.tsv" + hlog(f"Downloading BERTScore baseline file from {raw_url}") + response = requests.get(raw_url) + if response.status_code == 200: + with open(score.bert_scorer.baseline_path, "wb") as f: + f.write(response.content) + else: + raise RuntimeError(f"Failed to download baseline file from {raw_url}") + return SampleLevelMetricGrouping( metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], higher_is_better={ @@ -245,10 +268,6 @@ def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): ) -# INFO: Batch sizes are optimized for an 80GB NVIDIA A100 GPU -bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) - - class BLEURT: def __init__( self, @@ -717,7 +736,7 @@ def __init__( bleu, # Use sample level BLEU for faster evaluation chrf, # Use sample level chrF for faster evaluation meteor, - bert_score, + get_bert_score(language=target_lang, model_type="xlm-roberta-large", device=device), bleurt_large, xcomet_xxl, # Just use one, disregarding xcomet_xl, comet_wmt22_da swiss_legal_translation_judge_gpt_4o, From 819b949cb14e56121d22f5643681e7cd96330864 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 2 Dec 2024 11:41:28 +0100 Subject: [PATCH 24/57] Adapted metrics. 
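For context on the rescaling change above: bert_score's rescale_with_baseline does not alter the ranking of candidates, it only spreads the raw cosine-similarity scores, which tend to cluster near 1.0, over a wider range using the per-language baseline values from the downloaded TSV. A rough sketch of the idea (the baseline numbers below are illustrative, not taken from the actual files):

def rescale_with_baseline(raw_score: float, baseline: float) -> float:
    # Linear rescaling as described in the bert_score repository: the baseline is roughly
    # the score an unrelated sentence pair would get for this language and model.
    return (raw_score - baseline) / (1.0 - baseline)


# Two raw scores that look nearly identical become clearly distinguishable:
print(rescale_with_baseline(0.97, 0.85))  # ~0.80
print(rescale_with_baseline(0.92, 0.85))  # ~0.47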
--- community_tasks/swiss_legal_evals.py | 39 +++++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 41f017ba4..e1d35085b 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -268,6 +268,19 @@ def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-r ) +# Create BERTScore metrics for each language + + +bert_scores = { + lang: get_bert_score( + language=lang, + model_type="xlm-roberta-large", + device=device, + ) + for lang in ["de", "fr", "it", "rm", "en"] +} + + class BLEURT: def __init__( self, @@ -347,7 +360,6 @@ def get_bleurt( ) -# Only take the largest version bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) @@ -521,8 +533,8 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -bleu = SampleLevelMetric( - metric_name="bleu", +bleu_sentence = SampleLevelMetric( + metric_name="bleu_sentence", higher_is_better=True, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, @@ -562,8 +574,8 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -chrf = SampleLevelMetric( - metric_name="chrf", +chrf_sentence = SampleLevelMetric( + metric_name="chrf_sentence", higher_is_better=True, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, @@ -733,12 +745,19 @@ def __init__( few_shots_select="sequential", generation_size=level_config.generation_size, metric=[ - bleu, # Use sample level BLEU for faster evaluation - chrf, # Use sample level chrF for faster evaluation + # ===== Lexical metrics ===== + # Metrics.ter, # TER is a corpus level metric that is very slow in bootstrapping + bleu_sentence, # Use sample level BLEU for faster evaluation + Metrics.bleu, # Disable this if it is too slow + Metrics.bleu_1, + Metrics.bleu_4, + Metrics.chrf, # Disable this if it is too slow + chrf_sentence, # Use sample level chrF for faster evaluation meteor, - get_bert_score(language=target_lang, model_type="xlm-roberta-large", device=device), - bleurt_large, - xcomet_xxl, # Just use one, disregarding xcomet_xl, comet_wmt22_da + # ===== Model-based metrics ===== + bert_scores[target_lang], + bleurt_large, # Only take the largest version, disregarding base and tiny + xcomet_xxl, # Only take the largest version, disregarding xcomet_xl, comet_wmt22_da swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From e758316f7b906254936098e272f57575b04444ef Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 2 Dec 2024 16:11:37 +0100 Subject: [PATCH 25/57] Switched to sacrebleu implementation for sentence level translation metrics. 
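The commit below replaces the hand-rolled NLTK scorers with sacrebleu's sentence-level helpers. A short usage sketch of that API (the sentence pair is illustrative; sacrebleu reports BLEU, chrF and TER on a 0-100 scale):

from sacrebleu import sentence_bleu, sentence_chrf, sentence_ter

reference = "Der Vertrag ist nichtig."
hypothesis = "Der Vertrag ist ungültig."

print(sentence_bleu(hypothesis, [reference]).score)  # BLEU, higher is better
print(sentence_chrf(hypothesis, [reference]).score)  # chrF, higher is better
print(sentence_ter(hypothesis, [reference]).score)   # TER, lower is better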
--- community_tasks/swiss_legal_evals.py | 99 ++++++++++------------------ 1 file changed, 33 insertions(+), 66 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index e1d35085b..ca10037b9 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -42,9 +42,8 @@ from comet import download_model, load_from_checkpoint from nltk import word_tokenize from nltk.translate import meteor_score -from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu -from nltk.translate.chrf_score import sentence_chrf from packaging import version +from sacrebleu import sentence_bleu, sentence_chrf, sentence_ter from transformers import AutoModelForSequenceClassification, AutoTokenizer from lighteval.logging.hierarchical_logger import hlog, hlog_warn @@ -481,28 +480,6 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: class BLEU: - def __init__( - self, - weights=(0.25, 0.25, 0.25, 0.25), - smoothing_function=None, - auto_reweigh=False, - ): - """ - Initialize BLEU scorer with specified n-gram weights. - Default weights are for BLEU-4 (equal weights for 1-4 grams). - - Args: - weights: Tuple of weights for unigrams through 4-grams - smoothing_function: Optional smoothing function for BLEU computation - auto_reweigh: Whether to automatically reweigh the scores based on reference length - """ - self.weights = weights - self.smoothing_function = smoothing_function or SmoothingFunction().method1 - self.auto_reweigh = auto_reweigh - - # Ensure NLTK data is downloaded - nltk.download("punkt") - def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: """ Compute BLEU score for a list of predictions against their references. @@ -516,19 +493,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: """ scores = [] for ref, pred in zip(golds, predictions): - # Tokenize the reference and prediction - reference = [word_tokenize(ref)] - hypothesis = word_tokenize(pred) - - # Calculate BLEU score for this pair - score = sentence_bleu( - references=reference, - hypothesis=hypothesis, - weights=self.weights, - smoothing_function=self.smoothing_function, - auto_reweigh=self.auto_reweigh, - ) - scores.append(score) + scores.append(sentence_bleu(pred, [ref]).score) return statistics.mean(scores) * 100 @@ -544,32 +509,13 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: class CHRF: - def __init__(self, beta: float = 3.0, max_len: int = 6, min_len: int = 1): - """ - Initialize chrF scorer with specified parameters. - beta: Weight of recall vs precision (default: 3.0) - max_len: Maximum n-gram order (default: 6) - min_len: Minimum n-gram order (default: 1) - """ - self.beta = beta - self.max_len = max_len - self.min_len = min_len - def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: """ Compute chrF score for a list of predictions against their references. 
""" scores = [] for ref, pred in zip(golds, predictions): - score = sentence_chrf( - ref, - pred, - min_len=self.min_len, - max_len=self.max_len, - beta=self.beta, - ignore_whitespace=True, - ) - scores.append(score) + scores.append(sentence_chrf(pred, [ref]).score) return statistics.mean(scores) * 100 @@ -584,9 +530,29 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: ) +class TER: + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + """ + Compute TER score for a list of predictions against their references. + """ + scores = [] + for ref, pred in zip(golds, predictions): + scores.append(sentence_ter(pred, [ref]).score) + + return statistics.mean(scores) * 100 + + +ter_sentence = SampleLevelMetric( + metric_name="ter_sentence", + higher_is_better=False, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=TER().compute, + corpus_level_fn=statistics.mean, +) + + # EVALS WITH SUBSET -# This is how you create a subset task (like MMLU), which has several subset -# each being its own evaluation task. def create_translation_pairs(langs_list: list) -> list[tuple]: @@ -746,13 +712,14 @@ def __init__( generation_size=level_config.generation_size, metric=[ # ===== Lexical metrics ===== - # Metrics.ter, # TER is a corpus level metric that is very slow in bootstrapping - bleu_sentence, # Use sample level BLEU for faster evaluation - Metrics.bleu, # Disable this if it is too slow - Metrics.bleu_1, - Metrics.bleu_4, - Metrics.chrf, # Disable this if it is too slow - chrf_sentence, # Use sample level chrF for faster evaluation + # Corpus level metrics + Metrics.bleu, + Metrics.chrf, + # Metrics.ter, # TER often hangs for a while and takes more than 10 minutes to compute + # Sample level metrics + bleu_sentence, + chrf_sentence, + ter_sentence, meteor, # ===== Model-based metrics ===== bert_scores[target_lang], From d08163fa2e44c5e536bb763361f086a63e76aac5 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 4 Dec 2024 17:25:46 +0100 Subject: [PATCH 26/57] Added more stop sequences. --- community_tasks/swiss_legal_evals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index ca10037b9..f4ecfe9ee 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -730,7 +730,7 @@ def __init__( # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. ], - stop_sequence=[".\n"], # just "\n" leads to problems for anthropic models + stop_sequence=[".\n", "\n", ""], # just "\n" leads to problems for anthropic models trust_dataset=True, # Remove the target language in the beginning if it exists: e.g., FR: {translation} # Is only applied to the generative metrics, but also there seems not to be invoked, maybe not passed through? From 86c67bc3f237b0bccb37b25fcd0a9c404f600186 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 5 Dec 2024 10:27:30 +0100 Subject: [PATCH 27/57] Made stop_sequence level specific. 
--- community_tasks/swiss_legal_evals.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f4ecfe9ee..caee42e44 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -579,6 +579,7 @@ class LevelConfig: text_col_name: str metadata_cols: list[str] generation_size: int + stop_sequence: list[str] # just "\n" leads to problems for anthropic models, maybe we need a special case there @dataclass @@ -603,18 +604,21 @@ def __post_init__(self): text_col_name="bgeText", metadata_cols=["bge"], generation_size=2048, + stop_sequence=["", "\n\n"], ), "regeste_level": LevelConfig( name="regeste_level", text_col_name="regesteText", metadata_cols=["bge"], generation_size=512, + stop_sequence=["", "\n\n"], ), "text_level": LevelConfig( name="text_level", text_col_name="text", metadata_cols=["bge"], generation_size=256, + stop_sequence=["", ".\n", "\n"], ), }, ) @@ -630,18 +634,21 @@ def __post_init__(self): text_col_name="lawText", metadata_cols=["rsNr"], generation_size=16384, + stop_sequence=["", "\n\n"], ), "article_level": LevelConfig( name="article_level", text_col_name="artText", metadata_cols=["rsNr"], generation_size=1024, + stop_sequence=["", "\n\n"], ), "paragraph_level": LevelConfig( name="paragraph_level", text_col_name="parText", metadata_cols=["rsNr"], generation_size=256, + stop_sequence=["", ".\n", "\n"], ), }, ) @@ -657,6 +664,7 @@ def __post_init__(self): text_col_name="text", metadata_cols=["filename"], generation_size=1024, + stop_sequence=["", "\n\n"], ) }, ) @@ -730,7 +738,7 @@ def __init__( # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. ], - stop_sequence=[".\n", "\n", ""], # just "\n" leads to problems for anthropic models + stop_sequence=level_config.stop_sequence, trust_dataset=True, # Remove the target language in the beginning if it exists: e.g., FR: {translation} # Is only applied to the generative metrics, but also there seems not to be invoked, maybe not passed through? From f10994551a2c36550dde0f7fcd4c0b722f66038c Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 6 Dec 2024 10:00:25 +0100 Subject: [PATCH 28/57] Added gemba metric. 
--- community_tasks/swiss_legal_evals.py | 56 ++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index caee42e44..f690fbea0 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -40,6 +40,7 @@ import requests import torch from comet import download_model, load_from_checkpoint +from gemba import get_gemba_scores from nltk import word_tokenize from nltk.translate import meteor_score from packaging import version @@ -213,6 +214,52 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") +class GEMBA: + def __init__(self, method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): + self.method = method + self.model = model + self.name = f"{method.split('_')[0]}_{model}" + + def compute( + self, + sample_ids: list[str], + responses: list, + formatted_docs: list[Doc], + **kwargs, + ) -> dict[str, float]: + hlog(f"Judging {len(formatted_docs)} samples with {self.name}...") + source_langs = [formatted_doc.specific["source_lang"] for formatted_doc in formatted_docs] + target_langs = [formatted_doc.specific["target_lang"] for formatted_doc in formatted_docs] + # There should be only one language each in the batch + assert len(set(source_langs)) == len(set(target_langs)) == 1 + sources = [formatted_doc.specific["source"] for formatted_doc in formatted_docs] + predictions = [response[0].result[0] for response in responses] + + answers, errors = get_gemba_scores( + sources, predictions, source_langs[0], target_langs[0], method=self.method, model=self.model + ) + + # Convert defaultdict to dict + errors = [[{key: value} for key, value in error.items()] for error in errors] + + return [{self.name: answer, f"{self.name}_errors": error} for answer, error in zip(answers, errors)] + + +def get_gemba_judge(method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): + name = f"{method.split('_')[0]}_{model}" + return SampleLevelMetricGrouping( + metric_name=[name], + higher_is_better={name: True}, + category=MetricCategory.LLM_AS_JUDGE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=GEMBA(method=method, model=model).compute, + corpus_level_fn={name: statistics.mean}, + ) + + +gemba_mqm_gpt_4o = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") + + def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") @@ -670,17 +717,17 @@ def __post_init__(self): ) -def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str): +def create_prompt_fn(level_config: LevelConfig, source_lang: str, target_lang: str): """ Create a prompt function for a given level configuration. 
""" text_col = level_config.text_col_name - src_text_col = f"{src_lang}_{text_col}" + src_text_col = f"{source_lang}_{text_col}" target_text_col = f"{target_lang}_{text_col}" def prompt_fn(line: dict, task_name: str = None): # Following Template A from https://github.com/huggingface/lighteval/pull/389#issuecomment-2471580177 - custom_query = f"{src_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}: " + custom_query = f"{source_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}: " return Doc( task_name=task_name, @@ -691,6 +738,8 @@ def prompt_fn(line: dict, task_name: str = None): **{col: line[col] for col in level_config.metadata_cols}, "question": custom_query, "source": line[src_text_col], + "source_lang": source_lang, + "target_lang": target_lang, }, ) @@ -733,6 +782,7 @@ def __init__( bert_scores[target_lang], bleurt_large, # Only take the largest version, disregarding base and tiny xcomet_xxl, # Only take the largest version, disregarding xcomet_xl, comet_wmt22_da + gemba_mqm_gpt_4o, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From f357176e8d8513dcdeadf36d50fb11d9abd7dd97 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 9 Dec 2024 15:39:31 +0100 Subject: [PATCH 29/57] Updated logging. --- community_tasks/swiss_legal_evals.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f690fbea0..f5ae79953 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -30,8 +30,8 @@ Author: Joel Niklaus """ - import importlib.metadata as importlib_metadata +import logging import os import statistics from dataclasses import dataclass @@ -47,7 +47,6 @@ from sacrebleu import sentence_bleu, sentence_chrf, sentence_ter from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics from lighteval.metrics.metrics_sample import BertScore, JudgeLLM @@ -63,6 +62,9 @@ from lighteval.tasks.requests import Doc +logger = logging.getLogger(__name__) + + device = "cuda" if torch.cuda.is_available() else "cpu" # Try to optimize CUDA operations @@ -176,7 +178,7 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: - hlog(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") + logger.info(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] @@ -227,7 +229,7 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: - hlog(f"Judging {len(formatted_docs)} samples with {self.name}...") + logger.info(f"Judging {len(formatted_docs)} samples with {self.name}...") source_langs = [formatted_doc.specific["source_lang"] for formatted_doc in formatted_docs] target_langs = [formatted_doc.specific["target_lang"] for formatted_doc in formatted_docs] # There should be only one language each in the batch @@ -263,7 +265,7 @@ def get_gemba_judge(method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): def get_bert_score(language: 
str, num_layers: int = 24, model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") - hlog( + logger.info( f"Loading BERTScore with lang={language}, num_layers={num_layers}, model_type={model_type}, and device={device}..." ) score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) @@ -280,7 +282,7 @@ def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-r if language == "rm": language = "it" - hlog_warn("There is no BERTScore baseline file for Rumantsch, using Italian instead.") + logger.warning("There is no BERTScore baseline file for Rumantsch, using Italian instead.") # Create directory structure if it doesn't exist os.makedirs(os.path.dirname(score.bert_scorer.baseline_path), exist_ok=True) @@ -288,7 +290,7 @@ def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-r # Download the baseline file if it doesn't exist if not os.path.exists(score.bert_scorer.baseline_path): raw_url = f"https://raw.githubusercontent.com/Tiiiger/bert_score/master/bert_score/rescale_baseline/{language}/{model_type}.tsv" - hlog(f"Downloading BERTScore baseline file from {raw_url}") + logger.info(f"Downloading BERTScore baseline file from {raw_url}") response = requests.get(raw_url) if response.status_code == 200: with open(score.bert_scorer.baseline_path, "wb") as f: @@ -360,7 +362,7 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: - hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") + logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] @@ -379,7 +381,7 @@ def compute( ) inputs = {k: v.to(self.model.device) for k, v in inputs.items()} if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): - hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") + logger.warning(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") with torch.no_grad(): all_scores.extend(self.model(**inputs)[0].squeeze().cpu().tolist()) @@ -392,7 +394,7 @@ def get_bleurt( batch_size: int = 32, device: str = "cpu", ): - hlog( + logger.info( f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, batch_size={batch_size}, and device={device}..." ) name = f"bleurt_{model_size}" @@ -433,7 +435,7 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: - hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") + logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] @@ -455,7 +457,9 @@ def get_comet( gpus: int = 1, device: str = "cpu", ): - hlog(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") + logger.info( + f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}..." 
+ ) name = model_name.split("/")[-1] return SampleLevelMetricGrouping( metric_name=[name], From 2d4c0ed803c719f306cedb9892d1a01440855b9f Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 9 Dec 2024 15:41:34 +0100 Subject: [PATCH 30/57] Updated stop_sequence. --- community_tasks/swiss_legal_evals.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f5ae79953..16177c2e3 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -655,14 +655,14 @@ def __post_init__(self): text_col_name="bgeText", metadata_cols=["bge"], generation_size=2048, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ), "regeste_level": LevelConfig( name="regeste_level", text_col_name="regesteText", metadata_cols=["bge"], generation_size=512, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ), "text_level": LevelConfig( name="text_level", @@ -685,14 +685,14 @@ def __post_init__(self): text_col_name="lawText", metadata_cols=["rsNr"], generation_size=16384, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ), "article_level": LevelConfig( name="article_level", text_col_name="artText", metadata_cols=["rsNr"], generation_size=1024, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ), "paragraph_level": LevelConfig( name="paragraph_level", @@ -715,7 +715,7 @@ def __post_init__(self): text_col_name="text", metadata_cols=["filename"], generation_size=1024, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ) }, ) From 7b7797279b6f0355e4106bad0c507eb256128962 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 10 Dec 2024 11:35:15 +0100 Subject: [PATCH 31/57] Made metric selection easier. --- community_tasks/swiss_legal_evals.py | 189 +++++++++++++++------------ 1 file changed, 108 insertions(+), 81 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 16177c2e3..beb738048 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -213,9 +213,6 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): ) -swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") - - class GEMBA: def __init__(self, method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): self.method = method @@ -259,9 +256,6 @@ def get_gemba_judge(method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): ) -gemba_mqm_gpt_4o = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") - - def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") @@ -316,19 +310,6 @@ def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-r ) -# Create BERTScore metrics for each language - - -bert_scores = { - lang: get_bert_score( - language=lang, - model_type="xlm-roberta-large", - device=device, - ) - for lang in ["de", "fr", "it", "rm", "en"] -} - - class BLEURT: def __init__( self, @@ -408,9 +389,6 @@ def get_bleurt( ) -bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) - - class COMET: def __init__( self, @@ -476,12 +454,6 @@ def get_comet( ) -# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. 
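As a side note on the GEMBA wrapper introduced a few commits above: it ultimately reduces to a single call into the gemba package. A standalone sketch using the same call signature as in the patch (credentials for the underlying judge model, e.g. an OpenAI API key, are assumed to be configured):

from gemba import get_gemba_scores

sources = ["Der Vertrag ist nichtig, wenn sein Inhalt unmöglich ist."]  # illustrative
hypotheses = ["The contract is void if its terms are impossible."]

# One score and one error breakdown per sample; the wrapper above converts the
# returned defaultdicts into plain dicts before logging them.
answers, errors = get_gemba_scores(
    sources, hypotheses, "de", "en", method="GEMBA-MQM_norm", model="gpt-4o"
)
print(answers[0], errors[0])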
-# comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) - - class METEOR: def __init__(self, alpha=0.9, beta=3, gamma=0.5): self.alpha = alpha @@ -520,14 +492,15 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -meteor = SampleLevelMetric( - metric_name="meteor", - higher_is_better=True, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - sample_level_fn=METEOR().compute, - corpus_level_fn=statistics.mean, -) +def get_meteor(): + return SampleLevelMetric( + metric_name="meteor", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=METEOR().compute, + corpus_level_fn=statistics.mean, + ) class BLEU: @@ -549,14 +522,15 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -bleu_sentence = SampleLevelMetric( - metric_name="bleu_sentence", - higher_is_better=True, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - sample_level_fn=BLEU().compute, - corpus_level_fn=statistics.mean, -) +def get_bleu_sentence(): + return SampleLevelMetric( + metric_name="bleu_sentence", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=BLEU().compute, + corpus_level_fn=statistics.mean, + ) class CHRF: @@ -571,14 +545,15 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -chrf_sentence = SampleLevelMetric( - metric_name="chrf_sentence", - higher_is_better=True, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - sample_level_fn=CHRF().compute, - corpus_level_fn=statistics.mean, -) +def get_chrf_sentence(): + return SampleLevelMetric( + metric_name="chrf_sentence", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=CHRF().compute, + corpus_level_fn=statistics.mean, + ) class TER: @@ -593,14 +568,15 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -ter_sentence = SampleLevelMetric( - metric_name="ter_sentence", - higher_is_better=False, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - sample_level_fn=TER().compute, - corpus_level_fn=statistics.mean, -) +def get_ter_sentence(): + return SampleLevelMetric( + metric_name="ter_sentence", + higher_is_better=False, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=TER().compute, + corpus_level_fn=statistics.mean, + ) # EVALS WITH SUBSET @@ -750,6 +726,77 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn +METRICS_TO_USE = [ + "bleu", + "chrf", + "bleu_sentence", + "chrf_sentence", + "ter_sentence", + "meteor", + "bert_score", + "bleurt_large", + "xcomet_xxl", + "gemba_mqm_gpt_4o", + "slt_judge_gpt_4o", +] +METRICS = {} + +# ===== Lexical metrics ===== +# Corpus level metrics +if "bleu" in METRICS_TO_USE: + METRICS["bleu"] = Metrics.bleu +if "chrf" in METRICS_TO_USE: + METRICS["chrf"] = Metrics.chrf +if "ter" in METRICS_TO_USE: + # TER often hangs for a while and takes more than 10 minutes to compute 
+ METRICS["ter"] = Metrics.ter +# Sample level metrics +if "bleu_sentence" in METRICS_TO_USE: + METRICS["bleu_sentence"] = get_bleu_sentence() +if "chrf_sentence" in METRICS_TO_USE: + METRICS["chrf_sentence"] = get_chrf_sentence() +if "ter_sentence" in METRICS_TO_USE: + METRICS["ter_sentence"] = get_ter_sentence() +if "meteor" in METRICS_TO_USE: + METRICS["meteor"] = get_meteor() +# ===== Model-based metrics ===== +if "bert_score" in METRICS_TO_USE: + METRICS["bert_score"] = { # Create BERTScore metrics for each language + lang: get_bert_score(language=lang, model_type="xlm-roberta-large", device=device) + for lang in ["de", "fr", "it", "rm", "en"] + } +if "bleurt_tiny" in METRICS_TO_USE: + METRICS["bleurt_tiny"] = get_bleurt(model_size="tiny", seq_len=512, batch_size=256, device=device) +if "bleurt_base" in METRICS_TO_USE: + METRICS["bleurt_base"] = get_bleurt(model_size="base", seq_len=512, batch_size=256, device=device) +if "bleurt_large" in METRICS_TO_USE: + METRICS["bleurt_large"] = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) +# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. +if "wmt22-comet-da" in METRICS_TO_USE: + METRICS["wmt22-comet-da"] = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) +if "xcomet_xl" in METRICS_TO_USE: + METRICS["xcomet_xl"] = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) +if "xcomet_xxl" in METRICS_TO_USE: + METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) +if "gemba_mqm_gpt_4o" in METRICS_TO_USE: + METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") +if "slt_judge_gpt_4o" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") +# Additionally we could consider adding the following open source judge models: +# flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 +# However, these are only fine-tuned on English data and we need multilingual support. + + +def get_metrics(METRICS_TO_USE, target_lang: str): + metrics = [] + for metric in METRICS_TO_USE: + if metric in METRICS: + metrics.append(METRICS[metric]) + elif metric == "bert_score": + metrics.append(METRICS["bert_score"][target_lang]) + return metrics + + class TranslationTask(LightevalTaskConfig): def __init__( self, @@ -771,27 +818,7 @@ def __init__( few_shots_split="validation", few_shots_select="sequential", generation_size=level_config.generation_size, - metric=[ - # ===== Lexical metrics ===== - # Corpus level metrics - Metrics.bleu, - Metrics.chrf, - # Metrics.ter, # TER often hangs for a while and takes more than 10 minutes to compute - # Sample level metrics - bleu_sentence, - chrf_sentence, - ter_sentence, - meteor, - # ===== Model-based metrics ===== - bert_scores[target_lang], - bleurt_large, # Only take the largest version, disregarding base and tiny - xcomet_xxl, # Only take the largest version, disregarding xcomet_xl, comet_wmt22_da - gemba_mqm_gpt_4o, - swiss_legal_translation_judge_gpt_4o, - # Additionally we could consider adding the following open source judge models: - # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 - # However, these are only fine-tuned on English data and we need multilingual support. 
- ], + metric=get_metrics(METRICS_TO_USE, target_lang), stop_sequence=level_config.stop_sequence, trust_dataset=True, # Remove the target language in the beginning if it exists: e.g., FR: {translation} From fcd9505237a5054ed6a83bf5aa25bd00e05189dd Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 10 Dec 2024 18:50:10 +0100 Subject: [PATCH 32/57] Fixed dict issue. --- community_tasks/swiss_legal_evals.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index beb738048..9565b8ca3 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -736,7 +736,6 @@ def prompt_fn(line: dict, task_name: str = None): "bert_score", "bleurt_large", "xcomet_xxl", - "gemba_mqm_gpt_4o", "slt_judge_gpt_4o", ] METRICS = {} @@ -778,7 +777,7 @@ def prompt_fn(line: dict, task_name: str = None): METRICS["xcomet_xl"] = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) if "xcomet_xxl" in METRICS_TO_USE: METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) -if "gemba_mqm_gpt_4o" in METRICS_TO_USE: +if "gemba_mqm_gpt_4o" in METRICS_TO_USE: # TODO: Somehow in long evaluations something is wrong here METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") if "slt_judge_gpt_4o" in METRICS_TO_USE: METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -791,9 +790,11 @@ def get_metrics(METRICS_TO_USE, target_lang: str): metrics = [] for metric in METRICS_TO_USE: if metric in METRICS: - metrics.append(METRICS[metric]) - elif metric == "bert_score": - metrics.append(METRICS["bert_score"][target_lang]) + if metric == "bert_score": + # Add only the BERTScore for the target language + metrics.append(METRICS["bert_score"][target_lang]) + else: + metrics.append(METRICS[metric]) return metrics From 5a8ca464a0963edf9fd32af481ccd15870b69aac Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 11 Dec 2024 15:15:36 +0100 Subject: [PATCH 33/57] Added metric dependencies. --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 9a4d3a3ce..9aa1d76ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,8 @@ dependencies = [ "scikit-learn", "spacy==3.7.2", "sacrebleu", + "unbabel-comet", + "gemba", "rouge_score==0.1.2", "sentencepiece>=0.1.99", "protobuf==3.20.*", # pinned for sentencepiece compat From bab94af49a75d57b25498fe864dae7ecb2f46562 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 11 Dec 2024 15:16:54 +0100 Subject: [PATCH 34/57] Moving metrics to extended tasks. 
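The commit below moves the unbabel-comet and gemba requirements out of the core dependencies and into the extended_tasks extra, so a default install no longer ships them. A defensive variant of the imports used at the top of this file could look like the following sketch (the error message wording is illustrative):

try:
    from comet import download_model, load_from_checkpoint
    from gemba import get_gemba_scores
except ImportError as err:
    raise ImportError(
        "The COMET and GEMBA metrics need the optional dependencies from the "
        "extended_tasks extra (unbabel-comet and gemba)."
    ) from err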
--- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9aa1d76ef..e1a2a772e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,8 +74,6 @@ dependencies = [ "scikit-learn", "spacy==3.7.2", "sacrebleu", - "unbabel-comet", - "gemba", "rouge_score==0.1.2", "sentencepiece>=0.1.99", "protobuf==3.20.*", # pinned for sentencepiece compat @@ -101,7 +99,9 @@ docs = ["hf-doc-builder", "watchdog"] extended_tasks = [ "langdetect", # ifeval "openai", # llm as a judge using openai models - "tiktoken" + "tiktoken", + "unbabel-comet", + "gemba", ] s3 = ["s3fs"] multilingual = [ From 09be56d87c662b6a6cec33bd2c5bd1d0fbeb0d5d Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Sun, 22 Dec 2024 16:43:58 +0100 Subject: [PATCH 35/57] Added support for judges from different providers. --- community_tasks/swiss_legal_evals.py | 36 +++++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 9565b8ca3..17fdf87b9 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -182,7 +182,7 @@ def compute( questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] - predictions = [response[0].result[0] for response in responses] + predictions = [response[0].result for response in responses] scores, _, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) # Exclude the messages (user prompt) because they are too long @@ -195,21 +195,24 @@ def compute( ] -def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): - name = f"slt_judge_{judge_model_name}" +def get_swiss_legal_translation_judge( + judge_model_name: str = "openai/gpt-4o-2024-11-20", + short_judge_name: str = "slt_judge_gpt-4o", + backend: str = "litellm", +): return SampleLevelMetricGrouping( - metric_name=[name], - higher_is_better={name: True}, + metric_name=[short_judge_name], + higher_is_better={short_judge_name: True}, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.TRANSLATION, sample_level_fn=JudgeSwissLegalTranslation( judge_model_name=judge_model_name, template=swiss_legal_translation_judge, process_judge_response=process_judge_response_freeform_gpt, - judge_backend="openai", - short_judge_name=name, + judge_backend=backend, + short_judge_name=short_judge_name, ).compute, - corpus_level_fn={name: statistics.mean}, + corpus_level_fn={short_judge_name: statistics.mean}, ) @@ -779,8 +782,23 @@ def prompt_fn(line: dict, task_name: str = None): METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) if "gemba_mqm_gpt_4o" in METRICS_TO_USE: # TODO: Somehow in long evaluations something is wrong here METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") +if "slt_judge_gpt_4o_mini" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_mini"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-mini-2024-07-18", short_judge_name="slt_judge_gpt-4o-mini" + ) if "slt_judge_gpt_4o" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") + METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", 
short_judge_name="slt_judge_gpt-4o" + ) +if "slt_judge_haiku_35" in METRICS_TO_USE: + METRICS["slt_judge_haiku_35"] = get_swiss_legal_translation_judge( + judge_model_name="anthropic/claude-3-5-haiku-20241022", short_judge_name="slt_judge_haiku-3.5" + ) +if "slt_judge_sonnet_35" in METRICS_TO_USE: + METRICS["slt_judge_sonnet_35"] = get_swiss_legal_translation_judge( + judge_model_name="anthropic/claude-3-5-sonnet-20241022", short_judge_name="slt_judge_sonnet-3.5" + ) + # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. From 0aa860777f85aa39eb85ee28b88054084fe9ca55 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Sun, 22 Dec 2024 17:18:16 +0100 Subject: [PATCH 36/57] Added additional system and user prompts and few shot examples. --- community_tasks/swiss_legal_evals.py | 162 +++++++++++++++++++++++---- 1 file changed, 140 insertions(+), 22 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 17fdf87b9..bf43c31bf 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -79,18 +79,56 @@ # CUSTOM METRICS - -def swiss_legal_translation_judge(question, options, answer, gold): - return [ - { - "role": "system", - "content": "Act as a Judge specializing in the evaluation of translations of Swiss legal documents. Your task is to assess the accuracy, clarity, and fidelity of the model's translation to the golden translation, while considering the nuances of legal language.", - }, - { - "role": "user", - "content": f"""You will be provided with a source text, its golden translation, and the model's translation. Your task is to judge how correct the model's translation is based on the golden translation, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's translation conforms to or contradicts the golden translation, and then give the correctness score. The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples. - -Example 1: +SYSTEM = { + "basic": "Act as a Judge specializing in the evaluation of translations of Swiss legal documents. Your task is to assess the accuracy, clarity, and fidelity of the model's translation to the golden translation, while considering the nuances of legal language.", + "detailed": "You are a senior legal translator and quality assurance specialist with over 20 years of experience in Swiss law, certified by the Swiss Sworn Translators Association (Association suisse des traducteurs-jurés, ASTJ). You possess native-level proficiency in all Swiss national languages (German, French, Italian, and Romansh) as well as English, enabling precise evaluation of legal nuances across all linguistic combinations. Your task is to evaluate machine-translated legal texts for accuracy, clarity and fidelity to Swiss legal standards analyzing the subtle complexities of legal language. 
You excel at identifying even minor discrepancies and calibrating evaluation scores appropriately to reflect the severity of each error.", +} + +USER = { + "basic": """ +You will be provided with a source text, its golden translation, and the model's translation. Your task is to judge how correct the model's translation is based on the golden translation, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's translation conforms to or contradicts the golden translation, and then give the correctness score. The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples. +""", + "detailed": """ +INPUT FORMAT: +Source Text: [Original text in source language] +Golden Translation: [Reference professional translation] +Model Translation: [Machine-generated translation to be evaluated] + + +EVALUATION DIMENSIONS: +Accuracy: Semantic equivalence, correct legal terminology, and preservation of legal meaning. +Clarity: Logical flow, appropriate legal register, and unambiguous expression. +Fidelity: Adherence to Swiss legal conventions, jurisdiction-specific terminology, and formal register. + + +SCORING RUBRIC: +1.0: Perfect translation +0.7-0.9: Minor issues only +0.4-0.6: Significant but non-critical errors +0.1-0.3: Major errors affecting legal meaning +0.0: Completely incorrect + + +REQUIRED OUTPUT FORMAT: +Reasoning: Analyze how the model's translation aligns with or differs from the golden translation, focusing on significant legal and linguistic aspects. +Examples: Identify specific terms, phrases, or sections in the model's answer that were correct or incorrect, with explanations. +Score: End with "The correctness score: [[score]]" + + +EVALUATION GUIDELINES: +Stylistic differences should not impact accuracy significantly unless they alter the legal meaning. +Untranslated Latin terms (e.g., prima facie) are not considered errors, but they should still be assessed for appropriate use within the context of the answer. +Terminology should be used consistently throughout the text. +Consider both explicit and implicit legal meanings. +Consider jurisdiction-specific legal terminology. +Flag any ambiguities, omissions or additions that affect legal meaning. + +Below are some examples. +""", +} + +FEW_SHOT = { + "diverse": """Example 1: Source Text: ```A contract is void if its terms are impossible, unlawful or immoral. However, where the defect pertains only to certain terms of a contract, those terms alone are void unless there is cause to assume that the contract would not have been concluded without them.``` @@ -100,7 +138,8 @@ def swiss_legal_translation_judge(question, options, answer, gold): Model’s Translation: ```Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse.``` -Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso,” an older and more formal term, while the model opts for “concluso,” which is modern. 
Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi,” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] +Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso” an older and more formal term, while the model opts for “concluso” which is modern. Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] + Example 2: Source Text: @@ -120,6 +159,7 @@ def swiss_legal_translation_judge(question, options, answer, gold): Your Judgment: The model’s translation mostly aligns with the golden translation but diverges when it comes to accuracy and fidelity to Swiss legal terminology. For instance, the term “exploitabilité” which is closer to the Swiss provision is replaced in the model’s translation with “admissibilité”. Similarly, “ingérence” is used instead of “atteinte”, although “atteinte” is commonly used in Swiss law to discuss a violation of fundamental rights. Also, the term "recherche automatisée de véhicules et surveillance du trafic (RVS)" used by the golden translation is more established than "poursuite automatisée des véhicules et de la surveillance de la circulation (AFV)" in the model’s translation. The model’s translation is almost complete, but omits a critical point in one sentence: that the evidence was unlawfully obtained due to lack of a sufficiently clear legal basis. This omission impacts the completeness. The correctness score: [[0.7]] + Example 3: Source Text: ```Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise @@ -139,7 +179,6 @@ def swiss_legal_translation_judge(question, options, answer, gold): Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war.``` - Model’s Translation: ```Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. @@ -152,22 +191,72 @@ def swiss_legal_translation_judge(question, options, answer, gold): Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. 
Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat.``` Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] +""", + "fr-de": """Example 1: +Source Text: +```Le contrat est nul s’il a pour objet une chose impossible, illicite ou contraire aux moeurs. Si le contrat n’est vicié que dans certaines de ses clauses, ces clauses sont seules frappées de nullité, à moins qu’il n’y ait lieu d’admettre que le contrat n’aurait pas été conclu sans elles.``` -Judge the below case, give the brief reasoning process and the correctness score. +Golden Translation: +```Ein Vertrag, der einen unmöglichen oder widerrechtlichen Inhalt hat oder gegen die guten Sitten verstösst, ist nichtig. Betrifft aber der Mangel bloss einzelne Teile des Vertrages, so sind nur diese nichtig, sobald nicht anzunehmen ist, dass er ohne den nichtigen Teil überhaupt nicht geschlossen worden wäre.``` + +Model’s Translation: +```Der Vertrag ist nichtig, wenn er einen unmöglichen, widerrechtlichen oder sittenwidrigen Inhalt hat. Betrifft der Mangel bloß einzelne Teile des Vertrages, so sind nur diese nichtig, sobald nicht anzunehmen ist, dass er ohne den nichtigen Teil überhaupt nicht geschlossen worden wäre.``` +Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy and clarity. However, minor stylistic differences exist. For example, the golden translation uses “gegen die guten Sitten verstösst” which is more idiomatic, while the model opts for the more literal “sittenwidrigen Inhalt hat” Similarly, “Ein Vertrag” in the golden translation better reflects generalized legal phrasing than the model’s literal “Der Vertrag”. The correctness score: [[0.9]] + +Example 2: Source Text: -```{question}``` +```Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). +La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). +La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. 
L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3).``` Golden Translation: -```{gold}``` +```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). +Die Erhebung und die Aufbewahrung von Aufzeichnungen der AFV stellen einen Eingriff in die Grundrechte der Betroffenen dar, insbesondere in das Recht auf Privatsphäre, das den Anspruch auf informationelle Selbstbestimmung miteinschliesst (E. 3.1). Für die AFV besteht im Kanton Thurgau keine hinreichend bestimmte gesetzliche Grundlage. Der mit der Überwachung verbundene Eingriff in die Privatsphäre verstösst daher gegen Art. 13 Abs. 2 i.V.m. Art. 36 Abs. 1 BV (E. 3.2 und 3.3). +Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3).``` -Model's Translation: -```{answer}``` +Model’s Translation: +```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von Aufzeichnungen, die durch die Polizei mit dem System zur automatisierten Fahrzeugerfassung und Verkehrsüberwachung (RVS) durchgeführt wurden. +Die Durchführung und Speicherung von Aufzeichnungen durch das RVS stellt einen Eingriff in die Grundrechte der betroffenen Personen dar, insbesondere in den Schutz ihrer Privatsphäre, die das Recht auf Selbstbestimmung in Bezug auf Daten umfasst (E. 3.1). Der Einsatz von RVS stützt sich im Kanton Thurgau auf keine ausreichend präzise gesetzliche Grundlage. Die in dieser Überwachung liegende Beeinträchtigung der Privatsphäre verstößt daher gegen Art. 13 Abs. 2 in Verbindung mit Art. 36 Abs. 1 BV (E. 3.2 und 3.3). +Die Aufdeckung von Straftaten durch die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit fällt unter die Ermittlung im Sinne der Art. 306 ff. StPO. Die Verwertbarkeit im Strafverfahren von Beweisen, die auf illegale Weise erhoben wurden, muss anhand von Art. 141 Abs. 2 StPO geprüft werden (Klarstellung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit in diesem Fall verneint (E. 4.3).``` -Your Judgment:""", - }, - ] +Your Judgment: The model’s translation is accurate overall but omits a critical point in the second-to-last sentence: the evidence was unlawfully obtained due to a lack of legal basis. Additionally, its fidelity to Swiss legal terminology is limited. For example, the model uses "Klarstellung der Rechtsprechung" instead of the more appropriate "Präzisierung der Rechtsprechung" and "nicht ausreichend präzise" rather than the common "hinreichend bestimmt" It also consistently uses the French abbreviation "RVS" instead of the German "automatische Fahrzeugfahndung und Verkehrsüberwachung (AFV)" Lastly, "Recht auf Selbstbestimmung in Bezug auf Daten" is overly literal compared to the idiomatic "Anspruch auf informationelle Selbstbestimmung". 
The correctness score: [[0.6]] + + +Example 3: +Source Text: +```Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise + +Le Tribunal fédéral rejette le recours déposé par un collectionneur contre l'arrêt de la Cour de justice genevoise par lequel celle-ci confirmait que Yoko Ono est propriétaire de la montre qu'elle avait offerte à John Lennon en 1980, deux mois avant qu'il ne soit assassiné. Le collectionneur, qui a remis la montre à une maison de vente aux enchères genevoise en 2014 afin d'en faire estimer la valeur, a quant à lui revendiqué la propriété de ladite montre. + +En 1980, Yoko Ono a acquis à New York une montre de marque Patek Philippe. Elle y a fait graver au dos l'inscription « (JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C » et l'a offerte à son époux, John Lennon, le 9 octobre 1980 pour son 40e anniversaire. Le 8 décembre 1980, John Lennon a été assassiné à New York. La montre a été répertoriée dans l'inventaire successoral et conservée dans une pièce de l'appartement de Yoko Ono à New York. Par la suite, la montre s'est retrouvée aux mains d'un homme qui avait été le chauffeur privé de Yoko Ono de 1995 à 2006. Un autre possesseur intermédiaire l'a remise à une maison de vente aux enchères allemande, où elle a été acquise par un collectionneur en 2014. Ce dernier l'a remise la même année à une maison de vente aux enchères genevoise afin d'en faire estimer la valeur, ce dont a été informée Yoko Ono. Cette dernière n'avait jusqu'alors pas eu conscience du fait que la montre n'était plus en sa possession. En 2018, le collectionneur a formé à Genève une action visant à constater sa qualité de propriétaire, action à laquelle Yoko Ono s'est opposée. En 2022, le tribunal de première instance genevois a constaté que Yoko Ono était la seule et unique propriétaire de la montre, ce que la Cour de justice du canton de Genève, statuant sur appel du collectionneur, a confirmé en 2023. + +Le Tribunal fédéral rejette le recours déposé par le collectionneur contre cet arrêt. Il n'est tout d'abord pas contesté que la propriété de la montre a été acquise par succession par Yoko Ono après le décès de John Lennon. C'est en outre sans arbitraire que la Cour de justice genevoise a retenu que la montre avait été volée par l'ancien chauffeur et que, à l'inverse, aucun élément ne permettait de démontrer que Yoko Ono aurait eu l'intention de faire donation au chauffeur d'une chose si particulière que la montre, gravée d'une inscription, qu'elle avait offerte à John Lennon deux mois avant son décès. Dès lors qu'il s'agit d'une chose volée, le collectionneur, aujourd'hui recourant, ne pouvait pas acquérir la propriété de la montre par un mode originaire d'acquisition lorsqu'il l'a achetée en Allemagne en 2014 ; selon le droit allemand applicable en la matière, cela vaut indépendamment du fait que l'acquéreur était ou non de bonne foi quant à l'origine de la chose.``` + +Golden Translation: +```Yoko Ono ist Eigentümerin der Uhr von John Lennon – Beschwerde von Sammler gegen Genfer Urteil abgewiesen + +Das Bundesgericht weist die Beschwerde eines Sammlers gegen das Urteil des Genfer Kantonsgerichts ab, mit dem Yoko Ono als Eigentümerin der Uhr bestätigt wurde, die sie John Lennon 1980 zwei Monate vor seiner Ermordung geschenkt hat. Der Sammler hatte die Uhr 2014 zur Schätzung bei einem Auktionshaus in Genf eingereicht und seinerseits Eigentümerschaft an der Uhr geltend gemacht. 
+ +Yoko Ono hatte 1980 in New York eine Uhr der Marke Patek Philippe gekauft. Sie liess auf der Rückseite die Gravur "(JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C" anbringen und schenkte sie ihrem Ehemann John Lennon am 9. Oktober 1980 zum 40. Geburtstag. Am 8. Dezember 1980 wurde John Lennon in New York ermordet. Die Uhr wurde ins Erbschaftsinventar aufgenommen und in einem Zimmer der Wohnung von Yoko Ono in New York aufbewahrt. Sie gelangte von dort in die Hände eines Mannes, der von 1995 bis 2006 Privatchauffeur von Yoko Ono gewesen war. Ein weiterer Zwischenbesitzer brachte die Uhr in ein deutsches Auktionshaus, wo sie 2014 von einem Sammler erworben wurde. Dieser reichte die Uhr im gleichen Jahr bei einem Auktionshaus in Genf zur Schätzung ihres Wertes ein. Davon erfuhr Yoko Ono, die bis dahin keine Kenntnis davon gehabt hatte, dass sich die Uhr nicht mehr in ihrem Besitz befand. Der Sammler erhob 2018 in Genf eine Klage auf Feststellung seiner Eigentümerschaft, der sich Yoko Ono widersetzte. Das erstinstanzliche Genfer Gericht stellte 2022 fest, dass Yoko Ono die alleinige Eigentümerin der Uhr sei, was auf Berufung des Sammlers 2023 vom Kantonsgericht des Kantons Genf bestätigt wurde. + +Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war.``` + +Model’s Translation: +```Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. + +Im Jahr 1980 hat Yoko Ono in New York eine Uhr von der Marke Patek Philippe gekauft. Sie hat auf der Rückseite der Uhr die Inschrift "JUST LIKE STARTING OVER LOVE YOKO 10·9·1980 N.Y.C." gemalt und sie an ihren Ehemann, John Lennon, am 9. Oktober 1980 als Geburtstag gegeben. Am 8. Dezember 1980 wurde Lennon ermordet in New York. Die Uhr wurde in den Erbeverzeichnis des Lennon-Mädchens aufbewahrt und in einem Zimmer in Yoko Onos New Yorker Wohnung. + +Danach wurde die Uhr von einem Mann, der als Wagenkellner von Yoko Ono zwischen 1995 und 2006 gearbeitet hat, in die Hände gebracht. Ein weiterer mittlerer Eigentümer hat sie 2014 in eine deutsche Auktion verkauft, wo sie einem Käufer in Zürich übergeben wurde. Dieser Käufer hat die Uhr 2014 in eine Geneßische Auktion verkauft, um sie zu valieren, und Yoko Ono war von diesem Moment an nicht mehr bewusst, dass die Uhr nicht mehr in ihrer Hand war. 
+ +Im Jahr 2018 hat der Käufer in Zürich eine Anzeige gegen Yoko Ono eröffnet, um die Qualität des Eigentums der Uhr zu bestätigen, was Yoko Ono ansprach. Im Jahr 2022 hat der Zivilgericht in Zürich bestätigt, dass Yoko Ono die einzige und letzte Eigentümerin der Uhr ist, was der Zivilgerichtshof im Jahr 2023 bestätigt hat. + +Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat.``` + +Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] +""", +} class JudgeSwissLegalTranslation(JudgeLLM): @@ -199,7 +288,36 @@ def get_swiss_legal_translation_judge( judge_model_name: str = "openai/gpt-4o-2024-11-20", short_judge_name: str = "slt_judge_gpt-4o", backend: str = "litellm", + system_style: str = "basic", # "basic" or "detailed" + few_shot_style: str = "diverse", # "diverse" or "fr-de" ): + def swiss_legal_translation_judge(question, options, answer, gold): + system_prompt = SYSTEM[system_style] + user = USER[system_style] + few_shot_examples = FEW_SHOT[few_shot_style] + instruction = f"""Judge the below case, give the brief reasoning process and the correctness score. + + +Source Text: +```{question}``` + +Golden Translation: +```{gold}``` + +Model's Translation: +```{answer}``` + +Your Judgment: +""" + + user_prompt = user + few_shot_examples + instruction + + print(user_prompt) + print(system_prompt) + exit() + + return [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] + return SampleLevelMetricGrouping( metric_name=[short_judge_name], higher_is_better={short_judge_name: True}, From c49e1e23a4519c24fe9f1443671da42d6a429d12 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 23 Dec 2024 10:25:42 +0100 Subject: [PATCH 37/57] Removed debug relics. --- community_tasks/swiss_legal_evals.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index bf43c31bf..ccc18defe 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -309,13 +309,8 @@ def swiss_legal_translation_judge(question, options, answer, gold): Your Judgment: """ - user_prompt = user + few_shot_examples + instruction - print(user_prompt) - print(system_prompt) - exit() - return [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] return SampleLevelMetricGrouping( From 4418e82b5b203fe8c041132c62add7189cf54a8f Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 23 Dec 2024 10:31:49 +0100 Subject: [PATCH 38/57] Fixed issue in judge prompt. 
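For context, the judge's compute() pulls the untranslated text out of each Doc's `specific` dict; the prompt function stores it under "source", so the old lookup under "question" could never find it. Below is a minimal, self-contained sketch of how the judge inputs are gathered. `SimpleDoc` is a hypothetical stand-in for lighteval's Doc, and the example strings are illustrative only.

    # Sketch only: SimpleDoc mimics the shape of lighteval's Doc for illustration.
    from dataclasses import dataclass, field

    @dataclass
    class SimpleDoc:
        query: str
        choices: list
        gold_index: int
        specific: dict = field(default_factory=dict)

        def get_golds(self):
            return [self.choices[self.gold_index]]

    docs = [
        SimpleDoc(
            query="FR: Le contrat est nul ...",
            choices=["Ein Vertrag ... ist nichtig."],
            gold_index=0,
            # prompt_fn stores the untranslated text under "source", not "question"
            specific={"source": "Le contrat est nul ...", "target": "de"},
        )
    ]
    model_outputs = ["Der Vertrag ist nichtig ..."]

    # Mirrors the corrected lookup in JudgeSwissLegalTranslation.compute:
    questions = [doc.specific["source"] for doc in docs]
    golds = [doc.get_golds()[0] for doc in docs]
    for q, g, p in zip(questions, golds, model_outputs):
        print(q, g, p, sep="\n")
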
--- community_tasks/swiss_legal_evals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index ccc18defe..6f7c8ab57 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -268,7 +268,7 @@ def compute( **kwargs, ) -> dict[str, float]: logger.info(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") - questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] + questions = [formatted_doc.specific["source"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result for response in responses] From 075ebd2ede76419444cbb1d0b4e75fc3185cf4ab Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 23 Dec 2024 14:54:15 +0100 Subject: [PATCH 39/57] Adapted getting predictions to new way for all metrics. --- community_tasks/swiss_legal_evals.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 6f7c8ab57..a6bc260a8 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -348,7 +348,7 @@ def compute( # There should be only one language each in the batch assert len(set(source_langs)) == len(set(target_langs)) == 1 sources = [formatted_doc.specific["source"] for formatted_doc in formatted_docs] - predictions = [response[0].result[0] for response in responses] + predictions = [response[0].result for response in responses] answers, errors = get_gemba_scores( sources, predictions, source_langs[0], target_langs[0], method=self.method, model=self.model @@ -461,7 +461,7 @@ def compute( ) -> dict[str, float]: logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] - predictions = [response[0].result[0] for response in responses] + predictions = [response[0].result for response in responses] all_scores = [] for i in range(0, len(golds), self.batch_size): @@ -531,7 +531,7 @@ def compute( ) -> dict[str, float]: logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] - predictions = [response[0].result[0] for response in responses] + predictions = [response[0].result for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] data = [{"src": src, "mt": pred, "ref": gold} for src, pred, gold in zip(sources, predictions, golds)] From 8ee2dbc7439f2849a581464e064fdaf068e6d778 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 23 Dec 2024 14:55:22 +0100 Subject: [PATCH 40/57] Added gemba mqm metric by default. 
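As a rough illustration of what enabling GEMBA-MQM by default adds per sample: the judge returns one dict per document with the score under the metric name and the detected error spans under "<name>_errors", and the corpus score is the mean over the score key. The key name, score values, and error category below are made up for illustration; only the structure matches the compute() return shape.

    # Illustrative per-sample outputs in the shape returned by the GEMBA judge.
    import numpy as np

    name = "gemba_mqm_gpt_4o"
    sample_outputs = [
        {name: -5.0, f"{name}_errors": [{"accuracy/mistranslation": ["exploitabilité -> admissibilité"]}]},
        {name: 0.0, f"{name}_errors": [{"error": ["No error details available"]}]},
    ]

    corpus_score = np.mean([out[name] for out in sample_outputs])
    print(f"{name}: {corpus_score:.2f}")
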
--- community_tasks/swiss_legal_evals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index a6bc260a8..6886e7a47 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -852,6 +852,7 @@ def prompt_fn(line: dict, task_name: str = None): "bert_score", "bleurt_large", "xcomet_xxl", + "gemba_mqm_gpt_4o", "slt_judge_gpt_4o", ] METRICS = {} From 4408d0d0ca517276c4735434220cc498759d4d4d Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 25 Dec 2024 11:25:50 +0100 Subject: [PATCH 41/57] Fixed error in gemba score when errors are no dicts. --- community_tasks/swiss_legal_evals.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 6886e7a47..44ce31d77 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -354,10 +354,16 @@ def compute( sources, predictions, source_langs[0], target_langs[0], method=self.method, model=self.model ) - # Convert defaultdict to dict - errors = [[{key: value} for key, value in error.items()] for error in errors] + # Handle cases where errors might be nan + formatted_errors = [] + for error in errors: + if isinstance(error, dict): + # Convert defaultdict to dic + formatted_errors.append([{key: value} for key, value in error.items()]) + else: + formatted_errors.append([{"error": ["No error details available"]}]) - return [{self.name: answer, f"{self.name}_errors": error} for answer, error in zip(answers, errors)] + return [{self.name: answer, f"{self.name}_errors": error} for answer, error in zip(answers, formatted_errors)] def get_gemba_judge(method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): From be6d9abe5a492b7da2ca242569dcaa732f8e26d5 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 25 Dec 2024 11:33:00 +0100 Subject: [PATCH 42/57] Added different judge configurations for gpt 4o. 
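The four new GPT-4o judge variants cover the 2x2 grid of system prompt style (basic/detailed) times few-shot style (diverse/fr-de). The diff below writes each combination out explicitly; as a sketch, the same grid can be enumerated programmatically so names and settings stay in sync. The key format in this sketch is illustrative, and the factory call itself is elided.

    # Sketch: enumerate the judge-configuration grid instead of spelling it out.
    from itertools import product

    judge_model = "gpt-4o"
    configs = {}
    for system_style, few_shot_style in product(["basic", "detailed"], ["diverse", "fr-de"]):
        short_judge_name = f"slt_judge_{judge_model}-{system_style}-{few_shot_style}"
        configs[short_judge_name] = {
            "judge_model_name": "openai/gpt-4o-2024-11-20",
            "short_judge_name": short_judge_name,
            "system_style": system_style,
            "few_shot_style": few_shot_style,
        }

    for name, cfg in configs.items():
        print(name, cfg["system_style"], cfg["few_shot_style"])
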
--- community_tasks/swiss_legal_evals.py | 38 +++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 44ce31d77..d6d8e61d1 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -900,15 +900,45 @@ def prompt_fn(line: dict, task_name: str = None): METRICS["xcomet_xl"] = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) if "xcomet_xxl" in METRICS_TO_USE: METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) -if "gemba_mqm_gpt_4o" in METRICS_TO_USE: # TODO: Somehow in long evaluations something is wrong here +if "gemba_mqm_gpt_4o" in METRICS_TO_USE: METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") if "slt_judge_gpt_4o_mini" in METRICS_TO_USE: METRICS["slt_judge_gpt_4o_mini"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-mini-2024-07-18", short_judge_name="slt_judge_gpt-4o-mini" + judge_model_name="openai/gpt-4o-mini-2024-07-18", + short_judge_name="slt_judge_gpt-4o-mini", ) if "slt_judge_gpt_4o" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", short_judge_name="slt_judge_gpt-4o" + METRICS["slt_judge_gpt_4o_basic_diverse"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o", + ) +if "slt_judge_gpt_4o_basic_diverse" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_basic_diverse"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-basic-diverse", + system_style="basic", + few_shot_style="diverse", + ) +if "slt_judge_gpt_4o_basic_fr-de" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_basic_fr-de"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-basic-fr-de", + system_style="basic", + few_shot_style="fr-de", + ) +if "slt_judge_gpt_4o_detailed_diverse" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_detailed_diverse"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-detailed-diverse", + system_style="detailed", + few_shot_style="diverse", + ) +if "slt_judge_gpt_4o_detailed_fr-de" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_detailed_fr-de"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-detailed-fr-de", + system_style="detailed", + few_shot_style="fr-de", ) if "slt_judge_haiku_35" in METRICS_TO_USE: METRICS["slt_judge_haiku_35"] = get_swiss_legal_translation_judge( From c7ca83f50659241cc60b9b1febc99800f8e9a8a4 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 25 Dec 2024 23:07:42 +0100 Subject: [PATCH 43/57] Fixed typo. 
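The typo mattered because the `slt_judge_gpt_4o` branch registered its metric under a different key, so the requested judge was silently skipped later when METRICS is consulted. A cheap consistency check along the following lines would surface such mismatches early; the names below are illustrative.

    # Sketch of a guard against key mismatches between requests and registry.
    METRICS_TO_USE = ["slt_judge_gpt_4o"]
    METRICS = {"slt_judge_gpt_4o_basic_diverse": object()}  # wrong key, as before this fix

    missing = [name for name in METRICS_TO_USE if name not in METRICS]
    if missing:
        print(f"Requested metrics were never registered: {missing}")
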
--- community_tasks/swiss_legal_evals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index d6d8e61d1..7bcb77c79 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -908,7 +908,7 @@ def prompt_fn(line: dict, task_name: str = None): short_judge_name="slt_judge_gpt-4o-mini", ) if "slt_judge_gpt_4o" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o_basic_diverse"] = get_swiss_legal_translation_judge( + METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge( judge_model_name="openai/gpt-4o-2024-11-20", short_judge_name="slt_judge_gpt-4o", ) From 930cbc572e2d235e176e2918d063f2c6efb244ff Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 2 Jan 2025 14:20:44 +0100 Subject: [PATCH 44/57] Disabled short metrics for evaluation of longer sequences. --- community_tasks/swiss_legal_evals.py | 32 +++++++++++++++++++++------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 7bcb77c79..902808759 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -954,15 +954,31 @@ def prompt_fn(line: dict, task_name: str = None): # However, these are only fine-tuned on English data and we need multilingual support. -def get_metrics(METRICS_TO_USE, target_lang: str): +def get_metrics(METRICS_TO_USE, target_lang: str, generation_size: int): metrics = [] for metric in METRICS_TO_USE: - if metric in METRICS: - if metric == "bert_score": - # Add only the BERTScore for the target language - metrics.append(METRICS["bert_score"][target_lang]) - else: - metrics.append(METRICS[metric]) + if metric not in METRICS: + logger.debug(f"Skipping {metric} because it is not available. Available metrics: {METRICS}") + continue + short_metrics = [ + "bleu_sentence", + "chrf_sentence", + "ter_sentence", + "bert_score", + "bleurt_tiny", + "bleurt_base", + "bleurt_large", + ] + if generation_size > 512 and metric in short_metrics: + logger.debug( + f"Skipping {metric} for generation size {generation_size} because the maximum supported sequence length is 512." + ) + continue + if metric == "bert_score": + # Add only the BERTScore for the target language + metrics.append(METRICS["bert_score"][target_lang]) + else: + metrics.append(METRICS[metric]) return metrics @@ -987,7 +1003,7 @@ def __init__( few_shots_split="validation", few_shots_select="sequential", generation_size=level_config.generation_size, - metric=get_metrics(METRICS_TO_USE, target_lang), + metric=get_metrics(METRICS_TO_USE, target_lang, level_config.generation_size), stop_sequence=level_config.stop_sequence, trust_dataset=True, # Remove the target language in the beginning if it exists: e.g., FR: {translation} From 61058b16c523707495f493a510fa9a6c201da872 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 2 Jan 2025 14:40:01 +0100 Subject: [PATCH 45/57] Added xcomet metrics to sentence level metrics. --- community_tasks/swiss_legal_evals.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 902808759..806a5472c 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -960,6 +960,7 @@ def get_metrics(METRICS_TO_USE, target_lang: str, generation_size: int): if metric not in METRICS: logger.debug(f"Skipping {metric} because it is not available. 
Available metrics: {METRICS}") continue + # These metrics are sentence level metrics and we only want to use them for generation sizes up to 512. short_metrics = [ "bleu_sentence", "chrf_sentence", @@ -968,6 +969,9 @@ def get_metrics(METRICS_TO_USE, target_lang: str, generation_size: int): "bleurt_tiny", "bleurt_base", "bleurt_large", + "wmt22-comet-da", + "xcomet_xl", + "xcomet_xxl", ] if generation_size > 512 and metric in short_metrics: logger.debug( From e043ee81b7716116e03c56adc19ef41047d41277 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 2 Jan 2025 17:40:43 +0100 Subject: [PATCH 46/57] Fixed error in bleurt and enabled lazy loading of metrics to save on GPU memory. --- community_tasks/swiss_legal_evals.py | 296 +++++++++++++++++---------- 1 file changed, 184 insertions(+), 112 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 806a5472c..a65768bf1 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -451,12 +451,57 @@ def __init__( raise ValueError("MPS is not supported for BLEURT") self.metric_name = f"bleurt_{model_size}" - self.tokenizer = AutoTokenizer.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") - self.model = AutoModelForSequenceClassification.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") - self.model = self.model.to(device) - self.model.eval() - self.max_length = seq_len + self.model_size = model_size + self.seq_len = seq_len self.batch_size = batch_size + self.device = device + + # Lazy loading + self.tokenizer = None + self.model = None + + def _ensure_initialized(self): + """Lazy initialization of model and tokenizer""" + if self.tokenizer is None: + logger.info(f"Loading BLEURT tokenizer {self.metric_name} lazily...") + self.tokenizer = AutoTokenizer.from_pretrained(f"Elron/bleurt-{self.model_size}-{self.seq_len}") + + if self.model is None: + logger.info(f"Loading BLEURT model {self.metric_name} lazily...") + self.model = AutoModelForSequenceClassification.from_pretrained( + f"Elron/bleurt-{self.model_size}-{self.seq_len}" + ) + self.model = self.model.to(self.device) + self.model.eval() + + def _process_batch(self, references: list[str], candidates: list[str]) -> list[float]: + """Process a batch of references and candidates""" + # Clean and prepare inputs + references = [str(ref).strip() for ref in references] + candidates = [str(cand).strip() for cand in candidates] + + # Tokenize + inputs = self.tokenizer( + references, + candidates, + return_tensors="pt", + padding=True, + truncation=True, + max_length=self.seq_len, + ) + + # Log warning if any sequences were truncated + if any(len(encoding) == self.seq_len for encoding in inputs["input_ids"]): + logger.warning(f"Some inputs were truncated to max_length={self.seq_len} in BLEURT scoring") + + # Move to device + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + # Get predictions + with torch.no_grad(): + outputs = self.model(**inputs)[0] + + return outputs.squeeze().cpu().tolist() def compute( self, @@ -465,28 +510,27 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: + """Compute BLEURT scores for a batch of translations""" + self._ensure_initialized() + logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") - golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] + + # Get references and predictions + references = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = 
[response[0].result for response in responses] + # Process in batches all_scores = [] - for i in range(0, len(golds), self.batch_size): - batch_golds = golds[i : i + self.batch_size] - batch_predictions = predictions[i : i + self.batch_size] - - inputs = self.tokenizer( - batch_golds, - batch_predictions, - return_tensors="pt", - padding=True, - truncation=True, - max_length=self.max_length, - ) - inputs = {k: v.to(self.model.device) for k, v in inputs.items()} - if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): - logger.warning(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") - with torch.no_grad(): - all_scores.extend(self.model(**inputs)[0].squeeze().cpu().tolist()) + for i in range(0, len(references), self.batch_size): + batch_refs = references[i : i + self.batch_size] + batch_preds = predictions[i : i + self.batch_size] + try: + scores = self._process_batch(batch_refs, batch_preds) + all_scores.extend(scores if isinstance(scores, list) else [scores]) + except Exception as e: + logger.error(f"Error processing batch {i}: {str(e)}") + # Use minimum score for failed batches + all_scores.extend([-1.0] * len(batch_refs)) return [{self.metric_name: score * 100} for score in all_scores] @@ -523,7 +567,8 @@ def __init__( raise ValueError("MPS is not supported for COMET") self.metric_name = model_name.split("/")[-1] - self.model = load_from_checkpoint(download_model(model_name)) + self.model = None # Lazy loading of the model + self.model_name = model_name self.batch_size = batch_size self.gpus = gpus self.accelerator = accelerator @@ -535,6 +580,11 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: + # Only load the model here to save memory and time + if self.model is None: + logger.info(f"Loading COMET model {self.model_name} lazily...") + self.model = load_from_checkpoint(download_model(self.model_name)) + logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result for response in responses] @@ -863,91 +913,114 @@ def prompt_fn(line: dict, task_name: str = None): ] METRICS = {} -# ===== Lexical metrics ===== -# Corpus level metrics -if "bleu" in METRICS_TO_USE: - METRICS["bleu"] = Metrics.bleu -if "chrf" in METRICS_TO_USE: - METRICS["chrf"] = Metrics.chrf -if "ter" in METRICS_TO_USE: - # TER often hangs for a while and takes more than 10 minutes to compute - METRICS["ter"] = Metrics.ter -# Sample level metrics -if "bleu_sentence" in METRICS_TO_USE: - METRICS["bleu_sentence"] = get_bleu_sentence() -if "chrf_sentence" in METRICS_TO_USE: - METRICS["chrf_sentence"] = get_chrf_sentence() -if "ter_sentence" in METRICS_TO_USE: - METRICS["ter_sentence"] = get_ter_sentence() -if "meteor" in METRICS_TO_USE: - METRICS["meteor"] = get_meteor() -# ===== Model-based metrics ===== -if "bert_score" in METRICS_TO_USE: - METRICS["bert_score"] = { # Create BERTScore metrics for each language - lang: get_bert_score(language=lang, model_type="xlm-roberta-large", device=device) - for lang in ["de", "fr", "it", "rm", "en"] - } -if "bleurt_tiny" in METRICS_TO_USE: - METRICS["bleurt_tiny"] = get_bleurt(model_size="tiny", seq_len=512, batch_size=256, device=device) -if "bleurt_base" in METRICS_TO_USE: - METRICS["bleurt_base"] = get_bleurt(model_size="base", seq_len=512, batch_size=256, device=device) -if "bleurt_large" in METRICS_TO_USE: - METRICS["bleurt_large"] = get_bleurt(model_size="large", 
seq_len=512, batch_size=256, device=device) -# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. -if "wmt22-comet-da" in METRICS_TO_USE: - METRICS["wmt22-comet-da"] = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) -if "xcomet_xl" in METRICS_TO_USE: - METRICS["xcomet_xl"] = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) -if "xcomet_xxl" in METRICS_TO_USE: - METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) -if "gemba_mqm_gpt_4o" in METRICS_TO_USE: - METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") -if "slt_judge_gpt_4o_mini" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o_mini"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-mini-2024-07-18", - short_judge_name="slt_judge_gpt-4o-mini", - ) -if "slt_judge_gpt_4o" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", - short_judge_name="slt_judge_gpt-4o", - ) -if "slt_judge_gpt_4o_basic_diverse" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o_basic_diverse"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", - short_judge_name="slt_judge_gpt-4o-basic-diverse", - system_style="basic", - few_shot_style="diverse", - ) -if "slt_judge_gpt_4o_basic_fr-de" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o_basic_fr-de"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", - short_judge_name="slt_judge_gpt-4o-basic-fr-de", - system_style="basic", - few_shot_style="fr-de", - ) -if "slt_judge_gpt_4o_detailed_diverse" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o_detailed_diverse"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", - short_judge_name="slt_judge_gpt-4o-detailed-diverse", - system_style="detailed", - few_shot_style="diverse", - ) -if "slt_judge_gpt_4o_detailed_fr-de" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o_detailed_fr-de"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", - short_judge_name="slt_judge_gpt-4o-detailed-fr-de", - system_style="detailed", - few_shot_style="fr-de", - ) -if "slt_judge_haiku_35" in METRICS_TO_USE: - METRICS["slt_judge_haiku_35"] = get_swiss_legal_translation_judge( - judge_model_name="anthropic/claude-3-5-haiku-20241022", short_judge_name="slt_judge_haiku-3.5" - ) -if "slt_judge_sonnet_35" in METRICS_TO_USE: - METRICS["slt_judge_sonnet_35"] = get_swiss_legal_translation_judge( - judge_model_name="anthropic/claude-3-5-sonnet-20241022", short_judge_name="slt_judge_sonnet-3.5" - ) + +def init_lexical_metrics(metric_name: str): + # Corpus level metrics + if metric_name == "bleu": + METRICS["bleu"] = Metrics.bleu + if metric_name == "chrf": + METRICS["chrf"] = Metrics.chrf + if metric_name == "ter": + # TER often hangs for a while and takes more than 10 minutes to compute + METRICS["ter"] = Metrics.ter + # Sample level metrics + if metric_name == "bleu_sentence": + METRICS["bleu_sentence"] = get_bleu_sentence() + if metric_name == "chrf_sentence": + METRICS["chrf_sentence"] = get_chrf_sentence() + if metric_name == "ter_sentence": + METRICS["ter_sentence"] = get_ter_sentence() + if metric_name == "meteor": + METRICS["meteor"] = get_meteor() + + +def init_model_based_metrics(metric_name: str): + if metric_name == "bert_score": + 
METRICS["bert_score"] = { # Create BERTScore metrics for each language + lang: get_bert_score(language=lang, model_type="xlm-roberta-large", device=device) + for lang in ["de", "fr", "it", "rm", "en"] + } + if metric_name == "bleurt_tiny": + METRICS["bleurt_tiny"] = get_bleurt(model_size="tiny", seq_len=512, batch_size=256, device=device) + if metric_name == "bleurt_base": + METRICS["bleurt_base"] = get_bleurt(model_size="base", seq_len=512, batch_size=256, device=device) + if metric_name == "bleurt_large": + METRICS["bleurt_large"] = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) + # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. + if metric_name == "wmt22-comet-da": + METRICS["wmt22-comet-da"] = get_comet( + model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device + ) + if metric_name == "xcomet_xl": + METRICS["xcomet_xl"] = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) + if metric_name == "xcomet_xxl": + METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) + + +def init_llm_judge_metrics(metric_name: str): + if metric_name == "gemba_mqm_gpt_4o": + METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") + if metric_name == "slt_judge_gpt_4o_mini": + METRICS["slt_judge_gpt_4o_mini"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-mini-2024-07-18", + short_judge_name="slt_judge_gpt-4o-mini", + ) + if metric_name == "slt_judge_gpt_4o": + METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o", + ) + if metric_name == "slt_judge_gpt_4o_basic_diverse": + METRICS["slt_judge_gpt_4o_basic_diverse"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-basic-diverse", + system_style="basic", + few_shot_style="diverse", + ) + if metric_name == "slt_judge_gpt_4o_basic_fr-de": + METRICS["slt_judge_gpt_4o_basic_fr-de"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-basic-fr-de", + system_style="basic", + few_shot_style="fr-de", + ) + if metric_name == "slt_judge_gpt_4o_detailed_diverse": + METRICS["slt_judge_gpt_4o_detailed_diverse"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-detailed-diverse", + system_style="detailed", + few_shot_style="diverse", + ) + if metric_name == "slt_judge_gpt_4o_detailed_fr-de": + METRICS["slt_judge_gpt_4o_detailed_fr-de"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-detailed-fr-de", + system_style="detailed", + few_shot_style="fr-de", + ) + if metric_name == "slt_judge_haiku_35": + METRICS["slt_judge_haiku_35"] = get_swiss_legal_translation_judge( + judge_model_name="anthropic/claude-3-5-haiku-20241022", short_judge_name="slt_judge_haiku-3.5" + ) + if metric_name == "slt_judge_sonnet_35": + METRICS["slt_judge_sonnet_35"] = get_swiss_legal_translation_judge( + judge_model_name="anthropic/claude-3-5-sonnet-20241022", short_judge_name="slt_judge_sonnet-3.5" + ) + + +def init_metric(metric_name: str): + # Only load the metric once + if metric_name in METRICS: + logger.debug(f"Metric {metric_name} already 
initialized") + return + + # ===== Lexical metrics ===== + init_lexical_metrics(metric_name) + # ===== Model-based metrics ===== + init_model_based_metrics(metric_name) + # ===== LLM Judge metrics ===== + init_llm_judge_metrics(metric_name) + # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 @@ -957,9 +1030,6 @@ def prompt_fn(line: dict, task_name: str = None): def get_metrics(METRICS_TO_USE, target_lang: str, generation_size: int): metrics = [] for metric in METRICS_TO_USE: - if metric not in METRICS: - logger.debug(f"Skipping {metric} because it is not available. Available metrics: {METRICS}") - continue # These metrics are sentence level metrics and we only want to use them for generation sizes up to 512. short_metrics = [ "bleu_sentence", @@ -978,6 +1048,8 @@ def get_metrics(METRICS_TO_USE, target_lang: str, generation_size: int): f"Skipping {metric} for generation size {generation_size} because the maximum supported sequence length is 512." ) continue + + init_metric(metric) if metric == "bert_score": # Add only the BERTScore for the target language metrics.append(METRICS["bert_score"][target_lang]) From 1c38c0aba807d9d93f44303b342f8797878d0edd Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 3 Jan 2025 12:11:13 +0100 Subject: [PATCH 47/57] Refactored judge metric creation. --- community_tasks/swiss_legal_evals.py | 90 +++++++++++++--------------- 1 file changed, 41 insertions(+), 49 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index a65768bf1..78279ef04 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -192,7 +192,7 @@ Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] """, - "fr-de": """Example 1: + "single": """Example 1: Source Text: ```Le contrat est nul s’il a pour objet une chose impossible, illicite ou contraire aux moeurs. 
Si le contrat n’est vicié que dans certaines de ses clauses, ces clauses sont seules frappées de nullité, à moins qu’il n’y ait lieu d’admettre que le contrat n’aurait pas été conclu sans elles.``` @@ -289,7 +289,7 @@ def get_swiss_legal_translation_judge( short_judge_name: str = "slt_judge_gpt-4o", backend: str = "litellm", system_style: str = "basic", # "basic" or "detailed" - few_shot_style: str = "diverse", # "diverse" or "fr-de" + few_shot_style: str = "diverse", # "diverse" or "single" ): def swiss_legal_translation_judge(question, options, answer, gold): system_prompt = SYSTEM[system_style] @@ -898,6 +898,13 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn +JUDGE_MODELS = { + "gpt-4o-mini": "openai/gpt-4o-mini-2024-07-18", + "gpt-4o": "openai/gpt-4o-2024-11-20", + "claude-3-5-haiku": "anthropic/claude-3-5-haiku-20241022", + "claude-3-5-sonnet": "anthropic/claude-3-5-sonnet-20241022", +} + METRICS_TO_USE = [ "bleu", "chrf", @@ -911,10 +918,20 @@ def prompt_fn(line: dict, task_name: str = None): "gemba_mqm_gpt_4o", "slt_judge_gpt_4o", ] +METRICS_TO_USE.extend( + [ + f"slt_judge_{judge_model}-{system_style}-{few_shot_style}".replace("-", "_") + for judge_model in JUDGE_MODELS + for system_style in ["basic", "detailed"] + for few_shot_style in ["diverse", "single"] + ] +) +logger.info(f"Available metrics: {METRICS_TO_USE}") + METRICS = {} -def init_lexical_metrics(metric_name: str): +def init_lexical_metric(metric_name: str): # Corpus level metrics if metric_name == "bleu": METRICS["bleu"] = Metrics.bleu @@ -934,7 +951,7 @@ def init_lexical_metrics(metric_name: str): METRICS["meteor"] = get_meteor() -def init_model_based_metrics(metric_name: str): +def init_model_based_metric(metric_name: str): if metric_name == "bert_score": METRICS["bert_score"] = { # Create BERTScore metrics for each language lang: get_bert_score(language=lang, model_type="xlm-roberta-large", device=device) @@ -957,55 +974,30 @@ def init_model_based_metrics(metric_name: str): METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) -def init_llm_judge_metrics(metric_name: str): +def init_llm_judge_metric(metric_name: str): if metric_name == "gemba_mqm_gpt_4o": METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") - if metric_name == "slt_judge_gpt_4o_mini": - METRICS["slt_judge_gpt_4o_mini"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-mini-2024-07-18", - short_judge_name="slt_judge_gpt-4o-mini", - ) + if metric_name == "slt_judge_gpt_4o": METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge( judge_model_name="openai/gpt-4o-2024-11-20", short_judge_name="slt_judge_gpt-4o", ) - if metric_name == "slt_judge_gpt_4o_basic_diverse": - METRICS["slt_judge_gpt_4o_basic_diverse"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", - short_judge_name="slt_judge_gpt-4o-basic-diverse", - system_style="basic", - few_shot_style="diverse", - ) - if metric_name == "slt_judge_gpt_4o_basic_fr-de": - METRICS["slt_judge_gpt_4o_basic_fr-de"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", - short_judge_name="slt_judge_gpt-4o-basic-fr-de", - system_style="basic", - few_shot_style="fr-de", - ) - if metric_name == "slt_judge_gpt_4o_detailed_diverse": - METRICS["slt_judge_gpt_4o_detailed_diverse"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", - short_judge_name="slt_judge_gpt-4o-detailed-diverse", - 
system_style="detailed", - few_shot_style="diverse", - ) - if metric_name == "slt_judge_gpt_4o_detailed_fr-de": - METRICS["slt_judge_gpt_4o_detailed_fr-de"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", - short_judge_name="slt_judge_gpt-4o-detailed-fr-de", - system_style="detailed", - few_shot_style="fr-de", - ) - if metric_name == "slt_judge_haiku_35": - METRICS["slt_judge_haiku_35"] = get_swiss_legal_translation_judge( - judge_model_name="anthropic/claude-3-5-haiku-20241022", short_judge_name="slt_judge_haiku-3.5" - ) - if metric_name == "slt_judge_sonnet_35": - METRICS["slt_judge_sonnet_35"] = get_swiss_legal_translation_judge( - judge_model_name="anthropic/claude-3-5-sonnet-20241022", short_judge_name="slt_judge_sonnet-3.5" - ) + + # Check all the judge metric combinations + for judge_model in JUDGE_MODELS: + for system_style in ["basic", "detailed"]: + for few_shot_style in ["diverse", "single"]: + short_judge_name = f"slt_judge_{judge_model}-{system_style}-{few_shot_style}" + judge_metric_name = short_judge_name.replace("-", "_") + if metric_name == judge_metric_name: + METRICS[metric_name] = get_swiss_legal_translation_judge( + judge_model_name=JUDGE_MODELS[judge_model], + short_judge_name=short_judge_name, + system_style=system_style, + few_shot_style=few_shot_style, + ) + break def init_metric(metric_name: str): @@ -1015,11 +1007,11 @@ def init_metric(metric_name: str): return # ===== Lexical metrics ===== - init_lexical_metrics(metric_name) + init_lexical_metric(metric_name) # ===== Model-based metrics ===== - init_model_based_metrics(metric_name) + init_model_based_metric(metric_name) # ===== LLM Judge metrics ===== - init_llm_judge_metrics(metric_name) + init_llm_judge_metric(metric_name) # Additionally we could consider adding the following open source judge models: From e05ac6a0e6d8a0ba72259b6a4555a4a8d11bc952 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 3 Jan 2025 12:55:07 +0100 Subject: [PATCH 48/57] Improved detailed judge prompt and changed secondary judge models from claude to gemini. --- community_tasks/swiss_legal_evals.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 78279ef04..ef458ddd5 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -109,12 +109,6 @@ 0.0: Completely incorrect -REQUIRED OUTPUT FORMAT: -Reasoning: Analyze how the model's translation aligns with or differs from the golden translation, focusing on significant legal and linguistic aspects. -Examples: Identify specific terms, phrases, or sections in the model's answer that were correct or incorrect, with explanations. -Score: End with "The correctness score: [[score]]" - - EVALUATION GUIDELINES: Stylistic differences should not impact accuracy significantly unless they alter the legal meaning. Untranslated Latin terms (e.g., prima facie) are not considered errors, but they should still be assessed for appropriate use within the context of the answer. @@ -123,8 +117,13 @@ Consider jurisdiction-specific legal terminology. Flag any ambiguities, omissions or additions that affect legal meaning. -Below are some examples. -""", + +REQUIRED OUTPUT FORMAT: +Your response should be in plain text with the following sections: +Reasoning: Analyze how the model's translation aligns with or differs from the golden translation, focusing on significant legal and linguistic aspects. 
+Examples: Identify specific terms, phrases, or sections in the model's answer that were correct or incorrect, with explanations. +Score: End with exactly this format: \"The correctness score: [[score]]\" +The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples.""", } FEW_SHOT = { @@ -901,8 +900,11 @@ def prompt_fn(line: dict, task_name: str = None): JUDGE_MODELS = { "gpt-4o-mini": "openai/gpt-4o-mini-2024-07-18", "gpt-4o": "openai/gpt-4o-2024-11-20", - "claude-3-5-haiku": "anthropic/claude-3-5-haiku-20241022", - "claude-3-5-sonnet": "anthropic/claude-3-5-sonnet-20241022", + "gemini-1-5-pro": "gemini/gemini-1.5-pro-002", + "gemini-1-5-flash": "gemini/gemini-1.5-flash-002", + # The Claude models do not follow the required output format. + # "claude-3-5-haiku": "anthropic/claude-3-5-haiku-20241022", + # "claude-3-5-sonnet": "anthropic/claude-3-5-sonnet-20241022", } METRICS_TO_USE = [ From 0aed06323120ed2d7739a57fb928328e4969dd88 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Sat, 4 Jan 2025 11:18:36 +0100 Subject: [PATCH 49/57] Changed judge order. --- community_tasks/swiss_legal_evals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index ef458ddd5..f8f6b24fc 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -900,8 +900,8 @@ def prompt_fn(line: dict, task_name: str = None): JUDGE_MODELS = { "gpt-4o-mini": "openai/gpt-4o-mini-2024-07-18", "gpt-4o": "openai/gpt-4o-2024-11-20", - "gemini-1-5-pro": "gemini/gemini-1.5-pro-002", "gemini-1-5-flash": "gemini/gemini-1.5-flash-002", + "gemini-1-5-pro": "gemini/gemini-1.5-pro-002", # The Claude models do not follow the required output format. # "claude-3-5-haiku": "anthropic/claude-3-5-haiku-20241022", # "claude-3-5-sonnet": "anthropic/claude-3-5-sonnet-20241022", From 46eb62ae88e9d46b7dea4ae6666060ad1500ac9d Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 7 Jan 2025 10:14:17 -0800 Subject: [PATCH 50/57] Fixed stop sequence issue in press releases. --- community_tasks/swiss_legal_evals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f8f6b24fc..df43a583e 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -862,7 +862,7 @@ def __post_init__(self): text_col_name="text", metadata_cols=["filename"], generation_size=1024, - stop_sequence=["", ".\n\n", "\n\n"], + stop_sequence=[""], ) }, ) From a78bc03b8ed7bca72c594092aed05bb7043b7694 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 10 Jan 2025 08:54:54 -0800 Subject: [PATCH 51/57] Fixed error in xcomet scores. 
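Depending on the pipeline, a prediction can arrive either as a plain string or wrapped in a single-element list, which broke the COMET input construction. The diff below inlines the unwrap; the helper name and example strings in this sketch are illustrative only.

    # Sketch of the normalization applied before building the COMET inputs.
    def as_text(pred):
        # Keep strings as-is, unwrap one level of list/tuple otherwise.
        return pred if isinstance(pred, str) else pred[0]

    sources = ["Le contrat est nul ..."]
    predictions = [["Der Vertrag ist nichtig ..."]]  # wrapped form seen in practice
    golds = ["Ein Vertrag ... ist nichtig."]

    data = [
        {"src": src, "mt": as_text(pred), "ref": gold}
        for src, pred, gold in zip(sources, predictions, golds)
    ]
    print(data[0]["mt"])
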
--- community_tasks/swiss_legal_evals.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index df43a583e..9c1e57bea 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -589,7 +589,14 @@ def compute( predictions = [response[0].result for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] - data = [{"src": src, "mt": pred, "ref": gold} for src, pred, gold in zip(sources, predictions, golds)] + data = [ + { + "src": src, + "mt": pred if isinstance(pred, str) else pred[0], + "ref": gold, + } + for src, pred, gold in zip(sources, predictions, golds) + ] model_output = self.model.predict( data, batch_size=self.batch_size, From f6b50b4cf8c21ffdb1c8231e60d67a15f601aa1f Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 10 Jan 2025 10:12:28 -0800 Subject: [PATCH 52/57] Made metric groups more easily configurable. --- community_tasks/swiss_legal_evals.py | 36 +++++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 9c1e57bea..f545ea3f5 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -914,27 +914,45 @@ def prompt_fn(line: dict, task_name: str = None): # "claude-3-5-sonnet": "anthropic/claude-3-5-sonnet-20241022", } -METRICS_TO_USE = [ +LEXICAL_METRICS = [ "bleu", "chrf", "bleu_sentence", "chrf_sentence", "ter_sentence", "meteor", +] +GPU_METRICS = [ "bert_score", "bleurt_large", "xcomet_xxl", +] +API_METRICS = [ "gemba_mqm_gpt_4o", "slt_judge_gpt_4o", ] -METRICS_TO_USE.extend( - [ - f"slt_judge_{judge_model}-{system_style}-{few_shot_style}".replace("-", "_") - for judge_model in JUDGE_MODELS - for system_style in ["basic", "detailed"] - for few_shot_style in ["diverse", "single"] - ] -) +JUDGE_METRICS = [ + f"slt_judge_{judge_model}-{system_style}-{few_shot_style}".replace("-", "_") + for judge_model in JUDGE_MODELS + for system_style in ["basic", "detailed"] + for few_shot_style in ["diverse", "single"] +] + +metrics_to_evaluate = ["judge"] + +METRICS_TO_USE = [] +if metrics_to_evaluate == ["debug"]: + METRICS_TO_USE = ["bleu"] +elif "lexical" in metrics_to_evaluate: + METRICS_TO_USE += LEXICAL_METRICS +elif "gpu" in metrics_to_evaluate: + METRICS_TO_USE += GPU_METRICS +elif "api" in metrics_to_evaluate: + METRICS_TO_USE += API_METRICS +elif "judge" in metrics_to_evaluate: + METRICS_TO_USE += JUDGE_METRICS +else: + METRICS_TO_USE = LEXICAL_METRICS + GPU_METRICS + API_METRICS logger.info(f"Available metrics: {METRICS_TO_USE}") METRICS = {} From 7f36065c0d0cf61f96facc02ed2b6b883627ee5e Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Sun, 12 Jan 2025 21:35:07 -0800 Subject: [PATCH 53/57] Made comet score more robust. 
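Rather than handling only one level of wrapping, the hunk below introduces a
small recursive unpack helper: nested lists or tuples are reduced to the
underlying string, and any other type fails loudly with a ValueError.
Illustrative behaviour of the helper added below:

    unpack("text")       # -> "text"
    unpack(["text"])     # -> "text"
    unpack((["text"],))  # -> "text" (nesting is unwrapped recursively)
    unpack(42)           # -> raises ValueError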
--- community_tasks/swiss_legal_evals.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f545ea3f5..9414e1e1b 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -589,14 +589,15 @@ def compute( predictions = [response[0].result for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] - data = [ - { - "src": src, - "mt": pred if isinstance(pred, str) else pred[0], - "ref": gold, - } - for src, pred, gold in zip(sources, predictions, golds) - ] + def unpack(x): + if isinstance(x, str): + return x + elif isinstance(x, (list, tuple)): + return unpack(x[0]) + else: + raise ValueError(f"Unknown type {type(x)} of prediction {x}") + + data = [{"src": src, "mt": unpack(pred), "ref": gold} for src, pred, gold in zip(sources, predictions, golds)] model_output = self.model.predict( data, batch_size=self.batch_size, From cb6bfb414e48ca0ec514feb59f99be3b1b539342 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 13 Jan 2025 11:06:40 -0800 Subject: [PATCH 54/57] Moved unpack to the pipeline code. --- community_tasks/swiss_legal_evals.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 9414e1e1b..6c1483dd7 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -45,6 +45,7 @@ from nltk.translate import meteor_score from packaging import version from sacrebleu import sentence_bleu, sentence_chrf, sentence_ter +from tqdm import tqdm from transformers import AutoModelForSequenceClassification, AutoTokenizer from lighteval.metrics.imports.bert_scorer import BERTScorer @@ -520,7 +521,10 @@ def compute( # Process in batches all_scores = [] - for i in range(0, len(references), self.batch_size): + for i in tqdm( + range(0, len(references), self.batch_size), + desc=f"Processing batches of size {self.batch_size} with {self.metric_name}", + ): batch_refs = references[i : i + self.batch_size] batch_preds = predictions[i : i + self.batch_size] try: @@ -589,15 +593,7 @@ def compute( predictions = [response[0].result for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] - def unpack(x): - if isinstance(x, str): - return x - elif isinstance(x, (list, tuple)): - return unpack(x[0]) - else: - raise ValueError(f"Unknown type {type(x)} of prediction {x}") - - data = [{"src": src, "mt": unpack(pred), "ref": gold} for src, pred, gold in zip(sources, predictions, golds)] + data = [{"src": src, "mt": pred, "ref": gold} for src, pred, gold in zip(sources, predictions, golds)] model_output = self.model.predict( data, batch_size=self.batch_size, From 866e7708b9a4800a9d9f27a023f8d946270d3e48 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 27 Jan 2025 14:20:10 +0100 Subject: [PATCH 55/57] Fixed bug in comet score. 
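The previous commit moved the unpack helper into the pipeline code and
reverted the data construction here to the raw prediction, so a
single-element list could once again reach the COMET input. The hunk below
restores the string guard on the "mt" field; the rows passed to
model.predict then have the expected shape (values illustrative only):

    data = [{"src": "Le contrat est nul ...",
             "mt": "Der Vertrag ist nichtig ...",
             "ref": "Ein Vertrag ... ist nichtig ..."}]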
--- community_tasks/swiss_legal_evals.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 6c1483dd7..21dcf078c 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -593,7 +593,10 @@ def compute( predictions = [response[0].result for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] - data = [{"src": src, "mt": pred, "ref": gold} for src, pred, gold in zip(sources, predictions, golds)] + data = [ + {"src": src, "mt": pred if isinstance(pred, str) else pred[0], "ref": gold} + for src, pred, gold in zip(sources, predictions, golds) + ] model_output = self.model.predict( data, batch_size=self.batch_size, From e7f9a096c3c294940ac72b1936eb8a82fc37f3b8 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Sat, 1 Feb 2025 11:36:56 +0100 Subject: [PATCH 56/57] Added additional judge prompt configurations. --- community_tasks/swiss_legal_evals.py | 200 ++++++++++++++++++--------- 1 file changed, 133 insertions(+), 67 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 21dcf078c..321920c9f 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -80,16 +80,23 @@ # CUSTOM METRICS +basic_system = "Act as a Judge specializing in the evaluation of translations of Swiss legal documents. Your task is to assess the accuracy, clarity, and fidelity of the model's translation to the golden translation, while considering the nuances of legal language." +detailed_system = "You are a senior legal translator and quality assurance specialist with over 20 years of experience in Swiss law, certified by the Swiss Sworn Translators Association (Association suisse des traducteurs-jurés, ASTJ). You possess native-level proficiency in all Swiss national languages (German, French, Italian, and Romansh) as well as English, enabling precise evaluation of legal nuances across all linguistic combinations. Your task is to evaluate machine-translated legal texts for accuracy, clarity and fidelity to Swiss legal standards analyzing the subtle complexities of legal language. You excel at identifying even minor discrepancies and calibrating evaluation scores appropriately to reflect the severity of each error." + + SYSTEM = { - "basic": "Act as a Judge specializing in the evaluation of translations of Swiss legal documents. Your task is to assess the accuracy, clarity, and fidelity of the model's translation to the golden translation, while considering the nuances of legal language.", - "detailed": "You are a senior legal translator and quality assurance specialist with over 20 years of experience in Swiss law, certified by the Swiss Sworn Translators Association (Association suisse des traducteurs-jurés, ASTJ). You possess native-level proficiency in all Swiss national languages (German, French, Italian, and Romansh) as well as English, enabling precise evaluation of legal nuances across all linguistic combinations. Your task is to evaluate machine-translated legal texts for accuracy, clarity and fidelity to Swiss legal standards analyzing the subtle complexities of legal language. 
You excel at identifying even minor discrepancies and calibrating evaluation scores appropriately to reflect the severity of each error.", + "basic": basic_system, + "detailed": detailed_system, + "codebook": detailed_system, } +final_formatting = 'The correctness score must strictly follow this format: "[[score]]", e.g., "The correctness score: [[0.5]]". Below are some examples.' + USER = { - "basic": """ -You will be provided with a source text, its golden translation, and the model's translation. Your task is to judge how correct the model's translation is based on the golden translation, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's translation conforms to or contradicts the golden translation, and then give the correctness score. The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples. -""", - "detailed": """ + "basic": f""" +You will be provided with a source text, its golden translation, and the model's translation. Your task is to judge how correct the model's translation is based on the golden translation, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's translation conforms to or contradicts the golden translation, and then give the correctness score. +{final_formatting}""", + "detailed": f""" INPUT FORMAT: Source Text: [Original text in source language] Golden Translation: [Reference professional translation] @@ -124,25 +131,53 @@ Reasoning: Analyze how the model's translation aligns with or differs from the golden translation, focusing on significant legal and linguistic aspects. Examples: Identify specific terms, phrases, or sections in the model's answer that were correct or incorrect, with explanations. Score: End with exactly this format: \"The correctness score: [[score]]\" -The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples.""", +{final_formatting}""", + "codebook": f""" +GENERAL INSTRUCTIONS: +You must give each translation a score between 0 and 1 that must be divisible by 0.1 (e.g., 0.6 or 0.9). To this end, you are given a source text, its “gold translation” (official translation of the Swiss authorities) and the predicted translation, to which you must assign the score. You can also write down notes if deemed necessary. + + +SCORE: +The scores shall reflect the completeness and accuracy of the predicted translation. In other words, you should not give a score based on readability or stylistic attributes. + + +POINT DEDUCTION SYSTEM: +A perfect, i.e., a perfectly complete and accurate translation receives a score of 1. +0.1 points deduction for a relevant legal term in an unusual but still correct manner. 0.1 points shall also be deducted if the law has not been translated (e.g., BV to BV). Finally, 0.1 points shall be deducted if a non-relevant term is missing. +0.2 points deduction if a legally relevant legal term is translated erroneously. 0.2 points shall also be deducted if a relevant term is missing. 
+0.4 points deduction for critical errors, such as when a law is translated with reference to the wrong law. + +Do not deduct points for discrepancies between the predicted translation and the gold translation if the predicted translation matches the source text better. The gold translation should primarily serve as a reference to help you assess cases where it is also a correct translation of the source. In some cases, the source text may differ slightly from the gold translation. This can happen if the source text itself was previously translated. Repeated errors for the same term should not lead to multiple point deductions. + + +REQUIRED OUTPUT FORMAT: +Your response should be in plain text with the following sections: +Deductions: Focusing on significant legal and linguistic aspects, analyze and present concretely all points to be deducted together with brief explanations. +Score: End with exactly this format: \"The correctness score: [[score]]\" +{final_formatting}""", } -FEW_SHOT = { - "diverse": """Example 1: -Source Text: +EXAMPLES = { + "law": { + "en-it": """Source Text: ```A contract is void if its terms are impossible, unlawful or immoral. However, where the defect pertains only to certain terms of a contract, those terms alone are void unless there is cause to assume that the contract would not have been concluded without them.``` Golden Translation: ```Il contratto che ha per oggetto una cosa impossibile o contraria alle leggi od ai buoni costumi è nullo. Se il contratto è viziato solo in alcune parti, queste soltanto sono nulle, ove non si debba ammettere che senza la parte nulla esso non sarebbe stato conchiuso.``` Model’s Translation: -```Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse.``` - -Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso” an older and more formal term, while the model opts for “concluso” which is modern. Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] +```Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse.```""", + "fr-de": """Source Text: +```Le contrat est nul s’il a pour objet une chose impossible, illicite ou contraire aux moeurs. Si le contrat n’est vicié que dans certaines de ses clauses, ces clauses sont seules frappées de nullité, à moins qu’il n’y ait lieu d’admettre que le contrat n’aurait pas été conclu sans elles.``` +Golden Translation: +```Ein Vertrag, der einen unmöglichen oder widerrechtlichen Inhalt hat oder gegen die guten Sitten verstösst, ist nichtig. 
Betrifft aber der Mangel bloss einzelne Teile des Vertrages, so sind nur diese nichtig, sobald nicht anzunehmen ist, dass er ohne den nichtigen Teil überhaupt nicht geschlossen worden wäre.``` -Example 2: -Source Text: +Model’s Translation: +```Der Vertrag ist nichtig, wenn er einen unmöglichen, widerrechtlichen oder sittenwidrigen Inhalt hat. Betrifft der Mangel bloß einzelne Teile des Vertrages, so sind nur diese nichtig, sobald nicht anzunehmen ist, dass er ohne den nichtigen Teil überhaupt nicht geschlossen worden wäre.```""", + }, + "headnote": { + "de-fr": """Source Text: ```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). Die Erhebung und die Aufbewahrung von Aufzeichnungen der AFV stellen einen Eingriff in die Grundrechte der Betroffenen dar, insbesondere in das Recht auf Privatsphäre, das den Anspruch auf informationelle Selbstbestimmung miteinschliesst (E. 3.1). Für die AFV besteht im Kanton Thurgau keine hinreichend bestimmte gesetzliche Grundlage. Der mit der Überwachung verbundene Eingriff in die Privatsphäre verstösst daher gegen Art. 13 Abs. 2 i.V.m. Art. 36 Abs. 1 BV (E. 3.2 und 3.3). Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3).``` @@ -155,13 +190,24 @@ Model’s Translation: ```Art. 13 al. 2, Art. 36 al. 1 Cst.; Art. 141 al. 2 CPP; Admissibilité des enregistrements de la poursuite automatisée des véhicules et de la surveillance de la circulation (AFV). La collecte et la conservation des enregistrements de l'AFV constituent une ingérence dans les droits fondamentaux des personnes concernées, en particulier dans le droit à la vie privée, qui inclut le droit à l'autodétermination informationnelle (consid. 3.1). Dans le canton de Thurgovie, il n'existe pas de base légale suffisamment déterminée pour l'AFV. Par conséquent, l'ingérence dans la vie privée associée à la surveillance viole donc les art. 13 al. 2 combiné avec l'art. 36 al. 1 Cst (consid. 3.2 et 3.3). -Si la police relève des infractions pénales dans le cadre de ses activités de contrôle préventif, elle enquête conformément aux art. 306 et suivants CPP. La question de savoir si les preuves illégalement recueillies peuvent être utilisées dans la procédure pénale est examinée conformément à l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Admissibilité dans le cas présent niée (consid. 4.3).``` - -Your Judgment: The model’s translation mostly aligns with the golden translation but diverges when it comes to accuracy and fidelity to Swiss legal terminology. For instance, the term “exploitabilité” which is closer to the Swiss provision is replaced in the model’s translation with “admissibilité”. Similarly, “ingérence” is used instead of “atteinte”, although “atteinte” is commonly used in Swiss law to discuss a violation of fundamental rights. Also, the term "recherche automatisée de véhicules et surveillance du trafic (RVS)" used by the golden translation is more established than "poursuite automatisée des véhicules et de la surveillance de la circulation (AFV)" in the model’s translation. 
The model’s translation is almost complete, but omits a critical point in one sentence: that the evidence was unlawfully obtained due to lack of a sufficiently clear legal basis. This omission impacts the completeness. The correctness score: [[0.7]] +Si la police relève des infractions pénales dans le cadre de ses activités de contrôle préventif, elle enquête conformément aux art. 306 et suivants CPP. La question de savoir si les preuves illégalement recueillies peuvent être utilisées dans la procédure pénale est examinée conformément à l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Admissibilité dans le cas présent niée (consid. 4.3).```""", + "fr-de": """Source Text: +```Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). +La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). +La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3).``` +Golden Translation: +```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). +Die Erhebung und die Aufbewahrung von Aufzeichnungen der AFV stellen einen Eingriff in die Grundrechte der Betroffenen dar, insbesondere in das Recht auf Privatsphäre, das den Anspruch auf informationelle Selbstbestimmung miteinschliesst (E. 3.1). Für die AFV besteht im Kanton Thurgau keine hinreichend bestimmte gesetzliche Grundlage. Der mit der Überwachung verbundene Eingriff in die Privatsphäre verstösst daher gegen Art. 13 Abs. 2 i.V.m. Art. 36 Abs. 1 BV (E. 3.2 und 3.3). +Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3).``` -Example 3: -Source Text: +Model’s Translation: +```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von Aufzeichnungen, die durch die Polizei mit dem System zur automatisierten Fahrzeugerfassung und Verkehrsüberwachung (RVS) durchgeführt wurden. +Die Durchführung und Speicherung von Aufzeichnungen durch das RVS stellt einen Eingriff in die Grundrechte der betroffenen Personen dar, insbesondere in den Schutz ihrer Privatsphäre, die das Recht auf Selbstbestimmung in Bezug auf Daten umfasst (E. 3.1). Der Einsatz von RVS stützt sich im Kanton Thurgau auf keine ausreichend präzise gesetzliche Grundlage. 
Die in dieser Überwachung liegende Beeinträchtigung der Privatsphäre verstößt daher gegen Art. 13 Abs. 2 in Verbindung mit Art. 36 Abs. 1 BV (E. 3.2 und 3.3). +Die Aufdeckung von Straftaten durch die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit fällt unter die Ermittlung im Sinne der Art. 306 ff. StPO. Die Verwertbarkeit im Strafverfahren von Beweisen, die auf illegale Weise erhoben wurden, muss anhand von Art. 141 Abs. 2 StPO geprüft werden (Klarstellung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit in diesem Fall verneint (E. 4.3).```""", + }, + "press": { + "fr-de": """Source Text: ```Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise Le Tribunal fédéral rejette le recours déposé par un collectionneur contre l'arrêt de la Cour de justice genevoise par lequel celle-ci confirmait que Yoko Ono est propriétaire de la montre qu'elle avait offerte à John Lennon en 1980, deux mois avant qu'il ne soit assassiné. Le collectionneur, qui a remis la montre à une maison de vente aux enchères genevoise en 2014 afin d'en faire estimer la valeur, a quant à lui revendiqué la propriété de ladite montre. @@ -188,73 +234,78 @@ Im Jahr 2018 hat der Käufer in Zürich eine Anzeige gegen Yoko Ono eröffnet, um die Qualität des Eigentums der Uhr zu bestätigen, was Yoko Ono ansprach. Im Jahr 2022 hat der Zivilgericht in Zürich bestätigt, dass Yoko Ono die einzige und letzte Eigentümerin der Uhr ist, was der Zivilgerichtshof im Jahr 2023 bestätigt hat. -Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat.``` +Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat.```""", + }, +} -Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] -""", - "single": """Example 1: -Source Text: -```Le contrat est nul s’il a pour objet une chose impossible, illicite ou contraire aux moeurs. 
Si le contrat n’est vicié que dans certaines de ses clauses, ces clauses sont seules frappées de nullité, à moins qu’il n’y ait lieu d’admettre que le contrat n’aurait pas été conclu sans elles.``` +FEW_SHOT = { + "diverse_absolute": f"""Example 1: +{EXAMPLES["law"]["en-it"]} -Golden Translation: -```Ein Vertrag, der einen unmöglichen oder widerrechtlichen Inhalt hat oder gegen die guten Sitten verstösst, ist nichtig. Betrifft aber der Mangel bloss einzelne Teile des Vertrages, so sind nur diese nichtig, sobald nicht anzunehmen ist, dass er ohne den nichtigen Teil überhaupt nicht geschlossen worden wäre.``` +Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso” an older and more formal term, while the model opts for “concluso” which is modern. Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] -Model’s Translation: -```Der Vertrag ist nichtig, wenn er einen unmöglichen, widerrechtlichen oder sittenwidrigen Inhalt hat. Betrifft der Mangel bloß einzelne Teile des Vertrages, so sind nur diese nichtig, sobald nicht anzunehmen ist, dass er ohne den nichtigen Teil überhaupt nicht geschlossen worden wäre.``` -Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy and clarity. However, minor stylistic differences exist. For example, the golden translation uses “gegen die guten Sitten verstösst” which is more idiomatic, while the model opts for the more literal “sittenwidrigen Inhalt hat” Similarly, “Ein Vertrag” in the golden translation better reflects generalized legal phrasing than the model’s literal “Der Vertrag”. The correctness score: [[0.9]] +Example 2: +{EXAMPLES["headnote"]["de-fr"]} +Your Judgment: The model’s translation mostly aligns with the golden translation but diverges when it comes to accuracy and fidelity to Swiss legal terminology. For instance, the term “exploitabilité” which is closer to the Swiss provision is replaced in the model’s translation with “admissibilité”. Similarly, “ingérence” is used instead of “atteinte”, although “atteinte” is commonly used in Swiss law to discuss a violation of fundamental rights. Also, the term "recherche automatisée de véhicules et surveillance du trafic (RVS)" used by the golden translation is more established than "poursuite automatisée des véhicules et de la surveillance de la circulation (AFV)" in the model’s translation. The model’s translation is almost complete, but omits a critical point in one sentence: that the evidence was unlawfully obtained due to lack of a sufficiently clear legal basis. This omission impacts the completeness. The correctness score: [[0.7]] -Example 2: -Source Text: -```Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). -La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. 
L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). -La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3).``` -Golden Translation: -```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). -Die Erhebung und die Aufbewahrung von Aufzeichnungen der AFV stellen einen Eingriff in die Grundrechte der Betroffenen dar, insbesondere in das Recht auf Privatsphäre, das den Anspruch auf informationelle Selbstbestimmung miteinschliesst (E. 3.1). Für die AFV besteht im Kanton Thurgau keine hinreichend bestimmte gesetzliche Grundlage. Der mit der Überwachung verbundene Eingriff in die Privatsphäre verstösst daher gegen Art. 13 Abs. 2 i.V.m. Art. 36 Abs. 1 BV (E. 3.2 und 3.3). -Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3).``` +Example 3: +{EXAMPLES["press"]["fr-de"]} -Model’s Translation: -```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von Aufzeichnungen, die durch die Polizei mit dem System zur automatisierten Fahrzeugerfassung und Verkehrsüberwachung (RVS) durchgeführt wurden. -Die Durchführung und Speicherung von Aufzeichnungen durch das RVS stellt einen Eingriff in die Grundrechte der betroffenen Personen dar, insbesondere in den Schutz ihrer Privatsphäre, die das Recht auf Selbstbestimmung in Bezug auf Daten umfasst (E. 3.1). Der Einsatz von RVS stützt sich im Kanton Thurgau auf keine ausreichend präzise gesetzliche Grundlage. Die in dieser Überwachung liegende Beeinträchtigung der Privatsphäre verstößt daher gegen Art. 13 Abs. 2 in Verbindung mit Art. 36 Abs. 1 BV (E. 3.2 und 3.3). -Die Aufdeckung von Straftaten durch die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit fällt unter die Ermittlung im Sinne der Art. 306 ff. StPO. Die Verwertbarkeit im Strafverfahren von Beweisen, die auf illegale Weise erhoben wurden, muss anhand von Art. 141 Abs. 2 StPO geprüft werden (Klarstellung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit in diesem Fall verneint (E. 4.3).``` +Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. 
The correctness score: [[0.2]] +""", + "single_absolute": f"""Example 1: +{EXAMPLES["law"]["fr-de"]} + +Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy and clarity. However, minor stylistic differences exist. For example, the golden translation uses “gegen die guten Sitten verstösst” which is more idiomatic, while the model opts for the more literal “sittenwidrigen Inhalt hat” Similarly, “Ein Vertrag” in the golden translation better reflects generalized legal phrasing than the model’s literal “Der Vertrag”. The correctness score: [[0.9]] + + +Example 2: +{EXAMPLES["headnote"]["fr-de"]} Your Judgment: The model’s translation is accurate overall but omits a critical point in the second-to-last sentence: the evidence was unlawfully obtained due to a lack of legal basis. Additionally, its fidelity to Swiss legal terminology is limited. For example, the model uses "Klarstellung der Rechtsprechung" instead of the more appropriate "Präzisierung der Rechtsprechung" and "nicht ausreichend präzise" rather than the common "hinreichend bestimmt" It also consistently uses the French abbreviation "RVS" instead of the German "automatische Fahrzeugfahndung und Verkehrsüberwachung (AFV)" Lastly, "Recht auf Selbstbestimmung in Bezug auf Daten" is overly literal compared to the idiomatic "Anspruch auf informationelle Selbstbestimmung". The correctness score: [[0.6]] Example 3: -Source Text: -```Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise +{EXAMPLES["press"]["fr-de"]} -Le Tribunal fédéral rejette le recours déposé par un collectionneur contre l'arrêt de la Cour de justice genevoise par lequel celle-ci confirmait que Yoko Ono est propriétaire de la montre qu'elle avait offerte à John Lennon en 1980, deux mois avant qu'il ne soit assassiné. Le collectionneur, qui a remis la montre à une maison de vente aux enchères genevoise en 2014 afin d'en faire estimer la valeur, a quant à lui revendiqué la propriété de ladite montre. +Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] +""", + "diverse_deduction": f"""Example 1: +{EXAMPLES["law"]["en-it"]} -En 1980, Yoko Ono a acquis à New York une montre de marque Patek Philippe. Elle y a fait graver au dos l'inscription « (JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C » et l'a offerte à son époux, John Lennon, le 9 octobre 1980 pour son 40e anniversaire. Le 8 décembre 1980, John Lennon a été assassiné à New York. La montre a été répertoriée dans l'inventaire successoral et conservée dans une pièce de l'appartement de Yoko Ono à New York. Par la suite, la montre s'est retrouvée aux mains d'un homme qui avait été le chauffeur privé de Yoko Ono de 1995 à 2006. Un autre possesseur intermédiaire l'a remise à une maison de vente aux enchères allemande, où elle a été acquise par un collectionneur en 2014. 
Ce dernier l'a remise la même année à une maison de vente aux enchères genevoise afin d'en faire estimer la valeur, ce dont a été informée Yoko Ono. Cette dernière n'avait jusqu'alors pas eu conscience du fait que la montre n'était plus en sa possession. En 2018, le collectionneur a formé à Genève une action visant à constater sa qualité de propriétaire, action à laquelle Yoko Ono s'est opposée. En 2022, le tribunal de première instance genevois a constaté que Yoko Ono était la seule et unique propriétaire de la montre, ce que la Cour de justice du canton de Genève, statuant sur appel du collectionneur, a confirmé en 2023. +Your Judgment: The model’s translation is legally sound and has no critical errors, but minor legal terminology choices warrant deductions. There are no deductions for minor stylistic differences, such as the use of “concluso” instead of “conchiuso”. However, there is a 0.1 points deduction for the more literal “illecite o immorali” instead of “contraria alle leggi od ai buoni costumi”. Another 0.1 points deduction is for the use of “clausole” instead of “parti” as “clausole” is narrower as the source text. The correctness score: [[0.8]] -Le Tribunal fédéral rejette le recours déposé par le collectionneur contre cet arrêt. Il n'est tout d'abord pas contesté que la propriété de la montre a été acquise par succession par Yoko Ono après le décès de John Lennon. C'est en outre sans arbitraire que la Cour de justice genevoise a retenu que la montre avait été volée par l'ancien chauffeur et que, à l'inverse, aucun élément ne permettait de démontrer que Yoko Ono aurait eu l'intention de faire donation au chauffeur d'une chose si particulière que la montre, gravée d'une inscription, qu'elle avait offerte à John Lennon deux mois avant son décès. Dès lors qu'il s'agit d'une chose volée, le collectionneur, aujourd'hui recourant, ne pouvait pas acquérir la propriété de la montre par un mode originaire d'acquisition lorsqu'il l'a achetée en Allemagne en 2014 ; selon le droit allemand applicable en la matière, cela vaut indépendamment du fait que l'acquéreur était ou non de bonne foi quant à l'origine de la chose.``` -Golden Translation: -```Yoko Ono ist Eigentümerin der Uhr von John Lennon – Beschwerde von Sammler gegen Genfer Urteil abgewiesen +Example 2: +{EXAMPLES["headnote"]["de-fr"]} -Das Bundesgericht weist die Beschwerde eines Sammlers gegen das Urteil des Genfer Kantonsgerichts ab, mit dem Yoko Ono als Eigentümerin der Uhr bestätigt wurde, die sie John Lennon 1980 zwei Monate vor seiner Ermordung geschenkt hat. Der Sammler hatte die Uhr 2014 zur Schätzung bei einem Auktionshaus in Genf eingereicht und seinerseits Eigentümerschaft an der Uhr geltend gemacht. +Your Judgment: The model’s translation is legally sound but contains terminological inconsistencies and an omission that affects its completeness. A 0.1 points deduction applies for “admissibilité” instead of “exploitabilité”, as the latter is the precise Swiss legal term for evidentiary usability. Another 0.1 points deduction is given for “ingérence” instead of “atteinte”, as "atteinte" is the standard term for fundamental rights violations in Swiss law. Additionally, “poursuite automatisée des véhicules et de la surveillance de la circulation (AFV)” replaces “recherche automatisée de véhicules et surveillance du trafic (RVS)”, deviating from the established Swiss legal terminology, leading to another 0.1 points deduction. 
A 0.2 points deduction is applied for the omitting of a legally significant detail—the lack of a sufficiently clear legal basis for evidence collection, which directly affects the completeness of the translation. The correctness score: [[0.5]] -Yoko Ono hatte 1980 in New York eine Uhr der Marke Patek Philippe gekauft. Sie liess auf der Rückseite die Gravur "(JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C" anbringen und schenkte sie ihrem Ehemann John Lennon am 9. Oktober 1980 zum 40. Geburtstag. Am 8. Dezember 1980 wurde John Lennon in New York ermordet. Die Uhr wurde ins Erbschaftsinventar aufgenommen und in einem Zimmer der Wohnung von Yoko Ono in New York aufbewahrt. Sie gelangte von dort in die Hände eines Mannes, der von 1995 bis 2006 Privatchauffeur von Yoko Ono gewesen war. Ein weiterer Zwischenbesitzer brachte die Uhr in ein deutsches Auktionshaus, wo sie 2014 von einem Sammler erworben wurde. Dieser reichte die Uhr im gleichen Jahr bei einem Auktionshaus in Genf zur Schätzung ihres Wertes ein. Davon erfuhr Yoko Ono, die bis dahin keine Kenntnis davon gehabt hatte, dass sich die Uhr nicht mehr in ihrem Besitz befand. Der Sammler erhob 2018 in Genf eine Klage auf Feststellung seiner Eigentümerschaft, der sich Yoko Ono widersetzte. Das erstinstanzliche Genfer Gericht stellte 2022 fest, dass Yoko Ono die alleinige Eigentümerin der Uhr sei, was auf Berufung des Sammlers 2023 vom Kantonsgericht des Kantons Genf bestätigt wurde. -Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war.``` +Example 3: +{EXAMPLES["press"]["fr-de"]} -Model’s Translation: -```Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. +Your Judgment: The model’s translation fails to meet standards of completeness and accuracy due to severe errors and omissions. A 0.4 points deduction is applied for the omission of a critical legal argument—that stolen property cannot be acquired in good faith under German law. Furthermore, 0.1 points deductions are applied for grammatical errors that reduce clarity, such as “Geneßische Auktion”, “Wagenkellner” and “zu valieren”. Two 0.2 points deductions apply for the misidentification of the courts "Zivilgericht" and “Zivilgerichtsverfassung” instead of "Kantonsgericht" and "Bundesgerichtshof" instead of "Bundesgericht". Another 0.2 points deduction comes from the translation “Erbvergabe” instead of “Erbschaft”. 
The correctness score: [[0]] +""", + "single_deduction": f"""Example 1: +{EXAMPLES["law"]["fr-de"]} -Im Jahr 1980 hat Yoko Ono in New York eine Uhr von der Marke Patek Philippe gekauft. Sie hat auf der Rückseite der Uhr die Inschrift "JUST LIKE STARTING OVER LOVE YOKO 10·9·1980 N.Y.C." gemalt und sie an ihren Ehemann, John Lennon, am 9. Oktober 1980 als Geburtstag gegeben. Am 8. Dezember 1980 wurde Lennon ermordet in New York. Die Uhr wurde in den Erbeverzeichnis des Lennon-Mädchens aufbewahrt und in einem Zimmer in Yoko Onos New Yorker Wohnung. +Your Judgment: The model’s translation is complete and legally sound, but a minor terminology issue warrants a deduction. A 0.1 points deduction applies for "sittenwidrigen Inhalt hat" instead of "gegen die guten Sitten verstößt", as the latter is the standard Swiss legal phrasing in contract law. No deduction is applied for "Der Vertrag" instead of "Ein Vertrag", as the point deduction system does not penalize stylistic variations that do not impact legal accuracy or completeness. The correctness score: [[0.9]] -Danach wurde die Uhr von einem Mann, der als Wagenkellner von Yoko Ono zwischen 1995 und 2006 gearbeitet hat, in die Hände gebracht. Ein weiterer mittlerer Eigentümer hat sie 2014 in eine deutsche Auktion verkauft, wo sie einem Käufer in Zürich übergeben wurde. Dieser Käufer hat die Uhr 2014 in eine Geneßische Auktion verkauft, um sie zu valieren, und Yoko Ono war von diesem Moment an nicht mehr bewusst, dass die Uhr nicht mehr in ihrer Hand war. -Im Jahr 2018 hat der Käufer in Zürich eine Anzeige gegen Yoko Ono eröffnet, um die Qualität des Eigentums der Uhr zu bestätigen, was Yoko Ono ansprach. Im Jahr 2022 hat der Zivilgericht in Zürich bestätigt, dass Yoko Ono die einzige und letzte Eigentümerin der Uhr ist, was der Zivilgerichtshof im Jahr 2023 bestätigt hat. +Example 2: +{EXAMPLES["headnote"]["fr-de"]} -Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat.``` +Your Judgment: The model’s translation is legally structured and accurate in meaning, but omissions and terminological inconsistencies affect completeness and precision. A 0.2 points deduction applies for omitting a key legal argument—that evidence was unlawfully obtained due to a lack of legal basis, impacting completeness. A 0.1 points deduction is given for terminological imprecision, such as "nicht ausreichend präzise" instead of "hinreichend bestimmt". Another 0.1 points deduction applies for using the French abbreviation "RVS" instead of the correct German "AFV", and a final 0.1 points deduction is applied for overly literal phrasing in "Recht auf Selbstbestimmung in Bezug auf Daten" instead of "Anspruch auf informationelle Selbstbestimmung". Despite these issues, the translation conveys the core meaning correctly. The correctness score: [[0.5]] -Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. 
For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] + +Example 3: +{EXAMPLES["press"]["fr-de"]} + +Your Judgment: The model’s translation fails to meet standards of completeness and accuracy due to severe errors and omissions. A 0.4 points deduction is applied for the omission of a critical legal argument—that stolen property cannot be acquired in good faith under German law. Furthermore, 0.1 points deductions are applied for grammatical errors that reduce clarity, such as “Geneßische Auktion”, “Wagenkellner” and “zu valieren”. Two 0.2 points deductions apply for the misidentification of the courts "Zivilgericht" and “Zivilgerichtsverfassung” instead of "Kantonsgericht" and "Bundesgerichtshof" instead of "Bundesgericht". Another 0.2 points deduction comes from the translation “Erbvergabe” instead of “Erbschaft”. The correctness score: [[0]] """, } @@ -288,13 +339,19 @@ def get_swiss_legal_translation_judge( judge_model_name: str = "openai/gpt-4o-2024-11-20", short_judge_name: str = "slt_judge_gpt-4o", backend: str = "litellm", - system_style: str = "basic", # "basic" or "detailed" + system_style: str = "basic", # "basic" or "detailed" or "codebook" few_shot_style: str = "diverse", # "diverse" or "single" + judgment_style: str = "absolute", # "absolute" or "deduction" ): + if system_style == "codebook" and judgment_style == "absolute": + raise ValueError("The codebook system style can only be used with the deduction judgment style.") + if system_style in ("basic", "detailed") and judgment_style == "deduction": + raise ValueError(f"The {system_style} can only be used with the absolute judgment style.") + def swiss_legal_translation_judge(question, options, answer, gold): system_prompt = SYSTEM[system_style] user = USER[system_style] - few_shot_examples = FEW_SHOT[few_shot_style] + few_shot_examples = FEW_SHOT[f"{few_shot_style}_{judgment_style}"] instruction = f"""Judge the below case, give the brief reasoning process and the correctness score. @@ -907,6 +964,7 @@ def prompt_fn(line: dict, task_name: str = None): JUDGE_MODELS = { "gpt-4o-mini": "openai/gpt-4o-mini-2024-07-18", "gpt-4o": "openai/gpt-4o-2024-11-20", + # The Gemini models are not very good judges. "gemini-1-5-flash": "gemini/gemini-1.5-flash-002", "gemini-1-5-pro": "gemini/gemini-1.5-pro-002", # The Claude models do not follow the required output format. 
@@ -932,10 +990,18 @@ def prompt_fn(line: dict, task_name: str = None): "slt_judge_gpt_4o", ] JUDGE_METRICS = [ - f"slt_judge_{judge_model}-{system_style}-{few_shot_style}".replace("-", "_") - for judge_model in JUDGE_MODELS - for system_style in ["basic", "detailed"] - for few_shot_style in ["diverse", "single"] + [ + f"slt_judge_{judge_model}-{system_style}-{few_shot_style}-{judgment_style}".replace("-", "_") + for judge_model in JUDGE_MODELS + for few_shot_style in ["diverse", "single"] + for system_style, judgment_style in [ + ("basic", "absolute"), + ("detailed", "absolute"), + ("codebook", "deduction"), + # Make sure that the codebook system style is used with the deduction judgment style + # and the basic and detailed system styles are used with the absolute judgment style + ] + ] ] metrics_to_evaluate = ["judge"] From 186a6c83291ca26a4f3fac374cd3cdfdd3802763 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Sat, 1 Feb 2025 11:54:43 +0100 Subject: [PATCH 57/57] Fixed judge setup. --- community_tasks/swiss_legal_evals.py | 31 +++++++++++++++------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 321920c9f..521ff7cfe 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -990,17 +990,15 @@ def prompt_fn(line: dict, task_name: str = None): "slt_judge_gpt_4o", ] JUDGE_METRICS = [ - [ - f"slt_judge_{judge_model}-{system_style}-{few_shot_style}-{judgment_style}".replace("-", "_") - for judge_model in JUDGE_MODELS - for few_shot_style in ["diverse", "single"] - for system_style, judgment_style in [ - ("basic", "absolute"), - ("detailed", "absolute"), - ("codebook", "deduction"), - # Make sure that the codebook system style is used with the deduction judgment style - # and the basic and detailed system styles are used with the absolute judgment style - ] + f"slt_judge_{judge_model}-{system_style}-{few_shot_style}-{judgment_style}".replace("-", "_") + for judge_model in JUDGE_MODELS + for few_shot_style in ["diverse", "single"] + for system_style, judgment_style in [ + ("basic", "absolute"), + ("detailed", "absolute"), + ("codebook", "deduction"), + # Make sure that the codebook system style is used with the deduction judgment style + # and the basic and detailed system styles are used with the absolute judgment style ] ] @@ -1079,9 +1077,13 @@ def init_llm_judge_metric(metric_name: str): # Check all the judge metric combinations for judge_model in JUDGE_MODELS: - for system_style in ["basic", "detailed"]: - for few_shot_style in ["diverse", "single"]: - short_judge_name = f"slt_judge_{judge_model}-{system_style}-{few_shot_style}" + for few_shot_style in ["diverse", "single"]: + for system_style, judgment_style in [ + ("basic", "absolute"), + ("detailed", "absolute"), + ("codebook", "deduction"), + ]: + short_judge_name = f"slt_judge_{judge_model}-{system_style}-{few_shot_style}-{judgment_style}" judge_metric_name = short_judge_name.replace("-", "_") if metric_name == judge_metric_name: METRICS[metric_name] = get_swiss_legal_translation_judge( @@ -1089,6 +1091,7 @@ def init_llm_judge_metric(metric_name: str): short_judge_name=short_judge_name, system_style=system_style, few_shot_style=few_shot_style, + judgment_style=judgment_style, ) break
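As a worked illustration of the judge configuration above (derived from the
code, concrete names shown for one judge model only): with the three allowed
(system_style, judgment_style) pairings and the two few-shot styles, each
judge model yields six metric names, e.g. for judge_model = "gpt-4o":

    slt_judge_gpt_4o_basic_diverse_absolute
    slt_judge_gpt_4o_detailed_diverse_absolute
    slt_judge_gpt_4o_codebook_diverse_deduction
    slt_judge_gpt_4o_basic_single_absolute
    slt_judge_gpt_4o_detailed_single_absolute
    slt_judge_gpt_4o_codebook_single_deduction

init_llm_judge_metric recognises exactly these names and instantiates the
matching get_swiss_legal_translation_judge configuration for each of them.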