Benchmark runner test (#488)
jakethekoenig authored Jan 18, 2024
1 parent 338046e commit b438094
Showing 14 changed files with 208 additions and 70 deletions.
3 changes: 1 addition & 2 deletions benchmarks/arg_parser.py
@@ -17,9 +17,8 @@ def common_benchmark_parser():
)
parser.add_argument(
"--benchmarks",
action="append",
nargs="*",
default=[[]],
default=[],
help=(
"Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
" depends on benchmark."
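
The change drops action="append": previously the flag combined append with nargs="*", so every use of --benchmarks was nested inside an outer list (hence the [[]] default and the args.benchmarks[0] indexing that other call sites in this commit remove). With plain nargs="*" and default=[], callers get a flat list. A minimal sketch of the difference, using a throwaway parser rather than the real common_benchmark_parser:

    import argparse

    # Old behaviour: append + nargs="*" nests values inside an outer list,
    # and argparse appends after the non-empty default.
    old = argparse.ArgumentParser()
    old.add_argument("--benchmarks", action="append", nargs="*", default=[[]])
    print(old.parse_args([]).benchmarks)                          # [[]]
    print(old.parse_args(["--benchmarks", "a", "b"]).benchmarks)  # [[], ['a', 'b']]

    # New behaviour: one flat list, empty when the flag is not given.
    new = argparse.ArgumentParser()
    new.add_argument("--benchmarks", nargs="*", default=[])
    print(new.parse_args([]).benchmarks)                          # []
    print(new.parse_args(["--benchmarks", "a", "b"]).benchmarks)  # ['a', 'b']
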
101 changes: 56 additions & 45 deletions benchmarks/benchmark_runner.py
@@ -177,6 +177,7 @@ async def evaluate_sample(sample_file, retries=1):
"""Run a sample using Mentat and return the resulting diff"""
sample = Sample.load(sample_file)
results = []
start_dir = Path.cwd()
for i in range(retries):
formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(" ", "_")
result = BenchmarkResult(
@@ -189,20 +190,23 @@
diff_merge_base=sample.diff_merge_base,
diff_active=sample.diff_active,
)
cwd = Path(repo.working_dir)

# Run sample in PythonClient
paths = list[Path]()
for a in sample.context:
paths.append(Path(a))
client = PythonClient(cwd=cwd, paths=paths)
response = await run_client(
client, sample.message_prompt, result, sample.message_history
)
await grade_and_clean_diff(
repo, response, result, comparison_diff=sample.diff_edit
)
results.append(result)
try:
cwd = Path(repo.working_dir)

# Run sample in PythonClient
paths = list[Path]()
for a in sample.context:
paths.append(Path(a))
client = PythonClient(cwd=cwd, paths=paths)
response = await run_client(
client, sample.message_prompt, result, sample.message_history
)
await grade_and_clean_diff(
repo, response, result, comparison_diff=sample.diff_edit
)
results.append(result)
finally:
os.chdir(start_dir)
return results
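
The try/finally added here follows a standard cleanup pattern: a sample run may chdir into a freshly cloned repo, so the runner records its starting directory once and always returns to it, even when the client errors out, keeping later iterations and relative paths valid. A minimal, self-contained sketch of the same idea (run_in_repo is a stand-in, not Mentat's API):

    import os
    from pathlib import Path

    def run_in_repo(repo_dir: Path) -> None:
        # Stand-in for the real benchmark body, which may change directory.
        os.chdir(repo_dir)
        print("working in", Path.cwd())

    def run_all(repo_dirs: list[Path]) -> None:
        start_dir = Path.cwd()  # remember where the runner started
        for repo_dir in repo_dirs:
            try:
                run_in_repo(repo_dir)
            finally:
                # Restore the original directory even if the run raised.
                os.chdir(start_dir)
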


@@ -212,36 +216,43 @@ async def evalute_py(path, retries):
title = benchmark.title

print("Benchmark:", title)
repo = setup_repo(
url=benchmark.repo,
commit=benchmark.commit,
)
cwd = Path(repo.working_dir)

if hasattr(benchmark, "comparison_commit"):
comparison_commit = benchmark.comparison_commit
repo.git.checkout(comparison_commit)
comparison_diff = repo.git.diff(benchmark.commit)
else:
comparison_diff = None

for i, prompt in enumerate(benchmark.prompts):
print(" Prompt:", prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
)
client = PythonClient(cwd=cwd, config=benchmark.config)
response = await run_client(client, prompt, result)
start_dir = Path.cwd()
try:
repo = setup_repo(
url=benchmark.repo,
commit=benchmark.commit,
)
cwd = Path(repo.working_dir)

await client.shutdown()
if hasattr(benchmark, "verify"):
result.verify = benchmark.verify()
if hasattr(benchmark, "comparison_commit"):
comparison_commit = benchmark.comparison_commit
repo.git.checkout(comparison_commit)
comparison_diff = repo.git.diff(benchmark.commit)
else:
comparison_diff = None

for i, prompt in enumerate(benchmark.prompts):
print(" Prompt:", prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
)
client = PythonClient(
cwd=cwd, paths=benchmark.paths, config=benchmark.config
)
response = await run_client(client, prompt, result)

await grade_and_clean_diff(repo, response, result, comparison_diff)
results.append(result)
await client.shutdown()
if hasattr(benchmark, "verify"):
result.verify = benchmark.verify()

await grade_and_clean_diff(repo, response, result, comparison_diff)
os.chdir("../..")
results.append(result)
finally:
os.chdir(start_dir)
return results


@@ -252,9 +263,9 @@ def benchmark_listed(title, benchmarks):
return False


async def run_benchmarks(retries, benchmarks):
async def run_benchmarks(benchmarks, retries=1):
print("Running benchmarks")
benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"
benchmarks_dir = Path("benchmarks/benchmarks")

benchmark_paths = []
for root, dirs, files in os.walk(benchmarks_dir):
@@ -296,7 +307,7 @@ async def run_benchmarks(retries, benchmarks):
args = parser.parse_args()
asyncio.run(
run_benchmarks(
args.benchmarks,
args.retries,
args.benchmarks[0],
)
)
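
With the reordered signature, run_benchmarks takes the benchmark names first and retries defaults to 1, and it now resolves benchmark definitions from the relative benchmarks/benchmarks directory. A small sketch of that discovery step, assuming the intent is simply to collect every .py file under that directory (collect_benchmark_paths is an illustrative name, not the runner's actual helper):

    import os
    from pathlib import Path

    def collect_benchmark_paths(benchmarks_dir: Path = Path("benchmarks/benchmarks")) -> list[Path]:
        # Walk the tree and keep every Python file; the real runner may filter
        # further, e.g. against the --benchmarks allow-list.
        paths: list[Path] = []
        for root, _dirs, files in os.walk(benchmarks_dir):
            for name in files:
                if name.endswith(".py"):
                    paths.append(Path(root) / name)
        return paths
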
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/clojure_exercism_runner.py
@@ -14,6 +14,7 @@
repo = "https://github.com/AbanteAI/mentat"
commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
minimum_context = ["tests/benchmarks/exercise_runners"]
paths = []

config = Config(
auto_context_tokens=8000,
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/license_update.py
@@ -22,6 +22,7 @@
repo = "https://github.com/AbanteAI/mentat"
commit = "b0848711c36e0c2fe9619ebb2b77dc6d27396ff2"
minimum_context = ["tests/license_check.py:11-22"]
paths = []

config = Config(
auto_context_tokens=8000,
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/pre_tags.py
@@ -22,6 +22,7 @@

repo = "https://github.com/AbanteAI/mentat"
commit = "b8d90b89e4a0d7ad266bf914c4ce99c473dd8dc0"
paths = []

config = Config(
auto_context_tokens=8000,
2 changes: 1 addition & 1 deletion benchmarks/exercism_practice.py
@@ -225,7 +225,7 @@ def run_exercism_benchmark(
args = parser.parse_args()
clone_exercism_repo(args.refresh_repo, args.language)
run_exercism_benchmark(
args.benchmarks[0],
args.benchmarks,
args.max_benchmarks,
args.max_iterations,
args.max_workers,
2 changes: 1 addition & 1 deletion mentat/sampler/utils.py
@@ -9,7 +9,7 @@
from mentat.git_handler import get_non_gitignored_files
from mentat.utils import is_file_text_encoded

CLONE_TO_DIR = Path("benchmark_repos")
CLONE_TO_DIR = Path("benchmarks/benchmark_repos")


def clone_repo(
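
Because CLONE_TO_DIR is a relative path, cloned benchmark repos now land under benchmarks/benchmark_repos beneath whatever directory the process runs from, which is also why the runner changes above take care to restore the starting directory. A small sketch of how a clone destination could be derived from this constant (clone_destination is a hypothetical helper, not the module's actual function):

    from pathlib import Path

    CLONE_TO_DIR = Path("benchmarks/benchmark_repos")

    def clone_destination(url: str) -> Path:
        # Hypothetical: name the checkout after the last path segment of the URL.
        name = url.rstrip("/").rsplit("/", 1)[-1].removesuffix(".git")
        return CLONE_TO_DIR / name

    print(clone_destination("https://github.com/AbanteAI/mentat"))
    # benchmarks/benchmark_repos/mentat
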
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,14 +1,14 @@
[tool.isort]
profile = "black"
known_first_party = "mentat"
skip = ["vscode/bundled", "benchmark_repos", "testbed/exercism-python"]
skip = ["vscode/bundled", "benchmarks/benchmark_repos", "testbed/exercism-python"]

[tool.ruff]
line-length = 120
ignore = ["E731"]

[tool.pytest.ini_options]
addopts = "--ignore=vscode/bundled --ignore=benchmark_repos --ignore=testbed/exercism-python"
addopts = "--ignore=vscode/bundled --ignore=benchmarks/benchmark_repos --ignore=testbed/exercism-python"

[tool.black]
preview = "true"
9 changes: 8 additions & 1 deletion pyrightconfig.json
@@ -1,6 +1,13 @@
{
"include": ["mentat"],
"ignore": ["testbed", "tests", "scripts", "benchmark_repos", "build"],
"ignore": [
"testbed",
"tests",
"scripts",
"benchmark_repos",
"build",
"benchmarks/benchmark_repos",
],
"typeCheckingMode": "strict",
"reportMissingTypeStubs": false,
}
12 changes: 6 additions & 6 deletions scripts/run_and_upload_benchmarks.sh
@@ -11,12 +11,12 @@ TIMESTAMP=$(date +%Y%m%d%H%M%S)
--max_benchmarks 200 \
--language javascript

SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-javascript/results.json)
SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-javascript/results.json)
BUCKET="benchmarks.mentat.ai"

# Upload results to S3
aws s3 cp benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
aws s3 cp benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json
aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json

# Send slack notification
JAVASCRIPT_RESULTS_URL="http://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html"
@@ -32,11 +32,11 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
--max_benchmarks 200 \
--language python

SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-python/results.json)
SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-python/results.json)

# Upload results to S3
aws s3 cp benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
aws s3 cp benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json
aws s3 cp benchmarks/benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
aws s3 cp benchmarks/benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json

# Send slack notification
PYTHON_RESULTS_URL="http://${BUCKET}/exercism-python-results-${TIMESTAMP}.html"
19 changes: 19 additions & 0 deletions testbed/benchmarks/benchmarks/clojure_exercism_runner.py
@@ -0,0 +1,19 @@
from mentat.config import Config

title = "Clojure Exercism Runner"

description = """
This benchmark tests the ability to write an exercism test runner for the clojure language.
"""

prompts = [
"Write a test runner for the clojure language.",
]


repo = "https://github.com/AbanteAI/mentat"
commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
minimum_context = ["tests/benchmarks/exercise_runners"]
paths = ["tests/benchmarks/exercise_runners"]

config = Config()
@@ -9,31 +9,33 @@ def test_empty_sequence(self):

def test_pow(self):
self.assertEqual(
accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25])
accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25]
)

def test_divmod(self):
self.assertEqual(
accumulate([10, 17, 23], lambda x: divmod(x, 7)),
[(1, 3), (2, 3), (3, 2)])
accumulate([10, 17, 23], lambda x: divmod(x, 7)), [(1, 3), (2, 3), (3, 2)]
)

def test_composition(self):
inp = [10, 17, 23]
self.assertEqual(
accumulate(
accumulate(inp, lambda x: divmod(x, 7)),
lambda x: 7 * x[0] + x[1]), inp)
accumulate(inp, lambda x: divmod(x, 7)), lambda x: 7 * x[0] + x[1]
),
inp,
)

def test_capitalize(self):
self.assertEqual(
accumulate(['hello', 'world'], str.upper), ['HELLO', 'WORLD'])
self.assertEqual(accumulate(["hello", "world"], str.upper), ["HELLO", "WORLD"])

def test_recursive(self):
inp = ['a', 'b', 'c']
out = [['a1', 'a2', 'a3'], ['b1', 'b2', 'b3'], ['c1', 'c2', 'c3']]
inp = ["a", "b", "c"]
out = [["a1", "a2", "a3"], ["b1", "b2", "b3"], ["c1", "c2", "c3"]]
self.assertEqual(
accumulate(
inp, lambda x: accumulate(list('123'), lambda y: x + y)), out)
accumulate(inp, lambda x: accumulate(list("123"), lambda y: x + y)), out
)


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
