Benchmark runner test (#488)
jakethekoenig authored Jan 18, 2024
1 parent 338046e commit b438094
Showing 14 changed files with 208 additions and 70 deletions.
3 changes: 1 addition & 2 deletions benchmarks/arg_parser.py
@@ -17,9 +17,8 @@ def common_benchmark_parser():
)
parser.add_argument(
"--benchmarks",
action="append",
nargs="*",
default=[[]],
default=[],
help=(
"Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
" depends on benchmark."
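
The change drops action="append": previously the flag combined append with nargs="*", so every use of --benchmarks was nested inside an outer list (hence the [[]] default and the args.benchmarks[0] indexing that other call sites in this commit remove). With plain nargs="*" and default=[], callers get a flat list. A minimal sketch of the difference, using a throwaway parser rather than the real common_benchmark_parser:

    import argparse

    # Old behaviour: append + nargs="*" nests values inside an outer list,
    # and argparse appends after the non-empty default.
    old = argparse.ArgumentParser()
    old.add_argument("--benchmarks", action="append", nargs="*", default=[[]])
    print(old.parse_args([]).benchmarks)                          # [[]]
    print(old.parse_args(["--benchmarks", "a", "b"]).benchmarks)  # [[], ['a', 'b']]

    # New behaviour: one flat list, empty when the flag is not given.
    new = argparse.ArgumentParser()
    new.add_argument("--benchmarks", nargs="*", default=[])
    print(new.parse_args([]).benchmarks)                          # []
    print(new.parse_args(["--benchmarks", "a", "b"]).benchmarks)  # ['a', 'b']
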
101 changes: 56 additions & 45 deletions benchmarks/benchmark_runner.py
@@ -177,6 +177,7 @@ async def evaluate_sample(sample_file, retries=1):
"""Run a sample using Mentat and return the resulting diff"""
sample = Sample.load(sample_file)
results = []
start_dir = Path.cwd()
for i in range(retries):
formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(" ", "_")
result = BenchmarkResult(
@@ -189,20 +190,23 @@
diff_merge_base=sample.diff_merge_base,
diff_active=sample.diff_active,
)
cwd = Path(repo.working_dir)

# Run sample in PythonClient
paths = list[Path]()
for a in sample.context:
paths.append(Path(a))
client = PythonClient(cwd=cwd, paths=paths)
response = await run_client(
client, sample.message_prompt, result, sample.message_history
)
await grade_and_clean_diff(
repo, response, result, comparison_diff=sample.diff_edit
)
results.append(result)
try:
cwd = Path(repo.working_dir)

# Run sample in PythonClient
paths = list[Path]()
for a in sample.context:
paths.append(Path(a))
client = PythonClient(cwd=cwd, paths=paths)
response = await run_client(
client, sample.message_prompt, result, sample.message_history
)
await grade_and_clean_diff(
repo, response, result, comparison_diff=sample.diff_edit
)
results.append(result)
finally:
os.chdir(start_dir)
return results
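
The try/finally added here follows a standard cleanup pattern: a sample run may chdir into a freshly cloned repo, so the runner records its starting directory once and always returns to it, even when the client errors out, keeping later iterations and relative paths valid. A minimal, self-contained sketch of the same idea (run_in_repo is a stand-in, not Mentat's API):

    import os
    from pathlib import Path

    def run_in_repo(repo_dir: Path) -> None:
        # Stand-in for the real benchmark body, which may change directory.
        os.chdir(repo_dir)
        print("working in", Path.cwd())

    def run_all(repo_dirs: list[Path]) -> None:
        start_dir = Path.cwd()  # remember where the runner started
        for repo_dir in repo_dirs:
            try:
                run_in_repo(repo_dir)
            finally:
                # Restore the original directory even if the run raised.
                os.chdir(start_dir)
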


@@ -212,36 +216,43 @@ async def evalute_py(path, retries):
title = benchmark.title

print("Benchmark:", title)
repo = setup_repo(
url=benchmark.repo,
commit=benchmark.commit,
)
cwd = Path(repo.working_dir)

if hasattr(benchmark, "comparison_commit"):
comparison_commit = benchmark.comparison_commit
repo.git.checkout(comparison_commit)
comparison_diff = repo.git.diff(benchmark.commit)
else:
comparison_diff = None

for i, prompt in enumerate(benchmark.prompts):
print(" Prompt:", prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
)
client = PythonClient(cwd=cwd, config=benchmark.config)
response = await run_client(client, prompt, result)
start_dir = Path.cwd()
try:
repo = setup_repo(
url=benchmark.repo,
commit=benchmark.commit,
)
cwd = Path(repo.working_dir)

await client.shutdown()
if hasattr(benchmark, "verify"):
result.verify = benchmark.verify()
if hasattr(benchmark, "comparison_commit"):
comparison_commit = benchmark.comparison_commit
repo.git.checkout(comparison_commit)
comparison_diff = repo.git.diff(benchmark.commit)
else:
comparison_diff = None

for i, prompt in enumerate(benchmark.prompts):
print(" Prompt:", prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
)
client = PythonClient(
cwd=cwd, paths=benchmark.paths, config=benchmark.config
)
response = await run_client(client, prompt, result)

await grade_and_clean_diff(repo, response, result, comparison_diff)
results.append(result)
await client.shutdown()
if hasattr(benchmark, "verify"):
result.verify = benchmark.verify()

await grade_and_clean_diff(repo, response, result, comparison_diff)
os.chdir("../..")
results.append(result)
finally:
os.chdir(start_dir)
return results


@@ -252,9 +263,9 @@ def benchmark_listed(title, benchmarks):
return False


async def run_benchmarks(retries, benchmarks):
async def run_benchmarks(benchmarks, retries=1):
print("Running benchmarks")
benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"
benchmarks_dir = Path("benchmarks/benchmarks")

benchmark_paths = []
for root, dirs, files in os.walk(benchmarks_dir):
@@ -296,7 +307,7 @@ async def run_benchmarks(retries, benchmarks):
args = parser.parse_args()
asyncio.run(
run_benchmarks(
args.benchmarks,
args.retries,
args.benchmarks[0],
)
)
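
With the reordered signature, run_benchmarks takes the benchmark names first and retries defaults to 1, and it now resolves benchmark definitions from the relative benchmarks/benchmarks directory. A small sketch of that discovery step, assuming the intent is simply to collect every .py file under that directory (collect_benchmark_paths is an illustrative name, not the runner's actual helper):

    import os
    from pathlib import Path

    def collect_benchmark_paths(benchmarks_dir: Path = Path("benchmarks/benchmarks")) -> list[Path]:
        # Walk the tree and keep every Python file; the real runner may filter
        # further, e.g. against the --benchmarks allow-list.
        paths: list[Path] = []
        for root, _dirs, files in os.walk(benchmarks_dir):
            for name in files:
                if name.endswith(".py"):
                    paths.append(Path(root) / name)
        return paths
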
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/clojure_exercism_runner.py
@@ -14,6 +14,7 @@
repo = "https://github.com/AbanteAI/mentat"
commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
minimum_context = ["tests/benchmarks/exercise_runners"]
paths = []

config = Config(
auto_context_tokens=8000,
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/license_update.py
@@ -22,6 +22,7 @@
repo = "https://github.com/AbanteAI/mentat"
commit = "b0848711c36e0c2fe9619ebb2b77dc6d27396ff2"
minimum_context = ["tests/license_check.py:11-22"]
paths = []

config = Config(
auto_context_tokens=8000,
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/pre_tags.py
@@ -22,6 +22,7 @@

repo = "https://github.com/AbanteAI/mentat"
commit = "b8d90b89e4a0d7ad266bf914c4ce99c473dd8dc0"
paths = []

config = Config(
auto_context_tokens=8000,
2 changes: 1 addition & 1 deletion benchmarks/exercism_practice.py
@@ -225,7 +225,7 @@ def run_exercism_benchmark(
args = parser.parse_args()
clone_exercism_repo(args.refresh_repo, args.language)
run_exercism_benchmark(
args.benchmarks[0],
args.benchmarks,
args.max_benchmarks,
args.max_iterations,
args.max_workers,
2 changes: 1 addition & 1 deletion mentat/sampler/utils.py
@@ -9,7 +9,7 @@
from mentat.git_handler import get_non_gitignored_files
from mentat.utils import is_file_text_encoded

CLONE_TO_DIR = Path("benchmark_repos")
CLONE_TO_DIR = Path("benchmarks/benchmark_repos")


def clone_repo(
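
Because CLONE_TO_DIR is a relative path, cloned benchmark repos now land under benchmarks/benchmark_repos beneath whatever directory the process runs from, which is also why the runner changes above take care to restore the starting directory. A small sketch of how a clone destination could be derived from this constant (clone_destination is a hypothetical helper, not the module's actual function):

    from pathlib import Path

    CLONE_TO_DIR = Path("benchmarks/benchmark_repos")

    def clone_destination(url: str) -> Path:
        # Hypothetical: name the checkout after the last path segment of the URL.
        name = url.rstrip("/").rsplit("/", 1)[-1].removesuffix(".git")
        return CLONE_TO_DIR / name

    print(clone_destination("https://github.com/AbanteAI/mentat"))
    # benchmarks/benchmark_repos/mentat
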
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,14 +1,14 @@
[tool.isort]
profile = "black"
known_first_party = "mentat"
skip = ["vscode/bundled", "benchmark_repos", "testbed/exercism-python"]
skip = ["vscode/bundled", "benchmarks/benchmark_repos", "testbed/exercism-python"]

[tool.ruff]
line-length = 120
ignore = ["E731"]

[tool.pytest.ini_options]
addopts = "--ignore=vscode/bundled --ignore=benchmark_repos --ignore=testbed/exercism-python"
addopts = "--ignore=vscode/bundled --ignore=benchmarks/benchmark_repos --ignore=testbed/exercism-python"

[tool.black]
preview = "true"
9 changes: 8 additions & 1 deletion pyrightconfig.json
@@ -1,6 +1,13 @@
{
"include": ["mentat"],
"ignore": ["testbed", "tests", "scripts", "benchmark_repos", "build"],
"ignore": [
"testbed",
"tests",
"scripts",
"benchmark_repos",
"build",
"benchmarks/benchmark_repos",
],
"typeCheckingMode": "strict",
"reportMissingTypeStubs": false,
}
12 changes: 6 additions & 6 deletions scripts/run_and_upload_benchmarks.sh
@@ -11,12 +11,12 @@ TIMESTAMP=$(date +%Y%m%d%H%M%S)
--max_benchmarks 200 \
--language javascript

SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-javascript/results.json)
SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-javascript/results.json)
BUCKET="benchmarks.mentat.ai"

# Upload results to S3
aws s3 cp benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
aws s3 cp benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json
aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json

# Send slack notification
JAVASCRIPT_RESULTS_URL="http://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html"
@@ -32,11 +32,11 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
--max_benchmarks 200 \
--language python

SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-python/results.json)
SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-python/results.json)

# Upload results to S3
aws s3 cp benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
aws s3 cp benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json
aws s3 cp benchmarks/benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
aws s3 cp benchmarks/benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json

# Send slack notification
PYTHON_RESULTS_URL="http://${BUCKET}/exercism-python-results-${TIMESTAMP}.html"
19 changes: 19 additions & 0 deletions testbed/benchmarks/benchmarks/clojure_exercism_runner.py
@@ -0,0 +1,19 @@
from mentat.config import Config

title = "Clojure Exercism Runner"

description = """
This benchmark tests the ability to write an exercism test runner for the clojure language.
"""

prompts = [
"Write a test runner for the clojure language.",
]


repo = "https://github.com/AbanteAI/mentat"
commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
minimum_context = ["tests/benchmarks/exercise_runners"]
paths = ["tests/benchmarks/exercise_runners"]

config = Config()
@@ -9,31 +9,33 @@ def test_empty_sequence(self):

def test_pow(self):
self.assertEqual(
accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25])
accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25]
)

def test_divmod(self):
self.assertEqual(
accumulate([10, 17, 23], lambda x: divmod(x, 7)),
[(1, 3), (2, 3), (3, 2)])
accumulate([10, 17, 23], lambda x: divmod(x, 7)), [(1, 3), (2, 3), (3, 2)]
)

def test_composition(self):
inp = [10, 17, 23]
self.assertEqual(
accumulate(
accumulate(inp, lambda x: divmod(x, 7)),
lambda x: 7 * x[0] + x[1]), inp)
accumulate(inp, lambda x: divmod(x, 7)), lambda x: 7 * x[0] + x[1]
),
inp,
)

def test_capitalize(self):
self.assertEqual(
accumulate(['hello', 'world'], str.upper), ['HELLO', 'WORLD'])
self.assertEqual(accumulate(["hello", "world"], str.upper), ["HELLO", "WORLD"])

def test_recursive(self):
inp = ['a', 'b', 'c']
out = [['a1', 'a2', 'a3'], ['b1', 'b2', 'b3'], ['c1', 'c2', 'c3']]
inp = ["a", "b", "c"]
out = [["a1", "a2", "a3"], ["b1", "b2", "b3"], ["c1", "c2", "c3"]]
self.assertEqual(
accumulate(
inp, lambda x: accumulate(list('123'), lambda y: x + y)), out)
accumulate(inp, lambda x: accumulate(list("123"), lambda y: x + y)), out
)


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
