Skip to content

Commit

Permalink
add e2b support
Browse files Browse the repository at this point in the history
  • Loading branch information
terryyz committed Jan 22, 2025
1 parent 342aed8 commit 468eece
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 7 deletions.
31 changes: 27 additions & 4 deletions bigcodebench/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from typing import Any, Dict, List, Tuple, Optional
from warnings import warn
from gradio_client import Client, handle_file
from e2b import Sandbox

import httpx
import numpy as np
Expand Down Expand Up @@ -118,9 +119,10 @@ def evaluate(
subset: str,
samples: Optional[str] = None,
no_execute: bool = False,
local_execute: bool = False,
execution: str = "e2b", # "e2b", "gradio", "local"
selective_evaluate: str = "",
remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
e2b_endpoint: str = "bigcodebench-evaluator",
gradio_endpoint: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
pass_k: str = "1,5,10",
save_pass_rate: bool = True,
calibrated: bool = True,
Expand Down Expand Up @@ -152,10 +154,10 @@ def evaluate(
assert samples.endswith(".jsonl")
result_path = samples.replace(".jsonl", "_eval_results.json")

if not local_execute:
if execution == "gradio":
while True:
try:
client = Client(remote_execute_api)
client = Client(gradio_endpoint)
results, pass_at_k = client.predict(
split=split,
subset=subset,
Expand All @@ -178,7 +180,28 @@ def evaluate(
time.sleep(4)
gt_pass_rate = pass_at_k["gt_pass_rate"]
failed_tasks = pass_at_k["failed_tasks"]

elif execution == "e2b":
sandbox = Sandbox(e2b_endpoint, timeout=60*10)

# upload file to sandbox
with open(samples, "r") as file:
sandbox.files.write(samples, file)

# run the evaluation
sandbox.commands.run("python3 -m bigcodebench.evaluate \
--split {} --subset {} --samples {} \
--pass_k {} --save_pass_rate {} --calibrated {} \
--parallel {} --min_time_limit {} --max_as_limit {} \
--max_data_limit {} --max_stack_limit {} --check_gt_only {} --no_gt {} \
".format(split, subset, samples, pass_k, save_pass_rate, calibrated, parallel,
min_time_limit, max_as_limit, max_data_limit, max_stack_limit, check_gt_only, no_gt))

# download the results
content = sandbox.files.read(result_path)
with open(result_path, "w") as file:
file.write(content)

else:

pass_at_k = dict()
Expand Down
2 changes: 1 addition & 1 deletion bigcodebench/gen/util/openai_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def make_request(
kwargs["top_p"] = 0.95
kwargs["max_completion_tokens"] = max_tokens
kwargs["temperature"] = temperature
if model.startswith("o1-") or model.startswith("o3-"): # pop top-p and max_completion_tokens
if model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): # pop top-p and max_completion_tokens
kwargs.pop("top_p")
kwargs.pop("max_completion_tokens")
kwargs.pop("temperature")
Expand Down
2 changes: 1 addition & 1 deletion bigcodebench/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def run_codegen(
temperature: float = 0.0,
max_new_tokens: int = 1280,
greedy: bool = False,
reasoning_effort: str = "medium", # o1 and o3 only
reasoning_effort: str = "medium",
strip_newlines: bool = False,
direct_completion: bool = False,
resume: bool = True,
Expand Down
2 changes: 1 addition & 1 deletion bigcodebench/provider/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def codegen(
tokenizer=None,
) for prompt in prompts]
# use concurrency based batching for o1 and deepseek models
if self.name.startswith("o1-") or self.name == "deepseek-chat":
if self.name.startswith("o1-") or self.name.startswith("o3-") or self.name.startswith("deepseek"):
return self._codegen_batch_via_concurrency(messages, num_samples)

return self._codegen_api_batch(messages, num_samples)
Expand Down
66 changes: 66 additions & 0 deletions sandbox-templates/e2b.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Better use newer Python as generated code can use new features
FROM python:3.10-slim

# install git, g++ and python3-tk
RUN apt-get update && apt-get install -y \
    git \
    g++ \
    python3-tk \
    zip \
    unzip \
    procps \
    r-base \
    libgdal-dev \
    # Add these new dependencies for matplotlib
    libfreetype6-dev \
    libpng-dev \
    pkg-config \
    python3-dev \
    python3-matplotlib \
    && rm -rf /var/lib/apt/lists/*

# upgrade to latest pip
RUN pip install --upgrade pip

# Add a new user "bigcodebenchuser"
RUN adduser --disabled-password --gecos "" bigcodebenchuser

RUN rm -rf /bigcodebench

# Acquire benchmark code to local.
# The ADD of the GitHub commits API response acts as a cache-buster: its content
# changes on every new upstream commit, forcing the clone below to re-run.
ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit
RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench

RUN pip install numpy==1.24.3 pyarrow==14.0.1

# NOTE: each requirement specifier must be quoted. Unquoted, the shell parses
# ">=1.4.4" as an output redirection (`pip install appdirs > =1.4.4`), which
# silently drops every version constraint and litters the layer with files
# named "=1.4.4" etc. Quoting passes the full specifier through to pip.
RUN cd /bigcodebench && \
    pip install . --no-deps && \
    pip install \
        "appdirs>=1.4.4" \
        "fire>=0.6.0" \
        "multipledispatch>=0.6.0" \
        "pqdm>=0.2.0" \
        "tempdir>=0.7.1" \
        "termcolor>=2.0.0" \
        "tqdm>=4.56.0" \
        "tree_sitter_languages>=1.10.2" \
        "tree-sitter==0.21.3" \
        "wget>=3.2" \
        gradio-client \
        rich

RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt

# Ensure the numpy version is compatible with the datasets version
RUN pip install datasets==2.17.0

# Pre-install the dataset
RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench(subset='full'); get_bigcodebench(subset='hard')"

WORKDIR /app

RUN chown -R bigcodebenchuser:bigcodebenchuser /app

RUN chmod -R 777 /app

USER bigcodebenchuser
16 changes: 16 additions & 0 deletions sandbox-templates/e2b.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# This is a config for E2B sandbox template.
# You can use template ID (tbjhnhg5e3bd22i8jqgk) or template name (bigcodebench-evaluator) to create a sandbox:

# Python SDK
# from e2b import Sandbox, AsyncSandbox
# sandbox = Sandbox("bigcodebench-evaluator") # Sync sandbox
# sandbox = await AsyncSandbox.create("bigcodebench-evaluator") # Async sandbox

# JS SDK
# import { Sandbox } from 'e2b'
# const sandbox = await Sandbox.create('bigcodebench-evaluator')

team_id = "f317d0d2-ba02-44c5-8b77-e4a2d7830c7c"
dockerfile = "e2b.Dockerfile"
template_name = "bigcodebench-evaluator"
template_id = "tbjhnhg5e3bd22i8jqgk"
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ install_requires =
google-generativeai>=0.5.4
mistralai>=0.2.0,<1.0.0
openai>=1.11.1
e2b

[options.entry_points]
console_scripts =
Expand Down

0 comments on commit 468eece

Please sign in to comment.