[fix eval] Fix issues with miniwob remote runtime evaluation (#5001)

All-Hands-AI · Nov 14, 2024 · 852c90f · 852c90f
1 parent 42b49e6
commit 852c90f
Show file tree

Hide file tree

Showing 5 changed files with 18 additions and 2 deletions.
diff --git a/evaluation/miniwob/README.md b/evaluation/miniwob/README.md
@@ -16,6 +16,20 @@ Access with browser the above MiniWoB URLs and see if they load correctly.
 ./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval
 ```
 
+### Run Inference on `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over Slack if you want to try this out!
+
+```bash
+./evaluation/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers]
+
+# Example - This runs evaluation on BrowsingAgent for 125 instances on MiniWoB, with 2 workers running in parallel
+export ALLHANDS_API_KEY="YOUR-API-KEY"
+export RUNTIME=remote
+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
+./evaluation/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2
+```
+
 Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
 
 To calculate the average reward, run:

diff --git a/evaluation/miniwob/__init__.py b/evaluation/miniwob/__init__.py
diff --git a/evaluation/miniwob/get_avg_reward.py b/evaluation/miniwob/get_avg_reward.py
@@ -23,7 +23,7 @@
             data = json.loads(line)
             actual_num += 1
             total_cost += data['metrics']['accumulated_cost']
-            total_reward += data['test_result']
+            total_reward += data['test_result']['reward']
 
     avg_reward = total_reward / total_num
     print('Avg Reward: ', avg_reward)

diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
@@ -47,6 +47,7 @@
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
+    'BrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
 }
 
 
@@ -66,6 +67,7 @@ def get_config(
             browsergym_eval_env=env_id,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            remote_runtime_init_timeout=1800,
             keep_runtime_alive=False,
             timeout=120,
             remote_runtime_init_timeout=1800,

diff --git a/evaluation/miniwob/scripts/run_infer.sh b/evaluation/miniwob/scripts/run_infer.sh
@@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
 
-COMMAND="poetry run python evaluation/miniwob/run_infer.py \
+COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
   --max-iterations 10 \