Merge pull request #1578 from codestoryai/features/add-evaluation-criteria-and-reward-scaling-to-edits-and-terminal

[sidecar] add evaluation criteria and reward scaling to edits and terminal
codestoryai · Nov 28, 2024 · 8dce32f · 8dce32f
2 parents 4acda60 + dda1726
commit 8dce32f
Show file tree

Hide file tree

Showing 2 changed files with 111 additions and 6 deletions.
diff --git a/sidecar/src/agentic/tool/code_edit/types.rs b/sidecar/src/agentic/tool/code_edit/types.rs
@@ -501,12 +501,71 @@ Edit instruction here
         )
     }
 
-    fn get_evaluation_criteria(&self, _trajectory_length: usize) -> Vec<String> {
-        vec![]
+    /// Lists the criteria for judging a code edit: stage-dependent ones, then edit-specific ones.
+    fn get_evaluation_criteria(&self, trajectory_length: usize) -> Vec<String> {
+        // Trajectories shorter than 3 get the exploratory criteria; longer ones the progress ones.
+        let stage_criteria: &[&str] = if trajectory_length < 3 {
+            &[
+                "Exploratory Actions: Recognize that initial searches and information-gathering steps are essential and should not be heavily penalized if they don't yield immediate results.",
+                "Appropriateness of Action: Evaluate if the action is logical given the agent's current knowledge and the early stage of problem-solving.",
+            ]
+        } else {
+            &[
+                "Solution Quality: Assess the logical changes, contextual fit, and overall improvement without introducing new issues.",
+                "Progress Assessment: Evaluate the agent's awareness of solution history, detection of repetitive actions, and planned next steps.",
+                "Repetitive or Redundant Actions: Detect if the agent is repeating the same unsuccessful or redundant actions without making progress. Pay close attention to the agent's history and outputs indicating lack of progress.",
+            ]
+        };
+        let edit_criteria = [
+            "Instruction Clarity: Ensure that instructions and pseudocode are clear and actionable.",
+            "Instruction Compliance: The git diff must *exactly* implement the provided pseudo_code. Identify any discrepancies, omissions, or additions. If discrepancies exist, you should lower the reward accordingly.",
+            "Code Modification Accuracy and Quality: Check for correct identification of code spans, accuracy of changes, syntax errors, logical flaws, unintended modifications, and unintended side effects.",
+            "Python-Specific Features Utilization: Assess whether the agent has appropriately utilized Python-specific features that enhance the solution.",
+            "Common Git Diff Issues and Unintended Changes: Check for issues such as incorrect line numbers, unintended additions or deletions, formatting errors, changes to unrelated parts of the code, and heavily penalize unintended changes.",
+            "Addressing Test Failures: Verify if the agent is properly addressing test failures from previous `RunTests` actions.",
+        ];
+        let all_criteria = stage_criteria.iter().chain(edit_criteria.iter());
+        all_criteria.map(|criterion| (*criterion).to_owned()).collect()
     }
 
     fn get_reward_scale(&self, _trajectory_length: usize) -> Vec<ToolRewardScale> {
-        vec![]
+        vec![ // Bands run from the best score range down to the worst; the trajectory length is unused.
+            ToolRewardScale::new( // NOTE(review): presumably (min, max, description) - confirm against ToolRewardScale::new
+                90,
+                100,
+                "The code change is optimal, with a perfect Git diff exactly matching the pseudo code, and requires no further changes.",
+            ),
+            ToolRewardScale::new(
+                75,
+                89,
+                "The code change significantly advances the solution, with an accurate Git diff exactly matching the pseudo code.",
+            ),
+            ToolRewardScale::new(
+                50,
+                74,
+                "The code change is mostly correct but has minor issues or opportunities for optimization; the Git diff exactly matches the pseudo code.",
+            ),
+            ToolRewardScale::new(
+                25,
+                49,
+                "The code change is acceptable but has noticeable issues or is less effective than possible alternatives.",
+            ),
+            ToolRewardScale::new(
+                0,
+                24,
+                "The code change has minimal impact or introduces minor negative consequences.",
+            ),
+            ToolRewardScale::new(
+                -49,
+                -1,
+                "The code change is inappropriate, unhelpful, or introduces new issues; the action did not result in any successful code changes. The Git diff does not match the pseudo code and instructions, contains significant inaccuracies or shows no changes. Penalize attempts to modify non-existent code elements (hallucinations) based on severity.",
+            ),
+            ToolRewardScale::new(
+                -100,
+                -50,
+                "The code change is counterproductive, causing significant setbacks or demonstrating persistent repetition without learning. The Git diff is severely flawed or indicates that no effective changes were made. Heavily penalize severe hallucinations or continuous attempts to modify non-existent code elements.",
+            ),
+        ]
     }
 }
 

diff --git a/sidecar/src/agentic/tool/terminal/terminal.rs b/sidecar/src/agentic/tool/terminal/terminal.rs
@@ -123,11 +123,57 @@ Your command here
         )
     }
 
-    fn get_evaluation_criteria(&self, _trajectory_length: usize) -> Vec<String> {
-        vec![]
+    /// Lists the criteria for judging a terminal action, chosen by trajectory length:
+    /// exploratory criteria below 3, solution-quality and progress criteria otherwise.
+    fn get_evaluation_criteria(&self, trajectory_length: usize) -> Vec<String> {
+        let criteria: &[&str] = if trajectory_length < 3 {
+            &[
+                "Exploratory Actions: Recognize that initial searches and information-gathering steps are essential and should not be heavily penalized if they don't yield immediate results.",
+                "Appropriateness of Action: Evaluate if the action is logical given the agent's current knowledge and the early stage of problem-solving.",
+            ]
+        } else {
+            &[
+                "Solution Quality: Assess the logical changes, contextual fit, and overall improvement without introducing new issues.",
+                "Progress Assessment: Evaluate the agent's awareness of solution history, detection of repetitive actions, and planned next steps.",
+                "Repetitive or Redundant Actions: Detect if the agent is repeating the same unsuccessful or redundant actions without making progress. Pay close attention to the agent's history and outputs indicating lack of progress.",
+            ]
+        };
+        // The return type is Vec<String>, so each static literal is copied into an owned String.
+        criteria.iter().map(|criterion| (*criterion).to_owned()).collect()
     }
 
     fn get_reward_scale(&self, _trajectory_length: usize) -> Vec<ToolRewardScale> {
-        vec![]
+        vec![ // Bands run from the best score range down to the worst; the trajectory length is unused.
+            ToolRewardScale::new( // NOTE(review): presumably (min, max, description) - confirm against ToolRewardScale::new
+                75,
+                100,
+                "The action significantly advances the solution.",
+            ),
+            ToolRewardScale::new(
+                50,
+                74,
+                "The action contributes positively towards solving the problem.",
+            ),
+            ToolRewardScale::new(
+                25,
+                49,
+                "The action is acceptable but may have some issues.",
+            ),
+            ToolRewardScale::new(
+                0,
+                24,
+                "The action has minimal impact or minor negative consequences.",
+            ),
+            ToolRewardScale::new(
+                -49,
+                -1,
+                "The action is inappropriate, unhelpful, introduces new issues, or redundantly repeats previous actions without making further progress. The command does not align with instructions or is unnecessary.",
+            ),
+            ToolRewardScale::new(
+                -100,
+                -50,
+                "The action is counterproductive, causing significant setbacks or demonstrating persistent repetition without learning. The agent fails to recognize completed tasks and continues to attempt redundant actions.",
+            ),
+        ]
     }
 }