feat(W&B): Update W&B settings

parkererickson-tg committed Mar 3, 2024
1 parent 1bdd825 commit a098f10
Showing 3 changed files with 12 additions and 12 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build-test-ci.yaml
@@ -1,13 +1,13 @@
-name: nlqs build and test ci
+name: CoPilot LLM Performance Evaluation
 
 on:
   push:
     branches: [ "build-test-ci" ]
   pull_request:
     branches: [ "main" ]
     types: [ labeled ]
-  #schedule:
-  #  - cron: '45 22 * * *'
+  schedule:
+    - cron: '45 22 * * *'
   workflow_dispatch:
 
 jobs:
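The effect of this hunk: the workflow is renamed, and the previously commented-out schedule trigger is restored, so the evaluation now also runs nightly; the cron expression '45 22 * * *' fires once a day at 22:45 UTC. A minimal sketch (not part of the commit) that checks this reading, assuming the third-party croniter package:

# Sketch only: verify what the re-enabled cron expression means.
from datetime import datetime, timezone
from croniter import croniter

it = croniter("45 22 * * *", datetime(2024, 3, 3, tzinfo=timezone.utc))
for _ in range(3):
    # Prints 2024-03-03 22:45, 2024-03-04 22:45, 2024-03-05 22:45 (UTC),
    # i.e. one evaluation run per night.
    print(it.get_next(datetime))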
16 changes: 8 additions & 8 deletions tests/create_wandb_report.py
@@ -8,15 +8,15 @@
 branch_name = os.getenv("PR_NUMBER", Repository('.').head.shorthand)
 
 report = wr.Report(
-    project="llm-eval-sweep",
+    project="CoPilot",
     title="Test Summary For PR #"+ branch_name + " at "+datetime.now().strftime("%m/%d/%Y, %H:%M"),
     description="Evaluate the performance of the changes made to the service.",
 )
 
 python_filter = "branch == '"+ branch_name +"' and commit_hash == '"+Repository('.').head.peel(Commit).id.hex+"'"
 
 acc_llm_service_bar_plot = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep", name="LLM Service Grouping", groupby=["llm_service"]).set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot", name="LLM Service Grouping", groupby=["llm_service"]).set_filters_with_python_expr(python_filter)],
     panels = [
         wr.BarPlot(
             title="Average Accuracy by LLM Service",
@@ -30,7 +30,7 @@
 )
 
 acc_question_type_bar_plot = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep", name="Question Type Grouping", groupby=["question_type"]).set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot", name="Question Type Grouping", groupby=["question_type"]).set_filters_with_python_expr(python_filter)],
     panels = [
         wr.BarPlot(
             title="Average Accuracy by Question Type",
@@ -45,7 +45,7 @@
 
 
 acc_parallel_cords = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep").set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot").set_filters_with_python_expr(python_filter)],
     panels = [
         wr.ParallelCoordinatesPlot(
             columns=[
@@ -60,7 +60,7 @@
 )
 
 nrp_llm_service_bar_plot = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep", name="LLM Service Grouping", groupby=["llm_service"]).set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot", name="LLM Service Grouping", groupby=["llm_service"]).set_filters_with_python_expr(python_filter)],
     panels = [
         wr.BarPlot(
             title="Average Not Wrong Percent by LLM Service",
@@ -74,7 +74,7 @@
 )
 
 nrp_question_type_bar_plot = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep", name="Question Type Grouping", groupby=["question_type"]).set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot", name="Question Type Grouping", groupby=["question_type"]).set_filters_with_python_expr(python_filter)],
     panels = [
         wr.BarPlot(
             title="Average Not Wrong Percent by Question Type",
@@ -89,7 +89,7 @@
 
 
 nrp_parallel_cords = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep").set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot").set_filters_with_python_expr(python_filter)],
     panels = [
         wr.ParallelCoordinatesPlot(
             columns=[
@@ -104,7 +104,7 @@
 )
 
 table = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep").set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot").set_filters_with_python_expr(python_filter)],
     panels = [
         wr.WeavePanelSummaryTable(table_name="qa_results",
             layout={'w': 24, 'h': 16} # change the layout!
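The diff shows only the Runset declarations, so the tail of the script is not visible here; note that python_filter (built above via pygit2) pins every runset to the current branch and commit, which is why each run grouping reflects just this PR. A plausible ending, sketched under the assumption that the script uses the wandb Reports API (wandb.apis.reports) seen above and finishes by attaching the panel grids and saving:

# Sketch only, not shown in this diff. Assumes the panel grids defined
# above; the block order is a guess.
report.blocks = [
    acc_llm_service_bar_plot,
    acc_question_type_bar_plot,
    acc_parallel_cords,
    nrp_llm_service_bar_plot,
    nrp_question_type_bar_plot,
    nrp_parallel_cords,
    table,
]
report.save()       # publishes the report to the CoPilot project
print(report.url)   # link that CI could surface in its logs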
2 changes: 1 addition & 1 deletion tests/test_service.py
@@ -206,7 +206,7 @@ def tearDownClass(cls):
             }
             final_df = filtered_df[filtered_df["Dataset"] == dataset]
             if final_df.shape[0] > 0:
-                cls.wandbLogger = wandb.init(project="llm-eval-sweep", config=cls.config)
+                cls.wandbLogger = wandb.init(project="CoPilot", config=cls.config)
                 acc = (final_df["Answer Correct"].sum())/final_df["Answer Correct"].shape[0]
                 not_wrong_perc = (final_df["Answer Correct"].sum() + (final_df["Answered Question"] == False).sum())/final_df["Answer Correct"].shape[0]
                 avg_resp_time = final_df["Response Time (seconds)"].mean()
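This hunk only switches the wandb.init target project; the metrics computed just below it (accuracy, a "not wrong" percentage that also credits explicitly declined answers, and mean response time) are cut off by the diff. A sketch of how they would typically reach the renamed CoPilot project, assuming the run handle shown above; the metric key names are illustrative, not taken from the file:

# Sketch only: the logging call is outside this hunk, and the key names
# are illustrative assumptions.
cls.wandbLogger.log({
    "accuracy": acc,                      # fraction of exactly correct answers
    "not_wrong_percent": not_wrong_perc,  # correct plus explicitly unanswered
    "avg_response_time": avg_resp_time,   # seconds per question
})
cls.wandbLogger.finish()  # flush the run so create_wandb_report.py can query it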
