feat(W&B): Update W&B settings

parkererickson-tg committed Mar 3, 2024
1 parent 1bdd825 commit a098f10
Showing 3 changed files with 12 additions and 12 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build-test-ci.yaml
@@ -1,13 +1,13 @@
-name: nlqs build and test ci
+name: CoPilot LLM Performance Evaluation
 
 on:
   push:
     branches: [ "build-test-ci" ]
   pull_request:
     branches: [ "main" ]
     types: [ labeled ]
-  #schedule:
-  #  - cron: '45 22 * * *'
+  schedule:
+    - cron: '45 22 * * *'
   workflow_dispatch:
 
 jobs:
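The effect of this hunk: the workflow is renamed, and the previously commented-out schedule trigger is restored, so the evaluation now also runs nightly; the cron expression '45 22 * * *' fires once a day at 22:45 UTC. A minimal sketch (not part of the commit) that checks this reading, assuming the third-party croniter package:

# Sketch only: verify what the re-enabled cron expression means.
from datetime import datetime, timezone
from croniter import croniter

it = croniter("45 22 * * *", datetime(2024, 3, 3, tzinfo=timezone.utc))
for _ in range(3):
    # Prints 2024-03-03 22:45, 2024-03-04 22:45, 2024-03-05 22:45 (UTC),
    # i.e. one evaluation run per night.
    print(it.get_next(datetime))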
16 changes: 8 additions & 8 deletions tests/create_wandb_report.py
@@ -8,15 +8,15 @@
 branch_name = os.getenv("PR_NUMBER", Repository('.').head.shorthand)
 
 report = wr.Report(
-    project="llm-eval-sweep",
+    project="CoPilot",
     title="Test Summary For PR #"+ branch_name + " at "+datetime.now().strftime("%m/%d/%Y, %H:%M"),
     description="Evaluate the performance of the changes made to the service.",
 )
 
 python_filter = "branch == '"+ branch_name +"' and commit_hash == '"+Repository('.').head.peel(Commit).id.hex+"'"
 
 acc_llm_service_bar_plot = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep", name="LLM Service Grouping", groupby=["llm_service"]).set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot", name="LLM Service Grouping", groupby=["llm_service"]).set_filters_with_python_expr(python_filter)],
     panels = [
         wr.BarPlot(
             title="Average Accuracy by LLM Service",
@@ -30,7 +30,7 @@
 )
 
 acc_question_type_bar_plot = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep", name="Question Type Grouping", groupby=["question_type"]).set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot", name="Question Type Grouping", groupby=["question_type"]).set_filters_with_python_expr(python_filter)],
     panels = [
         wr.BarPlot(
             title="Average Accuracy by Question Type",
@@ -45,7 +45,7 @@
 
 
 acc_parallel_cords = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep").set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot").set_filters_with_python_expr(python_filter)],
     panels = [
         wr.ParallelCoordinatesPlot(
             columns=[
@@ -60,7 +60,7 @@
 )
 
 nrp_llm_service_bar_plot = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep", name="LLM Service Grouping", groupby=["llm_service"]).set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot", name="LLM Service Grouping", groupby=["llm_service"]).set_filters_with_python_expr(python_filter)],
     panels = [
         wr.BarPlot(
             title="Average Not Wrong Percent by LLM Service",
@@ -74,7 +74,7 @@
 )
 
 nrp_question_type_bar_plot = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep", name="Question Type Grouping", groupby=["question_type"]).set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot", name="Question Type Grouping", groupby=["question_type"]).set_filters_with_python_expr(python_filter)],
     panels = [
         wr.BarPlot(
             title="Average Not Wrong Percent by Question Type",
@@ -89,7 +89,7 @@
 
 
 nrp_parallel_cords = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep").set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot").set_filters_with_python_expr(python_filter)],
     panels = [
         wr.ParallelCoordinatesPlot(
             columns=[
@@ -104,7 +104,7 @@
 )
 
 table = wr.PanelGrid(
-    runsets=[wr.Runset(project="llm-eval-sweep").set_filters_with_python_expr(python_filter)],
+    runsets=[wr.Runset(project="CoPilot").set_filters_with_python_expr(python_filter)],
     panels = [
         wr.WeavePanelSummaryTable(table_name="qa_results",
             layout={'w': 24, 'h': 16} # change the layout!
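The diff shows only the Runset declarations, so the tail of the script is not visible here; note that python_filter (built above via pygit2) pins every runset to the current branch and commit, which is why each run grouping reflects just this PR. A plausible ending, sketched under the assumption that the script uses the wandb Reports API (wandb.apis.reports) seen above and finishes by attaching the panel grids and saving:

# Sketch only, not shown in this diff. Assumes the panel grids defined
# above; the block order is a guess.
report.blocks = [
    acc_llm_service_bar_plot,
    acc_question_type_bar_plot,
    acc_parallel_cords,
    nrp_llm_service_bar_plot,
    nrp_question_type_bar_plot,
    nrp_parallel_cords,
    table,
]
report.save()       # publishes the report to the CoPilot project
print(report.url)   # link that CI could surface in its logs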
2 changes: 1 addition & 1 deletion tests/test_service.py
@@ -206,7 +206,7 @@ def tearDownClass(cls):
             }
             final_df = filtered_df[filtered_df["Dataset"] == dataset]
             if final_df.shape[0] > 0:
-                cls.wandbLogger = wandb.init(project="llm-eval-sweep", config=cls.config)
+                cls.wandbLogger = wandb.init(project="CoPilot", config=cls.config)
                 acc = (final_df["Answer Correct"].sum())/final_df["Answer Correct"].shape[0]
                 not_wrong_perc = (final_df["Answer Correct"].sum() + (final_df["Answered Question"] == False).sum())/final_df["Answer Correct"].shape[0]
                 avg_resp_time = final_df["Response Time (seconds)"].mean()
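This hunk only switches the wandb.init target project; the metrics computed just below it (accuracy, a "not wrong" percentage that also credits explicitly declined answers, and mean response time) are cut off by the diff. A sketch of how they would typically reach the renamed CoPilot project, assuming the run handle shown above; the metric key names are illustrative, not taken from the file:

# Sketch only: the logging call is outside this hunk, and the key names
# are illustrative assumptions.
cls.wandbLogger.log({
    "accuracy": acc,                      # fraction of exactly correct answers
    "not_wrong_percent": not_wrong_perc,  # correct plus explicitly unanswered
    "avg_response_time": avg_resp_time,   # seconds per question
})
cls.wandbLogger.finish()  # flush the run so create_wandb_report.py can query it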
