zeno-ml · cabreraalex · Dec 5, 2023
@@ -0,0 +1,7 @@
+# Retrieval Augmented Generation (RAG) Evaluation with RAGAS
+
+This example shows how to use the [ragas](https://docs.ragas.io/) library to evaluate retrieval augmented generation (RAG) models
+and visualize the results using Zeno.
+
+The [rag-eval.ipynb](rag-eval.ipynb) notebook walks through an example on the
+[Financial Opinion Mining and Question Answering (FiQA)](https://sites.google.com/view/fiqa/) dataset.
@@ -0,0 +1,205 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import dotenv\n",
+    "from datasets import load_dataset\n",
+    "import pandas as pd\n",
+    "from zeno_client import ZenoClient, ZenoMetric\n",
+    "from ragas.metrics import (\n",
+    "    answer_relevancy,\n",
+    "    faithfulness,\n",
+    "    context_recall,\n",
+    "    context_precision,\n",
+    ")\n",
+    "from ragas import evaluate\n",
+    "\n",
+    "dotenv.load_dotenv(override=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
+    "fiqa_eval"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = evaluate(\n",
+    "    fiqa_eval[\"baseline\"],\n",
+    "    metrics=[\n",
+    "        context_precision,\n",
+    "        faithfulness,\n",
+    "        answer_relevancy,\n",
+    "        context_recall,\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = result.to_pandas()\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = ZenoClient(os.environ[\"ZENO_API_KEY\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project = client.create_project(\n",
+    "    name=\"ragas FiQA eval\",\n",
+    "    description=\"Evaluation of RAG model using ragas on FiQA dataset\",\n",
+    "    public=True,\n",
+    "    view={\n",
+    "        \"data\": {\n",
+    "            \"type\": \"vstack\",\n",
+    "            \"keys\": {\n",
+    "                \"question\": {\"type\": \"markdown\"},\n",
+    "                \"texts\": {\n",
+    "                    \"type\": \"list\",\n",
+    "                    \"elements\": {\"type\": \"markdown\"},\n",
+    "                    \"border\": True,\n",
+    "                    \"pad\": True,\n",
+    "                },\n",
+    "            },\n",
+    "        },\n",
+    "        \"label\": {\n",
+    "            \"type\": \"markdown\",\n",
+    "        },\n",
+    "        \"output\": {\n",
+    "            \"type\": \"vstack\",\n",
+    "            \"keys\": {\n",
+    "                \"answer\": {\"type\": \"markdown\"},\n",
+    "                \"ground_truths\": {\n",
+    "                    \"type\": \"list\",\n",
+    "                    \"elements\": {\"type\": \"markdown\"},\n",
+    "                    \"border\": True,\n",
+    "                    \"pad\": True,\n",
+    "                },\n",
+    "            },\n",
+    "        },\n",
+    "        \"size\": \"large\",\n",
+    "    },\n",
+    "    metrics=[\n",
+    "        ZenoMetric(\n",
+    "            name=\"context_precision\", type=\"mean\", columns=[\"context_precision\"]\n",
+    "        ),\n",
+    "        ZenoMetric(name=\"faithfulness\", type=\"mean\", columns=[\"faithfulness\"]),\n",
+    "        ZenoMetric(name=\"answer_relevancy\", type=\"mean\", columns=[\"answer_relevancy\"]),\n",
+    "        ZenoMetric(name=\"context_recall\", type=\"mean\", columns=[\"context_recall\"]),\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_df = pd.DataFrame(\n",
+    "    {\n",
+    "        \"data\": df.apply(\n",
+    "            lambda x: {\"question\": x[\"question\"], \"texts\": list(x[\"contexts\"])}, axis=1\n",
+    "        ),\n",
+    "        \"label\": df[\"ground_truths\"].apply(lambda x: \"\\n\".join(x)),\n",
+    "    }\n",
+    ")\n",
+    "data_df[\"id\"] = data_df.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project.upload_dataset(\n",
+    "    data_df, id_column=\"id\", data_column=\"data\", label_column=\"label\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_df = df[\n",
+    "    [\n",
+    "        \"context_precision\",\n",
+    "        \"faithfulness\",\n",
+    "        \"answer_relevancy\",\n",
+    "        \"context_recall\",\n",
+    "    ]\n",
+    "].copy()\n",
+    "output_df[\"output\"] = df.apply(\n",
+    "    lambda x: {\"answer\": x[\"answer\"], \"ground_truths\": list(x[\"ground_truths\"])}, axis=1\n",
+    ")\n",
+    "output_df[\"id\"] = output_df.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project.upload_system(\n",
+    "    output_df, name=\"Base System\", id_column=\"id\", output_column=\"output\"\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "zeno-build",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,5 @@
+ragas
+python-dotenv
+datasets
+pandas
+zeno-client