Skip to content

Commit

Permalink
Move llama-dataset metadata from llama-hub repo to llama-index (#11488)
Browse files Browse the repository at this point in the history
* copy dataset metadata

* update library.json and download modules

* working

* pants tailor

* use main not dev branch
  • Loading branch information
nerdai authored Feb 29, 2024
1 parent f508073 commit b3112de
Show file tree
Hide file tree
Showing 81 changed files with 2,552 additions and 11 deletions.
1 change: 1 addition & 0 deletions docs/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
1 change: 1 addition & 0 deletions docs/examples/finetuning/embeddings/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
1 change: 1 addition & 0 deletions docs/examples/output_parsing/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
1 change: 1 addition & 0 deletions llama-datasets/10k/uber_2021/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
61 changes: 61 additions & 0 deletions llama-datasets/10k/uber_2021/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Uber 10K Dataset 2021

## CLI Usage

You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:

```bash
llamaindex-cli download-llamadataset Uber10KDataset2021 --download-dir ./data
```

You can then inspect the files at `./data`. When you're ready to load the data into
Python, you can use the snippet of code below:

```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset import LabelledRagDataset

rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")
documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data()
```

## Code Usage

You can download the dataset to a directory, say `./data` directly in Python
as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to
run your own LlamaIndex RAG pipeline with the `llamadataset`.

```python
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# download and install dependencies for benchmark dataset
rag_dataset, documents = download_llama_dataset("Uber10KDataset2021", "./data")

# build basic RAG system
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()

# evaluate using the RagEvaluatorPack
RagEvaluatorPack = download_llama_pack(
"RagEvaluatorPack", "./rag_evaluator_pack"
)
rag_evaluator_pack = RagEvaluatorPack(
rag_dataset=rag_dataset,
query_engine=query_engine,
show_progress=True,
)

############################################################################
# NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
# Tier 1), you'll need to use a different batch_size and                   #
# sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
# sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
############################################################################

benchmark_df = await rag_evaluator_pack.arun(
batch_size=20, # batches the number of openai api calls to make
sleep_time_in_seconds=1, # seconds to sleep before making an api call
)
```
27 changes: 27 additions & 0 deletions llama-datasets/10k/uber_2021/card.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"name": "Uber 10K Dataset 2021",
"className": "LabelledRagDataset",
"description": "A labelled RAG dataset based on the Uber 2021 10K document, consisting of queries, reference answers, and reference contexts.",
"numberObservations": 822,
"containsExamplesByHumans": false,
"containsExamplesByAi": true,
"sourceUrls": [],
"baselines": [
{
"name": "llamaindex",
"config": {
"chunkSize": 1024,
"llm": "gpt-3.5-turbo",
"similarityTopK": 2,
"embedModel": "text-embedding-ada-002"
},
"metrics": {
"contextSimilarity": 0.943,
"correctness": 3.874,
"faithfulness": 0.667,
"relevancy": 0.844
},
"codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/10k/uber_2021/llamaindex_baseline.py"
}
]
}
41 changes: 41 additions & 0 deletions llama-datasets/10k/uber_2021/llamaindex_baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import asyncio

from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex
# Post-0.10 llama-index ships LLM integrations as separate packages; the old
# `from llama_index.llms import OpenAI` path no longer resolves.
from llama_index.llms.openai import OpenAI


async def main():
    """Benchmark a basic RAG pipeline on the Uber 10K 2021 dataset.

    Downloads the labelled dataset, builds a default VectorStoreIndex
    query engine over its source documents, and evaluates it with the
    RagEvaluatorPack, printing the resulting benchmark DataFrame.
    """
    # DOWNLOAD LLAMADATASET
    rag_dataset, documents = download_llama_dataset(
        "Uber10KDataset2021", "./uber10k_2021_dataset"
    )

    # BUILD BASIC RAG PIPELINE
    index = VectorStoreIndex.from_documents(documents=documents)
    query_engine = index.as_query_engine()

    # EVALUATE WITH PACK
    RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff")
    judge_llm = OpenAI(model="gpt-3.5-turbo")
    rag_evaluator = RagEvaluatorPack(
        query_engine=query_engine, rag_dataset=rag_dataset, judge_llm=judge_llm
    )

    ############################################################################
    # NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
    # Tier 1), you'll need to use a different batch_size and                   #
    # sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
    # sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
    ############################################################################
    benchmark_df = await rag_evaluator.arun(
        batch_size=20,  # batches the number of openai api calls to make
        sleep_time_in_seconds=1,  # seconds to sleep before making an api call
    )
    print(benchmark_df)


if __name__ == "__main__":
    # asyncio.run() creates the event loop and actually awaits main().
    # The previous loop.run_until_complete(main) passed the coroutine
    # *function* without calling it, which raises a TypeError.
    asyncio.run(main())
Empty file added llama-datasets/__init__.py
Empty file.
1 change: 1 addition & 0 deletions llama-datasets/blockchain_solana/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
61 changes: 61 additions & 0 deletions llama-datasets/blockchain_solana/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Blockchain Solana Dataset

## CLI Usage

You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:

```bash
llamaindex-cli download-llamadataset BlockchainSolanaDataset --download-dir ./data
```

You can then inspect the files at `./data`. When you're ready to load the data into
Python, you can use the snippet of code below:

```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset import LabelledRagDataset

rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")
documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data()
```

## Code Usage

You can download the dataset to a directory, say `./data` directly in Python
as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to
run your own LlamaIndex RAG pipeline with the `llamadataset`.

```python
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# download and install dependencies for benchmark dataset
rag_dataset, documents = download_llama_dataset(
"BlockchainSolanaDataset", "./data"
)

# build basic RAG system
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()

# evaluate using the RagEvaluatorPack
RagEvaluatorPack = download_llama_pack(
"RagEvaluatorPack", "./rag_evaluator_pack"
)
rag_evaluator_pack = RagEvaluatorPack(
rag_dataset=rag_dataset, query_engine=query_engine
)

############################################################################
# NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
# Tier 1), you'll need to use a different batch_size and                   #
# sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
# sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
############################################################################

benchmark_df = await rag_evaluator_pack.arun(
batch_size=20, # batches the number of openai api calls to make
sleep_time_in_seconds=1, # seconds to sleep before making an api call
)
```
27 changes: 27 additions & 0 deletions llama-datasets/blockchain_solana/card.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"name": "Blockchain Solana",
"className": "LabelledRagDataset",
"description": "A labelled RAG dataset based off an article, From Bitcoin to Solana – Innovating Blockchain towards Enterprise Applications),by Xiangyu Li, Xinyu Wang, Tingli Kong, Junhao Zheng and Min Luo, consisting of queries, reference answers, and reference contexts.",
"numberObservations": 58,
"containsExamplesByHumans": false,
"containsExamplesByAi": true,
"sourceUrls": ["https://arxiv.org/abs/2207.05240"],
"baselines": [
{
"name": "llamaindex",
"config": {
"chunkSize": 1024,
"llm": "gpt-3.5-turbo",
"similarityTopK": 2,
"embedModel": "text-embedding-ada-002"
},
"metrics": {
"contextSimilarity": 0.945,
"correctness": 4.457,
"faithfulness": 1.0,
"relevancy": 1.0
},
"codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/blockchain_solana/llamaindex_baseline.py"
}
]
}
37 changes: 37 additions & 0 deletions llama-datasets/blockchain_solana/llamaindex_baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import asyncio

from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex


async def main():
    """Benchmark a basic RAG pipeline on the Blockchain Solana dataset.

    Downloads the labelled dataset, builds a default VectorStoreIndex
    query engine over its source documents, and evaluates it with the
    RagEvaluatorPack, printing the resulting benchmark DataFrame.
    """
    # DOWNLOAD LLAMADATASET
    rag_dataset, documents = download_llama_dataset(
        "BlockchainSolanaDataset", "./blockchain_solana"
    )

    # BUILD BASIC RAG PIPELINE
    index = VectorStoreIndex.from_documents(documents=documents)
    query_engine = index.as_query_engine()

    # EVALUATE WITH PACK
    RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff")
    rag_evaluator = RagEvaluatorPack(
        query_engine=query_engine, rag_dataset=rag_dataset
    )

    ############################################################################
    # NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
    # Tier 1), you'll need to use a different batch_size and                   #
    # sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
    # sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
    ############################################################################
    benchmark_df = await rag_evaluator.arun(
        batch_size=20,  # batches the number of openai api calls to make
        sleep_time_in_seconds=1,  # seconds to sleep before making an api call
    )
    print(benchmark_df)


if __name__ == "__main__":
    # asyncio.run() creates the event loop and actually awaits main().
    # The previous loop.run_until_complete(main) passed the coroutine
    # *function* without calling it, which raises a TypeError.
    asyncio.run(main())
1 change: 1 addition & 0 deletions llama-datasets/braintrust_coda/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
65 changes: 65 additions & 0 deletions llama-datasets/braintrust_coda/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Braintrust Coda Help Desk Dataset

[![Braintrust (346 x 40 px)](https://github.com/nerdai/llama-hub/assets/92402603/a99bddf3-0eab-42e8-8c53-8432da8299d3)](https://www.braintrustdata.com/)

_This dataset was kindly provided by Kenny Wong and Ankur Goyal._

## CLI Usage

You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:

```bash
llamaindex-cli download-llamadataset BraintrustCodaHelpDeskDataset --download-dir ./data
```

You can then inspect the files at `./data`. When you're ready to load the data into
Python, you can use the snippet of code below:

```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset import LabelledRagDataset

rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")
documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data()
```

## Code Usage

You can download the dataset to a directory, say `./data` directly in Python
as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to
run your own LlamaIndex RAG pipeline with the `llamadataset`.

```python
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# download and install dependencies for benchmark dataset
rag_dataset, documents = download_llama_dataset(
"BraintrustCodaHelpDeskDataset", "./data"
)

# build basic RAG system
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()

# evaluate using the RagEvaluatorPack
RagEvaluatorPack = download_llama_pack(
"RagEvaluatorPack", "./rag_evaluator_pack"
)
rag_evaluator_pack = RagEvaluatorPack(
rag_dataset=rag_dataset, query_engine=query_engine
)

############################################################################
# NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
# Tier 1), you'll need to use a different batch_size and                   #
# sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
# sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
############################################################################

benchmark_df = await rag_evaluator_pack.arun(
batch_size=20, # batches the number of openai api calls to make
sleep_time_in_seconds=1, # seconds to sleep before making an api call
)
```
Empty file.
29 changes: 29 additions & 0 deletions llama-datasets/braintrust_coda/card.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"name": "Braintrust Coda Help Desk",
"className": "LabelledRagDataset",
"description": "A list of automatically generated question/answer pairs from the Coda (https://coda.io/) help docs. This dataset is interesting because most models include Coda’s documentation as part of their training set, so you can baseline performance without RAG.",
"numberObservations": 100,
"containsExamplesByHumans": false,
"containsExamplesByAi": true,
"sourceUrls": [
"https://gist.githubusercontent.com/wong-codaio/b8ea0e087f800971ca5ec9eef617273e/raw/39f8bd2ebdecee485021e20f2c1d40fd649a4c77/articles.json"
],
"baselines": [
{
"name": "llamaindex",
"config": {
"chunkSize": 1024,
"llm": "gpt-3.5-turbo",
"similarityTopK": 2,
"embedModel": "text-embedding-ada-002"
},
"metrics": {
"contextSimilarity": 0.955,
"correctness": 4.32,
"faithfulness": 0.9,
"relevancy": 0.93
},
"codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/braintrust_coda/llamaindex_baseline.py"
}
]
}
Loading

0 comments on commit b3112de

Please sign in to comment.