Skip to content

Commit

Permalink
Move llama-dataset metadata from llama-hub repo to llama-index (#11488)
Browse files Browse the repository at this point in the history
* copy dataset metadata

* update library.json and download modules

* working

* pants tailor

* use main not dev branch
  • Loading branch information
nerdai authored Feb 29, 2024
1 parent f508073 commit b3112de
Show file tree
Hide file tree
Showing 81 changed files with 2,552 additions and 11 deletions.
1 change: 1 addition & 0 deletions docs/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
1 change: 1 addition & 0 deletions docs/examples/finetuning/embeddings/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
1 change: 1 addition & 0 deletions docs/examples/output_parsing/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
1 change: 1 addition & 0 deletions llama-datasets/10k/uber_2021/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
61 changes: 61 additions & 0 deletions llama-datasets/10k/uber_2021/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Uber 10K Dataset 2021

## CLI Usage

You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:

```bash
llamaindex-cli download-llamadataset Uber10KDataset2021 --download-dir ./data
```

You can then inspect the files at `./data`. When you're ready to load the data into
Python, you can use the snippet of code below:

```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset import LabelledRagDataset

rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")
documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data()
```

## Code Usage

You can download the dataset to a directory, say `./data` directly in Python
as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to
run your own LlamaIndex RAG pipeline with the `llamadataset`.

```python
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# download and install dependencies for benchmark dataset
rag_dataset, documents = download_llama_dataset("Uber10KDataset2021", "./data")

# build basic RAG system
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()

# evaluate using the RagEvaluatorPack
RagEvaluatorPack = download_llama_pack(
"RagEvaluatorPack", "./rag_evaluator_pack"
)
rag_evaluator_pack = RagEvaluatorPack(
rag_dataset=rag_dataset,
query_engine=query_engine,
show_progress=True,
)

############################################################################
# NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
# Tier 1), you'll need to use a different batch_size and                   #
# sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
# sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
############################################################################

benchmark_df = await rag_evaluator_pack.arun(
batch_size=20, # batches the number of openai api calls to make
sleep_time_in_seconds=1, # seconds to sleep before making an api call
)
```
27 changes: 27 additions & 0 deletions llama-datasets/10k/uber_2021/card.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"name": "Uber 10K Dataset 2021",
"className": "LabelledRagDataset",
"description": "A labelled RAG dataset based on the Uber 2021 10K document, consisting of queries, reference answers, and reference contexts.",
"numberObservations": 822,
"containsExamplesByHumans": false,
"containsExamplesByAi": true,
"sourceUrls": [],
"baselines": [
{
"name": "llamaindex",
"config": {
"chunkSize": 1024,
"llm": "gpt-3.5-turbo",
"similarityTopK": 2,
"embedModel": "text-embedding-ada-002"
},
"metrics": {
"contextSimilarity": 0.943,
"correctness": 3.874,
"faithfulness": 0.667,
"relevancy": 0.844
},
"codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/10k/uber_2021/llamaindex_baseline.py"
}
]
}
41 changes: 41 additions & 0 deletions llama-datasets/10k/uber_2021/llamaindex_baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import asyncio

from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex
# Post-0.10 llama-index ships LLM integrations as separate packages; the old
# `from llama_index.llms import OpenAI` path no longer resolves.
from llama_index.llms.openai import OpenAI


async def main():
    """Benchmark a basic RAG pipeline on the Uber 10K 2021 dataset.

    Downloads the labelled dataset, builds a default VectorStoreIndex
    query engine over its source documents, and evaluates it with the
    RagEvaluatorPack, printing the resulting benchmark DataFrame.
    """
    # DOWNLOAD LLAMADATASET
    rag_dataset, documents = download_llama_dataset(
        "Uber10KDataset2021", "./uber10k_2021_dataset"
    )

    # BUILD BASIC RAG PIPELINE
    index = VectorStoreIndex.from_documents(documents=documents)
    query_engine = index.as_query_engine()

    # EVALUATE WITH PACK
    RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff")
    judge_llm = OpenAI(model="gpt-3.5-turbo")
    rag_evaluator = RagEvaluatorPack(
        query_engine=query_engine, rag_dataset=rag_dataset, judge_llm=judge_llm
    )

    ############################################################################
    # NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
    # Tier 1), you'll need to use a different batch_size and                   #
    # sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
    # sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
    ############################################################################
    benchmark_df = await rag_evaluator.arun(
        batch_size=20,  # batches the number of openai api calls to make
        sleep_time_in_seconds=1,  # seconds to sleep before making an api call
    )
    print(benchmark_df)


if __name__ == "__main__":
    # asyncio.run() creates the event loop and actually awaits main().
    # The previous loop.run_until_complete(main) passed the coroutine
    # *function* without calling it, which raises a TypeError.
    asyncio.run(main())
Empty file added llama-datasets/__init__.py
Empty file.
1 change: 1 addition & 0 deletions llama-datasets/blockchain_solana/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
61 changes: 61 additions & 0 deletions llama-datasets/blockchain_solana/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Blockchain Solana Dataset

## CLI Usage

You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:

```bash
llamaindex-cli download-llamadataset BlockchainSolanaDataset --download-dir ./data
```

You can then inspect the files at `./data`. When you're ready to load the data into
Python, you can use the snippet of code below:

```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset import LabelledRagDataset

rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")
documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data()
```

## Code Usage

You can download the dataset to a directory, say `./data` directly in Python
as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to
run your own LlamaIndex RAG pipeline with the `llamadataset`.

```python
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# download and install dependencies for benchmark dataset
rag_dataset, documents = download_llama_dataset(
"BlockchainSolanaDataset", "./data"
)

# build basic RAG system
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()

# evaluate using the RagEvaluatorPack
RagEvaluatorPack = download_llama_pack(
"RagEvaluatorPack", "./rag_evaluator_pack"
)
rag_evaluator_pack = RagEvaluatorPack(
rag_dataset=rag_dataset, query_engine=query_engine
)

############################################################################
# NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
# Tier 1), you'll need to use a different batch_size and                   #
# sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
# sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
############################################################################

benchmark_df = await rag_evaluator_pack.arun(
batch_size=20, # batches the number of openai api calls to make
sleep_time_in_seconds=1, # seconds to sleep before making an api call
)
```
27 changes: 27 additions & 0 deletions llama-datasets/blockchain_solana/card.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"name": "Blockchain Solana",
"className": "LabelledRagDataset",
"description": "A labelled RAG dataset based off an article, From Bitcoin to Solana – Innovating Blockchain towards Enterprise Applications),by Xiangyu Li, Xinyu Wang, Tingli Kong, Junhao Zheng and Min Luo, consisting of queries, reference answers, and reference contexts.",
"numberObservations": 58,
"containsExamplesByHumans": false,
"containsExamplesByAi": true,
"sourceUrls": ["https://arxiv.org/abs/2207.05240"],
"baselines": [
{
"name": "llamaindex",
"config": {
"chunkSize": 1024,
"llm": "gpt-3.5-turbo",
"similarityTopK": 2,
"embedModel": "text-embedding-ada-002"
},
"metrics": {
"contextSimilarity": 0.945,
"correctness": 4.457,
"faithfulness": 1.0,
"relevancy": 1.0
},
"codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/blockchain_solana/llamaindex_baseline.py"
}
]
}
37 changes: 37 additions & 0 deletions llama-datasets/blockchain_solana/llamaindex_baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import asyncio

from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex


async def main():
    """Benchmark a basic RAG pipeline on the Blockchain Solana dataset.

    Downloads the labelled dataset, builds a default VectorStoreIndex
    query engine over its source documents, and evaluates it with the
    RagEvaluatorPack, printing the resulting benchmark DataFrame.
    """
    # DOWNLOAD LLAMADATASET
    rag_dataset, documents = download_llama_dataset(
        "BlockchainSolanaDataset", "./blockchain_solana"
    )

    # BUILD BASIC RAG PIPELINE
    index = VectorStoreIndex.from_documents(documents=documents)
    query_engine = index.as_query_engine()

    # EVALUATE WITH PACK
    RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff")
    rag_evaluator = RagEvaluatorPack(
        query_engine=query_engine, rag_dataset=rag_dataset
    )

    ############################################################################
    # NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
    # Tier 1), you'll need to use a different batch_size and                   #
    # sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
    # sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
    ############################################################################
    benchmark_df = await rag_evaluator.arun(
        batch_size=20,  # batches the number of openai api calls to make
        sleep_time_in_seconds=1,  # seconds to sleep before making an api call
    )
    print(benchmark_df)


if __name__ == "__main__":
    # asyncio.run() creates the event loop and actually awaits main().
    # The previous loop.run_until_complete(main) passed the coroutine
    # *function* without calling it, which raises a TypeError.
    asyncio.run(main())
1 change: 1 addition & 0 deletions llama-datasets/braintrust_coda/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_sources()
65 changes: 65 additions & 0 deletions llama-datasets/braintrust_coda/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Braintrust Coda Help Desk Dataset

[![Braintrust (346 x 40 px)](https://github.com/nerdai/llama-hub/assets/92402603/a99bddf3-0eab-42e8-8c53-8432da8299d3)](https://www.braintrustdata.com/)

_This dataset was kindly provided by Kenny Wong and Ankur Goyal._

## CLI Usage

You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:

```bash
llamaindex-cli download-llamadataset BraintrustCodaHelpDeskDataset --download-dir ./data
```

You can then inspect the files at `./data`. When you're ready to load the data into
Python, you can use the snippet of code below:

```python
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset import LabelledRagDataset

rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")
documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data()
```

## Code Usage

You can download the dataset to a directory, say `./data` directly in Python
as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to
run your own LlamaIndex RAG pipeline with the `llamadataset`.

```python
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# download and install dependencies for benchmark dataset
rag_dataset, documents = download_llama_dataset(
"BraintrustCodaHelpDeskDataset", "./data"
)

# build basic RAG system
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()

# evaluate using the RagEvaluatorPack
RagEvaluatorPack = download_llama_pack(
"RagEvaluatorPack", "./rag_evaluator_pack"
)
rag_evaluator_pack = RagEvaluatorPack(
rag_dataset=rag_dataset, query_engine=query_engine
)

############################################################################
# NOTE: If you have a lower tier OpenAI API subscription (e.g. Usage       #
# Tier 1), you'll need to use a different batch_size and                   #
# sleep_time_in_seconds. For Usage Tier 1, batch_size=5 and                #
# sleep_time_in_seconds=15 seemed to work well (as of December 2023).      #
############################################################################

benchmark_df = await rag_evaluator_pack.arun(
batch_size=20, # batches the number of openai api calls to make
sleep_time_in_seconds=1, # seconds to sleep before making an api call
)
```
Empty file.
29 changes: 29 additions & 0 deletions llama-datasets/braintrust_coda/card.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"name": "Braintrust Coda Help Desk",
"className": "LabelledRagDataset",
"description": "A list of automatically generated question/answer pairs from the Coda (https://coda.io/) help docs. This dataset is interesting because most models include Coda’s documentation as part of their training set, so you can baseline performance without RAG.",
"numberObservations": 100,
"containsExamplesByHumans": false,
"containsExamplesByAi": true,
"sourceUrls": [
"https://gist.githubusercontent.com/wong-codaio/b8ea0e087f800971ca5ec9eef617273e/raw/39f8bd2ebdecee485021e20f2c1d40fd649a4c77/articles.json"
],
"baselines": [
{
"name": "llamaindex",
"config": {
"chunkSize": 1024,
"llm": "gpt-3.5-turbo",
"similarityTopK": 2,
"embedModel": "text-embedding-ada-002"
},
"metrics": {
"contextSimilarity": 0.955,
"correctness": 4.32,
"faithfulness": 0.9,
"relevancy": 0.93
},
"codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/braintrust_coda/llamaindex_baseline.py"
}
]
}
Loading

0 comments on commit b3112de

Please sign in to comment.