Skip to content

Commit

Permalink
updating dataset tools
Browse files Browse the repository at this point in the history
  • Loading branch information
DARREN OBERST authored and DARREN OBERST committed Apr 30, 2024
1 parent 2b7fb98 commit 30b79ac
Show file tree
Hide file tree
Showing 9 changed files with 1,956 additions and 1,870 deletions.
7 changes: 5 additions & 2 deletions examples/Datasets/build_aws_transcript_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

import json
import os
from llmware.util import Datasets
from llmware.dataset_tools import Datasets
from llmware.library import Library
from llmware.setup import Setup
from llmware.configs import LLMWareConfig


def build_aws_transcribe_datasets(library_name):
Expand Down Expand Up @@ -38,4 +39,6 @@ def build_aws_transcribe_datasets(library_name):

if __name__ == "__main__":

build_aws_transcribe_datasets("aws_transcripts_lib_0")
LLMWareConfig().set_active_db("sqlite")

build_aws_transcribe_datasets("aws_transcripts_lib_1")
25 changes: 19 additions & 6 deletions examples/Datasets/build_dataset_from_prompt_history.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import json
import os
from llmware.prompts import Prompt
from llmware.util import Datasets
from llmware.dataset_tools import Datasets
from llmware.configs import LLMWareConfig


# Use prompt history to easily create model-ready fine-tuning datasets
Expand All @@ -19,12 +20,22 @@ def create_datasets_from_prompt_history(model_name):

# Perform several prompts
print (f"\n > Performing several prompts to populate the prompt state...")
response = prompter.prompt_main(prompt="Who was the 46th president?", context=context)
response = prompter.number_or_none(prompt="What year did Joe Biden start as vice president?", context=context)
response = prompter.summarize_with_bullets(prompt="Who is Joe Biden?", context=context)

response = prompter.prompt_main("Who was the 46th president?", context=context,
register_trx=True)

response = prompter.prompt_main(prompt="What year did Joe Biden start as vice president?", context=context,
register_trx=True)

response = prompter.prompt_main(prompt="Who is Joe Biden?", context=context, register_trx=True)

for i, entries in enumerate(prompter.interaction_history):
print("update: interaction prompt history created: ", i, entries)

prompter.save_state()

# Create a Datasets object
datasets = Datasets()
datasets = Datasets(testing_split=0.0, validation_split=0.0)

# Create dataset wrapped in "Alpaca format"
print (f"\n > Creating a dataset from prompt history in ALPACA format...")
Expand All @@ -40,7 +51,7 @@ def create_datasets_from_prompt_history(model_name):
sample = datasets.get_dataset_sample(datasets.current_ds_name)
print (f"\nRandom sample from the dataset:\n{json.dumps(sample, indent=2)}")

# Create dataset wrapped in "Chat GPT format"
# Create dataset wrapped in "human_bot format"
print (f"\n > Creating a dataset from prompt history in HUMAN BOT format...")
humanbot_dataset = datasets.build_gen_ds_from_prompt_history(prompt_wrapper="human_bot")
print (f"\nThe dataset dict:\n{json.dumps(humanbot_dataset, indent=2)}")
Expand All @@ -52,6 +63,8 @@ def create_datasets_from_prompt_history(model_name):

if __name__ == "__main__":

LLMWareConfig().set_active_db("sqlite")

model_name = "llmware/bling-1b-0.1"

create_datasets_from_prompt_history(model_name)
Expand Down
5 changes: 4 additions & 1 deletion examples/Datasets/build_embedding_ft_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
import os

from llmware.library import Library
from llmware.util import Datasets
from llmware.dataset_tools import Datasets
from llmware.setup import Setup
from llmware.configs import LLMWareConfig


def build_embedding_finetuning_dataset(library_name):
Expand Down Expand Up @@ -42,5 +43,7 @@ def build_embedding_finetuning_dataset(library_name):

if __name__ == "__main__":

LLMWareConfig().set_active_db("sqlite")

my_lib_name = "financial_docs_library"
output = build_embedding_finetuning_dataset(my_lib_name)
3 changes: 3 additions & 0 deletions examples/Datasets/knowledge_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from llmware.library import Library
from llmware.setup import Setup
from llmware.util import Graph
from llmware.configs import LLMWareConfig


def build_and_use_knowledge_graph (library_name):
Expand Down Expand Up @@ -88,4 +89,6 @@ def build_and_use_knowledge_graph (library_name):

if __name__ == "__main__":

LLMWareConfig().set_active_db("sqlite")

build_and_use_knowledge_graph("kg_test_0")
12 changes: 7 additions & 5 deletions examples/Datasets/working_with_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@

import json
import os
from llmware.util import Datasets
from llmware.dataset_tools import Datasets
from llmware.library import Library
from llmware.retrieval import Query
from llmware.parsers import Parser
from llmware.setup import Setup
from llmware.configs import LLMWareConfig


def build_and_use_dataset(library_name):
Expand All @@ -27,7 +27,7 @@ def build_and_use_dataset(library_name):
# Create a Datasets object from library
datasets = Datasets(library)

# Build a basic dataset useful for industry domain adaptation for finetuning embedding models
# Build a basic dataset useful for industry domain adaptation for fine-tuning embedding models
print (f"\n > Building basic text dataset...")

basic_embedding_dataset = datasets.build_text_ds(min_tokens=500, max_tokens=1000)
Expand Down Expand Up @@ -56,11 +56,13 @@ def build_and_use_dataset(library_name):

# Pass a set of query results to create a dataset from those results only
query_results = Query(library=library).query("africa")
query_filtered_dataaset = datasets.build_text_ds(min_tokens=250,max_tokens=600, qr=query_results)
query_filtered_dataset = datasets.build_text_ds(min_tokens=250,max_tokens=600, qr=query_results)

return 0


if __name__ == "__main__":

build_and_use_dataset("test_txt_datasets")
LLMWareConfig().set_active_db("sqlite")

build_and_use_dataset("test_txt_datasets_0")
Loading

0 comments on commit 30b79ac

Please sign in to comment.