Commit
Adding GGUF real-time streaming method and examples
DARREN OBERST authored and committed on May 11, 2024
1 parent 73f8e8d · commit ff4fa6f
Showing 4 changed files with 249 additions and 0 deletions.
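Note: the commit title refers to a stream method added for GGUF models, but the diff below only shows example usage of that method. Conceptually, a streaming method is a Python generator that yields decoded tokens one at a time instead of returning the finished completion. The toy sketch below is purely illustrative - toy_stream is a made-up stand-in, not the llmware implementation - and shows the consumption pattern used in the examples that follow:

import time
from typing import Iterator

def toy_stream(prompt: str) -> Iterator[str]:
    """ Toy stand-in for model.stream - yields 'tokens' one at a time instead of returning the full text. """

    canned_response = "Streaming sends each token to the caller as soon as it is decoded."

    for token in canned_response.split(" "):
        time.sleep(0.05)          # stand-in for the per-token decoding step in the GGUF engine
        yield token + " "

# the caller consumes the generator exactly as in the real examples below
for streamed_token in toy_stream("any prompt"):
    print(streamed_token, end="", flush=True)
print()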
@@ -0,0 +1,53 @@
""" This example illustrates how to use the stream method for GGUF models for fast streaming of inference,
    especially for real-time chat interactions.

    Please note that the stream method has been implemented for GGUF models starting in llmware-0.2.13. It applies to
    any model with the GGUFGenerativeModel class, which generally includes models with names that end in "gguf".

    See also the chat UI example in the UI examples folder.

    We would recommend using a chat-optimized model, and have included a representative list below. """

from llmware.models import ModelCatalog
from llmware.gguf_configs import GGUFConfigs

# sets an absolute output maximum for the GGUF engine - normally set by default at 256
GGUFConfigs().set_config("max_output_tokens", 1000)

chat_models = ["phi-3-gguf",
               "llama-2-7b-chat-gguf",
               "llama-3-instruct-bartowski-gguf",
               "openhermes-mistral-7b-gguf",
               "zephyr-7b-gguf",
               "tiny-llama-chat-gguf"]

model_name = chat_models[0]

# maximum output can be set optionally at any number up to the "max_output_tokens" configured above
model = ModelCatalog().load_model(model_name, max_output=200)

text_out = ""
token_count = 0

prompt = "I am interested in gaining an understanding of the banking industry. What topics should I research?"

# model.stream provides a generator - iterate over it to consume the tokens as they are produced

for streamed_token in model.stream(prompt):

    text_out += streamed_token

    # skip printing until the first non-whitespace token has arrived
    if text_out.strip():
        print(streamed_token, end="")

    token_count += 1

# final output text and token count

print("\n\n***total text out***: ", text_out)
print("\n***total tokens***: ", token_count)
@@ -0,0 +1,76 @@
""" This example shows how to build a local chatbot prototype using llmware and Streamlit. The example shows
    how to use several GGUF chat models in the LLMWare catalog, along with the model.stream method, which
    provides a generator for displaying the bot response in real time.

    This is purposefully a super-simple script (but surprisingly fun) to provide the core of the recipe.

    The Streamlit code below is derived from Streamlit tutorials available at:
    https://docs.streamlit.io/develop/tutorials/llms/build-conversational-apps

    If you are new to using Streamlit, to run this example:

        1.  pip3 install streamlit

        2.  to run, go to the command line:  streamlit run "path/to/gguf_streaming_chatbot.py"

"""

import streamlit as st
from llmware.models import ModelCatalog
from llmware.gguf_configs import GGUFConfigs

GGUFConfigs().set_config("max_output_tokens", 500)


def simple_chat_ui_app(model_name):

    st.title(f"Simple Chat with {model_name}")

    model = ModelCatalog().load_model(model_name, temperature=0.3, sample=True, max_output=450)

    # initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # accept user input
    prompt = st.chat_input("Say something")
    if prompt:

        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):

            # note that the st.write_stream method consumes a generator - so pass model.stream(prompt) directly
            bot_response = st.write_stream(model.stream(prompt))

        st.session_state.messages.append({"role": "user", "content": prompt})
        st.session_state.messages.append({"role": "assistant", "content": bot_response})

    return 0


if __name__ == "__main__":

    # a few representative good chat models that can run locally
    # note: the model will be downloaded and cached locally the first time it is used, which may take a minute

    chat_models = ["phi-3-gguf",
                   "llama-2-7b-chat-gguf",
                   "llama-3-instruct-bartowski-gguf",
                   "openhermes-mistral-7b-gguf",
                   "zephyr-7b-gguf",
                   "tiny-llama-chat-gguf"]

    model_name = chat_models[0]

    simple_chat_ui_app(model_name)
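One small variation (not part of the commit): instead of hard-coding chat_models[0], the __main__ block of the script above could expose the model choice in the Streamlit sidebar. st.sidebar.selectbox is standard Streamlit; note that changing the selection triggers a rerun, which will load the newly selected model:

if __name__ == "__main__":

    chat_models = ["phi-3-gguf",
                   "llama-2-7b-chat-gguf",
                   "llama-3-instruct-bartowski-gguf",
                   "openhermes-mistral-7b-gguf",
                   "zephyr-7b-gguf",
                   "tiny-llama-chat-gguf"]

    # sketch: let the user pick the chat model from a sidebar dropdown
    selected_model = st.sidebar.selectbox("Chat model", chat_models, index=0)

    simple_chat_ui_app(selected_model)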