Commit

Move from llama-cpp and huggingface to ollama (#15)
* Move from llama-cpp and huggingface to ollama

* Update CI

* test ollama

* host ollama on gh runner
virajmalia authored Jun 5, 2024
1 parent b5d3bf1 commit 8fa74d3
Showing 5 changed files with 70 additions and 160 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/CI.yml
@@ -25,10 +25,13 @@ jobs:
- name: Install dependencies
run: |
pip install flake8
curl -fsSL https://ollama.com/install.sh | sudo -E sh
ollama serve &
sleep 5
# This endpoint blocks until ready
time curl -i http://localhost:11434
ollama run llama3
- uses: actions/checkout@v4
- name: Set device type to CPU for CI
run: |
sed -i s/\"llama-cpp-python.*\"/\"llama-cpp-python\"/g pyproject.toml
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
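The updated CI step starts `ollama serve` in the background, sleeps briefly, and probes `http://localhost:11434` with `curl` before pulling `llama3`. The same readiness check can be sketched in Python with only the standard library (the URL and port come from the workflow above; the retry count and delay are assumptions):

```python
import time
import urllib.request
from urllib.error import URLError

def wait_for_ollama(url="http://localhost:11434", retries=30, delay=1.0):
    """Poll the Ollama root endpoint until it responds or retries run out."""
    for _ in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:  # server answers once it is up
                    return True
        except URLError:
            pass  # server not accepting connections yet
        time.sleep(delay)
    return False

if __name__ == "__main__":
    if not wait_for_ollama():
        raise SystemExit("Ollama server did not become ready")
```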
17 changes: 7 additions & 10 deletions README.md
@@ -9,7 +9,7 @@ Develop a free and open source, fully-featured AI solution with agents.
- ChatGPT/GPT4o

## Rule
- APIs that have usage limitations or require keys to be registered with an online account are not permitted to be added to this project.
- APIs that have usage limitations or require keys to be registered with an online account won't be added to this project.

## System requirements
- Powerful CPU or Nvidia GPU (>=8G VRAM)
@@ -33,23 +33,20 @@ echo $CUDACXX && $CUDACXX --version
```

## Steps to run
1. `pip install -e .`
2. `llama4u`

Default model: https://huggingface.co/PawanKrd/Meta-Llama-3-8B-Instruct-GGUF/blob/main/llama-3-8b-instruct.Q3_K_M.gguf
1. Host `llama3` model from [Ollama][1] on your computer
2. `pip install -e .`
3. `llama4u`

`llama4u --help` for full CLI

## Description
Llama4U is an AI assistant developed using [LlamaCPP][1], [LangChain][2] and [Llama3][3]. A completely free AI solution that can be hosted locally, while providing online capabilities in a responsible and user-controllable way.
Llama4U is an AI assistant developed using [Ollama][1], [LangChain][2] and [Llama3][3]. It is a completely free AI solution that can be hosted locally while providing online capabilities in a responsible, user-controllable way.

## Credits
- Meta, for the open source Llama models
- HuggingFace community
- LlamaCPP and llama-cpp-python communities
- Ollama
- LangChain community


[1]: https://github.com/abetlen/llama-cpp-python
[1]: https://github.com/ollama/ollama
[2]: https://python.langchain.com/v0.1/docs/get_started/introduction/
[3]: https://huggingface.co/blog/llama3
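Once `llama3` is hosted via Ollama (step 1 of the updated README), a quick sanity check can confirm the model is reachable before running `llama4u`. A minimal sketch along the lines of what the project itself does, assuming `langchain-community` is installed and the prompt text is arbitrary:

```python
from langchain_community.chat_models.ollama import ChatOllama

llm = ChatOllama(model="llama3")          # same model name the project uses
reply = llm.invoke("Say hello in one sentence.")
print(reply.content)                      # the AIMessage's text content
```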
7 changes: 1 addition & 6 deletions pyproject.toml
@@ -19,14 +19,9 @@ classifiers = [
"Operating System :: OS Independent",
]
dependencies = [
"huggingface_hub",
"llama-cpp-python @ https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.75-cu124/llama_cpp_python-0.2.75-cp310-cp310-linux_x86_64.whl",
"llama-index-llms-llama-cpp",
"llama-index-embeddings-huggingface",
"llama-index-embeddings-langchain",
"langchain",
"langchain-core",
"langchain-community",
"sentence-transformers",
"langchain-chroma",
"duckduckgo_search",
"termcolor"
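With the llama-cpp, llama-index and HuggingFace dependencies dropped, a small script can verify that the remaining dependency set imports cleanly after `pip install -e .`. A sketch; the module names are assumed to match the distribution names kept in pyproject.toml:

```python
import importlib

# Import names assumed for the distributions retained above
modules = [
    "langchain",
    "langchain_core",
    "langchain_community",
    "langchain_chroma",
    "duckduckgo_search",
    "termcolor",
]

for name in modules:
    importlib.import_module(name)
    print(f"ok: {name}")
```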
2 changes: 0 additions & 2 deletions src/input/input.py
@@ -5,8 +5,6 @@ def parse_arguments():
""" parse input arguments """
version = importlib.metadata.version('Llama4U')
parser = argparse.ArgumentParser(description=f'Llama4U v{version}')
parser.add_argument('-r', '--repo_id', type=str, required=False, help='Repository ID')
parser.add_argument('-f', '--filename', type=str, required=False, help='Filename')
parser.add_argument('-q', '--query', type=str, required=False, help='Single Query')
parser.add_argument('-v', '--verbose', type=int, required=False, help='Enable verbose output')
return parser.parse_args()
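With the `--repo_id` and `--filename` flags removed, the CLI surface reduces to a single query and a verbosity switch. A small illustration of the resulting parser behaviour (the `prog` name and sample query are illustrative; the real description string comes from package metadata):

```python
import argparse

# Mirrors the reduced argument set: query plus verbosity
parser = argparse.ArgumentParser(prog="llama4u", description="Llama4U")
parser.add_argument("-q", "--query", type=str, required=False, help="Single Query")
parser.add_argument("-v", "--verbose", type=int, required=False, help="Enable verbose output")

args = parser.parse_args(["-q", "What is Ollama?"])
print(args.query, args.verbose)   # -> "What is Ollama?" None
```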
195 changes: 56 additions & 139 deletions src/llama4u.py
@@ -1,158 +1,75 @@
""" Llama4U """
import sys
from os import devnull
from contextlib import contextmanager,redirect_stderr
import asyncio
from termcolor import colored
from huggingface_hub import hf_hub_download
import llama_cpp
from langchain_community.llms.llamacpp import LlamaCpp
from langchain.chains.conversation.base import ConversationChain
from langchain.memory.buffer import ConversationBufferMemory
from langchain_core.prompts import (
ChatPromptTemplate, HumanMessagePromptTemplate
)
from langchain_community.chat_models.ollama import ChatOllama
from langchain_community.chat_message_histories.in_memory import ChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from input.input import parse_arguments

LLAMA4U_STR = 'Llama4U'

class Llama4U():
""" Llama4U """

# Model config parameters
model_kwargs = {
"n_gpu_layers": -1,
"logits_all": True,
'split_mode':llama_cpp.LLAMA_SPLIT_MODE_LAYER,
'vocab_only': False,
'use_mmap': True,
'use_mlock': False,
'kv_overrides': None,
'seed': llama_cpp.LLAMA_DEFAULT_SEED,
'n_ctx': 2048,
'n_batch': 512,
'rope_scaling_type': llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
'pooling_type': llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
'rope_freq_base': 0.0,
'rope_freq_scale': 0.0,
'yarn_ext_factor':-1.0,
'yarn_attn_factor': 1.0,
'yarn_beta_fast': 32.0,
'yarn_beta_slow': 1.0,
'yarn_orig_ctx': 0,
'embedding': False,
'offload_kqv': True,
'flash_attn': False,
'last_n_tokens_size': 64,
'lora_scale': 1.0,
'numa': False,
'chat_format': 'llama-2',
'chat_handler': None,
'verbose':True,
}

# Chat config parameters
chat_kwargs = {
'temperature': 0.2,
'top_p': 0.95,
'top_k': 40,
'min_p': 0.05,
'typical_p': 1.0,
'max_tokens': None,
'echo': False,
'presence_penalty':0.0,
'frequency_penalty':0.0,
'repeat_penalty':1.1,
'tfs_z':1.0,
'mirostat_mode': 0,
'mirostat_tau': 5.0,
'mirostat_eta': 0.1,
'logprobs': True,
#'top_logprobs': 1,
}

# Define the human message template
human_template = HumanMessagePromptTemplate.from_template(
"{history}<|eot_id|>\n\n{input}<|eot_id|>"
)

# Combine the templates into a chat prompt template
chat_template = ChatPromptTemplate.from_messages([human_template])

def __init__(self,
hf_repo_id,
model_filename
):
if hf_repo_id is None:
self.hf_repo_id='PawanKrd/Meta-Llama-3-8B-Instruct-GGUF'
if model_filename is None:
model_filename='llama-3-8b-instruct.Q3_K_M.gguf'
self.model_path = hf_hub_download(repo_id=self.hf_repo_id, filename=model_filename)

# Initialize LLM
self.llm = LlamaCpp(
model_path=self.model_path,
**self.model_kwargs,
)

# Initialize Conversation "Chain"
# using our LLM, chat template and config params
self.conversation_chain = ConversationChain(
llm=self.llm,
prompt=self.chat_template,
memory=ConversationBufferMemory(),
llm_kwargs=self.chat_kwargs,
system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, \
formulate a response that is clear and understandable by an 18yo human."""
prompt = ChatPromptTemplate.from_messages(
[
("system", system_prompt),
MessagesPlaceholder("chat_history"),
("human", "{input}"),
]
)

llm = ChatOllama(model='llama3')
runnable = prompt | llm

store = {}

def __init__(self):
# Initialize LLM chat chain
self.with_msg_history = RunnableWithMessageHistory(
runnable=self.runnable,
get_session_history=self.get_session_history,
input_messages_key="input",
history_messages_key="chat_history",
)

def process_user_input(self):
""" Get input from stdout """
print(colored('>>> ', 'yellow'), end="")
user_prompt = input()
if user_prompt.lower() in ["exit", "quit", "bye"]:
print(colored(f'{LLAMA4U_STR}: =====', 'yellow'))
print("Chat session ended. Goodbye!")
sys.exit(0)
return user_prompt

def start_chat_session(self, query=""):
""" Chat session loop """
my_messages=""
stop_next_iter = False
for _ in range(50):
if stop_next_iter:
break

# User's turn
if not query:
my_messages = self.process_user_input()
else:
my_messages = query
stop_next_iter = True

# AI's turn
response = self.conversation_chain.predict(input=my_messages)
print(response.strip())

@contextmanager
def suppress_stderr(verbose):
"""A context manager that redirects stderr to devnull based on verbose selection """
if verbose <= 0:
with open(devnull, 'w', encoding='utf-8') as fnull:
with redirect_stderr(fnull) as err:
yield err
else:
yield ()
def get_session_history(self, session_id):
""" Get session history from session_id """
if session_id not in self.store:
self.store[session_id] = ChatMessageHistory()
return self.store[session_id]

async def chat_session(self):
""" Chat session with history """
while True:
print(colored('>>> ', 'yellow'), end="")
user_prompt = input()

response = self.with_msg_history.invoke(
{"input": user_prompt},
config={"configurable": {"session_id": "abc123"}},
)
print(response.content)

async def dispatch(self, query=""):
""" Dispatch query """
if query:
response = self.llm.invoke(input=query)
query=""
print(response.content)
else:
await self.chat_session()

def main():
""" Pip Package entrypoint """
args = parse_arguments()
if args.verbose:
verbose = args.verbose
else:
verbose = 0

with suppress_stderr(verbose):
llama4u = Llama4U(args.repo_id, args.filename)
llama4u.start_chat_session(args.query)
llama4u = Llama4U()
asyncio.run(llama4u.dispatch(args.query))

if __name__ == '__main__':
main()
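Because the unified diff interleaves the removed llama-cpp code with the new Ollama-based code, the added pieces are easier to read reassembled in one place. The following sketch is put together from the additions above; the session id and demo question are placeholders and the system prompt is abbreviated:

```python
from langchain_community.chat_models.ollama import ChatOllama
from langchain_community.chat_message_histories.in_memory import ChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory

# Prompt combines the system instruction, accumulated history and the new input
prompt = ChatPromptTemplate.from_messages([
    ("system", "Given the chat history and the latest user question, "
               "formulate a clear, understandable response."),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

llm = ChatOllama(model="llama3")   # talks to the locally hosted Ollama server
runnable = prompt | llm            # LCEL pipe: prompt -> model

store = {}                         # in-memory map: session_id -> ChatMessageHistory

def get_session_history(session_id):
    """Return (and lazily create) the message history for a session."""
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

chain = RunnableWithMessageHistory(
    runnable=runnable,
    get_session_history=get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
)

# One turn of the chat loop; "abc123" mirrors the hard-coded session id above
reply = chain.invoke(
    {"input": "Explain what Ollama does in one sentence."},
    config={"configurable": {"session_id": "abc123"}},
)
print(reply.content)
```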
