Move from llama-cpp and huggingface to ollama (#15)
* Move from llama-cpp and huggingface to ollama
* Update CI
* test ollama
* host ollama on gh runner
1 parent: b5d3bf1. Commit: 8fa74d3. 5 changed files with 70 additions and 160 deletions.
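
The commit message above notes that CI now runs against an Ollama server hosted on the GitHub runner. Below is a minimal readiness-check sketch, assuming Ollama's default endpoint at http://localhost:11434 and its /api/tags listing route, and assuming the llama3 model referenced in the diff has already been pulled; none of this is taken from the repository's actual CI scripts.

import json
import time
import urllib.request

# Assumption: Ollama's default listen address; not taken from this repository's CI.
OLLAMA_URL = "http://localhost:11434"

def wait_for_ollama(timeout=60.0):
    """Poll the Ollama server until it responds, then return installed model names."""
    deadline = time.time() + timeout
    while True:
        try:
            with urllib.request.urlopen(f"{OLLAMA_URL}/api/tags", timeout=5) as resp:
                tags = json.load(resp)
            return [model["name"] for model in tags.get("models", [])]
        except OSError:
            if time.time() > deadline:
                raise RuntimeError("Ollama server did not come up in time")
            time.sleep(2)

models = wait_for_ollama()
# The diff below initializes ChatOllama(model='llama3'), so that model must be pulled.
assert any(name.startswith("llama3") for name in models), "run: ollama pull llama3"
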
@@ -1,158 +1,75 @@
""" Llama4U """
import sys
from os import devnull
from contextlib import contextmanager,redirect_stderr
import asyncio
from termcolor import colored
from huggingface_hub import hf_hub_download
import llama_cpp
from langchain_community.llms.llamacpp import LlamaCpp
from langchain.chains.conversation.base import ConversationChain
from langchain.memory.buffer import ConversationBufferMemory
from langchain_core.prompts import (
    ChatPromptTemplate, HumanMessagePromptTemplate
)
from langchain_community.chat_models.ollama import ChatOllama
from langchain_community.chat_message_histories.in_memory import ChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from input.input import parse_arguments

LLAMA4U_STR = 'Llama4U'

class Llama4U():
    """ Llama4U """

    # Model config parameters
    model_kwargs = {
        "n_gpu_layers": -1,
        "logits_all": True,
        'split_mode':llama_cpp.LLAMA_SPLIT_MODE_LAYER,
        'vocab_only': False,
        'use_mmap': True,
        'use_mlock': False,
        'kv_overrides': None,
        'seed': llama_cpp.LLAMA_DEFAULT_SEED,
        'n_ctx': 2048,
        'n_batch': 512,
        'rope_scaling_type': llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
        'pooling_type': llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
        'rope_freq_base': 0.0,
        'rope_freq_scale': 0.0,
        'yarn_ext_factor':-1.0,
        'yarn_attn_factor': 1.0,
        'yarn_beta_fast': 32.0,
        'yarn_beta_slow': 1.0,
        'yarn_orig_ctx': 0,
        'embedding': False,
        'offload_kqv': True,
        'flash_attn': False,
        'last_n_tokens_size': 64,
        'lora_scale': 1.0,
        'numa': False,
        'chat_format': 'llama-2',
        'chat_handler': None,
        'verbose':True,
    }

    # Chat config parameters
    chat_kwargs = {
        'temperature': 0.2,
        'top_p': 0.95,
        'top_k': 40,
        'min_p': 0.05,
        'typical_p': 1.0,
        'max_tokens': None,
        'echo': False,
        'presence_penalty':0.0,
        'frequency_penalty':0.0,
        'repeat_penalty':1.1,
        'tfs_z':1.0,
        'mirostat_mode': 0,
        'mirostat_tau': 5.0,
        'mirostat_eta': 0.1,
        'logprobs': True,
        #'top_logprobs': 1,
    }

    # Define the human message template
    human_template = HumanMessagePromptTemplate.from_template(
        "{history}<|eot_id|>\n\n{input}<|eot_id|>"
    )

    # Combine the templates into a chat prompt template
    chat_template = ChatPromptTemplate.from_messages([human_template])

    def __init__(self,
                 hf_repo_id,
                 model_filename
                 ):
        if hf_repo_id is None:
            self.hf_repo_id='PawanKrd/Meta-Llama-3-8B-Instruct-GGUF'
        if model_filename is None:
            model_filename='llama-3-8b-instruct.Q3_K_M.gguf'
        self.model_path = hf_hub_download(repo_id=self.hf_repo_id, filename=model_filename)

        # Initialize LLM
        self.llm = LlamaCpp(
            model_path=self.model_path,
            **self.model_kwargs,
        )

        # Initialize Conversation "Chain"
        # using our LLM, chat template and config params
        self.conversation_chain = ConversationChain(
            llm=self.llm,
            prompt=self.chat_template,
            memory=ConversationBufferMemory(),
            llm_kwargs=self.chat_kwargs,
    system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, \
formulate a response that is clear and understandable by an 18yo human."""
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )

    llm = ChatOllama(model='llama3')
    runnable = prompt | llm

    store = {}

    def __init__(self):
        # Initialize LLM chat chain
        self.with_msg_history = RunnableWithMessageHistory(
            runnable=self.runnable,
            get_session_history=self.get_session_history,
            input_messages_key="input",
            history_messages_key="chat_history",
        )
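        # RunnableWithMessageHistory ties the prompt|llm runnable to a
        # per-session ChatMessageHistory: each invoke() names a "session_id"
        # in its config, get_session_history() returns (or creates) that
        # session's history, and its messages fill the "chat_history"
        # placeholder in the prompt.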

    def process_user_input(self):
        """ Get input from stdout """
        print(colored('>>> ', 'yellow'), end="")
        user_prompt = input()
        if user_prompt.lower() in ["exit", "quit", "bye"]:
            print(colored(f'{LLAMA4U_STR}: =====', 'yellow'))
            print("Chat session ended. Goodbye!")
            sys.exit(0)
        return user_prompt

    def start_chat_session(self, query=""):
        """ Chat session loop """
        my_messages=""
        stop_next_iter = False
        for _ in range(50):
            if stop_next_iter:
                break

            # User's turn
            if not query:
                my_messages = self.process_user_input()
            else:
                my_messages = query
                stop_next_iter = True

            # AI's turn
            response = self.conversation_chain.predict(input=my_messages)
            print(response.strip())

@contextmanager
def suppress_stderr(verbose):
    """A context manager that redirects stderr to devnull based on verbose selection """
    if verbose <= 0:
        with open(devnull, 'w', encoding='utf-8') as fnull:
            with redirect_stderr(fnull) as err:
                yield err
    else:
        yield ()
    def get_session_history(self, session_id):
        """ Get session history from session_id """
        if session_id not in self.store:
            self.store[session_id] = ChatMessageHistory()
        return self.store[session_id]

    async def chat_session(self):
        """ Chat session with history """
        while True:
            print(colored('>>> ', 'yellow'), end="")
            user_prompt = input()

            response = self.with_msg_history.invoke(
                {"input": user_prompt},
                config={"configurable": {"session_id": "abc123"}},
            )
            print(response.content)

    async def dispatch(self, query=""):
        """ Dispatch query """
        if query:
            response = self.llm.invoke(input=query)
            query=""
            print(response.content)
        else:
            await self.chat_session()

def main():
    """ Pip Package entrypoint """
    args = parse_arguments()
    if args.verbose:
        verbose = args.verbose
    else:
        verbose = 0

    with suppress_stderr(verbose):
        llama4u = Llama4U(args.repo_id, args.filename)
        llama4u.start_chat_session(args.query)
    llama4u = Llama4U()
    asyncio.run(llama4u.dispatch(args.query))

if __name__ == '__main__':
    main()
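
For reference, a short usage sketch of the new entrypoints introduced above, assuming a local Ollama server with the llama3 model is running; the module's import path is not visible in this view, so Llama4U is referenced directly.

import asyncio

# One-shot: a non-empty query is sent straight to ChatOllama and the reply printed.
llama4u = Llama4U()
asyncio.run(llama4u.dispatch("Summarize the chat history handling in Llama4U."))

# Interactive: with no query, dispatch() falls through to chat_session(), which
# keeps per-session history via RunnableWithMessageHistory (loops until interrupted).
asyncio.run(llama4u.dispatch())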