Skip to content

Commit

Permalink
separated widgets, all callbacks work
Browse files Browse the repository at this point in the history
  • Loading branch information
gustavz committed Aug 29, 2023
1 parent e537112 commit 78c93f7
Show file tree
Hide file tree
Showing 13 changed files with 370 additions and 386 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ __pycache__
.ipynb_checkpoints
.DS_Store
testing.ipynb
.vscode
.vscode
.venv
28 changes: 15 additions & 13 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
from datachad.streamlit.helper import (
authentication_and_options_side_bar,
chat_interface,
initialize_session_state,
from datachad.streamlit.helper import init_session_state
from datachad.streamlit.widgets import (
advanced_options_widget,
authentication_widget,
chat_interface_widget,
init_widgets,
page_header,
upload_data_source,
usage_side_bar,
vector_store_selection,
select_data_source_widget,
usage_widget,
)

init_session_state()
page_header()
initialize_session_state()
authentication_and_options_side_bar()
upload_data_source()
vector_store_selection()
chat_interface()
usage_side_bar()
init_widgets()
authentication_widget()
select_data_source_widget()
advanced_options_widget()
chat_interface_widget()
usage_widget()
Empty file added datachad/__init__.py
Empty file.
13 changes: 11 additions & 2 deletions datachad/backend/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,17 @@

MODEL_PATH = Path("models")
DATA_PATH = Path("data")
GPT4ALL_BINARY = "ggml-gpt4all-j-v1.3-groovy.bin"

DEFAULT_USER = "admin"

FORCE_LOCAL_DEEPLAKE = True
CHUNK_SIZE = 512
CHUNK_OVERLAP_PCT = 15
TEMPERATURE = 0.0
MAX_TOKENS = 2560
MAXIMAL_MARGINAL_RELEVANCE = True
DISTANCE_METRIC = "cos"
K_FETCH_K_RATIO = 3

ENABLE_ADVANCED_OPTIONS = True
STORE_DOCS_EXTRA = False
LOCAL_DEEPLAKE = False
23 changes: 11 additions & 12 deletions datachad/backend/deeplake.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
from langchain.schema import Document
from langchain.vectorstores import DeepLake, VectorStore

from datachad.backend.constants import DATA_PATH, DEFAULT_USER, FORCE_LOCAL_DEEPLAKE
from datachad.backend.constants import DATA_PATH, DEFAULT_USER, LOCAL_DEEPLAKE
from datachad.backend.io import clean_string_for_storing
from datachad.backend.loader import load_data_source, split_docs
from datachad.backend.logging import logger
from datachad.backend.models import MODES, get_embeddings
from datachad.backend.models import get_embeddings
from datachad.backend.utils import clean_string_for_storing

SPLIT = "_"
Expand Down Expand Up @@ -60,7 +60,7 @@ def get_datasets(self, workspace: str):

def get_deeplake_dataset_path(dataset_name: str, options: dict, credentials: dict):
# TODO add user id and dataset size as unique id
if options["mode"] == MODES.LOCAL or FORCE_LOCAL_DEEPLAKE:
if LOCAL_DEEPLAKE:
dataset_path = str(DATA_PATH / dataset_name)
else:
dataset_path = f"hub://{credentials['activeloop_id']}/{dataset_name}"
Expand All @@ -77,10 +77,8 @@ def delete_all_deeplake_datasets(credentials: dict):
deeplake.delete(path, token=credentials["activeloop_token"], force=True)


def get_existing_deeplake_vector_store_paths(
options: str, credentials: dict
) -> list[str]:
if options["mode"] == MODES.LOCAL or FORCE_LOCAL_DEEPLAKE:
def get_existing_deeplake_vector_store_paths(credentials: dict) -> list[str]:
if LOCAL_DEEPLAKE:
return glob(str(DATA_PATH / "*"), recursive=False)
else:
dataset_names = list_deeplake_datasets(
Expand All @@ -90,17 +88,18 @@ def get_existing_deeplake_vector_store_paths(
return dataset_pahs


def get_deeplake_vector_store_paths_for_user(
options: str, credentials: dict
) -> list[str]:
all_paths = get_existing_deeplake_vector_store_paths(options, credentials)
def get_deeplake_vector_store_paths_for_user(credentials: dict) -> list[str]:
    """Return the existing vector store paths that belong to the current user.

    A path is kept when its final ``SPLIT``-separated segment equals
    ``DEFAULT_USER``.

    Args:
        credentials: Passed through unchanged to
            ``get_existing_deeplake_vector_store_paths``.

    Returns:
        The subset of existing paths whose trailing segment is ``DEFAULT_USER``.
    """

    # TODO: replace DEFAULT_USER with user id once supported
    def _owned_by_default_user(path: str) -> bool:
        # Only the last separator matters, so split once from the right.
        return path.rsplit(SPLIT, 1)[-1] == DEFAULT_USER

    candidates = get_existing_deeplake_vector_store_paths(credentials)
    return list(filter(_owned_by_default_user, candidates))


def get_data_source_from_deeplake_dataset_path(dataset_path):
    """Recover the data source name encoded in a Deep Lake dataset path.

    The last three ``SPLIT``-separated fields of the path are dropped, the
    final ``/``-separated component of the remainder is taken, and a single
    leading ``"data-"`` prefix (if present) is removed.

    Args:
        dataset_path: Dataset path string (local path or ``hub://`` URL).

    Returns:
        The data source name as a string.
    """
    # Re-join with SPLIT so data source names that themselves contain the
    # separator survive intact.
    stem = SPLIT.join(dataset_path.split(SPLIT)[:-3])
    name = stem.split("/")[-1]
    # NOTE: str.lstrip("data-") would strip any leading run of the characters
    # 'd', 'a', 't', '-' (turning "data-attachments" into "chments" and "test"
    # into "est"); removeprefix drops only the literal prefix.
    return name.removeprefix("data-")


def get_deeplake_vector_store_path(
Expand Down
4 changes: 2 additions & 2 deletions datachad/backend/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
GitLoader,
NotebookLoader,
OnlinePDFLoader,
PDFMinerLoader,
PyPDFLoader,
PythonLoader,
TextLoader,
UnstructuredEPubLoader,
Expand Down Expand Up @@ -73,7 +73,7 @@ def load(self) -> List[Document]:
".html": (UnstructuredHTMLLoader, {}),
".md": (UnstructuredMarkdownLoader, {}),
".odt": (UnstructuredODTLoader, {}),
".pdf": (PDFMinerLoader, {}),
".pdf": (PyPDFLoader, {}),
".ppt": (UnstructuredPowerPointLoader, {}),
".pptx": (UnstructuredPowerPointLoader, {}),
".txt": (TextLoader, {"encoding": "utf8"}),
Expand Down
33 changes: 15 additions & 18 deletions datachad/backend/logging.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,20 @@
import logging
import sys

logger = logging.getLogger(__name__)


def configure_logger(debug: int = 0) -> None:
# boilerplate code to enable logging in the streamlit app console
log_level = logging.DEBUG if debug == 1 else logging.INFO
logger.setLevel(log_level)

stream_handler = logging.StreamHandler(stream=sys.stdout)
stream_handler.setLevel(log_level)

formatter = logging.Formatter("%(name)s :: %(levelname)s :: %(message)s")

stream_handler.setFormatter(formatter)

logger.addHandler(stream_handler)
def create_logger(level="DEBUG"):
logger = logging.getLogger(__name__)
logger.propagate = False


configure_logger(0)
logger.setLevel(level)
# if no streamhandler present, add one
if not any(
isinstance(handler, logging.StreamHandler) for handler in logger.handlers
):
stream_handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter("%(name)s :: %(levelname)s :: %(message)s")
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
return logger


logger = create_logger()
33 changes: 2 additions & 31 deletions datachad/backend/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@
import streamlit as st
import tiktoken
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import Embeddings, OpenAIEmbeddings
from langchain.llms import GPT4All
from transformers import AutoTokenizer

from datachad.backend.constants import GPT4ALL_BINARY, MODEL_PATH
from datachad.backend.constants import MODEL_PATH
from datachad.backend.logging import logger


Expand All @@ -24,20 +22,13 @@ def all(cls) -> List[Any]:
@dataclass
class Model:
name: str
mode: str
embedding: str
path: str = None # for local models only

def __str__(self) -> str:
return self.name


class MODES(Enum):
# Add more modes as needed
OPENAI = "OpenAI"
LOCAL = "Local"


class EMBEDDINGS(Enum):
# Add more embeddings as needed
OPENAI = "text-embedding-ada-002"
Expand All @@ -48,20 +39,9 @@ class MODELS(Enum):
# Add more models as needed
GPT35TURBO = Model(
name="gpt-3.5-turbo",
mode=MODES.OPENAI,
embedding=EMBEDDINGS.OPENAI,
)
GPT4 = Model(name="gpt-4", mode=MODES.OPENAI, embedding=EMBEDDINGS.OPENAI)
GPT4ALL = Model(
name="GPT4All",
mode=MODES.LOCAL,
embedding=EMBEDDINGS.HUGGINGFACE,
path=str(MODEL_PATH / GPT4ALL_BINARY),
)

@classmethod
def for_mode(cls, mode) -> List[Model]:
return [m for m in cls.all() if isinstance(m, Model) and m.mode == mode]
GPT4 = Model(name="gpt-4", embedding=EMBEDDINGS.OPENAI)


def get_model(options: dict, credentials: dict) -> BaseLanguageModel:
Expand All @@ -73,15 +53,6 @@ def get_model(options: dict, credentials: dict) -> BaseLanguageModel:
openai_api_key=credentials["openai_api_key"],
streaming=True,
)
case MODELS.GPT4ALL.name:
model = GPT4All(
model=options["model"].path,
n_ctx=options["model_n_ctx"],
backend="gptj",
temp=options["temperature"],
verbose=True,
callbacks=[StreamingStdOutCallbackHandler()],
)
# Added models need to be cased here
case _default:
msg = f"Model {options['model'].name} not supported!"
Expand Down
5 changes: 4 additions & 1 deletion datachad/backend/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
prompt_template = """Use the following pieces of context to answer the question posed at the beginning and end the end.
If the context does not provide enough information to answer the question, try to answer the question from your own knowledge, but make it clear that you do so.
Question: {question}
{context}
Expand Down
20 changes: 1 addition & 19 deletions datachad/streamlit/constants.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,8 @@
PAGE_ICON = "🤖"
APP_NAME = "DataChad V2"
APP_NAME = "DataChad"
PROJECT_URL = "https://github.com/gustavz/DataChad"
DEFAULT_DATA_SOURCE = "https://github.com/gustavz/DataChad.git"

CHUNK_SIZE = 512
CHUNK_OVERLAP_PCT = 15
TEMPERATURE = 0.0
MAX_TOKENS = 3584
MODEL_N_CTX = 1000
MAXIMAL_MARGINAL_RELEVANCE = False
DISTANCE_METRIC = "cos"
K_FETCH_K_RATIO = 3

ENABLE_ADVANCED_OPTIONS = True
ENABLE_LOCAL_MODE = True
STORE_DOCS_EXTRA = False

MODE_HELP = """
Choose between `OpenAI` which uses the openai library to make API calls, or `Local` which runs all operations (Embedding, Vector Stor and LLM) locally.\n
To enable `Local` mode (disabled for the demo) set `ENABLE_LOCAL_MODE` to `True` in `datachad/constants.py` before deploying the app.\n
Furthermore you need to have the model binaries downloaded and stored inside `./models/`\n
"""

LOCAL_MODE_DISABLED_HELP = """
This is a demo hosted with limited resources. Local Mode is not enabled.\n
Expand Down
Loading

0 comments on commit 78c93f7

Please sign in to comment.