This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathchatPDF.py
130 lines (89 loc) · 3.66 KB
/
chatPDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# RUN THIS CELL FIRST!
import textract
import os
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPT2TokenizerFast
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
load_dotenv()

# The project keeps the OpenAI key under API_KEY; copy it to OPENAI_API_KEY,
# the variable the LangChain OpenAI integrations look for.
# os.environ only accepts str values, so assigning a missing (None) key would
# raise an opaque "TypeError: str expected, not NoneType". Fail with an
# actionable message instead, and accept an OPENAI_API_KEY that is already set.
_api_key = os.environ.get("API_KEY")
if _api_key:
    os.environ["OPENAI_API_KEY"] = _api_key
elif not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "No OpenAI API key found: set API_KEY (e.g. in a .env file) "
        "or OPENAI_API_KEY in the environment."
    )
"""# 1. Loading PDFs and chunking with LangChain"""
# Simple method - split by pages.
# PyPDFLoader is pointed at the sample PDF; load_and_split() returns the
# loaded documents as an indexable collection (presumably one or more
# documents per page - confirm against the LangChain PyPDFLoader docs).
loader = PyPDFLoader("./assets/test.pdf")
pages = loader.load_and_split()
# Preview the first document as a sanity check (raises IndexError if the
# loader produced no documents).
print(pages[0])
# SKIP TO STEP 2 IF YOU'RE USING THIS METHOD
# NOTE(review): when the script runs top to bottom, `chunks` is reassigned by
# the advanced method below, so this assignment only matters if that section
# is skipped or removed.
chunks = pages
# Advanced method - split by chunk.
# Step 1: Convert PDF to text. textract.process() returns bytes, which are
# decoded as UTF-8 below.
doc = textract.process("./assets/test.pdf")

# Step 2: Save to .txt and reopen (helps prevent issues).
# The encoding is given explicitly on both sides of the round trip: without
# it, open() uses the platform's locale encoding, which on non-UTF-8 systems
# can raise UnicodeEncodeError or silently alter characters in the text.
with open('test.txt', 'w', encoding='utf-8') as f:
    f.write(doc.decode('utf-8'))

with open('test.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Step 3: Token-counting helper.
# The GPT-2 tokenizer is loaded once at module level and reused by every call.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
    """Return the number of tokens the GPT-2 tokenizer produces for *text*."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)
# Step 4: Split the extracted text into chunks.
# Lengths are measured with count_tokens, so chunk_size and chunk_overlap are
# expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,  # deliberately small chunk size, just for demonstration
    chunk_overlap=24,
    length_function=count_tokens,
)
chunks = text_splitter.create_documents([text])

# The result is many LangChain 'Documents' of around 500 tokens or less (the
# recursive splitter sometimes allows more tokens to retain context).
# Notebook leftover: evaluating the type of the first chunk shows nothing when
# run as a script, but it does raise IndexError if no chunks were produced.
type(chunks[0])
# Sanity check: plot how many tokens each chunk contains.
# One token count per chunk, in chunk order.
token_counts = [count_tokens(document.page_content) for document in chunks]

# Single-column DataFrame so pandas can draw the histogram directly.
df = pd.DataFrame({'Token Count': token_counts})

# Histogram of the token-count distribution, using 40 bins.
df.hist(bins=40)

# Display the figure.
plt.show()
"""# 2. Embed text and store embeddings"""
# Get embedding model.
# NOTE(review): OpenAIEmbeddings() is constructed with defaults; presumably it
# picks up the API key from the OPENAI_API_KEY environment variable set at the
# top of the script - confirm.
embeddings = OpenAIEmbeddings()
# Create vector database: build a FAISS store from the documents in `chunks`,
# using the embedding model above.
db = FAISS.from_documents(chunks, embeddings)
"""# 3. Setup retrieval function"""
# Question used both to sanity-check retrieval and to query the QA chain.
query = "What is needed for DevOps to work?"

# Check similarity search is working: fetch the stored chunks most similar to
# the query from the vector database.
docs = db.similarity_search(query)
# print(docs[0])  # uncomment to inspect the top match

# Create QA chain to integrate similarity search with user queries (answer
# the query from the knowledge base). Per the LangChain docs, the "stuff"
# chain type passes all supplied documents to the model in a single prompt.
chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")

# Reuse the documents retrieved above. The original repeated the identical
# query assignment and similarity search here, which cost an extra embedding
# request for the same result.
print(chain.run(input_documents=docs, question=query))
"""# 5. Create chatbot with chat memory (OPTIONAL) """
# from IPython.display import display
# import ipywidgets as widgets
# # Create a conversation chain that uses our vector DB as the retriever; this also allows for chat history management
# qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0.1), db.as_retriever())
# chat_history = []
# def on_submit(_):
# query = input_box.value
# input_box.value = ""
# if query.lower() == 'exit':
# print("Thank you for using the State of the Union chatbot!")
# return
# result = qa({"question": query, "chat_history": chat_history})
# chat_history.append((query, result['answer']))
# display(widgets.HTML(f'<b>User:</b> {query}'))
# display(widgets.HTML(f'<b><font color="blue">Chatbot:</font></b> {result["answer"]}'))
# print("Welcome to the Transformers chatbot! Type 'exit' to stop.")
# input_box = widgets.Text(placeholder='Please enter your question:')
# input_box.on_submit(on_submit)
# display(input_box)