# helperfunctions.py
# forked from dennisfelczy/OpenAI_Document_Analyzer
import os
from langchain.embeddings.openai import OpenAIEmbeddings
#from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
import azure.cognitiveservices.speech as speechsdk
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
import json
import shutil
import streamlit as st
import ftfy
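# Note: the Azure Speech helpers below expect the environment variables SPEECH_KEY and SPEECH_REGION
# (loaded from the .env file in main); the OpenAI embedding/LLM calls assume that the Azure OpenAI
# connection settings are configured elsewhere in the app.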
def resetpage():
    st.session_state.startpage=1
    st.session_state.endpage=len(st.session_state.pagecontent)
# refresh topic list from directory names in topics folder
def refresh_topic_list():
    directory_list = list()
    for root, dirs, files in os.walk("projects/"+st.session_state.project+"/topics/", topdown=False):
        for name in dirs:
            directory_list.append(os.path.join(name))
    st.session_state.topic_list = directory_list
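# create a new topic folder containing empty questions.txt, queries.txt and ground_truth.txt files, then select it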
def add_topic(topicname):
    print('adding topic to '+st.session_state.project)
    if os.path.isdir("projects/"+st.session_state.project+"/topics/"+topicname):
        st.error("Topic already exists")
    else:
        os.mkdir("projects/"+st.session_state.project+"/topics/"+topicname)
        #write questions.txt to dir
        with open("projects/"+st.session_state.project+"/topics/"+topicname+"/questions.txt", "w") as f:
            pass
        #write queries.txt to dir
        with open("projects/"+st.session_state.project+"/topics/"+topicname+"/queries.txt", "w") as f:
            pass
        #write ground_truth.txt to dir
        with open("projects/"+st.session_state.project+"/topics/"+topicname+"/ground_truth.txt", "w") as f:
            pass
        refresh_topic_list()
        st.session_state.topic=topicname
        load_topic()
def delete_topic(topicname):
    if os.path.isdir("projects/"+st.session_state.project+"/topics/"+topicname):
        #delete directory
        shutil.rmtree("projects/"+st.session_state.project+"/topics/"+topicname, ignore_errors=True)
        refresh_topic_list()
        if(len(st.session_state.topic_list)>0):
            st.session_state.topic=st.session_state.topic_list[0]
            load_topic()
def refresh_project_list():
    if 'project_list' in st.session_state:
        del st.session_state.project_list
    dirs = [entry.path for entry in os.scandir('projects') if entry.is_dir()]
    #strip the "projects\" (or "projects/") prefix so only the directory name remains
    for i in range(len(dirs)):
        dirs[i]=dirs[i].replace("projects\\","").replace("projects/","")
    st.session_state.project_list = dirs
    if(len(dirs)>0):
        st.session_state.project=dirs[0]
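# each project lives under projects/<name>/ with the subfolders files/, faiss/ and topics/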
def add_project(projectname):
    if os.path.isdir("projects/"+projectname):
        st.error("Project already exists")
    else:
        os.mkdir("projects/"+projectname)
        #create the project subfolders
        os.mkdir("projects/"+projectname+"/files")
        os.mkdir("projects/"+projectname+"/faiss")
        os.mkdir("projects/"+projectname+"/topics")
        refresh_project_list()
        st.session_state.project=projectname
        loadproject()
def delete_project(projectname):
    if os.path.isdir("projects/"+projectname):
        #delete directory
        shutil.rmtree("projects/"+projectname, ignore_errors=True)
        refresh_project_list()
        if(len(st.session_state.project_list)>0):
            st.session_state.project=st.session_state.project_list[0]
            loadproject()
# refresh vector index list from directory names in faiss folder
def refresh_vector_index_list():
    directory_list = list()
    for root, dirs, files in os.walk("projects/"+st.session_state.project+"/faiss/", topdown=False):
        for name in dirs:
            directory_list.append(os.path.join(name))
    st.session_state.vector_index_list = directory_list
def setquestion():
    if(st.session_state['question']!="-"):
        st.session_state['questioninput']=st.session_state['question']
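# wrap a system prompt and the user question as LangChain chat messages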
def getmessages(systemessage,question):
    from langchain.schema import (
        HumanMessage,
        SystemMessage
    )
    messages = [
        SystemMessage(content=systemessage),
        HumanMessage(content=question)
    ]
    return messages
# Speech to Text with Azure Cognitive Services
def recognize_from_microphone(target):
    # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION" which are loaded from the .env file in main
    speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), region=os.environ.get('SPEECH_REGION'))
    speech_config.speech_recognition_language=st.session_state.language
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    st.info("Speak into your microphone.")
    speech_recognition_result = speech_recognizer.recognize_once_async().get()
    if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(speech_recognition_result.text))
        if target=="question":
            add_question(speech_recognition_result.text)
            st.session_state.question=speech_recognition_result.text
        else:
            add_query(speech_recognition_result.text)
            st.session_state.query=speech_recognition_result.text
    elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
    elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_recognition_result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Did you set the speech resource key and region values?")
def askquestion():
    if 'context' not in st.session_state:
        st.warning('No context available. Please run a query first')
    else:
        if 'vs' in st.session_state: # the vector store exists (user uploaded, split and embedded a file)
            #fix text encoding issues (mojibake) in the answer, e.g. misencoded currency symbols
            st.session_state.answer=ftfy.fix_encoding(askwithcontext(st.session_state.question))
        else:
            st.error('Please select a document first')
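# count tokens using the tiktoken encoding for gpt-3.5-turbo; used to cap the context size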
def gettokens(text):
    import tiktoken
    enc = tiktoken.encoding_for_model('gpt-3.5-turbo')
    return len(enc.encode(text))
# Text to Speech with Azure Cognitive Services
# This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
def synthesize_text(text):
    speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), region=os.environ.get('SPEECH_REGION'))
    audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
    # The language of the voice that speaks.
    if st.session_state.language=="de-DE":
        speech_config.speech_synthesis_voice_name='de-DE-KatjaNeural'
    else:
        speech_config.speech_synthesis_voice_name='en-US-JennyNeural'
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
    if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text")
    elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_synthesis_result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            if cancellation_details.error_details:
                print("Error details: {}".format(cancellation_details.error_details))
                print("Did you set the speech resource key and region values?")
def load_embeddings():
    embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002", chunk_size=16)
    st.session_state.vs = FAISS.load_local("projects/"+st.session_state.project+"/faiss/"+st.session_state.vector_index_name, embeddings)
    st.session_state.document_name = st.session_state.vector_index_name
    st.success(st.session_state.vector_index_name+' loaded successfully.')
    contentjsonfile="projects/"+st.session_state.project+"/files/"+st.session_state.vector_index_name+".pagecontent.json"
    with open(contentjsonfile, encoding='utf-8') as json_file:
        st.session_state.pagecontent = json.load(json_file)
    #st.success('Pagecontent '+ st.session_state.vector_index_name+' loaded successfully.')
    tablemdfile="projects/"+st.session_state.project+"/files/"+st.session_state.vector_index_name+".tables.md"
    with open(tablemdfile, encoding='utf-8') as table_file:
        st.session_state.tables = table_file.read()
    fullmdfile="projects/"+st.session_state.project+"/files/"+st.session_state.vector_index_name+".md"
    with open(fullmdfile, encoding='utf-8') as fullmd_file:
        st.session_state.fullmd = fullmd_file.read()
    keyvaluesjsonfile="projects/"+st.session_state.project+"/files/"+st.session_state.vector_index_name+".keyvalues.json"
    with open(keyvaluesjsonfile, encoding='utf-8') as json_file:
        st.session_state.keyvalues = json.load(json_file)
    #st.success('Tables from '+ st.session_state.vector_index_name+' loaded successfully.')
    st.session_state.startpage=1
    resetpage()
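# ground_truth.txt stores one line per document in the form "<document>;<page>,<page>,..."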
def getgroundtruthpages():
    groundtruthpages = []
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/ground_truth.txt') as f:
        for line in f:
            if line.split(";")[0] == st.session_state.vector_index_name:
                groundtruthpages=line.split(";")[1].split(",")
                for i in range(len(groundtruthpages)):
                    groundtruthpages[i]=int(groundtruthpages[i])
    #print(groundtruthpages)
    return groundtruthpages
def setgroundtruthpages():
    newgroundtruthpages = st.session_state.ground_truth
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/ground_truth.txt','r') as f:
        lines = f.readlines()
    addline=True
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/ground_truth.txt','w') as f:
        for line in lines:
            if line.split(";")[0] == st.session_state.vector_index_name:
                addline=False
                #replace the existing entry for this document (drop it if the new value is empty)
                if newgroundtruthpages!="\n":
                    f.write(st.session_state.vector_index_name+";"+newgroundtruthpages+"\n")
            else:
                f.write(line)
    if addline:
        with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/ground_truth.txt','a') as f:
            f.write(st.session_state.vector_index_name+";"+newgroundtruthpages+"\n")
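# run a similarity search against the FAISS store, collect the source pages of the top-k chunks,
# concatenate whole pages into a context of at most ~3500 tokens and compare the found pages
# against the ground-truth pages (if any are defined)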
def getcontext():
    if st.session_state.query!="-":
        vector_store = st.session_state.vs
        query = st.session_state.query
        groundtruthpages=getgroundtruthpages()
        pagecontent=st.session_state.pagecontent
        k=st.session_state.k
        from langchain.embeddings.openai import OpenAIEmbeddings
        embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002", chunk_size=16)
        pagechecker=[]
        print("Query: ",query)
        result= vector_store.similarity_search_with_score(query=query, k=k, embeddings=embeddings, return_metadata=True)
        pages=[]
        context=""
        querycontent=[]
        queryscores=[]
        querypages=[]
        for r in result:
            querycontent.append(r[0].page_content)
            queryscores.append(r[1])
            querypages.append(str(",".join(str(x) for x in r[0].metadata['pages'])))
            for pagenr in r[0].metadata['pages']:
                if pagenr not in pages:
                    pages.append(pagenr)
        sortedpages=pages.copy()
        sortedpages.sort()
        for p in sortedpages:
            currenttokens=gettokens(context)
            if gettokens(context+pagecontent[str(p)])<=3500:
                context+=pagecontent[str(p)]
            else:
                st.info("Warning !!! Skipping page "+str(p)+" as the context is already "+str(currenttokens)+" tokens")
                print("Warning !!! Skipping page "+str(p)+" as the context is already "+str(currenttokens)+" tokens")
        st.session_state.context=context
        st.session_state.sourcepages=pages
        st.session_state.querycontent=querycontent
        st.session_state.queryscores=queryscores
        st.session_state.querypages=querypages
        if len(groundtruthpages)>0:
            st.info("expected pages: "+str(",".join(str(x) for x in groundtruthpages))+" - found pages: "+str(",".join(str(x) for x in pages))+" with "+str(k)+" Sources (k) and a context size of "+str(gettokens(context))+" tokens")
        else:
            st.info("found pages: "+str(",".join(str(x) for x in pages))+" with "+str(k)+" Sources (k) and a context size of "+str(gettokens(context))+" tokens")
        print("expected pages: ",groundtruthpages," - found pages: ",pages," with ",k," Sources (k) and a context size of ",gettokens(context)," tokens")
        st.session_state.reducedpages=str(len(pages))+" of "+str(len(st.session_state.pagecontent))+" pages ("+str((len(pages)/len(st.session_state.pagecontent))*100)+"%)"
        allpagesfound=True
        if len(groundtruthpages)>0:
            for gtp in groundtruthpages:
                if gtp not in pages:
                    allpagesfound=False
                    st.warning("missing page: "+str(gtp)+" - try a higher k value or refine the query")
                    print("missing page: ",gtp)
            if allpagesfound:
                st.success("all pages found. Document reduced from "+str(len(st.session_state.pagecontent))+" to "+str(len(pages))+" pages ("+str((1-(len(pages)/len(st.session_state.pagecontent)))*100)+"%)")
                print("all pages found")
                st.session_state.questioninput=st.session_state.question
        else:
            st.info("No pages defined for ground truth check")
def askwithcontext(question):
    from langchain.llms import AzureOpenAI
    from langchain.prompts import PromptTemplate
    from langchain.chat_models import AzureChatOpenAI
    vector_store = st.session_state.vs
    context=st.session_state.context
    t=st.session_state.t
    #text-davinci-003 (completion model)
    if st.session_state.model=="text-davinci-003":
        prompttemplate = PromptTemplate(
            input_variables=["context", "question"],
            template="""Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. If you use information which was not part of the context, remove it from the answer.
Context:
{context}
Question: {question}
Helpful Answer:"""
        )
        prompt=prompttemplate.format(context=context, question=question)
        llm = AzureOpenAI(deployment_name='text-davinci-003', temperature=t)
        answer=llm(prompt)
    #gpt-35-turbo (chat model)
    if st.session_state.model=="gpt-35-turbo":
        prompttemplate = PromptTemplate(
            input_variables=["context"],
            template="""Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. If you use information which was not part of the context, remove it from the answer.
Context:
{context}
"""
        )
        prompt=prompttemplate.format(context=context)
        chat=AzureChatOpenAI(deployment_name='gpt-35-turbo', temperature=t)
        answer=chat(getmessages(prompt,question)).content
    return answer
def delete_query():
    queryname=st.session_state.query
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/queries.txt', 'r') as f:
        lines = f.readlines()
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/queries.txt', 'w') as f:
        for line in lines:
            if line.strip("\n") != queryname:
                f.write(line)
                #print(line)
    load_topic(False)
    if 'context' in st.session_state:
        del st.session_state.context
def add_query(queryname):
    #add query to end of queries.txt
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/queries.txt', 'a') as f:
        f.write(queryname+"\n")
    load_topic(False)
def delete_question():
    questionname=st.session_state.question
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/questions.txt', 'r') as f:
        lines = f.readlines()
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/questions.txt', 'w') as f:
        for line in lines:
            if line.strip("\n") != questionname:
                f.write(line)
                #print(line)
    load_topic(False)
    if 'answer' in st.session_state:
        del st.session_state.answer
def add_question(questionname):
    #add question to end of questions.txt
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/questions.txt', 'a') as f:
        f.write(questionname+"\n")
    load_topic(False)
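# read the current topic's queries.txt and questions.txt into session state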
def load_topic(reset=True):
    if reset:
        if 'context' in st.session_state:
            del st.session_state.context
        if 'answer' in st.session_state:
            del st.session_state.answer
    #open queries.txt and add all lines to a list
    query_list = []
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/queries.txt') as f:
        for line in f:
            query_list.append(line.strip())
    st.session_state.query_list = query_list
    #open questions.txt and add all lines to a list
    question_list = []
    with open("projects/"+st.session_state.project+'/topics/'+st.session_state.topic+'/questions.txt') as f:
        for line in f:
            question_list.append(line.strip())
    st.session_state.question_list = question_list
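# clear document- and topic-related session state, then reload the vector index and topic lists
# for the selected project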
def loadproject():
    if 'context' in st.session_state:
        del st.session_state.context
    if 'answer' in st.session_state:
        del st.session_state.answer
    if 'vector_index_list' in st.session_state:
        del st.session_state.vector_index_list
    if 'topic_list' in st.session_state:
        del st.session_state.topic_list
    if 'vector_index_name' in st.session_state:
        del st.session_state.vector_index_name
    refresh_vector_index_list()
    refresh_topic_list()
    if len(st.session_state.topic_list)>0:
        st.session_state.topic=st.session_state.topic_list[0]
        load_topic()
    #load_embeddings()