-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_text_and_save_index.py
121 lines (91 loc) · 3.36 KB
/
extract_text_and_save_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pandas as pd
from tqdm import tqdm
import pdfplumber
from langchain.document_loaders import DirectoryLoader
from langchain.schema import Document
# splits
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import NLTKTextSplitter
# prompts
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
# vector stores
from langchain.vectorstores import FAISS
# models
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
# retrievers
from langchain.chains import RetrievalQA
### Open pdf file and extract text including tables
# pdfplumber (rather than e.g. PyPDF) is used because it exposes per-character
# font metadata, which the filters below use to separate body text from
# headers/footers/figure labels, and it can detect tables.
pdf_reader = pdfplumber.open("books/Cambridge IGCSE and O Level Computer Science.pdf")
# Page intervals (1-based, inclusive) of the main chapters to keep; pages
# outside these ranges (front matter, chapter openers, index, ...) are skipped.
included_pages_intervals = [[14, 52],
                            [57, 82],
                            [87, 155],
                            [159, 188],
                            [192, 225],
                            [229, 264],
                            [270, 306],
                            [311, 348],
                            [351, 365],
                            [368, 393]]
# Flatten the intervals into one list of 1-based page numbers. A single
# comprehension replaces the original accumulate-with-`+` loop, which rebuilt
# the whole list on every iteration (accidentally quadratic).
included_pages = [page
                  for start, end in included_pages_intervals
                  for page in range(start, end + 1)]
def include_page(page_number):
    """Return True if the 0-based *page_number* should be extracted.

    ``included_pages`` holds 1-based page numbers (pdfplumber enumerates
    pages from 0), hence the +1 shift before the membership test.
    """
    # Return the boolean test directly instead of the if/else True/False dance.
    return page_number + 1 in included_pages
parts = []  # scratch buffer; reset in extract_pages but never otherwise used


def include_text(obj):
    """pdfplumber character filter: keep only main body text.

    Characters with font size >= 10 are the body copy; smaller text
    (figures, tables, headers/footers, captions) is excluded.
    """
    # Single boolean expression; evaluates to False when 'size' is absent,
    # exactly matching the original if/else's return values.
    return 'size' in obj and obj['size'] >= 10
def extract_single_page(page):
    """Extract body text and non-empty tables from one pdfplumber page.

    Returns the size-filtered page text followed by each non-empty table
    rendered as an HTML snippet, separated by blank lines.
    """
    f_page = page.filter(include_text)
    # BUG FIX: extract_text() returns None when the filter leaves no
    # characters on the page; fall back to '' so the concatenation in the
    # return statement cannot raise TypeError.
    text = f_page.extract_text() or ''
    table_text = ''
    for table in page.find_tables():
        table_df = pd.DataFrame.from_records(table.extract())
        # Skip tables whose every cell is '' or null (detected-but-empty tables).
        blank_cells = (table_df == '').values.sum() + table_df.isnull().values.sum()
        if blank_cells < table_df.size:  # .size == rows * cols
            table_text += '\n\n' + table_df.to_html(header=False, index=False)
    return text + '\n\n' + table_text
def extract_pages(pdf_reader, source, page_offset=-11):
    """Extract every included page of *pdf_reader* into langchain Documents.

    Parameters:
        pdf_reader: open pdfplumber PDF for the source book.
        source: string stored in each Document's metadata under "source".
        page_offset: int added to the 0-based pdf page index so the stored
            "page" metadata matches the book's printed numbering
            (default -11, this book's front-matter offset — previously a
            hard-coded constant).

    Returns:
        list of Document objects, one per included page.
    """
    documents = []
    for page_number, page in tqdm(enumerate(pdf_reader.pages), total=len(pdf_reader.pages)):
        if include_page(page_number):
            documents.append(Document(
                page_content=extract_single_page(page),
                metadata={"source": source, "page": page_number + page_offset},
            ))
            # The original reset the module-level `parts` list here via
            # `global parts; parts = []` — a no-op, since `parts` is never
            # appended to anywhere; the dead code has been removed.
    return documents
# Run the extraction over the whole book and report how many pages survived
# the inclusion filter.
documents = extract_pages(pdf_reader, "Cambridge IGCSE and O Level Computer Science.pdf")
print(f'pages extracted: {len(documents)}')
### Split text
# text_splitter = NLTKTextSplitter()
# texts = text_splitter.split_documents(documents)
# print(f'We have created {len(texts)} chunks from {len(documents)} pages')
# print('done')
### Create vector embeddings
### download embeddings model
# NOTE(review): HuggingFaceInstructEmbeddings is normally paired with an
# 'hkunlp/instructor-*' checkpoint; 'all-MiniLM-L6-v2' is a plain
# sentence-transformers model — confirm this pairing is intentional.
embeddings = HuggingFaceInstructEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={"device": "cpu"},
)
# Embed every extracted page and build an in-memory FAISS index over them.
vectordb = FAISS.from_documents(documents=documents, embedding=embeddings)
### Persist embeddings in vector database
vectordb.save_local("faiss_index_hp")