# PDF Wizard — chat with multiple PDFs using Google Gemini + FAISS.
#
# NOTE(review): the original change set committed three artifacts that should
# not be in version control: a `.env` file (even with a placeholder key), a
# byte-for-byte duplicate of this script under `faiss_index/app.py`, and the
# generated FAISS index binaries (`index.faiss` / `index.pkl`). This module
# consolidates both script copies into one; the secret/generated files belong
# in `.gitignore`, not in the repository.

import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    # Fail fast at startup: every feature below needs the Gemini key.
    raise ValueError(
        "GOOGLE_API_KEY environment variable is missing. Please add it to your .env file."
    )
genai.configure(api_key=api_key)


def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of every uploaded PDF.

    Args:
        pdf_docs: Iterable of uploaded file-like objects (may be None/empty).

    Returns:
        The combined text of all readable pages; "" when nothing was uploaded.
    """
    if not pdf_docs:
        return ""
    text = ""
    for pdf in pdf_docs:
        try:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                # extract_text() may return None (e.g. image-only pages);
                # the original `text += page.extract_text()` would raise
                # TypeError in that case.
                text += page.extract_text() or ""
        except Exception as e:
            # Report the bad file but keep processing the rest of the batch
            # (this realizes the try/except the original left commented out).
            st.error(f"Error reading PDF '{getattr(pdf, 'name', pdf)}': {e}")
    return text


def get_text_chunks(text):
    """Split raw text into overlapping chunks suitable for embedding.

    Chunk size / overlap are tuned for long-context embedding models;
    adjust as needed.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return text_splitter.split_text(text)


def get_vector_store(text_chunks, embedding_model="models/embedding-001", store_dir="faiss_index"):
    """Create and save a FAISS vector store from text chunks.

    Args:
        text_chunks: List of text chunks to embed.
        embedding_model: Name of the Google embedding model to use.
        store_dir: Directory to persist the vector store in.

    Returns:
        None. Errors are surfaced through the Streamlit UI instead of raised.
    """
    if not text_chunks:
        st.warning("No text to process. Please check the PDF content.")
        return

    try:
        embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)
        vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
        # exist_ok avoids the racy "check then create" of the original.
        os.makedirs(store_dir, exist_ok=True)
        vector_store.save_local(store_dir)
    except Exception as e:
        st.error(f"Error creating vector store: {str(e)}")


def get_conversational_chain():
    """Build a 'stuff' QA chain backed by Gemini with a grounded-answer prompt.

    The prompt instructs the model to answer only from the supplied context
    and to say so explicitly when the answer is not present.
    """
    prompt_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """
    model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.3)
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    return load_qa_chain(model, chain_type="stuff", prompt=prompt)


def user_input(user_question):
    """Answer `user_question` from the persisted FAISS index and render the reply.

    Flow: load index -> similarity search -> QA chain -> write answer to UI.
    """
    # Guard before loading: give a friendly hint instead of a stack trace.
    if not os.path.exists("faiss_index/index.faiss"):
        st.error("FAISS index file not found. Please process the PDF files first.")
        return

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # The index is produced locally by get_vector_store(), so deserializing its
    # pickle metadata is safe here; recent langchain versions refuse to load
    # without this flag (the original first copy omitted it and would raise).
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()
    response = chain(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True,
    )
    st.write("Reply: ", response["output_text"])


def main():
    """Streamlit entry point: question box plus sidebar upload-and-process flow."""
    st.set_page_config("PDF Wizard")
    st.header("Chat with multiple PDFs📄")

    user_question = st.text_input("📎Ask a Question from the PDF Files")
    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader(
            "Upload your PDF Files and Click on the Submit & Process Button",
            accept_multiple_files=True,
        )
        if st.button("Submit & Process"):
            with st.spinner("Processing..."):
                raw_text = get_pdf_text(pdf_docs)
                text_chunks = get_text_chunks(raw_text)
                get_vector_store(text_chunks)
                st.success("Done")


if __name__ == "__main__":
    main()
index 0000000000..acb0530c37 --- /dev/null +++ b/Generative-AI/PDF Wizard/readme_pdf_wiz.md @@ -0,0 +1,92 @@ +# PDF Wizard + + +📌 Overview +----------- + +PDF Wizard is an AI-powered Streamlit web application that allows users to chat with multiple PDFs. It processes uploaded PDF files, extracts their text, converts them into embeddings, stores them in a FAISS vector database, and answers user queries based on the stored content using Google's Gemini AI. + +🚀 Features +----------- + +* 📂 Upload multiple PDF files +* 🧠 AI-powered chatbot using Google Gemini API +* 🔍 Search and retrieve context-based answers +* 📚 FAISS vector database for efficient document searching +* ⚡ Streamlit-based interactive UI + + +🛠️ Tech Stack +-------------- + +* Python +* Streamlit +* FAISS +* Google Generative AI (Gemini API) +* PyPDF2 +* LangChain +* dotenv + + +📦 Installation +--------------- +1. Clone the Repo +``` +git clone https://github.com/UTSAVS26/PyVerse.git +cd Generative-AI +cd "PDF Wizard" + ``` +2. Create and activate a virtual environment +``` +python -m venv venv +``` +``` +source venv/bin/activate # For macOS/Linux +venv\Scripts\activate # For Windows +``` +3. Install dependencies +`pip install -r requirements.txt` + +4. **Set Up Environment Variables** + + * Create a .env file in the project root. + * GOOGLE\_API\_KEY= **your\_google\_api\_key** + + +🎯 Usage +-------- + +1. Run the application +`streamlit run app.py` + +2. **Upload PDF files** via the sidebar. + +3. **Ask questions** based on the uploaded PDFs. 
+ +📂 ScreenShots +--------------------- +![alt text](image.png) + +📂 Project Structure +-------------------- +``` +PDF Wizard +│-- faiss_index/ # Directory to store FAISS vectors +│-- app.py # Main Streamlit app +│-- requirements.txt # Required dependencies +│-- .env # API keys and environment variables +│-- README.md # Project documentation +``` + +🌟 Acknowledgments +------------------ + +* [Streamlit](https://streamlit.io/) +* [FAISS](https://faiss.ai/) +* [Google Generative AI](https://ai.google.dev/) + +👤 Contributor +---------------- +- **Name:** Arnab Ghosh +- **GitHub:** [tulu-g559](https://github.com/tulu-g559) +- **Contact:** [email](mailto:garnab559@gmail.com) \ No newline at end of file diff --git a/Generative-AI/PDF Wizard/requirements.txt b/Generative-AI/PDF Wizard/requirements.txt new file mode 100644 index 0000000000..d6ecaadf59 --- /dev/null +++ b/Generative-AI/PDF Wizard/requirements.txt @@ -0,0 +1,7 @@ +streamlit +google-generativeai +python-dotenv +langchain +PyPDF2 +faiss-cpu +langchain_google_genai