main.py
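"""Iterative arXiv research assistant.

Each turn searches arXiv for the current search term, asks GPT to choose the most
promising paper, downloads and summarizes it, and extracts a new search term from
the summary to seed the next turn.
"""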
import requests
from gpt_tools import GPTChat
from arxiv_tools import get_arxiv_papers
from pdf_tools import extract_text_from_pdf
import re
from termcolor import colored
import os
# Constants
MODEL = "gpt-4o"
MAX_RESULTS = 10
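# Maximum number of characters of extracted PDF text passed to the summarizer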
MAX_TEXT_LENGTH = 100000
NUMBER_OF_TURNS = 3
INITIAL_SEARCH_TERM = "coding ability of large language models"
PAPERS_DIR = "research_papers"
def sanitize_folder_name(name, max_length=30):
"""Sanitize folder name and limit length"""
# Remove invalid characters and replace with underscore
sanitized = re.sub(r'[\\/*?:"<>|]', "_", name)
# Replace spaces with underscores
sanitized = sanitized.replace(" ", "_")
# Limit length while keeping whole words
if len(sanitized) > max_length:
sanitized = sanitized[:max_length].rsplit('_', 1)[0]
return sanitized
def create_papers_directory(search_term):
"""Create directory for papers based on search term"""
folder_name = sanitize_folder_name(search_term)
folder_path = os.path.join(PAPERS_DIR, folder_name)
try:
os.makedirs(folder_path, exist_ok=True)
print(colored(f"✓ Created directory: {folder_path}", "green"))
return folder_path
except Exception as e:
print(colored(f"✗ Error creating directory: {str(e)}", "red"))
raise
def download_pdf(url, filename, folder_path):
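    """Download the PDF at url into folder_path and return the saved file path."""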
    try:
        print(colored(f"→ Downloading PDF from: {url}", "yellow"))
        response = requests.get(url, timeout=60)  # timeout so a stalled download doesn't hang the run
        response.raise_for_status()
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(colored(f"✓ PDF downloaded successfully: {file_path}", "green"))
        return file_path
    except requests.exceptions.RequestException as e:
        error_msg = f"✗ Error downloading PDF: {str(e)}"
        print(colored(error_msg, "red"))
        raise

def main(search_term, chosen_paper):
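    """Run one research turn: fetch arXiv papers for search_term, let GPT pick one
    (skipping the previously chosen paper), download and summarize it, and return
    the next search term along with the chosen paper."""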
    try:
        print(colored("\n=== Starting New Research Iteration ===", "cyan"))
        print(colored(f"→ Current search term: {search_term}", "yellow"))
        # Create directory for this search term
        papers_folder = create_papers_directory(search_term)
        # Initialize GPT chat model
        gpt_chooser = GPTChat(
            "You are a helpful assistant that chooses the most interesting academic paper. "
            "Pick the paper you find most useful, interesting, and promising, briefly explain why "
            "(do not use any numbers in your explanation), then return the number of the paper you chose.",
            model=MODEL
        )
        # Get papers from ArXiv
        papers = get_arxiv_papers(search_term, max_results=MAX_RESULTS)
        if not papers:
            print(colored("✗ No papers found. Please try again.", "red"))
            return search_term, chosen_paper
        # Print titles and remove previously chosen paper
        for paper in papers[:]:
            if chosen_paper and chosen_paper.title == paper.title:
                papers.remove(paper)
            else:
                print(colored(f"• {paper.title}", "cyan"))
        # Get GPT's choice
        summaries = str({i: paper.summary for i, paper in enumerate(papers)})
        print(colored("\n→ Asking GPT to choose a paper...", "yellow"))
        reason_and_number = gpt_chooser.get_completion(summaries)
        # Parse the chosen paper number
        try:
            chosen_number = int(re.findall(r'\d+', reason_and_number)[0])
            chosen_paper = papers[chosen_number]
        except (IndexError, ValueError) as e:
            print(colored(f"✗ Error parsing GPT's choice: {str(e)}", "red"))
            return search_term, chosen_paper
        # Download and process the chosen paper
        valid_filename = re.sub(r'[\\/*?:"<>|]', "_", chosen_paper.title)
        pdf_file_path = download_pdf(chosen_paper.pdf_url, f"{valid_filename}.pdf", papers_folder)
        # Extract and truncate the paper text
        text = extract_text_from_pdf(pdf_file_path)
        text = text[:MAX_TEXT_LENGTH]
        # Get GPT's summary
        print(colored("\n→ Getting paper summary from GPT...", "yellow"))
        gpt_summarizer = GPTChat(
            "You are a helpful assistant that summarizes academic papers. "
            "Summarize the paper in a few sentences. Also return a new, promising search term "
            "for future research. Return the search term as Search term: <search term>.",
            model=MODEL
        )
        summary = gpt_summarizer.get_completion(text)
        # Save summary to a text file
        summary_file = os.path.join(papers_folder, f"{valid_filename}_summary.txt")
        try:
            with open(summary_file, 'w', encoding='utf-8') as f:
                f.write(f"Paper Title: {chosen_paper.title}\n\n")
                f.write(f"Summary:\n{summary}")
            print(colored(f"✓ Summary saved to: {summary_file}", "green"))
        except Exception as e:
            print(colored(f"✗ Error saving summary: {str(e)}", "red"))
        # Extract new search term
        try:
            match = re.search(r'Search term:(.*)', summary, re.IGNORECASE)
            if match:
                search_term = match.group(1).strip().replace('"', '')
                print(colored(f"\n✓ New search term: {search_term}", "green"))
            else:
                print(colored("✗ Could not extract new search term", "red"))
        except Exception as e:
            print(colored(f"✗ Error extracting search term: {str(e)}", "red"))
        return search_term, chosen_paper
    except Exception as e:
        print(colored(f"✗ Error in main process: {str(e)}", "red"))
        return search_term, chosen_paper

if __name__ == "__main__":
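    # Set up the papers directory, then run NUMBER_OF_TURNS research iterations,
    # feeding each turn's suggested search term into the next.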
print(colored("=== GPT-4 Research Assistant ===", "cyan"))
print(colored("→ Initializing...", "yellow"))
# Create main research papers directory
try:
os.makedirs(PAPERS_DIR, exist_ok=True)
print(colored(f"✓ Created main papers directory: {PAPERS_DIR}", "green"))
except Exception as e:
print(colored(f"✗ Error creating main directory: {str(e)}", "red"))
raise
chosen_paper = None
search_term = INITIAL_SEARCH_TERM
for i in range(NUMBER_OF_TURNS):
print(colored(f"\n=== Research Turn {i+1}/{NUMBER_OF_TURNS} ===", "cyan"))
search_term, chosen_paper = main(search_term, chosen_paper)