-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdblp_2.py
118 lines (101 loc) · 4.33 KB
/
dblp_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import random
import time

import requests
from pymongo import MongoClient
from pymongo.server_api import ServerApi
# MongoDB connection setup.
# The connection string may be supplied through the MONGODB_URI environment
# variable so that credentials do not have to live in source control; the
# literal below is only the fallback used when that variable is unset.
# NOTE(review): the fallback still embeds a username/password -- rotate that
# credential and drop the literal once every environment sets MONGODB_URI.
uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://KTAP8:[email protected]/?retryWrites=true&w=majority&appName=DsdeData",
)
client = MongoClient(uri, server_api=ServerApi('1'))
db = client['DsdeData']  # Database name
collection = db['scraped']  # Collection that stores the scraped documents
# Fetch data from the DBLP API with retry and exponential backoff
def fetch_dblp_data(query, max_results=10, max_retries=5, timeout=10):
    """Query the DBLP publication search API and return the decoded JSON.

    Args:
        query: Search string sent as the ``q`` request parameter.
        max_results: Value sent as the ``h`` request parameter.
        max_retries: Maximum number of request attempts. Only rate-limit
            responses (HTTP 429) and network-level errors are retried.
        timeout: Seconds to wait for the server on each attempt before the
            attempt counts as failed.

    Returns:
        The parsed JSON body when the server answers with HTTP 200, or
        ``None`` when the body cannot be decoded, the server answers with
        any status other than 200/429, or every attempt was used up.
    """
    base_url = "https://dblp.org/search/publ/api"
    params = {
        "q": query,
        "h": max_results,
        "format": "json",
    }
    delay = 1  # Initial delay in seconds; doubled after every failed attempt

    for attempt in range(1, max_retries + 1):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(base_url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts are treated like a rate limit:
            # back off and try again instead of crashing the caller.
            print(f"Request failed ({exc}). Retrying...")
        else:
            if response.status_code == 200:
                try:
                    return response.json()
                except ValueError:
                    # A 200 with an undecodable body is not worth retrying.
                    print("Failed to decode JSON response.")
                    return None
            if response.status_code != 429:
                print(f"Failed to fetch data. HTTP Status code: {response.status_code}")
                return None
            print("Rate limit hit. Retrying...")

        # Sleep only when another attempt will actually follow.
        if attempt < max_retries:
            time.sleep(delay)
            delay *= 2  # Exponential backoff

    print("Max retries exceeded.")
    return None
# Search terms sent to the DBLP API, one of which is picked per request
queries = [
    "Agricultural",
    "Bio",
    "Arts",
    "Human",
    "Business",
    "Management",
    "Account",
    "Chemical Engineering",
    "Computer",
    "Decision",
    "Dentist",
    "Earth",
    "Economic",
    "Finance",
    "Energy",
    "Engineering",
    "Environment",
    "Health",
    "Immun",
    "Material science",
    "Math",
    "Medicine",
    "Neuroscience",
    "Nursing",
    "Pharma",
    "Physic",
    "Psychology",
    "Social",
    "Veterinary",
    "Multidisciplinary",
]
# Subject code -> search terms that belong to that subject
subject_map = {
    "AGRI": ["Agricultural", "Bio"],
    "ARTS": ["Arts"],
    "BIOC": ["Human"],
    "BUSI": ["Business", "Management", "Account"],
    "CENG": ["Chemical Engineering"],
    "CHEM": ["Chem"],
    "COMP": ["Computer"],
    "DECI": ["Decision"],
    "DENT": ["Dentist"],
    "EART": ["Earth"],
    "ECON": ["Economic", "Finance"],
    "ENER": ["Energy"],
    "ENGI": ["Engineering"],
    "ENVI": ["Environment"],
    "HEAL": ["Health"],
    "IMMU": ["Immun"],
    "MATE": ["Material science"],
    "MATH": ["Math"],
    "MEDI": ["Medicine"],
    "NEUR": ["Neuroscience"],
    "NURS": ["Nursing"],
    "PHAR": ["Pharma"],
    "PHYS": ["Physic"],
    "PSYC": ["Psychology"],
    "SOCI": ["Social"],
    "VETE": ["Veterinary"],
    "MULT": ["Multidisciplinary"],
}


def map_subject(value):
    """Return the subject code whose term list contains ``value``.

    The first matching code in ``subject_map`` order wins. When no term
    list contains ``value``, ``value`` itself is returned unchanged.
    """
    matching_codes = (
        code for code, terms in subject_map.items() if value in terms
    )
    return next(matching_codes, value)
def drop_scrape():
    """Drop the 'scraped' collection from the database and print a notice."""
    scraped_collection = db['scraped']
    scraped_collection.drop()
    print("The 'scraped' collection has been dropped.")
# Fetch and insert data in batches
_TARGET_DOCUMENTS = 1000  # keep scraping until at least this many are stored
_HITS_PER_REQUEST = 25  # passed to fetch_dblp_data as max_results
_REQUEST_DELAY_SECONDS = 2  # pause between consecutive API requests
_MAX_UNPRODUCTIVE_ATTEMPTS = 20  # consecutive empty rounds before giving up


def _extract_hits(dblp_data):
    """Return the hit list from a DBLP response, or [] if it is absent."""
    try:
        return dblp_data['result']['hits']['hit']
    except (KeyError, TypeError):
        # TypeError covers dblp_data being None (fetch failed) or a level of
        # the response not being a mapping.
        return []


def _author_names(info):
    """Return the author names found in a hit's ``info`` mapping."""
    authors = (info.get('authors') or {}).get('author', [])
    if not isinstance(authors, list):
        # A non-list value is treated as one single author entry.
        authors = [authors]
    names = []
    for author in authors:
        name = author.get('text') if isinstance(author, dict) else author
        if name:
            names.append(name)
    return names


def _build_document(hit, query):
    """Turn one DBLP hit into the document to store, or None if incomplete.

    A hit is considered incomplete when its title, venue or year is missing
    or empty.
    """
    info = hit.get('info') or {}
    title = info.get('title')
    venue = info.get('venue')
    year = info.get('year')
    if not (title and venue and year):
        return None
    return {
        "title": title,
        "authors": _author_names(info),
        "year": year,
        "date": None,
        "language": {"@xml:lang": "eng"},
        "authkeywords": {"author-keyword": venue},
        "subject_areas": {"subject-area": map_subject(query)},
    }


# NOTE(review): every request for a given term asks for the first
# _HITS_PER_REQUEST hits with no offset, so picking the same term twice
# presumably inserts the same documents again -- confirm and de-duplicate
# if unique documents are required.
results = []
unproductive_attempts = 0
while len(results) < _TARGET_DOCUMENTS:
    query = random.choice(queries)
    dblp_data = fetch_dblp_data(query, max_results=_HITS_PER_REQUEST)

    batch = []
    for hit in _extract_hits(dblp_data):
        document = _build_document(hit, query)
        if document is not None:
            batch.append(document)

    # Insert batch into collection
    if batch:
        collection.insert_many(batch)
        results.extend(batch)
        print(f'Inserted batch of {len(batch)} documents.')
        unproductive_attempts = 0
    else:
        # Without this cap the loop would spin forever when the API is
        # unreachable or keeps returning nothing usable.
        unproductive_attempts += 1
        if unproductive_attempts >= _MAX_UNPRODUCTIVE_ATTEMPTS:
            print(
                f"Stopping early: {unproductive_attempts} consecutive "
                "requests produced no documents."
            )
            break

    # Delay between requests to prevent rate limits
    time.sleep(_REQUEST_DELAY_SECONDS)

if len(results) >= _TARGET_DOCUMENTS:
    print("Data insertion complete!")
print(len(results))