# cnpq_produtividade_list.py
#
# Downloads a CNPq result page, reads each listed researcher and level, and
# prints them, flagging names that do not appear in the CSIndex author list.
from bs4 import BeautifulSoup
from urllib.request import (urlopen, urlparse, urlunparse, urlretrieve)
import csv
from difflib import SequenceMatcher
import heapq
import unidecode
def similar(a, b):
    """Return the similarity ratio between two sequences.

    Thin wrapper around ``difflib.SequenceMatcher`` with no junk filter.

    Args:
        a: First sequence (a name string in this script).
        b: Second sequence to compare against ``a``.

    Returns:
        The value of ``SequenceMatcher(None, a, b).ratio()``: a float in
        ``[0.0, 1.0]`` where ``1.0`` means the sequences are identical.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
# --- Filesystem layout ------------------------------------------------------
# Root of the data directory, relative to the current working directory.
DATA_DIR = "../data"

# --- Input: researcher names ------------------------------------------------
# Text file with the researcher names known to CSIndex.
profs = "{0}/profs/all-authors.csv".format(DATA_DIR)

# --- Input: CNPq result page ------------------------------------------------
# Page the researcher list is downloaded from.
URL = "http://plsql1.cnpq.br/divulg/RESULTADO_PQ_102003.prc_comp_cmt_links?V_COD_DEMANDA=200310&V_TPO_RESULT=CURSO&V_COD_AREA_CONHEC=10300007&V_COD_CMT_ASSESSOR=CC"

# --- Name reconciliation ----------------------------------------------------
# CNPq publishes complete names while CSIndex uses shortened ones; this map is
# meant to hold CNPq-name -> CSIndex-name overrides. It starts out empty.
cnpq_to_csindex_name_map = dict()
def main():
    """Print every researcher listed on the CNPq page together with the level.

    Reads the CSIndex author names from ``profs`` (one name per line),
    downloads and parses the page at ``URL``, and prints one
    ``"<name>, <level>"`` line per distinct name found in the result table.

    When a CNPq name has no exact match in the CSIndex list, the printed name
    is replaced by ``"<name> => <candidates>"`` where ``<candidates>`` is the
    list of up to five ``(similarity, csindex_name)`` tuples, best first,
    restricted to CSIndex names sharing the same first name. This is meant to
    draw attention to entries that need manual reconciliation.

    Raises:
        OSError: If the author file cannot be read or the URL cannot be
            fetched (``urllib.error.URLError`` is a subclass).
        IndexError: If the page does not contain the expected nested tables.
    """
    # Load the researcher names as known to CSIndex; the context manager
    # guarantees the file handle is released.
    with open(profs) as authors_file:
        csindexnames = [line.rstrip('\n') for line in authors_file]
    # Exact-match lookups happen once per table row, so use a set for them.
    known_names = set(csindexnames)

    # Load the CNPq data and close the HTTP response once it has been parsed.
    with urlopen(URL) as response:
        soup = BeautifulSoup(response, 'lxml')
    # The result rows live in the second table nested inside the fourth
    # top-level table of the page.
    table = soup.find_all('table')[3].find_all('table')[1]

    # Printed name -> level; later rows overwrite earlier ones with the same key.
    levels = {}
    for row in table.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) < 2:
            continue

        # CNPq data includes accents and CSIndex does not, so strip them
        # before any comparison.
        authorName = unidecode.unidecode(columns[0].get_text())
        # Skip header cells. The comparison is made against the *unaccented*
        # text, so the header must be spelled "Inicio" here ("Início" could
        # never match after unidecode).
        if authorName in ("Nome", "Inicio"):
            continue
        level = columns[1].get_text()

        # TODO: check if first + last name is an exact match!
        if authorName not in known_names:
            # Rank the CSIndex names that share the same first name by
            # similarity to the CNPq name.
            first_name = authorName.split(" ")[0]
            candidates = [
                (similar(authorName, name), name)
                for name in csindexnames
                if name.split(" ")[0] == first_name
            ]
            # Make the mismatch visible in the output so it gets fixed.
            authorName = "{0} => {1}".format(
                authorName, heapq.nlargest(5, candidates))

        levels[authorName] = level

    for key, value in levels.items():
        print("{0}, {1}".format(key, value))
if __name__ == "__main__":
    # Run the report only when this file is executed as a script, not when it
    # is imported.
    # Manual sanity check of the name matcher, kept from earlier scratch code:
    #   print(similar('Adenilso da Silva Simao', 'Adenilso Simao'))
    main()