get_org.py
"""Scrape organization listings from rusprofile.ru by activity code and
save each results page as JSON under out/<code>/<page_num>.json."""

import json
import logging
import os
import re
import subprocess
import sys
import time

import requests
from bs4 import BeautifulSoup
from cleantext import clean

base_url = 'https://www.rusprofile.ru/codes/'
org_activity_codes = [
561000,
471100,
562900,
563000,
]
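# The codes above are OKVED activity codes with the dots stripped and
# zero-padded to six digits, matching the main_activity_code value
# computed in process_page (e.g. 56.10 -> 561000).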
logging.basicConfig(filename='log.log', filemode='w', level=logging.INFO)

def process_page(soup):
    """Parse one listing page into a list of organization dicts.

    Returns (orgs, keep_going): keep_going is False once a liquidated
    organization is encountered, signalling the caller to stop paginating.
    """
    orgs = []
    for org in soup.find_all('div', class_='company-item'):
        title = clean(org.find('a').get_text(), lower=True)
        link = org.find('a')['href']
        head_title = clean(org.select_one('.company-item-info>dl>dt').get_text(), lower=True)
        head_name = clean(org.select_one('.company-item-info>dl>dd').get_text(), lower=True)
        address = clean(org.find('address').get_text(), lower=True)
        inn = clean(org.find(string="ИНН").parent.parent.find_next('dd').get_text(), lower=True)
        ogrn = clean(org.find(string="ОГРН").parent.parent.find_next('dd').get_text(), lower=True)
        reg_date = clean(org.find(string="Дата регистрации").parent.parent.find_next('dd').get_text(), lower=True)
        reg_cap_dirty = org.find(string="Уставный капитал")
        reg_cap = clean(reg_cap_dirty.parent.parent.find_next('dd').get_text(), lower=True) if reg_cap_dirty is not None else None
        status = org.select_one('.warning-text, .attention-text')
        if status is not None:
            status = clean(status.get_text(), lower=True)
        main_activity = clean(org.find(string="Основной вид деятельности").parent.parent.find_next('dd').get_text(), lower=True)
        # Normalize e.g. "56.10" to the six-digit form 561000 used in org_activity_codes.
        main_activity_code = int(re.findall(r'\d+', main_activity.replace('.', ''))[0].ljust(6, '0'))
        # clean(..., lower=True) lowercases the text, so the literal must be lowercase too.
        if status == 'организация ликвидирована':
            return (orgs, False)
        orgs.append({
            'title': title,
            'link': link,
            'head_title': head_title,
            'head_name': head_name,
            'inn': inn,
            'ogrn': ogrn,
            'reg_date': reg_date,
            'reg_cap': reg_cap,
            'status': status,
            'main_activity': main_activity,
            'main_activity_code': main_activity_code,
            'address': address,
        })
    return (orgs, True)
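
# A minimal sketch of exercising process_page offline against saved HTML
# ('sample.html' is a hypothetical file; it must contain rusprofile's
# listing markup for the selectors above to match anything):
#
#   with open('sample.html', encoding='utf-8') as f:
#       orgs, keep_going = process_page(BeautifulSoup(f.read(), 'html.parser'))
#   print(len(orgs), keep_going)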

s = requests.Session()
for code in org_activity_codes:
    page_num = 1
    is_next_page = True
    while is_next_page:
        sys.stdout.flush()
        if page_num > 100:
            break
        print(f'Processing {code} page {page_num}.')
        url = base_url + str(code) + '/' + str(page_num)
        # resp = s.get(url, verify=False, cookies={'okved_all': 'yes'})
        resp = s.get(url, verify=False)
        if resp.status_code != 200:
            is_next_page = False
            logging.info(f'[Status not 200.] [STOP] {url}')
            break
        page = resp.text
        soup = BeautifulSoup(page, 'html.parser')
        if soup.find('form', id='checkform'):
            # Captcha page: open it in a browser to solve by hand, then
            # retry the same page number on the next loop iteration.
            logging.info(f'[Captcha.] {url}')
            subprocess.run(['firefox.exe', '-new-instance', url])
            continue
        count_text = soup.find('span', class_='okved-company-tools__label').get_text()
        count = int(re.findall(r'\d+', count_text)[-1])
        print(f'pages {(count // 100) + 1}')  # the site lists 100 results per page
        if page_num > (count // 100) + 1:
            is_next_page = False
            logging.info(f'[Last page.] [STOP] {url}')
            break
        (orgs, is_next_page) = process_page(soup)
        file_name = os.path.join('out', str(code), f'{page_num}.json')
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, 'w', encoding="utf-8") as out_file:
            out_file.write(json.dumps(orgs, indent=3, ensure_ascii=False))
        if not is_next_page:
            logging.info(f'[Found liquidated organization.] [STOP] {url}')
            break
        page_num += 1
        time.sleep(6)
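
# Each out/<code>/<page_num>.json file holds the list of dicts built in
# process_page, with keys: title, link, head_title, head_name, inn, ogrn,
# reg_date, reg_cap, status, main_activity, main_activity_code, address.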