-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path05_parse_post2013.py
133 lines (110 loc) · 5.88 KB
/
05_parse_post2013.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import glob
import os
import queue
import re
import time
from multiprocessing import Queue
from threading import Lock, Thread
from xml.etree import ElementTree as ET

import requests
from lxml import html

import config
_folder_post_parse2013 = config._POST2013 # folder with the 2013 posts
def write_user_to_xml(d_user_code, d_user_name):
    """Record one unique forum user for the later XML export.

    Nothing is written to disk here: a ``[user_code, profile_link,
    user_name]`` row is appended to the module-level ``_UUE_Array``.

    Args:
        d_user_code: user id; it is also embedded in the profile link.
        d_user_name: display name stored alongside the id.
    """
    profile_link = f'https://www.roller.ru/forum/memberlist.php?mode=viewprofile&u={d_user_code}'
    _UUE_Array.append([d_user_code, profile_link, d_user_name])
def write_post_user_to_xml(d_user_code, d_user_name, d_post_code):
    """Record one "+1" (a user who took part in a ride topic) for the export.

    Nothing is written to disk here: a ``[user_code, user_name,
    profile_link, post_code, topic_link]`` row is appended to the
    module-level ``_PU_Array``.

    Args:
        d_user_code: user id; it is also embedded in the profile link.
        d_user_name: display name of the user.
        d_post_code: topic id; it is also embedded in the topic link.
    """
    profile_link = f'https://www.roller.ru/forum/memberlist.php?mode=viewprofile&u={d_user_code}'
    topic_link = f'https://www.roller.ru/forum/viewtopic.php?f=3&t={d_post_code}'
    row = [d_user_code, d_user_name, profile_link, d_post_code, topic_link]
    _PU_Array.append(row)
# Guards the check-then-append on the shared ``_user_unique`` list: without it
# two worker threads can both see a user as "new" and register it twice.
_user_unique_lock = Lock()


def parse_post(dw_post_uq):
    """Collect the users that posted in one topic.

    Reads every ``*.html`` file in ``_folder_post_parse2013`` whose name
    starts with ``dw_post_uq`` and looks at each ``table.tablebg`` element
    as a candidate message. For every message author it

    * registers the user once globally (``_user_unique`` plus
      ``write_user_to_xml``), and
    * registers the user once for this topic (``write_post_user_to_xml``).

    Tables without an author link or without a ``div.postbody`` are not
    treated as messages and are skipped instead of raising ``IndexError``.

    Args:
        dw_post_uq: key of the form ``"<post>_<user>"``; the part before the
            first underscore is stored as the post code.
    """
    _files = glob.glob(os.path.join(_folder_post_parse2013, f'{dw_post_uq}*.html'))
    post_code = dw_post_uq.split('_')[0]
    users_in_post = set()  # users already recorded for this topic

    for _file in _files:
        print(f'{_file}, {work_queue.qsize()}')
        with open(_file, 'r', encoding='UTF-8') as fileR:
            _str = fileR.read()
        if not _str.strip():
            # html.fromstring() raises on an empty document.
            continue

        _main_tree = html.fromstring(_str)
        for _message in _main_tree.xpath('.//table[@class="tablebg"]'):
            author_links = _message.xpath('.//tr/th/noindex/a')
            if not author_links or not _message.xpath('.//div[@class="postbody"]'):
                continue  # not a message table

            author = author_links[0]
            href = author.get('href')
            if not href:
                continue  # no profile link, so no user id to extract

            d_user_code = href.replace('./memberlist.php?mode=viewprofile&u=', '')
            d_user_code = d_user_code.split('&')[0]
            # .text is None when the link has no direct text; XML attribute
            # values must be strings, so fall back to an empty name.
            d_user_name = author.text or ''

            with _user_unique_lock:
                if d_user_code not in _user_unique:
                    _user_unique.append(d_user_code)
                    write_user_to_xml(d_user_code, d_user_name)

            if d_user_code not in users_in_post:
                users_in_post.add(d_user_code)
                write_post_user_to_xml(d_user_code, d_user_name, post_code)
def process(dw_TheadName):
    """Worker loop: take post keys from ``work_queue`` and parse each one.

    The loop ends once no item arrives within the timeout. A plain
    ``while not work_queue.empty(): work_queue.get()`` is unsafe here:
    another worker can take the last item between the check and the
    blocking ``get()``, leaving this thread (and the ``join()`` waiting on
    it) stuck forever. ``empty()`` on a ``multiprocessing.Queue`` is also
    not reliable.

    Args:
        dw_TheadName: label used only in the progress output.
    """
    while True:
        try:
            # The short timeout also covers the small delay before items
            # put on a multiprocessing queue become visible to readers.
            _post_uq = work_queue.get(timeout=1)
        except queue.Empty:
            break
        print(f'THEAD {_post_uq} - {dw_TheadName}')
        parse_post(_post_uq)
def main():
    """Run the worker threads, then export the collected rows to XML.

    Starts 50 threads running ``process`` and waits for all of them. The
    rows gathered in ``_UUE_Array`` (unique users) and ``_PU_Array``
    (user-per-post records) are then written as two ``<reference>``
    sections to ``full_data_2013.xml`` inside ``config._POSTPROCESS``.
    """
    list_th = [Thread(target=process, args=[f'Thead{i}']) for i in range(50)]
    for th in list_th:
        th.start()
    for th in list_th:
        th.join()

    def _text(value):
        # ElementTree refuses to serialize None attribute values; write an
        # empty string rather than losing the whole export at the last step.
        return '' if value is None else value

    # os.path.join keeps the output path valid on any platform.
    _export_xml_file = os.path.join(config._POSTPROCESS, 'full_data_2013.xml')

    _xml_root = ET.Element("root")
    _xml_references = ET.SubElement(_xml_root, "references")

    # ---- unique users ----
    users_ref = ET.SubElement(_xml_references, "reference")
    users_ref.set("reference_name", "user")
    for user_code, user_link, user_name in _UUE_Array:
        item = ET.SubElement(users_ref, "item")
        item.set("user_code", _text(user_code))
        item.set("user_link", _text(user_link))
        item.set("user_name", _text(user_name))

    # ---- ride participants (one row per user per post) ----
    post_users_ref = ET.SubElement(_xml_references, "reference")
    post_users_ref.set("reference_name", "post_user")
    for user_code, user_name, user_link, post_code, post_link in _PU_Array:
        item = ET.SubElement(post_users_ref, "item")
        item.set("user_code", _text(user_code))
        item.set("user_name", _text(user_name))
        item.set("user_link", _text(user_link))
        item.set("post_code", _text(post_code))
        item.set("post_link", _text(post_link))

    ET.ElementTree(_xml_root).write(_export_xml_file)
if __name__ == "__main__":
    # Shared state filled by the worker threads and read by main().
    _user_unique = []
    _UUE_Array = []
    _PU_Array = []

    # File names look like 48341_83391_0.html, 48341_83391_1.html, ...
    # Reduce them to unique "<post>_<leader>" keys (48341_83391). Dict keys
    # keep first-seen order and make the duplicate check O(1).
    _post_keys = {}
    for _file in os.listdir(_folder_post_parse2013):
        _name_parts = _file.split('.')[0].split('_')
        if len(_name_parts) < 2:
            # A stray entry that does not follow the naming scheme; indexing
            # its second part would raise IndexError.
            continue
        _post_keys[f"{_name_parts[0]}_{_name_parts[1]}"] = None
    _list_post = list(_post_keys)

    work_queue = Queue()
    for _post in _list_post:
        work_queue.put(_post)
    main()