#!/usr/bin/env python3
# coding: utf-8
# File: Complex_sentence.py
# Author: lhy<[email protected],https://huangyong.github.io>
# Date: 18-9-4
import re
import pymongo
import urllib.parse
'''中文复句整理及模板'''
class EventsExtraction:
    """Rule-based extractor for paired connectives in Chinese complex sentences.

    Four connective tables (adversative, sequential, coordinate/choice and
    conditional) are compiled into regexes of the shape
    ``(pre)(.*)(post)(tail)``, where ``pre`` and ``post`` are alternations of
    connective words and ``tail`` runs up to the next clause or sentence
    delimiter. A sentence is then reduced to at most one
    (pre_wd, pre_part, post_wd, post_part) record per table.
    """

    def __init__(self):
        """Build the connective tables and compile one regex per table row."""
        self.but_wds = self.pattern_but()
        self.seq_wds = self.pattern_seq()
        self.condition_wds = self.pattern_condition()
        self.more_wds = self.pattern_more()
        self.but_patterns = self.create_pattern(self.but_wds)
        self.seq_patterns = self.create_pattern(self.seq_wds)
        self.condition_patterns = self.create_pattern(self.condition_wds)
        self.more_patterns = self.create_pattern(self.more_wds)

    def pattern_but(self):
        """Return the adversative / concessive connective table.

        Each row is ``[pre_words, post_words, label]``.
        """
        wds = [
            [['与其'], ['不如'], 'but'],
            [['虽然', '尽管', '虽'], ['但也', '但还', '但却', '但'], 'but'],
            [['虽然', '尽管', '虽'], ['但', '但是也', '但是还', '但是却'], 'but'],
            [['不是'], ['而是'], 'but'],
            [['即使', '就算是'], ['也', '还'], 'but'],
            [['即便'], ['也', '还'], 'but'],
            [['虽然', '即使'], ['但是', '可是', '然而', '仍然', '还是', '也', '但'], 'but'],
            [['虽然', '尽管', '固然'], ['也', '还', '却'], 'but'],
            [['与其', '宁可'], ['决不', '也不', '也要'], 'but'],
            [['与其', '宁肯'], ['决不', '也要', '也不'], 'but'],
            [['与其', '宁愿'], ['也不', '决不', '也要'], 'but'],
            [['虽然', '尽管', '固然'], ['也', '还', '却'], 'but'],
            [['不管', '不论', '无论', '即使'], ['都', '也', '总', '始终', '一直'], 'but'],
            [['虽'], ['可是', '倒', '但', '可', '却', '还是', '但是'], 'but'],
            [['虽然', '纵然', '即使'], ['倒', '还是', '但是', '但', '可是', '可', '却'], 'but'],
            [['虽说'], ['还是', '但', '但是', '可是', '可', '却'], 'but'],
            [['无论'], ['都', '也', '还', '仍然', '总', '始终', '一直'], 'but'],
            [['与其'], ['宁可', '不如', '宁肯', '宁愿'], 'but'],
        ]
        return wds

    def pattern_seq(self):
        """Return the sequential connective table (same row layout)."""
        wds = [
            [['又', '再', '才', '并'], ['进而'], 'sequence'],
            [['首先', '第一'], ['其次', '然后'], 'sequence'],
            [['首先', '先是'], ['再', '又', '还', '才'], 'sequence'],
            [['一方面'], ['另一方面', '又', '也', '还'], 'sequence'],
        ]
        return wds

    def pattern_more(self):
        """Return the coordinate table; it also holds the rows labelled 'choice'."""
        wds = [
            [['不但', '不仅'], ['并且'], 'more'],
            [['不单'], ['而且', '并且', '也', '还'], 'more'],
            [['不但'], ['而且', '并且', '也', '还'], 'more'],
            [['不管'], ['都', '也', '总', '始终', '一直'], 'more'],
            [['不光'], ['而且', '并且', '也', '还'], 'more'],
            [['虽然', '尽管'], ['不过'], 'more'],
            [['不仅'], ['还', '而且', '并且', '也'], 'more'],
            [['不论'], ['还是', '也', '总', '都', '始终', '一直'], 'more'],
            [['不只'], ['而且', '也', '并且', '还'], 'more'],
            [['不但', '不仅', '不光', '不只'], ['而且'], 'more'],
            [['尚且', '都', '也', '又', '更'], ['还', '又'], 'more'],
            [['既然', '既'], ['就', '便', '那', '那么', '也', '还'], 'more'],
            [['无论', '不管', '不论', '或'], ['或'], 'choice'],
            [['或是'], ['或是'], 'choice'],
            [['或者', '无论', '不管', '不论'], ['或者'], 'choice'],
            [['不是'], ['也'], 'choice'],
            [['要么', '或者'], ['要么', '或者'], 'choice'],
        ]
        return wds

    def pattern_condition(self):
        """Return the conditional connective table (same row layout)."""
        wds = [
            [['除非'], ['否则', '才', '不然', '要不'], 'condition'],
            [['除非'], ['否则的话'], 'condition'],
            [['还是', '无论', '不管'], ['还是', '都', '总'], 'condition'],
            [['既然'], ['又', '且', '也', '亦'], 'condition'],
            [['假如'], ['那么', '就', '也', '还'], 'condition'],
            [['假若', '如果'], ['那么', '就', '那', '则', '便'], 'condition'],
            [['假使', '如果'], ['那么', '就', '那', '则', '便'], 'condition'],
            [['尽管', '如果'], ['那么', '就', '那', '则', '便'], 'condition'],
            [['即使', '就是'], ['也', '还是'], 'condition'],
            [['如果', '既然'], ['那么'], 'condition'],
            [['如', '假设'], ['则', '那么', '就', '那'], 'condition'],
            [['如果', '假设'], ['那么', '则', '就', '那'], 'condition'],
            [['万一'], ['那么', '就'], 'condition'],
            [['要是', '如果'], ['就', '那'], 'condition'],
            [['要是', '如果', '假如'], ['那么', '就', '那', '的话'], 'condition'],
            [['一旦'], ['就'], 'condition'],
            [['既然', '假如', '既', '如果'], ['则', '就'], 'condition'],
            [['只要'], ['就', '便', '都', '总'], 'condition'],
            [['只有'], ['才', '还'], 'condition'],
        ]
        return wds

    @staticmethod
    def _alternation(words):
        """Join connective words into a regex alternation, longest word first.

        Regex alternation takes the first alternative that matches, so a short
        word listed before a longer one that starts with it (e.g. '但' before
        '但是也') would make the longer word unreachable. The sort is stable,
        so words of equal length keep their table order.
        """
        ordered = sorted(words, key=len, reverse=True)
        return '|'.join(re.escape(word) for word in ordered)

    def create_pattern(self, wds):
        """Compile each ``[pre_words, post_words, label]`` row into a regex.

        The compiled pattern has four groups: the leading connective, the text
        between the connectives, the trailing connective, and the text after it
        up to the next clause/sentence delimiter. The row label (third element)
        is not used here.

        Returns:
            list of compiled patterns, one per row, in row order.
        """
        patterns = []
        for wd in wds:
            pre = self._alternation(wd[0])
            pos = self._alternation(wd[1])
            # Both ASCII and fullwidth punctuation end the trailing clause.
            # NOTE(review): the character class previously listed every ASCII
            # mark twice, which looks like fullwidth forms lost in transit.
            pattern = re.compile(
                r'({0})(.*)({1})([^??!!。;;::\n\r,,]*)'.format(pre, pos))
            patterns.append(pattern)
        return patterns

    def split_sents(self, content):
        """Split text into sentences and strip ASCII spaces from each one.

        Sentences are cut at question marks, exclamation marks, full stops,
        semicolons, colons (ASCII and fullwidth) and line breaks. Segments
        that are empty once spaces are removed are dropped.
        """
        pieces = re.split(r'[??!!。;;::\n\r]', content)
        stripped = (piece.replace(' ', '') for piece in pieces)
        return [sentence for sentence in stripped if sentence]

    def pattern_match(self, patterns, sent):
        """Return the match whose two connectives are longest in total.

        Every pattern is applied to ``sent``; among all matches the one with
        the greatest ``len(pre_wd) + len(post_wd)`` wins, and the earliest such
        match wins ties.

        Returns:
            dict with keys 'pre_wd', 'pre_part', 'post_wd' and 'post_part '
            (the last key carries a trailing space), or an empty dict when
            nothing matches.
        """
        datas = {}
        best_len = 0
        for p in patterns:
            for res in p.findall(sent):
                len_res = len(res[0]) + len(res[2])
                if len_res > best_len:
                    # NOTE(review): 'post_part ' has a trailing space that looks
                    # like a typo; it is kept because it is part of the returned
                    # (and persisted) record shape.
                    datas = {'pre_wd': res[0], 'pre_part': res[1],
                             'post_wd': res[2], 'post_part ': res[3]}
                    best_len = len_res
        return datas

    def extract_tuples(self, sent):
        """Run all four pattern groups over one sentence.

        Returns:
            (but, condition, sequence, more) results, each a dict as produced
            by ``pattern_match``.
        """
        but_tuples = self.pattern_match(self.but_patterns, sent)
        condition_tuples = self.pattern_match(self.condition_patterns, sent)
        seq_tuples = self.pattern_match(self.seq_patterns, sent)
        more_tuples = self.pattern_match(self.more_patterns, sent)
        return but_tuples, condition_tuples, seq_tuples, more_tuples

    def extract_main(self, content):
        """Extract one typed connective record per matching sentence.

        Returns:
            list of dicts ``{'sent', 'type', 'tuples'}``; sentences without any
            match are omitted.
        """
        datas = []
        for sent in self.split_sents(content):
            but_tuples, condition_tuples, seq_tuples, more_tuples = self.extract_tuples(sent)
            data = {'sent': sent}
            # Categories are applied in this order and each non-empty result
            # overwrites the previous one, so the last matching category wins.
            for label, tuples in (('but', but_tuples),
                                  ('condition', condition_tuples),
                                  ('seq', seq_tuples),
                                  ('more', more_tuples)):
                if tuples:
                    data['type'] = label
                    data['tuples'] = tuples
            if 'type' in data:
                datas.append(data)
        return datas
'''基于给定语料与模板的事件抽取,假定你选择的是Mongodb数据库'''
class TextMining:
    """Run EventsExtraction over documents stored in a MongoDB collection."""

    def __init__(self, mongo_host='127.0.0.1', mongo_port=28017, mongo_db='news',
                 mongo_col='news_data', username='root', password='12345678'):
        """Connect to MongoDB and create the extractor.

        Args:
            mongo_host: database host.
            mongo_port: database port.
            mongo_db: name of the database holding the source collection.
            mongo_col: name of the source collection.
            username: login name; URL-quoted before it goes into the URI.
            password: login password; URL-quoted before it goes into the URI.

        The defaults are the placeholder values this script shipped with;
        pass real settings rather than editing the source.
        """
        quoted_user = urllib.parse.quote_plus(username)
        quoted_password = urllib.parse.quote_plus(password)
        client = pymongo.MongoClient(
            'mongodb://{}:{}@{}:{}'.format(quoted_user, quoted_password, mongo_host, mongo_port))
        self.db = client[mongo_db]
        self.col = self.db[mongo_col]
        self.extractor = EventsExtraction()

    def process_mongonews(self):
        """Extract events from every document and store the non-empty results.

        Each source document is read for its 'content' and 'url' fields. When
        extraction yields anything, ``{'url': ..., 'data': ...}`` is inserted
        into the 'event_extract' collection of the same database. A failure on
        one document is printed and does not stop the batch; a progress count
        is printed every 10000 documents.
        """
        count = 0
        for item in self.col.find():
            count += 1
            try:
                # Field lookups sit inside the try so that a document missing
                # 'content' or 'url' is reported and skipped, not fatal.
                content = item['content']
                url = item['url']
                datas = self.extractor.extract_main(content)
                if datas:
                    data = {'url': url, 'data': datas}
                    # Collection.insert() was removed in PyMongo 4.
                    self.db['event_extract'].insert_one(data)
            except Exception as e:
                print(e)
            if count % 10000 == 0:
                print(count)
if __name__ == '__main__':
    # Script entry point: run the batch extraction once over the collection.
    TextMining().process_mongonews()