forked from ffliu6/ChiLingFeat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractor.py
482 lines (417 loc) · 21.4 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
# -*- coding: UTF-8 -*-
"""
Software: LingFeat - Comprehensive Linguistic Features for Readability Assessment
Page: extractor.py
License: CC-BY-SA 4.0
Original Author: Bruce W. Lee (이웅성) @brucewlee
Affiliation 1: LXPER AI, Seoul, South Korea
Affiliation 2: University of Pennsylvania, PA, USA
Contributing Author: -
Affiliation : -
"""
# utilities
import math
import os
import warnings
import logging
import tqdm
def nop(it, *_args, **_kwargs):
    """Return the first argument unchanged, ignoring every other argument."""
    return it
# Swap tqdm's progress-bar wrapper for the pass-through `nop`, so any later
# call to tqdm.tqdm(iterable, ...) simply gets the iterable back.
# NOTE(review): presumably done to silence progress bars from the libraries
# imported below - confirm.
tqdm.tqdm = nop
from utils import nan_check
# performance-central dependencies
import spacy
# import supar
from supar import Parser
# discourse features
import _Discourse.EnDF as Disco_EnDF
import _Discourse.EnGF as Disco_EnGF
# syntactic features
import _Syntactic.POSF as Synta_POSF
import _Syntactic.PhrF as Synta_PhrF
import _Syntactic.TrSF as Synta_TrSF
# lexico-Semantic features
import _LexicoSemantic.TTRF as LxSem_TTRF
import _LexicoSemantic.VarF as LxSem_VarF
# shallow features
import _ShallowTraditional.ShaF as ShaTr_ShaF
# Silence all warnings from this point on (the filter is installed globally).
warnings.filterwarnings("ignore")
# Directory that contains this file.
dir_path = os.path.dirname(os.path.realpath(__file__))
# Load the models once, at import time; the pass_text methods below use these
# module-level objects (NLP, SuPar, stanfordnlp).
# Former English spaCy pipeline, kept for reference:
#NLP = spacy.load('en_core_web_sm')
NLP=spacy.load('zh_core_web_trf')
# Former English constituency parser, kept for reference:
#SuPar = Parser.load('crf-con-en')
SuPar=Parser.load('crf-con-zh')
from stanfordcorenlp import StanfordCoreNLP
# Alternative set-up pointing at a local CoreNLP distribution directory (disabled):
# stanfordnlp = StanfordCoreNLP('/Users/fred6/codes/PhD/Research/2022/ARR/stanford-corenlp-full-2018-02-27', lang='zh')
# Chinese CoreNLP client addressed at http://localhost, port 9000.
# NOTE(review): presumably a CoreNLP server must already be listening there - confirm.
stanfordnlp = StanfordCoreNLP('http://localhost', port=9000, lang='zh')
class pass_text:
"""
Initialize pipeline
input :
- text: original input text to analyze
saves :
- self.origin_doc: the original input text, unchanged
- self.NLP_doc: spacy pipeline object
- self.depends: syntactical dependency tree by stanfordcoreNLP
"""
def __init__(self, text: str):
    """
    Parse `text` and keep both the raw string and the parser outputs.

    saves :
    - self.NLP_doc: result of calling the module-level spaCy pipeline NLP on text
    - self.origin_doc: the text exactly as passed in
    - self.depends: result of stanfordnlp.dependency_parse(text)
    """
    spacy_doc = NLP(text)
    self.NLP_doc, self.origin_doc = spacy_doc, text
    self.depends = stanfordnlp.dependency_parse(text)
"""
Preprocess given text, count tokens & sentences
** throughout this program, only n_token and n_sent are defaulted at 1 to prevent division error
input :
- short (default False): include short words of under 3 characters
- see_token (default False): return token_list
- see_sent_token (default False): return sent_token_list
saves :
- self.n_token
- self.n_sent
- self.token_list: counted (non-punctuation) tokens as plain text, no lemmatization; lower-cased only when short is False
- self.sent_token_list: token list, no lemmatization, list of list in sentence
output:
- n_token
- n_sent
- token_list (optional)
- sent_token_list (optional)
"""
def preprocess(self, short=False, see_token=False, see_sent_token=False):
    """
    Count tokens and sentences of self.NLP_doc and build the token lists.

    input :
    - short (default False): when true, every non-punctuation token is counted
      and stored as-is; when false, only tokens whose text has 3 or more
      characters are counted, and they are stored lower-cased
    - see_token (default False): add token_list to the result under "token"
    - see_sent_token (default False): add sent_token_list under "sent_token"
    saves :
    - self.n_token, self.n_sent, self.token_list, self.sent_token_list
    output (type -> dictionary):
    - "n_token", "n_sent", plus the optional "token" / "sent_token" entries
    """
    # Chinese punctuation list. The characters must be the full-width forms:
    # with half-width forms the embedded '"' closes the literal after the
    # first four characters and the rest silently becomes a comment.
    ChiPuns = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."

    # Both counters start at 1 to prevent division errors downstream. The
    # initial 1 is never subtracted again, so the stored values are one higher
    # than the number of tokens / sentences actually seen.
    n_token = 1
    n_sent = 1
    token_list = []
    sent_token_list = []

    for sent in self.NLP_doc.sents:
        n_sent += 1
        temp_list = []  # punctuation tokens of the current sentence
        for token in sent:
            text = token.text
            if text in ChiPuns:
                temp_list.append(text)
            elif short:
                n_token += 1
                token_list.append(text)
            elif len(text) >= 3:
                n_token += 1
                token_list.append(text.lower())
        # Only sentences with more than one punctuation token produce a row.
        if len(temp_list) > 1:
            # NOTE(review): token_list is the running list for the whole
            # document, so the row receives every counted token seen so far,
            # not just this sentence's tokens. Kept as-is - confirm intent.
            temp_list.extend(token_list)
            sent_token_list.append(temp_list)

    self.n_token = n_token
    self.n_sent = n_sent
    self.token_list = token_list
    self.sent_token_list = sent_token_list

    result = {"n_token": self.n_token,
              "n_sent": self.n_sent,
              }
    if see_token:
        result["token"] = token_list
    if see_sent_token:
        result["sent_token"] = sent_token_list
    return result
"""
Shallow Features -> 35
output (type -> dictionary):
- as_fewStroke_C: Percentage of characters containing 1 to 10 strokes per sentence
- as_moderateStroke_C: Percentage of characters containing 11 to 20 strokes per sentence
- as_highStroke_C: Percentage of characters containing over 20 strokes per sentence
- at_Stroke_C: Average number of strokes per word
- as_lowhsk_C: Percentage of HSK1 to HSK3-characters per sentence
- as_moderatehsk_C: Percentage of HSK4 to HSK5-characters per sentence
- as_highhsk_C: Percentage of HSK6-characters per sentence
- as_nonhsk_C: Percentage of not-HSK-characters per sentence
- to_fewStroke_C: Total number of characters containing 1 to 10 strokes
- to_moderateStroke_C: Total number of characters containing 11 to 20 strokes
- as_Chara_C: average count of characters per sentence
- at_Chara_C: average count of characters per token
- ats_Chara_C: Average number of characters per word per sentence
- ats_Utoken_C: Average number of characters per unique word per sentence
- as_TwoWord_C: Number of two-character words per sentence
- Per_TwoWord_C: Percentage of two-character words per sentence
- as_ThreeWord_C: Number of three-character words per sentence
- Per_ThreeWord_C: Percentage of three-character words per sentence
- as_FourWord_C: Number of four-character words per sentence
- Per_FourWord_C: Percentage of four-character words per sentence
- as_upFiveWord_C: Number of five-up-character words per sentence
- Per_upFiveWord_C: Percentage of five-up-character words per sentence
- as_lowhsk_W: Percentage of HSK1 to HSK3-words per sentence
- as_moderatehsk_W: Percentage of HSK4 to HSK5-words per sentence
- as_highhsk_W: Percentage of HSK6-words per sentence
- as_nonhsk_W: Percentage of Not-HSK-words per sentence
- as_firstcommon_W: Percentage of words in 1-1000 Mandarin frequency list (most-common) per sentence
- as_secondcommon_W: Percentage of words in 2-2000 Mandarin frequency list (second-most-common) per sentence
- as_allcommon_W: Percentage of words in 1-3000 Mandarin frequency list (all most-common) per sentence
- TokSenM_S: total count of tokens x total count of sentence
- TokSenS_S: sqrt(total count of tokens x total count of sentence)
- TokSenL_S: log(total count of tokens)/log(total count of sentence)
- as_Token_C: average count of tokens per sentence
- as_MultiWord: Number of multi-character words per sentence
- at_DiffWord: Proportion of difficult words, as according to mandarin frequency lists, divided by the total number of words
"""
def ShaF_(self):
    """
    Collect the shallow features for the analysed text.

    Passes the original text together with the token lists and counts stored
    by preprocess() to ShaTr_ShaF.retrieve (threshold_diff fixed at 3000), so
    preprocess() has to be called first.

    output: the retrieve() result after it has been passed through nan_check
    """
    shallow = ShaTr_ShaF.retrieve(
        self.origin_doc,
        self.token_list,
        self.sent_token_list,
        self.n_token,
        self.n_sent,
        threshold_diff=3000,
    )
    return nan_check(shallow)
"""
Part-of-Speech Features -> 55 -> 132 - 4 -> 128
output (type -> dictionary):
- to_NoTag_C: total count of Noun POS tags
- as_NoTag_C: average count of Noun POS tags per sentence
- at_NoTag_C: average count of Noun POS tags per token
- ra_NoAjT_C: ratio of Noun POS count to Adjective POS count
- ra_NoVeT_C: ratio of Noun POS count to Verb POS count
- ra_NoAvT_C: ratio of Noun POS count to Adverb POS count
- ra_NoSuT_C: ratio of Noun POS count to Subordinating Conjunction count
- ra_NoCoT_C: ratio of Noun POS count to Coordinating Conjunction count
- to_VeTag_C: total count of Verb POS tags
- as_VeTag_C: average count of Verb POS tags per sentence
- at_VeTag_C: average count of Verb POS tags per token
- ra_VeAjT_C: ratio of Verb POS count to Adjective POS count
- ra_VeNoT_C: ratio of Verb POS count to Noun POS count
- ra_VeAvT_C: ratio of Verb POS count to Adverb POS count
- ra_VeSuT_C: ratio of Verb POS count to Subordinating Conjunction count
- ra_VeCoT_C: ratio of Verb POS count to Coordinating Conjunction count
- to_AjTag_C: total count of Adjective POS tags
- as_AjTag_C: average count of Adjective POS tags per sentence
- at_AjTag_C: average count of Adjective POS tags per token
- ra_AjNoT_C: ratio of Adjective POS count to Noun POS count
- ra_AjVeT_C: ratio of Adjective POS count to Verb POS count
- ra_AjAvT_C: ratio of Adjective POS count to Adverb POS count
- ra_AjSuT_C: ratio of Adjective POS count to Subordinating Conjunction count
- ra_AjCoT_C: ratio of Adjective POS count to Coordinating Conjunction count
- to_AvTag_C: total count of Adverb POS tags
- as_AvTag_C: average count of Adverb POS tags per sentence
- at_AvTag_C: average count of Adverb POS tags per token
- ra_AvAjT_C: ratio of Adverb POS count to Adjective POS count
- ra_AvNoT_C: ratio of Adverb POS count to Noun POS count
- ra_AvVeT_C: ratio of Adverb POS count to Verb POS count
- ra_AvSuT_C: ratio of Adverb POS count to Subordinating Conjunction count
- ra_AvCoT_C: ratio of Adverb POS count to Coordinating Conjunction count
- to_SuTag_C: total count of Subordinating Conjunction POS tags
- as_SuTag_C: average count of Subordinating Conjunction POS tags per sentence
- at_SuTag_C: average count of Subordinating Conjunction POS tags per token
- ra_SuAjT_C: ratio of Subordinating Conjunction POS count to Adjective POS count
- ra_SuNoT_C: ratio of Subordinating Conjunction POS count to Noun POS count
- ra_SuVeT_C: ratio of Subordinating Conjunction POS count to Verb POS count
- ra_SuAvT_C: ratio of Subordinating Conjunction POS count to Adverb POS count
- ra_SuCoT_C: ratio of Subordinating Conjunction POS count to Coordinating Conjunction count
- to_CoTag_C: total count of Coordinating Conjunction POS tags
- as_CoTag_C: average count of Coordinating Conjunction POS tags per sentence
- at_CoTag_C: average count of Coordinating Conjunction POS tags per token
- ra_CoAjT_C: ratio of Coordinating Conjunction POS count to Adjective POS count
- ra_CoNoT_C: ratio of Coordinating Conjunction POS count to Noun POS count
- ra_CoVeT_C: ratio of Coordinating Conjunction POS count to Verb POS count
- ra_CoAvT_C: ratio of Coordinating Conjunction POS count to Adverb POS count
- ra_CoSuT_C: ratio of Coordinating Conjunction POS count to Subordinating Conjunction count
- to_ContW_C: total count of Content words
- as_ContW_C: average count of Content words per sentence
- at_ContW_C: average count of Content words per token
- to_FuncW_C: total count of Function words
- as_FuncW_C: average count of Function words per sentence
- at_FuncW_C: average count of Function words per token
- ra_CoFuW_C: ratio of Content words to Function words
- at_UAdj_C: Percentage of unique adjectives per sentence
- as_UAdj_C: Number of unique adjectives per sentence
- at_UFunction_C: Percentage of unique functional words per sentence
- as_UFunction_C: Number of unique functional words per sentence
- at_UVerb_C: Percentage of unique verbs per sentence
- as_UVerb_C: Number of unique verbs per sentence
- at_UNoun_C: Percentage of unique nouns per sentence
- as_UNoun_C: Number of unique nouns per sentence
- at_UContent_C: Percentage of unique content words per sentence
- as_UContent_C: Number of unique content words per sentence
- at_UAdverb_C: Percentage of unique adverbs per sentence
- as_UAdverb_C: Number of unique adverbs per sentence
- to_NoPhr_C: total count of Noun phrases
- as_NoPhr_C: average count of Noun phrases per sentence
- at_NoPhr_C: average count of Noun phrases per token
- ra_NoVeP_C: ratio of Noun phrases count to Verb phrases count
- ra_NoSuP_C: ratio of Noun phrases count to Subordinate Clauses count
- ra_NoPrP_C: ratio of Noun phrases count to Prep phrases count
- ra_NoAjP_C: ratio of Noun phrases count to Adj phrases count
- ra_NoAvP_C: ratio of Noun phrases count to Adv phrases count
- to_VePhr_C: total count of Verb phrases
- as_VePhr_C: average count of Verb phrases per sentence
- at_VePhr_C: average count of Verb phrases per token
- ra_VeNoP_C: ratio of Verb phrases count to Noun phrases count
- ra_VeSuP_C: ratio of Verb phrases count to Subordinate Clauses count
- ra_VePrP_C: ratio of Verb phrases count to Prep phrases count
- ra_VeAjP_C: ratio of Verb phrases count to Adj phrases count
- ra_VeAvP_C: ratio of Verb phrases count to Adv phrases count
- to_SuPhr_C: total count of Subordinate Clauses
- as_SuPhr_C: average count of Subordinate Clauses per sentence
- at_SuPhr_C: average count of Subordinate Clauses per token
- ra_SuNoP_C: ratio of Subordinate Clauses count to Noun phrases count
- ra_SuVeP_C: ratio of Subordinate Clauses count to Verb phrases count
- ra_SuPrP_C: ratio of Subordinate Clauses count to Prep phrases count
- ra_SuAjP_C: ratio of Subordinate Clauses count to Adj phrases count
- ra_SuAvP_C: ratio of Subordinate Clauses count to Adv phrases count
- to_PrPhr_C: total count of prepositional phrases
- as_PrPhr_C: average count of prepositional phrases per sentence
- at_PrPhr_C: average count of prepositional phrases per token
- ra_PrNoP_C: ratio of Prep phrases count to Noun phrases count
- ra_PrVeP_C: ratio of Prep phrases count to Verb phrases count
- ra_PrSuP_C: ratio of Prep phrases count to Subordinate Clauses count
- ra_PrAjP_C: ratio of Prep phrases count to Adj phrases count
- ra_PrAvP_C: ratio of Prep phrases count to Adv phrases count
- to_AjPhr_C: total count of Adjective phrases
- as_AjPhr_C: average count of Adjective phrases per sentence
- at_AjPhr_C: average count of Adjective phrases per token
- ra_AjNoP_C: ratio of Adj phrases count to Noun phrases count
- ra_AjVeP_C: ratio of Adj phrases count to Verb phrases count
- ra_AjSuP_C: ratio of Adj phrases count to Subordinate Clauses count
- ra_AjPrP_C: ratio of Adj phrases count to Prep phrases count
- ra_AjAvP_C: ratio of Adj phrases count to Adv phrases count
- to_AvPhr_C: total count of Adverb phrases
- as_AvPhr_C: average count of Adverb phrases per sentence
- at_AvPhr_C: average count of Adverb phrases per token
- ra_AvNoP_C: ratio of Adv phrases count to Noun phrases count
- ra_AvVeP_C: ratio of Adv phrases count to Verb phrases count
- ra_AvSuP_C: ratio of Adv phrases count to Subordinate Clauses count
- ra_AvPrP_C: ratio of Adv phrases count to Prep phrases count
- ra_AvAjP_C: ratio of Adv phrases count to Adj phrases count
- as_LenNPhrase_C: Average length of noun phrases per sentence
- as_LenVPhrase_C: Average length of verbal phrases per sentence
- as_LenPrePhrase_C: Average length of prepositional phrases per sentence
- to_TreeH_C: total Tree height of all sentences
- as_TreeH_C: average Tree height per sentence
- at_TreeH_C: average Tree height per token (word)
- to_FTree_C: total length of flattened Trees
- as_FTree_C: average length of flattened Trees per sentence
- at_FTree_C: average length of flattened Trees per token (word)
- as_DisDepend: Average dependency distance per sentence
- max_DisDepend: Maximum dependency distance per sentence
- to_DisDepend_C: Total number of dependency distances per sentence
- as_DisDepend_C: Average number of dependency distances per sentence
"""
def POSF_(self):
    """
    Collect the part-of-speech features for the analysed text.

    Passes the spaCy document, the CoreNLP dependency parse and the counts
    stored by preprocess() to Synta_POSF.retrieve, so preprocess() has to be
    called first.

    output: the retrieve() result after it has been passed through nan_check
    """
    pos_features = Synta_POSF.retrieve(
        self.NLP_doc,
        self.depends,
        self.n_token,
        self.n_sent,
    )
    return nan_check(pos_features)
def PhrF_(self):
    """
    Collect the phrasal features for the analysed text.

    Passes the module-level SuPar parser plus the sentence token lists and
    counts stored by preprocess() to Synta_PhrF.retrieve, so preprocess() has
    to be called first.

    output: the retrieve() result after it has been passed through nan_check
    """
    phrase_features = Synta_PhrF.retrieve(
        SuPar,
        self.sent_token_list,
        self.n_token,
        self.n_sent,
    )
    return nan_check(phrase_features)
def TrSF_(self):
    """
    Collect the tree-structure features for the analysed text.

    Passes the module-level SuPar parser plus the sentence token lists and
    counts stored by preprocess() to Synta_TrSF.retrieve, so preprocess() has
    to be called first.

    output: the retrieve() result after it has been passed through nan_check
    """
    tree_features = Synta_TrSF.retrieve(
        SuPar,
        self.sent_token_list,
        self.n_token,
        self.n_sent,
    )
    return nan_check(tree_features)
"""
Extract Entity Density Features -> 27 -> 56
output (type -> dictionary):
- to_EntiM_C: total number of named Entities Mentions counts
- as_EntiM_C: average number of named Entities Mentions counts per sentence
- at_EntiM_C: average number of named Entities Mentions counts per token (word)
- to_UEnti_C: total number of unique named Entities
- as_UEnti_C: average number of unique named Entities per sentence
- at_UEnti_C: average number of unique named Entities per token (word)
- Per_nonEnti_C: Percentage of Not-NE nouns per sentence
- as_nonEnti_C: Number of Not-NE nouns per sentence
- ra_SSToT_C: ratio of ss transitions to total
- ra_SOToT_C: ratio of so transitions to total
- ra_SXToT_C: ratio of sx transitions to total
- ra_SNToT_C: ratio of sn transitions to total
- ra_OSToT_C: ratio of os transitions to total
- ra_OOToT_C: ratio of oo transitions to total
- ra_OXToT_C: ratio of ox transitions to total
- ra_ONToT_C: ratio of on transitions to total
- ra_XSToT_C: ratio of xs transitions to total
- ra_XOToT_C: ratio of xo transitions to total
- ra_XXToT_C: ratio of xx transitions to total
- ra_XNToT_C: ratio of xn transitions to total
- ra_NSToT_C: ratio of ns transitions to total
- ra_NOToT_C: ratio of no transitions to total
- ra_NXToT_C: ratio of nx transitions to total
- ra_NNToT_C: ratio of nn transitions to total
- LoCohPA_S: Local Coherence for PA score
- LoCohPW_S: Local Coherence for PW score
- LoCohPU_S: Local Coherence for PU score
- LoCoDPA_S: Local Coherence distance for PA score
- LoCoDPW_S: Local Coherence distance for PW score
- LoCoDPU_S: Local Coherence distance for PU score
- Per_UConj_C: Percentage of conjunctions per sentence
- as_UConj_C: Number of unique conjunctions per sentence
- Per_UConj_C: Percentage of unique conjunctions per sentence
- Per_Pronoun_C: Percentage of pronouns per sentence
- as_UPronoun_C: Number of unique pronouns per sentence
- Per_UPronoun_C: Percentage of unique pronouns per sentence
- to_Personal_C: Total number of personal pronouns
- to_FirstPersonal_C: Total number of first person pronouns
- to_ThirdPersonal_C: Total number of third person pronouns
- SimpNoV_S: unique Nouns/total Nouns (Noun Variation-1)
- SquaNoV_S: (unique Nouns**2)/total Nouns (Squared Noun Variation-1)
- CorrNoV_S: unique Nouns/sqrt(2*total Nouns) (Corrected Noun Variation-1)
- SimpVeV_S: unique Verbs/total Verbs (Verb Variation-1)
- SquaVeV_S: (unique Verbs**2)/total Verbs (Squared Verb Variation-1)
- CorrVeV_S: unique Verbs/sqrt(2*total Verbs) (Corrected Verb Variation-1)
- SimpAjV_S: unique Adjectives/total Adjectives (Adjective Variation-1)
- SquaAjV_S: (unique Adjectives**2)/total Adjectives (Squared Adjective Variation-1)
- CorrAjV_S: unique Adjectives/sqrt(2*total Adjectives) (Corrected Adjective Variation-1)
- SimpAvV_S: unique Adverbs/total Adverbs (AdVerb Variation-1)
- SquaAvV_S: (unique Adverbs**2)/total Adverbs (Squared AdVerb Variation-1)
- CorrAvV_S: unique Adverbs/sqrt(2*total Adverbs) (Corrected AdVerb Variation-1)
- SimpTTR_S: unique tokens/total tokens (TTR)
- CorrTTR_S: unique tokens/sqrt(2*total tokens) (Corrected TTR)
- BiLoTTR_S: log(unique tokens)/log(total tokens) (Bi-Logarithmic TTR)
- UberTTR_S: (log(unique tokens))^2/log(total tokens/unique tokens) (Uber Index)
- MTLDTTR_S: Measure of Textual Lexical Diversity (default TTR = 0.72)
"""
def EnDF_(self):
    """
    Collect the entity-density features for the analysed text.

    Passes the spaCy document and the counts stored by preprocess() to
    Disco_EnDF.retrieve (sentence count first, then token count), so
    preprocess() has to be called first.

    output: the retrieve() result after it has been passed through nan_check
    """
    entity_density = Disco_EnDF.retrieve(
        self.NLP_doc,
        self.n_sent,
        self.n_token,
    )
    return nan_check(entity_density)
def EnGF_(self):
    """
    Collect the entity-grid features for the analysed text.

    Builds a Disco_EnGF.EntityGrid from the spaCy document and the sentence
    count stored by preprocess(), then calls its retrieve(); preprocess() has
    to be called first.

    An earlier guard that raised RuntimeError when self.n_sent <= 2 (the
    entity grid was said to need at least two sentences) is disabled, so no
    sentence-count check is performed here.

    output: the retrieve() result after it has been passed through nan_check
    """
    grid = Disco_EnGF.EntityGrid(self.NLP_doc, n_sent=self.n_sent)
    return nan_check(grid.retrieve())
def VarF_(self):
    """
    Collect the lexical-variation features for the analysed text.

    Only the spaCy document is passed to LxSem_VarF.retrieve.

    output: the retrieve() result after it has been passed through nan_check
    """
    variation = LxSem_VarF.retrieve(self.NLP_doc)
    return nan_check(variation)
def TTRF_(self):
    """
    Collect the type-token-ratio features for the analysed text.

    Passes the token count and token list stored by preprocess() to
    LxSem_TTRF.retrieve, so preprocess() has to be called first.

    output: the retrieve() result after it has been passed through nan_check
    """
    ttr_features = LxSem_TTRF.retrieve(self.n_token, self.token_list)
    return nan_check(ttr_features)