forked from VietHoang1512/Vietnamese-Spell-Correction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd_noise.py
121 lines (107 loc) · 6.39 KB
/
add_noise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import re
import numpy as np
from unidecode import unidecode
# Every character that add_noise may perturb: ASCII letters plus the
# Vietnamese accented letters, lower case first, then upper case.
letters=list("abcdefghijklmnopqrstuvwxyzáàảãạâấầẩẫậăắằẳẵặóòỏõọôốồổỗộơớờởỡợéèẻẽẹêếềểễệúùủũụưứừửữựíìỉĩịýỳỷỹỵđABCDEFGHIJKLMNOPQRSTUVWXYZÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÉÈẺẼẸÊẾỀỂỄỆÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴĐ")
# Pool of plain lower-case ASCII letters used for random replacements.
letters2=list("abcdefghijklmnopqrstuvwxyz")
# Accented letter -> Telex keystroke sequence (what is typed when the input
# method fails to compose the character). Upper-case keys capitalise only the
# first keystroke.
typo={"ă":"aw","â":"aa","á":"as","à":"af","ả":"ar","ã":"ax","ạ":"aj","ắ":"aws","ổ":"oor","ỗ":"oox","ộ":"ooj","ơ":"ow",
      "ằ":"awf","ẳ":"awr","ẵ":"awx","ặ":"awj","ó":"os","ò":"of","ỏ":"or","õ":"ox","ọ":"oj","ô":"oo","ố":"oos","ồ":"oof",
      "ớ":"ows","ờ":"owf","ở":"owr","ỡ":"owx","ợ":"owj","é":"es","è":"ef","ẻ":"er","ẽ":"ex","ẹ":"ej","ê":"ee","ế":"ees","ề":"eef",
      "ể":"eer","ễ":"eex","ệ":"eej","ú":"us","ù":"uf","ủ":"ur","ũ":"ux","ụ":"uj","ư":"uw","ứ":"uws","ừ":"uwf","ử":"uwr","ữ":"uwx",
      "ự":"uwj","í":"is","ì":"if","ỉ":"ir","ị":"ij","ĩ":"ix","ý":"ys","ỳ":"yf","ỷ":"yr","ỵ":"yj","đ":"dd",
      # Entries that were missing although the characters appear in `letters`.
      "ấ":"aas","ầ":"aaf","ẩ":"aar","ẫ":"aax","ậ":"aaj","ỹ":"yx",
      "Ă":"Aw","Â":"Aa","Á":"As","À":"Af","Ả":"Ar","Ã":"Ax","Ạ":"Aj","Ắ":"Aws","Ổ":"Oor","Ỗ":"Oox","Ộ":"Ooj","Ơ":"Ow",
      # "Ằ" used to map to "AWF", unlike every other upper-case entry.
      "Ằ":"Awf","Ẳ":"Awr","Ẵ":"Awx","Ặ":"Awj","Ó":"Os","Ò":"Of","Ỏ":"Or","Õ":"Ox","Ọ":"Oj","Ô":"Oo","Ố":"Oos","Ồ":"Oof",
      "Ớ":"Ows","Ờ":"Owf","Ở":"Owr","Ỡ":"Owx","Ợ":"Owj","É":"Es","È":"Ef","Ẻ":"Er","Ẽ":"Ex","Ẹ":"Ej","Ê":"Ee","Ế":"Ees","Ề":"Eef",
      "Ể":"Eer","Ễ":"Eex","Ệ":"Eej","Ú":"Us","Ù":"Uf","Ủ":"Ur","Ũ":"Ux","Ụ":"Uj","Ư":"Uw","Ứ":"Uws","Ừ":"Uwf","Ử":"Uwr","Ữ":"Uwx",
      "Ự":"Uwj","Í":"Is","Ì":"If","Ỉ":"Ir","Ị":"Ij","Ĩ":"Ix","Ý":"Ys","Ỳ":"Yf","Ỷ":"Yr","Ỵ":"Yj","Đ":"Dd",
      "Ấ":"Aas","Ầ":"Aaf","Ẩ":"Aar","Ẫ":"Aax","Ậ":"Aaj","Ỹ":"Yx"}
# Single-character confusions applied to accented vowels (plus i -> j).
region={"ẻ":"ẽ","ẽ":"ẻ","ũ":"ủ","ủ":"ũ","ã":"ả","ả":"ã","ỏ":"õ","õ":"ỏ","i":"j"}
# Consonant confusions; add_noise applies them only to a word-initial letter
# that is followed by a vowel. NOTE(review): add_noise looks keys up one
# character at a time, so the two-character key "Gi" is never matched there.
region2={"s":"x","l":"n","n":"l","x":"s","d":"gi","S":"X","L":"N","N":"L","X":"S","Gi":"D","D":"Gi"}
# Lower-case vowels (plain and accented) used for the "followed by a vowel" test.
vowel=list("aeiouyáàảãạâấầẩẫậăắằẳẵặóòỏõọôốồổỗộơớờởỡợéèẻẽẹêếềểễệúùủũụưứừửữựíìỉĩịýỳỷỹỵ")
# Common words -> informal chat abbreviation. Some keys carry a leading space
# so that they only match at the start of a word.
acronym={"không":"ko"," anh":" a","em":"e","biết":"bít","giờ":"h","gì":"j","muốn":"mún","học":"hok","yêu":"iu",
         "chồng":"ck","vợ":"vk"," ông":" ô","được":"đc","tôi":"t",
         "Không":"Ko"," Anh":" A","Em":"E","Biết":"Bít","Giờ":"H","Gì":"J","Muốn":"Mún","Học":"Hok","Yêu":"Iu",
         "Chồng":"Ck","Vợ":"Vk"," Ông":" Ô","Được":"Đc","Tôi":"T",}
# Digraph -> chat-style respelling.
teen={"ch":"ck","ph":"f","th":"tk","nh":"nk",
      "Ch":"Ck","Ph":"F","Th":"Tk","Nh":"Nk"}
def teen_code(sentence, pivot):
    """Randomly rewrite a sentence using chat abbreviations.

    One uniform draw in [0, 1) decides whether the sentence is touched at all:
    it is rewritten only when the draw is greater than ``pivot``. When it is,
    every key of ``acronym`` present in the text is replaced (all occurrences)
    with probability 0.5, and then every key of ``teen`` present in the
    resulting text is replaced with probability 0.05.

    Args:
        sentence: Text to rewrite; converted with ``str()``.
        pivot: Threshold for the initial draw. Values >= 1 disable rewriting.

    Returns:
        str: The possibly rewritten text. A string is returned on every path
        (previously the untouched path returned the raw argument).
    """
    # Draw first so the random stream is consumed in the same order as before.
    gate = np.random.uniform(0, 1, 1)[0]
    text = str(sentence)
    if gate <= pivot:
        return text

    for table, chance in ((acronym, 0.5), (teen, 0.05)):
        for source, target in table.items():
            # Keys are literal text, so use a substring test instead of
            # interpreting them as regular expressions. The draw happens only
            # when the key is present, as in the original logic.
            if source in text and np.random.uniform(0, 1, 1)[0] < chance:
                text = text.replace(source, target)
    return text
def add_noise(sentence, pivot1, pivot2):
    """Inject character-level spelling noise into a sentence.

    The sentence is first passed through ``teen_code(sentence, 0.5)``. Then,
    for each character found in ``letters``, one uniform draw in [0, 1) picks
    the treatment (characters not in ``letters`` are always copied):

    * draw < ``pivot1``: the character is copied unchanged.
    * ``pivot1`` <= draw < ``pivot2``: a substitution is attempted, using the
      ``typo`` and/or ``region`` tables or ``unidecode`` when the character is
      in those tables, otherwise ``region2`` for a word-initial character
      followed by a vowel. If nothing applies the character is copied.
    * draw >= ``pivot2``: with roughly equal chance the character is swapped
      with the next one, replaced by a random letter from ``letters2``, or
      deleted.

    Args:
        sentence: Input text.
        pivot1: Upper bound of the "keep unchanged" band.
        pivot2: Upper bound of the "substitute" band; expected >= ``pivot1``.

    Returns:
        str: The noisy sentence.
    """
    sentence = teen_code(sentence, 0.5)
    pieces = []  # collected output fragments, joined once at the end
    last = len(sentence) - 1
    i = 0
    while i < len(sentence):
        ch = sentence[i]
        if ch not in letters:
            pieces.append(ch)
            i += 1
            continue

        roll = np.random.uniform(0, 1, 1)[0]
        if roll < pivot1:
            pieces.append(ch)
        elif roll < pivot2:
            in_typo = ch in typo
            in_region = ch in region
            if in_typo and in_region:
                pick = np.random.uniform(0, 1, 1)[0]
                if pick <= 0.4:
                    pieces.append(typo[ch])
                elif pick < 0.8:
                    pieces.append(region[ch])
                elif pick < 0.95:
                    pieces.append(unidecode(ch))
                else:
                    pieces.append(ch)
            elif in_typo:
                pick = np.random.uniform(0, 1, 1)[0]
                if pick <= 0.6:
                    pieces.append(typo[ch])
                elif pick < 0.9:
                    pieces.append(unidecode(ch))
                else:
                    pieces.append(ch)
            elif in_region:
                pick = np.random.uniform(0, 1, 1)[0]
                if pick <= 0.6:
                    pieces.append(region[ch])
                elif pick < 0.85:
                    pieces.append(unidecode(ch))
                else:
                    pieces.append(ch)
            elif (
                i < last
                and ch in region2
                and (i == 0 or sentence[i - 1] not in letters)
                and sentence[i + 1] in vowel
            ):
                # Word-initial consonant followed by a vowel.
                pick = np.random.uniform(0, 1, 1)[0]
                pieces.append(region2[ch] if pick <= 0.9 else ch)
            else:
                # No substitution applies. This also covers the final
                # character of the sentence, which used to be dropped here.
                pieces.append(ch)
        else:
            pick = np.random.uniform(0, 1, 1)[0]
            if pick <= 0.33:
                if i == last:
                    # Nothing to swap with: leave `i` untouched so this
                    # position is re-rolled on the next iteration.
                    continue
                # Transpose with the following character and skip past it.
                pieces.append(sentence[i + 1])
                pieces.append(ch)
                i += 1
            elif pick <= 0.66:
                # Replace with a random plain lower-case letter.
                pieces.append(str(np.random.choice(letters2, 1)[0]))
            # Otherwise the character is deleted: append nothing.
        i += 1
    return "".join(pieces)
if __name__ == "__main__":
    # Prompt only when run as a script, so that importing this module to use
    # add_noise/teen_code does not block waiting on stdin.
    text = input("Nhập đoạn text cần tạo noise")
    print()
    print(add_noise(text, 0.94, 0.985))