-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlang.py
35 lines (26 loc) · 809 Bytes
/
lang.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
import hanzidentifier as hanzi
# Maps the short internal language keys to the language codes that
# identify_language() returns.
languages = {
    'en': 'en',
    'sc': 'zh_hans',  # Simplified Chinese
    'tc': 'zh_hant',  # Traditional Chinese
    'ja': 'ja',
    'ko': 'ko',
}

# Character class of Japanese kana (hiragana and katakana ranges plus the
# trailing sound-mark characters); one match is enough to flag Japanese.
japanese_regex = r'([ぁ-んァ-ンァ-ン゙゚])'

# Character class of Hangul: the \u3131-\u3163 jamo ranges and the
# \uac00-\ud7a3 syllable range.
korean_regex = r'([\u3131-\u314e\u314f-\u3163\uac00-\ud7a3])'
def identify_language(text: str) -> str:
    """Guess the language of *text* and return its code from ``languages``.

    The checks run in a fixed order and the first one that matches wins:

    1. Japanese -- ``text`` contains a character matched by
       ``japanese_regex``.
    2. Korean -- ``text`` contains a character matched by ``korean_regex``.
    3. Chinese -- ``hanzi.has_chinese(text)`` is true; the result is the
       simplified code when ``hanzi.is_simplified(text)`` is true and the
       traditional code otherwise.
    4. English -- the fallback when nothing above matches (this includes
       the empty string for the two regex checks).

    Because the kana check runs first, any text containing kana is reported
    as Japanese regardless of what other characters it contains.

    Args:
        text: The string to classify.

    Returns:
        One of the values of the module-level ``languages`` mapping.
    """
    if re.search(japanese_regex, text):
        return languages['ja']

    if re.search(korean_regex, text):
        return languages['ko']

    if hanzi.has_chinese(text):
        # NOTE(review): is_simplified() presumably also accepts text that is
        # valid in both scripts, so such text maps to Simplified Chinese --
        # confirm against the hanzidentifier documentation.
        if hanzi.is_simplified(text):
            return languages['sc']  # Simplified Chinese
        return languages['tc']  # Traditional Chinese

    return languages['en']