unstable-20231115

project-trans · Nov 15, 2023 · 67b410f · 67b410f
1 parent 8fb995e
commit 67b410f
Show file tree

Hide file tree

Showing 9 changed files with 12,381 additions and 383 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,3 @@
-/.idea
+/.idea
+/default_v2.zip
+/default_v2
diff --git a/MtF-wiki b/MtF-wiki
diff --git a/README.md b/README.md
@@ -1,32 +1,40 @@
 # 跨儿计划 RIME 词典
 
-## 语料来源
+## 语料
+
+来源：
 
 - [MtF.wiki](https://github.com/project-trans/MtF-wiki)
 - [FtM.wiki](https://github.com/project-trans/FtM-wiki)
 - [RLE.wiki](https://github.com/project-trans/RLE-wiki)
 - [女性倾向跨性别者科学](https://github.com/project-trans/transfeminine-science)
 - [中华人民共和国跨性别相关法律法规变迁](https://github.com/project-trans/legal-spec)
 
-### 停用词
+使用 [pkuseg](https://github.com/lancopku/pkuseg-python) 分词，[python-pinyin](https://github.com/mozillazg/python-pinyin) 注音。
+
+停用词：
+
+- [现代汉语常用词表](https://gist.github.com/indiejoseph/eae09c673460aa0b56db)
 
-[现代汉语常用词表](https://gist.github.com/indiejoseph/eae09c673460aa0b56db)
+## 仓库内容
 
-## RIME 词典
+去除停用词后包含一万余条记录。
 
-词典中包含除常用词表外的汉语词汇及词频.
+1. 词频统计：`result.json`，JSON 格式，按词频降序排列。
+1. RIME 词典：包含除常用词表外的汉语词汇及词频，无编码，预构建文件位于仓库根目录下的 `project_trans.dict.yaml`，亦可在 Release 中获取。
+1. RIME 词典：全拼编码，位于仓库根目录下的 `project_trans_pinyin.dict.yaml`。
 
-可在 Release 中获取
+软件源：
 
 - [NUR](https://github.com/Cryolitia/nur-packages/blob/master/pkgs/rimePackages/rime-project-trans.nix)
 - [AUR](https://aur.archlinux.org/packages/rime-project-trans-bin)
 
-## 词云
+### 词云
 
-### 完整
+#### 无停用词
 
 ![result_full](./result_full.png)
 
-### 停用常用词
+#### 停用常用词
 
 ![result](./result.png)
diff --git a/RLE-wiki b/RLE-wiki
diff --git a/main.py b/main.py
@@ -1,5 +1,7 @@
 import os
 import collections
+import json
+import copy
 
 from markdown_it import MarkdownIt
 from mdit_plain.renderer import RendererPlain
@@ -37,36 +39,54 @@
 seg = pkuseg.pkuseg(model_name="default_v2") # download from https://github.com/lancopku/pkuseg-python/releases/download/v0.0.25/default_v2.zip
 word_dict: dict[str, int] = collections.defaultdict(int)
 for sentence in sentence_list:
-    sentence2 = regex.sub("\\p{P}", " ", sentence).strip()
+    sentence2 = regex.sub("\\p{P}+", " ", sentence).strip()
     # word_list = jieba.cut(sentence2, cut_all=True, use_paddle=True)
     word_list = seg.cut(sentence2)
     for word in word_list:
         word2 = regex.sub("\\s+", " ", word)
         if len(word2) > 1 and word2 not in common_word_set and pattern.match(word2):
             word_dict[word2] += 1
 
-# wordcloud
-colormap = matplotlib.colors.ListedColormap(["#5BCEFA", "#F5A9B8", "#2D2D2D", "#9B59D0", "#FFF433"])
-w = wordcloud.WordCloud(width=1920, height=1080, font_path="sarasa-ui-tc-regular.ttf", background_color="#7f7f7f", colormap=colormap)
-w.generate_from_frequencies(word_dict)
-#w.to_file("result.png")
+# build wordcloud
+build_wordcloud = False;
+
+if build_wordcloud:
+    colormap = matplotlib.colors.ListedColormap(["#5BCEFA", "#F5A9B8", "#2D2D2D", "#9B59D0", "#FFF433"])
+    w = wordcloud.WordCloud(width=1920, height=1080, font_path="sarasa-ui-tc-regular.ttf", background_color="#7f7f7f", colormap=colormap)
+    w.generate_from_frequencies(word_dict)
+    w.to_file("result.png")
 
 # build dict
 word_list = sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)
-open("result.txt", "w").write(str(word_list))
+open("result.json", "w", encoding="utf8").write(json.dumps(word_list, ensure_ascii=False))
+
+# build rime dict
 rime_dict_str = """---
 name: project_trans
 version: "0.1"
 sort: by_weight
 ...
 
 """
+rime_dict_pinyin_str = copy.deepcopy(rime_dict_str)
+
+pinyin_pattern = regex.compile("(\\w|\\s)+")
 for word in word_list:
-    word_str = word[0]
-    word_str += "\t"
-    word_str += " ".join(lazy_pinyin(word[0]))
-    word_str += "\t"
-    word_str += str(word[1])
-    word_str += "\n"
-    rime_dict_str += word_str
-open("project_trans.dict.yaml", "w").write(rime_dict_str)
+    pinyin = " ".join(lazy_pinyin(word[0]))
+    if pinyin_pattern.fullmatch(pinyin):
+        word_str = word[0]
+        word_str += "\t"
+        word_str += pinyin
+        word_str += "\t"
+        word_str += str(word[1])
+        word_str += "\n"
+        rime_dict_str += word_str
+
+        word_str_pinyin = word[0]
+        word_str_pinyin += "\t\t"
+        word_str_pinyin += str(word[1])
+        word_str_pinyin += "\n"
+        rime_dict_pinyin_str += word_str_pinyin
+
+open("project_trans.dict.yaml", "w", encoding="utf8").write(rime_dict_str)
+open("project_trans_pinyin.dict.yaml", "w", encoding="utf8").write(rime_dict_pinyin_str)
+4 −0		content/zh-cn/docs/srs/china/sh411/_index.md
+16 −9		content/zh-cn/docs/srs/education.md
+14 −6		docs/campus/CUHKSZ.md
+2 −2		docs/campus/THU.md
+156 −0		docs/campus/XJTU.md