-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
305 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,275 @@ | ||
ansj_fast_lda | ||
============= | ||
============= | ||
|
||
这是我在参考了 yangliuy 童鞋的 LDA 实现之后做的。 | ||
|
||
是一个利用GibbsSampling 的LDA实现 | ||
|
||
封装了两个内置的分词 | ||
|
||
|
||
调用方式更加简单。PS：目前不支持命令行，代码里面有两个例子，大家可以参考。 | ||
|
||
```` | ||
File[] files = new File("/Users/ansj/Desktop/搜索组分享/文本分类语料库").listFiles(); | ||
LDA lda = new LDA(10); | ||
for (File file : files) { | ||
lda.addDoc(file, "gb2312"); | ||
} | ||
lda.trainAndSave("result/cluster", "utf-8"); | ||
```` | ||
|
||
|
||
或者 | ||
|
||
|
||
```` | ||
Analysis dicAnalysis = DicAnalysis.getInstance(new File("library/result_1_3.dic"), "UTF-8"); | ||
LDA lda = new LDA(dicAnalysis, new LDAGibbsModel(10, 5, 0.1, 100, Integer.MAX_VALUE, Integer.MAX_VALUE)); | ||
BufferedReader newReader = Files.newReader(new File("/Users/ansj/Documents/temp/computer_300000.txt"), Charsets.UTF_8); | ||
String temp = null; | ||
while ((temp = newReader.readLine()) != null) { | ||
lda.addDoc(temp); | ||
} | ||
lda.trainAndSave("result/computer", "utf-8"); | ||
```` | ||
|
||
|
||
以下是运行后的一个结果 10个topic 100次迭代 | ||
|
||
```` | ||
topic 0 : | ||
教育 0.024076469669325036 | ||
学校 0.016010479660942534 | ||
学生 0.014581063710089938 | ||
工作 0.010251975401793508 | ||
教师 0.010160084376381556 | ||
发展 0.009863991072276375 | ||
社会 0.007607555892716207 | ||
教学 0.006249610739406241 | ||
建设 0.005902466865627754 | ||
提高 0.0053613308270907 | ||
学习 0.004871245358226952 | ||
孩子 0.0041769576106699775 | ||
培养 0.004136117154931332 | ||
实施 0.003727712597544876 | ||
进行 0.003697082255740892 | ||
管理 0.0036358215721329235 | ||
思想 0.003349938381962404 | ||
国家 0.003268257470485113 | ||
改革 0.003237627128681129 | ||
活动 0.003186576559007822 | ||
topic 1 : | ||
环境 0.016325714890308193 | ||
光华 0.01205996899776804 | ||
日月 0.011455375091738728 | ||
污染 0.005594173058287891 | ||
城市 0.005510201682450487 | ||
环保 0.004536133722736594 | ||
垃圾 0.004485750897234152 | ||
文章 0.004401779521396747 | ||
信区 0.004267425320056899 | ||
来源 0.004149865393884533 | ||
发信人 0.004133071118717052 | ||
阅读 0.004133071118717052 | ||
fudan 0.00409948256838209 | ||
edu 0.004082688293214609 | ||
环境保护 0.003998716917377205 | ||
发信站 0.0039147455415398 | ||
cn 0.003897951266372319 | ||
返回 0.003864362716037357 | ||
讨论区 0.003847568440869876 | ||
药物 0.0037971856153674335 | ||
topic 2 : | ||
网络 0.009624681710355278 | ||
规定 0.008204120848487257 | ||
病毒 0.007501477841541783 | ||
管理 0.006096191827650836 | ||
安全 0.00591289365192593 | ||
软件 0.005576846996430269 | ||
计算机 0.005240800340934607 | ||
使用 0.0051033267091409274 | ||
文件 0.004874203989484795 | ||
光华 0.004843654293530644 | ||
日月 0.0045839818779203605 | ||
用户 0.004538157333989134 | ||
进行 0.004477057942080832 | ||
windows 0.004171560982539322 | ||
单位 0.0041410112865851705 | ||
应当 0.004125736438608095 | ||
微软 0.0040799118946768685 | ||
信息 0.0040188125027685664 | ||
程序 0.003896613718951962 | ||
机动车 0.00383551432704366 | ||
topic 3 : | ||
新华社 0.0126322893839167 | ||
中国 0.012225714250864987 | ||
主席 0.010409678656567339 | ||
问题 0.00985402597473 | ||
国家 0.008756273115490376 | ||
总统 0.008580090557834635 | ||
今天 0.007685625265120868 | ||
访问 0.00730615514093927 | ||
人民 0.007238392618763984 | ||
举行 0.0071435250877185845 | ||
会议 0.006533662388141017 | ||
表示 0.0065065573792709025 | ||
苏联 0.006289717308309989 | ||
合作 0.006127087255089304 | ||
两国 0.006032219724043905 | ||
记者 0.005978009706303676 | ||
关系 0.005842484661953105 | ||
发展 0.005747617130907706 | ||
美国 0.00563919709542725 | ||
会见 0.0049480193692393384 | ||
topic 4 : | ||
经济 0.02759296744343016 | ||
中国 0.01584534610437293 | ||
发展 0.012999604599233078 | ||
增长 0.009474603187087483 | ||
国家 0.006292806780586894 | ||
社会 0.005842350210677823 | ||
政府 0.0057994495849721965 | ||
问题 0.005742248750698029 | ||
美国 0.005420494057905834 | ||
市场 0.005348993015063125 | ||
世界 0.005205990929377705 | ||
企业 0.005055838739408014 | ||
投资 0.004548181335224774 | ||
我国 0.004326528102412373 | ||
技术 0.004061974243894347 | ||
可能 0.003690168821112256 | ||
增加 0.003618667778269546 | ||
认为 0.003611517673985275 | ||
这种 0.003604367569701004 | ||
改革 0.0035328665268582944 | ||
topic 5 : | ||
比赛 0.023177485843473445 | ||
选手 0.011980032687335522 | ||
参加 0.011323446686103978 | ||
中国 0.011270209983301421 | ||
北京 0.011110499874893749 | ||
亚运会 0.010879807496082666 | ||
今天 0.010294203765254533 | ||
冠军 0.00855513814037099 | ||
世界 0.008484155869967579 | ||
新华社 0.008430919167165022 | ||
记者 0.00827120905875735 | ||
成绩 0.007880806571538595 | ||
获得 0.007721096463130922 | ||
中国队 0.0076678597603283645 | ||
举行 0.006691853542281477 | ||
女子 0.006443415595869542 | ||
金牌 0.006372433325466133 | ||
进行 0.005698101756633737 | ||
队员 0.005591628351028622 | ||
全国 0.005325444837015835 | ||
topic 6 : | ||
电脑 0.00674109146361348 | ||
产品 0.006725562572888731 | ||
设备 0.006150993616073024 | ||
内存 0.0059335891459265395 | ||
技术 0.005840415801578046 | ||
选择 0.005048442374615854 | ||
显示器 0.0049397401395426125 | ||
公司 0.004862095685918868 | ||
cpu 0.004768922341570375 | ||
价格 0.004737864560120877 | ||
没有 0.004598104543598137 | ||
usb 0.004551517871423891 | ||
支持 0.004520460089974393 | ||
主板 0.004442815636350649 | ||
采用 0.0044272867456259 | ||
性能 0.004411757854901151 | ||
使用 0.004396228964176402 | ||
速度 0.004240940056928914 | ||
系统 0.004147766712580421 | ||
需要 0.00402353558678243 | ||
topic 7 : | ||
文化 0.020326661598747707 | ||
艺术 0.016473812740009988 | ||
演出 0.009171856669408264 | ||
活动 0.008595082888160103 | ||
创作 0.008283625046286096 | ||
群众 0.008249018619411207 | ||
作品 0.006599445605041464 | ||
文艺 0.006068813726293155 | ||
举办 0.005722749457544259 | ||
全国 0.005422827091295214 | ||
中国 0.005180582103170987 | ||
民族 0.0048575887856720165 | ||
工作 0.004649950224422678 | ||
音乐 0.004176995723799186 | ||
参加 0.003738647650050583 | ||
人民 0.0037155766988006566 | ||
优秀 0.0037040412231756936 | ||
电影 0.003634828369425914 | ||
生活 0.0034271898081765758 | ||
建设 0.003288764100677017 | ||
topic 8 : | ||
使用 0.007529768353427813 | ||
进行 0.007150280759611747 | ||
系统 0.007003382336199076 | ||
美国 0.005252842790531417 | ||
问题 0.004787664449724627 | ||
目标 0.004408176855908561 | ||
美军 0.004285761503064668 | ||
函数 0.0042735199677802795 | ||
没有 0.004187829220789555 | ||
设计 0.003906273909248603 | ||
飞机 0.003881790838679824 | ||
导弹 0.0038573077681110456 | ||
能力 0.0037838585564047103 | ||
战争 0.0037471339505515426 | ||
武器 0.003722650879982764 | ||
过程 0.003722650879982764 | ||
可能 0.0036369601329920395 | ||
作战 0.0035390278507169254 | ||
方法 0.003343163286166698 | ||
装备 0.0033064386803135304 | ||
topic 9 : | ||
企业 0.013354841192682198 | ||
上海 0.009405784030079769 | ||
记者 0.0077157214308304235 | ||
新华社 0.006996189829169812 | ||
生产 0.006477457744251696 | ||
合作 0.005490193453601089 | ||
全国 0.005255927350734843 | ||
时间 0.00517226088542547 | ||
地区 0.004988194661744848 | ||
公司 0.004988194661744848 | ||
投资 0.0048375950241879755 | ||
铁路 0.004753928558878602 | ||
北京 0.004469462576826733 | ||
产品 0.004419262697641108 | ||
资金 0.004302129646207986 | ||
运输 0.004268663060084235 | ||
单位 0.004184996594774862 | ||
去年 0.0041515300086511124 | ||
今年 0.00403439695721799 | ||
职工 0.003900530612722992 | ||
```` | ||
|
||
|
||
|
||
测试效果：60w 篇文章大约需要 3g 内存。 | ||
|
||
|
||
|
||
|
||
|
||
再次感谢yangliuy童鞋!!! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
import org.ansj.lda.LDA; | ||
|
||
public class FileLDATest { | ||
public static void main(String[] args) throws IOException { | ||
File[] files = new File("/Users/ansj/Desktop/搜索组分享/文本分类语料库").listFiles(); | ||
|
||
LDA lda = new LDA(10); | ||
for (File dir : files) { | ||
if (dir.isHidden() || !dir.isDirectory()) { | ||
continue; | ||
} | ||
|
||
for (File file : dir.listFiles()) { | ||
if (file.isHidden() || !file.getName().toLowerCase().endsWith(".txt")) { | ||
continue; | ||
} | ||
lda.addDoc(file, "gb2312"); | ||
} | ||
|
||
} | ||
|
||
lda.trainAndSave("result/cluster", "utf-8"); | ||
} | ||
} |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters