Modify: add part-of-speech tagging feature #40

Open
wants to merge 28 commits into base: master
Changes from all commits (28 commits)
10c5a00
update finalseg process and demo. xuming 20160818
Aug 18, 2016
f702b99
Remove redundant utility code and add basic demo examples. xuming 20160824
Aug 24, 2016
3136108
Remove redundant utility code and add basic demo examples. xuming 20160824
Aug 24, 2016
8e33c62
rewrite segmenter and all. xuming 20160910
Sep 10, 2016
6f50cde
rewrite segmenter and all. xuming 20160910
Sep 10, 2016
7ecbf6a
update segmenter and add demos. xuming 20160911
Sep 11, 2016
665d22e
update demos. xuming 20160911
Sep 11, 2016
6615f8a
add word pom. xuming 20160911
Sep 11, 2016
6a9727a
add word segmenter. xuming 20160911
Sep 11, 2016
7c9ddc6
Add dictionary loading and English segmentation for the word segmenter project. xuming 20160912
Sep 12, 2016
82d5f3e
Add person-name recognition, punctuation, and stop-word handling. xuming 20160913
Sep 13, 2016
05a4d29
Add part-of-speech tagging, person-name recognition, quantity-word detection, and special-word detection. xuming 20160914
Sep 13, 2016
7498a2d
Commit the global configuration file. xuming 20160914
Sep 14, 2016
e56ae1f
Add the full-segmentation segmenter and the max-score segmenter. xuming 20160914
Sep 14, 2016
25be308
Add the double-array trie dictionary and the bigram language model. xuming 20160914
Sep 14, 2016
6d42510
Build the HanLP segmenter. xuming 20160915
Sep 15, 2016
81b2717
Build HanLP, part 2. xuming 20160916
Sep 16, 2016
b23be53
Build the HanLP segmenter, part 3. xuming 20160916
Sep 16, 2016
d54b7c7
Build HanLP, part 4. xuming 20160917
Sep 17, 2016
e7e28de
Build HanLP, part 5. xuming 20160918
Sep 18, 2016
438aadc
Build HanLP, part 6. xuming 20160919
Sep 19, 2016
1c1f994
Build HanLP, part 7. xuming 20160919
Sep 19, 2016
467804d
Build HanLP, part 8: add person-name recognition. xuming 20160920
Sep 20, 2016
7395697
Build HanLP, part 9: add organization-name and place-name recognition. xuming 20160920
Sep 21, 2016
156a35d
Build HanLP, part 9: add stopwords. xuming 20160921
Sep 21, 2016
d2c9c74
Build HanLP, part 10: add synonym recognition. xuming 20160923
Sep 23, 2016
1837325
Build HanLP, part 11: add text recommendation, automatic summarization, and semantic distance. xuming 20160927
Sep 27, 2016
9dcbd04
Finish building HanLP. xuming 20160928
Sep 28, 2016
File renamed without changes.
4 changes: 2 additions & 2 deletions conf/user.dict
@@ -9,5 +9,5 @@ utf-8 3 nz
 簡體字 53 n
 簡體字典 53 n
 矿泉水瓶盖 53 n
-点赞 3 nz
-
+点赞 3 userDict
+普顿思 3 userDict
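Review note (not part of the diff): this change moves user-dictionary entries from the built-in nz tag to a custom userDict tag, so each line follows the word frequency tag layout, with the tag apparently optional. A minimal sketch of parsing one such line under that assumption; UserDictLine and its fallback defaults are illustrative, not code from this PR:

```java
// Illustrative sketch: parse one conf/user.dict line of the form
// "word frequency tag", e.g. "点赞 3 userDict". Not code from this PR.
public class UserDictLine {
    public final String word;
    public final double freq;
    public final String tag;

    public UserDictLine(String word, double freq, String tag) {
        this.word = word;
        this.freq = freq;
        this.tag = tag;
    }

    public static UserDictLine parse(String line) {
        String[] parts = line.trim().split("\\s+");
        // The defaults below are assumptions; the PR's actual fallback rules are not shown here.
        double freq = parts.length > 1 ? Double.parseDouble(parts[1]) : 3.0;
        String tag = parts.length > 2 ? parts[2] : "n";
        return new UserDictLine(parts[0], freq, tag);
    }
}
```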
12 changes: 12 additions & 0 deletions conf/user.dict.bak
@@ -0,0 +1,12 @@
+小清新 3
+百搭 3
+显瘦 3
+又拍云 3
+iphone 3
+鲜芋仙 3
+UTF-8 3 nz
+utf-8 3 nz
+簡體字 53 n
+簡體字典 53 n
+矿泉水瓶盖 53 n
+点赞 3 nz
47 changes: 34 additions & 13 deletions pom.xml
@@ -1,17 +1,13 @@
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
-    <parent>
-        <groupId>org.sonatype.oss</groupId>
-        <artifactId>oss-parent</artifactId>
-        <version>7</version>
-    </parent>
-    <groupId>com.huaban</groupId>
+
+    <groupId>org.xm</groupId>
     <artifactId>jieba-analysis</artifactId>
-    <version>1.0.3-SNAPSHOT</version>
+    <version>1.0.4-SNAPSHOT</version>
     <packaging>jar</packaging>

-    <name>结巴分词工具(jieba for java)</name>
+    <name>分词工具(java)</name>
     <url>http://maven.apache.org</url>
     <inceptionYear>2013</inceptionYear>
     <licenses>
@@ -29,8 +25,12 @@
     </scm>

     <properties>
+        <java.version>1.8</java.version>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <maven.compiler.target>1.7</maven.compiler.target>
+        <maven.compiler.target>1.8</maven.compiler.target>
+        <slf4j-api.version>1.6.4</slf4j-api.version>
+        <logback-classic.version>0.9.28</logback-classic.version>
+
     </properties>

     <developers>
@@ -49,14 +49,35 @@
         <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
-            <version>4.8</version>
+            <version>4.11</version>
             <scope>test</scope>
         </dependency>
         <dependency>
             <groupId>org.apache.commons</groupId>
             <artifactId>commons-lang3</artifactId>
-            <version>3.3.1</version>
+            <version>3.3.2</version>
         </dependency>
+
+        <!-- SLF4J logging facade API -->
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>${slf4j-api.version}</version>
+        </dependency>
+        <!-- Logback provider implementing the facade -->
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>${logback-classic.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>commons-logging</groupId>
+                    <artifactId>commons-logging</artifactId>
+                </exclusion>
+            </exclusions>
+            <scope>runtime</scope>
+        </dependency>
+
     </dependencies>

     <build>
@@ -66,8 +87,8 @@
                 <artifactId>maven-compiler-plugin</artifactId>
                 <version>2.3.2</version>
                 <configuration>
-                    <source>1.7</source>
-                    <target>1.7</target>
+                    <source>1.8</source>
+                    <target>1.8</target>
                 </configuration>
             </plugin>
             <plugin>
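Review note: the new slf4j-api (compile-time facade) and logback-classic (runtime-only provider, with commons-logging excluded) dependencies suggest the project intends to replace direct System.err calls with structured logging. A hedged sketch of the standard SLF4J pattern; the class below is hypothetical, not part of this PR:

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical example class, not part of this PR.
public class LoggingExample {
    private static final Logger log = LoggerFactory.getLogger(LoggingExample.class);

    public static void main(String[] args) {
        // With logback-classic on the runtime classpath, these calls are routed
        // through Logback; arguments are substituted into the {} placeholders.
        log.info("user dict loaded from {}", "conf/user.dict");
        log.error("{}: load user dict failure!", "conf");
    }
}
```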
@@ -99,7 +99,7 @@ public List<SegToken> process(String paragraph, SegMode mode) {
                 int j = 0;
                 for (; j < token.length() - 1; ++j) {
                     gram2 = token.substring(j, j + 2);
-                    if (wordDict.containsWord(gram2))
+                    if (wordDict.containsWord(gram2)) // wordDict holds the entries from resources/core.txt; emit gram2 only when it is in that dictionary
                         tokens.add(new SegToken(gram2, offset + j, offset + j + 2, wordDict.getNature(gram2)));
                 }
             }
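Review note: this hunk shows the heart of the feature: each SegToken is now constructed with a part-of-speech value obtained from wordDict.getNature(...). A sketch of what calling the POS-aware process(...) might look like; only the process signature and the four-argument SegToken constructor are visible in the diff, so the field names used below (word, startOffset, endOffset, nature) are assumptions:

```java
import java.util.List;

// Illustrative usage of the POS-aware segmenter; SegToken field names are assumed.
public class PosTaggingDemo {
    public static void main(String[] args) {
        JiebaSegmenter segmenter = new JiebaSegmenter();
        List<SegToken> tokens =
                segmenter.process("我来到北京清华大学", JiebaSegmenter.SegMode.SEARCH);
        for (SegToken t : tokens) {
            // Expected shape: word [start,end) nature, e.g. "北京 [3,5) ns"
            System.out.println(t.word + " [" + t.startOffset + "," + t.endOffset + ") " + t.nature);
        }
    }
}
```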
3 changes: 0 additions & 3 deletions src/main/java/com/huaban/analysis/jieba/WordDictionary.java
@@ -63,7 +63,6 @@ public void init(Path configFile) {
         synchronized (WordDictionary.class) {
             if (loadedPath.contains(abspath))
                 return;
-
             DirectoryStream<Path> stream;
             try {
                 stream = Files.newDirectoryStream(configFile, String.format(Locale.getDefault(), "*%s", USER_DICT_SUFFIX));
@@ -73,8 +72,6 @@ public void init(Path configFile) {
             }
             loadedPath.add(abspath);
         } catch (IOException e) {
-            // TODO Auto-generated catch block
-            // e.printStackTrace();
             System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", configFile.toString()));
         }
     }
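Review note: init(Path configFile) scans the given directory for user dictionaries (files matching "*" + USER_DICT_SUFFIX) and returns early for paths it has already loaded. Typical initialization would look roughly like this, assuming the library's existing singleton accessor:

```java
import java.nio.file.Path;
import java.nio.file.Paths;

// Sketch of initializing the dictionary from a config directory.
public class DictInitDemo {
    public static void main(String[] args) {
        Path conf = Paths.get("conf"); // directory holding user.dict
        WordDictionary.getInstance().init(conf); // loads each user-dict file in the directory, at most once
    }
}
```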
17 changes: 6 additions & 11 deletions src/main/java/com/huaban/analysis/jieba/viterbi/FinalSeg.java
@@ -1,21 +1,16 @@
 package com.huaban.analysis.jieba.viterbi;

-import com.huaban.analysis.jieba.CharacterUtil;
-import com.huaban.analysis.jieba.Node;
-import com.huaban.analysis.jieba.Pair;

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Vector;
+import java.util.*;
 import java.util.regex.Matcher;
-import java.util.Collections;

+import com.huaban.analysis.jieba.CharacterUtil;
+import com.huaban.analysis.jieba.Pair;
+import com.huaban.analysis.jieba.Node;


 public class FinalSeg {
@@ -26,7 +21,7 @@ public class FinalSeg {
     private static Map<Character, Double> start;
     private static Map<Character, Map<Character, Double>> trans;
     private static Map<Character, char[]> prevStatus;
-    private static Double MIN_FLOAT = -3.14e100;;
+    private static Double MIN_FLOAT = -3.14e100;


     private FinalSeg() {
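Review note: FinalSeg is the HMM stage that runs Viterbi over the B/M/E/S character states, and MIN_FLOAT (-3.14e100) serves as a log-space stand-in for probability zero so unseen transitions can never win. A generic sketch of that recurrence, shaped like the start/trans/prevStatus fields above; this illustrates the technique and is not a copy of FinalSeg's code:

```java
import java.util.Map;

// Generic Viterbi step in log space, shaped like FinalSeg's tables.
public class ViterbiSketch {
    private static final Double MIN_FLOAT = -3.14e100; // stands in for log(0)

    // Best log-probability of reaching state y for the current character,
    // given the previous column v, the transition table trans, and emission score em.
    static double step(Map<Character, Double> v,
                       Map<Character, Map<Character, Double>> trans,
                       char y, double em, char[] prevStates) {
        double best = MIN_FLOAT;
        for (char y0 : prevStates) {
            double tr = trans.get(y0).getOrDefault(y, MIN_FLOAT); // unseen transition
            double prev = v.getOrDefault(y0, MIN_FLOAT);
            best = Math.max(best, prev + tr + em);
        }
        return best;
    }
}
```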
85 changes: 0 additions & 85 deletions src/main/java/com/pycredit/fenci/utils/Item.java

This file was deleted.

65 changes: 0 additions & 65 deletions src/main/java/com/pycredit/fenci/utils/SegItem.java

This file was deleted.

45 changes: 0 additions & 45 deletions src/main/java/com/pycredit/fenci/utils/TxtUtil.java

This file was deleted.
