Skip to content

Commit

Permalink
fix:提供新的方法处理自定义用户词典分词带空格的情况(huaban#137)
Browse files Browse the repository at this point in the history
  • Loading branch information
littlesparklet committed Nov 6, 2024
1 parent e46e44b commit 6d7c24a
Show file tree
Hide file tree
Showing 5 changed files with 212 additions and 0 deletions.
2 changes: 2 additions & 0 deletions conf/userextend.dict
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
弹性公网IP,3
IPSEC VPN,3
18 changes: 18 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/CharacterUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
public class CharacterUtil {
public static Pattern reSkip = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");
private static final char[] connectors = new char[] { '+', '#', '&', '.', '_', '-' };
private static final char[] connectorsExtend = new char[] { '+', '#', '&', '.', '_', '-', ' ' };


public static boolean isChineseLetter(char ch) {
Expand Down Expand Up @@ -36,6 +37,12 @@ public static boolean isConnector(char ch) {
return false;
}

/**
 * Returns {@code true} when {@code ch} is one of the extended connector
 * characters (the standard connectors plus the space character).
 */
public static boolean isConnectorExtend(char ch) {
    for (int i = 0; i < connectorsExtend.length; i++) {
        if (connectorsExtend[i] == ch) {
            return true;
        }
    }
    return false;
}

public static boolean ccFind(char ch) {
if (isChineseLetter(ch))
Expand All @@ -49,6 +56,17 @@ public static boolean ccFind(char ch) {
return false;
}

/**
 * Extended variant of {@code ccFind}: a character belongs to a token run when
 * it is a Chinese letter, an English letter, a digit, or an extended connector
 * (which, unlike the standard set, includes the space character).
 */
public static boolean ccFindExtend(char ch) {
    return isChineseLetter(ch)
            || isEnglishLetter(ch)
            || isDigit(ch)
            || isConnectorExtend(ch);
}

/**
* 全角 to 半角,大写 to 小写
Expand Down
80 changes: 80 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,86 @@ public List<SegToken> process(String paragraph, SegMode mode) {
return tokens;
}

/**
 * Variant of {@code process} that keeps spaces inside token runs (via
 * {@link CharacterUtil#ccFindExtend(char)}), so user-dictionary entries that
 * contain spaces (e.g. "IPSEC VPN") can be matched.
 *
 * @param paragraph the text to segment
 * @param mode      SEARCH emits only the segmented words; any other mode
 *                  (INDEX) additionally emits dictionary 2-grams and 3-grams
 *                  found inside each word
 * @return the tokens with their [start, end) offsets in {@code paragraph}
 */
public List<SegToken> processExtend(String paragraph, SegMode mode) {
    List<SegToken> tokens = new ArrayList<SegToken>();
    StringBuilder sb = new StringBuilder();
    int offset = 0;
    for (int i = 0; i < paragraph.length(); ++i) {
        char ch = CharacterUtil.regularize(paragraph.charAt(i));
        if (CharacterUtil.ccFindExtend(ch)) {
            sb.append(ch);
        } else {
            if (sb.length() > 0) {
                // Segment the accumulated run, then restart it at the current position.
                offset = segmentRunExtend(sb.toString(), mode, offset, tokens);
                sb = new StringBuilder();
                offset = i;
            }
            // Non-token character: always emitted as a single-character token.
            // (The previous version did a wordDict.containsWord lookup here whose
            // two branches were identical — the lookup was dead work.)
            tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
        }
    }
    // Flush a trailing run that reached the end of the paragraph.
    if (sb.length() > 0) {
        segmentRunExtend(sb.toString(), mode, offset, tokens);
    }
    return tokens;
}

/**
 * Segments one contiguous token run and appends the resulting tokens.
 * In non-SEARCH (INDEX) mode, also emits dictionary n-grams for each word.
 *
 * @return the offset immediately after the run
 */
private int segmentRunExtend(String run, SegMode mode, int offset, List<SegToken> tokens) {
    for (String token : sentenceProcess(run)) {
        if (mode != SegMode.SEARCH) {
            addDictNgramsExtend(token, 2, offset, tokens);
            addDictNgramsExtend(token, 3, offset, tokens);
        }
        tokens.add(new SegToken(token, offset, offset + token.length()));
        offset += token.length();
    }
    return offset;
}

/**
 * Emits every length-{@code n} substring of {@code token} that is a dictionary
 * word. Skipped entirely when the token is not strictly longer than {@code n}
 * (matching the original behavior, where a token equal in length to the gram
 * produced no grams).
 */
private void addDictNgramsExtend(String token, int n, int offset, List<SegToken> tokens) {
    if (token.length() <= n) {
        return;
    }
    for (int j = 0; j + n <= token.length(); ++j) {
        String gram = token.substring(j, j + n);
        if (wordDict.containsWord(gram)) {
            tokens.add(new SegToken(gram, offset + j, offset + j + n));
        }
    }
}

/*
*
Expand Down
57 changes: 57 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/WordDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,27 @@ public void init(String[] paths) {
}
}
}

/**
 * Initializes the dictionary from every {@code *.dict} file in
 * {@code configFile}, splitting each entry with the caller-supplied separator.
 * Idempotent per absolute path: an already-loaded directory is skipped.
 *
 * @param configFile directory containing user dictionary files
 * @param splitChar  separator between word and frequency in each line
 */
public void init(Path configFile, String splitChar) {
    String abspath = configFile.toAbsolutePath().toString();
    Log.debug("initialize user dictionary:" + abspath);
    synchronized (WordDictionary.class) {
        if (loadedPath.contains(abspath))
            return;

        // try-with-resources: the DirectoryStream was previously never closed,
        // leaking a directory handle on every call.
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(configFile,
                String.format(Locale.getDefault(), "*%s", USER_DICT_SUFFIX))) {
            for (Path path : stream) {
                // Informational progress message; was logged at error level by mistake.
                Log.debug(String.format(Locale.getDefault(), "loading dict %s", path.toString()));
                singleton.loadUserDict(path, splitChar);
            }
            loadedPath.add(abspath);
        } catch (IOException e) {
            Log.error(String.format(Locale.getDefault(), "%s: load user dict failure!", configFile.toString()));
        }
    }
}

/**
* let user just use their own dict instead of the default dict
Expand Down Expand Up @@ -156,6 +177,10 @@ public void loadUserDict(Path userDict) {
loadUserDict(userDict, StandardCharsets.UTF_8);
}

/**
 * Loads a user dictionary file encoded in UTF-8, using the caller-supplied
 * separator between word and frequency.
 */
public void loadUserDict(Path userDict, String splitChar) {
    this.loadUserDict(userDict, StandardCharsets.UTF_8, splitChar);
}

public void loadUserDict(String userDictPath) {
loadUserDict(userDictPath, StandardCharsets.UTF_8);
}
Expand Down Expand Up @@ -223,6 +248,38 @@ public void loadUserDict(String userDictPath, Charset charset) {
Log.error(String.format(Locale.getDefault(), "%s: load user dict failure!", userDictPath));
}
}

/**
 * Loads a user dictionary file where each line is {@code word<splitChar>freq}.
 * Lines with no frequency column use a default frequency of 3.0.
 *
 * @param userDict  dictionary file to read
 * @param charset   encoding of the file
 * @param splitChar separator between word and frequency.
 *                  NOTE(review): this is passed straight to {@link String#split},
 *                  which treats it as a regular expression — callers using
 *                  characters such as "|" or "." should pre-quote them
 *                  (Pattern.quote); verify against all callers.
 */
public void loadUserDict(Path userDict, Charset charset, String splitChar) {
    long start = System.currentTimeMillis();
    int count = 0;
    // try-with-resources: the previous version leaked the reader when an
    // IOException was thrown mid-read (close() was only on the happy path).
    try (BufferedReader br = Files.newBufferedReader(userDict, charset)) {
        while (br.ready()) {
            String line = br.readLine();
            String[] tokens = line.split(splitChar);

            if (tokens.length < 1) {
                // Ignore empty line
                continue;
            }

            String word = tokens[0];

            double freq = 3.0d; // default frequency when the entry omits one
            if (tokens.length == 2)
                freq = Double.parseDouble(tokens[1]); // parseDouble avoids the boxed Double
            word = addWord(word);
            freqs.put(word, Math.log(freq / total));
            count++;
        }
        Log.debug(String.format(Locale.getDefault(),
                "user dict %s load finished, tot words:%d, time elapsed:%dms",
                userDict.toString(), count, System.currentTimeMillis() - start));
    } catch (IOException e) {
        Log.error(String.format(Locale.getDefault(), "%s: load user dict failure!", userDict.toString()));
    }
}

public DictSegment getTrie() {
return this._dict;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
*
*/
package com.huaban.analysis.jieba;

import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import junit.framework.TestCase;
import org.junit.Test;

import java.nio.file.Paths;
import java.util.List;
import java.util.Locale;


/**
* @author matrix
*
*/
/**
 * Exercises {@link JiebaSegmenter#processExtend} with user-dictionary entries
 * that contain spaces (loaded from conf/ with ',' as the separator).
 */
public class JiebaSegmenterExtendTest extends TestCase {
    private JiebaSegmenter segmenter = new JiebaSegmenter();
    String[] sentences = new String[] { "订购弹性公网IP", "订购IPSEC VPN" };

    @Override
    protected void setUp() throws Exception {
        // Load every user dict under conf/, splitting word/frequency on ','.
        WordDictionary.getInstance().init(Paths.get("conf"), ",");
    }

    @Override
    protected void tearDown() throws Exception {
        super.tearDown();
    }

    @Test
    public void testCutForSearch() {
        segmentAndPrint(SegMode.SEARCH);
    }

    @Test
    public void testCutForIndex() {
        segmentAndPrint(SegMode.INDEX);
    }

    // Shared driver: segment each fixture sentence and print the tokens.
    private void segmentAndPrint(SegMode mode) {
        for (String sentence : sentences) {
            List<SegToken> tokens = segmenter.processExtend(sentence, mode);
            System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString()));
        }
    }
}

0 comments on commit 6d7c24a

Please sign in to comment.