Skip to content

Commit

Permalink
fix:提供新的方法处理自定义用户词典分词带空格的情况(huaban#137)
Browse files Browse the repository at this point in the history
  • Loading branch information
littlesparklet committed Nov 6, 2024
1 parent e46e44b commit 6d7c24a
Show file tree
Hide file tree
Showing 5 changed files with 212 additions and 0 deletions.
2 changes: 2 additions & 0 deletions conf/userextend.dict
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
弹性公网IP,3
IPSEC VPN,3
18 changes: 18 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/CharacterUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
public class CharacterUtil {
public static Pattern reSkip = Pattern.compile("(\\d+\\.\\d+|[a-zA-Z0-9]+)");
private static final char[] connectors = new char[] { '+', '#', '&', '.', '_', '-' };
private static final char[] connectorsExtend = new char[] { '+', '#', '&', '.', '_', '-', ' ' };


public static boolean isChineseLetter(char ch) {
Expand Down Expand Up @@ -36,6 +37,12 @@ public static boolean isConnector(char ch) {
return false;
}

/**
 * Returns {@code true} when {@code ch} is one of the extended connector
 * characters (the standard connectors plus the space character).
 */
public static boolean isConnectorExtend(char ch) {
    for (int i = 0; i < connectorsExtend.length; i++) {
        if (connectorsExtend[i] == ch) {
            return true;
        }
    }
    return false;
}

public static boolean ccFind(char ch) {
if (isChineseLetter(ch))
Expand All @@ -49,6 +56,17 @@ public static boolean ccFind(char ch) {
return false;
}

/**
 * Extended variant of {@code ccFind}: a character belongs to a token run when
 * it is a Chinese letter, an English letter, a digit, or an extended connector
 * (which, unlike the standard set, includes the space character).
 */
public static boolean ccFindExtend(char ch) {
    return isChineseLetter(ch)
            || isEnglishLetter(ch)
            || isDigit(ch)
            || isConnectorExtend(ch);
}

/**
* 全角 to 半角,大写 to 小写
Expand Down
80 changes: 80 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/JiebaSegmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,86 @@ public List<SegToken> process(String paragraph, SegMode mode) {
return tokens;
}

/**
 * Variant of {@code process} that keeps spaces inside token runs (via
 * {@link CharacterUtil#ccFindExtend(char)}), so user-dictionary entries that
 * contain spaces (e.g. "IPSEC VPN") can be matched.
 *
 * @param paragraph the text to segment
 * @param mode      SEARCH emits only the segmented words; any other mode
 *                  (INDEX) additionally emits dictionary 2-grams and 3-grams
 *                  found inside each word
 * @return the tokens with their [start, end) offsets in {@code paragraph}
 */
public List<SegToken> processExtend(String paragraph, SegMode mode) {
    List<SegToken> tokens = new ArrayList<SegToken>();
    StringBuilder sb = new StringBuilder();
    int offset = 0;
    for (int i = 0; i < paragraph.length(); ++i) {
        char ch = CharacterUtil.regularize(paragraph.charAt(i));
        if (CharacterUtil.ccFindExtend(ch)) {
            sb.append(ch);
        } else {
            if (sb.length() > 0) {
                // Segment the accumulated run, then restart it at the current position.
                offset = segmentRunExtend(sb.toString(), mode, offset, tokens);
                sb = new StringBuilder();
                offset = i;
            }
            // Non-token character: always emitted as a single-character token.
            // (The previous version did a wordDict.containsWord lookup here whose
            // two branches were identical — the lookup was dead work.)
            tokens.add(new SegToken(paragraph.substring(i, i + 1), offset, ++offset));
        }
    }
    // Flush a trailing run that reached the end of the paragraph.
    if (sb.length() > 0) {
        segmentRunExtend(sb.toString(), mode, offset, tokens);
    }
    return tokens;
}

/**
 * Segments one contiguous token run and appends the resulting tokens.
 * In non-SEARCH (INDEX) mode, also emits dictionary n-grams for each word.
 *
 * @return the offset immediately after the run
 */
private int segmentRunExtend(String run, SegMode mode, int offset, List<SegToken> tokens) {
    for (String token : sentenceProcess(run)) {
        if (mode != SegMode.SEARCH) {
            addDictNgramsExtend(token, 2, offset, tokens);
            addDictNgramsExtend(token, 3, offset, tokens);
        }
        tokens.add(new SegToken(token, offset, offset + token.length()));
        offset += token.length();
    }
    return offset;
}

/**
 * Emits every length-{@code n} substring of {@code token} that is a dictionary
 * word. Skipped entirely when the token is not strictly longer than {@code n}
 * (matching the original behavior, where a token equal in length to the gram
 * produced no grams).
 */
private void addDictNgramsExtend(String token, int n, int offset, List<SegToken> tokens) {
    if (token.length() <= n) {
        return;
    }
    for (int j = 0; j + n <= token.length(); ++j) {
        String gram = token.substring(j, j + n);
        if (wordDict.containsWord(gram)) {
            tokens.add(new SegToken(gram, offset + j, offset + j + n));
        }
    }
}

/*
*
Expand Down
57 changes: 57 additions & 0 deletions src/main/java/com/huaban/analysis/jieba/WordDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,27 @@ public void init(String[] paths) {
}
}
}

/**
 * Initializes the dictionary from every {@code *.dict} file in
 * {@code configFile}, splitting each entry with the caller-supplied separator.
 * Idempotent per absolute path: an already-loaded directory is skipped.
 *
 * @param configFile directory containing user dictionary files
 * @param splitChar  separator between word and frequency in each line
 */
public void init(Path configFile, String splitChar) {
    String abspath = configFile.toAbsolutePath().toString();
    Log.debug("initialize user dictionary:" + abspath);
    synchronized (WordDictionary.class) {
        if (loadedPath.contains(abspath))
            return;

        // try-with-resources: the DirectoryStream was previously never closed,
        // leaking a directory handle on every call.
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(configFile,
                String.format(Locale.getDefault(), "*%s", USER_DICT_SUFFIX))) {
            for (Path path : stream) {
                // Informational progress message; was logged at error level by mistake.
                Log.debug(String.format(Locale.getDefault(), "loading dict %s", path.toString()));
                singleton.loadUserDict(path, splitChar);
            }
            loadedPath.add(abspath);
        } catch (IOException e) {
            Log.error(String.format(Locale.getDefault(), "%s: load user dict failure!", configFile.toString()));
        }
    }
}

/**
* let user just use their own dict instead of the default dict
Expand Down Expand Up @@ -156,6 +177,10 @@ public void loadUserDict(Path userDict) {
loadUserDict(userDict, StandardCharsets.UTF_8);
}

/**
 * Loads a user dictionary file encoded in UTF-8, using the caller-supplied
 * separator between word and frequency.
 */
public void loadUserDict(Path userDict, String splitChar) {
    this.loadUserDict(userDict, StandardCharsets.UTF_8, splitChar);
}

public void loadUserDict(String userDictPath) {
loadUserDict(userDictPath, StandardCharsets.UTF_8);
}
Expand Down Expand Up @@ -223,6 +248,38 @@ public void loadUserDict(String userDictPath, Charset charset) {
Log.error(String.format(Locale.getDefault(), "%s: load user dict failure!", userDictPath));
}
}

/**
 * Loads a user dictionary file where each line is {@code word<splitChar>freq}.
 * Lines with no frequency column use a default frequency of 3.0.
 *
 * @param userDict  dictionary file to read
 * @param charset   encoding of the file
 * @param splitChar separator between word and frequency.
 *                  NOTE(review): this is passed straight to {@link String#split},
 *                  which treats it as a regular expression — callers using
 *                  characters such as "|" or "." should pre-quote them
 *                  (Pattern.quote); verify against all callers.
 */
public void loadUserDict(Path userDict, Charset charset, String splitChar) {
    long start = System.currentTimeMillis();
    int count = 0;
    // try-with-resources: the previous version leaked the reader when an
    // IOException was thrown mid-read (close() was only on the happy path).
    try (BufferedReader br = Files.newBufferedReader(userDict, charset)) {
        while (br.ready()) {
            String line = br.readLine();
            String[] tokens = line.split(splitChar);

            if (tokens.length < 1) {
                // Ignore empty line
                continue;
            }

            String word = tokens[0];

            double freq = 3.0d; // default frequency when the entry omits one
            if (tokens.length == 2)
                freq = Double.parseDouble(tokens[1]); // parseDouble avoids the boxed Double
            word = addWord(word);
            freqs.put(word, Math.log(freq / total));
            count++;
        }
        Log.debug(String.format(Locale.getDefault(),
                "user dict %s load finished, tot words:%d, time elapsed:%dms",
                userDict.toString(), count, System.currentTimeMillis() - start));
    } catch (IOException e) {
        Log.error(String.format(Locale.getDefault(), "%s: load user dict failure!", userDict.toString()));
    }
}

public DictSegment getTrie() {
return this._dict;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
*
*/
package com.huaban.analysis.jieba;

import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import junit.framework.TestCase;
import org.junit.Test;

import java.nio.file.Paths;
import java.util.List;
import java.util.Locale;


/**
* @author matrix
*
*/
/**
 * Exercises {@link JiebaSegmenter#processExtend} with user-dictionary entries
 * that contain spaces (loaded from conf/ with ',' as the separator).
 */
public class JiebaSegmenterExtendTest extends TestCase {
    private JiebaSegmenter segmenter = new JiebaSegmenter();
    String[] sentences = new String[] { "订购弹性公网IP", "订购IPSEC VPN" };

    @Override
    protected void setUp() throws Exception {
        // Load every user dict under conf/, splitting word/frequency on ','.
        WordDictionary.getInstance().init(Paths.get("conf"), ",");
    }

    @Override
    protected void tearDown() throws Exception {
        super.tearDown();
    }

    @Test
    public void testCutForSearch() {
        segmentAndPrint(SegMode.SEARCH);
    }

    @Test
    public void testCutForIndex() {
        segmentAndPrint(SegMode.INDEX);
    }

    // Shared driver: segment each fixture sentence and print the tokens.
    private void segmentAndPrint(SegMode mode) {
        for (String sentence : sentences) {
            List<SegToken> tokens = segmenter.processExtend(sentence, mode);
            System.out.print(String.format(Locale.getDefault(), "\n%s\n%s", sentence, tokens.toString()));
        }
    }
}

0 comments on commit 6d7c24a

Please sign in to comment.