Refactor extractRefPatterns: Replace regex with optimized text parsing for enhanced accuracy and significant speedup
wtetsu · Jan 20, 2025 · 325af2b · 325af2b
1 parent 7706e71
commit 325af2b
Show file tree

Hide file tree

Showing 4 changed files with 114 additions and 54 deletions.
diff --git a/__test__/main/core/config.test.ts b/__test__/main/core/config.test.ts
@@ -33,7 +33,7 @@ test("parseSettings should handle various input cases", async () => {
       normalDialogStyles: null,
       movingDialogStyles: "",
       hiddenDialogStyles: "{",
-    }),
+    })
   ).toEqual({
     shortWordLength: 2,
     empty: "",

diff --git a/__test__/main/lib/text.test.ts b/__test__/main/lib/text.test.ts
@@ -31,27 +31,9 @@ test("linkWords function test", () => {
   testList(text.linkWords([]), []);
   testList(text.linkWords(["word0"]), ["word0"]);
   testList(text.linkWords(["word0", "word1"]), ["word0 word1", "word0"]);
-  testList(text.linkWords(["word0", "word1", "word2"]), [
-    "word0 word1 word2",
-    "word0 word1",
-    "word0",
-    "word0 ~ word2",
-    "word0 word2",
-  ]);
-  testList(text.linkWords(["announcement", "of", "hoge"]), [
-    "announcement of hoge",
-    "announcement of",
-    "announcement",
-    "announcement ~ hoge",
-    "announcement hoge",
-  ]);
-  testList(text.linkWords(["Announcement", "of", "Hoge"]), [
-    "Announcement of Hoge",
-    "Announcement of",
-    "Announcement",
-    "Announcement ~ Hoge",
-    "Announcement Hoge",
-  ]);
+  testList(text.linkWords(["word0", "word1", "word2"]), ["word0 word1 word2", "word0 word1", "word0", "word0 ~ word2", "word0 word2"]);
+  testList(text.linkWords(["announcement", "of", "hoge"]), ["announcement of hoge", "announcement of", "announcement", "announcement ~ hoge", "announcement hoge"]);
+  testList(text.linkWords(["Announcement", "of", "Hoge"]), ["Announcement of Hoge", "Announcement of", "Announcement", "Announcement ~ Hoge", "Announcement Hoge"]);
   testList(text.linkWords(["American", "English"]), [
     "American English", //
     "American",
@@ -160,3 +142,59 @@ const testList = (actualList, expectedList) => {
     expect(actualList.includes(expected)).toBeTruthy();
   }
 };
+
+test("extractRefPatternsInText basic cases", () => { // One or two markers per input, of either kind: <→...> and ＝...
+  const e = text.extractRefPatternsInText;
+
+  expect(e("")).toEqual([]); // empty input
+  expect(e("This is a test.")).toEqual([]); // no markers at all
+  expect(e("The disease <→actinobacillosis> occurred.")).toEqual(["actinobacillosis"]);
+  expect(e("Find <→pattern1> and <→pattern2> in text.")).toEqual(["pattern1", "pattern2"]); // results follow input order
+  expect(e("Caused by ＝fungus.")).toEqual(["fungus"]); // "." ends a ＝ pattern
+  expect(e("Affected by ＝bacteria and ＝virus.")).toEqual(["bacteria and", "virus"]); // spaces continue a ＝ pattern, so " and" is captured up to the next ＝
+  expect(e("Found <→object> and caused by ＝reason.")).toEqual(["object", "reason"]); // both marker kinds in one input
+});
+
+test("extractRefPatternsInText complex cases", () => { // Nesting, trimming, end-of-input and unterminated markers.
+  const e = text.extractRefPatternsInText;
+
+  expect(e("Nested <→outer<→inner>> pattern.")).toEqual(["outer<→inner"]); // the first ">" closes the marker; there is no nesting
+  expect(e("Check ＝valid valid.")).toEqual(["valid valid"]);
+  expect(e("Found ＝some pattern.")).toEqual(["some pattern"]);
+  expect(e("Special <→characters like @# > are skipped.")).toEqual(["characters like @#"]); // captured text is trimmed
+  expect(e("Example with trailing ＝word")).toEqual(["word"]); // a ＝ pattern may run to the end of the input
+  expect(e("Example with trailing ＝word.")).toEqual(["word"]);
+  expect(e("Unfinished <→pattern left open.")).toEqual([]); // no closing ">" means nothing is captured
+  expect(e("Check this ＝!invalid.")).toEqual([]); // "!" directly after ＝ yields an empty pattern, which is dropped
+});
+
+test("extractRefPatternsInText edge cases", () => { // Duplicates and letter case are returned as found.
+  const e = text.extractRefPatternsInText;
+
+  expect(e("Duplicate <→same> and <→same> patterns.")).toEqual(["same", "same"]); // duplicates are kept
+  expect(e("Repeated ＝test & ＝test.")).toEqual(["test", "test"]); // "&" ends the first ＝ pattern
+  expect(e("Upper ＝Case & ＝case.")).toEqual(["Case", "case"]); // letter case is preserved
+});
+
+test("extractRefPatternsInText tricky cases", () => { // Look-alike characters, whitespace, non-ASCII and long inputs.
+  const e = text.extractRefPatternsInText;
+
+  expect(e("<→open and =wrongly formatted>")).toEqual(["open and =wrongly formatted"]); // ASCII "=" is not the ＝ marker
+  expect(e("Mismatched <→open and no closing")).toEqual([]); // no closing ">"
+  expect(e("Nested weirdness <→<→inner> outer>>")).toEqual(["<→inner"]); // the first ">" closes the outer marker
+
+  expect(e("<→special@chars!>")).toEqual(["special@chars!"]); // <→...> accepts any characters
+  expect(e("＝including 123 numbers ")).toEqual(["including"]); // digits end a ＝ pattern
+  expect(e("Spaces <→ surrounded by > extra")).toEqual(["surrounded by"]); // outer spaces are trimmed
+  expect(e("＝   leading spaces")).toEqual(["leading spaces"]); // leading spaces are trimmed
+  expect(e("＝ spaces  in  the  middle  are  kept ")).toEqual(["spaces  in  the  middle  are  kept"]); // only outer whitespace is trimmed
+
+  expect(e("Non-ASCII <→あいうえお>")).toEqual(["あいうえお"]);
+  expect(e("Mixing ASCII and ＝日本語 patterns")).toEqual([]); // ＝ accepts only ASCII letters and spaces
+
+  expect(e(`<→${"x".repeat(1000)}>`)).toEqual([`${"x".repeat(1000)}`]); // long <→...> pattern
+  expect(e(`＝${"y".repeat(1000)} with trailing space`)).toEqual([`${"y".repeat(1000)} with trailing space`]); // long ＝ pattern running to end of input
+
+  expect(e("＝pattern with trailing: colon")).toEqual(["pattern with trailing"]); // ":" ends a ＝ pattern
+  expect(e("<→pattern ending with punctuation.")).toEqual([]); // no closing ">"
+});
diff --git a/src/main/core/lookuper.js b/src/main/core/lookuper.js
@@ -8,6 +8,7 @@ import dom from "../lib/dom";
 import ShortCache from "../lib/shortcache";
 import storage from "../lib/storage";
 import utils from "../lib/utils";
+import text from "../lib/text";
 import Generator from "./generator";
 
 const TEXT_LENGTH_LIMIT = 128;
@@ -29,11 +30,6 @@ export default class Lookuper {
     this.generator = new Generator(settings);
     const cacheSize = process.env.NODE_ENV === "production" ? 100 : 0;
     this.shortCache = new ShortCache(cacheSize);
-
-    // String.prototype.matchAll may not exist(#44)
-    if (String.prototype.matchAll) {
-      this.reForReferences = /[→＝]([- A-z']+)/g;
-    }
   }
 
   #canUpdate() {
@@ -137,17 +133,17 @@ export default class Lookuper {
       allEntries.push(...entries);
       langs.push(lang);
     }
-    const { heads, descriptions } = await fetchDescriptions(allEntries, this.reForReferences);
+    const { heads, descriptions } = await fetchDescriptions(allEntries);
     const { html, hitCount } = this.generator.generate(heads, descriptions, enableShortWord && langs[0] === "en");
     return { html, hit: hitCount };
   }
 }
 
-const fetchDescriptions = async (entries, reForReferences) => {
+const fetchDescriptions = async (entries) => {
   const primaryDescriptions = await storage.local.get(entries);
   const primaryHeads = entries.filter((e) => primaryDescriptions[e]);
 
-  const refHeads = pickOutRefs(primaryDescriptions, reForReferences);
+  const refHeads = extractRefPatterns(primaryDescriptions);
   if (refHeads.length === 0) {
     return { heads: primaryHeads, descriptions: primaryDescriptions };
   }
@@ -158,34 +154,15 @@ const fetchDescriptions = async (entries, reForReferences) => {
   return { heads, descriptions };
 };
 
-const pickOutRefs = (descriptions, reForReferences) => {
+const extractRefPatterns = (descriptions) => { // Returns an array of the unique reference patterns found across all description values.
   const resultSet = new Set();
-  if (!reForReferences) {
-    return resultSet;
-  }
-  const existingKeys = new Set(Object.keys(descriptions));
   const descList = Object.values(descriptions);
 
   for (let i = 0; i < descList.length; i++) {
-    const desc = descList[i];
-    const refList = capture(desc, reForReferences);
-
-    for (let i = 0; i < refList.length; i++) {
-      const ref = refList[i];
-      if (existingKeys.has(ref)) {
-        continue;
-      }
+    const refList = text.extractRefPatternsInText(descList[i]);
+    for (const ref of refList) { // NOTE(review): refs that already exist as keys of descriptions are no longer skipped here — confirm this is intended
       resultSet.add(ref);
     }
   }
   return Array.from(resultSet);
 };
-
-const capture = (str, re) => {
-  const capturedStrings = [];
-  const matches = str.matchAll(re);
-  for (const m of matches) {
-    capturedStrings.push(m[1]);
-  }
-  return capturedStrings;
-};
diff --git a/src/main/lib/text.js b/src/main/lib/text.js
@@ -18,7 +18,7 @@ text.dealWithHyphens = (sourceStr, doIsValidCharacter = isValidCharacter) => {
   let result = "";
   let currentIndex = 0;
 
-  for (;;) {
+  for (; ;) {
     if (currentIndex >= str.length) {
       result += str.substring(currentIndex);
       break;
@@ -61,7 +61,7 @@ text.splitIntoWords = (str, doIsValidCharacter = isValidCharacter) => {
   const words = [];
   let startIndex = null;
   let i = 0;
-  for (;;) {
+  for (; ;) {
     const code = str.charCodeAt(i);
     const isEnglishCharacter = doIsValidCharacter(code);
     if (isEnglishCharacter) {
@@ -191,6 +191,51 @@ text.linkWords = (words, minWordNum = 1, enablePhrasing = true) => {
   return result1;
 };
 
+
+/**
+ * Extracts patterns from the input string based on specific markers.
+ *
+ * 1: <→...> : Any characters between "<→" and ">".
+ * 2: ＝... Alphabet characters and spaces after "＝".
+ */
+text.extractRefPatternsInText = (input) => {
+  const results = [];
+
+  let i = 0;
+
+  while (i < input.length) {
+    if (input[i] === '<' && input[i + 1] === '→') {
+      const start = i + 2;
+      const end = input.indexOf('>', start);
+      if (end !== -1) {
+        const w = input.slice(start, end).trim();
+        results.push(w);
+        i = end + 1;
+        continue;
+      }
+    }
+
+    if (input[i] === '＝') {
+      const start = i + 1;
+      let end = start;
+      while (end < input.length && input[end] >= 'A' && input[end] <= 'Z' || input[end] >= 'a' && input[end] <= 'z' || input[end] === ' ') {
+        end++;
+      }
+      const w = input.slice(start, end).trim();
+      if (w) {
+        results.push(w);
+      }
+      i = end;
+      continue;
+    }
+
+    i++;
+  }
+
+  return results;
+}
+
+
 const makeLinkedWords = (wordList, minWordNum, enablePhrasing = true) => {
   const linkedWords = [];
   const phraseProcessedWords = [];