Skip to content

Commit

Permalink
Refactor extractRefPatterns: Replace regex with optimized text parsin…
Browse files Browse the repository at this point in the history
…g for enhanced accuracy and significant speedup
  • Loading branch information
wtetsu committed Jan 20, 2025
1 parent 7706e71 commit 325af2b
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 54 deletions.
2 changes: 1 addition & 1 deletion __test__/main/core/config.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ test("parseSettings should handle various input cases", async () => {
normalDialogStyles: null,
movingDialogStyles: "",
hiddenDialogStyles: "{",
}),
})
).toEqual({
shortWordLength: 2,
empty: "",
Expand Down
80 changes: 59 additions & 21 deletions __test__/main/lib/text.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,27 +31,9 @@ test("linkWords function test", () => {
testList(text.linkWords([]), []);
testList(text.linkWords(["word0"]), ["word0"]);
testList(text.linkWords(["word0", "word1"]), ["word0 word1", "word0"]);
testList(text.linkWords(["word0", "word1", "word2"]), [
"word0 word1 word2",
"word0 word1",
"word0",
"word0 ~ word2",
"word0 word2",
]);
testList(text.linkWords(["announcement", "of", "hoge"]), [
"announcement of hoge",
"announcement of",
"announcement",
"announcement ~ hoge",
"announcement hoge",
]);
testList(text.linkWords(["Announcement", "of", "Hoge"]), [
"Announcement of Hoge",
"Announcement of",
"Announcement",
"Announcement ~ Hoge",
"Announcement Hoge",
]);
testList(text.linkWords(["word0", "word1", "word2"]), ["word0 word1 word2", "word0 word1", "word0", "word0 ~ word2", "word0 word2"]);
testList(text.linkWords(["announcement", "of", "hoge"]), ["announcement of hoge", "announcement of", "announcement", "announcement ~ hoge", "announcement hoge"]);
testList(text.linkWords(["Announcement", "of", "Hoge"]), ["Announcement of Hoge", "Announcement of", "Announcement", "Announcement ~ Hoge", "Announcement Hoge"]);
testList(text.linkWords(["American", "English"]), [
"American English", //
"American",
Expand Down Expand Up @@ -160,3 +142,59 @@ const testList = (actualList, expectedList) => {
expect(actualList.includes(expected)).toBeTruthy();
}
};

// Inputs without markers, and the simplest uses of the "<→...>" and "=..." markers.
test("extractRefPatternsInText basic cases", () => {
  const extract = text.extractRefPatternsInText;

  // Each row is [input text, expected extracted patterns].
  const cases = [
    ["", []],
    ["This is a test.", []],
    ["The disease <→actinobacillosis> occurred.", ["actinobacillosis"]],
    ["Find <→pattern1> and <→pattern2> in text.", ["pattern1", "pattern2"]],
    ["Caused by =fungus.", ["fungus"]],
    ["Affected by =bacteria and =virus.", ["bacteria and", "virus"]],
    ["Found <→object> and caused by =reason.", ["object", "reason"]],
  ];

  for (const [input, expected] of cases) {
    expect(extract(input)).toEqual(expected);
  }
});

// Nested markers, multi-word "=" patterns, unterminated "<→" and "=" followed by a non-letter.
test("extractRefPatternsInText complex cases", () => {
  const extract = text.extractRefPatternsInText;

  // Each row is [input text, expected extracted patterns].
  const cases = [
    ["Nested <→outer<→inner>> pattern.", ["outer<→inner"]],
    ["Check =valid valid.", ["valid valid"]],
    ["Found =some pattern.", ["some pattern"]],
    ["Special <→characters like @# > are skipped.", ["characters like @#"]],
    ["Example with trailing =word", ["word"]],
    ["Example with trailing =word.", ["word"]],
    ["Unfinished <→pattern left open.", []],
    ["Check this =!invalid.", []],
  ];

  for (const [input, expected] of cases) {
    expect(extract(input)).toEqual(expected);
  }
});

// Duplicates are reported once per occurrence and letter case is preserved.
test("extractRefPatternsInText edge cases", () => {
  const extract = text.extractRefPatternsInText;

  // Each row is [input text, expected extracted patterns].
  const cases = [
    ["Duplicate <→same> and <→same> patterns.", ["same", "same"]],
    ["Repeated =test & =test.", ["test", "test"]],
    ["Upper =Case & =case.", ["Case", "case"]],
  ];

  for (const [input, expected] of cases) {
    expect(extract(input)).toEqual(expected);
  }
});

// Mixed/mismatched markers, whitespace handling, non-ASCII text, long inputs and punctuation.
test("extractRefPatternsInText tricky cases", () => {
  const extract = text.extractRefPatternsInText;
  const longX = "x".repeat(1000);
  const longY = "y".repeat(1000);

  // Each row is [input text, expected extracted patterns].
  const cases = [
    // Marker combinations and unbalanced brackets.
    ["<→open and =wrongly formatted>", ["open and =wrongly formatted"]],
    ["Mismatched <→open and no closing", []],
    ["Nested weirdness <→<→inner> outer>>", ["<→inner"]],

    // Special characters, digits and surrounding spaces.
    ["<→special@chars!>", ["special@chars!"]],
    ["=including 123 numbers ", ["including"]],
    ["Spaces <→ surrounded by > extra", ["surrounded by"]],
    ["= leading spaces", ["leading spaces"]],
    ["= spaces in the middle are kept ", ["spaces in the middle are kept"]],

    // Non-ASCII content is accepted inside "<→...>" but not after "=".
    ["Non-ASCII <→あいうえお>", ["あいうえお"]],
    ["Mixing ASCII and =日本語 patterns", []],

    // Long inputs.
    [`<→${longX}>`, [longX]],
    [`=${longY} with trailing space`, [`${longY} with trailing space`]],

    // Punctuation ends a "=" pattern; a missing ">" yields nothing.
    ["=pattern with trailing: colon", ["pattern with trailing"]],
    ["<→pattern ending with punctuation.", []],
  ];

  for (const [input, expected] of cases) {
    expect(extract(input)).toEqual(expected);
  }
});
37 changes: 7 additions & 30 deletions src/main/core/lookuper.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import dom from "../lib/dom";
import ShortCache from "../lib/shortcache";
import storage from "../lib/storage";
import utils from "../lib/utils";
import text from "../lib/text";
import Generator from "./generator";

const TEXT_LENGTH_LIMIT = 128;
Expand All @@ -29,11 +30,6 @@ export default class Lookuper {
this.generator = new Generator(settings);
const cacheSize = process.env.NODE_ENV === "production" ? 100 : 0;
this.shortCache = new ShortCache(cacheSize);

// String.prototype.matchAll may not exist(#44)
if (String.prototype.matchAll) {
this.reForReferences = /[]([- A-z']+)/g;
}
}

#canUpdate() {
Expand Down Expand Up @@ -137,17 +133,17 @@ export default class Lookuper {
allEntries.push(...entries);
langs.push(lang);
}
const { heads, descriptions } = await fetchDescriptions(allEntries, this.reForReferences);
const { heads, descriptions } = await fetchDescriptions(allEntries);
const { html, hitCount } = this.generator.generate(heads, descriptions, enableShortWord && langs[0] === "en");
return { html, hit: hitCount };
}
}

const fetchDescriptions = async (entries, reForReferences) => {
const fetchDescriptions = async (entries) => {
const primaryDescriptions = await storage.local.get(entries);
const primaryHeads = entries.filter((e) => primaryDescriptions[e]);

const refHeads = pickOutRefs(primaryDescriptions, reForReferences);
const refHeads = extractRefPatterns(primaryDescriptions);
if (refHeads.length === 0) {
return { heads: primaryHeads, descriptions: primaryDescriptions };
}
Expand All @@ -158,34 +154,15 @@ const fetchDescriptions = async (entries, reForReferences) => {
return { heads, descriptions };
};

const pickOutRefs = (descriptions, reForReferences) => {
const extractRefPatterns = (descriptions) => {
const resultSet = new Set();
if (!reForReferences) {
return resultSet;
}
const existingKeys = new Set(Object.keys(descriptions));
const descList = Object.values(descriptions);

for (let i = 0; i < descList.length; i++) {
const desc = descList[i];
const refList = capture(desc, reForReferences);

for (let i = 0; i < refList.length; i++) {
const ref = refList[i];
if (existingKeys.has(ref)) {
continue;
}
const refList = text.extractRefPatternsInText(descList[i]);
for (const ref of refList) {
resultSet.add(ref);
}
}
return Array.from(resultSet);
};

/**
 * Returns the first capture group of every match of `re` within `str`,
 * in match order.
 *
 * `re` must have the global flag because the matches are enumerated with
 * String.prototype.matchAll, which throws a TypeError otherwise.
 */
const capture = (str, re) => Array.from(str.matchAll(re), (match) => match[1]);
49 changes: 47 additions & 2 deletions src/main/lib/text.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ text.dealWithHyphens = (sourceStr, doIsValidCharacter = isValidCharacter) => {
let result = "";
let currentIndex = 0;

for (;;) {
for (; ;) {
if (currentIndex >= str.length) {
result += str.substring(currentIndex);
break;
Expand Down Expand Up @@ -61,7 +61,7 @@ text.splitIntoWords = (str, doIsValidCharacter = isValidCharacter) => {
const words = [];
let startIndex = null;
let i = 0;
for (;;) {
for (; ;) {
const code = str.charCodeAt(i);
const isEnglishCharacter = doIsValidCharacter(code);
if (isEnglishCharacter) {
Expand Down Expand Up @@ -191,6 +191,51 @@ text.linkWords = (words, minWordNum = 1, enablePhrasing = true) => {
return result1;
};


/**
 * Extracts reference patterns from the input string based on two markers.
 *
 *   1. "<→...>" : any characters between "<→" and the next ">".
 *   2. "=..."   : the run of ASCII letters (A-Z, a-z) and spaces after "=".
 *
 * Every captured pattern is trimmed, and patterns that are empty after
 * trimming are skipped. A "<→" without a closing ">" yields nothing, and
 * scanning simply continues with the following characters. Patterns are
 * returned in order of appearance; duplicates are kept.
 *
 * @param {string} input Text to scan.
 * @returns {string[]} The extracted patterns.
 */
text.extractRefPatternsInText = (input) => {
  const results = [];

  let i = 0;

  while (i < input.length) {
    if (input[i] === "<" && input[i + 1] === "→") {
      const start = i + 2;
      const end = input.indexOf(">", start);
      if (end !== -1) {
        const w = input.slice(start, end).trim();
        // Skip empty patterns such as "<→>" or "<→  >", like the "=" form does.
        if (w) {
          results.push(w);
        }
        // Resume right after the closing ">"; markers inside the span are not rescanned.
        i = end + 1;
        continue;
      }
      // No closing ">": fall through and keep scanning character by character.
    }

    if (input[i] === "=") {
      const start = i + 1;
      let end = start;
      // Explicit grouping so that the bounds check guards every character test
      // ("&&" binds tighter than "||").
      while (
        end < input.length &&
        ((input[end] >= "A" && input[end] <= "Z") ||
          (input[end] >= "a" && input[end] <= "z") ||
          input[end] === " ")
      ) {
        end++;
      }
      const w = input.slice(start, end).trim();
      if (w) {
        results.push(w);
      }
      // `end` is at least `start` (= i + 1), so the scan always advances.
      i = end;
      continue;
    }

    i++;
  }

  return results;
};


const makeLinkedWords = (wordList, minWordNum, enablePhrasing = true) => {
const linkedWords = [];
const phraseProcessedWords = [];
Expand Down

0 comments on commit 325af2b

Please sign in to comment.