Skip to content

Commit

Permalink
Do not take OCR languages into account when reading the cache
Browse files Browse the repository at this point in the history
This may fix #37
  • Loading branch information
scambier committed Jan 23, 2024
1 parent 7baa897 commit 15feb4a
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 18 deletions.
11 changes: 2 additions & 9 deletions lib/src/cache.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,13 @@ export function getCachePath(file: TFile): {
* @param optLangs
* @returns
*/
export async function readCache(
file: TFile,
optLangs = ''
): Promise<ExtractedText | null> {
export async function readCache(file: TFile): Promise<ExtractedText | null> {
const cachePath = getCachePath(file)

// Get the text from the cache if it exists
if (await app.vault.adapter.exists(cachePath.fullpath)) {
const raw = await app.vault.adapter.read(cachePath.fullpath)
const cache = JSON.parse(raw) as ExtractedText
// Check that the languages list has not changed since the cache was created
if (cache.langs === optLangs) {
return cache
}
return JSON.parse(raw) as ExtractedText
}
return null
}
Expand Down
32 changes: 24 additions & 8 deletions lib/src/ocr/ocr-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ import type { ocrLangs } from './ocr-langs'
* Concatenates an array of langs to a single string to be passed to Tesseract
* e.g. ['fra', 'eng'] => 'eng+fra'
* The langs are sorted alphabetically because it's also used as a cache key
* @param langs
* @returns
* @param langs
* @returns
*/
function concatLangs (langs: Array<typeof ocrLangs[number]>): string {
function concatLangs(langs: Array<(typeof ocrLangs)[number]>): string {
return langs.sort().join('+')
}

Expand Down Expand Up @@ -102,7 +102,11 @@ class OCRManager {
*/
public async getImageText(file: TFile, options: OcrOptions): Promise<string> {
try {
return await imagesProcessQueue.add(() => this.#getImageText(file, options)) ?? ''
return (
(await imagesProcessQueue.add(() =>
this.#getImageText(file, options)
)) ?? ''
)
} catch (e) {
console.warn(
`Text Extractor - Error while extracting text from ${file.basename}`
Expand All @@ -113,9 +117,8 @@ class OCRManager {
}

async #getImageText(file: TFile, options: OcrOptions): Promise<string> {
const langs = concatLangs(options.langs)
// Get the text from the cache if it exists
const cache = await readCache(file, langs)
const cache = await readCache(file)
if (cache) {
return cache.text ?? FAILED_TO_EXTRACT
}
Expand All @@ -128,6 +131,7 @@ class OCRManager {
const cachePath = getCachePath(file)
const data = new Uint8ClampedArray(await app.vault.readBinary(file))
const worker = OCRWorker.getWorker()
const langs = concatLangs(options.langs)

return new Promise(async (resolve, reject) => {
try {
Expand All @@ -144,12 +148,24 @@ class OCRManager {
.trim()

// Add it to the cache
await writeCache(cachePath.folder, cachePath.filename, text, file.path, langs)
await writeCache(
cachePath.folder,
cachePath.filename,
text,
file.path,
langs
)
resolve(text)
} catch (e) {
// In case of error (unreadable PDF or timeout) just add
// an empty string to the cache
await writeCache(cachePath.folder, cachePath.filename, '', file.path, langs)
await writeCache(
cachePath.folder,
cachePath.filename,
'',
file.path,
langs
)
resolve('')
}
})
Expand Down
2 changes: 1 addition & 1 deletion plugin/src/settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ export class TextExtractorSettingsTab extends PluginSettingTab {
info.createDiv({
cls: 'setting-item-description',
text: `A list of languages to use for OCR. e.g. if your vault contains documents in English and French, you'd want to add 'eng' and 'fra' here.
This setting only applies to images, not PDFs.`,
This setting only applies to images, not PDFs. You may have to clear the cache after changing this setting.`,
})

new LangSelector({
Expand Down

0 comments on commit 15feb4a

Please sign in to comment.