Skip to content

Commit

Permalink
Update zoteroocr.js for Zotero7
Browse files Browse the repository at this point in the history
Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil committed Mar 21, 2024
1 parent d6be1cc commit 7ba01b9
Show file tree
Hide file tree
Showing 2 changed files with 298 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/bootstrap.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ async function startup({ id, version, rootURI }) {
});

log("Starting (loadSubScript)");
Services.scriptloader.loadSubScript(rootURI + 'chrome/content/zoteroocr.js');
Services.scriptloader.loadSubScript(rootURI + 'zoteroocr.js');
log("Starting (init)");
ZoteroOCR.init({ id, version, rootURI });
log("Starting (addToAllWindows)");
Expand Down
297 changes: 297 additions & 0 deletions src/zoteroocr.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
// zoteroocr.js

// See https://developer.mozilla.org/en-US/docs/Mozilla/JavaScript_code_modules.
Components.utils.import("resource://gre/modules/FileUtils.jsm");
Components.utils.import("resource://gre/modules/osfile.jsm");

// Code for Zotero 7.
ZoteroOCR = {
    // Add-on metadata handed over by bootstrap.js through init().
    id: null,
    version: null,
    rootURI: null,
    initialized: false,
    // IDs of the elements added by addToWindow(); removeFromWindow() uses them for cleanup.
    addedElementIDs: [],

    /**
     * Remember the add-on metadata. Calls after the first one are ignored.
     *
     * @param {Object} [options]
     * @param {string} [options.id] - Add-on ID.
     * @param {string} [options.version] - Add-on version.
     * @param {string} [options.rootURI] - Root URI of the add-on.
     */
    init({ id, version, rootURI } = {}) {
        if (this.initialized) return;
        this.id = id;
        this.version = version;
        this.rootURI = rootURI;
        this.initialized = true;
    },

    /**
     * Write a message, prefixed with the add-on name, to the Zotero debug log.
     *
     * @param {string} msg - Message to log.
     */
    log(msg) {
        Zotero.debug("Zotero OCR: " + msg);
    },

    /**
     * Add the OCR menu item to the Tools menu of the given window.
     *
     * @param {Window} window - Window whose document receives the menu item.
     */
    addToWindow(window) {
        const doc = window.document;

        // createElementNS() necessary in Zotero 6; createElement() defaults to HTML in Zotero 7
        const XUL_NS = "http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul";

        // Use Fluent for localization
        window.MozXULElement.insertFTLIfNeeded("zotero-ocr.ftl");

        // Add menu option
        const menuitem = doc.createElementNS(XUL_NS, 'menuitem');
        menuitem.id = 'ocr-selected-pdfs';
        menuitem.setAttribute('type', 'checkbox');
        menuitem.setAttribute('data-l10n-id', 'ocr-selected-pdfs');
        menuitem.addEventListener('command', () => {
            ZoteroOCR.recognize(window);
        });
        doc.getElementById('menu_ToolsPopup').appendChild(menuitem);
        this.storeAddedElement(menuitem);
    },

    /**
     * Call addToWindow() for every main window that has a ZoteroPane.
     */
    addToAllWindows() {
        for (const win of Zotero.getMainWindows()) {
            if (!win.ZoteroPane) continue;
            this.addToWindow(win);
        }
    },

    /**
     * Record the ID of an element added to the DOM so that it can be removed later.
     *
     * @param {Element} elem - Element that was added; it must have an id.
     * @throws {Error} If the element has no id.
     */
    storeAddedElement(elem) {
        if (!elem.id) {
            throw new Error("Element must have an id");
        }
        // The same ID is added once per window; keep only one copy.
        if (!this.addedElementIDs.includes(elem.id)) {
            this.addedElementIDs.push(elem.id);
        }
    },

    /**
     * Remove everything addToWindow() added to the given window.
     *
     * @param {Window} window - Window to clean up.
     */
    removeFromWindow(window) {
        const doc = window.document;
        // Remove all elements added to DOM
        for (const id of this.addedElementIDs) {
            doc.getElementById(id)?.remove();
        }
        // The FTL link may be missing, so do not fail on it.
        doc.querySelector('[href="zotero-ocr.ftl"]')?.remove();
    },

    /**
     * Call removeFromWindow() for every main window that has a ZoteroPane.
     */
    removeFromAllWindows() {
        for (const win of Zotero.getMainWindows()) {
            if (!win.ZoteroPane) continue;
            this.removeFromWindow(win);
        }
    },

    /**
     * Return the first of the given paths that exists, logging every probe.
     *
     * @param {string[]} candidates - Paths to probe, in order of preference.
     * @returns {Promise<string|null>} The first existing path, or null if none exists.
     */
    async findFirstExisting(candidates) {
        for (const candidate of candidates) {
            if (await OS.File.exists(candidate)) {
                Zotero.debug("Found " + candidate);
                return candidate;
            }
            Zotero.debug("No " + candidate);
        }
        return null;
    },

    /**
     * Run OCR on the PDFs of the items selected in the active Zotero pane.
     *
     * Locates the tesseract and pdftoppm executables (preferences first, then
     * commonly used locations), renders each PDF to PNG pages, runs tesseract on
     * them, and attaches the outputs selected in the preferences (note, hOCR
     * pages, PDF). Problems that prevent processing are reported with
     * window.alert(); failures of the external tools are logged.
     *
     * @param {Window} window - Window used for alerts.
     * @returns {Promise<void>}
     */
    async recognize(window) {
        Zotero.debug("entering recognize()");
        const exeSuffix = Zotero.isWin ? ".exe" : "";

        // Look for the tesseract executable in the settings and at commonly used locations.
        // If it is found, the settings are updated.
        // Otherwise abort with an alert.
        let ocrEngine = Zotero.Prefs.get("zoteroocr.ocrPath");
        let found = false;
        if (ocrEngine) {
            const pathOrFile = new FileUtils.File(ocrEngine);
            // If a directory is given, then try for the standard name of the tool.
            // exists() is checked first so that a stale path ends in the alert below.
            if (pathOrFile.exists() && pathOrFile.isDirectory()) {
                ocrEngine = OS.Path.join(ocrEngine, "tesseract" + exeSuffix);
                Zotero.Prefs.set("zoteroocr.ocrPath", ocrEngine);
            }
            found = await OS.File.exists(ocrEngine);
        }
        else {
            const candidates = ["", "/usr/local/bin/", "/usr/bin/", "C:\\Program Files\\Tesseract-OCR\\", "/opt/homebrew/bin/", "/usr/local/homebrew/bin/"]
                .map(prefix => prefix + "tesseract" + exeSuffix);
            const hit = await this.findFirstExisting(candidates);
            if (hit) {
                found = true;
                ocrEngine = hit;
                Zotero.Prefs.set("zoteroocr.ocrPath", ocrEngine);
            }
            else {
                // Only used for the alert below.
                ocrEngine = candidates.join(", ");
            }
        }
        if (!found) {
            window.alert("Tesseract executable not found. Tried: " + ocrEngine);
            return;
        }

        // Look for a specific path in the preferences for pdftoppm
        let pdftoppm = Zotero.Prefs.get("zoteroocr.pdftoppmPath");
        if (!pdftoppm) {
            // See https://developer.mozilla.org/en-US/docs/Archive/Add-ons/Code_snippets/File_I_O#Getting_special_files
            // and https://dxr.mozilla.org/mozilla-central/source/xpcom/io/nsDirectoryServiceDefs.h.
            const zdir = FileUtils.getDir('GreBinD', []);
            const exeName = "pdftoppm" + exeSuffix;
            // look for pdftoppm in various possible directories
            const candidates = ["", "/usr/local/bin/", "/usr/bin/", "/opt/homebrew/bin/", "/usr/local/homebrew/bin/"]
                .map(prefix => prefix + exeName);
            candidates.push(OS.Path.join(zdir.path, exeName));
            const hit = await this.findFirstExisting(candidates);
            if (hit) {
                pdftoppm = hit;
                Zotero.Prefs.set("zoteroocr.pdftoppmPath", pdftoppm);
            }
            else {
                // Only used for the alert below.
                pdftoppm = candidates[candidates.length - 1];
            }
        }
        if (Zotero.isWin && !pdftoppm.endsWith(".exe")) {
            pdftoppm = pdftoppm + ".exe";
        }
        if (!(await OS.File.exists(pdftoppm))) {
            window.alert("No " + pdftoppm + " executable found.");
            return;
        }

        const items = Zotero.getActiveZoteroPane().getSelectedItems();
        for (let item of items) {
            // find the PDF
            let pdfItem;
            if (item.isAttachment()) {
                if (!(item.isFileAttachment() && item.attachmentContentType == 'application/pdf')) {
                    window.alert("Item is attachment but not PDF and will be ignored.");
                    continue;
                }
                pdfItem = item;
                // NOTE(review): for a standalone PDF attachment there is no parent item, so the
                // `item.id` uses below presumably fail when a note, hOCR pages or a new PDF
                // are to be attached. Confirm and decide where those outputs should go.
                item = Zotero.Items.get(item.parentItemID);
            }
            else {
                const pdfAttachments = item.getAttachments(false)
                    .map(itemID => Zotero.Items.get(itemID))
                    .filter(att => att.isFileAttachment() && att.attachmentContentType == 'application/pdf');
                if (pdfAttachments.length == 0) {
                    window.alert("No PDF found for the selected item.");
                    continue;
                }
                if (pdfAttachments.length > 1) {
                    window.alert("There are several PDFs attached to this item. Only the first one will be processed.");
                }
                pdfItem = pdfAttachments[0];
            }

            const pdf = pdfItem.getFilePath();
            if (!pdf) {
                window.alert("No file path found for the selected PDF.");
                continue;
            }
            const base = pdf.replace(/\.pdf$/, '');
            const dir = OS.Path.dirname(pdf);
            const ocrbase = Zotero.Prefs.get("zoteroocr.overwritePDF") ? base : base + '.ocr';
            // TODO filter out PDFs which have already a text layer

            // extract images from PDF
            const imageList = OS.Path.join(dir, 'image-list.txt');
            // Declared per item: only images rendered for THIS item may be deleted below.
            let imageListArray;
            if (!(await OS.File.exists(imageList))) {
                const pageRoot = OS.Path.join(dir, 'page');
                try {
                    Zotero.debug("Running " + pdftoppm + ' -png -r 300 ' + pdf + ' ' + pageRoot);
                    await Zotero.Utilities.Internal.exec(pdftoppm, ['-progress', '-png', '-r', 300, pdf, pageRoot]);
                }
                catch (e) {
                    Zotero.logError(e);
                }

                imageListArray = [];
                const iterator = new OS.File.DirectoryIterator(dir);
                try {
                    await iterator.forEach((entry) => {
                        Zotero.debug(entry);
                        // Only the files rendered above ("page-<number>.png"); other PNGs in
                        // the directory must neither be recognized nor deleted.
                        if (/^page-\d+\.png$/.test(entry.name)) {
                            imageListArray.push(entry.path);
                        }
                    });
                }
                finally {
                    await iterator.close();
                }
                // Directory order is arbitrary; pdftoppm zero-pads the page numbers,
                // so sorting the paths restores the page order.
                imageListArray.sort();
                Zotero.debug('Files are now:');
                Zotero.debug(imageListArray);

                // save the list of images in a separate file
                Zotero.File.putContents(Zotero.File.pathToFile(imageList), imageListArray.join('\n'));
            }

            const parameters = [imageList, ocrbase];
            const language = Zotero.Prefs.get("zoteroocr.language");
            if (language) {
                parameters.push('-l', language);
            }
            parameters.push('txt');
            if (Zotero.Prefs.get("zoteroocr.outputPDF")) {
                parameters.push('pdf');
            }
            if (Zotero.Prefs.get("zoteroocr.outputHocr")) {
                parameters.push('hocr');
            }
            try {
                Zotero.debug("Running " + ocrEngine + ' ' + parameters.join(' '));
                await Zotero.Utilities.Internal.exec(ocrEngine, parameters);
            }
            catch (e) {
                Zotero.logError(e);
            }

            if (Zotero.Prefs.get("zoteroocr.outputNote")) {
                let contents = await Zotero.File.getContentsAsync(ocrbase + '.txt');
                contents = contents.replace(/(?:\r\n|\r|\n)/g, '<br />');
                const newNote = new Zotero.Item('note');
                newNote.setNote(contents);
                newNote.parentID = item.id;
                await newNote.saveTx();
            }

            if (Zotero.Prefs.get("zoteroocr.outputHocr")) {
                let contents = await Zotero.File.getContentsAsync(ocrbase + '.hocr');
                // replace the absolute paths of images with relative ones
                const escapedDir = dir.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
                contents = contents.replace(new RegExp(escapedDir + "/", 'g'), '');
                // split content into the preamble and pages
                contents = contents.replace("</body>\n</html>", '');
                const parts = contents.split("<div class='ocr_page'");
                const preamble = parts[0];
                // create new html attachments including hocrjs for individual pages
                const maximumPagesAsHtml = Number.parseInt(Zotero.Prefs.get("zoteroocr.maximumPagesAsHtml"), 10);
                // parts[0] is the preamble, hence the "+ 1" when applying the page limit.
                let upperLimit = parts.length;
                if (!Number.isNaN(maximumPagesAsHtml) && (maximumPagesAsHtml + 1 < upperLimit)) {
                    upperLimit = maximumPagesAsHtml + 1;
                }
                for (let i = 1; i < upperLimit; i++) {
                    const pagePath = OS.Path.join(dir, 'page-' + i + '.html');
                    const pagecontent = preamble + "<div class='ocr_page'" + parts[i] + '<script src="https://unpkg.com/hocrjs"></script>\n</body>\n</html>';
                    Zotero.File.putContents(Zotero.File.pathToFile(pagePath), pagecontent);
                    await Zotero.Attachments.linkFromFile({
                        file: pagePath,
                        contentType: "text/html",
                        parentItemID: item.id
                    });
                }
            }

            // attach PDF if it is a new one
            if (Zotero.Prefs.get("zoteroocr.outputPDF") && !(Zotero.Prefs.get("zoteroocr.overwritePDF"))) {
                await Zotero.Attachments.linkFromFile({
                    file: ocrbase + '.pdf',
                    parentItemID: item.id
                });
            }

            // imageListArray is only set when the images were rendered in this run.
            if (!Zotero.Prefs.get("zoteroocr.outputPNG") && imageListArray) {
                // delete image list
                await Zotero.File.removeIfExists(imageList);
                // delete PNGs
                for (const imageName of imageListArray) {
                    await Zotero.File.removeIfExists(imageName);
                }
            }
        }
    },
};

0 comments on commit 7ba01b9

Please sign in to comment.