Skip to content

Commit

Permalink
search: Replace third-party OpenDocument search helper.
Browse files Browse the repository at this point in the history
The odt2txt package is extremely slow when working with larger
documents, and has not seen an update in 7+ years.

Fixes #3481
  • Loading branch information
mtwebster committed Nov 12, 2024
1 parent c4a14be commit 0b1d10f
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 6 deletions.
1 change: 0 additions & 1 deletion debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ Depends:
poppler-utils,
exif,
id3,
odt2txt,
catdoc,
untex,
html2text,
Expand Down
3 changes: 2 additions & 1 deletion search-helpers/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ppt_to_txt = executable('nemo-ppt-to-txt',
)

install_data(
'nemo-xls-to-txt',
['nemo-xls-to-txt', 'nemo-odf-to-txt'],
install_dir: join_paths(get_option('prefix'), get_option('bindir')),
install_mode: 'rwxr-xr-x'
)
Expand All @@ -31,6 +31,7 @@ install_data(
'mso.nemo_search_helper',
'mso-ppt.nemo_search_helper',
'mso-xls.nemo_search_helper',
'odf.nemo_search_helper',
install_dir: join_paths(nemoDataPath, 'search-helpers')
)

Expand Down
48 changes: 48 additions & 0 deletions search-helpers/nemo-odf-to-txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/python3

import sys
import zipfile
from pathlib import Path
from html.parser import HTMLParser

class Parser(HTMLParser):
    """Collect text fragments from OpenDocument XML streams.

    The parser is fed the raw XML of ``meta.xml`` or ``content.xml``.  The
    first chunk of character data following any start tag whose name begins
    with ``text`` or ``meta`` is stored in ``parsed``.  A ``text:s`` element
    marks the following chunk as a continuation: it is joined to the
    previously collected fragment with a single space instead of starting a
    new entry.

    Attributes:
        parsed: Text fragments collected so far, in document order.
        get_next_data: True when the next data chunk should be captured.
        concat_next_data: True when the next captured chunk should be
            appended to the last fragment rather than stored separately.
    """

    def __init__(self):
        super().__init__()
        # State lives on the instance: a mutable list defined on the class
        # would be shared (and appended to) by every Parser object.
        self.parsed = []
        self.get_next_data = False
        self.concat_next_data = False

    def handle_starttag(self, tag, attrs):
        """Arm capture of the next data chunk for ``text*``/``meta*`` tags."""
        if tag.startswith(("text", "meta")):
            if tag == "text:s":
                self.concat_next_data = True
            self.get_next_data = True

    def handle_endtag(self, tag):
        """End tags carry no information for the extractor; ignore them."""
        pass

    def handle_data(self, data):
        """Store ``data`` if a preceding start tag armed the capture flag."""
        if not self.get_next_data:
            return

        if data != "\n":
            if self.concat_next_data and self.parsed:
                self.parsed[-1] += " " + data
            else:
                # Also reached when a spacer precedes any collected text:
                # there is nothing to join onto, so start a new fragment
                # instead of indexing into an empty list.
                self.parsed.append(data.strip())
            self.concat_next_data = False

        # The flag is consumed by the first chunk seen, even when that chunk
        # is a lone newline that was skipped above.
        self.get_next_data = False

def main(argv):
    """Print the searchable text of an OpenDocument file to stdout.

    For each of ``meta.xml`` and ``content.xml`` found in the archive (in
    archive order) one line of the form ``<stem>: frag1, frag2, ...`` is
    written, followed by a blank line.  Members that yield no text are
    skipped silently.

    Args:
        argv: Command-line vector; ``argv[1]`` is the path of the document.

    Returns:
        Process exit status: 0 on success, 1 if the argument is missing or
        the file cannot be read as a zip archive.
    """
    if len(argv) < 2:
        print("usage: nemo-odf-to-txt FILE", file=sys.stderr)
        return 1

    wanted = ("meta.xml", "content.xml")
    parser = Parser()

    try:
        # The context manager closes the archive; a distinct local name
        # keeps the ``zipfile`` module usable (needed for BadZipFile below).
        with zipfile.ZipFile(argv[1]) as archive:
            for info in archive.infolist():
                if info.filename not in wanted:
                    continue

                # Start each member from a clean slate so that neither
                # buffered input nor fragments of the previous member leak
                # into this one.
                parser.reset()
                parser.parsed = []

                # Undecodable bytes are replaced rather than aborting the
                # whole extraction.
                contents = archive.read(info).decode("utf-8", errors="replace")
                parser.feed(contents)
                if not parser.parsed:
                    continue

                out_str = ", ".join(parser.parsed)
                print(f"{Path(info.filename).stem}: {out_str}\n", flush=True, file=sys.stdout)
    except (zipfile.BadZipFile, OSError) as err:
        print(f"nemo-odf-to-txt: {argv[1]}: {err}", file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    # sys.exit is always available; the bare ``exit`` builtin is injected by
    # the ``site`` module and is missing under ``python -S`` or when frozen.
    sys.exit(main(sys.argv))
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[Nemo Search Helper]
TryExec=odt2txt;
Exec=odt2txt %s
TryExec=nemo-odf-to-txt;
Exec=nemo-odf-to-txt %s
MimeType=application/vnd.oasis.opendocument.text;application/vnd.oasis.opendocument.spreadsheet;application/vnd.oasis.opendocument.presentation;application/vnd.oasis.opendocument.graphics;
priority=100
Priority=100
1 change: 0 additions & 1 deletion search-helpers/third-party/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ helpers = [
'untex.nemo_search_helper',
'exif.nemo_search_helper',
'id3.nemo_search_helper',
'libreoffice.nemo_search_helper',
'mso-doc.nemo_search_helper',
'pdf2txt.nemo_search_helper',
'pdftotext.nemo_search_helper',
Expand Down

0 comments on commit 0b1d10f

Please sign in to comment.