From b0ba4537c7cc65a97f49235bc6de9f8323b64799 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Mon, 12 Sep 2022 15:44:47 +1200
Subject: [PATCH] Improve error handling for Pagefind HTML parsing
---
CHANGELOG.md | 2 ++
pagefind/features/errors.feature | 38 ++++++++++++++++++++++++++++++++
pagefind/src/fossick/mod.rs | 14 +++++++++++-
3 files changed, 53 insertions(+), 1 deletion(-)
create mode 100644 pagefind/features/errors.feature
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 04ddbbcb..e516d125 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@
## Unreleased
+* Pagefind now gracefully skips pages that fail HTML parsing, and logs the file path and the underlying error when this happens.
+
## v0.8.0 (August 23, 2022)
### Important Changes
diff --git a/pagefind/features/errors.feature b/pagefind/features/errors.feature
new file mode 100644
index 00000000..68557dd4
--- /dev/null
+++ b/pagefind/features/errors.feature
@@ -0,0 +1,38 @@
+Feature: Graceful Pagefind Errors
+ Background:
+ Given I have the environment variables:
+ | PAGEFIND_SOURCE | public |
+ Given I have a "public/index.html" file with the body:
+ """
+
Nothing
+ """
+
+ Scenario: Pagefind gracefully skips pages with parsing ambiguities
+ Given I have a "public/cat/index.html" file with the body:
+ """
+
hello world
+ """
+ Given I have a "public/dog/index.html" file with the body:
+ """
+
hello world
+
+ """
+ When I run my program
+ Then I should see "Running Pagefind" in stdout
+ Then I should see "Failed to parse file public/dog/index.html" in stdout
+ Then I should see the file "public/_pagefind/pagefind.js"
+ When I serve the "public" directory
+ When I load "/"
+ When I evaluate:
+ """
+ async function() {
+ let pagefind = await import("/_pagefind/pagefind.js");
+
+ let search = await pagefind.search("world");
+ let results = await Promise.all(search.results.map(r => r.data()));
+
+ document.querySelector('[data-url]').innerText = results.map(r => r.url).sort().join(', ');
+ }
+ """
+ Then There should be no logs
+ Then The selector "[data-url]" should contain "/cat/"
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 4504ef31..bbc37a53 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -59,7 +59,11 @@ impl Fossicker {
break;
}
if let Err(error) = rewriter.write(&buf[..read]) {
- panic!("HTML parse encountered an error: {:#?}", error);
+ println!(
+ "Failed to parse file {} — skipping this file. Error:\n{error}",
+ self.file_path.to_str().unwrap_or("[unknown file]")
+ );
+ return Ok(());
}
}
@@ -75,6 +79,10 @@ impl Fossicker {
fn parse_digest(&mut self) -> (String, HashMap>) {
let mut map: HashMap> = HashMap::new();
+ // TODO: push this error handling up a level and return an Err from parse_digest
+ if self.data.as_ref().is_none() {
+ return ("".into(), map); // empty page result, will be dropped from search
+ }
let data = self.data.as_ref().unwrap();
let stemmer = get_stemmer(&data.language);
@@ -144,6 +152,10 @@ impl Fossicker {
let (content, word_data) = self.parse_digest();
+ if self.data.is_none() {
+ return Err(());
+ }
+
let data = self.data.unwrap();
let url = build_url(&self.file_path, options);