From b0ba4537c7cc65a97f49235bc6de9f8323b64799 Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Mon, 12 Sep 2022 15:44:47 +1200 Subject: [PATCH] Improve error handling for Pagefind HTML parsing --- CHANGELOG.md | 2 ++ pagefind/features/errors.feature | 38 ++++++++++++++++++++++++++++++++ pagefind/src/fossick/mod.rs | 14 +++++++++++- 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 pagefind/features/errors.feature diff --git a/CHANGELOG.md b/CHANGELOG.md index 04ddbbcb..e516d125 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ ## Unreleased +* Pagefind now gracefully skips pages that fail HTML parsing, and provides more context when these errors are hit. + ## v0.8.0 (August 23, 2022) ### Important Changes diff --git a/pagefind/features/errors.feature b/pagefind/features/errors.feature new file mode 100644 index 00000000..68557dd4 --- /dev/null +++ b/pagefind/features/errors.feature @@ -0,0 +1,38 @@ +Feature: Graceful Pagefind Errors + Background: + Given I have the environment variables: + | PAGEFIND_SOURCE | public | + Given I have a "public/index.html" file with the body: + """ +

<p data-url>Nothing</p>

+ """ + + Scenario: Pagefind gracefully skips pages with parsing ambiguities + Given I have a "public/cat/index.html" file with the body: + """ +

hello world

+ """ + Given I have a "public/dog/index.html" file with the body: + """ +

hello world

+ + """ + When I run my program + Then I should see "Running Pagefind" in stdout + Then I should see "Failed to parse file public/dog/index.html" in stdout + Then I should see the file "public/_pagefind/pagefind.js" + When I serve the "public" directory + When I load "/" + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let search = await pagefind.search("world"); + let results = await Promise.all(search.results.map(r => r.data())); + + document.querySelector('[data-url]').innerText = results.map(r => r.url).sort().join(', '); + } + """ + Then There should be no logs + Then The selector "[data-url]" should contain "/cat/" diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index 4504ef31..bbc37a53 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -59,7 +59,11 @@ impl Fossicker { break; } if let Err(error) = rewriter.write(&buf[..read]) { - panic!("HTML parse encountered an error: {:#?}", error); + println!( + "Failed to parse file {} — skipping this file. Error:\n{error}", + self.file_path.to_str().unwrap_or("[unknown file]") + ); + return Ok(()); } } @@ -75,6 +79,10 @@ impl Fossicker { fn parse_digest(&mut self) -> (String, HashMap<String, Vec<u32>>) { let mut map: HashMap<String, Vec<u32>> = HashMap::new(); + // TODO: push this error handling up a level and return an Err from parse_digest + if self.data.as_ref().is_none() { + return ("".into(), map); // empty page result, will be dropped from search + } let data = self.data.as_ref().unwrap(); let stemmer = get_stemmer(&data.language); @@ -144,6 +152,10 @@ impl Fossicker { let (content, word_data) = self.parse_digest(); + if self.data.is_none() { + return Err(()); + } + let data = self.data.unwrap(); let url = build_url(&self.file_path, options);