Skip to content

Commit

Permalink
fix: node pruning raising an AttributeError in certain cases (#761)
Browse files (browse the repository at this point in the history)
Fixes #760
  • Loading branch information
PLPeeters authored Dec 4, 2024
1 parent c6e8340 commit 76200b7
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
6 changes: 6 additions & 0 deletions tests/filters_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,21 @@ def doc3():
my_h2 = '<h2>42</h2>'
return html.fromstring('<html><body>' + my_h1 + my_h2 + my_p*50 + '</body></html>')

def doc4():
my_p = '<p>abc</p>'
return html.fromstring('<html><body>' + my_p + '</body></html><!-- comment -->')

#test xpath pruning
assert extract(doc(), prune_xpath='//p') == ''
assert extract(doc2(), prune_xpath='//p') == 'ABC'
assert extract(doc2(), prune_xpath=['//p', '//h1']) == ''
assert extract(doc3(), prune_xpath=['//p', '//h1']) == '42'
assert extract(doc4(), prune_xpath=['//comment()']) == 'abc'
# sanity check
assert extract(doc()) != ''
assert extract(doc2()) != ''
assert extract(doc3()) != ''
assert extract(doc4()) != ''


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def prune_unwanted_nodes(
# There is a previous node, append text to its tail
prev.tail = (prev.tail or "") + " " + subtree.tail
# remove the node
- subtree.getparent().remove(subtree)
+ delete_element(subtree)

if with_backup:
new_len = len(tree.text_content())
Expand Down

0 comments on commit 76200b7

Please sign in to comment.