Skip to content

Commit

Permalink
fix #769
Browse files: browse the repository at this point in the history
  • Loading branch information
CodyInnowhere authored and CodyInnowhere committed Jan 3, 2025
1 parent eed8c0b commit 9853e56
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 9 deletions.
23 changes: 18 additions & 5 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,18 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', include_links=True, config=ZERO_CONFIG)
assert '<item>Number <ref target="test.html">2</ref></item>' in my_result

my_document = html.fromstring("""<html><body><article>
<ul>
<li>Number 0</li>
<li>Number <a href="test.html">1</a></li>
<li><a href="test.html">Number 2</a> n2</li>
<li>Number 3</li>
<li><p>Number 4</p> n4</li>
</ul>
<p>Test</p></article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_links=True, config=ZERO_CONFIG)
assert my_result == '- Number 0\n- Number [1](test.html)\n- [Number 2](test.html) n2\n- Number 3\n- Number 4 n4\n\nTest'
# XML and Markdown formatting within <p>-tag
my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
my_result = extract(copy(my_document), fast=True, include_formatting=False, config=ZERO_CONFIG)
Expand Down Expand Up @@ -444,23 +456,24 @@ def test_formatting():

my_document = html.fromstring("""
<html><head><body><article>python code below:
```python
<pre><code>
def test:
print('hello')
print('world')
```
</code></pre>
</article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_formatting=True)
assert "python code below:\n```python\ndef test:\nprint('hello')\nprint('world')\n```" == my_result
assert "python code below:\n```\ndef test:\nprint('hello')\nprint('world')\n```" == my_result

my_result = extract(my_document, output_format='markdown', include_formatting=True, preserve_space=True)
assert """python code below:
```python
```
def test:
print('hello')
print('world')
```""" == my_result
```""" == my_result


def test_extract_with_metadata():
Expand Down
40 changes: 36 additions & 4 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,32 @@ def validate_tei(xmldoc: _Element) -> bool:

return result

def is_element_in_item(element: _Element) -> bool:
    """Check whether an element is a list item or nested inside one.

    Args:
        element: Node to inspect; only its ``tag`` attribute and ``xpath()``
            method are used.

    Returns:
        True if the element's tag is ``item`` or the ``ancestor::item`` XPath
        query yields at least one node, False otherwise.
    """
    # Cheap tag comparison first: the ancestor query only runs for non-items.
    return element.tag == 'item' or bool(element.xpath('ancestor::item'))


def is_first_element_in_item(element: _Element) -> bool:
    """Check whether an element is treated as the first element in a list item.

    Args:
        element: Node to inspect; its ``tag``, ``text`` and ``xpath()`` are used.

    Returns:
        True in either of two cases:

        * the element is itself an ``item`` with non-empty text;
        * at least one ``item`` ancestor has no text of its own.

        False otherwise, including for elements outside any list item.
    """
    # The item carries its own leading text, so the item itself comes first.
    if element.tag == 'item' and element.text:
        return True
    # NOTE(review): this holds for every descendant of a text-less item, not
    # only the first one -- confirm callers do not rely on a stricter meaning.
    return any(not ancestor.text for ancestor in element.xpath('ancestor::item'))


def is_last_element_in_item(element: _Element) -> bool:
    """Check whether an element is the last element in a list item.

    Args:
        element: Node to inspect; its ``tag``, child count and next sibling
            are used.

    Returns:
        False if the element is neither an ``item`` nor inside one.
        For an ``item`` element: True only when it has no child elements.
        For an element within an item: True when it has no next sibling or
        when its next sibling is an ``item``.
    """
    if not is_element_in_item(element):
        return False

    # pure text only in list item
    if element.tag == 'item':
        # len() counts child elements; getchildren() is deprecated in lxml
        return len(element) == 0
    # element within list item: look up the following sibling only once
    following = element.getnext()
    return following is None or following.tag == 'item'


def replace_element_text(element: _Element, include_formatting: bool) -> str:
"Determine element text based on just the text of the element. One must deal with the tail separately."
Expand Down Expand Up @@ -291,8 +317,9 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
if elem_text:
elem_text = f"{elem_text} "
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
if is_first_element_in_item(element) and not is_in_table_cell(element):
elem_text = f"- {elem_text} "

return elem_text


Expand Down Expand Up @@ -348,7 +375,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
# Process text

# Common elements (Now processes end-tag logic correctly)
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell") and not is_element_in_item(element):
# spacing hack
returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
elif element.tag == "cell":
Expand All @@ -357,8 +384,13 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(" ")

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
# unless it's within a list item or a table
if element.tail and not is_in_table_cell(element):
returnlist.append(element.tail)
returnlist.append(element.tail.strip() if is_element_in_item(element) else element.tail)

# deal with list items alone
if is_last_element_in_item(element) and not is_in_table_cell(element):
returnlist.append('\n')


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool, preserve_space: bool = False) -> str:
Expand Down

0 comments on commit 9853e56

Please sign in to comment.