fix #769

adbar · Jan 3, 2025 · 9853e56 · 9853e56
1 parent eed8c0b
commit 9853e56
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 9 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -407,6 +407,18 @@ def test_formatting():
     my_result = extract(my_document, output_format='xml', include_links=True, config=ZERO_CONFIG)
     assert '<item>Number <ref target="test.html">2</ref></item>' in my_result
 
+    my_document = html.fromstring("""<html><body><article>
+        <ul>
+            <li>Number 0</li>
+            <li>Number <a href="test.html">1</a></li>
+            <li><a href="test.html">Number 2</a> n2</li>
+            <li>Number 3</li>
+            <li><p>Number 4</p> n4</li>
+        </ul>
+        <p>Test</p></article></body></html>
+    """)
+    my_result = extract(my_document, output_format='markdown', include_links=True, config=ZERO_CONFIG)
+    assert my_result == '- Number 0\n- Number [1](test.html)\n- [Number 2](test.html) n2\n- Number 3\n- Number 4 n4\n\nTest'
     # XML and Markdown formatting within <p>-tag
     my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
     my_result = extract(copy(my_document), fast=True, include_formatting=False, config=ZERO_CONFIG)
@@ -444,23 +456,24 @@ def test_formatting():
 
     my_document = html.fromstring("""
     <html><head><body><article>python code below:
-    ```python
+    <pre><code>
 def test:
     print('hello')
     print('world')
-    ```
+    </code></pre>
     </article></body></html> 
     """)
     my_result = extract(my_document, output_format='markdown', include_formatting=True)
-    assert "python code below:\n```python\ndef test:\nprint('hello')\nprint('world')\n```" == my_result
+    assert "python code below:\n```\ndef test:\nprint('hello')\nprint('world')\n```" == my_result
 
     my_result = extract(my_document, output_format='markdown', include_formatting=True, preserve_space=True)
     assert """python code below:
-    ```python
+    ```
 def test:
     print('hello')
     print('world')
-    ```""" == my_result
+    
+```""" == my_result
 
 
 def test_extract_with_metadata():

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -249,6 +249,32 @@ def validate_tei(xmldoc: _Element) -> bool:
 
     return result
 
+def is_element_in_item(element: _Element) -> bool:
+    """Tell whether the element is an 'item' or sits below an 'item' ancestor."""
+    return bool(element.tag == 'item' or element.xpath('ancestor::item'))
+
+
+def is_first_element_in_item(element: _Element) -> bool:
+    """Tell whether the element counts as the opening part of a list item."""
+    # an 'item' itself qualifies only when it carries leading text of its own
+    if element.tag == 'item' and element.text:
+        return True
+    # otherwise: true when at least one enclosing 'item' has no leading text
+    enclosing_items = element.xpath('ancestor::item')
+    return any(not item.text for item in enclosing_items)
+
+
+def is_last_element_in_item(element: _Element) -> bool:
+    """Check whether an element closes a list item: a childless 'item', or an
+    element inside an item whose next sibling is missing or is an 'item'."""
+    if not is_element_in_item(element):
+        return False
+    # pure text only in list item: no child elements at all
+    if element.tag == 'item':
+        return len(element) == 0  # len() replaces the deprecated getchildren()
+    following = element.getnext()  # look the next sibling up only once
+    return following is None or following.tag == 'item'
+
 
 def replace_element_text(element: _Element, include_formatting: bool) -> str:
     "Determine element text based on just the text of the element. One must deal with the tail separately."
@@ -291,8 +317,9 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
         if elem_text:
             elem_text = f"{elem_text} "
     # lists
-    elif element.tag == "item" and elem_text:
-        elem_text = f"- {elem_text}\n"
+    if is_first_element_in_item(element) and not is_in_table_cell(element):
+        elem_text = f"- {elem_text} "
+
     return elem_text
 
 
@@ -348,7 +375,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
     # Process text
 
     # Common elements (Now processes end-tag logic correctly)
-    if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
+    if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell") and not is_element_in_item(element):
         # spacing hack
         returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
     elif element.tag == "cell":
@@ -357,8 +384,13 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
         returnlist.append(" ")
 
     # this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
+    # unless it's within a list item or a table
     if element.tail and not is_in_table_cell(element):
-        returnlist.append(element.tail)
+        returnlist.append(element.tail.strip() if is_element_in_item(element) else element.tail)
+
+    # deal with list items alone
+    if is_last_element_in_item(element) and not is_in_table_cell(element):
+        returnlist.append('\n')
 
 
 def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool, preserve_space: bool = False) -> str: