adbar · unsleepy22 · Dec 24, 2024 · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -407,6 +407,18 @@ def test_formatting():
     my_result = extract(my_document, output_format='xml', include_links=True, config=ZERO_CONFIG)
     assert '<item>Number <ref target="test.html">2</ref></item>' in my_result
 
+    my_document = html.fromstring("""<html><body><article>
+        <ul>
+            <li>Number 0</li>
+            <li>Number <a href="test.html">1</a></li>
+            <li><a href="test.html">Number 2</a> n2</li>
+            <li>Number 3</li>
+            <li><p>Number 4</p> n4</li>
+        </ul>
+        <p>Test</p></article></body></html>
+    """)
+    my_result = extract(my_document, output_format='markdown', include_links=True, config=ZERO_CONFIG)
+    assert my_result == '- Number 0\n- Number [1](test.html)\n- [Number 2](test.html) n2\n- Number 3\n- Number 4 n4\n\nTest'
     # XML and Markdown formatting within <p>-tag
     my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
     my_result = extract(copy(my_document), fast=True, include_formatting=False, config=ZERO_CONFIG)
@@ -442,6 +454,27 @@ def test_formatting():
     my_result = extract(my_document, output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
     assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result
 
+    my_document = html.fromstring("""
+    <html><head><body><article>python code below:
+    <pre><code>
+def test:
+    print('hello')
+    print('world')
+    </code></pre>
+    </article></body></html> 
+    """)
+    my_result = extract(my_document, output_format='markdown', include_formatting=True)
+    assert "python code below:\n```\ndef test:\nprint('hello')\nprint('world')\n```" == my_result
+
+    my_result = extract(my_document, output_format='markdown', include_formatting=True, preserve_space=True)
+    assert """python code below:
+    ```
+def test:
+    print('hello')
+    print('world')
+
+```""" == my_result
+
 
 def test_extract_with_metadata():
     '''Test extract_with_metadata method'''

diff --git a/tests/xml_tei_tests.py b/tests/xml_tei_tests.py
@@ -486,7 +486,7 @@ def test_replace_element_text():
     elem = Element("item")
     elem.text = "Test text"
     elem.tag = "item"
-    assert replace_element_text(elem, True) == "- Test text\n"
+    assert replace_element_text(elem, True) == "- Test text "
 
     elem = Element("ref")
     elem.text = "Link"

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -91,9 +91,10 @@ def determine_returnstring(document: Document, options: Extractor) -> str:
             header += "---\n"
         else:
             header = ""
-        returnstring = f"{header}{xmltotxt(document.body, options.formatting)}"
+        returnstring = f"{header}{xmltotxt(document.body, options.formatting, options.preserve_space)}"
         if document.commentsbody is not None:
-            returnstring = f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
+            returnstring = \
+                f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting, options.preserve_space)}".strip()
     # normalize Unicode format (defaults to NFC)
     return normalize_unicode(returnstring)
 
@@ -140,6 +141,7 @@ def bare_extraction(
     include_tables: bool = True,
     include_images: bool = False,
     include_formatting: bool = False,
+    preserve_space: bool = False,
     include_links: bool = False,
     deduplicate: bool = False,
     date_extraction_params: Optional[Dict[str, Any]] = None,
@@ -171,6 +173,7 @@ def bare_extraction(
         include_images: Take images into account (experimental).
         include_formatting: Keep structural elements related to formatting
             (present in XML format, converted to markdown otherwise).
+        preserve_space: Preserve space when formatting text.
         include_links: Keep links along with their targets (experimental).
         deduplicate: Remove duplicate segments and documents.
         date_extraction_params: Provide extraction parameters to htmldate as dict().
@@ -205,6 +208,7 @@ def bare_extraction(
             recall=favor_recall,
             comments=include_comments,
             formatting=include_formatting,
+            preserve_space=preserve_space,
             links=include_links,
             images=include_images,
             tables=include_tables,
@@ -361,6 +365,7 @@ def extract(
     include_tables: bool = True,
     include_images: bool = False,
     include_formatting: bool = False,
+    preserve_space: bool = False,
     include_links: bool = False,
     deduplicate: bool = False,
     date_extraction_params: Optional[Dict[str, Any]] = None,
@@ -394,6 +399,7 @@ def extract(
         include_images: Take images into account (experimental).
         include_formatting: Keep structural elements related to formatting
             (only valuable if output_format is set to XML).
+        preserve_space: Preserve space when formatting text.
         include_links: Keep links along with their targets (experimental).
         deduplicate: Remove duplicate segments and documents.
         date_extraction_params: Provide extraction parameters to htmldate as dict().
@@ -427,6 +433,7 @@ def extract(
         include_tables=include_tables,
         include_images=include_images,
         include_formatting=include_formatting,
+        preserve_space=preserve_space,
         include_links=include_links,
         deduplicate=deduplicate,
         date_extraction_params=date_extraction_params,
@@ -456,6 +463,7 @@ def extract_with_metadata(
     include_tables: bool = True,
     include_images: bool = False,
     include_formatting: bool = False,
+    preserve_space: bool = False,
     include_links: bool = False,
     deduplicate: bool = False,
     date_extraction_params: Optional[Dict[str, Any]] = None,
@@ -487,6 +495,7 @@ def extract_with_metadata(
         include_images: Take images into account (experimental).
         include_formatting: Keep structural elements related to formatting
             (only valuable if output_format is set to XML).
+        preserve_space: Preserve space when formatting text.
         include_links: Keep links along with their targets (experimental).
         deduplicate: Remove duplicate segments and documents.
         date_extraction_params: Provide extraction parameters to htmldate as dict().
@@ -515,6 +524,7 @@ def extract_with_metadata(
         include_tables=include_tables,
         include_images=include_images,
         include_formatting=include_formatting,
+        preserve_space=preserve_space,
         include_links=include_links,
         deduplicate=deduplicate,
         date_extraction_params=date_extraction_params,
@@ -564,6 +574,7 @@ def _internal_extraction(
         include_tables: bool = True,
         include_images: bool = False,
         include_formatting: bool = False,
+        preserve_space: bool = False,
         include_links: bool = False,
         deduplicate: bool = False,
         date_extraction_params: Optional[Dict[str, Any]] = None,
@@ -590,6 +601,7 @@ def _internal_extraction(
             recall=favor_recall,
             comments=include_comments,
             formatting=include_formatting,
+            preserve_space=preserve_space,
             links=include_links,
             images=include_images,
             tables=include_tables,

diff --git a/trafilatura/settings.py b/trafilatura/settings.py
@@ -70,6 +70,7 @@ class Extractor:
         "focus",
         "comments",
         "formatting",
+        "preserve_space",
         "links",
         "images",
         "tables",
@@ -108,6 +109,7 @@ def __init__(
         recall: bool = False,
         comments: bool = True,
         formatting: bool = False,
+        preserve_space: bool = False,
         links: bool = False,
         images: bool = False,
         tables: bool = True,
@@ -131,6 +133,7 @@ def __init__(
         )
         self.comments: bool = comments
         self.formatting: bool = formatting or self.format == "markdown"
+        self.preserve_space: bool = preserve_space
         self.links: bool = links
         self.images: bool = images
         self.tables: bool = tables

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -249,6 +249,36 @@ def validate_tei(xmldoc: _Element) -> bool:
 
     return result
 
+def is_element_in_item(element: _Element) -> bool:
+    """Tell whether the element is an `item` itself or has an `item` among its ancestors."""
+    return True if element.tag == 'item' else len(element.xpath('ancestor::item')) > 0
+
+
+def is_first_element_in_item(element: _Element) -> bool:
+    """Check whether an element opens a list item.
+
+    True for an `item` that carries text of its own, or for any element
+    that has an `item` ancestor whose own text is empty.
+    """
+    starts_item = element.tag == 'item' and bool(element.text)
+    return starts_item or any(not anc.text for anc in element.xpath('ancestor::item'))
+
+
+def is_last_element_in_item(element: _Element) -> bool:
+    """Check whether an element is the last element in a list item.
+
+    Returns False outside of any `item`. An `item` itself qualifies only when
+    it has no child elements; a nested element qualifies when it has no next
+    sibling or when that sibling is an `item`.
+    """
+    if not is_element_in_item(element):
+        return False
+    if element.tag == 'item':
+        # len() counts direct children; getchildren() is deprecated in lxml
+        return len(element) == 0
+    next_element = element.getnext()
+    return next_element is None or next_element.tag == 'item'
+
 
 def replace_element_text(element: _Element, include_formatting: bool) -> str:
     "Determine element text based on just the text of the element. One must deal with the tail separately."
@@ -291,8 +321,9 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
         if elem_text:
             elem_text = f"{elem_text} "
     # lists
-    elif element.tag == "item" and elem_text:
-        elem_text = f"- {elem_text}\n"
+    if is_first_element_in_item(element) and not is_in_table_cell(element):
+        elem_text = f"- {elem_text} "
+
     return elem_text
 
 
@@ -348,7 +379,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
     # Process text
 
     # Common elements (Now processes end-tag logic correctly)
-    if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
+    if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell") and not is_element_in_item(element):
         # spacing hack
         returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
     elif element.tag == "cell":
@@ -357,11 +388,16 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
         returnlist.append(" ")
 
     # this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
+    # unless it's within a list item or a table
     if element.tail and not is_in_table_cell(element):
-        returnlist.append(element.tail)
+        returnlist.append(element.tail.strip() if is_element_in_item(element) else element.tail)
+
+    # deal with list items alone
+    if is_last_element_in_item(element) and not is_in_table_cell(element):
+        returnlist.append('\n')
 
 
-def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
+def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool, preserve_space: bool = False) -> str:
     "Convert to plain text format and optionally preserve formatting as markdown."
     if xmloutput is None:
         return ""
@@ -370,7 +406,7 @@ def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
 
     process_element(xmloutput, returnlist, include_formatting)
 
-    return unescape(sanitize("".join(returnlist)) or "")
+    return unescape(sanitize("".join(returnlist), preserve_space) or "")
 
 
 def xmltocsv(document: Document, include_formatting: bool, *, delim: str = "\t", null: str = "null") -> str: