Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add preserve_space option to keep original format in code blocks #772

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,18 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', include_links=True, config=ZERO_CONFIG)
assert '<item>Number <ref target="test.html">2</ref></item>' in my_result

my_document = html.fromstring("""<html><body><article>
<ul>
<li>Number 0</li>
<li>Number <a href="test.html">1</a></li>
<li><a href="test.html">Number 2</a> n2</li>
<li>Number 3</li>
<li><p>Number 4</p> n4</li>
</ul>
<p>Test</p></article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_links=True, config=ZERO_CONFIG)
assert my_result == '- Number 0\n- Number [1](test.html)\n- [Number 2](test.html) n2\n- Number 3\n- Number 4 n4\n\nTest'
# XML and Markdown formatting within <p>-tag
my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
my_result = extract(copy(my_document), fast=True, include_formatting=False, config=ZERO_CONFIG)
Expand Down Expand Up @@ -442,6 +454,27 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result

my_document = html.fromstring("""
<html><head><body><article>python code below:
<pre><code>
def test:
print('hello')
print('world')
</code></pre>
</article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_formatting=True)
assert "python code below:\n```\ndef test:\nprint('hello')\nprint('world')\n```" == my_result

my_result = extract(my_document, output_format='markdown', include_formatting=True, preserve_space=True)
assert """python code below:
```
def test:
print('hello')
print('world')

```""" == my_result


def test_extract_with_metadata():
'''Test extract_with_metadata method'''
Expand Down
2 changes: 1 addition & 1 deletion tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ def test_replace_element_text():
elem = Element("item")
elem.text = "Test text"
elem.tag = "item"
assert replace_element_text(elem, True) == "- Test text\n"
assert replace_element_text(elem, True) == "- Test text "

elem = Element("ref")
elem.text = "Link"
Expand Down
16 changes: 14 additions & 2 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,10 @@ def determine_returnstring(document: Document, options: Extractor) -> str:
header += "---\n"
else:
header = ""
returnstring = f"{header}{xmltotxt(document.body, options.formatting)}"
returnstring = f"{header}{xmltotxt(document.body, options.formatting, options.preserve_space)}"
if document.commentsbody is not None:
returnstring = f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
returnstring = \
f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting, options.preserve_space)}".strip()
# normalize Unicode format (defaults to NFC)
return normalize_unicode(returnstring)

Expand Down Expand Up @@ -140,6 +141,7 @@ def bare_extraction(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -171,6 +173,7 @@ def bare_extraction(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(present in XML format, converted to markdown otherwise).
preserve_space: Preserve space when formatting text.
include_links: Keep links along with their targets (experimental).
deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
Expand Down Expand Up @@ -205,6 +208,7 @@ def bare_extraction(
recall=favor_recall,
comments=include_comments,
formatting=include_formatting,
preserve_space=preserve_space,
links=include_links,
images=include_images,
tables=include_tables,
Expand Down Expand Up @@ -361,6 +365,7 @@ def extract(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -394,6 +399,7 @@ def extract(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(only valuable if output_format is set to XML).
preserve_space: Preserve space when formatting text.
include_links: Keep links along with their targets (experimental).
deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
Expand Down Expand Up @@ -427,6 +433,7 @@ def extract(
include_tables=include_tables,
include_images=include_images,
include_formatting=include_formatting,
preserve_space=preserve_space,
include_links=include_links,
deduplicate=deduplicate,
date_extraction_params=date_extraction_params,
Expand Down Expand Up @@ -456,6 +463,7 @@ def extract_with_metadata(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -487,6 +495,7 @@ def extract_with_metadata(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(only valuable if output_format is set to XML).
preserve_space: Preserve space when formatting text.
include_links: Keep links along with their targets (experimental).
deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
Expand Down Expand Up @@ -515,6 +524,7 @@ def extract_with_metadata(
include_tables=include_tables,
include_images=include_images,
include_formatting=include_formatting,
preserve_space=preserve_space,
include_links=include_links,
deduplicate=deduplicate,
date_extraction_params=date_extraction_params,
Expand Down Expand Up @@ -564,6 +574,7 @@ def _internal_extraction(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand All @@ -590,6 +601,7 @@ def _internal_extraction(
recall=favor_recall,
comments=include_comments,
formatting=include_formatting,
preserve_space=preserve_space,
links=include_links,
images=include_images,
tables=include_tables,
Expand Down
3 changes: 3 additions & 0 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class Extractor:
"focus",
"comments",
"formatting",
"preserve_space",
"links",
"images",
"tables",
Expand Down Expand Up @@ -108,6 +109,7 @@ def __init__(
recall: bool = False,
comments: bool = True,
formatting: bool = False,
preserve_space: bool = False,
links: bool = False,
images: bool = False,
tables: bool = True,
Expand All @@ -131,6 +133,7 @@ def __init__(
)
self.comments: bool = comments
self.formatting: bool = formatting or self.format == "markdown"
self.preserve_space: bool = preserve_space
self.links: bool = links
self.images: bool = images
self.tables: bool = tables
Expand Down
48 changes: 42 additions & 6 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,36 @@ def validate_tei(xmldoc: _Element) -> bool:

return result

def is_element_in_item(element: _Element) -> bool:
    """Tell whether the element is an ``item`` node or is nested inside one."""
    if element.tag == 'item':
        return True
    # a non-empty result for the ancestor axis means an enclosing item exists
    enclosing_items = element.xpath('ancestor::item')
    return bool(enclosing_items)


def is_first_element_in_item(element: _Element) -> bool:
    """Decide whether the element opens the content of a list item.

    Returns True when the element is itself an ``item`` carrying text,
    or when at least one of its ``item`` ancestors has empty or missing
    text. Returns False in every other case.
    """
    if element.tag == 'item' and element.text:
        return True
    # any enclosing item without leading text qualifies the element
    return any(not ancestor.text for ancestor in element.xpath('ancestor::item'))


def is_last_element_in_item(element: _Element) -> bool:
    """Check whether an element is the last element in a list item.

    Returns False when the element is neither an ``item`` nor nested in
    one. For an ``item`` itself, returns True only if it has no child
    elements (pure text). For an element nested in an item, returns True
    when ``getnext()`` yields nothing or yields another ``item``.
    """
    if not is_element_in_item(element):
        return False

    # pure text only in list item: len() counts the child elements and
    # replaces the deprecated getchildren() call
    if element.tag == 'item':
        return len(element) == 0

    # element within list item
    next_element = element.getnext()
    return next_element is None or next_element.tag == 'item'


def replace_element_text(element: _Element, include_formatting: bool) -> str:
"Determine element text based on just the text of the element. One must deal with the tail separately."
Expand Down Expand Up @@ -291,8 +321,9 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
if elem_text:
elem_text = f"{elem_text} "
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
if is_first_element_in_item(element) and not is_in_table_cell(element):
elem_text = f"- {elem_text} "

return elem_text


Expand Down Expand Up @@ -348,7 +379,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
# Process text

# Common elements (Now processes end-tag logic correctly)
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell") and not is_element_in_item(element):
# spacing hack
returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
elif element.tag == "cell":
Expand All @@ -357,11 +388,16 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(" ")

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
# unless it's within a list item or a table
if element.tail and not is_in_table_cell(element):
returnlist.append(element.tail)
returnlist.append(element.tail.strip() if is_element_in_item(element) else element.tail)

# deal with list items alone
if is_last_element_in_item(element) and not is_in_table_cell(element):
returnlist.append('\n')


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool, preserve_space: bool = False) -> str:
"Convert to plain text format and optionally preserve formatting as markdown."
if xmloutput is None:
return ""
Expand All @@ -370,7 +406,7 @@ def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:

process_element(xmloutput, returnlist, include_formatting)

return unescape(sanitize("".join(returnlist)) or "")
return unescape(sanitize("".join(returnlist), preserve_space) or "")


def xmltocsv(document: Document, include_formatting: bool, *, delim: str = "\t", null: str = "null") -> str:
Expand Down
Loading