diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3bd62d0178..323f6484e0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,10 @@
-## 0.16.11-dev0
+## 0.16.11-dev1
### Enhancements
- **Enhance quote standardization tests** with additional Unicode scenarios
+- **Relax table segregation rule in chunking.** Previously a `Table` element was always segregated into its own pre-chunk such that the `Table` appeared alone in a chunk or was split into multiple `TableChunk` elements, but never combined with `Text`-subtype elements. Allow table elements to be combined with other elements in the same chunk when space allows.
+- **Compute chunk length based solely on `element.text`.** Previously `.metadata.text_as_html` was also considered, and since it is always longer than the text (due to HTML tag overhead), it was the effective length criterion. Remove text-as-html from the length calculation such that text-length is the sole criterion for sizing a chunk.
### Features
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
index eeb5f3740f..f63e738a7c 100644
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@@ -11,15 +11,16 @@
from unstructured.chunking.base import (
ChunkingOptions,
+ PreChunk,
PreChunkBuilder,
PreChunkCombiner,
PreChunker,
- TablePreChunk,
- TextPreChunk,
- TextPreChunkAccumulator,
_CellAccumulator,
+ _Chunker,
+ _HtmlTableSplitter,
+ _PreChunkAccumulator,
_RowAccumulator,
- _TableSplitter,
+ _TableChunker,
_TextSplitter,
is_on_next_page,
is_title,
@@ -181,27 +182,27 @@ def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self
pre_chunk_iter = PreChunker.iter_pre_chunks(elements, opts=opts)
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet, consectetur adipiscing elit."),
]
# --
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Text("Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.")
]
# --
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Title("Ut Enim"),
Text("Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi."),
]
# --
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [Text("Ut aliquip ex ea commodo consequat."), CheckBox()]
# --
with pytest.raises(StopIteration):
@@ -233,21 +234,18 @@ def it_accumulates_elements_added_to_it(self):
assert builder._text_length == 112
assert builder._remaining_space == 36
- @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
- def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
+ def it_will_fit_an_oversized_element_when_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions())
- assert builder.will_fit(element)
+ assert builder.will_fit(Text("abcd " * 200))
@pytest.mark.parametrize(
("existing_element", "next_element"),
[
- (Text("abcd"), Table("Fruits\nMango")),
(Text("abcd"), Text("abcd " * 200)),
- (Table("Heading\nCell text"), Table("Fruits\nMango")),
(Table("Heading\nCell text"), Text("abcd " * 200)),
],
)
- def but_not_when_it_already_contains_an_element_of_any_kind(
+ def but_not_when_it_already_contains_an_element(
self, existing_element: Element, next_element: Element
):
builder = PreChunkBuilder(opts=ChunkingOptions())
@@ -256,11 +254,13 @@ def but_not_when_it_already_contains_an_element_of_any_kind(
assert not builder.will_fit(next_element)
@pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
- def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
+ def it_will_accept_another_element_that_fits_when_it_already_contains_a_table(
+ self, element: Element
+ ):
builder = PreChunkBuilder(opts=ChunkingOptions())
builder.add_element(Table("Heading\nCell text"))
- assert not builder.will_fit(element)
+ assert builder.will_fit(element)
def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50))
@@ -290,7 +290,7 @@ def but_it_will_fit_an_element_that_fits(self):
# -- 55 + 2 (separator) + 43 == 100 --
assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
- def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
+ def it_generates_a_PreChunk_when_flushed_and_resets_itself_to_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
builder.add_element(Title("Introduction"))
builder.add_element(
@@ -302,7 +302,13 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
pre_chunk = next(builder.flush())
- assert isinstance(pre_chunk, TextPreChunk)
+ # -- pre-chunk builder was reset before the yield, such that the iterator does not need to
+ # -- be exhausted before clearing out the old elements and a new pre-chunk can be
+ # -- accumulated immediately (first `next()` call is required however, to advance to the
+ # -- yield statement).
+ assert builder._text_length == 0
+ assert builder._remaining_space == 150
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Title("Introduction"),
Text(
@@ -310,24 +316,6 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
"lectus porta volutpat.",
),
]
- assert builder._text_length == 0
- assert builder._remaining_space == 150
-
- def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
- builder.add_element(Table("Heading\nCell text"))
-
- pre_chunk = next(builder.flush())
-
- # -- pre-chunk builder was reset before the yield, such that the iterator does not need to
- # -- be exhausted before clearing out the old elements and a new pre-chunk can be
- # -- accumulated immediately (first `next()` call is required however, to advance to the
- # -- yield statement).
- assert builder._text_length == 0
- assert builder._remaining_space == 150
- # -- pre-chunk is a `TablePreChunk` --
- assert isinstance(pre_chunk, TablePreChunk)
- assert pre_chunk._table == Table("Heading\nCell text")
def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
@@ -345,21 +333,19 @@ def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
pre_chunk = list(builder.flush())[0]
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._text == "Lorem ipsum dolor sit amet consectetur adipiscing elit."
builder.add_element(Table("In rhoncus ipsum sed lectus porta volutpat."))
pre_chunk = list(builder.flush())[0]
- assert isinstance(pre_chunk, TablePreChunk)
- assert pre_chunk._text_with_overlap == (
- "dipiscing elit.\nIn rhoncus ipsum sed lectus porta volutpat."
- )
+ assert isinstance(pre_chunk, PreChunk)
+ assert pre_chunk._text == "dipiscing elit.\n\nIn rhoncus ipsum sed lectus porta volutpat."
builder.add_element(Text("Donec semper facilisis metus finibus."))
pre_chunk = list(builder.flush())[0]
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
@@ -381,252 +367,8 @@ def it_considers_separator_length_when_computing_text_length_and_remaining_space
# ================================================================================================
-class DescribeTablePreChunk:
- """Unit-test suite for `unstructured.chunking.base.TablePreChunk` objects."""
-
- def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
- html_table = (
- "
\n"
- "\n"
- "Header Col 1 | Header Col 2 |
\n"
- "\n"
- "\n"
- "Lorem ipsum | adipiscing |
\n"
- "\n"
- "
"
- )
- text_table = "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
- pre_chunk = TablePreChunk(
- Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
- overlap_prefix="ctus porta volutpat.",
- opts=ChunkingOptions(max_characters=175),
- )
-
- chunk_iter = pre_chunk.iter_chunks()
-
- chunk = next(chunk_iter)
- assert isinstance(chunk, Table)
- assert chunk.text == (
- "ctus porta volutpat.\nHeader Col 1 Header Col 2\nLorem ipsum adipiscing"
- )
- assert chunk.metadata.text_as_html == (
- ""
- "Header Col 1 | Header Col 2 |
"
- "Lorem ipsum | adipiscing |
"
- "
"
- )
- with pytest.raises(StopIteration):
- next(chunk_iter)
-
- def but_not_when_the_table_is_is_empty_or_contains_only_whitespace(self):
- html_table = ""
- pre_chunk = TablePreChunk(
- Table(" \t \n ", metadata=ElementMetadata(text_as_html=html_table)),
- overlap_prefix="volutpat.",
- opts=ChunkingOptions(max_characters=175),
- )
-
- chunk_iter = pre_chunk.iter_chunks()
-
- with pytest.raises(StopIteration):
- next(chunk_iter)
-
- def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
- table = Table("foo bar", metadata=ElementMetadata(text_as_html=""))
- opts = ChunkingOptions(include_orig_elements=True)
- pre_chunk = TablePreChunk(table, "", opts)
-
- chunk_iter = pre_chunk.iter_chunks()
-
- chunk = next(chunk_iter)
- assert isinstance(chunk, Table)
- assert chunk.metadata.orig_elements == [table]
- assert chunk.metadata.text_as_html == ""
- # --
- with pytest.raises(StopIteration):
- next(chunk_iter)
-
- def but_not_when_instructed_not_to(self):
- pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False))
-
- chunk = next(pre_chunk.iter_chunks())
-
- assert isinstance(chunk, Table)
- assert chunk.metadata.orig_elements is None
-
- def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
- html_table = """\
-
-
- Header Col 1 | Header Col 2 |
-
-
- Lorem ipsum | A Link example |
- Consectetur | adipiscing elit |
- Nunc aliquam | id enim nec molestie |
-
-
- """
- text_table = (
- "Header Col 1 Header Col 2\n"
- "Lorem ipsum dolor sit amet\n"
- "Consectetur adipiscing elit\n"
- "Nunc aliquam id enim nec molestie\n"
- "Vivamus quis nunc ipsum donec ac fermentum"
- )
- pre_chunk = TablePreChunk(
- Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
- overlap_prefix="",
- opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")),
- )
-
- chunk_iter = pre_chunk.iter_chunks()
-
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert chunk.text == "Header Col 1 Header Col 2"
- assert chunk.metadata.text_as_html == (
- ""
- )
- assert chunk.metadata.is_continuation is None
- # --
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert chunk.text == "Lorem ipsum A Link example"
- assert chunk.metadata.text_as_html == (
- "Lorem ipsum | A Link example |
"
- )
- assert chunk.metadata.is_continuation
- # --
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert chunk.text == "Consectetur adipiscing elit"
- assert chunk.metadata.text_as_html == (
- "Consectetur | adipiscing elit |
"
- )
- assert chunk.metadata.is_continuation
- # --
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert chunk.text == "Nunc aliquam id enim nec molestie"
- assert chunk.metadata.text_as_html == (
- "Nunc aliquam | id enim nec molestie |
"
- )
- assert chunk.metadata.is_continuation
- # --
- with pytest.raises(StopIteration):
- next(chunk_iter)
-
- def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
- """Even though text and html are split, the orig_elements metadata is not."""
- table = Table(
- "Header Col 1 Header Col 2\nLorem ipsum dolor sit amet",
- metadata=ElementMetadata(text_as_html=""),
- )
- opts = ChunkingOptions(max_characters=30, include_orig_elements=True)
- pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)
-
- chunk_iter = pre_chunk.iter_chunks()
-
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert chunk.text == "Header Col 1 Header Col 2"
- assert chunk.metadata.orig_elements == [table]
- assert not chunk.metadata.is_continuation
- # --
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert chunk.text == "Lorem ipsum dolor sit amet"
- assert chunk.metadata.orig_elements == [table]
- assert chunk.metadata.is_continuation
-
- @pytest.mark.parametrize(
- ("text", "expected_value"),
- [
- # -- normally it splits exactly on overlap size |------- 20 -------|
- ("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."),
- # -- but it strips leading whitespace when the tail includes it --
- ("In rhoncus ipsum sed lectus porta volutpat.", "porta volutpat."),
- ],
- )
- def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap(
- self, text: str, expected_value: str
- ):
- pre_chunk = TablePreChunk(
- Table(text), overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
- )
- assert pre_chunk.overlap_tail == expected_value
-
- @pytest.mark.parametrize(
- ("text", "overlap_prefix", "expected_value"),
- [
- (
- "In rhoncus ipsum sed lectus porta volutpat.",
- "",
- "In rhoncus ipsum sed lectus porta volutpat.",
- ),
- (
- "In rhoncus ipsum sed lectus porta volutpat.",
- "ctus porta volutpat.",
- "ctus porta volutpat.\nIn rhoncus ipsum sed lectus porta volutpat.",
- ),
- ],
- )
- def it_includes_its_overlap_prefix_in_its_text_when_present(
- self, text: str, overlap_prefix: str, expected_value: str
- ):
- pre_chunk = TablePreChunk(
- Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
- )
- assert pre_chunk._text_with_overlap == expected_value
-
- def it_computes_metadata_for_each_chunk_to_help(self):
- table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html=""))
- pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
-
- metadata = pre_chunk._metadata
-
- assert metadata.text_as_html == ""
- # -- opts.include_orig_elements is True by default --
- assert metadata.orig_elements == [table]
- # -- it produces a new instance each time it is called so changing one chunk's metadata does
- # -- not change that of any other chunk.
- assert pre_chunk._metadata is not metadata
-
- def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
- pre_chunk = TablePreChunk(
- Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="")),
- overlap_prefix="",
- opts=ChunkingOptions(include_orig_elements=False),
- )
-
- assert pre_chunk._metadata.orig_elements is None
-
- def it_computes_the_original_elements_list_to_help(self):
- table = Table(
- "Lorem ipsum",
- metadata=ElementMetadata(text_as_html="", orig_elements=[Table("Lorem Ipsum")]),
- )
- pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
-
- orig_elements = pre_chunk._orig_elements
-
- # -- a TablePreChunk always has exactly one original (Table) element --
- assert len(orig_elements) == 1
- orig_element = orig_elements[0]
- # -- each item in orig_elements is a copy of the original element so we can mutate it
- # -- without changing user's data.
- assert orig_element == table
- assert orig_element is not table
- # -- it strips any .metadata.orig_elements from each element to prevent a recursive data
- # -- structure
- assert orig_element.metadata.orig_elements is None
- # -- computation is only on first call, all chunks get exactly the same orig-elements --
- assert pre_chunk._orig_elements is orig_elements
-
-
-class DescribeTextPreChunk:
- """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
+class DescribePreChunk:
+ """Unit-test suite for `unstructured.chunking.base.PreChunk` objects."""
@pytest.mark.parametrize(
("overlap_pfx", "texts", "other_overlap_pfx", "other_texts", "expected_value"),
@@ -643,7 +385,7 @@ class DescribeTextPreChunk:
("", ["bar", "baz"], "foo", ["bah", "dah"], False),
],
)
- def it_knows_when_it_is_equal_to_another_TextPreChunk_instance(
+ def it_knows_when_it_is_equal_to_another_PreChunk_instance(
self,
overlap_pfx: str,
texts: list[str],
@@ -652,15 +394,15 @@ def it_knows_when_it_is_equal_to_another_TextPreChunk_instance(
expected_value: bool,
):
opts = ChunkingOptions()
- pre_chunk = TextPreChunk([Text(t) for t in texts], overlap_prefix=overlap_pfx, opts=opts)
- other_pre_chunk = TextPreChunk(
+ pre_chunk = PreChunk([Text(t) for t in texts], overlap_prefix=overlap_pfx, opts=opts)
+ other_pre_chunk = PreChunk(
[Text(t) for t in other_texts], overlap_prefix=other_overlap_pfx, opts=opts
)
assert (pre_chunk == other_pre_chunk) is expected_value
- def and_it_knows_it_is_not_equal_to_an_object_that_is_not_a_TextPreChunk(self):
- pre_chunk = TextPreChunk([], overlap_prefix="", opts=ChunkingOptions())
+ def and_it_knows_it_is_NOT_equal_to_an_object_that_is_not_a_PreChunk(self):
+ pre_chunk = PreChunk([], overlap_prefix="", opts=ChunkingOptions())
assert pre_chunk != 42
@pytest.mark.parametrize(
@@ -676,22 +418,22 @@ def and_it_knows_it_is_not_equal_to_an_object_that_is_not_a_TextPreChunk(self):
(99, 73, False),
],
)
- def it_knows_when_it_can_combine_itself_with_another_TextPreChunk_instance(
+ def it_knows_when_it_can_combine_itself_with_another_PreChunk_instance(
self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool
):
- """This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals."""
+ """This allows `PreChunkCombiner` to operate without knowing `PreChunk` internals."""
opts = ChunkingOptions(
max_characters=max_characters,
combine_text_under_n_chars=combine_text_under_n_chars,
overlap=20,
overlap_all=True,
)
- pre_chunk = TextPreChunk(
+ pre_chunk = PreChunk(
[Text("Lorem ipsum dolor sit amet consectetur adipiscing.")], # len == 50
overlap_prefix="e feugiat efficitur.", # len == 20
opts=opts,
)
- next_pre_chunk = TextPreChunk(
+ next_pre_chunk = PreChunk(
[Text("In rhoncus sum sed lectus.")], # len == 26
overlap_prefix="sectetur adipiscing.", # len == 20 but shouldn't come into computation
opts=opts,
@@ -699,13 +441,13 @@ def it_knows_when_it_can_combine_itself_with_another_TextPreChunk_instance(
assert pre_chunk.can_combine(next_pre_chunk) is expected_value
- def it_can_combine_itself_with_another_TextPreChunk_instance(self):
+ def it_can_combine_itself_with_another_PreChunk_instance(self):
""".combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
Note that neither the original or other pre_chunk are mutated.
"""
opts = ChunkingOptions()
- pre_chunk = TextPreChunk(
+ pre_chunk = PreChunk(
[
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Text("In rhoncus ipsum sed lectus porta volutpat."),
@@ -713,7 +455,7 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self):
overlap_prefix="feugiat efficitur.",
opts=opts,
)
- other_pre_chunk = TextPreChunk(
+ other_pre_chunk = PreChunk(
[
Text("Donec semper facilisis metus finibus malesuada."),
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
@@ -728,7 +470,7 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self):
# -- overlap-prefix from the existing pre-chunk and the other overlap-prefix is discarded
# -- (although it's still in there at the end of the first pre-chunk since that's where it
# -- came from originally).
- assert new_pre_chunk == TextPreChunk(
+ assert new_pre_chunk == PreChunk(
[
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Text("In rhoncus ipsum sed lectus porta volutpat."),
@@ -740,7 +482,7 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self):
)
# -- Neither pre-chunk used for combining is mutated, so we don't have to worry about who
# -- else may have been given a reference to them.
- assert pre_chunk == TextPreChunk(
+ assert pre_chunk == PreChunk(
[
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Text("In rhoncus ipsum sed lectus porta volutpat."),
@@ -748,7 +490,7 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self):
overlap_prefix="feugiat efficitur.",
opts=opts,
)
- assert other_pre_chunk == TextPreChunk(
+ assert other_pre_chunk == PreChunk(
[
Text("Donec semper facilisis metus finibus malesuada."),
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
@@ -757,6 +499,52 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self):
opts=opts,
)
+ @pytest.mark.parametrize(
+ ("text", "expected_value"),
+ [
+ # -- normally it splits exactly on overlap size |------- 20 -------|
+ ("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."),
+ # -- but it strips leading and trailing whitespace when the tail includes it --
+ ("In rhoncus ipsum sed lect us portas volutpat. ", "us portas volutpat."),
+ ],
+ )
+ def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap(
+ self, text: str, expected_value: str
+ ):
+ pre_chunk = PreChunk(
+ [Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
+ )
+ assert pre_chunk.overlap_tail == expected_value
+
+ @pytest.mark.parametrize(
+ ("elements", "overlap_prefix", "expected_value"),
+ [
+ ([Text("foo"), Text("bar")], "bah da bing.", "bah da bing.\n\nfoo\n\nbar"),
+ ([Text("foo"), PageBreak(""), Text("bar")], "da bang.", "da bang.\n\nfoo\n\nbar"),
+ ([PageBreak(""), Text("foo")], "bah da boom.", "bah da boom.\n\nfoo"),
+ ([Text("foo"), Text("bar"), PageBreak("")], "", "foo\n\nbar"),
+ ],
+ )
+ def it_knows_the_concatenated_text_of_the_pre_chunk_to_help(
+ self, elements: list[Text], overlap_prefix: str, expected_value: str
+ ):
+ """._text is the "joined" text of the pre-chunk elements.
+
+ The text-segment contributed by each element is separated from the next by a blank line
+ ("\n\n"). An element that contributes no text does not give rise to a separator.
+ """
+ pre_chunk = PreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions())
+ assert pre_chunk._text == expected_value
+
+
+# ================================================================================================
+# CHUNKING HELPER/SPLITTERS
+# ================================================================================================
+
+
+class Describe_Chunker:
+ """Unit-test suite for `unstructured.chunking.base._Chunker` objects."""
+
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
elements = [
Title("Introduction"),
@@ -766,16 +554,23 @@ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window
),
]
opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
- pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts)
+ chunker = _Chunker(
+ elements,
+ text=(
+ "e feugiat efficitur.\n\nIntroduction\n\nLorem ipsum dolor sit amet consectetur"
+ " adipiscing elit. In rhoncus ipsum sed lectus porta volutpat."
+ ),
+ opts=opts,
+ )
- chunk_iter = pre_chunk.iter_chunks()
+ chunk_iter = chunker._iter_chunks()
chunk = next(chunk_iter)
assert chunk == CompositeElement(
"e feugiat efficitur.\n\nIntroduction\n\nLorem ipsum dolor sit amet consectetur"
" adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.",
)
- assert chunk.metadata is pre_chunk._consolidated_metadata
+ assert chunk.metadata is chunker._consolidated_metadata
assert chunk.metadata.orig_elements == elements
# --
with pytest.raises(StopIteration):
@@ -783,19 +578,17 @@ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
- # -- The pre-chunker will isolate that element in a pre_chunk of its own.
- elements = [
- Text(
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
- " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
- " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
- " commodo consequat."
- )
- ]
+ # -- The pre-chunker will automatically isolate that element in a pre_chunk of its own.
+ text = (
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor"
+ " incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud"
+ " exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
+ )
+ elements = [Text(text)]
opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
- pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts)
+ chunker = _Chunker(elements, text=text, opts=opts)
- chunk_iter = pre_chunk.iter_chunks()
+ chunk_iter = chunker._iter_chunks()
# -- Note that .metadata.orig_elements is the same single original element, "repeated" for
# -- each text-split chunk. This behavior emerges without explicit command as a consequence
@@ -807,93 +600,70 @@ def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(se
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
" veniam, quis nostrud exercitation ullamco laboris nisi ut"
)
- assert chunk.metadata is pre_chunk._consolidated_metadata
+ assert chunk.metadata is chunker._consolidated_metadata
assert chunk.metadata.orig_elements == elements
# --
chunk = next(chunk_iter)
assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
- assert chunk.metadata is pre_chunk._continuation_metadata
+ assert chunk.metadata is chunker._continuation_metadata
assert chunk.metadata.orig_elements == elements
# --
with pytest.raises(StopIteration):
next(chunk_iter)
- def and_it_adds_the_is_continuation_flag_for_second_and_later_text_split_chunks(self):
+ def and_it_adds_the_is_continuation_flag_for_second_and_later_split_chunks(self):
+ # -- |--------------------- 48 ---------------------|
+ text = "'Lorem ipsum dolor' means 'Thank you very much'."
metadata = ElementMetadata(
category_depth=0,
filename="foo.docx",
languages=["lat"],
parent_id="f87731e0",
)
+ elements = [Text(text, metadata=metadata)]
- pre_chunk = TextPreChunk(
- # -- |--------------------- 48 ---------------------|
- [Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata)],
- overlap_prefix="",
- opts=ChunkingOptions(max_characters=20),
- )
-
- chunk_iter = pre_chunk.iter_chunks()
+ chunk_iter = _Chunker.iter_chunks(elements, text, opts=ChunkingOptions(max_characters=20))
assert [c.metadata.is_continuation for c in chunk_iter] == [None, True, True]
def but_it_generates_no_chunks_when_the_pre_chunk_contains_no_text(self):
metadata = ElementMetadata()
- pre_chunk = TextPreChunk(
- [PageBreak("", metadata=metadata)],
- overlap_prefix="",
+
+ chunk_iter = _Chunker.iter_chunks(
+ [PageBreak(" ", metadata=metadata)],
+ text="",
opts=ChunkingOptions(),
)
- chunk_iter = pre_chunk.iter_chunks()
-
with pytest.raises(StopIteration):
next(chunk_iter)
- @pytest.mark.parametrize(
- ("text", "expected_value"),
- [
- # -- normally it splits exactly on overlap size |------- 20 -------|
- ("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."),
- # -- but it strips leading and trailing whitespace when the tail includes it --
- ("In rhoncus ipsum sed lectus porta volutpat. ", "porta volutpat."),
- ],
- )
- def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap(
- self, text: str, expected_value: str
- ):
- pre_chunk = TextPreChunk(
- [Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
- )
- assert pre_chunk.overlap_tail == expected_value
-
def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
- pre_chunk = TextPreChunk(
- [
- Title(
- "Lorem Ipsum",
- metadata=ElementMetadata(
- category_depth=0,
- filename="foo.docx",
- languages=["lat"],
- parent_id="f87731e0",
- ),
+ elements = [
+ Title(
+ "Lorem Ipsum",
+ metadata=ElementMetadata(
+ category_depth=0,
+ filename="foo.docx",
+ languages=["lat"],
+ parent_id="f87731e0",
),
- Text(
- "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
- metadata=ElementMetadata(
- category_depth=1,
- filename="foo.docx",
- image_path="sprite.png",
- languages=["lat", "eng"],
- ),
+ ),
+ Text(
+ "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
+ metadata=ElementMetadata(
+ category_depth=1,
+ filename="foo.docx",
+ image_path="sprite.png",
+ languages=["lat", "eng"],
),
- ],
- overlap_prefix="",
- opts=ChunkingOptions(),
- )
+ ),
+ ]
+ text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin."
+
+ chunker = _Chunker(elements, text=text, opts=ChunkingOptions())
- assert pre_chunk._all_metadata_values == {
+ assert chunker._all_metadata_values == {
# -- scalar values are accumulated in a list in element order --
"category_depth": [0, 1],
# -- all values are accumulated, not only unique ones --
@@ -920,19 +690,17 @@ def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
image_path="sprite.png",
languages=["lat", "eng"],
)
- metadata_2.quotient = 1.74
-
- pre_chunk = TextPreChunk(
- [
- Title("Lorem Ipsum", metadata=metadata),
- Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
- ],
- overlap_prefix="",
- opts=ChunkingOptions(),
- )
+ metadata_2.quotient = 1.74
+ elements = [
+ Title("Lorem Ipsum", metadata=metadata),
+ Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
+ ]
+ text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin."
+
+ chunker = _Chunker(elements, text=text, opts=ChunkingOptions())
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
- assert pre_chunk._all_metadata_values == {
+ assert chunker._all_metadata_values == {
"category_depth": [0, 1],
"filename": ["foo.docx", "foo.docx"],
"image_path": ["sprite.png"],
@@ -945,9 +713,11 @@ def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
metadata = ElementMetadata(filename="foo.pdf")
element = Title("Lorem Ipsum", metadata=metadata)
element_2 = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata)
- pre_chunk = TextPreChunk([element, element_2], overlap_prefix="", opts=opts)
+ elements = [element, element_2]
+ text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin."
+ chunker = _Chunker(elements, text=text, opts=opts)
- consolidated_metadata = pre_chunk._consolidated_metadata
+ consolidated_metadata = chunker._consolidated_metadata
# -- pre-chunk elements are included as metadata --
orig_elements = consolidated_metadata.orig_elements
@@ -963,40 +733,38 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate
Only non-None fields should appear in the dict and each field value should be the
consolidation of the values across the pre_chunk elements.
"""
- pre_chunk = TextPreChunk(
- [
- PageBreak(""),
- Title(
- "Lorem Ipsum",
- metadata=ElementMetadata(
- filename="foo.docx",
- # -- category_depth has DROP strategy so doesn't appear in result --
- category_depth=0,
- emphasized_text_contents=["Lorem", "Ipsum"],
- emphasized_text_tags=["b", "i"],
- languages=["lat"],
- ),
+ elements = [
+ PageBreak(""),
+ Title(
+ "Lorem Ipsum",
+ metadata=ElementMetadata(
+ filename="foo.docx",
+ # -- category_depth has DROP strategy so doesn't appear in result --
+ category_depth=0,
+ emphasized_text_contents=["Lorem", "Ipsum"],
+ emphasized_text_tags=["b", "i"],
+ languages=["lat"],
),
- Text(
- "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
- metadata=ElementMetadata(
- # -- filename change doesn't happen IRL but demonstrates FIRST strategy --
- filename="bar.docx",
- # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
- # -- appears twice in consolidated-meta (as it should) and length matches
- # -- that of emphasized_text_tags both before and after consolidation.
- emphasized_text_contents=["Lorem", "ipsum"],
- emphasized_text_tags=["i", "b"],
- # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
- languages=["eng", "lat"],
- ),
+ ),
+ Text(
+ "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
+ metadata=ElementMetadata(
+ # -- filename change doesn't happen IRL but demonstrates FIRST strategy --
+ filename="bar.docx",
+ # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
+ # -- appears twice in consolidated-meta (as it should) and length matches
+ # -- that of emphasized_text_tags both before and after consolidation.
+ emphasized_text_contents=["Lorem", "ipsum"],
+ emphasized_text_tags=["i", "b"],
+ # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
+ languages=["eng", "lat"],
),
- ],
- overlap_prefix="",
- opts=ChunkingOptions(),
- )
+ ),
+ ]
+ text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin."
+ chunker = _Chunker(elements, text=text, opts=ChunkingOptions())
- meta_kwargs = pre_chunk._meta_kwargs
+ meta_kwargs = chunker._meta_kwargs
assert meta_kwargs == {
"filename": "foo.docx",
@@ -1006,19 +774,21 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate
}
def it_computes_the_original_elements_list_to_help(self):
+ opts = ChunkingOptions(include_orig_elements=True)
element = Title("Introduction")
element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
element_3 = CompositeElement(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
)
- pre_chunk = TextPreChunk(
- [element, element_2, element_3],
- overlap_prefix="",
- opts=ChunkingOptions(include_orig_elements=True),
+ elements = [element, element_2, element_3]
+ text = (
+ "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn"
+ " rhoncus ipsum sed lectus porta volutpat."
)
+ chunker = _Chunker(elements, text=text, opts=opts)
- orig_elements = pre_chunk._orig_elements
+ orig_elements = chunker._orig_elements
# -- all elements of pre-chunk are included --
assert orig_elements == [element, element_2, element_3]
@@ -1029,39 +799,233 @@ def it_computes_the_original_elements_list_to_help(self):
assert orig_elements[2] is not element_3
assert orig_elements[2].metadata.orig_elements is None
# -- computation is only on first call, all chunks get exactly the same orig-elements --
- assert pre_chunk._orig_elements is orig_elements
+ assert chunker._orig_elements is orig_elements
+
+
+class Describe_TableChunker:
+ """Unit-test suite for `unstructured.chunking.base._TableChunker` objects."""
+
+ def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
+ html_table = (
+ "<table>\n"
+ "<thead>\n"
+ "<tr><th>Header Col 1</th><th>Header Col 2</th></tr>\n"
+ "</thead>\n"
+ "<tbody>\n"
+ "<tr><td>Lorem ipsum</td><td>adipiscing</td></tr>\n"
+ "</tbody>\n"
+ "</table>"
+ )
+ text_table = "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
+
+ chunk_iter = _TableChunker.iter_chunks(
+ Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
+ overlap_prefix="ctus porta volutpat.",
+ opts=ChunkingOptions(max_characters=175),
+ )
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, Table)
+ assert chunk.text == (
+ "ctus porta volutpat.\nHeader Col 1 Header Col 2\nLorem ipsum adipiscing"
+ )
+ assert chunk.metadata.text_as_html == (
+ "<table>"
+ "<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
+ "<tr><td>Lorem ipsum</td><td>adipiscing</td></tr>"
+ "</table>"
+ )
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def but_not_when_the_table_is_empty_or_contains_only_whitespace(self):
+ html_table = ""
+
+ chunk_iter = _TableChunker.iter_chunks(
+ Table(" \t \n ", metadata=ElementMetadata(text_as_html=html_table)),
+ overlap_prefix="volutpat.",
+ opts=ChunkingOptions(max_characters=175),
+ )
+
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
+ table = Table("foo bar", metadata=ElementMetadata(text_as_html=""))
+ opts = ChunkingOptions(include_orig_elements=True)
+
+ chunk_iter = _TableChunker.iter_chunks(table, "", opts)
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, Table)
+ assert chunk.metadata.orig_elements == [table]
+ assert chunk.metadata.text_as_html == ""
+ # --
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def but_not_when_instructed_not_to(self):
+ chunk_iter = _TableChunker.iter_chunks(
+ Table("foobar"), "", ChunkingOptions(include_orig_elements=False)
+ )
+
+ chunk = next(chunk_iter)
+
+ assert isinstance(chunk, Table)
+ assert chunk.metadata.orig_elements is None
+
+ def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
+ html_table = """\
+
+
+ Header Col 1 | Header Col 2 |
+
+
+ Lorem ipsum | A Link example |
+ Consectetur | adipiscing elit |
+ Nunc aliquam | id enim nec molestie |
+
+
+ """
+ text_table = (
+ "Header Col 1 Header Col 2\n"
+ "Lorem ipsum dolor sit amet\n"
+ "Consectetur adipiscing elit\n"
+ "Nunc aliquam id enim nec molestie\n"
+ "Vivamus quis nunc ipsum donec ac fermentum"
+ )
+
+ chunk_iter = _TableChunker.iter_chunks(
+ Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
+ overlap_prefix="",
+ opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")),
+ )
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == (
+ "Header Col 1 Header Col 2 Lorem ipsum A Link example Consectetur adipiscing elit"
+ )
+ assert chunk.metadata.text_as_html == (
+ "<table>"
+ "<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
+ "<tr><td>Lorem ipsum</td><td>A Link example</td></tr>"
+ "<tr><td>Consectetur</td><td>adipiscing elit</td></tr>"
+ "</table>"
+ )
+ assert chunk.metadata.is_continuation is None
+ # --
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == "Nunc aliquam id enim nec molestie"
+ assert chunk.metadata.text_as_html == (
+ "<table><tr><td>Nunc aliquam</td><td>id enim nec molestie</td></tr></table>"
+ )
+ assert chunk.metadata.is_continuation
+ # --
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
+ """Even though text and html are split, the orig_elements metadata is not."""
+ table = Table(
+ "Header Col 1 Header Col 2\nLorem ipsum dolor sit amet",
+ metadata=ElementMetadata(text_as_html=""),
+ )
+ opts = ChunkingOptions(max_characters=30, include_orig_elements=True)
+
+ chunk_iter = _TableChunker.iter_chunks(table, overlap_prefix="", opts=opts)
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == "Header Col 1 Header Col 2"
+ assert chunk.metadata.orig_elements == [table]
+ assert not chunk.metadata.is_continuation
+ # --
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == "Lorem ipsum dolor sit amet"
+ assert chunk.metadata.orig_elements == [table]
+ assert chunk.metadata.is_continuation
@pytest.mark.parametrize(
- ("elements", "overlap_prefix", "expected_value"),
+ ("text", "overlap_prefix", "expected_value"),
[
- ([Text("foo"), Text("bar")], "bah da bing.", "bah da bing.\n\nfoo\n\nbar"),
- ([Text("foo"), PageBreak(""), Text("bar")], "da bang.", "da bang.\n\nfoo\n\nbar"),
- ([PageBreak(""), Text("foo")], "bah da boom.", "bah da boom.\n\nfoo"),
- ([Text("foo"), Text("bar"), PageBreak("")], "", "foo\n\nbar"),
+ (
+ "In rhoncus ipsum sed lectus porta volutpat.",
+ "",
+ "In rhoncus ipsum sed lectus porta volutpat.",
+ ),
+ (
+ "In rhoncus ipsum sed lectus porta volutpat.",
+ "ctus porta volutpat.",
+ "ctus porta volutpat.\nIn rhoncus ipsum sed lectus porta volutpat.",
+ ),
],
)
- def it_knows_the_concatenated_text_of_the_pre_chunk_to_help(
- self, elements: list[Text], overlap_prefix: str, expected_value: str
+ def it_includes_its_overlap_prefix_in_its_text_when_present(
+ self, text: str, overlap_prefix: str, expected_value: str
):
- """._text is the "joined" text of the pre-chunk elements.
+ table_chunker = _TableChunker(
+ Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
+ )
+ assert table_chunker._text_with_overlap == expected_value
- The text-segment contributed by each element is separated from the next by a blank line
- ("\n\n"). An element that contributes no text does not give rise to a separator.
- """
- pre_chunk = TextPreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions())
- assert pre_chunk._text == expected_value
+ def it_computes_metadata_for_each_chunk_to_help(self):
+ table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html=""))
+ table_chunker = _TableChunker(table, overlap_prefix="", opts=ChunkingOptions())
+
+ metadata = table_chunker._metadata
+
+ assert metadata.text_as_html == ""
+ # -- opts.include_orig_elements is True by default --
+ assert metadata.orig_elements == [table]
+ # -- it produces a new instance each time it is called so changing one chunk's metadata does
+ # -- not change that of any other chunk.
+ assert table_chunker._metadata is not metadata
+
+ def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
+ table_chunker = _TableChunker(
+ Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="")),
+ overlap_prefix="",
+ opts=ChunkingOptions(include_orig_elements=False),
+ )
+
+ assert table_chunker._metadata.orig_elements is None
+
+ def it_computes_the_original_elements_list_to_help(self):
+ table = Table(
+ "Lorem ipsum",
+ metadata=ElementMetadata(text_as_html="", orig_elements=[Table("Lorem Ipsum")]),
+ )
+ table_chunker = _TableChunker(table, overlap_prefix="", opts=ChunkingOptions())
+
+ orig_elements = table_chunker._orig_elements
+
+ # -- a _TableChunker always has exactly one original (Table) element --
+ assert len(orig_elements) == 1
+ orig_element = orig_elements[0]
+ # -- each item in orig_elements is a copy of the original element so we can mutate it
+ # -- without changing user's data.
+ assert orig_element == table
+ assert orig_element is not table
+ # -- it strips any .metadata.orig_elements from each element to prevent a recursive data
+ # -- structure
+ assert orig_element.metadata.orig_elements is None
+ # -- computation is only on first call, all chunks get exactly the same orig-elements --
+ assert table_chunker._orig_elements is orig_elements
# ================================================================================================
-# PRE-CHUNK SPLITTERS
+# HTML SPLITTERS
# ================================================================================================
-class Describe_TableSplitter:
- """Unit-test suite for `unstructured.chunking.base._TableSplitter`."""
+class Describe_HtmlTableSplitter:
+ """Unit-test suite for `unstructured.chunking.base._HtmlTableSplitter`."""
def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self):
- opts = ChunkingOptions(max_characters=(150))
+ opts = ChunkingOptions(max_characters=(40))
html_table = HtmlTable.from_html_text(
"""
@@ -1097,7 +1061,7 @@ def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self):
"""
)
- assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
+ assert list(_HtmlTableSplitter.iter_subtables(html_table, opts)) == [
(
"Stanley Cups Team Location Stanley Cups",
""
@@ -1119,7 +1083,7 @@ def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self):
]
def and_it_splits_an_oversized_row_on_an_even_cell_boundary_when_possible(self):
- opts = ChunkingOptions(max_characters=(100))
+ opts = ChunkingOptions(max_characters=(93))
html_table = HtmlTable.from_html_text(
"""
@@ -1143,7 +1107,7 @@ def and_it_splits_an_oversized_row_on_an_even_cell_boundary_when_possible(self):
"""
)
- assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
+ assert list(_HtmlTableSplitter.iter_subtables(html_table, opts)) == [
(
"Lorem ipsum dolor sit amet. Consectetur adipiscing elit.",
""
@@ -1189,7 +1153,7 @@ def and_it_splits_an_oversized_cell_on_an_even_word_boundary(self):
"""
)
- assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
+ assert list(_HtmlTableSplitter.iter_subtables(html_table, opts)) == [
(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do",
""
@@ -1346,16 +1310,14 @@ def it_accumulates_elements_added_to_it(self):
("<td>Lorem Ipsum dolor sit amet.</td>", False),
],
)
- def it_will_fit_a_cell_with_text_shorter_than_maxlen_minus_33_when_empty(
+ def it_will_fit_a_cell_with_text_shorter_than_maxlen_when_empty(
self, cell_html: str, expected_value: bool
):
- """Cell text must be 22-chars or shorter to fit in 55-char window.
-
- `<table><tr><td>...</td></tr></table>` overhead is 33 characters.
- """
- accum = _CellAccumulator(maxlen=55)
+ accum = _CellAccumulator(maxlen=25)
cell = HtmlCell(fragment_fromstring(cell_html))
+ print(f"{cell.text=}")
+
assert accum.will_fit(cell) is expected_value
@pytest.mark.parametrize(
@@ -1368,16 +1330,12 @@ def it_will_fit_a_cell_with_text_shorter_than_maxlen_minus_33_when_empty(
("<td>Lorem Ipsum dolor sit amet.</td>", False), # -- 27 --
],
)
- def and_it_will_fit_a_cell_with_text_shorter_than_remaining_space_minus_9_when_not_empty(
+ def and_it_will_fit_a_cell_with_text_shorter_than_remaining_space_when_not_empty(
self, cell_html: str, expected_value: bool
):
- """Cell text must be 9-chars shorter than remaining space to fit with accumulated cells.
-
- `<td>...</td>` overhead is 9 characters.
- """
- accum = _CellAccumulator(maxlen=85)
+ accum = _CellAccumulator(maxlen=44)
accum.add_cell(HtmlCell(fragment_fromstring("<td>abcdefghijklmnopqrstuvwxyz</td>")))
- # -- remaining space is 85 - 26 -33 = 26; max new cell text len is 17 --
+ # -- remaining space is 44 - 26 = 18; max new cell text len is 17 --
cell = HtmlCell(fragment_fromstring(cell_html))
assert accum.will_fit(cell) is expected_value
@@ -1429,23 +1387,19 @@ def it_accumulates_rows_added_to_it(self):
@pytest.mark.parametrize(
("row_html", "expected_value"),
[
- ("<tr/>", True), # -- 5 --
- ("<tr><td/></tr>", True), # -- 14 --
- ("<tr><td>Lorem Ipsum.</td></tr>", True), # -- 30 --
- ("<tr><td>Lorem Ipsum dolor sit.</td></tr>", True), # -- 40 --
- ("<tr><td>Lorem</td><td>Sit amet</td></tr>", True), # -- 40 --
- ("<tr><td>Lorem Ipsum dolor sit amet.</td></tr>", False), # -- 45 --
- ("<tr><td>Lorem Ipsum</td><td>Dolor sit.</td></tr>", False), # -- 48 --
+ ("<tr/>", True), # -- 0 --
+ ("<tr><td/></tr>", True), # -- 0 --
+ ("<tr><td>Lorem Ipsum.</td></tr>", True), # -- 12 --
+ ("<tr><td>Lorem Ipsum dolor sit</td></tr>", True), # -- 21 --
+ ("<tr><td>Lorem</td><td>Sit amet</td></tr>", True), # -- 14 --
+ ("<tr><td>Lorem Ipsum dolor sit amet.</td></tr>", False), # -- 27 --
+ ("<tr><td>Lorem Ipsum</td><td>Dolor sit.</td></tr>", False), # -- 22 --
],
)
- def it_will_fit_a_row_with_HTML_shorter_than_maxlen_minus_15_when_empty(
+ def it_will_fit_a_row_with_text_shorter_than_maxlen_when_empty(
self, row_html: str, expected_value: bool
):
- """Row HTML must be 40-chars or shorter to fit in 55-char chunking window.
-
- `<table>...</table>` overhead is 15 characters.
- """
- accum = _RowAccumulator(maxlen=55)
+ accum = _RowAccumulator(maxlen=21)
row = HtmlRow(fragment_fromstring(row_html))
assert accum.will_fit(row) is expected_value
@@ -1453,22 +1407,22 @@ def it_will_fit_a_row_with_HTML_shorter_than_maxlen_minus_15_when_empty(
@pytest.mark.parametrize(
("row_html", "expected_value"),
[
- ("<tr/>", True), # -- 5 --
- ("<tr><td/></tr>", True), # -- 14 --
- ("<tr><td>Lorem Ipsum dolor sit</td></tr>", True), # -- 39 --
- ("<tr><td>Lorem Ipsum dolor sit.</td></tr>", True), # -- 40 --
- ("<tr><td>Lorem</td><td>Sit amet</td></tr>", True), # -- 40 --
- ("<tr><td>Lorem</td><td>Sit amet.</td></tr>", False), # -- 41 --
- ("<tr><td>Lorem Ipsum</td><td>Dolor sit.</td></tr>", False), # -- 48 --
+ ("<tr/>", True), # -- 0 --
+ ("<tr><td/></tr>", True), # -- 0 --
+ ("<tr><td>Lorem Ipsum.</td></tr>", True), # -- 12 --
+ ("<tr><td>Lorem Ipsum dolor sit</td></tr>", True), # -- 21 --
+ ("<tr><td>Lorem</td><td>Sit amet</td></tr>", True), # -- 14 --
+ ("<tr><td>Lorem Ipsum dolor sit amet.</td></tr>", False), # -- 27 --
+ ("<tr><td>Lorem Ipsum</td><td>Dolor sit.</td></tr>", False), # -- 22 --
],
)
- def and_it_will_fit_a_row_with_HTML_shorter_than_remaining_space_when_not_empty(
+ def and_it_will_fit_a_row_with_text_shorter_than_remaining_space_when_not_empty(
self, row_html: str, expected_value: bool
):
"""There is no overhead beyond row HTML for additional rows."""
- accum = _RowAccumulator(maxlen=99)
+ accum = _RowAccumulator(maxlen=48)
accum.add_row(HtmlRow(fragment_fromstring("<tr><td>abcdefghijklmnopqrstuvwxyz</td></tr>")))
- # -- remaining space is 85 - 26 - 33 = 26; max new row HTML len is 40 --
+ # -- remaining space is 48 - 26 = 22; max new row text len is 21 --
row = HtmlRow(fragment_fromstring(row_html))
assert accum.will_fit(row) is expected_value
@@ -1514,10 +1468,10 @@ def but_it_does_not_generate_a_TextAndHtml_pair_when_empty(self):
class DescribePreChunkCombiner:
"""Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""
- def it_combines_sequential_small_text_pre_chunks(self):
+ def it_combines_sequential_small_pre_chunks(self):
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
pre_chunks = [
- TextPreChunk(
+ PreChunk(
[
Title("Lorem Ipsum"), # 11
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
@@ -1525,7 +1479,8 @@ def it_combines_sequential_small_text_pre_chunks(self):
overlap_prefix="",
opts=opts,
),
- TextPreChunk(
+ PreChunk([Table("Heading\nCell text")], overlap_prefix="", opts=opts),
+ PreChunk(
[
Title("Mauris Nec"), # 10
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
@@ -1533,7 +1488,7 @@ def it_combines_sequential_small_text_pre_chunks(self):
overlap_prefix="",
opts=opts,
),
- TextPreChunk(
+ PreChunk(
[
Title("Sed Orci"), # 8
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
@@ -1546,10 +1501,11 @@ def it_combines_sequential_small_text_pre_chunks(self):
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
+ Table("Heading\nCell text"),
Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
Title("Sed Orci"),
@@ -1558,57 +1514,10 @@ def it_combines_sequential_small_text_pre_chunks(self):
with pytest.raises(StopIteration):
next(pre_chunk_iter)
- def but_it_does_not_combine_table_pre_chunks(self):
- opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
- pre_chunks = [
- TextPreChunk(
- [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- ],
- overlap_prefix="",
- opts=opts,
- ),
- TablePreChunk(Table("Heading\nCell text"), overlap_prefix="", opts=opts),
- TextPreChunk(
- [
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- ],
- overlap_prefix="",
- opts=opts,
- ),
- ]
-
- pre_chunk_iter = PreChunkCombiner(
- pre_chunks, ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
- ).iter_combined_pre_chunks()
-
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Lorem Ipsum"),
- Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
- ]
- # --
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TablePreChunk)
- assert pre_chunk._table == Table("Heading\nCell text")
- # --
- pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
- Title("Mauris Nec"),
- Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
- ]
- # --
- with pytest.raises(StopIteration):
- next(pre_chunk_iter)
-
def it_respects_the_specified_combination_threshold(self):
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=80)
pre_chunks = [
- TextPreChunk( # 68
+ PreChunk( # 68
[
Title("Lorem Ipsum"), # 11
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
@@ -1616,7 +1525,7 @@ def it_respects_the_specified_combination_threshold(self):
overlap_prefix="",
opts=opts,
),
- TextPreChunk( # 71
+ PreChunk( # 71
[
Title("Mauris Nec"), # 10
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
@@ -1625,7 +1534,7 @@ def it_respects_the_specified_combination_threshold(self):
opts=opts,
),
# -- len == 139
- TextPreChunk(
+ PreChunk(
[
Title("Sed Orci"), # 8
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
@@ -1638,7 +1547,7 @@ def it_respects_the_specified_combination_threshold(self):
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
@@ -1647,7 +1556,7 @@ def it_respects_the_specified_combination_threshold(self):
]
# --
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
@@ -1659,7 +1568,7 @@ def it_respects_the_specified_combination_threshold(self):
def it_respects_the_hard_maximum_window_length(self):
opts = ChunkingOptions(max_characters=200, combine_text_under_n_chars=200)
pre_chunks = [
- TextPreChunk( # 68
+ PreChunk( # 68
[
Title("Lorem Ipsum"), # 11
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
@@ -1667,7 +1576,7 @@ def it_respects_the_hard_maximum_window_length(self):
overlap_prefix="",
opts=opts,
),
- TextPreChunk( # 71
+ PreChunk( # 71
[
Title("Mauris Nec"), # 10
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
@@ -1676,7 +1585,7 @@ def it_respects_the_hard_maximum_window_length(self):
opts=opts,
),
# -- len == 139
- TextPreChunk(
+ PreChunk(
[
Title("Sed Orci"), # 8
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
@@ -1690,7 +1599,7 @@ def it_respects_the_hard_maximum_window_length(self):
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
@@ -1699,7 +1608,7 @@ def it_respects_the_hard_maximum_window_length(self):
]
# --
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
@@ -1712,8 +1621,8 @@ def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
"""Such as occurs when a single element exceeds the window size."""
opts = ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
pre_chunks = [
- TextPreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts),
- TextPreChunk( # 179
+ PreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts),
+ PreChunk( # 179
[
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
@@ -1724,7 +1633,7 @@ def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
overlap_prefix="",
opts=opts,
),
- TextPreChunk([Title("Vulputate Consequat")], overlap_prefix="", opts=opts),
+ PreChunk([Title("Vulputate Consequat")], overlap_prefix="", opts=opts),
]
pre_chunk_iter = PreChunkCombiner(
@@ -1732,11 +1641,11 @@ def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
).iter_combined_pre_chunks()
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [Title("Lorem Ipsum")]
# --
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit."
@@ -1746,21 +1655,21 @@ def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
]
# --
pre_chunk = next(pre_chunk_iter)
- assert isinstance(pre_chunk, TextPreChunk)
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [Title("Vulputate Consequat")]
# --
with pytest.raises(StopIteration):
next(pre_chunk_iter)
-class DescribeTextPreChunkAccumulator:
- """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""
+class Describe_PreChunkAccumulator:
+ """Unit-test suite for `unstructured.chunking.base._PreChunkAccumulator`."""
- def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
+ def it_generates_a_combined_PreChunk_when_flushed_and_resets_itself_to_empty(self):
opts = ChunkingOptions(combine_text_under_n_chars=500)
- accum = TextPreChunkAccumulator(opts=opts)
+ accum = _PreChunkAccumulator(opts=opts)
- pre_chunk = TextPreChunk(
+ pre_chunk = PreChunk(
[
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
@@ -1771,7 +1680,7 @@ def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty
assert accum.will_fit(pre_chunk)
accum.add_pre_chunk(pre_chunk)
- pre_chunk = TextPreChunk(
+ pre_chunk = PreChunk(
[
Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
@@ -1782,7 +1691,7 @@ def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty
assert accum.will_fit(pre_chunk)
accum.add_pre_chunk(pre_chunk)
- pre_chunk = TextPreChunk(
+ pre_chunk = PreChunk(
[
Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
@@ -1799,8 +1708,8 @@ def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty
pre_chunk = next(pre_chunk_iter)
with pytest.raises(StopIteration):
next(pre_chunk_iter)
- # -- and it is a _TextPreChunk containing all the elements --
- assert isinstance(pre_chunk, TextPreChunk)
+ # -- and it is a PreChunk containing all the elements --
+ assert isinstance(pre_chunk, PreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
@@ -1815,8 +1724,8 @@ def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty
with pytest.raises(StopIteration):
next(accum.flush())
- def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
- accum = TextPreChunkAccumulator(opts=ChunkingOptions(max_characters=150))
+ def but_it_does_not_generate_a_PreChunk_on_flush_when_empty(self):
+ accum = _PreChunkAccumulator(opts=ChunkingOptions(max_characters=150))
assert list(accum.flush()) == []
diff --git a/test_unstructured/chunking/test_basic.py b/test_unstructured/chunking/test_basic.py
index 85f807b6ff..88e01563fe 100644
--- a/test_unstructured/chunking/test_basic.py
+++ b/test_unstructured/chunking/test_basic.py
@@ -25,31 +25,31 @@ def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_parti
assert chunks == [
CompositeElement(
"US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 – INTRODUCTION"
- "\n\nA.\tPURPOSE"
+ "\n\nA. PURPOSE"
),
CompositeElement(
"The United States Trustee appoints and supervises standing trustees and monitors and"
- " supervises cases under chapter 13 of title 11 of the United States Code. 28 U.S.C."
- " § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586,"
+ " supervises cases under chapter 13 of title 11 of the United States Code. 28 U.S.C."
+ " § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586,"
" establishes or clarifies the position of the United States Trustee Program (Program)"
" on the duties owed by a standing trustee to the debtors, creditors, other parties in"
- " interest, and the United States Trustee. The Handbook does not present a full and"
+ " interest, and the United States Trustee. The Handbook does not present a full and"
),
CompositeElement(
"complete statement of the law; it should not be used as a substitute for legal"
- " research and analysis. The standing trustee must be familiar with relevant"
+ " research and analysis. The standing trustee must be familiar with relevant"
" provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules),"
- " any local bankruptcy rules, and case law. 11 U.S.C. § 321, 28 U.S.C. § 586,"
- " 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips"
+ " any local bankruptcy rules, and case law. 11 U.S.C. § 321, 28 U.S.C. § 586,"
+ " 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips"
" identified in this Handbook but these are not considered mandatory."
),
CompositeElement(
"Nothing in this Handbook should be construed to excuse the standing trustee from"
" complying with all duties imposed by the Bankruptcy Code and Rules, local rules, and"
- " orders of the court. The standing trustee should notify the United States Trustee"
+ " orders of the court. The standing trustee should notify the United States Trustee"
" whenever the provision of the Handbook conflicts with the local rules or orders of"
- " the court. The standing trustee is accountable for all duties set forth in this"
- " Handbook, but need not personally perform any duty unless otherwise indicated. All"
+ " the court. The standing trustee is accountable for all duties set forth in this"
+ " Handbook, but need not personally perform any duty unless otherwise indicated. All"
),
CompositeElement(
"statutory references in this Handbook refer to the Bankruptcy Code, 11 U.S.C. § 101"
@@ -57,12 +57,12 @@ def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_parti
),
CompositeElement(
"This Handbook does not create additional rights against the standing trustee or"
- " United States Trustee in favor of other parties.\n\nB.\tROLE OF THE UNITED STATES"
+ " United States Trustee in favor of other parties.\n\nB. ROLE OF THE UNITED STATES"
" TRUSTEE"
),
CompositeElement(
"The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the"
- " responsibilities for daytoday administration of cases. Debtors, creditors, and"
+ " responsibilities for daytoday administration of cases. Debtors, creditors, and"
" third parties with adverse interests to the trustee were concerned that the court,"
" which previously appointed and supervised the trustee, would not impartially"
" adjudicate their rights as adversaries of that trustee. To address these concerns,"
@@ -70,24 +70,24 @@ def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_parti
),
CompositeElement(
"Many administrative functions formerly performed by the court were placed within the"
- " Department of Justice through the creation of the Program. Among the administrative"
+ " Department of Justice through the creation of the Program. Among the administrative"
" functions assigned to the United States Trustee were the appointment and supervision"
- " of chapter 13 trustees./ This Handbook is issued under the authority of the"
- " Program’s enabling statutes. \n\nC.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t"
+ " of chapter 13 trustees./ This Handbook is issued under the authority of the"
+ " Program’s enabling statutes.\n\nC. STATUTORY DUTIES OF A STANDING TRUSTEE"
),
CompositeElement(
- "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The"
- " standing trustee is more than a mere disbursing agent. The standing trustee must"
- " be personally involved in the trustee operation. If the standing trustee is or"
+ "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The"
+ " standing trustee is more than a mere disbursing agent. The standing trustee must"
+ " be personally involved in the trustee operation. If the standing trustee is or"
" becomes unable to perform the duties and responsibilities of a standing trustee,"
" the standing trustee must immediately advise the United States Trustee."
- " 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. § 58.3(b)."
+ " 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. § 58.3(b)."
),
CompositeElement(
"Although this Handbook is not intended to be a complete statutory reference, the"
" standing trustee’s primary statutory duties are set forth in 11 U.S.C. § 1302, which"
" incorporates by reference some of the duties of chapter 7 trustees found in"
- " 11 U.S.C. § 704. These duties include, but are not limited to, the"
+ " 11 U.S.C. § 704. These duties include, but are not limited to, the"
" following:\n\nCopyright"
),
]
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
index 45071e667e..443b073755 100644
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@@ -8,7 +8,7 @@
import pytest
-from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
+from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock, input_path
from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem
@@ -20,10 +20,12 @@
ElementMetadata,
ListItem,
Table,
+ TableChunk,
Text,
Title,
)
from unstructured.partition.html import partition_html
+from unstructured.staging.base import elements_from_json
# ================================================================================================
# INTEGRATION-TESTS
@@ -33,7 +35,53 @@
# ================================================================================================
-def test_it_splits_a_large_element_into_multiple_chunks():
+def test_it_chunks_text_followed_by_table_together_when_both_fit():
+ elements = elements_from_json(input_path("chunking/title_table_200.json"))
+
+ chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
+
+ assert len(chunks) == 1
+ assert isinstance(chunks[0], CompositeElement)
+
+
+def test_it_chunks_table_followed_by_text_together_when_both_fit():
+ elements = elements_from_json(input_path("chunking/table_text_200.json"))
+
+ # -- disable chunk combining so we test pre-chunking behavior, not chunk-combining --
+ chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
+
+ assert len(chunks) == 1
+ assert isinstance(chunks[0], CompositeElement)
+
+
+def test_it_splits_oversized_table():
+ elements = elements_from_json(input_path("chunking/table_2000.json"))
+
+ chunks = chunk_by_title(elements)
+
+ assert len(chunks) == 5
+ assert all(isinstance(chunk, TableChunk) for chunk in chunks)
+
+
+def test_it_starts_new_chunk_for_table_after_full_text_chunk():
+ elements = elements_from_json(input_path("chunking/long_text_table_200.json"))
+
+ chunks = chunk_by_title(elements, max_characters=250)
+
+ assert len(chunks) == 2
+ assert [type(chunk) for chunk in chunks] == [CompositeElement, Table]
+
+
+def test_it_starts_new_chunk_for_text_after_full_table_chunk():
+ elements = elements_from_json(input_path("chunking/full_table_long_text_250.json"))
+
+ chunks = chunk_by_title(elements, max_characters=250)
+
+ assert len(chunks) == 2
+ assert [type(chunk) for chunk in chunks] == [Table, CompositeElement]
+
+
+def test_it_splits_a_large_text_element_into_multiple_chunks():
elements: list[Element] = [
Title("Introduction"),
Text(
@@ -68,7 +116,7 @@ def test_it_splits_elements_by_title_and_table():
chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)
- assert len(chunks) == 4
+ assert len(chunks) == 3
# --
chunk = chunks[0]
assert isinstance(chunk, CompositeElement)
@@ -76,13 +124,10 @@ def test_it_splits_elements_by_title_and_table():
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
+ Table("Heading\nCell text"),
]
# --
chunk = chunks[1]
- assert isinstance(chunk, Table)
- assert chunk.metadata.orig_elements == [Table("Heading\nCell text")]
- # ==
- chunk = chunks[2]
assert isinstance(chunk, CompositeElement)
assert chunk.metadata.orig_elements == [
Title("An Okay Day"),
@@ -90,7 +135,7 @@ def test_it_splits_elements_by_title_and_table():
Text("It is rainy outside."),
]
# --
- chunk = chunks[3]
+ chunk = chunks[2]
assert isinstance(chunk, CompositeElement)
assert chunk.metadata.orig_elements == [
Title("A Bad Day"),
@@ -119,9 +164,8 @@ def test_chunk_by_title():
assert chunks == [
CompositeElement(
- "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
+ "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.\n\nHeading Cell text"
),
- Table("Heading\nCell text"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
@@ -150,10 +194,7 @@ def test_chunk_by_title_separates_by_page_number():
CompositeElement(
"A Great Day",
),
- CompositeElement(
- "Today is a great day.\n\nIt is sunny outside.",
- ),
- Table("Heading\nCell text"),
+ CompositeElement("Today is a great day.\n\nIt is sunny outside.\n\nHeading Cell text"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
@@ -178,9 +219,8 @@ def test_chuck_by_title_respects_multipage():
chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)
assert chunks == [
CompositeElement(
- "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
+ "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.\n\nHeading Cell text"
),
- Table("Heading\nCell text"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
@@ -206,9 +246,8 @@ def test_chunk_by_title_groups_across_pages():
assert chunks == [
CompositeElement(
- "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
+ "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.\n\nHeading Cell text"
),
- Table("Heading\nCell text"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py
index 5022588a23..7a591953d3 100644
--- a/test_unstructured/partition/test_json.py
+++ b/test_unstructured/partition/test_json.py
@@ -37,7 +37,7 @@ def test_it_chunks_elements_when_a_chunking_strategy_is_specified():
"example-docs/spring-weather.html.json", chunking_strategy="basic", max_characters=1500
)
- assert len(chunks) == 10
+ assert len(chunks) == 9
assert all(isinstance(ch, CompositeElement) for ch in chunks)
diff --git a/test_unstructured/testfiles/chunking/full_table_long_text_250.json b/test_unstructured/testfiles/chunking/full_table_long_text_250.json
new file mode 100644
index 0000000000..f8b739257d
--- /dev/null
+++ b/test_unstructured/testfiles/chunking/full_table_long_text_250.json
@@ -0,0 +1,32 @@
+[
+ {
+ "type": "Table",
+ "element_id": "ca96108263324e9d865a98f19cf7c940",
+ "text": "RFP Number: 2024-PMO-01 RFP Title: PMO Services RFP RFP Due Date and Time: Number of Pages: #189 05/30/2024 by 5:00pm Central Time",
+ "metadata": {
+ "category_depth": 1,
+ "page_number": 1,
+ "parent_id": "747587de72444235a68c768d544ff5f3",
+ "text_as_html": " RFP Number: 2024-PMO-01 | RFP Title: PMO Services RFP |
RFP Due Date and Time: | Number of Pages: #189 |
05/30/2024 by 5:00pm Central Time | |
",
+ "languages": [
+ "eng"
+ ],
+ "filetype": "text/html"
+ }
+ },
+ {
+ "type": "NarrativeText",
+ "element_id": "5bc93ad5828445f98cac824c750cacfd",
+ "text": "Format: CSV file for Export and Download Contact: Charles Stringham cstringham@alsde.edu to arrange secure data transfer OR with technical questions nickey.johnson@alsde.edu for other questions",
+ "metadata": {
+ "category_depth": 2,
+ "page_number": 1,
+ "parent_id": "d8fa364bbfdf42d7b37c7a1dcb90ecf5",
+ "text_as_html": "Format: CSV file for Export and Download
Contact: Charles Stringham cstringham@alsde.edu to arrange secure data transfer OR with technical questions
nickey.johnson@alsde.edu for other questions
",
+ "languages": [
+ "eng"
+ ],
+ "filetype": "text/html"
+ }
+ }
+]
diff --git a/test_unstructured/testfiles/chunking/long_text_table_200.json b/test_unstructured/testfiles/chunking/long_text_table_200.json
new file mode 100644
index 0000000000..4ff54bf06c
--- /dev/null
+++ b/test_unstructured/testfiles/chunking/long_text_table_200.json
@@ -0,0 +1,32 @@
+[
+ {
+ "type": "NarrativeText",
+ "element_id": "5bc93ad5828445f98cac824c750cacfd",
+ "text": "Format: CSV file for Export and Download Contact: Charles Stringham cstringham@alsde.edu to arrange secure data transfer OR with technical questions nickey.johnson@alsde.edu for other questions",
+ "metadata": {
+ "category_depth": 2,
+ "page_number": 1,
+ "parent_id": "d8fa364bbfdf42d7b37c7a1dcb90ecf5",
+ "text_as_html": "Format: CSV file for Export and Download
Contact: Charles Stringham cstringham@alsde.edu to arrange secure data transfer OR with technical questions
nickey.johnson@alsde.edu for other questions
",
+ "languages": [
+ "eng"
+ ],
+ "filetype": "text/html"
+ }
+ },
+ {
+ "type": "Table",
+ "element_id": "ca96108263324e9d865a98f19cf7c940",
+ "text": "RFP Number: 2024-PMO-01 RFP Title: PMO Services RFP RFP Due Date and Time: Number of Pages: #189 05/30/2024 by 5:00pm Central Time",
+ "metadata": {
+ "category_depth": 1,
+ "page_number": 1,
+ "parent_id": "747587de72444235a68c768d544ff5f3",
+ "text_as_html": " RFP Number: 2024-PMO-01 | RFP Title: PMO Services RFP |
RFP Due Date and Time: | Number of Pages: #189 |
05/30/2024 by 5:00pm Central Time | |
",
+ "languages": [
+ "eng"
+ ],
+ "filetype": "text/html"
+ }
+ }
+]
diff --git a/test_unstructured/testfiles/chunking/table_2000.json b/test_unstructured/testfiles/chunking/table_2000.json
new file mode 100644
index 0000000000..d5e013c383
--- /dev/null
+++ b/test_unstructured/testfiles/chunking/table_2000.json
@@ -0,0 +1,17 @@
+[
+ {
+ "type": "Table",
+ "element_id": "e6278883f688428c98cec628a00b0102",
+ "text": "Field Name Size Type Description Example School_Year 9 VARCHAR School year the assessment was given 2019-2020 LEA_Name VARCHAR Official Name of the School System Happy City Schools LEA_Code 3 VARCHAR 3-digit ALSDE-assigned system code 010 or 298 School_Code 6 VARCHAR 4-digit ALSDE-assigned school code 0100 or 9203 Student_Identifier 10 VARCHAR Student's ALSDE ID number -SSID ***must be 10 digits and start with \"19\" or \"20\"*** 9999999999 Student_Last_Name 35 VARCHAR Student's last name Smith Student_First_Name 35 VARCHAR Student's first name Jane Student_Date_of_Birth_Month 2 VARCHAR Student birth date month. MM 05, 11 Student_Date_of_Birth_Day 2 VARCHAR Student birth date day. DD 03, 25 Student_Date_of_Birth_Year 4 VARCHAR Student birth date Year. YYYY 2015 Reading_Teacher_Identifier 13 VARCHAR Reading Teacher's ALSDE ID/TCHNumber. The teacher who is primarily responsible for Reading instruction of the student. (These are two names for the same number). ***must be in this format 3 letters, dash, 4 numbers, dash, 4 numbers*** XXX-9999-9999, NOJ-1234-5678 Reading_Assessment_Name 15 VARCHAR Unique identifier for Reading assessment. Vendor's name for overall assessment. XXXX Reading_Administration_Mode 8 VARCHAR This field indicates if the assessment was administered in an in-person (face-to-face) or a remote learning environment. The options are: InPerson or Remote Reading_Benchmark_Period 3 VARCHAR Benchmark period during the term the assessment was administered. Summer School will be SSS. BOY, MOY or EOY (SSS for summer school) Reading_Date_Completed 10 VARCHAR This is the date on which the assessment is completed MM/DD/YYYY 43962 Reading_Extended_Time 2 VARCHAR The field will contain a \"Y\" if the student was given more than the allotted time to finish the assessment or any subtest of the assessment as defined by the vendor in a standard administration. Y",
+ "metadata": {
+ "category_depth": 1,
+ "page_number": 1,
+ "parent_id": "3ddff8c2b6c44a16be24baf72bdd78a2",
+ "text_as_html": " Field Name | Size | Type | Description | Example |
School_Year | 9 | VARCHAR | School year the assessment was given | 2019-2020 |
LEA_Name | | VARCHAR | Official Name of the School System | Happy City Schools |
LEA_Code | 3 | VARCHAR | 3-digit ALSDE-assigned system code | 010 or 298 |
School_Code | 6 | VARCHAR | 4-digit ALSDE-assigned school code | 0100 or 9203 |
Student_Identifier | 10 | VARCHAR | Student's ALSDE ID number -SSID ***must be 10 digits and start with \"19\" or \"20\"*** | 9999999999 |
Student_Last_Name | 35 | VARCHAR | Student's last name | Smith |
Student_First_Name | 35 | VARCHAR | Student's first name | Jane |
Student_Date_of_Birth_Month | 2 | VARCHAR | Student birth date month. MM | 05, 11 |
Student_Date_of_Birth_Day | 2 | VARCHAR | Student birth date day. DD | 03, 25 |
Student_Date_of_Birth_Year | 4 | VARCHAR | Student birth date Year. YYYY | 2015 |
Reading_Teacher_Identifier | 13 | VARCHAR | Reading Teacher's ALSDE ID/TCHNumber. The teacher who is primarily responsible for Reading instruction of the student. (These are two names for the same number). ***must be in this format 3 letters, dash, 4 numbers, dash, 4 numbers*** | XXX-9999-9999, NOJ-1234-5678 |
Reading_Assessment_Name | 15 | VARCHAR | Unique identifier for Reading assessment. Vendor's name for overall assessment. | XXXX |
Reading_Administration_Mode | 8 | VARCHAR | This field indicates if the assessment was administered in an in-person (face-to-face) or a remote learning environment. The options are: | InPerson or Remote |
Reading_Benchmark_Period | 3 | VARCHAR | Benchmark period during the term the assessment was administered. Summer School will be SSS. | BOY, MOY or EOY (SSS for summer school) |
Reading_Date_Completed | 10 | VARCHAR | This is the date on which the assessment is completed MM/DD/YYYY | 43962 |
Reading_Extended_Time | 2 | VARCHAR | The field will contain a \"Y\" if the student was given more than the allotted time to finish the assessment or any subtest of the assessment as defined by the vendor in a standard administration. | Y |
",
+ "languages": [
+ "eng"
+ ],
+ "filetype": "text/html"
+ }
+ }
+]
diff --git a/test_unstructured/testfiles/chunking/table_text_200.json b/test_unstructured/testfiles/chunking/table_text_200.json
new file mode 100644
index 0000000000..456d134358
--- /dev/null
+++ b/test_unstructured/testfiles/chunking/table_text_200.json
@@ -0,0 +1,32 @@
+[
+ {
+ "type": "Table",
+ "element_id": "ca96108263324e9d865a98f19cf7c940",
+ "text": "RFP Number: 2024-PMO-01 RFP Title: PMO Services RFP RFP Due Date and Time: Number of Pages: #189 05/30/2024 by 5:00pm Central Time",
+ "metadata": {
+ "category_depth": 1,
+ "page_number": 1,
+ "parent_id": "747587de72444235a68c768d544ff5f3",
+ "text_as_html": " RFP Number: 2024-PMO-01 | RFP Title: PMO Services RFP |
RFP Due Date and Time: | Number of Pages: #189 |
05/30/2024 by 5:00pm Central Time | |
",
+ "languages": [
+ "eng"
+ ],
+ "filetype": "text/html"
+ }
+ },
+ {
+ "type": "Text",
+ "element_id": "0163a58539934b3aaca402c9e961b0d6",
+ "text": "REQUEST FOR PROPOSALS",
+ "metadata": {
+ "category_depth": 1,
+ "page_number": 1,
+ "parent_id": "747587de72444235a68c768d544ff5f3",
+ "text_as_html": "REQUEST FOR PROPOSALS
",
+ "languages": [
+ "eng"
+ ],
+ "filetype": "text/html"
+ }
+ }
+]
diff --git a/test_unstructured/testfiles/chunking/title_table_200.json b/test_unstructured/testfiles/chunking/title_table_200.json
new file mode 100644
index 0000000000..3d0a2b15a2
--- /dev/null
+++ b/test_unstructured/testfiles/chunking/title_table_200.json
@@ -0,0 +1,32 @@
+[
+ {
+ "type": "Title",
+ "element_id": "0163a58539934b3aaca402c9e961b0d6",
+ "text": "REQUEST FOR PROPOSALS",
+ "metadata": {
+ "category_depth": 1,
+ "page_number": 1,
+ "parent_id": "747587de72444235a68c768d544ff5f3",
+ "text_as_html": "REQUEST FOR PROPOSALS
",
+ "languages": [
+ "eng"
+ ],
+ "filetype": "text/html"
+ }
+ },
+ {
+ "type": "Table",
+ "element_id": "ca96108263324e9d865a98f19cf7c940",
+ "text": "RFP Number: 2024-PMO-01 RFP Title: PMO Services RFP RFP Due Date and Time: Number of Pages: #189 05/30/2024 by 5:00pm Central Time",
+ "metadata": {
+ "category_depth": 1,
+ "page_number": 1,
+ "parent_id": "747587de72444235a68c768d544ff5f3",
+ "text_as_html": " RFP Number: 2024-PMO-01 | RFP Title: PMO Services RFP |
RFP Due Date and Time: | Number of Pages: #189 |
05/30/2024 by 5:00pm Central Time | |
",
+ "languages": [
+ "eng"
+ ],
+ "filetype": "text/html"
+ }
+ }
+]
diff --git a/test_unstructured/unit_utils.py b/test_unstructured/unit_utils.py
index 11b1106dfd..a3565dcd9d 100644
--- a/test_unstructured/unit_utils.py
+++ b/test_unstructured/unit_utils.py
@@ -101,6 +101,13 @@ def parse_optional_datetime(datetime_str: Optional[str]) -> Optional[dt.datetime
return dt.datetime.fromisoformat(datetime_str) if datetime_str else None
+def input_path(rel_path: str) -> str:
+ """Resolve the absolute-path to `rel_path` in the testfiles directory."""
+ testfiles_dir = pathlib.Path(__file__).parent / "testfiles"
+ file_path = testfiles_dir / rel_path
+ return str(file_path.resolve())
+
+
# ------------------------------------------------------------------------------------------------
# MOCKING FIXTURES
# ------------------------------------------------------------------------------------------------
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json
index 4f0950cc49..4f534582ea 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json
@@ -1,8 +1,8 @@
[
{
"type": "CompositeElement",
- "element_id": "36385872440a208d3521a8a885d5f873",
- "text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 \u2013 INTRODUCTION\n\nA.\tPURPOSE",
+ "element_id": "85002882dd396da0b1b82c925b002be5",
+ "text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 \u2013 INTRODUCTION\n\nA. PURPOSE",
"metadata": {
"data_source": {
"record_locator": {
@@ -55,8 +55,8 @@
},
{
"type": "CompositeElement",
- "element_id": "91d26c5ec7f727ece12679cf6b80f90d",
- "text": "le 11 of the United States Code. 28 U.S.C. \u00a7 586(b). The Handbook, issued as part of our duties under 28 U.S.C. \u00a7 586, establishes or clarifies the",
+ "element_id": "1abe685eb8dfed0f2266d6cf793d7e6b",
+ "text": "le 11 of the United States Code. 28 U.S.C. \u00a7 586(b). The Handbook, issued as part of our duties under 28 U.S.C. \u00a7 586, establishes or clarifies the",
"metadata": {
"data_source": {
"record_locator": {
@@ -103,8 +103,8 @@
},
{
"type": "CompositeElement",
- "element_id": "20447c8f42ed2b919bd0e5707e7899ae",
- "text": "s, creditors, other parties in interest, and the United States Trustee. The Handbook does not present a full and complete statement of the law; it",
+ "element_id": "40588c4c1489058c4fec885f4696ebcc",
+ "text": "s, creditors, other parties in interest, and the United States Trustee. The Handbook does not present a full and complete statement of the law; it",
"metadata": {
"data_source": {
"record_locator": {
@@ -127,8 +127,8 @@
},
{
"type": "CompositeElement",
- "element_id": "e34c56af21b43f4179f996ddea901bc4",
- "text": "ment of the law; it should not be used as a substitute for legal research and analysis. The standing trustee must be familiar with relevant",
+ "element_id": "9ddf0b109cf940de5f575acc9d9758c8",
+ "text": "ment of the law; it should not be used as a substitute for legal research and analysis. The standing trustee must be familiar with relevant provisions",
"metadata": {
"data_source": {
"record_locator": {
@@ -151,8 +151,8 @@
},
{
"type": "CompositeElement",
- "element_id": "55e660e5b0d0ec6ee5476621e556d6c8",
- "text": "iliar with relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11",
+ "element_id": "b7d1b42646393ca0f41af0e8ec48f9a9",
+ "text": "relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11 U.S.C. \u00a7 321,",
"metadata": {
"data_source": {
"record_locator": {
@@ -175,8 +175,8 @@
},
{
"type": "CompositeElement",
- "element_id": "a9335be161a6a7a080ff78e4e07cbadb",
- "text": ", and case law. 11 U.S.C. \u00a7 321, 28 U.S.C. \u00a7 586, 28 C.F.R. \u00a7 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in",
+ "element_id": "9ee33f4141eca1f98ca4299d0fdfba31",
+ "text": "w. 11 U.S.C. \u00a7 321, 28 U.S.C. \u00a7 586, 28 C.F.R. \u00a7 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in this Handbook but",
"metadata": {
"data_source": {
"record_locator": {
@@ -199,8 +199,8 @@
},
{
"type": "CompositeElement",
- "element_id": "5f2d61a46e9d16ce346eacc25321a250",
- "text": "Tips identified in this Handbook but these are not considered mandatory.",
+ "element_id": "6da3b5e2a833fa5ab6685f0fa46d2d6f",
+ "text": "n this Handbook but these are not considered mandatory.",
"metadata": {
"data_source": {
"record_locator": {
@@ -246,8 +246,8 @@
},
{
"type": "CompositeElement",
- "element_id": "2ff156994a8c58d8a5c91918a543ec28",
- "text": "tcy Code and Rules, local rules, and orders of the court. The standing trustee should notify the United States Trustee whenever the provision of the",
+ "element_id": "685600ed24c5b0e3b34e7d639d3b1959",
+ "text": "tcy Code and Rules, local rules, and orders of the court. The standing trustee should notify the United States Trustee whenever the provision of the",
"metadata": {
"data_source": {
"record_locator": {
@@ -270,8 +270,8 @@
},
{
"type": "CompositeElement",
- "element_id": "7c43851f864b7ccc35150c93d06abe80",
- "text": "he provision of the Handbook conflicts with the local rules or orders of the court. The standing trustee is accountable for all duties set forth in",
+ "element_id": "c998f5c10c9dac92e4d3624896a603c7",
+ "text": "he provision of the Handbook conflicts with the local rules or orders of the court. The standing trustee is accountable for all duties set forth in",
"metadata": {
"data_source": {
"record_locator": {
@@ -294,8 +294,8 @@
},
{
"type": "CompositeElement",
- "element_id": "7caf69b806daa033d686fae6100f4d7c",
- "text": "duties set forth in this Handbook, but need not personally perform any duty unless otherwise indicated. All statutory references in this Handbook",
+ "element_id": "d4b750e9af7167156f369b310a8cebb8",
+ "text": "duties set forth in this Handbook, but need not personally perform any duty unless otherwise indicated. All statutory references in this Handbook",
"metadata": {
"data_source": {
"record_locator": {
@@ -365,8 +365,8 @@
},
{
"type": "CompositeElement",
- "element_id": "66ff9b9385d511ca7e71f1e6852d3221",
- "text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
+ "element_id": "8f411358790d6ee5b0d24f919206d3fd",
+ "text": "B. ROLE OF THE UNITED STATES TRUSTEE",
"metadata": {
"data_source": {
"record_locator": {
@@ -388,8 +388,8 @@
},
{
"type": "CompositeElement",
- "element_id": "1876c502fcbb25fd7b978417aea8dded",
- "text": "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the responsibilities for daytoday administration of cases. Debtors, creditors,",
+ "element_id": "6044d58375609c8802cfae16cef5cee9",
+ "text": "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the responsibilities for daytoday administration of cases. Debtors, creditors, and",
"metadata": {
"data_source": {
"record_locator": {
@@ -411,8 +411,8 @@
},
{
"type": "CompositeElement",
- "element_id": "5f89702a93c3df34a62905e5dff5c54d",
- "text": "Debtors, creditors, and third parties with adverse interests to the trustee were concerned that the court, which previously appointed and supervised",
+ "element_id": "a4030396eaf54570462ed74f86e45bc8",
+ "text": "ors, creditors, and third parties with adverse interests to the trustee were concerned that the court, which previously appointed and supervised the",
"metadata": {
"data_source": {
"record_locator": {
@@ -435,8 +435,8 @@
},
{
"type": "CompositeElement",
- "element_id": "c916e417ed924c556baed9616c3f81ae",
- "text": "nted and supervised the trustee, would not impartially adjudicate their rights as adversaries of that trustee. To address these concerns, judicial and",
+ "element_id": "80e3b20fead224c85652bbdce327a28d",
+ "text": "and supervised the trustee, would not impartially adjudicate their rights as adversaries of that trustee. To address these concerns, judicial and",
"metadata": {
"data_source": {
"record_locator": {
@@ -483,8 +483,8 @@
},
{
"type": "CompositeElement",
- "element_id": "709927b67286cccaf8fb25d63667c277",
- "text": "Many administrative functions formerly performed by the court were placed within the Department of Justice through the creation of the Program. Among",
+ "element_id": "39a3f1465d06269d2544ded43dc3a7df",
+ "text": "Many administrative functions formerly performed by the court were placed within the Department of Justice through the creation of the Program. Among",
"metadata": {
"data_source": {
"record_locator": {
@@ -506,8 +506,8 @@
},
{
"type": "CompositeElement",
- "element_id": "509676fb8d4f77b5f270629dee7a2664",
- "text": "the Program. Among the administrative functions assigned to the United States Trustee were the appointment and supervision of chapter 13 trustees./",
+ "element_id": "2872e5d0bea6ec1523eb9ae2c1c64add",
+ "text": "the Program. Among the administrative functions assigned to the United States Trustee were the appointment and supervision of chapter 13 trustees./",
"metadata": {
"data_source": {
"record_locator": {
@@ -530,8 +530,8 @@
},
{
"type": "CompositeElement",
- "element_id": "7ced6d1ee6cc9478adfd8e2a613be42a",
- "text": "apter 13 trustees./ This Handbook is issued under the authority of the Program\u2019s enabling statutes. ",
+ "element_id": "24e1076110b431b248b43b1fdaae5282",
+ "text": "apter 13 trustees./ This Handbook is issued under the authority of the Program\u2019s enabling statutes.",
"metadata": {
"data_source": {
"record_locator": {
@@ -554,8 +554,8 @@
},
{
"type": "CompositeElement",
- "element_id": "2c82d3fa4252275d5309a640eb25cd68",
- "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
+ "element_id": "158a80e29cfe6aa83a4931d955a8fa4f",
+ "text": "C. STATUTORY DUTIES OF A STANDING TRUSTEE",
"metadata": {
"data_source": {
"record_locator": {
@@ -577,8 +577,8 @@
},
{
"type": "CompositeElement",
- "element_id": "a819e32a65d1f545cb404fe3f6273357",
- "text": "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The standing trustee is more than a mere disbursing agent. The",
+ "element_id": "e5fdcc6a007017354a9d708dc04fee02",
+ "text": "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The standing trustee is more than a mere disbursing agent. The standing",
"metadata": {
"data_source": {
"record_locator": {
@@ -600,8 +600,8 @@
},
{
"type": "CompositeElement",
- "element_id": "9e98089003e3b42ed7f1c263335dee3c",
- "text": "bursing agent. The standing trustee must be personally involved in the trustee operation. If the standing trustee is or becomes unable to perform",
+ "element_id": "0bf52e064da3ef4fb8b0a92d4b9fa694",
+ "text": "agent. The standing trustee must be personally involved in the trustee operation. If the standing trustee is or becomes unable to perform the duties",
"metadata": {
"data_source": {
"record_locator": {
@@ -624,8 +624,8 @@
},
{
"type": "CompositeElement",
- "element_id": "d476b15e5336342b1da22d100849b23c",
- "text": "s unable to perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28",
+ "element_id": "db297530e558410b89acd93c6b452b84",
+ "text": "perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28 U.S.C. \u00a7",
"metadata": {
"data_source": {
"record_locator": {
@@ -648,8 +648,8 @@
},
{
"type": "CompositeElement",
- "element_id": "8f8c9c0919f7502bd2fabad0b12ad664",
- "text": "States Trustee. 28 U.S.C. \u00a7 586(b), 28 C.F.R. \u00a7 58.4(b) referencing 28 C.F.R. \u00a7 58.3(b).",
+ "element_id": "201bfacc211f0eb640e2830b8c29ae41",
+ "text": "rustee. 28 U.S.C. \u00a7 586(b), 28 C.F.R. \u00a7 58.4(b) referencing 28 C.F.R. \u00a7 58.3(b).",
"metadata": {
"data_source": {
"record_locator": {
@@ -695,8 +695,8 @@
},
{
"type": "CompositeElement",
- "element_id": "9864d90bf9febdd104e7eac4c56689ba",
- "text": "are set forth in 11 U.S.C. \u00a7 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. \u00a7 704. These duties",
+ "element_id": "fd4c45036e8f17c27271f75944389724",
+ "text": "are set forth in 11 U.S.C. \u00a7 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. \u00a7 704. These duties",
"metadata": {
"data_source": {
"record_locator": {
@@ -719,8 +719,8 @@
},
{
"type": "CompositeElement",
- "element_id": "a91f963bcd1c092bffb844453aafa499",
- "text": "704. These duties include, but are not limited to, the following:",
+ "element_id": "a968d741409111b777fc123ef01f5407",
+ "text": "\u00a7 704. These duties include, but are not limited to, the following:",
"metadata": {
"data_source": {
"record_locator": {
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 309e1c0c38..d2557b8499 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.11-dev0" # pragma: no cover
+__version__ = "0.16.11-dev1" # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
index b91c3982ea..695393c55c 100644
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@@ -43,9 +43,6 @@
BoundaryPredicate: TypeAlias = Callable[[Element], bool]
"""Detects when element represents crossing a semantic boundary like section or page."""
-PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
-"""The kind of object produced by a pre-chunker."""
-
TextAndHtml: TypeAlias = tuple[str, str]
@@ -288,8 +285,13 @@ def _iter_pre_chunks(self) -> Iterator[PreChunk]:
pre_chunk_builder = PreChunkBuilder(self._opts)
for element in self._elements:
- # -- start new pre-chunk when necessary --
- if self._is_in_new_semantic_unit(element) or not pre_chunk_builder.will_fit(element):
+ # -- start new pre-chunk when necessary --
+ if (
+ # -- start new pre-chunk when necessary to uphold segregation guarantees --
+ self._is_in_new_semantic_unit(element)
+ # -- or when next element won't fit --
+ or not pre_chunk_builder.will_fit(element)
+ ):
yield from pre_chunk_builder.flush()
# -- add this element to the work-in-progress (WIP) pre-chunk --
@@ -320,8 +322,7 @@ class PreChunkBuilder:
the next element in the element stream.
`.flush()` is used to build a PreChunk object from the accumulated elements. This method
- returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
- used like so:
+ returns an iterator that generates zero-or-one `PreChunk` object and is used like so:
yield from builder.flush()
@@ -355,15 +356,13 @@ def flush(self) -> Iterator[PreChunk]:
boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
stream.
"""
- if not self._elements:
+ elements = self._elements
+
+ if not elements:
return
- pre_chunk = (
- TablePreChunk(self._elements[0], self._overlap_prefix, self._opts)
- if isinstance(self._elements[0], Table)
- # -- copy list, don't use original or it may change contents as builder proceeds --
- else TextPreChunk(list(self._elements), self._overlap_prefix, self._opts)
- )
+ # -- `PreChunk` copies the list, so later builder changes can't alter its contents --
+ pre_chunk = PreChunk(elements, self._overlap_prefix, self._opts)
# -- clear builder before yield so we're not sensitive to the timing of how/when this
# -- iterator is exhausted and can add elements for the next pre-chunk immediately.
self._reset_state(pre_chunk.overlap_tail)
@@ -384,12 +383,6 @@ def will_fit(self, element: Element) -> bool:
# -- an empty pre-chunk will accept any element (including an oversized-element) --
if len(self._elements) == 0:
return True
- # -- a `Table` will not fit in a non-empty pre-chunk --
- if isinstance(element, Table):
- return False
- # -- no element will fit in a pre-chunk that already contains a `Table` element --
- if isinstance(self._elements[0], Table):
- return False
# -- a pre-chunk that already exceeds the soft-max is considered "full" --
if self._text_length > self._opts.soft_max:
return False
@@ -429,45 +422,67 @@ def _text_length(self) -> int:
# ================================================================================================
-# PRE-CHUNK SUB-TYPES
+# PRE-CHUNK
# ================================================================================================
-class TablePreChunk:
- """A pre-chunk composed of a single Table element."""
+class PreChunk:
+ """Sequence of elements staged to form a single chunk.
- def __init__(self, table: Table, overlap_prefix: str, opts: ChunkingOptions) -> None:
- self._table = table
+ This object is purposely immutable.
+ """
+
+ def __init__(
+ self, elements: Iterable[Element], overlap_prefix: str, opts: ChunkingOptions
+ ) -> None:
+ self._elements = list(elements)
self._overlap_prefix = overlap_prefix
self._opts = opts
- def iter_chunks(self) -> Iterator[Table | TableChunk]:
- """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
- # -- A table with no non-whitespace text produces no chunks --
- if not self._table_text:
- return
+ def __eq__(self, other: Any) -> bool:
+ if not isinstance(other, PreChunk):
+ return False
+ return self._overlap_prefix == other._overlap_prefix and self._elements == other._elements
- # -- only text-split a table when it's longer than the chunking window --
- maxlen = self._opts.hard_max
- if len(self._text_with_overlap) <= maxlen and len(self._html) <= maxlen:
- # -- use the compactified html for .text_as_html, even though we're not splitting --
- metadata = self._metadata
- metadata.text_as_html = self._html or None
- # -- note the overlap-prefix is prepended to its text --
- yield Table(text=self._text_with_overlap, metadata=metadata)
- return
+ def can_combine(self, pre_chunk: PreChunk) -> bool:
+ """True when `pre_chunk` can be combined with this one without exceeding size limits."""
+ if len(self._text) >= self._opts.combine_text_under_n_chars:
+ return False
+ # -- avoid duplicating length computations by doing a trial-combine which is just as
+ # -- efficient and definitely more robust than hoping two different computations of combined
+ # -- length continue to get the same answer as the code evolves. Only possible because
+ # -- `.combine()` is non-mutating.
+ combined_len = len(self.combine(pre_chunk)._text)
- # -- When there's no HTML, split it like a normal element. Also fall back to text-only
- # -- chunks when `max_characters` is less than 50. `.text_as_html` metadata is impractical
- # -- for a chunking window that small because the 33 characterss of HTML overhead for each
- # -- chunk (`<table><tr><td></td></tr></table>`) would produce a very large number of
- # -- very small chunks.
- if not self._html or self._opts.hard_max < 50:
- yield from self._iter_text_only_table_chunks()
- return
+ return combined_len <= self._opts.hard_max
- # -- otherwise, form splits with "synchronized" text and html --
- yield from self._iter_text_and_html_table_chunks()
+ def combine(self, other_pre_chunk: PreChunk) -> PreChunk:
+ """Return new `PreChunk` that combines this and `other_pre_chunk`."""
+ # -- combined pre-chunk gets the overlap-prefix of the first pre-chunk. The second overlap
+ # -- is automatically incorporated at the end of the first chunk, where it originated.
+ return PreChunk(
+ self._elements + other_pre_chunk._elements,
+ overlap_prefix=self._overlap_prefix,
+ opts=self._opts,
+ )
+
+ def iter_chunks(self) -> Iterator[CompositeElement | Table | TableChunk]:
+ """Form this pre-chunk into one or more chunk elements maxlen or smaller.
+
+ When the total size of the pre-chunk will fit in the chunking window, a single chunk is
+ emitted. When this pre-chunk contains an oversized element (always isolated), it is split
+ into two or more chunks that each fit the chunking window.
+ """
+
+ # -- a one-table-only pre-chunk is handled specially, by `_TableChunker`, mainly because
+ # -- it may need to be split into multiple `TableChunk` elements and that operation is
+ # -- quite specialized.
+ if len(self._elements) == 1 and isinstance(self._elements[0], Table):
+ yield from _TableChunker.iter_chunks(
+ self._elements[0], self._overlap_prefix, self._opts
+ )
+ else:
+ yield from _Chunker.iter_chunks(self._elements, self._text, self._opts)
@lazyproperty
def overlap_tail(self) -> str:
@@ -478,178 +493,66 @@ def overlap_tail(self) -> str:
trailing whitespace.
"""
overlap = self._opts.inter_chunk_overlap
- return self._text_with_overlap[-overlap:].strip() if overlap else ""
-
- @lazyproperty
- def _html(self) -> str:
- """The compactified HTML for this table when it has text-as-HTML.
-
- The empty string when table-structure has not been captured, perhaps because
- `infer_table_structure` was set `False` in the partitioning call.
- """
- if not (html_table := self._html_table):
- return ""
-
- return html_table.html
-
- @lazyproperty
- def _html_table(self) -> HtmlTable | None:
- """The `lxml` HTML element object for this table.
-
- `None` when the `Table` element has no `.metadata.text_as_html`.
- """
- if (text_as_html := self._table.metadata.text_as_html) is None:
- return None
-
- text_as_html = text_as_html.strip()
- if not text_as_html: # pragma: no cover
- return None
-
- return HtmlTable.from_html_text(text_as_html)
-
- def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]:
- """Split table into chunks where HTML corresponds exactly to text.
-
- `.metadata.text_as_html` for each chunk is a parsable `<table>` HTML fragment.
- """
- if (html_table := self._html_table) is None: # pragma: no cover
- raise ValueError("this method is undefined for a table having no .text_as_html")
-
- is_continuation = False
-
- for text, html in _TableSplitter.iter_subtables(html_table, self._opts):
- metadata = self._metadata
- metadata.text_as_html = html
- # -- second and later chunks get `.metadata.is_continuation = True` --
- metadata.is_continuation = is_continuation or None
- is_continuation = True
-
- yield TableChunk(text=text, metadata=metadata)
-
- def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]:
- """Split oversized text-only table (no text-as-html) into chunks."""
- text_remainder = self._text_with_overlap
- split = self._opts.split
- is_continuation = False
-
- while text_remainder:
- # -- split off the next chunk-worth of characters into a TableChunk --
- chunk_text, text_remainder = split(text_remainder)
- metadata = self._metadata
- # -- second and later chunks get `.metadata.is_continuation = True` --
- metadata.is_continuation = is_continuation or None
- is_continuation = True
-
- yield TableChunk(text=chunk_text, metadata=metadata)
-
- @property
- def _metadata(self) -> ElementMetadata:
- """The base `.metadata` value for chunks formed from this pre-chunk.
+ return self._text[-overlap:].strip() if overlap else ""
- The term "base" here means that other metadata fields will be added, depending on the
- chunk. In particular, `.metadata.text_as_html` will be different for each text-split chunk
- and `.metadata.is_continuation` must be added for second-and-later text-split chunks.
+ def _iter_text_segments(self) -> Iterator[str]:
+ """Generate overlap text and each element text segment in order.
- Note this is a fresh copy of the metadata on each call since it will need to be mutated
- differently for each chunk formed from this pre-chunk.
+ Empty text segments are not included.
"""
- CS = ConsolidationStrategy
- metadata = copy.deepcopy(self._table.metadata)
-
- # -- drop metadata fields not appropriate for chunks, in particular
- # -- parent_id's will not reliably point to an existing element
- drop_field_names = [
- field_name
- for field_name, strategy in CS.field_consolidation_strategies().items()
- if strategy is CS.DROP
- ]
- for field_name in drop_field_names:
- setattr(metadata, field_name, None)
-
- if self._opts.include_orig_elements:
- metadata.orig_elements = self._orig_elements
- return metadata
+ if self._overlap_prefix:
+ yield self._overlap_prefix
+ for e in self._elements:
+ text = " ".join(e.text.strip().split())
+ if not text:
+ continue
+ yield text
@lazyproperty
- def _orig_elements(self) -> list[Element]:
- """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.
+ def _text(self) -> str:
+ """The concatenated text of all elements in this pre-chunk, including any overlap.
- Note this is not just the `Table` element, it must be adjusted to strip out any
- `.metadata.orig_elements` value it may have when it is itself a chunk and not a direct
- product of partitioning.
+ Whitespace is normalized to a single space. The text of each element is separated from
+ that of the next by a blank line ("\n\n").
"""
- # -- make a copy because we're going to mutate the `Table` element and it doesn't belong to
- # -- us (the user may have downstream purposes for it).
- orig_table = copy.deepcopy(self._table)
- # -- prevent recursive .orig_elements when `Table` element is a chunk --
- orig_table.metadata.orig_elements = None
- return [orig_table]
-
- @lazyproperty
- def _table_text(self) -> str:
- """The text in this table, not including any overlap-prefix or extra whitespace."""
- return " ".join(self._table.text.split())
+ return self._opts.text_separator.join(self._iter_text_segments())
- @lazyproperty
- def _text_with_overlap(self) -> str:
- """The text for this chunk, including the overlap-prefix when present."""
- overlap_prefix = self._overlap_prefix
- table_text = self._table.text.strip()
- # -- use row-separator between overlap and table-text --
- return overlap_prefix + "\n" + table_text if overlap_prefix else table_text
+# ================================================================================================
+# CHUNKING HELPER/SPLITTERS
+# ================================================================================================
-class TextPreChunk:
- """A sequence of elements that belong to the same semantic unit within a document.
- The name "section" derives from the idea of a document-section, a heading followed by the
- paragraphs "under" that heading. That structure is not found in all documents and actual section
- content can vary, but that's the concept.
+class _Chunker:
+ """Forms chunks from a pre-chunk other than one containing only a `Table`.
- This object is purposely immutable.
+ Produces zero-or-more `CompositeElement` objects.
"""
- def __init__(
- self, elements: Iterable[Element], overlap_prefix: str, opts: ChunkingOptions
- ) -> None:
+ def __init__(self, elements: Iterable[Element], text: str, opts: ChunkingOptions) -> None:
self._elements = list(elements)
- self._overlap_prefix = overlap_prefix
+ self._text = text
self._opts = opts
- def __eq__(self, other: Any) -> bool:
- if not isinstance(other, TextPreChunk):
- return False
- return self._overlap_prefix == other._overlap_prefix and self._elements == other._elements
-
- def can_combine(self, pre_chunk: TextPreChunk) -> bool:
- """True when `pre_chunk` can be combined with this one without exceeding size limits."""
- if len(self._text) >= self._opts.combine_text_under_n_chars:
- return False
- # -- avoid duplicating length computations by doing a trial-combine which is just as
- # -- efficient and definitely more robust than hoping two different computations of combined
- # -- length continue to get the same answer as the code evolves. Only possible because
- # -- `.combine()` is non-mutating.
- combined_len = len(self.combine(pre_chunk)._text)
-
- return combined_len <= self._opts.hard_max
+ @classmethod
+ def iter_chunks(
+ cls, elements: Iterable[Element], text: str, opts: ChunkingOptions
+ ) -> Iterator[CompositeElement]:
+ """Form zero or more chunks from `elements`.
- def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
- """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
- # -- combined pre-chunk gets the overlap-prefix of the first pre-chunk. The second overlap
- # -- is automatically incorporated at the end of the first chunk, where it originated.
- return TextPreChunk(
- self._elements + other_pre_chunk._elements,
- overlap_prefix=self._overlap_prefix,
- opts=self._opts,
- )
+ One `CompositeElement` is produced when all `elements` will fit. Otherwise there is a
+ single `Text`-subtype element and chunks are formed by splitting.
+ """
+ return cls(elements, text, opts)._iter_chunks()
- def iter_chunks(self) -> Iterator[CompositeElement]:
- """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
+ def _iter_chunks(self) -> Iterator[CompositeElement]:
+ """Form zero or more chunks from `elements`."""
# -- a pre-chunk containing no text (maybe only a PageBreak element for example) does not
# -- generate any chunks.
if not self._text:
return
+ # -- `split()` is the text-splitting function used to split an oversized element --
split = self._opts.split
# -- emit first chunk --
@@ -662,17 +565,6 @@ def iter_chunks(self) -> Iterator[CompositeElement]:
s, remainder = split(remainder)
yield CompositeElement(text=s, metadata=self._continuation_metadata)
- @lazyproperty
- def overlap_tail(self) -> str:
- """The portion of this chunk's text to be repeated as a prefix in the next chunk.
-
- This value is the empty-string ("") when either the `.overlap` length option is `0` or
- `.overlap_all` is `False`. When there is a text value, it is stripped of both leading and
- trailing whitespace.
- """
- overlap = self._opts.inter_chunk_overlap
- return self._text[-overlap:].strip() if overlap else ""
-
@lazyproperty
def _all_metadata_values(self) -> dict[str, list[Any]]:
"""Collection of all populated metadata values across elements.
@@ -738,18 +630,6 @@ def _continuation_metadata(self) -> ElementMetadata:
continuation_metadata.is_continuation = True
return continuation_metadata
- def _iter_text_segments(self) -> Iterator[str]:
- """Generate overlap text and each element text segment in order.
-
- Empty text segments are not included.
- """
- if self._overlap_prefix:
- yield self._overlap_prefix
- for e in self._elements:
- if not e.text:
- continue
- yield e.text
-
@lazyproperty
def _meta_kwargs(self) -> dict[str, Any]:
"""The consolidated metadata values as a dict suitable for constructing ElementMetadata.
@@ -806,22 +686,183 @@ def iter_orig_elements():
return list(iter_orig_elements())
+
+class _TableChunker:
+ """Responsible for forming chunks, especially splits, from a single-table pre-chunk.
+
+ Table splitting is specialized because we recursively split on an even row, cell, text
+ boundary. This object encapsulates those details.
+ """
+
+ def __init__(self, table: Table, overlap_prefix: str, opts: ChunkingOptions) -> None:
+ self._table = table
+ self._overlap_prefix = overlap_prefix
+ self._opts = opts
+
+ @classmethod
+ def iter_chunks(
+ cls, table: Table, overlap_prefix: str, opts: ChunkingOptions
+ ) -> Iterator[Table | TableChunk]:
+ """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
+ return cls(table, overlap_prefix, opts)._iter_chunks()
+
+ def _iter_chunks(self) -> Iterator[Table | TableChunk]:
+ """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
+ # -- A table with no non-whitespace text produces no chunks --
+ if not self._table_text:
+ return
+
+ # -- only text-split a table when it's longer than the chunking window --
+ maxlen = self._opts.hard_max
+ if len(self._text_with_overlap) <= maxlen and len(self._html) <= maxlen:
+ # -- use the compactified html for .text_as_html, even though we're not splitting --
+ metadata = self._metadata
+ metadata.text_as_html = self._html or None
+ # -- note the overlap-prefix is prepended to its text --
+ yield Table(text=self._text_with_overlap, metadata=metadata)
+ return
+
+ # -- When there's no HTML, split it like a normal element. Also fall back to text-only
+ # -- chunks when `max_characters` is less than 50. `.text_as_html` metadata is impractical
+ # -- for a chunking window that small because the 33 characters of HTML overhead for each
+ # -- chunk (`<table><tr><td></td></tr></table>`) would produce a very large number of
+ # -- very small chunks.
+ if not self._html or self._opts.hard_max < 50:
+ yield from self._iter_text_only_table_chunks()
+ return
+
+ # -- otherwise, form splits with "synchronized" text and html --
+ yield from self._iter_text_and_html_table_chunks()
+
@lazyproperty
- def _text(self) -> str:
- """The concatenated text of all elements in this pre-chunk.
+ def _html(self) -> str:
+ """The compactified HTML for this table when it has text-as-HTML.
+
+ The empty string when table-structure has not been captured, perhaps because
+ `infer_table_structure` was set `False` in the partitioning call.
+ """
+ if not (html_table := self._html_table):
+ return ""
+
+ return html_table.html
+
+ @lazyproperty
+ def _html_table(self) -> HtmlTable | None:
+ """The `lxml` HTML element object for this table.
- Each element-text is separated from the next by a blank line ("\n\n").
+ `None` when the `Table` element has no `.metadata.text_as_html`.
"""
- text_separator = self._opts.text_separator
- return text_separator.join(self._iter_text_segments())
+ if (text_as_html := self._table.metadata.text_as_html) is None:
+ return None
+
+ text_as_html = text_as_html.strip()
+ if not text_as_html: # pragma: no cover
+ return None
+
+ return HtmlTable.from_html_text(text_as_html)
+
+ def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]:
+ """Split table into chunks where HTML corresponds exactly to text.
+
+ `.metadata.text_as_html` for each chunk is a parsable `<table>` HTML fragment.
+ """
+ if (html_table := self._html_table) is None: # pragma: no cover
+ raise ValueError("this method is undefined for a table having no .text_as_html")
+
+ is_continuation = False
+
+ for text, html in _HtmlTableSplitter.iter_subtables(html_table, self._opts):
+ metadata = self._metadata
+ metadata.text_as_html = html
+ # -- second and later chunks get `.metadata.is_continuation = True` --
+ metadata.is_continuation = is_continuation or None
+ is_continuation = True
+
+ yield TableChunk(text=text, metadata=metadata)
+
+ def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]:
+ """Split oversized text-only table (no text-as-html) into chunks.
+
+ `.metadata.text_as_html` is optional, not included when `infer_table_structure` is
+ `False`.
+ """
+ text_remainder = self._text_with_overlap
+ split = self._opts.split
+ is_continuation = False
+
+ while text_remainder:
+ # -- split off the next chunk-worth of characters into a TableChunk --
+ chunk_text, text_remainder = split(text_remainder)
+ metadata = self._metadata
+ # -- second and later chunks get `.metadata.is_continuation = True` --
+ metadata.is_continuation = is_continuation or None
+ is_continuation = True
+
+ yield TableChunk(text=chunk_text, metadata=metadata)
+
+ @property
+ def _metadata(self) -> ElementMetadata:
+ """The base `.metadata` value for chunks formed from this pre-chunk.
+
+ The term "base" here means that other metadata fields will be added, depending on the
+ chunk. In particular, `.metadata.text_as_html` will be different for each text-split chunk
+ and `.metadata.is_continuation` must be added for second-and-later text-split chunks.
+
+ Note this is a fresh copy of the metadata on each call since it will need to be mutated
+ differently for each chunk formed from this pre-chunk.
+ """
+ CS = ConsolidationStrategy
+ metadata = copy.deepcopy(self._table.metadata)
+
+ # -- drop metadata fields not appropriate for chunks, in particular
+ # -- parent_id's will not reliably point to an existing element
+ drop_field_names = [
+ field_name
+ for field_name, strategy in CS.field_consolidation_strategies().items()
+ if strategy is CS.DROP
+ ]
+ for field_name in drop_field_names:
+ setattr(metadata, field_name, None)
+
+ if self._opts.include_orig_elements:
+ metadata.orig_elements = self._orig_elements
+ return metadata
+
+ @lazyproperty
+ def _orig_elements(self) -> list[Element]:
+ """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.
+
+ Note this is not just the `Table` element, it must be adjusted to strip out any
+ `.metadata.orig_elements` value it may have when it is itself a chunk and not a direct
+ product of partitioning.
+ """
+ # -- make a copy because we're going to mutate the `Table` element and it doesn't belong to
+ # -- us (the user may have downstream purposes for it).
+ orig_table = copy.deepcopy(self._table)
+ # -- prevent recursive .orig_elements when `Table` element is a chunk --
+ orig_table.metadata.orig_elements = None
+ return [orig_table]
+
+ @lazyproperty
+ def _table_text(self) -> str:
+ """The text in this table, not including any overlap-prefix or extra whitespace."""
+ return " ".join(self._table.text.split())
+
+ @lazyproperty
+ def _text_with_overlap(self) -> str:
+ """The text for this chunk, including the overlap-prefix when present."""
+ overlap_prefix = self._overlap_prefix
+ table_text = self._table.text.strip()
+ # -- use row-separator between overlap and table-text --
+ return overlap_prefix + "\n" + table_text if overlap_prefix else table_text
# ================================================================================================
-# PRE-CHUNK SPLITTERS
+# HTML SPLITTERS
# ================================================================================================
-class _TableSplitter:
+class _HtmlTableSplitter:
"""Produces (text, html) pairs for a `<table>` HtmlElement.
Each chunk contains a whole number of rows whenever possible. An oversized row is split on an
@@ -1040,7 +1081,7 @@ def flush(self) -> Iterator[TextAndHtml]:
def will_fit(self, cell: HtmlCell) -> bool:
"""True when `cell` will fit within remaining space left by accummulated cells."""
- return self._remaining_space >= len(cell.html)
+ return self._remaining_space >= len(cell.text)
def _iter_cell_texts(self) -> Iterator[str]:
"""Generate contents of each accumulated cell as a separate string.
@@ -1054,10 +1095,11 @@ def _iter_cell_texts(self) -> Iterator[str]:
@property
def _remaining_space(self) -> int:
- """Number of characters remaining when accumulated cells are formed into HTML."""
-# -- 24 is `len("<table><tr></tr></table>")`, the overhead in addition to `<td>`
- # -- HTML fragments
- return self._maxlen - 24 - sum(len(c.html) for c in self._cells)
+ """Number of characters remaining when text of accumulated cells is joined."""
+ # -- separators are one space (" ") at the end of each cell's text, including last one to
+ # -- account for space before prospective next cell.
+ separators_len = len(self._cells)
+ return self._maxlen - separators_len - sum(len(c.text) for c in self._cells)
class _RowAccumulator:
@@ -1087,7 +1129,7 @@ def flush(self) -> Iterator[TextAndHtml]:
def will_fit(self, row: HtmlRow) -> bool:
"""True when `row` will fit within remaining space left by accummulated rows."""
- return self._remaining_space >= len(row.html)
+ return self._remaining_space >= row.text_len
def _iter_cell_texts(self) -> Iterator[str]:
"""Generate contents of each row cell as a separate string.
@@ -1100,8 +1142,10 @@ def _iter_cell_texts(self) -> Iterator[str]:
@property
def _remaining_space(self) -> int:
"""Number of characters remaining when accumulated rows are formed into HTML."""
-# -- 15 is `len("<table></table>")`, the overhead in addition to `<tr>` HTML fragments --
- return self._maxlen - 15 - sum(len(r.html) for r in self._rows)
+ # -- separators are one space (" ") at the end of each row's text, including last one to
+ # -- account for space before prospective next row.
+ separators_len = len(self._rows)
+ return self._maxlen - separators_len - sum(r.text_len for r in self._rows)
# ================================================================================================
@@ -1117,16 +1161,10 @@ def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
self._opts = opts
def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
- """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
- accum = TextPreChunkAccumulator(self._opts)
+ """Generate pre-chunk objects, combining `PreChunk` objects when they'll fit in window."""
+ accum = _PreChunkAccumulator(self._opts)
for pre_chunk in self._pre_chunks:
- # -- a table pre-chunk is never combined --
- if isinstance(pre_chunk, TablePreChunk):
- yield from accum.flush()
- yield pre_chunk
- continue
-
# -- finish accumulating pre-chunk when it's full --
if not accum.will_fit(pre_chunk):
yield from accum.flush()
@@ -1136,39 +1174,37 @@ def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
yield from accum.flush()
-class TextPreChunkAccumulator:
- """Accumulates, measures, and combines text pre-chunks.
+class _PreChunkAccumulator:
+ """Accumulates, measures, and combines pre-chunks.
Used for combining pre-chunks for chunking strategies like "by-title" that can potentially
- produce undersized chunks and offer the `combine_text_under_n_chars` option. Note that only
- sequential `TextPreChunk` objects can be combined. A `TablePreChunk` is never combined with
- another pre-chunk.
+ produce undersized chunks and offer the `combine_text_under_n_chars` option.
Provides `.add_pre_chunk()` allowing a pre-chunk to be added to the chunk and provides
monitoring properties `.remaining_space` and `.text_length` suitable for deciding whether to add
another pre-chunk.
- `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
- This method returns an interator that generates zero-or-one `TextPreChunk` objects and is used
+ `.flush()` is used to combine the accumulated pre-chunks into a single `PreChunk` object.
+ This method returns an iterator that generates zero-or-one `PreChunk` objects and is used
like so:
yield from accum.flush()
- If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the builder
- clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
+ If no pre-chunks have been accumulated, no `PreChunk` is generated. Flushing the builder
+ clears the pre-chunks it contains so it is ready to accept the next pre-chunk.
"""
def __init__(self, opts: ChunkingOptions) -> None:
self._opts = opts
- self._pre_chunk: TextPreChunk | None = None
+ self._pre_chunk: PreChunk | None = None
- def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
+ def add_pre_chunk(self, pre_chunk: PreChunk) -> None:
"""Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
self._pre_chunk = (
pre_chunk if self._pre_chunk is None else self._pre_chunk.combine(pre_chunk)
)
- def flush(self) -> Iterator[TextPreChunk]:
+ def flush(self) -> Iterator[PreChunk]:
"""Generate accumulated pre-chunk as a single combined pre-chunk.
Does not generate a pre-chunk when none has been accumulated.
@@ -1181,7 +1217,7 @@ def flush(self) -> Iterator[TextPreChunk]:
# -- and reset the accumulator (to empty) --
self._pre_chunk = None
- def will_fit(self, pre_chunk: TextPreChunk) -> bool:
+ def will_fit(self, pre_chunk: PreChunk) -> bool:
"""True when there is room for `pre_chunk` in accumulator.
An empty accumulator always has room. Otherwise there is only room when `pre_chunk` can be
@@ -1206,7 +1242,7 @@ def will_fit(self, pre_chunk: TextPreChunk) -> bool:
# predicate.
#
# These can be mixed and matched to produce different chunking behaviors like "by_title" or left
-# out altogether to produce "by_element" behavior.
+# out altogether to produce "basic-chunking" behavior.
#
# The effective lifetime of the function that produce a predicate (rather than directly being one)
# is limited to a single element-stream because these retain state (e.g. current page number) to
diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py
index 5ddcf55928..a441e5a57b 100644
--- a/unstructured/common/html_table.py
+++ b/unstructured/common/html_table.py
@@ -136,11 +136,15 @@ def iter_cell_texts(self) -> Iterator[str]:
for td in self._tr:
if (text := td.text) is None:
continue
- text = text.strip()
if not text:
continue
yield text
+ @lazyproperty
+ def text_len(self) -> int:
+ """Length of the normalized text, as it would appear in `element.text`."""
+ return len(" ".join(self.iter_cell_texts()))
+
class HtmlCell:
"""A `<td>` element."""
@@ -158,4 +162,4 @@ def text(self) -> str:
"""Text inside `<td>` element, empty string when no text."""
if (text := self._td.text) is None:
return ""
- return text.strip()
+ return " ".join(text.strip().split())
|