diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bd62d0178..323f6484e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,10 @@ -## 0.16.11-dev0 +## 0.16.11-dev1 ### Enhancements - **Enhance quote standardization tests** with additional Unicode scenarios +- **Relax table segregation rule in chunking.** Previously a `Table` element was always segregated into its own pre-chunk such that the `Table` appeared alone in a chunk or was split into multiple `TableChunk` elements, but never combined with `Text`-subtype elements. Allow table elements to be combined with other elements in the same chunk when space allows. +- **Compute chunk length based solely on `element.text`.** Previously `.metadata.text_as_html` was also considered and since it is always longer than the text (due to HTML tag overhead) it was the effective length criterion. Remove text-as-html from the length calculation such that text-length is the sole criterion for sizing a chunk. ### Features diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index eeb5f3740f..f63e738a7c 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -11,15 +11,16 @@ from unstructured.chunking.base import ( ChunkingOptions, + PreChunk, PreChunkBuilder, PreChunkCombiner, PreChunker, - TablePreChunk, - TextPreChunk, - TextPreChunkAccumulator, _CellAccumulator, + _Chunker, + _HtmlTableSplitter, + _PreChunkAccumulator, _RowAccumulator, - _TableSplitter, + _TableChunker, _TextSplitter, is_on_next_page, is_title, @@ -181,27 +182,27 @@ def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self pre_chunk_iter = PreChunker.iter_pre_chunks(elements, opts=opts) pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet, consectetur adipiscing elit."), ] # -- pre_chunk = 
next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Text("Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.") ] # -- pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Title("Ut Enim"), Text("Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi."), ] # -- pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [Text("Ut aliquip ex ea commodo consequat."), CheckBox()] # -- with pytest.raises(StopIteration): @@ -233,21 +234,18 @@ def it_accumulates_elements_added_to_it(self): assert builder._text_length == 112 assert builder._remaining_space == 36 - @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)]) - def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element): + def it_will_fit_an_oversized_element_when_empty(self): builder = PreChunkBuilder(opts=ChunkingOptions()) - assert builder.will_fit(element) + assert builder.will_fit(Text("abcd " * 200)) @pytest.mark.parametrize( ("existing_element", "next_element"), [ - (Text("abcd"), Table("Fruits\nMango")), (Text("abcd"), Text("abcd " * 200)), - (Table("Heading\nCell text"), Table("Fruits\nMango")), (Table("Heading\nCell text"), Text("abcd " * 200)), ], ) - def but_not_when_it_already_contains_an_element_of_any_kind( + def but_not_when_it_already_contains_an_element( self, existing_element: Element, next_element: Element ): builder = PreChunkBuilder(opts=ChunkingOptions()) @@ -256,11 +254,13 @@ def but_not_when_it_already_contains_an_element_of_any_kind( assert not builder.will_fit(next_element) @pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")]) - def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: 
Element): + def it_will_accept_another_element_that_fits_when_it_already_contains_a_table( + self, element: Element + ): builder = PreChunkBuilder(opts=ChunkingOptions()) builder.add_element(Table("Heading\nCell text")) - assert not builder.will_fit(element) + assert builder.will_fit(element) def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self): builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50)) @@ -290,7 +290,7 @@ def but_it_will_fit_an_element_that_fits(self): # -- 55 + 2 (separator) + 43 == 100 -- assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars - def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): + def it_generates_a_PreChunk_when_flushed_and_resets_itself_to_empty(self): builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150)) builder.add_element(Title("Introduction")) builder.add_element( @@ -302,7 +302,13 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): pre_chunk = next(builder.flush()) - assert isinstance(pre_chunk, TextPreChunk) + # -- pre-chunk builder was reset before the yield, such that the iterator does not need to + # -- be exhausted before clearing out the old elements and a new pre-chunk can be + # -- accumulated immediately (first `next()` call is required however, to advance to the + # -- yield statement). 
+ assert builder._text_length == 0 + assert builder._remaining_space == 150 + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Title("Introduction"), Text( @@ -310,24 +316,6 @@ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): "lectus porta volutpat.", ), ] - assert builder._text_length == 0 - assert builder._remaining_space == 150 - - def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self): - builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150)) - builder.add_element(Table("Heading\nCell text")) - - pre_chunk = next(builder.flush()) - - # -- pre-chunk builder was reset before the yield, such that the iterator does not need to - # -- be exhausted before clearing out the old elements and a new pre-chunk can be - # -- accumulated immediately (first `next()` call is required however, to advance to the - # -- yield statement). - assert builder._text_length == 0 - assert builder._remaining_space == 150 - # -- pre-chunk is a `TablePreChunk` -- - assert isinstance(pre_chunk, TablePreChunk) - assert pre_chunk._table == Table("Heading\nCell text") def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self): builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150)) @@ -345,21 +333,19 @@ def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self): builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")) pre_chunk = list(builder.flush())[0] - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._text == "Lorem ipsum dolor sit amet consectetur adipiscing elit." builder.add_element(Table("In rhoncus ipsum sed lectus porta volutpat.")) pre_chunk = list(builder.flush())[0] - assert isinstance(pre_chunk, TablePreChunk) - assert pre_chunk._text_with_overlap == ( - "dipiscing elit.\nIn rhoncus ipsum sed lectus porta volutpat." 
- ) + assert isinstance(pre_chunk, PreChunk) + assert pre_chunk._text == "dipiscing elit.\n\nIn rhoncus ipsum sed lectus porta volutpat." builder.add_element(Text("Donec semper facilisis metus finibus.")) pre_chunk = list(builder.flush())[0] - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus." def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): @@ -381,252 +367,8 @@ def it_considers_separator_length_when_computing_text_length_and_remaining_space # ================================================================================================ -class DescribeTablePreChunk: - """Unit-test suite for `unstructured.chunking.base.TablePreChunk` objects.""" - - def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self): - html_table = ( - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "
Header Col 1 Header Col 2
Lorem ipsum adipiscing
" - ) - text_table = "Header Col 1 Header Col 2\nLorem ipsum adipiscing" - pre_chunk = TablePreChunk( - Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), - overlap_prefix="ctus porta volutpat.", - opts=ChunkingOptions(max_characters=175), - ) - - chunk_iter = pre_chunk.iter_chunks() - - chunk = next(chunk_iter) - assert isinstance(chunk, Table) - assert chunk.text == ( - "ctus porta volutpat.\nHeader Col 1 Header Col 2\nLorem ipsum adipiscing" - ) - assert chunk.metadata.text_as_html == ( - "" - "" - "" - "
Header Col 1Header Col 2
Lorem ipsumadipiscing
" - ) - with pytest.raises(StopIteration): - next(chunk_iter) - - def but_not_when_the_table_is_is_empty_or_contains_only_whitespace(self): - html_table = "
\t \n
" - pre_chunk = TablePreChunk( - Table(" \t \n ", metadata=ElementMetadata(text_as_html=html_table)), - overlap_prefix="volutpat.", - opts=ChunkingOptions(max_characters=175), - ) - - chunk_iter = pre_chunk.iter_chunks() - - with pytest.raises(StopIteration): - next(chunk_iter) - - def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self): - table = Table("foo bar", metadata=ElementMetadata(text_as_html="foo bar
")) - opts = ChunkingOptions(include_orig_elements=True) - pre_chunk = TablePreChunk(table, "", opts) - - chunk_iter = pre_chunk.iter_chunks() - - chunk = next(chunk_iter) - assert isinstance(chunk, Table) - assert chunk.metadata.orig_elements == [table] - assert chunk.metadata.text_as_html == "foo bar
" - # -- - with pytest.raises(StopIteration): - next(chunk_iter) - - def but_not_when_instructed_not_to(self): - pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False)) - - chunk = next(pre_chunk.iter_chunks()) - - assert isinstance(chunk, Table) - assert chunk.metadata.orig_elements is None - - def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self): - html_table = """\ - - - - - - - - - -
Header Col 1 Header Col 2
Lorem ipsum A Link example
Consectetur adipiscing elit
Nunc aliquam id enim nec molestie
- """ - text_table = ( - "Header Col 1 Header Col 2\n" - "Lorem ipsum dolor sit amet\n" - "Consectetur adipiscing elit\n" - "Nunc aliquam id enim nec molestie\n" - "Vivamus quis nunc ipsum donec ac fermentum" - ) - pre_chunk = TablePreChunk( - Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), - overlap_prefix="", - opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")), - ) - - chunk_iter = pre_chunk.iter_chunks() - - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert chunk.text == "Header Col 1 Header Col 2" - assert chunk.metadata.text_as_html == ( - "
Header Col 1Header Col 2
" - ) - assert chunk.metadata.is_continuation is None - # -- - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert chunk.text == "Lorem ipsum A Link example" - assert chunk.metadata.text_as_html == ( - "
Lorem ipsumA Link example
" - ) - assert chunk.metadata.is_continuation - # -- - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert chunk.text == "Consectetur adipiscing elit" - assert chunk.metadata.text_as_html == ( - "
Consecteturadipiscing elit
" - ) - assert chunk.metadata.is_continuation - # -- - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert chunk.text == "Nunc aliquam id enim nec molestie" - assert chunk.metadata.text_as_html == ( - "
Nunc aliquamid enim nec molestie
" - ) - assert chunk.metadata.is_continuation - # -- - with pytest.raises(StopIteration): - next(chunk_iter) - - def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self): - """Even though text and html are split, the orig_elements metadata is not.""" - table = Table( - "Header Col 1 Header Col 2\nLorem ipsum dolor sit amet", - metadata=ElementMetadata(text_as_html=""), - ) - opts = ChunkingOptions(max_characters=30, include_orig_elements=True) - pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts) - - chunk_iter = pre_chunk.iter_chunks() - - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert chunk.text == "Header Col 1 Header Col 2" - assert chunk.metadata.orig_elements == [table] - assert not chunk.metadata.is_continuation - # -- - chunk = next(chunk_iter) - assert isinstance(chunk, TableChunk) - assert chunk.text == "Lorem ipsum dolor sit amet" - assert chunk.metadata.orig_elements == [table] - assert chunk.metadata.is_continuation - - @pytest.mark.parametrize( - ("text", "expected_value"), - [ - # -- normally it splits exactly on overlap size |------- 20 -------| - ("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."), - # -- but it strips leading whitespace when the tail includes it -- - ("In rhoncus ipsum sed lectus porta volutpat.", "porta volutpat."), - ], - ) - def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap( - self, text: str, expected_value: str - ): - pre_chunk = TablePreChunk( - Table(text), overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True) - ) - assert pre_chunk.overlap_tail == expected_value - - @pytest.mark.parametrize( - ("text", "overlap_prefix", "expected_value"), - [ - ( - "In rhoncus ipsum sed lectus porta volutpat.", - "", - "In rhoncus ipsum sed lectus porta volutpat.", - ), - ( - "In rhoncus ipsum sed lectus porta volutpat.", - "ctus porta volutpat.", - "ctus porta volutpat.\nIn rhoncus ipsum sed lectus porta volutpat.", - 
), - ], - ) - def it_includes_its_overlap_prefix_in_its_text_when_present( - self, text: str, overlap_prefix: str, expected_value: str - ): - pre_chunk = TablePreChunk( - Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions() - ) - assert pre_chunk._text_with_overlap == expected_value - - def it_computes_metadata_for_each_chunk_to_help(self): - table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="
")) - pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions()) - - metadata = pre_chunk._metadata - - assert metadata.text_as_html == "
" - # -- opts.include_orig_elements is True by default -- - assert metadata.orig_elements == [table] - # -- it produces a new instance each time it is called so changing one chunk's metadata does - # -- not change that of any other chunk. - assert pre_chunk._metadata is not metadata - - def but_it_omits_orig_elements_from_metadata_when_so_instructed(self): - pre_chunk = TablePreChunk( - Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="
")), - overlap_prefix="", - opts=ChunkingOptions(include_orig_elements=False), - ) - - assert pre_chunk._metadata.orig_elements is None - - def it_computes_the_original_elements_list_to_help(self): - table = Table( - "Lorem ipsum", - metadata=ElementMetadata(text_as_html="
", orig_elements=[Table("Lorem Ipsum")]), - ) - pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions()) - - orig_elements = pre_chunk._orig_elements - - # -- a TablePreChunk always has exactly one original (Table) element -- - assert len(orig_elements) == 1 - orig_element = orig_elements[0] - # -- each item in orig_elements is a copy of the original element so we can mutate it - # -- without changing user's data. - assert orig_element == table - assert orig_element is not table - # -- it strips any .metadata.orig_elements from each element to prevent a recursive data - # -- structure - assert orig_element.metadata.orig_elements is None - # -- computation is only on first call, all chunks get exactly the same orig-elements -- - assert pre_chunk._orig_elements is orig_elements - - -class DescribeTextPreChunk: - """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects.""" +class DescribePreChunk: + """Unit-test suite for `unstructured.chunking.base.PreChunk` objects.""" @pytest.mark.parametrize( ("overlap_pfx", "texts", "other_overlap_pfx", "other_texts", "expected_value"), @@ -643,7 +385,7 @@ class DescribeTextPreChunk: ("", ["bar", "baz"], "foo", ["bah", "dah"], False), ], ) - def it_knows_when_it_is_equal_to_another_TextPreChunk_instance( + def it_knows_when_it_is_equal_to_another_PreChunk_instance( self, overlap_pfx: str, texts: list[str], @@ -652,15 +394,15 @@ def it_knows_when_it_is_equal_to_another_TextPreChunk_instance( expected_value: bool, ): opts = ChunkingOptions() - pre_chunk = TextPreChunk([Text(t) for t in texts], overlap_prefix=overlap_pfx, opts=opts) - other_pre_chunk = TextPreChunk( + pre_chunk = PreChunk([Text(t) for t in texts], overlap_prefix=overlap_pfx, opts=opts) + other_pre_chunk = PreChunk( [Text(t) for t in other_texts], overlap_prefix=other_overlap_pfx, opts=opts ) assert (pre_chunk == other_pre_chunk) is expected_value - def and_it_knows_it_is_not_equal_to_an_object_that_is_not_a_TextPreChunk(self): - 
pre_chunk = TextPreChunk([], overlap_prefix="", opts=ChunkingOptions()) + def and_it_knows_it_is_NOT_equal_to_an_object_that_is_not_a_PreChunk(self): + pre_chunk = PreChunk([], overlap_prefix="", opts=ChunkingOptions()) assert pre_chunk != 42 @pytest.mark.parametrize( @@ -676,22 +418,22 @@ def and_it_knows_it_is_not_equal_to_an_object_that_is_not_a_TextPreChunk(self): (99, 73, False), ], ) - def it_knows_when_it_can_combine_itself_with_another_TextPreChunk_instance( + def it_knows_when_it_can_combine_itself_with_another_PreChunk_instance( self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool ): - """This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals.""" + """This allows `PreChunkCombiner` to operate without knowing `PreChunk` internals.""" opts = ChunkingOptions( max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars, overlap=20, overlap_all=True, ) - pre_chunk = TextPreChunk( + pre_chunk = PreChunk( [Text("Lorem ipsum dolor sit amet consectetur adipiscing.")], # len == 50 overlap_prefix="e feugiat efficitur.", # len == 20 opts=opts, ) - next_pre_chunk = TextPreChunk( + next_pre_chunk = PreChunk( [Text("In rhoncus sum sed lectus.")], # len == 26 overlap_prefix="sectetur adipiscing.", # len == 20 but shouldn't come into computation opts=opts, @@ -699,13 +441,13 @@ def it_knows_when_it_can_combine_itself_with_another_TextPreChunk_instance( assert pre_chunk.can_combine(next_pre_chunk) is expected_value - def it_can_combine_itself_with_another_TextPreChunk_instance(self): + def it_can_combine_itself_with_another_PreChunk_instance(self): """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`. Note that neither the original or other pre_chunk are mutated. 
""" opts = ChunkingOptions() - pre_chunk = TextPreChunk( + pre_chunk = PreChunk( [ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("In rhoncus ipsum sed lectus porta volutpat."), @@ -713,7 +455,7 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self): overlap_prefix="feugiat efficitur.", opts=opts, ) - other_pre_chunk = TextPreChunk( + other_pre_chunk = PreChunk( [ Text("Donec semper facilisis metus finibus malesuada."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), @@ -728,7 +470,7 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self): # -- overlap-prefix from the existing pre-chunk and the other overlap-prefix is discarded # -- (although it's still in there at the end of the first pre-chunk since that's where it # -- came from originally). - assert new_pre_chunk == TextPreChunk( + assert new_pre_chunk == PreChunk( [ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("In rhoncus ipsum sed lectus porta volutpat."), @@ -740,7 +482,7 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self): ) # -- Neither pre-chunk used for combining is mutated, so we don't have to worry about who # -- else may have been given a reference to them. 
- assert pre_chunk == TextPreChunk( + assert pre_chunk == PreChunk( [ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("In rhoncus ipsum sed lectus porta volutpat."), @@ -748,7 +490,7 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self): overlap_prefix="feugiat efficitur.", opts=opts, ) - assert other_pre_chunk == TextPreChunk( + assert other_pre_chunk == PreChunk( [ Text("Donec semper facilisis metus finibus malesuada."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), @@ -757,6 +499,52 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self): opts=opts, ) + @pytest.mark.parametrize( + ("text", "expected_value"), + [ + # -- normally it splits exactly on overlap size |------- 20 -------| + ("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."), + # -- but it strips leading and trailing whitespace when the tail includes it -- + ("In rhoncus ipsum sed lect us portas volutpat. ", "us portas volutpat."), + ], + ) + def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap( + self, text: str, expected_value: str + ): + pre_chunk = PreChunk( + [Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True) + ) + assert pre_chunk.overlap_tail == expected_value + + @pytest.mark.parametrize( + ("elements", "overlap_prefix", "expected_value"), + [ + ([Text("foo"), Text("bar")], "bah da bing.", "bah da bing.\n\nfoo\n\nbar"), + ([Text("foo"), PageBreak(""), Text("bar")], "da bang.", "da bang.\n\nfoo\n\nbar"), + ([PageBreak(""), Text("foo")], "bah da boom.", "bah da boom.\n\nfoo"), + ([Text("foo"), Text("bar"), PageBreak("")], "", "foo\n\nbar"), + ], + ) + def it_knows_the_concatenated_text_of_the_pre_chunk_to_help( + self, elements: list[Text], overlap_prefix: str, expected_value: str + ): + """._text is the "joined" text of the pre-chunk elements. + + The text-segment contributed by each element is separated from the next by a blank line + ("\n\n"). 
An element that contributes no text does not give rise to a separator. + """ + pre_chunk = PreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions()) + assert pre_chunk._text == expected_value + + +# ================================================================================================ +# CHUNKING HELPER/SPLITTERS +# ================================================================================================ + + +class Describe_Chunker: + """Unit-test suite for `unstructured.chunking.base._Chunker` objects.""" + def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self): elements = [ Title("Introduction"), @@ -766,16 +554,23 @@ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window ), ] opts = ChunkingOptions(max_characters=200, include_orig_elements=True) - pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts) + chunker = _Chunker( + elements, + text=( + "e feugiat efficitur.\n\nIntroduction\n\nLorem ipsum dolor sit amet consectetur" + " adipiscing elit. In rhoncus ipsum sed lectus porta volutpat." + ), + opts=opts, + ) - chunk_iter = pre_chunk.iter_chunks() + chunk_iter = chunker._iter_chunks() chunk = next(chunk_iter) assert chunk == CompositeElement( "e feugiat efficitur.\n\nIntroduction\n\nLorem ipsum dolor sit amet consectetur" " adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.", ) - assert chunk.metadata is pre_chunk._consolidated_metadata + assert chunk.metadata is chunker._consolidated_metadata assert chunk.metadata.orig_elements == elements # -- with pytest.raises(StopIteration): @@ -783,19 +578,17 @@ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self): # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window. - # -- The pre-chunker will isolate that element in a pre_chunk of its own. 
- elements = [ - Text( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" - " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" - " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea" - " commodo consequat." - ) - ] + # -- The pre-chunker will automatically isolate that element in a pre_chunk of its own. + text = ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor" + " incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud" + " exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat." + ) + elements = [Text(text)] opts = ChunkingOptions(max_characters=200, include_orig_elements=True) - pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts) + chunker = _Chunker(elements, text=text, opts=opts) - chunk_iter = pre_chunk.iter_chunks() + chunk_iter = chunker._iter_chunks() # -- Note that .metadata.orig_elements is the same single original element, "repeated" for # -- each text-split chunk. This behavior emerges without explicit command as a consequence @@ -807,93 +600,70 @@ def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(se " tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim" " veniam, quis nostrud exercitation ullamco laboris nisi ut" ) - assert chunk.metadata is pre_chunk._consolidated_metadata + assert chunk.metadata is chunker._consolidated_metadata assert chunk.metadata.orig_elements == elements # -- chunk = next(chunk_iter) assert chunk == CompositeElement("aliquip ex ea commodo consequat.") - assert chunk.metadata is pre_chunk._continuation_metadata + assert chunk.metadata is chunker._continuation_metadata assert chunk.metadata.orig_elements == elements # -- with pytest.raises(StopIteration): next(chunk_iter) - def and_it_adds_the_is_continuation_flag_for_second_and_later_text_split_chunks(self): + def and_it_adds_the_is_continuation_flag_for_second_and_later_split_chunks(self): + # -- |--------------------- 48 ---------------------| + text = "'Lorem ipsum dolor' means 'Thank you very much'." metadata = ElementMetadata( category_depth=0, filename="foo.docx", languages=["lat"], parent_id="f87731e0", ) + elements = [Text(text, metadata=metadata)] - pre_chunk = TextPreChunk( - # -- |--------------------- 48 ---------------------| - [Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata)], - overlap_prefix="", - opts=ChunkingOptions(max_characters=20), - ) - - chunk_iter = pre_chunk.iter_chunks() + chunk_iter = _Chunker.iter_chunks(elements, text, opts=ChunkingOptions(max_characters=20)) assert [c.metadata.is_continuation for c in chunk_iter] == [None, True, True] def but_it_generates_no_chunks_when_the_pre_chunk_contains_no_text(self): metadata = ElementMetadata() - pre_chunk = TextPreChunk( - [PageBreak("", metadata=metadata)], - overlap_prefix="", + + chunk_iter = _Chunker.iter_chunks( + [PageBreak(" ", metadata=metadata)], + text="", opts=ChunkingOptions(), ) - chunk_iter = pre_chunk.iter_chunks() - with pytest.raises(StopIteration): next(chunk_iter) - @pytest.mark.parametrize( - ("text", "expected_value"), - [ - # -- normally it splits exactly on overlap size |------- 20 -------| - ("In 
rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."), - # -- but it strips leading and trailing whitespace when the tail includes it -- - ("In rhoncus ipsum sed lectus porta volutpat. ", "porta volutpat."), - ], - ) - def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap( - self, text: str, expected_value: str - ): - pre_chunk = TextPreChunk( - [Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True) - ) - assert pre_chunk.overlap_tail == expected_value - def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self): - pre_chunk = TextPreChunk( - [ - Title( - "Lorem Ipsum", - metadata=ElementMetadata( - category_depth=0, - filename="foo.docx", - languages=["lat"], - parent_id="f87731e0", - ), + elements = [ + Title( + "Lorem Ipsum", + metadata=ElementMetadata( + category_depth=0, + filename="foo.docx", + languages=["lat"], + parent_id="f87731e0", ), - Text( - "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", - metadata=ElementMetadata( - category_depth=1, - filename="foo.docx", - image_path="sprite.png", - languages=["lat", "eng"], - ), + ), + Text( + "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", + metadata=ElementMetadata( + category_depth=1, + filename="foo.docx", + image_path="sprite.png", + languages=["lat", "eng"], ), - ], - overlap_prefix="", - opts=ChunkingOptions(), - ) + ), + ] + text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin." 
+ + chunker = _Chunker(elements, text=text, opts=ChunkingOptions()) - assert pre_chunk._all_metadata_values == { + assert chunker._all_metadata_values == { # -- scalar values are accumulated in a list in element order -- "category_depth": [0, 1], # -- all values are accumulated, not only unique ones -- @@ -920,19 +690,17 @@ def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self): image_path="sprite.png", languages=["lat", "eng"], ) - metadata_2.quotient = 1.74 - - pre_chunk = TextPreChunk( - [ - Title("Lorem Ipsum", metadata=metadata), - Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), - ], - overlap_prefix="", - opts=ChunkingOptions(), - ) + metadata_2.quotient = 1.74 + elements = [ + Title("Lorem Ipsum", metadata=metadata), + Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), + ] + text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin." + + chunker = _Chunker(elements, text=text, opts=ChunkingOptions()) # -- ad-hoc fields "coefficient" and "quotient" do not appear -- - assert pre_chunk._all_metadata_values == { + assert chunker._all_metadata_values == { "category_depth": [0, 1], "filename": ["foo.docx", "foo.docx"], "image_path": ["sprite.png"], @@ -945,9 +713,11 @@ def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self): metadata = ElementMetadata(filename="foo.pdf") element = Title("Lorem Ipsum", metadata=metadata) element_2 = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata) - pre_chunk = TextPreChunk([element, element_2], overlap_prefix="", opts=opts) + elements = [element, element_2] + text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin." 
+ chunker = _Chunker(elements, text=text, opts=opts) - consolidated_metadata = pre_chunk._consolidated_metadata + consolidated_metadata = chunker._consolidated_metadata # -- pre-chunk elements are included as metadata -- orig_elements = consolidated_metadata.orig_elements @@ -963,40 +733,38 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate Only non-None fields should appear in the dict and each field value should be the consolidation of the values across the pre_chunk elements. """ - pre_chunk = TextPreChunk( - [ - PageBreak(""), - Title( - "Lorem Ipsum", - metadata=ElementMetadata( - filename="foo.docx", - # -- category_depth has DROP strategy so doesn't appear in result -- - category_depth=0, - emphasized_text_contents=["Lorem", "Ipsum"], - emphasized_text_tags=["b", "i"], - languages=["lat"], - ), + elements = [ + PageBreak(""), + Title( + "Lorem Ipsum", + metadata=ElementMetadata( + filename="foo.docx", + # -- category_depth has DROP strategy so doesn't appear in result -- + category_depth=0, + emphasized_text_contents=["Lorem", "Ipsum"], + emphasized_text_tags=["b", "i"], + languages=["lat"], ), - Text( - "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", - metadata=ElementMetadata( - # -- filename change doesn't happen IRL but demonstrates FIRST strategy -- - filename="bar.docx", - # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem" - # -- appears twice in consolidated-meta (as it should) and length matches - # -- that of emphasized_text_tags both before and after consolidation. 
- emphasized_text_contents=["Lorem", "ipsum"], - emphasized_text_tags=["i", "b"], - # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once -- - languages=["eng", "lat"], - ), + ), + Text( + "'Lorem ipsum dolor' means 'Thank you very much' in Latin.", + metadata=ElementMetadata( + # -- filename change doesn't happen IRL but demonstrates FIRST strategy -- + filename="bar.docx", + # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem" + # -- appears twice in consolidated-meta (as it should) and length matches + # -- that of emphasized_text_tags both before and after consolidation. + emphasized_text_contents=["Lorem", "ipsum"], + emphasized_text_tags=["i", "b"], + # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once -- + languages=["eng", "lat"], ), - ], - overlap_prefix="", - opts=ChunkingOptions(), - ) + ), + ] + text = "Lorem Ipsum\n\n'Lorem ipsum dolor' means 'Thank you very much' in Latin." + chunker = _Chunker(elements, text=text, opts=ChunkingOptions()) - meta_kwargs = pre_chunk._meta_kwargs + meta_kwargs = chunker._meta_kwargs assert meta_kwargs == { "filename": "foo.docx", @@ -1006,19 +774,21 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate } def it_computes_the_original_elements_list_to_help(self): + opts = ChunkingOptions(include_orig_elements=True) element = Title("Introduction") element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") element_3 = CompositeElement( "In rhoncus ipsum sed lectus porta volutpat.", metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]), ) - pre_chunk = TextPreChunk( - [element, element_2, element_3], - overlap_prefix="", - opts=ChunkingOptions(include_orig_elements=True), + elements = [element, element_2, element_3] + text = ( + "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn" + " rhoncus ipsum sed lectus porta volutpat." 
) + chunker = _Chunker(elements, text=text, opts=opts) - orig_elements = pre_chunk._orig_elements + orig_elements = chunker._orig_elements # -- all elements of pre-chunk are included -- assert orig_elements == [element, element_2, element_3] @@ -1029,39 +799,233 @@ def it_computes_the_original_elements_list_to_help(self): assert orig_elements[2] is not element_3 assert orig_elements[2].metadata.orig_elements is None # -- computation is only on first call, all chunks get exactly the same orig-elements -- - assert pre_chunk._orig_elements is orig_elements + assert chunker._orig_elements is orig_elements + + +class Describe_TableChunker: + """Unit-test suite for `unstructured.chunking.base._TableChunker` objects.""" + + def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self): + html_table = ( + "
\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "
Header Col 1 Header Col 2
Lorem ipsum adipiscing
" + ) + text_table = "Header Col 1 Header Col 2\nLorem ipsum adipiscing" + + chunk_iter = _TableChunker.iter_chunks( + Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), + overlap_prefix="ctus porta volutpat.", + opts=ChunkingOptions(max_characters=175), + ) + + chunk = next(chunk_iter) + assert isinstance(chunk, Table) + assert chunk.text == ( + "ctus porta volutpat.\nHeader Col 1 Header Col 2\nLorem ipsum adipiscing" + ) + assert chunk.metadata.text_as_html == ( + "" + "" + "" + "
Header Col 1Header Col 2
Lorem ipsumadipiscing
" + ) + with pytest.raises(StopIteration): + next(chunk_iter) + + def but_not_when_the_table_is_is_empty_or_contains_only_whitespace(self): + html_table = "
\t \n
" + + chunk_iter = _TableChunker.iter_chunks( + Table(" \t \n ", metadata=ElementMetadata(text_as_html=html_table)), + overlap_prefix="volutpat.", + opts=ChunkingOptions(max_characters=175), + ) + + with pytest.raises(StopIteration): + next(chunk_iter) + + def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self): + table = Table("foo bar", metadata=ElementMetadata(text_as_html="foo bar
")) + opts = ChunkingOptions(include_orig_elements=True) + + chunk_iter = _TableChunker.iter_chunks(table, "", opts) + + chunk = next(chunk_iter) + assert isinstance(chunk, Table) + assert chunk.metadata.orig_elements == [table] + assert chunk.metadata.text_as_html == "foo bar
" + # -- + with pytest.raises(StopIteration): + next(chunk_iter) + + def but_not_when_instructed_not_to(self): + chunk_iter = _TableChunker.iter_chunks( + Table("foobar"), "", ChunkingOptions(include_orig_elements=False) + ) + + chunk = next(chunk_iter) + + assert isinstance(chunk, Table) + assert chunk.metadata.orig_elements is None + + def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self): + html_table = """\ + + + + + + + + + +
Header Col 1 Header Col 2
Lorem ipsum A Link example
Consectetur adipiscing elit
Nunc aliquam id enim nec molestie
+ """ + text_table = ( + "Header Col 1 Header Col 2\n" + "Lorem ipsum dolor sit amet\n" + "Consectetur adipiscing elit\n" + "Nunc aliquam id enim nec molestie\n" + "Vivamus quis nunc ipsum donec ac fermentum" + ) + + chunk_iter = _TableChunker.iter_chunks( + Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), + overlap_prefix="", + opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")), + ) + + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert chunk.text == ( + "Header Col 1 Header Col 2 Lorem ipsum A Link example Consectetur adipiscing elit" + ) + assert chunk.metadata.text_as_html == ( + "" + "" + "" + "" + "
Header Col 1Header Col 2
Lorem ipsumA Link example
Consecteturadipiscing elit
" + ) + assert chunk.metadata.is_continuation is None + # -- + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert chunk.text == "Nunc aliquam id enim nec molestie" + assert chunk.metadata.text_as_html == ( + "
Nunc aliquamid enim nec molestie
" + ) + assert chunk.metadata.is_continuation + # -- + with pytest.raises(StopIteration): + next(chunk_iter) + + def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self): + """Even though text and html are split, the orig_elements metadata is not.""" + table = Table( + "Header Col 1 Header Col 2\nLorem ipsum dolor sit amet", + metadata=ElementMetadata(text_as_html=""), + ) + opts = ChunkingOptions(max_characters=30, include_orig_elements=True) + + chunk_iter = _TableChunker.iter_chunks(table, overlap_prefix="", opts=opts) + + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert chunk.text == "Header Col 1 Header Col 2" + assert chunk.metadata.orig_elements == [table] + assert not chunk.metadata.is_continuation + # -- + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert chunk.text == "Lorem ipsum dolor sit amet" + assert chunk.metadata.orig_elements == [table] + assert chunk.metadata.is_continuation @pytest.mark.parametrize( - ("elements", "overlap_prefix", "expected_value"), + ("text", "overlap_prefix", "expected_value"), [ - ([Text("foo"), Text("bar")], "bah da bing.", "bah da bing.\n\nfoo\n\nbar"), - ([Text("foo"), PageBreak(""), Text("bar")], "da bang.", "da bang.\n\nfoo\n\nbar"), - ([PageBreak(""), Text("foo")], "bah da boom.", "bah da boom.\n\nfoo"), - ([Text("foo"), Text("bar"), PageBreak("")], "", "foo\n\nbar"), + ( + "In rhoncus ipsum sed lectus porta volutpat.", + "", + "In rhoncus ipsum sed lectus porta volutpat.", + ), + ( + "In rhoncus ipsum sed lectus porta volutpat.", + "ctus porta volutpat.", + "ctus porta volutpat.\nIn rhoncus ipsum sed lectus porta volutpat.", + ), ], ) - def it_knows_the_concatenated_text_of_the_pre_chunk_to_help( - self, elements: list[Text], overlap_prefix: str, expected_value: str + def it_includes_its_overlap_prefix_in_its_text_when_present( + self, text: str, overlap_prefix: str, expected_value: str ): - """._text is the "joined" text of the pre-chunk 
elements. + table_chunker = _TableChunker( + Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions() + ) + assert table_chunker._text_with_overlap == expected_value - The text-segment contributed by each element is separated from the next by a blank line - ("\n\n"). An element that contributes no text does not give rise to a separator. - """ - pre_chunk = TextPreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions()) - assert pre_chunk._text == expected_value + def it_computes_metadata_for_each_chunk_to_help(self): + table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="
")) + table_chunker = _TableChunker(table, overlap_prefix="", opts=ChunkingOptions()) + + metadata = table_chunker._metadata + + assert metadata.text_as_html == "
" + # -- opts.include_orig_elements is True by default -- + assert metadata.orig_elements == [table] + # -- it produces a new instance each time it is called so changing one chunk's metadata does + # -- not change that of any other chunk. + assert table_chunker._metadata is not metadata + + def but_it_omits_orig_elements_from_metadata_when_so_instructed(self): + table_chunker = _TableChunker( + Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="
")), + overlap_prefix="", + opts=ChunkingOptions(include_orig_elements=False), + ) + + assert table_chunker._metadata.orig_elements is None + + def it_computes_the_original_elements_list_to_help(self): + table = Table( + "Lorem ipsum", + metadata=ElementMetadata(text_as_html="
", orig_elements=[Table("Lorem Ipsum")]), + ) + table_chunker = _TableChunker(table, overlap_prefix="", opts=ChunkingOptions()) + + orig_elements = table_chunker._orig_elements + + # -- a _TableChunker always has exactly one original (Table) element -- + assert len(orig_elements) == 1 + orig_element = orig_elements[0] + # -- each item in orig_elements is a copy of the original element so we can mutate it + # -- without changing user's data. + assert orig_element == table + assert orig_element is not table + # -- it strips any .metadata.orig_elements from each element to prevent a recursive data + # -- structure + assert orig_element.metadata.orig_elements is None + # -- computation is only on first call, all chunks get exactly the same orig-elements -- + assert table_chunker._orig_elements is orig_elements # ================================================================================================ -# PRE-CHUNK SPLITTERS +# HTML SPLITTERS # ================================================================================================ -class Describe_TableSplitter: - """Unit-test suite for `unstructured.chunking.base._TableSplitter`.""" +class Describe_HtmlTableSplitter: + """Unit-test suite for `unstructured.chunking.base._HtmlTableSplitter`.""" def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self): - opts = ChunkingOptions(max_characters=(150)) + opts = ChunkingOptions(max_characters=(40)) html_table = HtmlTable.from_html_text( """
@@ -1097,7 +1061,7 @@ def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self): """ ) - assert list(_TableSplitter.iter_subtables(html_table, opts)) == [ + assert list(_HtmlTableSplitter.iter_subtables(html_table, opts)) == [ ( "Stanley Cups Team Location Stanley Cups", "
" @@ -1119,7 +1083,7 @@ def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self): ] def and_it_splits_an_oversized_row_on_an_even_cell_boundary_when_possible(self): - opts = ChunkingOptions(max_characters=(100)) + opts = ChunkingOptions(max_characters=(93)) html_table = HtmlTable.from_html_text( """
@@ -1143,7 +1107,7 @@ def and_it_splits_an_oversized_row_on_an_even_cell_boundary_when_possible(self): """ ) - assert list(_TableSplitter.iter_subtables(html_table, opts)) == [ + assert list(_HtmlTableSplitter.iter_subtables(html_table, opts)) == [ ( "Lorem ipsum dolor sit amet. Consectetur adipiscing elit.", "
" @@ -1189,7 +1153,7 @@ def and_it_splits_an_oversized_cell_on_an_even_word_boundary(self): """ ) - assert list(_TableSplitter.iter_subtables(html_table, opts)) == [ + assert list(_HtmlTableSplitter.iter_subtables(html_table, opts)) == [ ( "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do", "
" @@ -1346,16 +1310,14 @@ def it_accumulates_elements_added_to_it(self): ("", False), ], ) - def it_will_fit_a_cell_with_text_shorter_than_maxlen_minus_33_when_empty( + def it_will_fit_a_cell_with_text_shorter_than_maxlen_when_empty( self, cell_html: str, expected_value: bool ): - """Cell text must be 22-chars or shorter to fit in 55-char window. - - `
Lorem Ipsum dolor sit amet.
...
` overhead is 33 characters. - """ - accum = _CellAccumulator(maxlen=55) + accum = _CellAccumulator(maxlen=25) cell = HtmlCell(fragment_fromstring(cell_html)) + print(f"{cell.text=}") + assert accum.will_fit(cell) is expected_value @pytest.mark.parametrize( @@ -1368,16 +1330,12 @@ def it_will_fit_a_cell_with_text_shorter_than_maxlen_minus_33_when_empty( ("Lorem Ipsum dolor sit amet.", False), # -- 27 -- ], ) - def and_it_will_fit_a_cell_with_text_shorter_than_remaining_space_minus_9_when_not_empty( + def and_it_will_fit_a_cell_with_text_shorter_than_remaining_space_when_not_empty( self, cell_html: str, expected_value: bool ): - """Cell text must be 9-chars shorter than remaining space to fit with accumulated cells. - - `...` overhead is 9 characters. - """ - accum = _CellAccumulator(maxlen=85) + accum = _CellAccumulator(maxlen=44) accum.add_cell(HtmlCell(fragment_fromstring("abcdefghijklmnopqrstuvwxyz"))) - # -- remaining space is 85 - 26 -33 = 26; max new cell text len is 17 -- + # -- remaining space is 44 - 26 = 18; max new cell text len is 17 -- cell = HtmlCell(fragment_fromstring(cell_html)) assert accum.will_fit(cell) is expected_value @@ -1429,23 +1387,19 @@ def it_accumulates_rows_added_to_it(self): @pytest.mark.parametrize( ("row_html", "expected_value"), [ - ("", True), # -- 5 -- - ("", True), # -- 14 -- - ("Lorem Ipsum.", True), # -- 30 -- - ("Lorem Ipsum dolor sit.", True), # -- 40 -- - ("LoremSit amet", True), # -- 40 -- - ("Lorem Ipsum dolor sit amet.", False), # -- 45 -- - ("Lorem IpsumDolor sit.", False), # -- 48 -- + ("", True), # -- 0 -- + ("", True), # -- 0 -- + ("Lorem Ipsum.", True), # -- 12 -- + ("Lorem Ipsum dolor sit", True), # -- 21 -- + ("LoremSit amet", True), # -- 14 -- + ("Lorem Ipsum dolor sit amet.", False), # -- 27 -- + ("Lorem IpsumDolor sit.", False), # -- 22 -- ], ) - def it_will_fit_a_row_with_HTML_shorter_than_maxlen_minus_15_when_empty( + def it_will_fit_a_row_with_text_shorter_than_maxlen_when_empty( self, row_html: str, 
expected_value: bool ): - """Row HTML must be 40-chars or shorter to fit in 55-char chunking window. - - `...
` overhead is 15 characters. - """ - accum = _RowAccumulator(maxlen=55) + accum = _RowAccumulator(maxlen=21) row = HtmlRow(fragment_fromstring(row_html)) assert accum.will_fit(row) is expected_value @@ -1453,22 +1407,22 @@ def it_will_fit_a_row_with_HTML_shorter_than_maxlen_minus_15_when_empty( @pytest.mark.parametrize( ("row_html", "expected_value"), [ - ("", True), # -- 5 -- - ("", True), # -- 14 -- - ("Lorem Ipsum dolor sit", True), # -- 39 -- - ("Lorem Ipsum dolor sit.", True), # -- 40 -- - ("LoremSit amet", True), # -- 40 -- - ("LoremSit amet.", False), # -- 41 -- - ("Lorem IpsumDolor sit.", False), # -- 48 -- + ("", True), # -- 0 -- + ("", True), # -- 0 -- + ("Lorem Ipsum.", True), # -- 12 -- + ("Lorem Ipsum dolor sit", True), # -- 21 -- + ("LoremSit amet", True), # -- 14 -- + ("Lorem Ipsum dolor sit amet.", False), # -- 27 -- + ("Lorem IpsumDolor sit.", False), # -- 22 -- ], ) - def and_it_will_fit_a_row_with_HTML_shorter_than_remaining_space_when_not_empty( + def and_it_will_fit_a_row_with_text_shorter_than_remaining_space_when_not_empty( self, row_html: str, expected_value: bool ): """There is no overhead beyond row HTML for additional rows.""" - accum = _RowAccumulator(maxlen=99) + accum = _RowAccumulator(maxlen=48) accum.add_row(HtmlRow(fragment_fromstring("abcdefghijklmnopqrstuvwxyz"))) - # -- remaining space is 85 - 26 - 33 = 26; max new row HTML len is 40 -- + # -- remaining space is 48 - 26 = 21 -- row = HtmlRow(fragment_fromstring(row_html)) assert accum.will_fit(row) is expected_value @@ -1514,10 +1468,10 @@ def but_it_does_not_generate_a_TextAndHtml_pair_when_empty(self): class DescribePreChunkCombiner: """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`.""" - def it_combines_sequential_small_text_pre_chunks(self): + def it_combines_sequential_small_pre_chunks(self): opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250) pre_chunks = [ - TextPreChunk( + PreChunk( [ Title("Lorem Ipsum"), # 11 Text("Lorem ipsum 
dolor sit amet consectetur adipiscing elit."), # 55 @@ -1525,7 +1479,8 @@ def it_combines_sequential_small_text_pre_chunks(self): overlap_prefix="", opts=opts, ), - TextPreChunk( + PreChunk([Table("Heading\nCell text")], overlap_prefix="", opts=opts), + PreChunk( [ Title("Mauris Nec"), # 10 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 @@ -1533,7 +1488,7 @@ def it_combines_sequential_small_text_pre_chunks(self): overlap_prefix="", opts=opts, ), - TextPreChunk( + PreChunk( [ Title("Sed Orci"), # 8 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 @@ -1546,10 +1501,11 @@ def it_combines_sequential_small_text_pre_chunks(self): pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), + Table("Heading\nCell text"), Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), Title("Sed Orci"), @@ -1558,57 +1514,10 @@ def it_combines_sequential_small_text_pre_chunks(self): with pytest.raises(StopIteration): next(pre_chunk_iter) - def but_it_does_not_combine_table_pre_chunks(self): - opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250) - pre_chunks = [ - TextPreChunk( - [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - ], - overlap_prefix="", - opts=opts, - ), - TablePreChunk(Table("Heading\nCell text"), overlap_prefix="", opts=opts), - TextPreChunk( - [ - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ], - overlap_prefix="", - opts=opts, - ), - ] - - pre_chunk_iter = PreChunkCombiner( - pre_chunks, ChunkingOptions(max_characters=250, combine_text_under_n_chars=250) - ).iter_combined_pre_chunks() - - pre_chunk = 
next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Lorem Ipsum"), - Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), - ] - # -- - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TablePreChunk) - assert pre_chunk._table == Table("Heading\nCell text") - # -- - pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ - Title("Mauris Nec"), - Text("Mauris nec urna non augue vulputate consequat eget et nisi."), - ] - # -- - with pytest.raises(StopIteration): - next(pre_chunk_iter) - def it_respects_the_specified_combination_threshold(self): opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=80) pre_chunks = [ - TextPreChunk( # 68 + PreChunk( # 68 [ Title("Lorem Ipsum"), # 11 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 @@ -1616,7 +1525,7 @@ def it_respects_the_specified_combination_threshold(self): overlap_prefix="", opts=opts, ), - TextPreChunk( # 71 + PreChunk( # 71 [ Title("Mauris Nec"), # 10 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 @@ -1625,7 +1534,7 @@ def it_respects_the_specified_combination_threshold(self): opts=opts, ), # -- len == 139 - TextPreChunk( + PreChunk( [ Title("Sed Orci"), # 8 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 @@ -1638,7 +1547,7 @@ def it_respects_the_specified_combination_threshold(self): pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), @@ -1647,7 +1556,7 @@ def it_respects_the_specified_combination_threshold(self): ] # -- pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert 
isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Title("Sed Orci"), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), @@ -1659,7 +1568,7 @@ def it_respects_the_specified_combination_threshold(self): def it_respects_the_hard_maximum_window_length(self): opts = ChunkingOptions(max_characters=200, combine_text_under_n_chars=200) pre_chunks = [ - TextPreChunk( # 68 + PreChunk( # 68 [ Title("Lorem Ipsum"), # 11 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 @@ -1667,7 +1576,7 @@ def it_respects_the_hard_maximum_window_length(self): overlap_prefix="", opts=opts, ), - TextPreChunk( # 71 + PreChunk( # 71 [ Title("Mauris Nec"), # 10 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 @@ -1676,7 +1585,7 @@ def it_respects_the_hard_maximum_window_length(self): opts=opts, ), # -- len == 139 - TextPreChunk( + PreChunk( [ Title("Sed Orci"), # 8 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 @@ -1690,7 +1599,7 @@ def it_respects_the_hard_maximum_window_length(self): pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), @@ -1699,7 +1608,7 @@ def it_respects_the_hard_maximum_window_length(self): ] # -- pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Title("Sed Orci"), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), @@ -1712,8 +1621,8 @@ def it_accommodates_and_isolates_an_oversized_pre_chunk(self): """Such as occurs when a single element exceeds the window size.""" opts = ChunkingOptions(max_characters=150, combine_text_under_n_chars=150) pre_chunks = [ - TextPreChunk([Title("Lorem 
Ipsum")], overlap_prefix="", opts=opts), - TextPreChunk( # 179 + PreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts), + PreChunk( # 179 [ Text( "Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55 @@ -1724,7 +1633,7 @@ def it_accommodates_and_isolates_an_oversized_pre_chunk(self): overlap_prefix="", opts=opts, ), - TextPreChunk([Title("Vulputate Consequat")], overlap_prefix="", opts=opts), + PreChunk([Title("Vulputate Consequat")], overlap_prefix="", opts=opts), ] pre_chunk_iter = PreChunkCombiner( @@ -1732,11 +1641,11 @@ def it_accommodates_and_isolates_an_oversized_pre_chunk(self): ).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [Title("Lorem Ipsum")] # -- pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Text( "Lorem ipsum dolor sit amet consectetur adipiscing elit." 
@@ -1746,21 +1655,21 @@ def it_accommodates_and_isolates_an_oversized_pre_chunk(self): ] # -- pre_chunk = next(pre_chunk_iter) - assert isinstance(pre_chunk, TextPreChunk) + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [Title("Vulputate Consequat")] # -- with pytest.raises(StopIteration): next(pre_chunk_iter) -class DescribeTextPreChunkAccumulator: - """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`.""" +class Describe_PreChunkAccumulator: + """Unit-test suite for `unstructured.chunking.base._PreChunkAccumulator`.""" - def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): + def it_generates_a_combined_PreChunk_when_flushed_and_resets_itself_to_empty(self): opts = ChunkingOptions(combine_text_under_n_chars=500) - accum = TextPreChunkAccumulator(opts=opts) + accum = _PreChunkAccumulator(opts=opts) - pre_chunk = TextPreChunk( + pre_chunk = PreChunk( [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), @@ -1771,7 +1680,7 @@ def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty assert accum.will_fit(pre_chunk) accum.add_pre_chunk(pre_chunk) - pre_chunk = TextPreChunk( + pre_chunk = PreChunk( [ Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), @@ -1782,7 +1691,7 @@ def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty assert accum.will_fit(pre_chunk) accum.add_pre_chunk(pre_chunk) - pre_chunk = TextPreChunk( + pre_chunk = PreChunk( [ Title("Sed Orci"), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."), @@ -1799,8 +1708,8 @@ def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty pre_chunk = next(pre_chunk_iter) with pytest.raises(StopIteration): next(pre_chunk_iter) - # -- and it is a _TextPreChunk containing all the elements -- - assert isinstance(pre_chunk, TextPreChunk) + # -- and it is a PreChunk 
containing all the elements -- + assert isinstance(pre_chunk, PreChunk) assert pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), @@ -1815,8 +1724,8 @@ def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty with pytest.raises(StopIteration): next(accum.flush()) - def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): - accum = TextPreChunkAccumulator(opts=ChunkingOptions(max_characters=150)) + def but_it_does_not_generate_a_PreChunk_on_flush_when_empty(self): + accum = _PreChunkAccumulator(opts=ChunkingOptions(max_characters=150)) assert list(accum.flush()) == [] diff --git a/test_unstructured/chunking/test_basic.py b/test_unstructured/chunking/test_basic.py index 85f807b6ff..88e01563fe 100644 --- a/test_unstructured/chunking/test_basic.py +++ b/test_unstructured/chunking/test_basic.py @@ -25,31 +25,31 @@ def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_parti assert chunks == [ CompositeElement( "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 – INTRODUCTION" - "\n\nA.\tPURPOSE" + "\n\nA. PURPOSE" ), CompositeElement( "The United States Trustee appoints and supervises standing trustees and monitors and" - " supervises cases under chapter 13 of title 11 of the United States Code. 28 U.S.C." - " § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586," + " supervises cases under chapter 13 of title 11 of the United States Code. 28 U.S.C." + " § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586," " establishes or clarifies the position of the United States Trustee Program (Program)" " on the duties owed by a standing trustee to the debtors, creditors, other parties in" - " interest, and the United States Trustee. The Handbook does not present a full and" + " interest, and the United States Trustee. 
The Handbook does not present a full and" ), CompositeElement( "complete statement of the law; it should not be used as a substitute for legal" - " research and analysis. The standing trustee must be familiar with relevant" + " research and analysis. The standing trustee must be familiar with relevant" " provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules)," - " any local bankruptcy rules, and case law. 11 U.S.C. § 321, 28 U.S.C. § 586," - " 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips" + " any local bankruptcy rules, and case law. 11 U.S.C. § 321, 28 U.S.C. § 586," + " 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips" " identified in this Handbook but these are not considered mandatory." ), CompositeElement( "Nothing in this Handbook should be construed to excuse the standing trustee from" " complying with all duties imposed by the Bankruptcy Code and Rules, local rules, and" - " orders of the court. The standing trustee should notify the United States Trustee" + " orders of the court. The standing trustee should notify the United States Trustee" " whenever the provision of the Handbook conflicts with the local rules or orders of" - " the court. The standing trustee is accountable for all duties set forth in this" - " Handbook, but need not personally perform any duty unless otherwise indicated. All" + " the court. The standing trustee is accountable for all duties set forth in this" + " Handbook, but need not personally perform any duty unless otherwise indicated. All" ), CompositeElement( "statutory references in this Handbook refer to the Bankruptcy Code, 11 U.S.C. 
§ 101" @@ -57,12 +57,12 @@ def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_parti ), CompositeElement( "This Handbook does not create additional rights against the standing trustee or" - " United States Trustee in favor of other parties.\n\nB.\tROLE OF THE UNITED STATES" + " United States Trustee in favor of other parties.\n\nB. ROLE OF THE UNITED STATES" " TRUSTEE" ), CompositeElement( "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the" - " responsibilities for daytoday administration of cases. Debtors, creditors, and" + " responsibilities for daytoday administration of cases. Debtors, creditors, and" " third parties with adverse interests to the trustee were concerned that the court," " which previously appointed and supervised the trustee, would not impartially" " adjudicate their rights as adversaries of that trustee. To address these concerns," @@ -70,24 +70,24 @@ def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_parti ), CompositeElement( "Many administrative functions formerly performed by the court were placed within the" - " Department of Justice through the creation of the Program. Among the administrative" + " Department of Justice through the creation of the Program. Among the administrative" " functions assigned to the United States Trustee were the appointment and supervision" - " of chapter 13 trustees./ This Handbook is issued under the authority of the" - " Program’s enabling statutes. \n\nC.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t" + " of chapter 13 trustees./ This Handbook is issued under the authority of the" + " Program’s enabling statutes.\n\nC. STATUTORY DUTIES OF A STANDING TRUSTEE" ), CompositeElement( - "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The" - " standing trustee is more than a mere disbursing agent. The standing trustee must" - " be personally involved in the trustee operation. 
If the standing trustee is or" + "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The" + " standing trustee is more than a mere disbursing agent. The standing trustee must" + " be personally involved in the trustee operation. If the standing trustee is or" " becomes unable to perform the duties and responsibilities of a standing trustee," " the standing trustee must immediately advise the United States Trustee." - " 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. § 58.3(b)." + " 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. § 58.3(b)." ), CompositeElement( "Although this Handbook is not intended to be a complete statutory reference, the" " standing trustee’s primary statutory duties are set forth in 11 U.S.C. § 1302, which" " incorporates by reference some of the duties of chapter 7 trustees found in" - " 11 U.S.C. § 704. These duties include, but are not limited to, the" + " 11 U.S.C. § 704. These duties include, but are not limited to, the" " following:\n\nCopyright" ), ] diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 45071e667e..443b073755 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -8,7 +8,7 @@ import pytest -from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock +from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock, input_path from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title from unstructured.documents.coordinates import CoordinateSystem @@ -20,10 +20,12 @@ ElementMetadata, ListItem, Table, + TableChunk, Text, Title, ) from unstructured.partition.html import partition_html +from unstructured.staging.base import elements_from_json # ================================================================================================ # INTEGRATION-TESTS @@ 
-33,7 +35,53 @@ # ================================================================================================ -def test_it_splits_a_large_element_into_multiple_chunks(): +def test_it_chunks_text_followed_by_table_together_when_both_fit(): + elements = elements_from_json(input_path("chunking/title_table_200.json")) + + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) + + assert len(chunks) == 1 + assert isinstance(chunks[0], CompositeElement) + + +def test_it_chunks_table_followed_by_text_together_when_both_fit(): + elements = elements_from_json(input_path("chunking/table_text_200.json")) + + # -- disable chunk combining so we test pre-chunking behavior, not chunk-combining -- + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) + + assert len(chunks) == 1 + assert isinstance(chunks[0], CompositeElement) + + +def test_it_splits_oversized_table(): + elements = elements_from_json(input_path("chunking/table_2000.json")) + + chunks = chunk_by_title(elements) + + assert len(chunks) == 5 + assert all(isinstance(chunk, TableChunk) for chunk in chunks) + + +def test_it_starts_new_chunk_for_table_after_full_text_chunk(): + elements = elements_from_json(input_path("chunking/long_text_table_200.json")) + + chunks = chunk_by_title(elements, max_characters=250) + + assert len(chunks) == 2 + assert [type(chunk) for chunk in chunks] == [CompositeElement, Table] + + +def test_it_starts_new_chunk_for_text_after_full_table_chunk(): + elements = elements_from_json(input_path("chunking/full_table_long_text_250.json")) + + chunks = chunk_by_title(elements, max_characters=250) + + assert len(chunks) == 2 + assert [type(chunk) for chunk in chunks] == [Table, CompositeElement] + + +def test_it_splits_a_large_text_element_into_multiple_chunks(): elements: list[Element] = [ Title("Introduction"), Text( @@ -68,7 +116,7 @@ def test_it_splits_elements_by_title_and_table(): chunks = chunk_by_title(elements, combine_text_under_n_chars=0, 
include_orig_elements=True) - assert len(chunks) == 4 + assert len(chunks) == 3 # -- chunk = chunks[0] assert isinstance(chunk, CompositeElement) @@ -76,13 +124,10 @@ def test_it_splits_elements_by_title_and_table(): Title("A Great Day"), Text("Today is a great day."), Text("It is sunny outside."), + Table("Heading\nCell text"), ] # -- chunk = chunks[1] - assert isinstance(chunk, Table) - assert chunk.metadata.orig_elements == [Table("Heading\nCell text")] - # == - chunk = chunks[2] assert isinstance(chunk, CompositeElement) assert chunk.metadata.orig_elements == [ Title("An Okay Day"), @@ -90,7 +135,7 @@ def test_it_splits_elements_by_title_and_table(): Text("It is rainy outside."), ] # -- - chunk = chunks[3] + chunk = chunks[2] assert isinstance(chunk, CompositeElement) assert chunk.metadata.orig_elements == [ Title("A Bad Day"), @@ -119,9 +164,8 @@ def test_chunk_by_title(): assert chunks == [ CompositeElement( - "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.", + "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.\n\nHeading Cell text" ), - Table("Heading\nCell text"), CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."), CompositeElement( "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.", @@ -150,10 +194,7 @@ def test_chunk_by_title_separates_by_page_number(): CompositeElement( "A Great Day", ), - CompositeElement( - "Today is a great day.\n\nIt is sunny outside.", - ), - Table("Heading\nCell text"), + CompositeElement("Today is a great day.\n\nIt is sunny outside.\n\nHeading Cell text"), CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."), CompositeElement( "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.", @@ -178,9 +219,8 @@ def test_chuck_by_title_respects_multipage(): chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0) assert chunks == [ CompositeElement( - "A Great Day\n\nToday is a great day.\n\nIt is sunny 
outside.", + "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.\n\nHeading Cell text" ), - Table("Heading\nCell text"), CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."), CompositeElement( "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.", @@ -206,9 +246,8 @@ def test_chunk_by_title_groups_across_pages(): assert chunks == [ CompositeElement( - "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.", + "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.\n\nHeading Cell text" ), - Table("Heading\nCell text"), CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."), CompositeElement( "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.", diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py index 5022588a23..7a591953d3 100644 --- a/test_unstructured/partition/test_json.py +++ b/test_unstructured/partition/test_json.py @@ -37,7 +37,7 @@ def test_it_chunks_elements_when_a_chunking_strategy_is_specified(): "example-docs/spring-weather.html.json", chunking_strategy="basic", max_characters=1500 ) - assert len(chunks) == 10 + assert len(chunks) == 9 assert all(isinstance(ch, CompositeElement) for ch in chunks) diff --git a/test_unstructured/testfiles/chunking/full_table_long_text_250.json b/test_unstructured/testfiles/chunking/full_table_long_text_250.json new file mode 100644 index 0000000000..f8b739257d --- /dev/null +++ b/test_unstructured/testfiles/chunking/full_table_long_text_250.json @@ -0,0 +1,32 @@ +[ + { + "type": "Table", + "element_id": "ca96108263324e9d865a98f19cf7c940", + "text": "RFP Number: 2024-PMO-01 RFP Title: PMO Services RFP RFP Due Date and Time: Number of Pages: #189 05/30/2024 by 5:00pm Central Time", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "747587de72444235a68c768d544ff5f3", + "text_as_html": "
RFP Number: 2024-PMO-01RFP Title: PMO Services RFP
RFP Due Date and Time:Number of Pages: #189
05/30/2024 by 5:00pm Central Time
", + "languages": [ + "eng" + ], + "filetype": "text/html" + } + }, + { + "type": "NarrativeText", + "element_id": "5bc93ad5828445f98cac824c750cacfd", + "text": "Format: CSV file for Export and Download Contact: Charles Stringham cstringham@alsde.edu to arrange secure data transfer OR with technical questions nickey.johnson@alsde.edu for other questions", + "metadata": { + "category_depth": 2, + "page_number": 1, + "parent_id": "d8fa364bbfdf42d7b37c7a1dcb90ecf5", + "text_as_html": "

Format: CSV file for Export and Download

Contact: Charles Stringham cstringham@alsde.edu to arrange secure data transfer OR with technical questions

nickey.johnson@alsde.edu for other questions

", + "languages": [ + "eng" + ], + "filetype": "text/html" + } + } +] diff --git a/test_unstructured/testfiles/chunking/long_text_table_200.json b/test_unstructured/testfiles/chunking/long_text_table_200.json new file mode 100644 index 0000000000..4ff54bf06c --- /dev/null +++ b/test_unstructured/testfiles/chunking/long_text_table_200.json @@ -0,0 +1,32 @@ +[ + { + "type": "NarrativeText", + "element_id": "5bc93ad5828445f98cac824c750cacfd", + "text": "Format: CSV file for Export and Download Contact: Charles Stringham cstringham@alsde.edu to arrange secure data transfer OR with technical questions nickey.johnson@alsde.edu for other questions", + "metadata": { + "category_depth": 2, + "page_number": 1, + "parent_id": "d8fa364bbfdf42d7b37c7a1dcb90ecf5", + "text_as_html": "

Format: CSV file for Export and Download

Contact: Charles Stringham cstringham@alsde.edu to arrange secure data transfer OR with technical questions

nickey.johnson@alsde.edu for other questions

", + "languages": [ + "eng" + ], + "filetype": "text/html" + } + }, + { + "type": "Table", + "element_id": "ca96108263324e9d865a98f19cf7c940", + "text": "RFP Number: 2024-PMO-01 RFP Title: PMO Services RFP RFP Due Date and Time: Number of Pages: #189 05/30/2024 by 5:00pm Central Time", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "747587de72444235a68c768d544ff5f3", + "text_as_html": "
RFP Number: 2024-PMO-01RFP Title: PMO Services RFP
RFP Due Date and Time:Number of Pages: #189
05/30/2024 by 5:00pm Central Time
", + "languages": [ + "eng" + ], + "filetype": "text/html" + } + } +] diff --git a/test_unstructured/testfiles/chunking/table_2000.json b/test_unstructured/testfiles/chunking/table_2000.json new file mode 100644 index 0000000000..d5e013c383 --- /dev/null +++ b/test_unstructured/testfiles/chunking/table_2000.json @@ -0,0 +1,17 @@ +[ + { + "type": "Table", + "element_id": "e6278883f688428c98cec628a00b0102", + "text": "Field Name Size Type Description Example School_Year 9 VARCHAR School year the assessment was given 2019-2020 LEA_Name VARCHAR Official Name of the School System Happy City Schools LEA_Code 3 VARCHAR 3-digit ALSDE-assigned system code 010 or 298 School_Code 6 VARCHAR 4-digit ALSDE-assigned school code 0100 or 9203 Student_Identifier 10 VARCHAR Student's ALSDE ID number -SSID ***must be 10 digits and start with \"19\" or \"20\"*** 9999999999 Student_Last_Name 35 VARCHAR Student's last name Smith Student_First_Name 35 VARCHAR Student's first name Jane Student_Date_of_Birth_Month 2 VARCHAR Student birth date month. MM 05, 11 Student_Date_of_Birth_Day 2 VARCHAR Student birth date day. DD 03, 25 Student_Date_of_Birth_Year 4 VARCHAR Student birth date Year. YYYY 2015 Reading_Teacher_Identifier 13 VARCHAR Reading Teacher's ALSDE ID/TCHNumber. The teacher who is primarily responsible for Reading instruction of the student. (These are two names for the same number). ***must be in this format 3 letters, dash, 4 numbers, dash, 4 numbers*** XXX-9999-9999, NOJ-1234-5678 Reading_Assessment_Name 15 VARCHAR Unique identifier for Reading assessment. Vendor's name for overall assessment. XXXX Reading_Administration_Mode 8 VARCHAR This field indicates if the assessment was administered in an in-person (face-to-face) or a remote learning environment. The options are: InPerson or Remote Reading_Benchmark_Period 3 VARCHAR Benchmark period during the term the assessment was administered. Summer School will be SSS. 
BOY, MOY or EOY (SSS for summer school) Reading_Date_Completed 10 VARCHAR This is the date on which the assessment is completed MM/DD/YYYY 43962 Reading_Extended_Time 2 VARCHAR The field will contain a \"Y\" if the student was given more than the allotted time to finish the assessment or any subtest of the assessment as defined by the vendor in a standard administration. Y", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "3ddff8c2b6c44a16be24baf72bdd78a2", + "text_as_html": "
Field NameSizeTypeDescriptionExample
School_Year9VARCHARSchool year the assessment was given2019-2020
LEA_NameVARCHAROfficial Name of the School SystemHappy City Schools
LEA_Code3VARCHAR3-digit ALSDE-assigned system code010 or 298
School_Code6VARCHAR4-digit ALSDE-assigned school code0100 or 9203
Student_Identifier10VARCHARStudent's ALSDE ID number -SSID ***must be 10 digits and start with \"19\" or \"20\"***9999999999
Student_Last_Name35VARCHARStudent's last nameSmith
Student_First_Name35VARCHARStudent's first nameJane
Student_Date_of_Birth_Month2VARCHARStudent birth date month. MM05, 11
Student_Date_of_Birth_Day2VARCHARStudent birth date day. DD03, 25
Student_Date_of_Birth_Year4VARCHARStudent birth date Year. YYYY2015
Reading_Teacher_Identifier13VARCHARReading Teacher's ALSDE ID/TCHNumber. The teacher who is primarily responsible for Reading instruction of the student. (These are two names for the same number). ***must be in this format 3 letters, dash, 4 numbers, dash, 4 numbers***XXX-9999-9999, NOJ-1234-5678
Reading_Assessment_Name15VARCHARUnique identifier for Reading assessment. Vendor's name for overall assessment.XXXX
Reading_Administration_Mode8VARCHARThis field indicates if the assessment was administered in an in-person (face-to-face) or a remote learning environment. The options are:InPerson or Remote
Reading_Benchmark_Period3VARCHARBenchmark period during the term the assessment was administered. Summer School will be SSS.BOY, MOY or EOY (SSS for summer school)
Reading_Date_Completed10VARCHARThis is the date on which the assessment is completed MM/DD/YYYY43962
Reading_Extended_Time2VARCHARThe field will contain a \"Y\" if the student was given more than the allotted time to finish the assessment or any subtest of the assessment as defined by the vendor in a standard administration.Y
", + "languages": [ + "eng" + ], + "filetype": "text/html" + } + } +] diff --git a/test_unstructured/testfiles/chunking/table_text_200.json b/test_unstructured/testfiles/chunking/table_text_200.json new file mode 100644 index 0000000000..456d134358 --- /dev/null +++ b/test_unstructured/testfiles/chunking/table_text_200.json @@ -0,0 +1,32 @@ +[ + { + "type": "Table", + "element_id": "ca96108263324e9d865a98f19cf7c940", + "text": "RFP Number: 2024-PMO-01 RFP Title: PMO Services RFP RFP Due Date and Time: Number of Pages: #189 05/30/2024 by 5:00pm Central Time", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "747587de72444235a68c768d544ff5f3", + "text_as_html": "
RFP Number: 2024-PMO-01RFP Title: PMO Services RFP
RFP Due Date and Time:Number of Pages: #189
05/30/2024 by 5:00pm Central Time
", + "languages": [ + "eng" + ], + "filetype": "text/html" + } + }, + { + "type": "Text", + "element_id": "0163a58539934b3aaca402c9e961b0d6", + "text": "REQUEST FOR PROPOSALS", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "747587de72444235a68c768d544ff5f3", + "text_as_html": "

REQUEST FOR PROPOSALS

", + "languages": [ + "eng" + ], + "filetype": "text/html" + } + } +] diff --git a/test_unstructured/testfiles/chunking/title_table_200.json b/test_unstructured/testfiles/chunking/title_table_200.json new file mode 100644 index 0000000000..3d0a2b15a2 --- /dev/null +++ b/test_unstructured/testfiles/chunking/title_table_200.json @@ -0,0 +1,32 @@ +[ + { + "type": "Title", + "element_id": "0163a58539934b3aaca402c9e961b0d6", + "text": "REQUEST FOR PROPOSALS", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "747587de72444235a68c768d544ff5f3", + "text_as_html": "

REQUEST FOR PROPOSALS

", + "languages": [ + "eng" + ], + "filetype": "text/html" + } + }, + { + "type": "Table", + "element_id": "ca96108263324e9d865a98f19cf7c940", + "text": "RFP Number: 2024-PMO-01 RFP Title: PMO Services RFP RFP Due Date and Time: Number of Pages: #189 05/30/2024 by 5:00pm Central Time", + "metadata": { + "category_depth": 1, + "page_number": 1, + "parent_id": "747587de72444235a68c768d544ff5f3", + "text_as_html": "
RFP Number: 2024-PMO-01RFP Title: PMO Services RFP
RFP Due Date and Time:Number of Pages: #189
05/30/2024 by 5:00pm Central Time
", + "languages": [ + "eng" + ], + "filetype": "text/html" + } + } +] diff --git a/test_unstructured/unit_utils.py b/test_unstructured/unit_utils.py index 11b1106dfd..a3565dcd9d 100644 --- a/test_unstructured/unit_utils.py +++ b/test_unstructured/unit_utils.py @@ -101,6 +101,13 @@ def parse_optional_datetime(datetime_str: Optional[str]) -> Optional[dt.datetime return dt.datetime.fromisoformat(datetime_str) if datetime_str else None +def input_path(rel_path: str) -> str: + """Resolve the absolute-path to `rel_path` in the testfiles directory.""" + testfiles_dir = pathlib.Path(__file__).parent / "testfiles" + file_path = testfiles_dir / rel_path + return str(file_path.resolve()) + + # ------------------------------------------------------------------------------------------------ # MOCKING FIXTURES # ------------------------------------------------------------------------------------------------ diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json index 4f0950cc49..4f534582ea 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json @@ -1,8 +1,8 @@ [ { "type": "CompositeElement", - "element_id": "36385872440a208d3521a8a885d5f873", - "text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 \u2013 INTRODUCTION\n\nA.\tPURPOSE", + "element_id": "85002882dd396da0b1b82c925b002be5", + "text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 \u2013 INTRODUCTION\n\nA. PURPOSE", "metadata": { "data_source": { "record_locator": { @@ -55,8 +55,8 @@ }, { "type": "CompositeElement", - "element_id": "91d26c5ec7f727ece12679cf6b80f90d", - "text": "le 11 of the United States Code. 28 U.S.C. \u00a7 586(b). 
The Handbook, issued as part of our duties under 28 U.S.C. \u00a7 586, establishes or clarifies the", + "element_id": "1abe685eb8dfed0f2266d6cf793d7e6b", + "text": "le 11 of the United States Code. 28 U.S.C. \u00a7 586(b). The Handbook, issued as part of our duties under 28 U.S.C. \u00a7 586, establishes or clarifies the", "metadata": { "data_source": { "record_locator": { @@ -103,8 +103,8 @@ }, { "type": "CompositeElement", - "element_id": "20447c8f42ed2b919bd0e5707e7899ae", - "text": "s, creditors, other parties in interest, and the United States Trustee. The Handbook does not present a full and complete statement of the law; it", + "element_id": "40588c4c1489058c4fec885f4696ebcc", + "text": "s, creditors, other parties in interest, and the United States Trustee. The Handbook does not present a full and complete statement of the law; it", "metadata": { "data_source": { "record_locator": { @@ -127,8 +127,8 @@ }, { "type": "CompositeElement", - "element_id": "e34c56af21b43f4179f996ddea901bc4", - "text": "ment of the law; it should not be used as a substitute for legal research and analysis. The standing trustee must be familiar with relevant", + "element_id": "9ddf0b109cf940de5f575acc9d9758c8", + "text": "ment of the law; it should not be used as a substitute for legal research and analysis. The standing trustee must be familiar with relevant provisions", "metadata": { "data_source": { "record_locator": { @@ -151,8 +151,8 @@ }, { "type": "CompositeElement", - "element_id": "55e660e5b0d0ec6ee5476621e556d6c8", - "text": "iliar with relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11", + "element_id": "b7d1b42646393ca0f41af0e8ec48f9a9", + "text": "relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11 U.S.C. 
\u00a7 321,", "metadata": { "data_source": { "record_locator": { @@ -175,8 +175,8 @@ }, { "type": "CompositeElement", - "element_id": "a9335be161a6a7a080ff78e4e07cbadb", - "text": ", and case law. 11 U.S.C. \u00a7 321, 28 U.S.C. \u00a7 586, 28 C.F.R. \u00a7 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in", + "element_id": "9ee33f4141eca1f98ca4299d0fdfba31", + "text": "w. 11 U.S.C. \u00a7 321, 28 U.S.C. \u00a7 586, 28 C.F.R. \u00a7 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in this Handbook but", "metadata": { "data_source": { "record_locator": { @@ -199,8 +199,8 @@ }, { "type": "CompositeElement", - "element_id": "5f2d61a46e9d16ce346eacc25321a250", - "text": "Tips identified in this Handbook but these are not considered mandatory.", + "element_id": "6da3b5e2a833fa5ab6685f0fa46d2d6f", + "text": "n this Handbook but these are not considered mandatory.", "metadata": { "data_source": { "record_locator": { @@ -246,8 +246,8 @@ }, { "type": "CompositeElement", - "element_id": "2ff156994a8c58d8a5c91918a543ec28", - "text": "tcy Code and Rules, local rules, and orders of the court. The standing trustee should notify the United States Trustee whenever the provision of the", + "element_id": "685600ed24c5b0e3b34e7d639d3b1959", + "text": "tcy Code and Rules, local rules, and orders of the court. The standing trustee should notify the United States Trustee whenever the provision of the", "metadata": { "data_source": { "record_locator": { @@ -270,8 +270,8 @@ }, { "type": "CompositeElement", - "element_id": "7c43851f864b7ccc35150c93d06abe80", - "text": "he provision of the Handbook conflicts with the local rules or orders of the court. The standing trustee is accountable for all duties set forth in", + "element_id": "c998f5c10c9dac92e4d3624896a603c7", + "text": "he provision of the Handbook conflicts with the local rules or orders of the court. 
The standing trustee is accountable for all duties set forth in", "metadata": { "data_source": { "record_locator": { @@ -294,8 +294,8 @@ }, { "type": "CompositeElement", - "element_id": "7caf69b806daa033d686fae6100f4d7c", - "text": "duties set forth in this Handbook, but need not personally perform any duty unless otherwise indicated. All statutory references in this Handbook", + "element_id": "d4b750e9af7167156f369b310a8cebb8", + "text": "duties set forth in this Handbook, but need not personally perform any duty unless otherwise indicated. All statutory references in this Handbook", "metadata": { "data_source": { "record_locator": { @@ -365,8 +365,8 @@ }, { "type": "CompositeElement", - "element_id": "66ff9b9385d511ca7e71f1e6852d3221", - "text": "B.\tROLE OF THE UNITED STATES TRUSTEE", + "element_id": "8f411358790d6ee5b0d24f919206d3fd", + "text": "B. ROLE OF THE UNITED STATES TRUSTEE", "metadata": { "data_source": { "record_locator": { @@ -388,8 +388,8 @@ }, { "type": "CompositeElement", - "element_id": "1876c502fcbb25fd7b978417aea8dded", - "text": "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the responsibilities for daytoday administration of cases. Debtors, creditors,", + "element_id": "6044d58375609c8802cfae16cef5cee9", + "text": "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the responsibilities for daytoday administration of cases. 
Debtors, creditors, and", "metadata": { "data_source": { "record_locator": { @@ -411,8 +411,8 @@ }, { "type": "CompositeElement", - "element_id": "5f89702a93c3df34a62905e5dff5c54d", - "text": "Debtors, creditors, and third parties with adverse interests to the trustee were concerned that the court, which previously appointed and supervised", + "element_id": "a4030396eaf54570462ed74f86e45bc8", + "text": "ors, creditors, and third parties with adverse interests to the trustee were concerned that the court, which previously appointed and supervised the", "metadata": { "data_source": { "record_locator": { @@ -435,8 +435,8 @@ }, { "type": "CompositeElement", - "element_id": "c916e417ed924c556baed9616c3f81ae", - "text": "nted and supervised the trustee, would not impartially adjudicate their rights as adversaries of that trustee. To address these concerns, judicial and", + "element_id": "80e3b20fead224c85652bbdce327a28d", + "text": "and supervised the trustee, would not impartially adjudicate their rights as adversaries of that trustee. To address these concerns, judicial and", "metadata": { "data_source": { "record_locator": { @@ -483,8 +483,8 @@ }, { "type": "CompositeElement", - "element_id": "709927b67286cccaf8fb25d63667c277", - "text": "Many administrative functions formerly performed by the court were placed within the Department of Justice through the creation of the Program. Among", + "element_id": "39a3f1465d06269d2544ded43dc3a7df", + "text": "Many administrative functions formerly performed by the court were placed within the Department of Justice through the creation of the Program. Among", "metadata": { "data_source": { "record_locator": { @@ -506,8 +506,8 @@ }, { "type": "CompositeElement", - "element_id": "509676fb8d4f77b5f270629dee7a2664", - "text": "the Program. 
Among the administrative functions assigned to the United States Trustee were the appointment and supervision of chapter 13 trustees./", + "element_id": "2872e5d0bea6ec1523eb9ae2c1c64add", + "text": "the Program. Among the administrative functions assigned to the United States Trustee were the appointment and supervision of chapter 13 trustees./", "metadata": { "data_source": { "record_locator": { @@ -530,8 +530,8 @@ }, { "type": "CompositeElement", - "element_id": "7ced6d1ee6cc9478adfd8e2a613be42a", - "text": "apter 13 trustees./ This Handbook is issued under the authority of the Program\u2019s enabling statutes. ", + "element_id": "24e1076110b431b248b43b1fdaae5282", + "text": "apter 13 trustees./ This Handbook is issued under the authority of the Program\u2019s enabling statutes.", "metadata": { "data_source": { "record_locator": { @@ -554,8 +554,8 @@ }, { "type": "CompositeElement", - "element_id": "2c82d3fa4252275d5309a640eb25cd68", - "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t", + "element_id": "158a80e29cfe6aa83a4931d955a8fa4f", + "text": "C. STATUTORY DUTIES OF A STANDING TRUSTEE", "metadata": { "data_source": { "record_locator": { @@ -577,8 +577,8 @@ }, { "type": "CompositeElement", - "element_id": "a819e32a65d1f545cb404fe3f6273357", - "text": "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The standing trustee is more than a mere disbursing agent. The", + "element_id": "e5fdcc6a007017354a9d708dc04fee02", + "text": "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The standing trustee is more than a mere disbursing agent. The standing", "metadata": { "data_source": { "record_locator": { @@ -600,8 +600,8 @@ }, { "type": "CompositeElement", - "element_id": "9e98089003e3b42ed7f1c263335dee3c", - "text": "bursing agent. The standing trustee must be personally involved in the trustee operation. 
If the standing trustee is or becomes unable to perform", + "element_id": "0bf52e064da3ef4fb8b0a92d4b9fa694", + "text": "agent. The standing trustee must be personally involved in the trustee operation. If the standing trustee is or becomes unable to perform the duties", "metadata": { "data_source": { "record_locator": { @@ -624,8 +624,8 @@ }, { "type": "CompositeElement", - "element_id": "d476b15e5336342b1da22d100849b23c", - "text": "s unable to perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28", + "element_id": "db297530e558410b89acd93c6b452b84", + "text": "perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28 U.S.C. \u00a7", "metadata": { "data_source": { "record_locator": { @@ -648,8 +648,8 @@ }, { "type": "CompositeElement", - "element_id": "8f8c9c0919f7502bd2fabad0b12ad664", - "text": "States Trustee. 28 U.S.C. \u00a7 586(b), 28 C.F.R. \u00a7 58.4(b) referencing 28 C.F.R. \u00a7 58.3(b).", + "element_id": "201bfacc211f0eb640e2830b8c29ae41", + "text": "rustee. 28 U.S.C. \u00a7 586(b), 28 C.F.R. \u00a7 58.4(b) referencing 28 C.F.R. \u00a7 58.3(b).", "metadata": { "data_source": { "record_locator": { @@ -695,8 +695,8 @@ }, { "type": "CompositeElement", - "element_id": "9864d90bf9febdd104e7eac4c56689ba", - "text": "are set forth in 11 U.S.C. \u00a7 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. \u00a7 704. These duties", + "element_id": "fd4c45036e8f17c27271f75944389724", + "text": "are set forth in 11 U.S.C. \u00a7 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. \u00a7 704. These duties", "metadata": { "data_source": { "record_locator": { @@ -719,8 +719,8 @@ }, { "type": "CompositeElement", - "element_id": "a91f963bcd1c092bffb844453aafa499", - "text": "704. 
These duties include, but are not limited to, the following:", + "element_id": "a968d741409111b777fc123ef01f5407", + "text": "\u00a7 704. These duties include, but are not limited to, the following:", "metadata": { "data_source": { "record_locator": { diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 309e1c0c38..d2557b8499 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.11-dev0" # pragma: no cover +__version__ = "0.16.11-dev1" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index b91c3982ea..695393c55c 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -43,9 +43,6 @@ BoundaryPredicate: TypeAlias = Callable[[Element], bool] """Detects when element represents crossing a semantic boundary like section or page.""" -PreChunk: TypeAlias = "TablePreChunk | TextPreChunk" -"""The kind of object produced by a pre-chunker.""" - TextAndHtml: TypeAlias = tuple[str, str] @@ -288,8 +285,13 @@ def _iter_pre_chunks(self) -> Iterator[PreChunk]: pre_chunk_builder = PreChunkBuilder(self._opts) for element in self._elements: - # -- start new pre-chunk when necessary -- - if self._is_in_new_semantic_unit(element) or not pre_chunk_builder.will_fit(element): + # -- start new pre-chunk when necessary to uphold segregation guarantees -- + if ( + # -- start new pre-chunk when necessary to uphold segregation guarantees -- + self._is_in_new_semantic_unit(element) + # -- or when next element won't fit -- + or not pre_chunk_builder.will_fit(element) + ): yield from pre_chunk_builder.flush() # -- add this element to the work-in-progress (WIP) pre-chunk -- @@ -320,8 +322,7 @@ class PreChunkBuilder: the next element in the element stream. `.flush()` is used to build a PreChunk object from the accumulated elements. 
This method - returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is - used like so: + returns an iterator that generates zero-or-one `PreChunk` object and is used like so: yield from builder.flush() @@ -355,15 +356,13 @@ def flush(self) -> Iterator[PreChunk]: boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element stream. """ - if not self._elements: + elements = self._elements + + if not elements: return - pre_chunk = ( - TablePreChunk(self._elements[0], self._overlap_prefix, self._opts) - if isinstance(self._elements[0], Table) - # -- copy list, don't use original or it may change contents as builder proceeds -- - else TextPreChunk(list(self._elements), self._overlap_prefix, self._opts) - ) + # -- copy element list, don't use original or it may change contents as builder proceeds -- + pre_chunk = PreChunk(elements, self._overlap_prefix, self._opts) # -- clear builder before yield so we're not sensitive to the timing of how/when this # -- iterator is exhausted and can add elements for the next pre-chunk immediately. 
self._reset_state(pre_chunk.overlap_tail) @@ -384,12 +383,6 @@ def will_fit(self, element: Element) -> bool: # -- an empty pre-chunk will accept any element (including an oversized-element) -- if len(self._elements) == 0: return True - # -- a `Table` will not fit in a non-empty pre-chunk -- - if isinstance(element, Table): - return False - # -- no element will fit in a pre-chunk that already contains a `Table` element -- - if isinstance(self._elements[0], Table): - return False # -- a pre-chunk that already exceeds the soft-max is considered "full" -- if self._text_length > self._opts.soft_max: return False @@ -429,45 +422,67 @@ def _text_length(self) -> int: # ================================================================================================ -# PRE-CHUNK SUB-TYPES +# PRE-CHUNK # ================================================================================================ -class TablePreChunk: - """A pre-chunk composed of a single Table element.""" +class PreChunk: + """Sequence of elements staged to form a single chunk. - def __init__(self, table: Table, overlap_prefix: str, opts: ChunkingOptions) -> None: - self._table = table + This object is purposely immutable. 
+ """ + + def __init__( + self, elements: Iterable[Element], overlap_prefix: str, opts: ChunkingOptions + ) -> None: + self._elements = list(elements) self._overlap_prefix = overlap_prefix self._opts = opts - def iter_chunks(self) -> Iterator[Table | TableChunk]: - """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller.""" - # -- A table with no non-whitespace text produces no chunks -- - if not self._table_text: - return + def __eq__(self, other: Any) -> bool: + if not isinstance(other, PreChunk): + return False + return self._overlap_prefix == other._overlap_prefix and self._elements == other._elements - # -- only text-split a table when it's longer than the chunking window -- - maxlen = self._opts.hard_max - if len(self._text_with_overlap) <= maxlen and len(self._html) <= maxlen: - # -- use the compactified html for .text_as_html, even though we're not splitting -- - metadata = self._metadata - metadata.text_as_html = self._html or None - # -- note the overlap-prefix is prepended to its text -- - yield Table(text=self._text_with_overlap, metadata=metadata) - return + def can_combine(self, pre_chunk: PreChunk) -> bool: + """True when `pre_chunk` can be combined with this one without exceeding size limits.""" + if len(self._text) >= self._opts.combine_text_under_n_chars: + return False + # -- avoid duplicating length computations by doing a trial-combine which is just as + # -- efficient and definitely more robust than hoping two different computations of combined + # -- length continue to get the same answer as the code evolves. Only possible because + # -- `.combine()` is non-mutating. + combined_len = len(self.combine(pre_chunk)._text) - # -- When there's no HTML, split it like a normal element. Also fall back to text-only - # -- chunks when `max_characters` is less than 50. `.text_as_html` metadata is impractical - # -- for a chunking window that small because the 33 characterss of HTML overhead for each - # -- chunk (`
<table><tr><td>...</td></tr></table>
`) would produce a very large number of - # -- very small chunks. - if not self._html or self._opts.hard_max < 50: - yield from self._iter_text_only_table_chunks() - return + return combined_len <= self._opts.hard_max - # -- otherwise, form splits with "synchronized" text and html -- - yield from self._iter_text_and_html_table_chunks() + def combine(self, other_pre_chunk: PreChunk) -> PreChunk: + """Return new `PreChunk` that combines this and `other_pre_chunk`.""" + # -- combined pre-chunk gets the overlap-prefix of the first pre-chunk. The second overlap + # -- is automatically incorporated at the end of the first chunk, where it originated. + return PreChunk( + self._elements + other_pre_chunk._elements, + overlap_prefix=self._overlap_prefix, + opts=self._opts, + ) + + def iter_chunks(self) -> Iterator[CompositeElement | Table | TableChunk]: + """Form this pre-chunk into one or more chunk elements maxlen or smaller. + + When the total size of the pre-chunk will fit in the chunking window, a single chunk it + emitted. When this prechunk contains an oversized element (always isolated), it is split + into two or more chunks that each fit the chunking window. + """ + + # -- a one-table-only pre-chunk is handled specially, by `TablePreChunk`, mainly because + # -- it may need to be split into multiple `TableChunk` elements and that operation is + # -- quite specialized. + if len(self._elements) == 1 and isinstance(self._elements[0], Table): + yield from _TableChunker.iter_chunks( + self._elements[0], self._overlap_prefix, self._opts + ) + else: + yield from _Chunker.iter_chunks(self._elements, self._text, self._opts) @lazyproperty def overlap_tail(self) -> str: @@ -478,178 +493,66 @@ def overlap_tail(self) -> str: trailing whitespace. """ overlap = self._opts.inter_chunk_overlap - return self._text_with_overlap[-overlap:].strip() if overlap else "" - - @lazyproperty - def _html(self) -> str: - """The compactified HTML for this table when it has text-as-HTML. 
- - The empty string when table-structure has not been captured, perhaps because - `infer_table_structure` was set `False` in the partitioning call. - """ - if not (html_table := self._html_table): - return "" - - return html_table.html - - @lazyproperty - def _html_table(self) -> HtmlTable | None: - """The `lxml` HTML element object for this table. - - `None` when the `Table` element has no `.metadata.text_as_html`. - """ - if (text_as_html := self._table.metadata.text_as_html) is None: - return None - - text_as_html = text_as_html.strip() - if not text_as_html: # pragma: no cover - return None - - return HtmlTable.from_html_text(text_as_html) - - def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]: - """Split table into chunks where HTML corresponds exactly to text. - - `.metadata.text_as_html` for each chunk is a parsable `<table>` HTML fragment. - """ - if (html_table := self._html_table) is None: # pragma: no cover - raise ValueError("this method is undefined for a table having no .text_as_html") - - is_continuation = False - - for text, html in _TableSplitter.iter_subtables(html_table, self._opts): - metadata = self._metadata - metadata.text_as_html = html - # -- second and later chunks get `.metadata.is_continuation = True` -- - metadata.is_continuation = is_continuation or None - is_continuation = True - - yield TableChunk(text=text, metadata=metadata) - - def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]: - """Split oversized text-only table (no text-as-html) into chunks.""" - text_remainder = self._text_with_overlap - split = self._opts.split - is_continuation = False - - while text_remainder: - # -- split off the next chunk-worth of characters into a TableChunk -- - chunk_text, text_remainder = split(text_remainder) - metadata = self._metadata - # -- second and later chunks get `.metadata.is_continuation = True` -- - metadata.is_continuation = is_continuation or None - is_continuation = True - - yield TableChunk(text=chunk_text,
metadata=metadata) - - @property - def _metadata(self) -> ElementMetadata: - """The base `.metadata` value for chunks formed from this pre-chunk. + return self._text[-overlap:].strip() if overlap else "" - The term "base" here means that other metadata fields will be added, depending on the - chunk. In particular, `.metadata.text_as_html` will be different for each text-split chunk - and `.metadata.is_continuation` must be added for second-and-later text-split chunks. + def _iter_text_segments(self) -> Iterator[str]: + """Generate overlap text and each element text segment in order. - Note this is a fresh copy of the metadata on each call since it will need to be mutated - differently for each chunk formed from this pre-chunk. + Empty text segments are not included. """ - CS = ConsolidationStrategy - metadata = copy.deepcopy(self._table.metadata) - - # -- drop metadata fields not appropriate for chunks, in particular - # -- parent_id's will not reliably point to an existing element - drop_field_names = [ - field_name - for field_name, strategy in CS.field_consolidation_strategies().items() - if strategy is CS.DROP - ] - for field_name in drop_field_names: - setattr(metadata, field_name, None) - - if self._opts.include_orig_elements: - metadata.orig_elements = self._orig_elements - return metadata + if self._overlap_prefix: + yield self._overlap_prefix + for e in self._elements: + text = " ".join(e.text.strip().split()) + if not text: + continue + yield text @lazyproperty - def _orig_elements(self) -> list[Element]: - """The `.metadata.orig_elements` value for chunks formed from this pre-chunk. + def _text(self) -> str: + """The concatenated text of all elements in this pre-chunk, including any overlap. - Note this is not just the `Table` element, it must be adjusted to strip out any - `.metadata.orig_elements` value it may have when it is itself a chunk and not a direct - product of partitioning. + Whitespace is normalized to a single space. 
The text of each element is separated from + that of the next by a blank line ("\n\n"). """ - # -- make a copy because we're going to mutate the `Table` element and it doesn't belong to - # -- us (the user may have downstream purposes for it). - orig_table = copy.deepcopy(self._table) - # -- prevent recursive .orig_elements when `Table` element is a chunk -- - orig_table.metadata.orig_elements = None - return [orig_table] - - @lazyproperty - def _table_text(self) -> str: - """The text in this table, not including any overlap-prefix or extra whitespace.""" - return " ".join(self._table.text.split()) + return self._opts.text_separator.join(self._iter_text_segments()) - @lazyproperty - def _text_with_overlap(self) -> str: - """The text for this chunk, including the overlap-prefix when present.""" - overlap_prefix = self._overlap_prefix - table_text = self._table.text.strip() - # -- use row-separator between overlap and table-text -- - return overlap_prefix + "\n" + table_text if overlap_prefix else table_text +# ================================================================================================ +# CHUNKING HELPER/SPLITTERS +# ================================================================================================ -class TextPreChunk: - """A sequence of elements that belong to the same semantic unit within a document. - The name "section" derives from the idea of a document-section, a heading followed by the - paragraphs "under" that heading. That structure is not found in all documents and actual section - content can vary, but that's the concept. +class _Chunker: + """Forms chunks from a pre-chunk other than one containing only a `Table`. - This object is purposely immutable. + Produces zero-or-more `CompositeElement` objects. 
""" - def __init__( - self, elements: Iterable[Element], overlap_prefix: str, opts: ChunkingOptions - ) -> None: + def __init__(self, elements: Iterable[Element], text: str, opts: ChunkingOptions) -> None: self._elements = list(elements) - self._overlap_prefix = overlap_prefix + self._text = text self._opts = opts - def __eq__(self, other: Any) -> bool: - if not isinstance(other, TextPreChunk): - return False - return self._overlap_prefix == other._overlap_prefix and self._elements == other._elements - - def can_combine(self, pre_chunk: TextPreChunk) -> bool: - """True when `pre_chunk` can be combined with this one without exceeding size limits.""" - if len(self._text) >= self._opts.combine_text_under_n_chars: - return False - # -- avoid duplicating length computations by doing a trial-combine which is just as - # -- efficient and definitely more robust than hoping two different computations of combined - # -- length continue to get the same answer as the code evolves. Only possible because - # -- `.combine()` is non-mutating. - combined_len = len(self.combine(pre_chunk)._text) - - return combined_len <= self._opts.hard_max + @classmethod + def iter_chunks( + cls, elements: Iterable[Element], text: str, opts: ChunkingOptions + ) -> Iterator[CompositeElement]: + """Form zero or more chunks from `elements`. - def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk: - """Return new `TextPreChunk` that combines this and `other_pre_chunk`.""" - # -- combined pre-chunk gets the overlap-prefix of the first pre-chunk. The second overlap - # -- is automatically incorporated at the end of the first chunk, where it originated. - return TextPreChunk( - self._elements + other_pre_chunk._elements, - overlap_prefix=self._overlap_prefix, - opts=self._opts, - ) + One `CompositeElement` is produced when all `elements` will fit. Otherwise there is a + single `Text`-subtype element and chunks are formed by splitting. 
+ """ + return cls(elements, text, opts)._iter_chunks() - def iter_chunks(self) -> Iterator[CompositeElement]: - """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller.""" + def _iter_chunks(self) -> Iterator[CompositeElement]: + """Form zero or more chunks from `elements`.""" # -- a pre-chunk containing no text (maybe only a PageBreak element for example) does not # -- generate any chunks. if not self._text: return + # -- `split()` is the text-splitting function used to split an oversized element -- split = self._opts.split # -- emit first chunk -- @@ -662,17 +565,6 @@ def iter_chunks(self) -> Iterator[CompositeElement]: s, remainder = split(remainder) yield CompositeElement(text=s, metadata=self._continuation_metadata) - @lazyproperty - def overlap_tail(self) -> str: - """The portion of this chunk's text to be repeated as a prefix in the next chunk. - - This value is the empty-string ("") when either the `.overlap` length option is `0` or - `.overlap_all` is `False`. When there is a text value, it is stripped of both leading and - trailing whitespace. - """ - overlap = self._opts.inter_chunk_overlap - return self._text[-overlap:].strip() if overlap else "" - @lazyproperty def _all_metadata_values(self) -> dict[str, list[Any]]: """Collection of all populated metadata values across elements. @@ -738,18 +630,6 @@ def _continuation_metadata(self) -> ElementMetadata: continuation_metadata.is_continuation = True return continuation_metadata - def _iter_text_segments(self) -> Iterator[str]: - """Generate overlap text and each element text segment in order. - - Empty text segments are not included. - """ - if self._overlap_prefix: - yield self._overlap_prefix - for e in self._elements: - if not e.text: - continue - yield e.text - @lazyproperty def _meta_kwargs(self) -> dict[str, Any]: """The consolidated metadata values as a dict suitable for constructing ElementMetadata. 
@@ -806,22 +686,183 @@ def iter_orig_elements(): return list(iter_orig_elements()) + +class _TableChunker: + """Responsible for forming chunks, especially splits, from a single-table pre-chunk. + + Table splitting is specialized because we recursively split on an even row, cell, text + boundary. This object encapsulate those details. + """ + + def __init__(self, table: Table, overlap_prefix: str, opts: ChunkingOptions) -> None: + self._table = table + self._overlap_prefix = overlap_prefix + self._opts = opts + + @classmethod + def iter_chunks( + cls, table: Table, overlap_prefix: str, opts: ChunkingOptions + ) -> Iterator[Table | TableChunk]: + """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller.""" + return cls(table, overlap_prefix, opts)._iter_chunks() + + def _iter_chunks(self) -> Iterator[Table | TableChunk]: + """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller.""" + # -- A table with no non-whitespace text produces no chunks -- + if not self._table_text: + return + + # -- only text-split a table when it's longer than the chunking window -- + maxlen = self._opts.hard_max + if len(self._text_with_overlap) <= maxlen and len(self._html) <= maxlen: + # -- use the compactified html for .text_as_html, even though we're not splitting -- + metadata = self._metadata + metadata.text_as_html = self._html or None + # -- note the overlap-prefix is prepended to its text -- + yield Table(text=self._text_with_overlap, metadata=metadata) + return + + # -- When there's no HTML, split it like a normal element. Also fall back to text-only + # -- chunks when `max_characters` is less than 50. `.text_as_html` metadata is impractical + # -- for a chunking window that small because the 33 characters of HTML overhead for each + # -- chunk (`
<table><tr><td>...</td></tr></table>
`) would produce a very large number of + # -- very small chunks. + if not self._html or self._opts.hard_max < 50: + yield from self._iter_text_only_table_chunks() + return + + # -- otherwise, form splits with "synchronized" text and html -- + yield from self._iter_text_and_html_table_chunks() + @lazyproperty - def _text(self) -> str: - """The concatenated text of all elements in this pre-chunk. + def _html(self) -> str: + """The compactified HTML for this table when it has text-as-HTML. + + The empty string when table-structure has not been captured, perhaps because + `infer_table_structure` was set `False` in the partitioning call. + """ + if not (html_table := self._html_table): + return "" + + return html_table.html + + @lazyproperty + def _html_table(self) -> HtmlTable | None: + """The `lxml` HTML element object for this table. - Each element-text is separated from the next by a blank line ("\n\n"). + `None` when the `Table` element has no `.metadata.text_as_html`. """ - text_separator = self._opts.text_separator - return text_separator.join(self._iter_text_segments()) + if (text_as_html := self._table.metadata.text_as_html) is None: + return None + + text_as_html = text_as_html.strip() + if not text_as_html: # pragma: no cover + return None + + return HtmlTable.from_html_text(text_as_html) + + def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]: + """Split table into chunks where HTML corresponds exactly to text. + + `.metadata.text_as_html` for each chunk is a parsable `<table>` HTML fragment.
+ """ + if (html_table := self._html_table) is None: # pragma: no cover + raise ValueError("this method is undefined for a table having no .text_as_html") + + is_continuation = False + + for text, html in _HtmlTableSplitter.iter_subtables(html_table, self._opts): + metadata = self._metadata + metadata.text_as_html = html + # -- second and later chunks get `.metadata.is_continuation = True` -- + metadata.is_continuation = is_continuation or None + is_continuation = True + + yield TableChunk(text=text, metadata=metadata) + + def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]: + """Split oversized text-only table (no text-as-html) into chunks. + + `.metadata.text_as_html` is optional, not included when `infer_table_structure` is + `False`. + """ + text_remainder = self._text_with_overlap + split = self._opts.split + is_continuation = False + + while text_remainder: + # -- split off the next chunk-worth of characters into a TableChunk -- + chunk_text, text_remainder = split(text_remainder) + metadata = self._metadata + # -- second and later chunks get `.metadata.is_continuation = True` -- + metadata.is_continuation = is_continuation or None + is_continuation = True + + yield TableChunk(text=chunk_text, metadata=metadata) + + @property + def _metadata(self) -> ElementMetadata: + """The base `.metadata` value for chunks formed from this pre-chunk. + + The term "base" here means that other metadata fields will be added, depending on the + chunk. In particular, `.metadata.text_as_html` will be different for each text-split chunk + and `.metadata.is_continuation` must be added for second-and-later text-split chunks. + + Note this is a fresh copy of the metadata on each call since it will need to be mutated + differently for each chunk formed from this pre-chunk. 
+ """ + CS = ConsolidationStrategy + metadata = copy.deepcopy(self._table.metadata) + + # -- drop metadata fields not appropriate for chunks, in particular + # -- parent_id's will not reliably point to an existing element + drop_field_names = [ + field_name + for field_name, strategy in CS.field_consolidation_strategies().items() + if strategy is CS.DROP + ] + for field_name in drop_field_names: + setattr(metadata, field_name, None) + + if self._opts.include_orig_elements: + metadata.orig_elements = self._orig_elements + return metadata + + @lazyproperty + def _orig_elements(self) -> list[Element]: + """The `.metadata.orig_elements` value for chunks formed from this pre-chunk. + + Note this is not just the `Table` element, it must be adjusted to strip out any + `.metadata.orig_elements` value it may have when it is itself a chunk and not a direct + product of partitioning. + """ + # -- make a copy because we're going to mutate the `Table` element and it doesn't belong to + # -- us (the user may have downstream purposes for it). 
+ orig_table = copy.deepcopy(self._table) + # -- prevent recursive .orig_elements when `Table` element is a chunk -- + orig_table.metadata.orig_elements = None + return [orig_table] + + @lazyproperty + def _table_text(self) -> str: + """The text in this table, not including any overlap-prefix or extra whitespace.""" + return " ".join(self._table.text.split()) + + @lazyproperty + def _text_with_overlap(self) -> str: + """The text for this chunk, including the overlap-prefix when present.""" + overlap_prefix = self._overlap_prefix + table_text = self._table.text.strip() + # -- use row-separator between overlap and table-text -- + return overlap_prefix + "\n" + table_text if overlap_prefix else table_text # ================================================================================================ -# PRE-CHUNK SPLITTERS +# HTML SPLITTERS # ================================================================================================ -class _TableSplitter: +class _HtmlTableSplitter: """Produces (text, html) pairs for a `
<table>` HtmlElement. Each chunk contains a whole number of rows whenever possible. An oversized row is split on an @@ -1040,7 +1081,7 @@ def flush(self) -> Iterator[TextAndHtml]: def will_fit(self, cell: HtmlCell) -> bool: """True when `cell` will fit within remaining space left by accummulated cells.""" - return self._remaining_space >= len(cell.html) + return self._remaining_space >= len(cell.text) def _iter_cell_texts(self) -> Iterator[str]: """Generate contents of each accumulated cell as a separate string. @@ -1054,10 +1095,11 @@ def _iter_cell_texts(self) -> Iterator[str]: @property def _remaining_space(self) -> int: - """Number of characters remaining when accumulated cells are formed into HTML.""" - # -- 24 is `len("
<table><tr></tr></table>")`, the overhead in addition to `<td>` - # -- HTML fragments - return self._maxlen - 24 - sum(len(c.html) for c in self._cells) + """Number of characters remaining when text of accumulated cells is joined.""" + # -- separators are one space (" ") at the end of each cell's text, including last one to + # -- account for space before prospective next cell. + separators_len = len(self._cells) + return self._maxlen - separators_len - sum(len(c.text) for c in self._cells) class _RowAccumulator: @@ -1087,7 +1129,7 @@ def flush(self) -> Iterator[TextAndHtml]: def will_fit(self, row: HtmlRow) -> bool: """True when `row` will fit within remaining space left by accummulated rows.""" - return self._remaining_space >= len(row.html) + return self._remaining_space >= row.text_len def _iter_cell_texts(self) -> Iterator[str]: """Generate contents of each row cell as a separate string. @@ -1100,8 +1142,10 @@ def _iter_cell_texts(self) -> Iterator[str]: @property def _remaining_space(self) -> int: """Number of characters remaining when accumulated rows are formed into HTML.""" - # -- 15 is `len("
<table></table>")`, the overhead in addition to `<tr>` HTML fragments -- - return self._maxlen - 15 - sum(len(r.html) for r in self._rows) + # -- separators are one space (" ") at the end of each row's text, including last one to + # -- account for space before prospective next row. + separators_len = len(self._rows) + return self._maxlen - separators_len - sum(r.text_len for r in self._rows) # ================================================================================================ @@ -1117,16 +1161,10 @@ def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions): self._opts = opts def iter_combined_pre_chunks(self) -> Iterator[PreChunk]: - """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window.""" - accum = TextPreChunkAccumulator(self._opts) + """Generate pre-chunk objects, combining `PreChunk` objects when they'll fit in window.""" + accum = _PreChunkAccumulator(self._opts) for pre_chunk in self._pre_chunks: - # -- a table pre-chunk is never combined -- - if isinstance(pre_chunk, TablePreChunk): - yield from accum.flush() - yield pre_chunk - continue - # -- finish accumulating pre-chunk when it's full -- if not accum.will_fit(pre_chunk): yield from accum.flush() @@ -1136,39 +1174,37 @@ def iter_combined_pre_chunks(self) -> Iterator[PreChunk]: yield from accum.flush() -class TextPreChunkAccumulator: - """Accumulates, measures, and combines text pre-chunks. +class _PreChunkAccumulator: + """Accumulates, measures, and combines pre-chunks. Used for combining pre-chunks for chunking strategies like "by-title" that can potentially - produce undersized chunks and offer the `combine_text_under_n_chars` option. Note that only - sequential `TextPreChunk` objects can be combined. A `TablePreChunk` is never combined with - another pre-chunk. + produce undersized chunks and offer the `combine_text_under_n_chars` option.
Provides `.add_pre_chunk()` allowing a pre-chunk to be added to the chunk and provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding whether to add another pre-chunk. - `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object. - This method returns an interator that generates zero-or-one `TextPreChunk` objects and is used + `.flush()` is used to combine the accumulated pre-chunks into a single `PreChunk` object. + This method returns an interator that generates zero-or-one `PreChunk` objects and is used like so: yield from accum.flush() - If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the builder - clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk. + If no pre-chunks have been accumulated, no `PreChunk` is generated. Flushing the builder + clears the pre-chunks it contains so it is ready to accept the next pre-chunk. """ def __init__(self, opts: ChunkingOptions) -> None: self._opts = opts - self._pre_chunk: TextPreChunk | None = None + self._pre_chunk: PreChunk | None = None - def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None: + def add_pre_chunk(self, pre_chunk: PreChunk) -> None: """Add a pre-chunk to the accumulator for possible combination with next pre-chunk.""" self._pre_chunk = ( pre_chunk if self._pre_chunk is None else self._pre_chunk.combine(pre_chunk) ) - def flush(self) -> Iterator[TextPreChunk]: + def flush(self) -> Iterator[PreChunk]: """Generate accumulated pre-chunk as a single combined pre-chunk. Does not generate a pre-chunk when none has been accumulated. @@ -1181,7 +1217,7 @@ def flush(self) -> Iterator[TextPreChunk]: # -- and reset the accumulator (to empty) -- self._pre_chunk = None - def will_fit(self, pre_chunk: TextPreChunk) -> bool: + def will_fit(self, pre_chunk: PreChunk) -> bool: """True when there is room for `pre_chunk` in accumulator. An empty accumulator always has room. 
Otherwise there is only room when `pre_chunk` can be @@ -1206,7 +1242,7 @@ def will_fit(self, pre_chunk: TextPreChunk) -> bool: # predicate. # # These can be mixed and matched to produce different chunking behaviors like "by_title" or left -# out altogether to produce "by_element" behavior. +# out altogether to produce "basic-chunking" behavior. # # The effective lifetime of the function that produce a predicate (rather than directly being one) # is limited to a single element-stream because these retain state (e.g. current page number) to diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py index 5ddcf55928..a441e5a57b 100644 --- a/unstructured/common/html_table.py +++ b/unstructured/common/html_table.py @@ -136,11 +136,15 @@ def iter_cell_texts(self) -> Iterator[str]: for td in self._tr: if (text := td.text) is None: continue - text = text.strip() if not text: continue yield text + @lazyproperty + def text_len(self) -> int: + """Length of the normalized text, as it would appear in `element.text`.""" + return len(" ".join(self.iter_cell_texts())) + class HtmlCell: """A `<td>` element.""" @@ -158,4 +162,4 @@ def text(self) -> str: """Text inside `<td>` element, empty string when no text.""" if (text := self._td.text) is None: return "" - return text.strip() + return " ".join(text.strip().split())