Skip to content

Commit

Permalink
Merge branch 'main' into dalle-image-gen
Browse files Browse the repository at this point in the history
  • Loading branch information
sjrl authored Nov 14, 2024
2 parents daee756 + e5a8072 commit 8eaf28a
Show file tree
Hide file tree
Showing 22 changed files with 417 additions and 24 deletions.
2 changes: 1 addition & 1 deletion docs/pydoc/config/rankers_api.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/rankers]
modules: ["lost_in_the_middle", "meta_field", "transformers_similarity", "sentence_transformers_diversity"]
modules: ["lost_in_the_middle", "meta_field", "meta_field_grouping_ranker", "transformers_similarity", "sentence_transformers_diversity"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
Expand Down
12 changes: 9 additions & 3 deletions haystack/components/converters/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class PDFMinerToDocument:
```
"""

def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
line_overlap: float = 0.5,
char_margin: float = 2.0,
Expand Down Expand Up @@ -92,7 +92,7 @@ def __init__(
all_texts=all_texts,
)

def __converter(self, extractor) -> Document:
def _converter(self, extractor) -> Document:
"""
Extracts text from PDF pages then convert the text into Documents
Expand Down Expand Up @@ -151,13 +151,19 @@ def run(
continue
try:
pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
document = self.__converter(pdf_reader)
document = self._converter(pdf_reader)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
)
continue

if document.content is None or document.content.strip() == "":
logger.warning(
"PDFMinerToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
)

merged_metadata = {**bytestream.meta, **metadata}
document.meta = merged_metadata
documents.append(document)
Expand Down
10 changes: 8 additions & 2 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ def from_dict(cls, data):
:returns:
Deserialized component.
"""
custom_converter_data = data["init_parameters"]["converter"]
init_parameters = data.get("init_parameters", {})
custom_converter_data = init_parameters.get("converter")
if custom_converter_data is not None:
data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)

return default_from_dict(cls, data)

def _default_convert(self, reader: "PdfReader") -> Document:
Expand Down Expand Up @@ -142,6 +142,12 @@ def run(
)
continue

if document.content is None or document.content.strip() == "":
logger.warning(
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
)

merged_metadata = {**bytestream.meta, **metadata}
document.meta = merged_metadata
documents.append(document)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def __init__(
progress_bar: bool = True,
meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n",
):
): # pylint: disable=too-many-positional-arguments
"""
Creates a HuggingFaceAPIDocumentEmbedder component.
Expand Down Expand Up @@ -168,7 +168,7 @@ def __init__(
model_or_url = url
else:
msg = f"Unknown api_type {api_type}"
raise ValueError(api_type)
raise ValueError(msg)

self.api_type = api_type
self.api_params = api_params
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def __init__(
suffix: str = "",
truncate: bool = True,
normalize: bool = False,
):
): # pylint: disable=too-many-positional-arguments
"""
Creates a HuggingFaceAPITextEmbedder component.
Expand Down Expand Up @@ -138,7 +138,7 @@ def __init__(
model_or_url = url
else:
msg = f"Unknown api_type {api_type}"
raise ValueError()
raise ValueError(msg)

self.api_type = api_type
self.api_params = api_params
Expand Down
4 changes: 2 additions & 2 deletions haystack/components/generators/chat/hugging_face_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ class HuggingFaceAPIChatGenerator:
```
"""

def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
api_type: Union[HFGenerationAPIType, str],
api_params: Dict[str, str],
Expand Down Expand Up @@ -162,7 +162,7 @@ def __init__(
model_or_url = url
else:
msg = f"Unknown api_type {api_type}"
raise ValueError(api_type)
raise ValueError(msg)

# handle generation kwargs setup
generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
Expand Down
14 changes: 12 additions & 2 deletions haystack/components/generators/chat/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import copy
import json
import os
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Union

from openai import OpenAI, Stream
Expand Down Expand Up @@ -63,7 +64,7 @@ class OpenAIChatGenerator:
```
"""

def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
model: str = "gpt-4o-mini",
Expand Down Expand Up @@ -222,11 +223,15 @@ def run(
raise ValueError("Cannot stream multiple responses, please set n=1.")
chunks: List[StreamingChunk] = []
chunk = None
_first_token = True

# pylint: disable=not-an-iterable
for chunk in chat_completion:
if chunk.choices and streaming_callback:
chunk_delta: StreamingChunk = self._build_chunk(chunk)
if _first_token:
_first_token = False
chunk_delta.meta["completion_start_time"] = datetime.now().isoformat()
chunks.append(chunk_delta)
streaming_callback(chunk_delta) # invoke callback with the chunk_delta
completions = [self._connect_chunks(chunk, chunks)]
Expand Down Expand Up @@ -280,7 +285,12 @@ def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessa
payload["function"]["arguments"] += delta.arguments or ""
complete_response = ChatMessage.from_assistant(json.dumps(payloads))
else:
complete_response = ChatMessage.from_assistant("".join([chunk.content for chunk in chunks]))
total_content = ""
total_meta = {}
for streaming_chunk in chunks:
total_content += streaming_chunk.content
total_meta.update(streaming_chunk.meta)
complete_response = ChatMessage.from_assistant(total_content, meta=total_meta)
complete_response.meta.update(
{
"model": chunk.model,
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/generators/hugging_face_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
model_or_url = url
else:
msg = f"Unknown api_type {api_type}"
raise ValueError(api_type)
raise ValueError(msg)

# handle generation kwargs setup
generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
Expand Down
10 changes: 8 additions & 2 deletions haystack/components/preprocessors/document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@

from more_itertools import windowed

from haystack import Document, component
from haystack import Document, component, logging
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.utils import deserialize_callable, serialize_callable

logger = logging.getLogger(__name__)


@component
class DocumentSplitter:
Expand Down Expand Up @@ -46,7 +48,7 @@ class DocumentSplitter:
```
"""

def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
split_by: Literal["function", "page", "passage", "sentence", "word"] = "word",
split_length: int = 200,
Expand Down Expand Up @@ -112,6 +114,9 @@ def run(self, documents: List[Document]):
raise ValueError(
f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
)
if doc.content == "":
logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
continue
units = self._split_into_units(doc.content, self.split_by)
text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
units, self.split_length, self.split_overlap, self.split_threshold
Expand Down Expand Up @@ -173,6 +178,7 @@ def _concatenate_units(
# concatenate the last split with the current one
text_splits[-1] += txt

# NOTE: This line skips documents that have content=""
elif len(txt) > 0:
text_splits.append(txt)
splits_pages.append(cur_page)
Expand Down
2 changes: 2 additions & 0 deletions haystack/components/rankers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

from haystack.components.rankers.lost_in_the_middle import LostInTheMiddleRanker
from haystack.components.rankers.meta_field import MetaFieldRanker
from haystack.components.rankers.meta_field_grouping_ranker import MetaFieldGroupingRanker
from haystack.components.rankers.sentence_transformers_diversity import SentenceTransformersDiversityRanker
from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker

__all__ = [
"LostInTheMiddleRanker",
"MetaFieldRanker",
"MetaFieldGroupingRanker",
"SentenceTransformersDiversityRanker",
"TransformersSimilarityRanker",
]
118 changes: 118 additions & 0 deletions haystack/components/rankers/meta_field_grouping_ranker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from collections import defaultdict
from typing import Any, Dict, List, Optional, cast

from haystack import Document, component, logging

logger = logging.getLogger(__name__)


@component
class MetaFieldGroupingRanker:
    """
    Reorders the documents by grouping them based on metadata keys.

    The MetaFieldGroupingRanker can group documents by a primary metadata key `group_by`, and subgroup them with an
    optional secondary key, `subgroup_by`.
    Within each group or subgroup, it can also sort documents by a metadata key `sort_docs_by`.

    The output is a flat list of documents ordered by `group_by` and `subgroup_by` values.
    Any documents without a group are placed at the end of the list.

    The proper organization of documents helps improve the efficiency and performance of subsequent processing by an LLM.

    ### Usage example

    ```python
    from haystack.components.rankers import MetaFieldGroupingRanker
    from haystack.dataclasses import Document


    docs = [
        Document(content="Javascript is a popular programming language", meta={"group": "42", "split_id": 7, "subgroup": "subB"}),
        Document(content="Python is a popular programming language",meta={"group": "42", "split_id": 4, "subgroup": "subB"}),
        Document(content="A chromosome is a package of DNA", meta={"group": "314", "split_id": 2, "subgroup": "subC"}),
        Document(content="An octopus has three hearts", meta={"group": "11", "split_id": 2, "subgroup": "subD"}),
        Document(content="Java is a popular programming language", meta={"group": "42", "split_id": 3, "subgroup": "subB"})
    ]

    ranker = MetaFieldGroupingRanker(group_by="group",subgroup_by="subgroup", sort_docs_by="split_id")
    result = ranker.run(documents=docs)
    print(result["documents"])
    # [
    #     Document(id=d665bbc83e52c08c3d8275bccf4f22bf2bfee21c6e77d78794627637355b8ebc,
    #              content: 'Java is a popular programming language', meta: {'group': '42', 'split_id': 3, 'subgroup': 'subB'}),
    #     Document(id=a20b326f07382b3cbf2ce156092f7c93e8788df5d48f2986957dce2adb5fe3c2,
    #              content: 'Python is a popular programming language', meta: {'group': '42', 'split_id': 4, 'subgroup': 'subB'}),
    #     Document(id=ce12919795d22f6ca214d0f161cf870993889dcb146f3bb1b3e1ffdc95be960f,
    #              content: 'Javascript is a popular programming language', meta: {'group': '42', 'split_id': 7, 'subgroup': 'subB'}),
    #     Document(id=d9fc857046c904e5cf790b3969b971b1bbdb1b3037d50a20728fdbf82991aa94,
    #              content: 'A chromosome is a package of DNA', meta: {'group': '314', 'split_id': 2, 'subgroup': 'subC'}),
    #     Document(id=6d3b7bdc13d09aa01216471eb5fb0bfdc53c5f2f3e98ad125ff6b85d3106c9a3,
    #              content: 'An octopus has three hearts', meta: {'group': '11', 'split_id': 2, 'subgroup': 'subD'})
    # ]
    ```
    """  # noqa: E501

    def __init__(self, group_by: str, subgroup_by: Optional[str] = None, sort_docs_by: Optional[str] = None):
        """
        Creates an instance of MetaFieldGroupingRanker.

        :param group_by: The metadata key to aggregate the documents by.
        :param subgroup_by: The metadata key to aggregate the documents within a group that was created by the
                            `group_by` key.
        :param sort_docs_by: Determines which metadata key is used to sort the documents. If not provided, the
                             documents within the groups or subgroups are not sorted and are kept in the same order as
                             they were inserted in the subgroups.
        """
        self.group_by = group_by
        self.sort_docs_by = sort_docs_by
        self.subgroup_by = subgroup_by

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, Any]:
        """
        Groups the provided list of documents based on the `group_by` parameter and optionally the `subgroup_by`.

        The output is a list of documents reordered based on how they were grouped.

        :param documents: The list of documents to group.
        :returns:
            A dictionary with the following keys:
            - documents: The list of documents ordered by the `group_by` and `subgroup_by` metadata values.
        """
        if not documents:
            return {"documents": []}

        # Two-level mapping: group value -> subgroup value -> documents, in first-seen order
        # (defaultdict + dict insertion order preserve the original group ordering).
        document_groups: Dict[str, Dict[str, List[Document]]] = defaultdict(lambda: defaultdict(list))
        no_group_docs = []

        for doc in documents:
            # Documents whose group meta value is missing or falsy (e.g. "") are kept aside
            # and appended at the end of the output, unsorted.
            group_value = str(doc.meta.get(self.group_by, ""))

            if group_value:
                subgroup_value = "no_subgroup"
                if self.subgroup_by and self.subgroup_by in doc.meta:
                    subgroup_value = doc.meta[self.subgroup_by]

                document_groups[group_value][subgroup_value].append(doc)
            else:
                no_group_docs.append(doc)

        ordered_docs = []
        for group in document_groups:
            for subgroup in document_groups[group]:
                docs = document_groups[group][subgroup]
                if self.sort_docs_by:
                    # Documents missing the sort key are pushed to the end of their (sub)group.
                    docs.sort(key=lambda d: d.meta.get(cast(str, self.sort_docs_by), float("inf")))
                ordered_docs.extend(docs)

        ordered_docs.extend(no_group_docs)

        return {"documents": ordered_docs}
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ format-check = "ruff format --check {args}"

[tool.hatch.envs.test]
extra-dependencies = [
"numpy>=2", # Haystack is compatible both with numpy 1.x and 2.x, but we test with 2.x
"numpy>=2", # Haystack is compatible both with numpy 1.x and 2.x, but we test with 2.x

"transformers[torch,sentencepiece]==4.44.2", # ExtractiveReader, TransformersSimilarityRanker, LocalWhisperTranscriber, HFGenerators...
"huggingface_hub>=0.23.0", # Hugging Face API Generators and Embedders
Expand Down Expand Up @@ -118,7 +118,7 @@ extra-dependencies = [

# Tracing
"opentelemetry-sdk",
"ddtrace==2.15.0rc2",
"ddtrace",

# Structured logging
"structlog",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
features:
- |
Add warning logs to the PDFMinerToDocument and PyPDFToDocument to indicate when a processed PDF file has no content.
This can happen if the PDF file is a scanned image.
Also added an explicit check and warning message to the DocumentSplitter that warns the user that empty Documents are skipped.
This behavior was already occurring, but now it's clearer through the logs that this is happening.
4 changes: 4 additions & 0 deletions releasenotes/notes/add-metadata-grouper-21ec05fd4a307425.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
We have added a new MetaFieldGroupingRanker component that reorders documents by grouping them based on metadata keys. This can be useful for pre-processing Documents before feeding them to an LLM.
6 changes: 6 additions & 0 deletions releasenotes/notes/openai-ttft-42b1ad551b542930.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
features:
- |
Add TTFT (Time-to-First-Token) support for OpenAI generators. This
captures the time taken to generate the first token from the model and
can be used to analyze the latency of the application.
Loading

0 comments on commit 8eaf28a

Please sign in to comment.