Skip to content

Commit

Permalink
refactor: Add support for new filters declaration (#6397)
Browse files Browse the repository at this point in the history
* Rework filter logic for InMemoryDocumentStore to support new filters
declaration

* Fix legacy filters tests

* Simplify logic and handle dates comparison

* Rework MetadataRouter to support new filters

* Update docstrings

* Add release notes

* Fix linting

* Avoid duplicating filters specifications

* Handle corner case

* Simplify docstring

* Fix filters logic and tests

* Fix Document Store testing legacy filters tests
  • Loading branch information
silvanocerza authored Nov 24, 2023
1 parent 28c2b09 commit fd16ec6
Show file tree
Hide file tree
Showing 10 changed files with 872 additions and 879 deletions.
2 changes: 1 addition & 1 deletion e2e/preview/pipelines/test_preprocessing_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path):
preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
preprocessing_pipeline.add_component(
instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router"
instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router"
)
preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
preprocessing_pipeline.add_component(
Expand Down
41 changes: 34 additions & 7 deletions haystack/preview/components/routers/metadata_router.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Dict, List

from haystack.preview import component, Document
from haystack.preview.utils.filters import document_matches_filter
from haystack.preview.utils.filters import document_matches_filter, convert


@component
Expand All @@ -19,12 +19,36 @@ def __init__(self, rules: Dict[str, Dict]):
follow the format of filtering expressions in Haystack. For example:
```python
{
"edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}},
"edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}},
"edge_3": {"created_at": {"$gte": "2023-07-01", "$lt": "2023-10-01"}},
"edge_4": {"created_at": {"$gte": "2023-10-01", "$lt": "2024-01-01"}},
}
```
"edge_1": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
],
},
"edge_2": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
],
},
"edge_3": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-10-01"},
],
},
"edge_4": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"},
{"field": "meta.created_at", "operator": "<", "value": "2024-01-01"},
],
},
}
```
"""
self.rules = rules
component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})
Expand All @@ -43,6 +67,9 @@ def run(self, documents: List[Document]):
for document in documents:
cur_document_matched = False
for edge, rule in self.rules.items():
if "operator" not in rule:
# Must be a legacy filter, convert it
rule = convert(rule)
if document_matches_filter(rule, document):
output[edge].append(document)
cur_document_matched = True
Expand Down
82 changes: 15 additions & 67 deletions haystack/preview/document_stores/in_memory/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from haystack.preview.document_stores.decorator import document_store
from haystack.preview.dataclasses import Document
from haystack.preview.document_stores.protocols import DuplicatePolicy
from haystack.preview.utils.filters import document_matches_filter
from haystack.preview.utils.filters import document_matches_filter, convert
from haystack.preview.document_stores.errors import DuplicateDocumentError, DocumentStoreError
from haystack.preview.utils import expit

Expand Down Expand Up @@ -92,75 +92,15 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
"""
Returns the documents that match the filters provided.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
`"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`,
`"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
(in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
as default operation.
Example:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators can take a list of
dictionaries as value.
Example:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation.
:param filters: The filters to apply to the document list.
:return: A list of Documents that match the given filters.
"""
if filters:
return [doc for doc in self.storage.values() if document_matches_filter(conditions=filters, document=doc)]
if "operator" not in filters:
filters = convert(filters)
return [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
return list(self.storage.values())

def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int:
Expand Down Expand Up @@ -220,9 +160,17 @@ def bm25_retrieval(
if not query:
raise ValueError("Query should be a non-empty string")

content_type_filter = {"$or": {"content": {"$not": None}, "dataframe": {"$not": None}}}
content_type_filter = {
"operator": "OR",
"conditions": [
{"field": "content", "operator": "!=", "value": None},
{"field": "dataframe", "operator": "!=", "value": None},
],
}
if filters:
filters = {"$and": [content_type_filter, filters]}
if "operator" not in filters:
filters = convert(filters)
filters = {"operator": "AND", "conditions": [content_type_filter, filters]}
else:
filters = content_type_filter
all_documents = self.filter_documents(filters=filters)
Expand Down
99 changes: 47 additions & 52 deletions haystack/preview/document_stores/protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,69 +51,64 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
"""
Returns the documents that match the filters provided.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
`"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`,
`"$lte"`) or a metadata field name.
Filters are defined as nested dictionaries that can be of two types:
- Comparison
- Logic
Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
(in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
as default operation.
Comparison dictionaries must contain the keys:
Example:
- `field`
- `operator`
- `value`
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
Logic dictionaries must contain the keys:
- `operator`
- `conditions`
The value of the `conditions` key must be a list of dictionaries, each of type Comparison or Logic.
To use the same logical operator multiple times on the same level, logical operators can take a list of
dictionaries as value.
The `operator` value in Comparison dictionaries must be one of:
Example:
- `==`
- `!=`
- `>`
- `>=`
- `<`
- `<=`
- `in`
- `not in`
The `operator` value in Logic dictionaries must be one of:
- `NOT`
- `OR`
- `AND`
A simple filter:
```python
filters = {"field": "meta.type", "operator": "==", "value": "article"}
```
A more complex filter:
```python
filters = {
"$or": [
"operator": "AND",
"conditions": [
{"field": "meta.type", "operator": "==", "value": "article"},
{"field": "meta.date", "operator": ">=", "value": 1420066800},
{"field": "meta.date", "operator": "<", "value": 1609455600},
{"field": "meta.rating", "operator": ">=", "value": 3},
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
"operator": "OR",
"conditions": [
{"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]},
{"field": "meta.publisher", "operator": "==", "value": "nytimes"},
],
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
],
}
```
:param filters: the filters to apply to the document list.
:return: a list of Documents that match the given filters.
Expand Down
25 changes: 15 additions & 10 deletions haystack/preview/testing/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def document_store(self):
@pytest.mark.unit
def test_incorrect_filter_type(self, document_store: DocumentStore, filterable_docs: List[Document]):
document_store.write_documents(filterable_docs)
with pytest.raises(FilterError):
with pytest.raises(ValueError):
document_store.filter_documents(filters="something odd") # type: ignore

@pytest.mark.unit
Expand Down Expand Up @@ -574,7 +574,9 @@ def document_store(self):
def test_lt_filter(self, document_store: DocumentStore, filterable_docs: List[Document]):
document_store.write_documents(filterable_docs)
result = document_store.filter_documents(filters={"number": {"$lt": 0.0}})
assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] < 0]
assert result == [
doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] < 0
]

@pytest.mark.unit
def test_lt_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]):
Expand Down Expand Up @@ -614,7 +616,9 @@ def document_store(self):
def test_lte_filter(self, document_store: DocumentStore, filterable_docs: List[Document]):
document_store.write_documents(filterable_docs)
result = document_store.filter_documents(filters={"number": {"$lte": 2.0}})
assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] <= 2.0]
assert result == [
doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] <= 2.0
]

@pytest.mark.unit
def test_lte_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]):
Expand Down Expand Up @@ -658,7 +662,8 @@ def test_filter_simple_or(self, document_store: DocumentStore, filterable_docs:
assert result == [
doc
for doc in filterable_docs
if (("number" in doc.meta and doc.meta["number"] < 1) or doc.meta.get("name") in ["name_0", "name_1"])
if (doc.meta.get("number") is not None and doc.meta["number"] < 1)
or doc.meta.get("name") in ["name_0", "name_1"]
]

@pytest.mark.unit
Expand Down Expand Up @@ -733,7 +738,10 @@ def test_filter_nested_or(self, document_store: DocumentStore, filterable_docs:
assert result == [
doc
for doc in filterable_docs
if (doc.meta.get("name") in ["name_0", "name_1"] or ("number" in doc.meta and doc.meta["number"] < 1))
if (
doc.meta.get("name") in ["name_0", "name_1"]
or (doc.meta.get("number") is not None and doc.meta["number"] < 1)
)
]

@pytest.mark.unit
Expand Down Expand Up @@ -783,11 +791,8 @@ def test_filter_nested_or_and(self, document_store: DocumentStore, filterable_do
doc
for doc in filterable_docs
if (
("number" in doc.meta and doc.meta["number"] < 1)
or (
doc.meta.get("name") in ["name_0", "name_1"]
and ("chapter" in doc.meta and doc.meta["chapter"] != "intro")
)
(doc.meta.get("number") is not None and doc.meta["number"] < 1)
or (doc.meta.get("name") in ["name_0", "name_1"] and (doc.meta.get("chapter") != "intro"))
)
]

Expand Down
Loading

0 comments on commit fd16ec6

Please sign in to comment.