From f708cf605611b9f65308f3bc86a5fdb96d3cf897 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Mon, 13 Nov 2023 11:59:18 +0100 Subject: [PATCH] refactor!: set `scale_score` default value to False (#6276) * set default scale_score to False * release note --- .../retrievers/in_memory_bm25_retriever.py | 6 +++--- .../retrievers/in_memory_embedding_retriever.py | 8 ++++---- .../document_stores/in_memory/document_store.py | 12 ++++++------ .../scale-score-false-h2-3c8e4e7e543e8cce.yaml | 6 ++++++ .../retrievers/test_in_memory_bm25_retriever.py | 16 ++++++++-------- .../test_in_memory_embedding_retriever.py | 16 ++++++++-------- 6 files changed, 35 insertions(+), 29 deletions(-) create mode 100644 releasenotes/notes/scale-score-false-h2-3c8e4e7e543e8cce.yaml diff --git a/haystack/preview/components/retrievers/in_memory_bm25_retriever.py b/haystack/preview/components/retrievers/in_memory_bm25_retriever.py index 4cd0c5f7a2..a3f7826123 100644 --- a/haystack/preview/components/retrievers/in_memory_bm25_retriever.py +++ b/haystack/preview/components/retrievers/in_memory_bm25_retriever.py @@ -17,7 +17,7 @@ def __init__( document_store: InMemoryDocumentStore, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, - scale_score: bool = True, + scale_score: bool = False, ): """ Create the InMemoryBM25Retriever component. @@ -26,7 +26,7 @@ def __init__( :param filters: A dictionary with filters to narrow down the search space. Defaults to `None`. :param top_k: The maximum number of documents to retrieve. Defaults to `10`. :param scale_score: Scales the BM25 score to a unit interval in the range of 0 to 1, where 1 means extremely relevant. If set to `False`, uses raw similarity scores. - Defaults to `True`. + Defaults to `False`. :raises ValueError: If the specified `top_k` is not > 0. """ @@ -90,7 +90,7 @@ def run( :param filters: A dictionary with filters to narrow down the search space. :param top_k: The maximum number of documents to return. :param scale_score: Scales the BM25 score to a unit interval in the range of 0 to 1, where 1 means extremely relevant. If set to `False`, uses raw similarity scores. - Defaults to `True`. + If not specified, the value provided at initialization is used. :return: The retrieved documents. :raises ValueError: If the specified DocumentStore is not found or is not a InMemoryDocumentStore instance. diff --git a/haystack/preview/components/retrievers/in_memory_embedding_retriever.py b/haystack/preview/components/retrievers/in_memory_embedding_retriever.py index 64a68ba320..dad86fdc58 100644 --- a/haystack/preview/components/retrievers/in_memory_embedding_retriever.py +++ b/haystack/preview/components/retrievers/in_memory_embedding_retriever.py @@ -17,7 +17,7 @@ def __init__( document_store: InMemoryDocumentStore, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, - scale_score: bool = True, + scale_score: bool = False, return_embedding: bool = False, ): """ @@ -27,7 +27,7 @@ def __init__( :param filters: A dictionary with filters to narrow down the search space. Defaults to `None`. :param top_k: The maximum number of documents to retrieve. Defaults to `10`. :param scale_score: Scales the BM25 score to a unit interval in the range of 0 to 1, where 1 means extremely relevant. If set to `False`, uses raw similarity scores. - Defaults to `True`. + Defaults to `False`. :param return_embedding: Whether to return the embedding of the retrieved Documents. Default is `False`. :raises ValueError: If the specified top_k is not > 0. @@ -98,8 +98,8 @@ def run( :param query_embedding: Embedding of the query. :param filters: A dictionary with filters to narrow down the search space. :param top_k: The maximum number of documents to return. - :param scale_score: Scales the BM25 score to a unit interval in the range of 0 to 1, where 1 means extremely relevant. If set to `False`, uses raw similarity scores. - Defaults to `True`. + :param scale_score: Scales the similarity score to a unit interval in the range of 0 to 1, where 1 means extremely relevant. If set to `False`, uses raw similarity scores. + If not specified, the value provided at initialization is used. :param return_embedding: Whether to return the embedding of the retrieved Documents. :return: The retrieved documents. diff --git a/haystack/preview/document_stores/in_memory/document_store.py b/haystack/preview/document_stores/in_memory/document_store.py index 906f40bd99..8f46a6df44 100644 --- a/haystack/preview/document_stores/in_memory/document_store.py +++ b/haystack/preview/document_stores/in_memory/document_store.py @@ -204,7 +204,7 @@ def delete_documents(self, document_ids: List[str]) -> None: del self.storage[doc_id] def bm25_retrieval( - self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, scale_score: bool = True + self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, scale_score: bool = False ) -> List[Document]: """ Retrieves documents that are most relevant to the query using BM25 algorithm. @@ -212,7 +212,7 @@ def bm25_retrieval( :param query: The query string. :param filters: A dictionary with filters to narrow down the search space. :param top_k: The number of top documents to retrieve. Default is 10. - :param scale_score: Whether to scale the scores of the retrieved documents. Default is True. + :param scale_score: Whether to scale the scores of the retrieved documents. Default is False. :return: A list of the top_k documents most relevant to the query. """ if not query: @@ -279,7 +279,7 @@ def embedding_retrieval( query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: int = 10, - scale_score: bool = True, + scale_score: bool = False, return_embedding: bool = False, ) -> List[Document]: """ @@ -288,7 +288,7 @@ def embedding_retrieval( :param query_embedding: Embedding of the query. :param filters: A dictionary with filters to narrow down the search space. :param top_k: The number of top documents to retrieve. Default is 10. - :param scale_score: Whether to scale the scores of the retrieved Documents. Default is True. + :param scale_score: Whether to scale the scores of the retrieved Documents. Default is False. :param return_embedding: Whether to return the embedding of the retrieved Documents. Default is False. :return: A list of the top_k documents most relevant to the query. """ @@ -327,14 +327,14 @@ def embedding_retrieval( return top_documents def _compute_query_embedding_similarity_scores( - self, embedding: List[float], documents: List[Document], scale_score: bool = True + self, embedding: List[float], documents: List[Document], scale_score: bool = False ) -> List[float]: """ Computes the similarity scores between the query embedding and the embeddings of the documents. :param embedding: Embedding of the query. :param documents: A list of Documents. - :param scale_score: Whether to scale the scores of the Documents. Default is True. + :param scale_score: Whether to scale the scores of the Documents. Default is False. :return: A list of scores. """ diff --git a/releasenotes/notes/scale-score-false-h2-3c8e4e7e543e8cce.yaml b/releasenotes/notes/scale-score-false-h2-3c8e4e7e543e8cce.yaml new file mode 100644 index 0000000000..1b79085958 --- /dev/null +++ b/releasenotes/notes/scale-score-false-h2-3c8e4e7e543e8cce.yaml @@ -0,0 +1,6 @@ +--- +preview: + - | + Change the default value of `scale_score` to False for Retrievers. + Users can still explicitly set `scale_score` to True to get relevance + scores in the range [0, 1]. diff --git a/test/preview/components/retrievers/test_in_memory_bm25_retriever.py b/test/preview/components/retrievers/test_in_memory_bm25_retriever.py index f1eaa8d9ee..2c2092574b 100644 --- a/test/preview/components/retrievers/test_in_memory_bm25_retriever.py +++ b/test/preview/components/retrievers/test_in_memory_bm25_retriever.py @@ -26,21 +26,21 @@ def test_init_default(self): retriever = InMemoryBM25Retriever(InMemoryDocumentStore()) assert retriever.filters is None assert retriever.top_k == 10 - assert retriever.scale_score + assert retriever.scale_score is False @pytest.mark.unit def test_init_with_parameters(self): retriever = InMemoryBM25Retriever( - InMemoryDocumentStore(), filters={"name": "test.txt"}, top_k=5, scale_score=False + InMemoryDocumentStore(), filters={"name": "test.txt"}, top_k=5, scale_score=True ) assert retriever.filters == {"name": "test.txt"} assert retriever.top_k == 5 - assert not retriever.scale_score + assert retriever.scale_score @pytest.mark.unit def test_init_with_invalid_top_k_parameter(self): with pytest.raises(ValueError): - InMemoryBM25Retriever(InMemoryDocumentStore(), top_k=-2, scale_score=False) + InMemoryBM25Retriever(InMemoryDocumentStore(), top_k=-2) @pytest.mark.unit def test_to_dict(self): @@ -56,7 +56,7 @@ def test_to_dict(self): "document_store": {"type": "MyFakeStore", "init_parameters": {}}, "filters": None, "top_k": 10, - "scale_score": True, + "scale_score": False, }, } @@ -66,7 +66,7 @@ def test_to_dict_with_custom_init_parameters(self): document_store = MyFakeStore() document_store.to_dict = lambda: {"type": "MyFakeStore", "init_parameters": {}} component = InMemoryBM25Retriever( - document_store=document_store, filters={"name": "test.txt"}, top_k=5, scale_score=False + document_store=document_store, filters={"name": "test.txt"}, top_k=5, scale_score=True ) data = component.to_dict() assert data == { @@ -75,7 +75,7 @@ def test_to_dict_with_custom_init_parameters(self): "document_store": {"type": "MyFakeStore", "init_parameters": {}}, "filters": {"name": "test.txt"}, "top_k": 5, - "scale_score": False, + "scale_score": True, }, } @@ -94,7 +94,7 @@ def test_from_dict(self): assert isinstance(component.document_store, InMemoryDocumentStore) assert component.filters == {"name": "test.txt"} assert component.top_k == 5 - assert component.scale_score + assert component.scale_score is False @pytest.mark.unit def test_from_dict_without_docstore(self): diff --git a/test/preview/components/retrievers/test_in_memory_embedding_retriever.py b/test/preview/components/retrievers/test_in_memory_embedding_retriever.py index 0ad2d4b474..acbd6acedd 100644 --- a/test/preview/components/retrievers/test_in_memory_embedding_retriever.py +++ b/test/preview/components/retrievers/test_in_memory_embedding_retriever.py @@ -16,21 +16,21 @@ def test_init_default(self): retriever = InMemoryEmbeddingRetriever(InMemoryDocumentStore()) assert retriever.filters is None assert retriever.top_k == 10 - assert retriever.scale_score + assert retriever.scale_score is False @pytest.mark.unit def test_init_with_parameters(self): retriever = InMemoryEmbeddingRetriever( - InMemoryDocumentStore(), filters={"name": "test.txt"}, top_k=5, scale_score=False + InMemoryDocumentStore(), filters={"name": "test.txt"}, top_k=5, scale_score=True ) assert retriever.filters == {"name": "test.txt"} assert retriever.top_k == 5 - assert not retriever.scale_score + assert retriever.scale_score @pytest.mark.unit def test_init_with_invalid_top_k_parameter(self): with pytest.raises(ValueError): - InMemoryEmbeddingRetriever(InMemoryDocumentStore(), top_k=-2, scale_score=False) + InMemoryEmbeddingRetriever(InMemoryDocumentStore(), top_k=-2) @pytest.mark.unit def test_to_dict(self): @@ -46,7 +46,7 @@ def test_to_dict(self): "document_store": {"type": "MyFakeStore", "init_parameters": {}}, "filters": None, "top_k": 10, - "scale_score": True, + "scale_score": False, "return_embedding": False, }, } @@ -60,7 +60,7 @@ def test_to_dict_with_custom_init_parameters(self): document_store=document_store, filters={"name": "test.txt"}, top_k=5, - scale_score=False, + scale_score=True, return_embedding=True, ) data = component.to_dict() @@ -70,7 +70,7 @@ def test_to_dict_with_custom_init_parameters(self): "document_store": {"type": "MyFakeStore", "init_parameters": {}}, "filters": {"name": "test.txt"}, "top_k": 5, - "scale_score": False, + "scale_score": True, "return_embedding": True, }, } @@ -90,7 +90,7 @@ def test_from_dict(self): assert isinstance(component.document_store, InMemoryDocumentStore) assert component.filters == {"name": "test.txt"} assert component.top_k == 5 - assert component.scale_score + assert component.scale_score is False @pytest.mark.unit def test_from_dict_without_docstore(self):