Skip to content

Commit

Permalink
feat!: Chemistry Queries for PatCID2 (#191)
Browse files Browse the repository at this point in the history
Introduces new query methods for the PatCID data:
- Deprecates the existing chemistry functions. All existing
functionality is covered by the new ones.
- Introduces querying with SMARTS as well.
- Uses the new (to be released) orchestrator task for querying the
knowledge database.

---------

Signed-off-by: DKL <[email protected]>
  • Loading branch information
kdinkla authored Jan 17, 2025
1 parent 4150e78 commit 32bcf5c
Show file tree
Hide file tree
Showing 4 changed files with 185 additions and 1 deletion.
39 changes: 39 additions & 0 deletions deepsearch/chemistry/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import Optional

from pydantic import BaseModel


class ChemistryModel(BaseModel, extra="allow"):
    """Common base for chemistry records.

    Configured with ``extra="allow"``, so keys beyond the declared fields
    are accepted instead of being rejected during validation.
    """

    id: int
    """Transient identifier, intended for short term use only."""

    persistent_id: str
    """Identifier intended for long term (storage) use."""


class ChemistryDocument(ChemistryModel):
    """A document record, identified by its patent filing/publication ids."""

    application_id: Optional[str]
    """Identifier under which a patent application has been filed (may be None)."""

    publication_id: Optional[str]
    """Identifier under which a patent has been published (may be None)."""

    title: str
    """Human readable title of the document."""


class ChemistryCompound(ChemistryModel):
    """A chemical compound record with several structure representations."""

    smiles: str
    """Compound structure in SMILES notation."""

    display_name: str
    """User friendly name of the compound."""

    inchi: str
    """Compound structure in InChI notation."""

    inchikey: str
    """Hashed form of the InChI."""

    sum_formula: str
    """Sum formula of the compound, for example 'C6 O2 H5'."""
1 change: 1 addition & 0 deletions deepsearch/chemistry/queries/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .queries import *
8 changes: 7 additions & 1 deletion deepsearch/chemistry/queries/molecules.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def MoleculeQuery(
num_items: int = 10,
) -> Query:
"""
Use the vector database in Deep Search for querying molecules
DEPRECATED: Migrate to compounds queries.
Use the knowledge database in Deep Search for querying molecules
by substructure or similarity.
The result is contained in the `molecules` output of the response.
"""
Expand Down Expand Up @@ -97,6 +99,8 @@ def MoleculesInPatentsQuery(
partial_lookup: bool = False,
) -> Query:
"""
DEPRECATED: Migrate to compounds queries.
List all molecules contained in a list of patents.
The result is contained in the `molecules` output of the response.
"""
Expand Down Expand Up @@ -134,6 +138,8 @@ def PatentsWithMoleculesQuery(
num_items: int = 10,
) -> Query:
"""
DEPRECATED: Migrate to documents queries.
List all patents containing any of the input molecules.
The result is contained in the `patents` output of the response.
"""
Expand Down
138 changes: 138 additions & 0 deletions deepsearch/chemistry/queries/queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from abc import ABC
from typing import Any, Type, overload

from pydantic import BaseModel

from deepsearch.chemistry.models import ChemistryCompound, ChemistryDocument
from deepsearch.chemistry.resources import KnowledgeDbResource
from deepsearch.cps.client import api
from deepsearch.cps.client.queries.query import Query


class ChemistryQuery(BaseModel, ABC):
    """Abstract root of all chemistry queries.

    Subclasses set ``_result_type`` to the model class that
    ``query_chemistry`` uses to validate each returned item.
    """

    _result_type: Type


class CompoundsQuery(ChemistryQuery):
    """Base for queries whose results are validated as ``ChemistryCompound``."""

    _result_type = ChemistryCompound


class DocumentsQuery(ChemistryQuery):
    """Base for queries whose results are validated as ``ChemistryDocument``."""

    _result_type = ChemistryDocument


class CompoundsByIds(CompoundsQuery):
    """Query compounds that have any of the given identifiers.

    Both identifier lists default to empty.
    """

    # InChIKeys of the compounds to look up.
    inchikeys: list[str] = []
    # Persistent (long term) identifiers of the compounds to look up.
    persistent_ids: list[str] = []


class CompoundsBySmiles(CompoundsQuery):
    """Query compounds that (exactly) match the given SMILES code."""

    # SMILES code to match.
    structure: str


class CompoundsBySmarts(CompoundsQuery):
    """Query compounds that (exactly) match the given SMARTS code."""

    # SMARTS code to match.
    structure: str


class CompoundsBySimilarity(CompoundsQuery):
    """Query compounds that are similar to the given SMILES code."""

    # SMILES code of the reference structure.
    structure: str
    # Similarity cut-off, 0.9 unless overridden.
    # NOTE(review): metric and valid range are defined by the backend - confirm.
    threshold: float = 0.9


class CompoundsBySubstructure(CompoundsQuery):
    """Query compounds that contain a substructure with the given SMILES code."""

    # SMILES code of the substructure to search for.
    structure: str


class CompoundsIn(CompoundsQuery):
    """Query compounds that occur in the given documents."""

    # Nested query selecting the documents to take compounds from.
    documents: DocumentsQuery


class DocumentsByIds(DocumentsQuery):
    """Query documents that have any of the given identifiers.

    All identifier lists default to empty.
    """

    # Identifiers under which patents have been published.
    publication_ids: list[str] = []
    # Identifiers under which patent applications have been filed.
    application_ids: list[str] = []
    # Persistent (long term) identifiers of the documents.
    persistent_ids: list[str] = []


class DocumentsHaving(DocumentsQuery):
    """Query documents that contain compounds matching the given query."""

    # Nested query selecting the compounds the documents must contain.
    compounds: CompoundsQuery


@overload
def query_chemistry(
    api: api.CpsApi, query: CompoundsQuery, offset: int = 0, limit: int = 10
) -> list[ChemistryCompound]: ...


@overload
def query_chemistry(
    api: api.CpsApi, query: DocumentsQuery, offset: int = 0, limit: int = 10
) -> list[ChemistryDocument]: ...


def query_chemistry(
    api: api.CpsApi, query: ChemistryQuery, offset: int = 0, limit: int = 10
) -> list[Any]:
    """Perform a chemistry query on the knowledge base.

    A single ``KnowledgeLookup`` task against the ``patcid`` schema is
    composed and run through ``api.queries.run``. Every item of the task's
    ``result`` output is validated into the result type of ``query``.

    Args:
        api: Client used to run the composed query task.
        query: The chemistry query to perform.
        offset: Forwarded as the ``offset`` task parameter.
        limit: Forwarded as the ``limit`` task parameter.

    Returns:
        The validated result items: ``ChemistryCompound`` instances for
        compound queries, ``ChemistryDocument`` instances for document
        queries.

    Raises:
        KeyError: If the exact type of ``query`` (or of its nested query)
            has no entry in the lookup function table.
    """
    # Lookup function name per concrete query type. Matching is on the exact
    # type, so subclasses of these query classes are not recognised.
    lookup_functions = {
        CompoundsByIds: "compounds",
        CompoundsBySmiles: "compounds_by_smiles",
        CompoundsBySmarts: "compounds_by_smarts",
        CompoundsBySimilarity: "compounds_by_similarity",
        CompoundsBySubstructure: "compounds_by_substructure",
        CompoundsIn: "compounds_in_documents",
        DocumentsByIds: "documents",
        DocumentsHaving: "documents_having_compounds",
    }

    # Build the chain of queries: the outer query, plus its nested query when
    # it wraps one.
    # NOTE(review): only a single level of nesting is unwrapped here.
    query_kind = type(query)
    chain: list[ChemistryQuery]
    if query_kind is CompoundsIn:
        chain = [query, query.documents]
    elif query_kind is DocumentsHaving:
        chain = [query, query.compounds]
    else:
        chain = [query]

    function_path = [lookup_functions[type(part)] for part in chain]
    # The arguments always come from the innermost query of the chain.
    leaf_arguments = chain[-1].model_dump()

    # Compose the lookup task and expose its result as the query output.
    task_parameters = {
        "schema": "patcid",
        "function": function_path,
        "arguments": leaf_arguments,
        "offset": offset,
        "limit": limit,
    }
    task_query = Query()
    lookup_task = task_query.add(
        "KnowledgeLookup",
        task_id="lookup",
        parameters=task_parameters,
        coordinates=KnowledgeDbResource(),
    )
    lookup_task.output("result").output_as("result")

    # Run the task and convert raw items with the outer query's result type.
    response = api.queries.run(task_query)
    raw_items = response.outputs["result"]

    return [chain[0]._result_type.model_validate(raw) for raw in raw_items]

0 comments on commit 32bcf5c

Please sign in to comment.