Skip to content

Commit

Permalink
Merge pull request #159 from databio/dev
Browse files Browse the repository at this point in the history
Release v0.9.0
  • Loading branch information
khoroshevskyi authored Jan 3, 2025
2 parents a6fcb8a + 1215618 commit 4da3297
Show file tree
Hide file tree
Showing 34 changed files with 1,387 additions and 398 deletions.
2 changes: 1 addition & 1 deletion bedhost/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.0"
__version__ = "0.9.0"
4 changes: 4 additions & 0 deletions bedhost/data_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,7 @@ class BaseListResponse(BaseModel):
limit: int
offset: int
results: list


class CreateBEDsetRequest(BaseModel):
    # Request body accepted by the BEDset creation endpoint.
    # registry_path: registry path of the PEPhub project that the new BEDset
    # is built from.
    registry_path: str
93 changes: 91 additions & 2 deletions bedhost/routers/bed_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
BEDFileNotFoundError,
TokenizeFileNotExistError,
)
from bbconf.models.bed_models import BedClassification # BedPEPHub,
from bbconf.models.bed_models import (
BedClassification, # BedPEPHub,
BedEmbeddingResult,
BedFiles,
BedListResult,
Expand All @@ -27,6 +27,8 @@
BedStatsModel,
TokenizedBedResponse,
TokenizedPathResponse,
QdrantSearchResult,
RefGenValidReturnModel,
)
from fastapi import APIRouter, File, HTTPException, Query, UploadFile
from fastapi.responses import PlainTextResponse
Expand Down Expand Up @@ -193,6 +195,27 @@ async def get_bed_pephub(
)


@router.get(
    "/{bed_id}/neighbours",
    summary="Get nearest neighbours for a single BED record",
    response_model=BedListSearchResult,
    response_model_by_alias=False,
    description=f"Returns most similar BED files in the database. "
    f"Example\n bed_id: {EXAMPLE_BED}",
)
async def get_bed_neighbours(
    bed_id: str = BedDigest,
    limit: int = 10,
    offset: int = 0,
):
    """
    Return the BED records most similar to the given BED record.

    :param bed_id: identifier of the BED record whose neighbours are requested
    :param limit: forwarded unchanged to the neighbour lookup as ``limit``
    :param offset: forwarded unchanged to the neighbour lookup as ``offset``
    :return: the result of ``bbagent.bed.get_neighbours``
    :raises HTTPException: 404 when the lookup raises BEDFileNotFoundError
    """
    try:
        return bbagent.bed.get_neighbours(bed_id, limit=limit, offset=offset)
    except BEDFileNotFoundError as err:
        # Give the client an explicit reason instead of a bare 404, using the
        # same message as the other BED-not-found responses in this router.
        raise HTTPException(
            status_code=404,
            detail=f"Bed file {bed_id} not found",
        ) from err


@router.get(
"/{bed_id}/embedding",
summary="Get embeddings for a single BED record",
Expand Down Expand Up @@ -335,7 +358,52 @@ async def text_to_bed_search(query, limit: int = 10, offset: int = 0):
Example: query="cancer"
"""
_LOGGER.info(f"Searching for: {query}")
results = bbagent.bed.text_to_bed_search(query, limit=limit, offset=offset)

# results_sql = bbagent.bed.sql_search(
# query, limit=round(limit / 2, 0), offset=round(offset / 2, 0)
# )
#
# if results_sql.count > results_sql.offset:
# qdrant_offset = offset - results_sql.offset
# else:
# qdrant_offset = offset - results_sql.count
#
# results_qdr = bbagent.bed.text_to_bed_search(
# query, limit=limit, offset=qdrant_offset - 1 if qdrant_offset > 0 else 0
# )
#
# results = BedListSearchResult(
# count=results_qdr.count,
# limit=limit,
# offset=offset,
# results=(results_sql.results + results_qdr.results)[0:limit],
# )
spaceless_query = query.replace(" ", "")
if len(spaceless_query) == 32 and spaceless_query == query:
try:
similar_results = bbagent.bed.get_neighbours(
query, limit=limit, offset=offset
)

if similar_results.results and offset == 0:

result = QdrantSearchResult(
id=query,
payload={},
score=1.0,
metadata=bbagent.bed.get(query),
)

similar_results.results.insert(0, result)
return similar_results
except Exception as _:
pass

results = bbagent.bed.text_to_bed_search(
query,
limit=limit,
offset=offset,
)

if results:
return results
Expand Down Expand Up @@ -414,3 +482,24 @@ async def get_tokens(
status_code=404,
detail="Tokenized file not found",
)


@router.get(
    "/{bed_id}/genome-stats",
    summary="Get reference genome validation results",
    response_model=RefGenValidReturnModel,
)
async def get_ref_gen_results(bed_id: str):
    """
    Return reference genome validation results for a bed file
    Example: bed: 0dcdf8986a72a3d85805bbc9493a1302
    """
    # The docstring text is kept as-is: the route passes no description=,
    # so FastAPI uses the docstring as the endpoint description.
    try:
        validation = bbagent.bed.get_reference_validation(bed_id)
    except BEDFileNotFoundError:
        # Unknown BED record -> 404 carrying the offending identifier.
        message = f"Bed file {bed_id} not found"
        raise HTTPException(status_code=404, detail=message)
    else:
        return validation
88 changes: 73 additions & 15 deletions bedhost/routers/bedset_api.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import logging

from bbconf.exceptions import BedSetNotFoundError
from bbconf.exceptions import BedSetNotFoundError, BedSetTrackHubLimitError
from bbconf.models.bedset_models import (
BedSetBedFiles,
BedSetListResult,
BedSetMetadata,
BedSetPlots,
BedSetStats,
)
from pephubclient.helpers import is_registry_path, unwrap_registry_path
from fastapi import APIRouter, HTTPException, Request, Response

from ..const import EXAMPLE_BEDSET, PKG_NAME
from ..main import bbagent
from ..data_models import CreateBEDsetRequest
from ..utils import zip_pep

router = APIRouter(prefix="/v1/bedset", tags=["bedset"])
Expand Down Expand Up @@ -165,22 +167,78 @@ async def get_trackDb_file_bedset(bedset_id: str):
"""
Generate trackDb file for the BED set track hub
"""
# Response should be this type:
# trackDb_txt = (
# trackDb_txt + f"track\t {metadata.name}\n"
# "type\t bigBed\n"
# f"bigDataUrl\t {metadata.files.bigbed_file.access_methods[0].access_url.url} \n"
# f"shortLabel\t {metadata.name}\n"
# f"longLabel\t {metadata.description}\n"
# "visibility\t full\n\n"
# )
try:
trackDb_txt = bbagent.bedset.get_track_hub_file(bedset_id)
except BedSetTrackHubLimitError as _:
raise HTTPException(
status_code=400,
detail="Track hub limit reached. Please try smaller BEDset.",
)

return Response(trackDb_txt, media_type="text/plain")

hit = bbagent.bedset.get_bedset_bedfiles(bedset_id)

trackDb_txt = ""
for bed in hit.results:
metadata = bbagent.bed.get(bed.id, full=True)
@router.post(
"/create/",
description="Create a new bedset by providing registry path to the PEPhub project",
)
async def create_bedset(bedset: CreateBEDsetRequest):
"""
Create a new bedset
"""
# Validate the PEPhub project string
if not is_registry_path(bedset.registry_path):
raise HTTPException(status_code=406, detail="Invalid registry path")

project_reg_path = unwrap_registry_path(bedset.registry_path)

if metadata.files.bigbed_file:
if project_reg_path.namespace not in ["databio", "bedbase", "pepkit"]:
raise HTTPException(status_code=403, detail="User is not in admin list")

trackDb_txt = (
trackDb_txt + f"track\t {metadata.name}\n"
"type\t bigBed\n"
f"bigDataUrl\t {metadata.files.bigbed_file.access_methods[0].access_url.url} \n"
f"shortLabel\t {metadata.name}\n"
f"longLabel\t {metadata.description}\n"
"visibility\t full\n\n"
)
try:
project = bbagent.config.phc.load_project(bedset.registry_path)
except Exception as _:
raise HTTPException(
status_code=404, detail=f"Project: '{bedset.registry_path}' not found"
)

bedfiles_list = [
bedfile_id.get("record_identifier") or bedfile_id.sample_name
for bedfile_id in project.samples
]

if bbagent.bedset.exists(identifier=project.name):
raise HTTPException(
status_code=409,
detail=f"BEDset with identifier {project.name} already exists",
)

return Response(trackDb_txt, media_type="text/plain")
try:
bbagent.bedset.create(
identifier=project.name,
name=project.name,
bedid_list=bedfiles_list,
statistics=True,
description=project.description,
annotation={
"source": project.config.get("source", ""),
"author": project.config.get("author", project_reg_path.namespace),
},
no_fail=False,
overwrite=False,
)
except Exception as err:
raise HTTPException(
status_code=400, detail=f"Unable to create bedset. Error: {err}"
)

return {"status": "success"}
56 changes: 56 additions & 0 deletions interactive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Scratch script for interactively exercising the BedBase search objects and
# comparing text-embedding backends. Meant to be run piece by piece, so the
# imports deliberately stay next to the code that uses them.
import bbconf

bba = bbconf.BedBaseAgent("deployment/config/api-dev.bedbase.org.yaml")

# Re-create the search-related objects held on the agent's config.
bba.config._b2bsi = bba.config._init_b2bsi_object()
bba.config._r2v = bba.config._init_r2v_object()
bba.config._bivec = bba.config._init_bivec_object()


# Here's some code to test the BiVectorSearchInterface

from geniml.search.interfaces import BiVectorSearchInterface
from geniml.search.backends import BiVectorBackend

from geniml.search.query2vec import Text2Vec

# The original referenced `self._qdrant_*` here, which raises NameError at
# module scope (there is no `self`).
# NOTE(review): assumes the config object exposes `_qdrant_text_engine` and
# `_qdrant_engine` -- confirm against bbconf.
search_backend = BiVectorBackend(
    metadata_backend=bba.config._qdrant_text_engine,
    bed_backend=bba.config._qdrant_engine,
)

t2v = Text2Vec("sentence-transformers/all-MiniLM-L6-v2", v2v=None)

# The original called the constructor with no arguments, leaving the backend
# and vectorizer built above unused.
# NOTE(review): keyword names `backend` / `query2vec` are assumed from the
# geniml search API -- confirm.
bvsi = BiVectorSearchInterface(backend=search_backend, query2vec=t2v)

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import logging
from typing import Union

import numpy as np

from geniml.text2bednn import Vec2VecFNN
from geniml.search.query2vec.abstract import Query2Vec

# culprit:
te = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Testing the sentence transformers:


from sentence_transformers import SentenceTransformer

sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(sentences)
print(embeddings)


# Same model name through fastembed, for comparison with the run above.
from fastembed import TextEmbedding

model = TextEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2", max_length=512
)
sentences = ["This is an example sentence", "Each sentence is converted"]
embeddings = list(model.embed(sentences))
4 changes: 2 additions & 2 deletions requirements/requirements-all.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# bbconf @ git+https://github.com/databio/bbconf.git@dev#egg=bbconf
bbconf>=0.9.0
bbconf>=0.10.0
fastapi>=0.103.0
logmuse>=0.2.7
markdown
Expand All @@ -9,4 +9,4 @@ uvicorn
yacman>=0.9.2
pephubclient>=0.4.1
psycopg[binary,pool]
python-multipart>=0.0.9
python-multipart>=0.0.9
Loading

0 comments on commit 4da3297

Please sign in to comment.