Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Search PubMed Papers In Specific Date Range for Paper Ranking Workflow #1367

Merged
merged 19 commits into from
Jan 20, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 19 additions & 12 deletions src/bioregistry/analysis/paper_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,37 +141,42 @@
return fetched_metadata


def _get_ids(term: str, use_text_word: bool, relative_date: int) -> set[str]:
def _get_ids(term: str, use_text_word: bool, start_date: str, end_date: str) -> set[str]:
from indra.literature import pubmed_client

return {
str(pubmed_id)
for pubmed_id in pubmed_client.get_ids(
term, use_text_word=use_text_word, reldate=relative_date
term, use_text_word=use_text_word, mindate=start_date, maxdate=end_date
)
}


def _search(
terms: list[str], pubmed_ids_to_filter: set[str], relative_date: int
terms: list[str], pubmed_ids_to_filter: set[str], start_date: str, end_date: str
) -> dict[str, list[str]]:
paper_to_terms: defaultdict[str, list[str]] = defaultdict(list)
for term in tqdm(terms, desc="Searching PubMed", unit="search term", leave=False):
for pubmed_id in _get_ids(term, use_text_word=True, relative_date=relative_date):
for pubmed_id in _get_ids(term, use_text_word=True, start_date=start_date, end_date=end_date):
if pubmed_id not in pubmed_ids_to_filter:
paper_to_terms[pubmed_id].append(term)
return dict(paper_to_terms)


def fetch_pubmed_papers(*, pubmed_ids_to_filter: set[str], relative_date: int) -> pd.DataFrame:

def fetch_pubmed_papers(*, pubmed_ids_to_filter: set[str], start_date: str, end_date: str) -> pd.DataFrame:
"""Fetch PubMed papers within the given date range using specific search terms, excluding curated papers.

:param pubmed_ids_to_filter: List containing already curated PMIDs
:param relative_date: the number of recent days to search
:param pubmed_ids_to_filter: List containing already curated PMIDs.
:param start_date: The start date of the period for which papers are being ranked.
:param end_date: The end date of the period for which papers are being ranked.
:return: DataFrame containing PubMed paper details.
"""
paper_to_terms = _search(
DEFAULT_SEARCH_TERMS, pubmed_ids_to_filter=pubmed_ids_to_filter, relative_date=relative_date
DEFAULT_SEARCH_TERMS,
pubmed_ids_to_filter=pubmed_ids_to_filter,
start_date=start_date,
end_date=end_date
)

papers = _get_metadata_for_ids(paper_to_terms)
Expand Down Expand Up @@ -477,11 +482,13 @@
# These have already been curated and will therefore be filtered out
curated_pubmed_ids: set[str] = {str(pubmed) for pubmed in df["pubmed"] if pd.notna(pubmed)}

# FIXME the fetch_pubmed_papers function should
# take into account the start and end date. as
predictions_df = fetch_pubmed_papers(pubmed_ids_to_filter=curated_pubmed_ids, relative_date=30)
predictions_df = fetch_pubmed_papers(
pubmed_ids_to_filter=curated_pubmed_ids,
start_date=start_date,
end_date=end_date
)
if not predictions_df.empty:
predictions_path = output_path.joinpath(f"predictions.tsv")
predictions_path = output_path.joinpath("predictions.tsv")

Check warning on line 491 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L491

Added line #L491 was not covered by tests
predict_and_save(predictions_df, vectorizer, classifiers, meta_clf, predictions_path)


Expand Down
Loading