diff --git a/src/bioregistry/analysis/paper_ranking.py b/src/bioregistry/analysis/paper_ranking.py index deda06715..4c4dae39c 100644 --- a/src/bioregistry/analysis/paper_ranking.py +++ b/src/bioregistry/analysis/paper_ranking.py @@ -141,37 +141,42 @@ def _get_metadata_for_ids(pubmed_ids: Iterable[Union[int, str]]) -> dict[str, di return fetched_metadata -def _get_ids(term: str, use_text_word: bool, relative_date: int) -> set[str]: +def _get_ids(term: str, use_text_word: bool, start_date: str, end_date: str) -> set[str]: from indra.literature import pubmed_client return { str(pubmed_id) for pubmed_id in pubmed_client.get_ids( - term, use_text_word=use_text_word, reldate=relative_date + term, use_text_word=use_text_word, mindate=start_date, maxdate=end_date ) } def _search( - terms: list[str], pubmed_ids_to_filter: set[str], relative_date: int + terms: list[str], pubmed_ids_to_filter: set[str], start_date: str, end_date: str ) -> dict[str, list[str]]: paper_to_terms: defaultdict[str, list[str]] = defaultdict(list) for term in tqdm(terms, desc="Searching PubMed", unit="search term", leave=False): - for pubmed_id in _get_ids(term, use_text_word=True, relative_date=relative_date): + for pubmed_id in _get_ids(term, use_text_word=True, start_date=start_date, end_date=end_date): if pubmed_id not in pubmed_ids_to_filter: paper_to_terms[pubmed_id].append(term) return dict(paper_to_terms) -def fetch_pubmed_papers(*, pubmed_ids_to_filter: set[str], relative_date: int) -> pd.DataFrame: + +def fetch_pubmed_papers(*, pubmed_ids_to_filter: set[str], start_date: str, end_date: str) -> pd.DataFrame: """Fetch PubMed papers from the last 30 days using specific search terms, excluding curated papers. - :param pubmed_ids_to_filter: List containing already curated PMIDs - :param relative_date: the number of recent days to search + :param pubmed_ids_to_filter: List containing already curated PMIDs. + :param start_date: The start date of the period for which papers are being ranked. + :param end_date: The end date of the period for which papers are being ranked. :return: DataFrame containing PubMed paper details. """ paper_to_terms = _search( - DEFAULT_SEARCH_TERMS, pubmed_ids_to_filter=pubmed_ids_to_filter, relative_date=relative_date + DEFAULT_SEARCH_TERMS, + pubmed_ids_to_filter=pubmed_ids_to_filter, + start_date=start_date, + end_date=end_date ) papers = _get_metadata_for_ids(paper_to_terms) @@ -477,11 +482,13 @@ def runner( # These have already been curated and will therefore be filtered out curated_pubmed_ids: set[str] = {str(pubmed) for pubmed in df["pubmed"] if pd.notna(pubmed)} - # FIXME the fetch_pubmed_papers function should - # take into account the start and end date. as - predictions_df = fetch_pubmed_papers(pubmed_ids_to_filter=curated_pubmed_ids, relative_date=30) + predictions_df = fetch_pubmed_papers( + pubmed_ids_to_filter=curated_pubmed_ids, + start_date=start_date, + end_date=end_date + ) if not predictions_df.empty: - predictions_path = output_path.joinpath(f"predictions.tsv") + predictions_path = output_path.joinpath("predictions.tsv") predict_and_save(predictions_df, vectorizer, classifiers, meta_clf, predictions_path)