From f3e463b34bda305d64e2a2032465a1dd0cbedc42 Mon Sep 17 00:00:00 2001 From: Nazanin Donyapour Date: Fri, 5 Jul 2024 19:20:07 +0000 Subject: [PATCH] extract-data-csv plugin --- .../extract-data-csv-plugin/.bumpversion.cfg | 29 ++++ utils/extract-data-csv-plugin/.dockerignore | 4 + utils/extract-data-csv-plugin/.gitattributes | 1 + utils/extract-data-csv-plugin/.gitignore | 1 + utils/extract-data-csv-plugin/CHANGELOG.md | 5 + utils/extract-data-csv-plugin/Dockerfile | 22 +++ utils/extract-data-csv-plugin/README.md | 18 +++ utils/extract-data-csv-plugin/VERSION | 1 + utils/extract-data-csv-plugin/build-docker.sh | 4 + .../extract_data_csv_0@1@0.cwl | 141 ++++++++++++++++++ utils/extract-data-csv-plugin/ict.yml | 87 +++++++++++ utils/extract-data-csv-plugin/pyproject.toml | 31 ++++ .../mm/utils/extract_data_csv/__init__.py | 7 + .../mm/utils/extract_data_csv/__main__.py | 73 +++++++++ .../extract_data_csv/extract_data_csv.py | 50 +++++++ .../extract-data-csv-plugin/tests/__init__.py | 1 + .../tests/fake_sample_records.csv | 3 + .../tests/test_extract_data_csv.py | 49 ++++++ 18 files changed, 527 insertions(+) create mode 100644 utils/extract-data-csv-plugin/.bumpversion.cfg create mode 100644 utils/extract-data-csv-plugin/.dockerignore create mode 100644 utils/extract-data-csv-plugin/.gitattributes create mode 100644 utils/extract-data-csv-plugin/.gitignore create mode 100644 utils/extract-data-csv-plugin/CHANGELOG.md create mode 100644 utils/extract-data-csv-plugin/Dockerfile create mode 100644 utils/extract-data-csv-plugin/README.md create mode 100644 utils/extract-data-csv-plugin/VERSION create mode 100755 utils/extract-data-csv-plugin/build-docker.sh create mode 100644 utils/extract-data-csv-plugin/extract_data_csv_0@1@0.cwl create mode 100644 utils/extract-data-csv-plugin/ict.yml create mode 100644 utils/extract-data-csv-plugin/pyproject.toml create mode 100644 utils/extract-data-csv-plugin/src/polus/mm/utils/extract_data_csv/__init__.py create mode 100644 
utils/extract-data-csv-plugin/src/polus/mm/utils/extract_data_csv/__main__.py create mode 100644 utils/extract-data-csv-plugin/src/polus/mm/utils/extract_data_csv/extract_data_csv.py create mode 100644 utils/extract-data-csv-plugin/tests/__init__.py create mode 100644 utils/extract-data-csv-plugin/tests/fake_sample_records.csv create mode 100644 utils/extract-data-csv-plugin/tests/test_extract_data_csv.py diff --git a/utils/extract-data-csv-plugin/.bumpversion.cfg b/utils/extract-data-csv-plugin/.bumpversion.cfg new file mode 100644 index 00000000..f8274026 --- /dev/null +++ b/utils/extract-data-csv-plugin/.bumpversion.cfg @@ -0,0 +1,29 @@ +[bumpversion] +current_version = 0.1.0 +commit = False +tag = False +parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:plugin.json] + +[bumpversion:file:src/polus/mm/utils/extract_data_csv/__init__.py] diff --git a/utils/extract-data-csv-plugin/.dockerignore b/utils/extract-data-csv-plugin/.dockerignore new file mode 100644 index 00000000..7c603f81 --- /dev/null +++ b/utils/extract-data-csv-plugin/.dockerignore @@ -0,0 +1,4 @@ +.venv +out +tests +__pycache__ diff --git a/utils/extract-data-csv-plugin/.gitattributes b/utils/extract-data-csv-plugin/.gitattributes new file mode 100644 index 00000000..87e654bb --- /dev/null +++ b/utils/extract-data-csv-plugin/.gitattributes @@ -0,0 +1 @@ +*.csv filter=lfs diff=lfs merge=lfs -text diff --git a/utils/extract-data-csv-plugin/.gitignore b/utils/extract-data-csv-plugin/.gitignore new file mode 100644 index 00000000..c04bc49f --- /dev/null +++ b/utils/extract-data-csv-plugin/.gitignore @@ -0,0 +1
@@ +poetry.lock diff --git a/utils/extract-data-csv-plugin/CHANGELOG.md b/utils/extract-data-csv-plugin/CHANGELOG.md new file mode 100644 index 00000000..b67793f7 --- /dev/null +++ b/utils/extract-data-csv-plugin/CHANGELOG.md @@ -0,0 +1,5 @@ +# CHANGELOG + +## 0.1.0 + +Initial release. diff --git a/utils/extract-data-csv-plugin/Dockerfile b/utils/extract-data-csv-plugin/Dockerfile new file mode 100644 index 00000000..e8ac9249 --- /dev/null +++ b/utils/extract-data-csv-plugin/Dockerfile @@ -0,0 +1,22 @@ +FROM condaforge/mambaforge + +ENV EXEC_DIR="/opt/executables" +ENV POLUS_LOG="INFO" +RUN mkdir -p ${EXEC_DIR} + + +# Work directory defined in the base container +# WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY CHANGELOG.md ${EXEC_DIR} + +# Install needed packages here + +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +CMD ["--help"] diff --git a/utils/extract-data-csv-plugin/README.md b/utils/extract-data-csv-plugin/README.md new file mode 100644 index 00000000..2139717a --- /dev/null +++ b/utils/extract-data-csv-plugin/README.md @@ -0,0 +1,18 @@ +# extract_data_csv (0.1.0) + +Extract data from a CSV file + +## Options + +This plugin takes 6 input arguments and 2 output arguments: + +| Name | Description | I/O | Type | Default | +|---------------|-------------------------|--------|--------|---------| +| input_csv_path | Path to the input csv file, Type: string, File type: input, Accepted formats: csv | Input | File | File | +| query | query str to search the dataset, Type: string, File type: input, Accepted formats: txt | Input | string | string | +| min_row | The row min index, Type: int | Input | int | int | +| max_row | The row max index, Type: int | Input | int | int | +| column_name | The name of the column to load data, Type: string, File type: input, Accepted formats: txt | Input | string | string | +| output_txt_path | Path to the txt output file, Type: string, 
File type: output, Accepted formats: txt | Input | string | string | +| output_txt_path | Path to the txt output file | Output | File | File | +| output_data | The output data | Output | {'type': 'array', 'items': 'string'} | {'type': 'array', 'items': 'string'} | diff --git a/utils/extract-data-csv-plugin/VERSION b/utils/extract-data-csv-plugin/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/utils/extract-data-csv-plugin/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/utils/extract-data-csv-plugin/build-docker.sh b/utils/extract-data-csv-plugin/build-docker.sh new file mode 100755 index 00000000..1e818292 --- /dev/null +++ b/utils/extract-data-csv-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", "Brandon Walker "] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.13" +typer = "^0.7.0" +sophios = "0.1.4" +pandas = "2.2.2" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pytest = "^7.4" +pytest-sugar = "^0.9.6" +pre-commit = "^3.2.1" +black = "^23.3.0" +mypy = "^1.1.1" +ruff = "^0.0.270" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +pythonpath = [ + "." 
+] diff --git a/utils/extract-data-csv-plugin/src/polus/mm/utils/extract_data_csv/__init__.py b/utils/extract-data-csv-plugin/src/polus/mm/utils/extract_data_csv/__init__.py new file mode 100644 index 00000000..b357d01e --- /dev/null +++ b/utils/extract-data-csv-plugin/src/polus/mm/utils/extract_data_csv/__init__.py @@ -0,0 +1,7 @@ +"""extract_data_csv.""" + +__version__ = "0.1.0" + +from polus.mm.utils.extract_data_csv.extract_data_csv import ( # noqa # pylint: disable=unused-import + extract_data_csv, +) diff --git a/utils/extract-data-csv-plugin/src/polus/mm/utils/extract_data_csv/__main__.py b/utils/extract-data-csv-plugin/src/polus/mm/utils/extract_data_csv/__main__.py new file mode 100644 index 00000000..17a9e5f1 --- /dev/null +++ b/utils/extract-data-csv-plugin/src/polus/mm/utils/extract_data_csv/__main__.py @@ -0,0 +1,73 @@ +"""Package entrypoint for the extract_data_csv package.""" + +# Base packages +import logging +from os import environ + +import typer +from polus.mm.utils.extract_data_csv.extract_data_csv import extract_data_csv + +logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "INFO")) +logger = logging.getLogger("polus.mm.utils.extract_data_csv.") +logger.setLevel(POLUS_LOG) + +app = typer.Typer(help="extract_data_csv.") + + +@app.command() +def main( # noqa: PLR0913 + input_csv_path: str = typer.Option( + ..., + "--input_csv_path", + help="Path to the input csv file, Type string, File type input", + ), + query: str = typer.Option( + ..., + "--query", + help="query str to search the dataset, Type string, File type input", + ), + min_row: int = typer.Option( + ..., + "--min_row", + help="The row min inex, Type int", + ), + max_row: int = typer.Option( + ..., + "--max_row", + help="The row max inex, Type int", + ), + column_name: str = typer.Option( + ..., + "--column_name", + help="The name of the column to load data, 
def extract_data_csv(  # noqa: PLR0913
    input_csv_path: str,
    query: str,
    column_name: str,
    output_txt_path: str,
    min_row: int = 1,
    max_row: int = -1,
) -> None:
    """Write the values of one CSV column to a text file, one per line.

    The CSV is loaded with pandas, optionally filtered with ``query``,
    stripped of rows whose ``column_name`` value is null, optionally
    restricted to a row range, and the remaining values of ``column_name``
    are written to ``output_txt_path``.

    Args:
        input_csv_path: Path to the input csv file.
        query: Expression passed to ``DataFrame.query`` to filter the rows;
            an empty string disables filtering.
        column_name: Name of the column whose values are written out.
        output_txt_path: Path of the text file to create (UTF-8).
        min_row: 1-based index of the first row to keep, counted after the
            query and null filtering. Values below 1 are treated as 1.
        max_row: 1-based, inclusive index of the last row to keep. The
            default ``-1`` means "up to the last row".

    Returns:
        None

    Raises:
        KeyError: If ``column_name`` is not a column of the CSV file.
    """
    df = pandas.read_csv(input_csv_path)

    print(df.shape)  # noqa: T201
    print(df.columns)  # noqa: T201

    if query:
        df = df.query(query)
        print(df)  # noqa: T201

    # Remove rows with a null value in the requested column. A new frame is
    # built instead of mutating in place, so a filtered frame is never
    # modified behind pandas' back.
    df = df.dropna(subset=[column_name])

    # Perform row slicing (if any)
    first = int(min_row)
    last = int(max_row)
    if first != 1 or last != -1:
        # Convert the 1-based, inclusive [min_row, max_row] range into a
        # 0-based, half-open positional slice. max_row == -1 is the
        # "no upper bound" sentinel: it must become an open-ended slice,
        # because a literal -1 stop would silently drop the last row.
        start = max(first - 1, 0)
        stop = None if last == -1 else last
        df = df.iloc[start:stop]
        print(df)  # noqa: T201

    # Now restrict to the column we want
    with Path(output_txt_path).open(mode="w", encoding="utf-8") as f:
        for value in df[column_name].to_list():
            if isinstance(value, str):
                # repr() keeps backslashes escaped; [1:-1] strips the
                # surrounding quotes that repr() adds to a string.
                text = repr(value)[1:-1]
            else:
                # Non-string cells (ints, floats, bools) have no quotes to
                # strip; slicing their repr would cut off real characters.
                text = str(value)
            f.write(f"{text}\n")
def test_extract_data_csv() -> None:
    """Check that extract_data_csv creates the requested output file.

    The output is written into a fresh temporary directory so that a stale
    ``smiles.txt`` left behind by an earlier run cannot make the existence
    check pass vacuously, and so the test does not litter the working
    directory.
    """
    import tempfile

    input_csv_path = str(Path(__file__).resolve().parent / "fake_sample_records.csv")
    query = ""
    column_name = "Smiles"

    with tempfile.TemporaryDirectory() as tmp_dir:
        output_txt_path = str(Path(tmp_dir) / "smiles.txt")

        extract_data_csv(input_csv_path, query, column_name, output_txt_path)

        assert Path(output_txt_path).exists()


def test_extract_data_csv_cwl() -> None:
    """Run the CWL tool through a one-step workflow and look for its output.

    The CWL file is referenced by a relative path, so this test must be run
    from the plugin's root directory.
    """
    cwl_file = Path("extract_data_csv_0@1@0.cwl")

    # Create the step for the CWL file
    extract_data_csv_step = Step(clt_path=cwl_file)

    input_csv_path = str(Path(__file__).resolve().parent / "fake_sample_records.csv")

    extract_data_csv_step.input_csv_path = input_csv_path
    extract_data_csv_step.query = ""
    extract_data_csv_step.column_name = "Smiles"
    extract_data_csv_step.output_txt_path = "smiles.txt"

    # Define the workflow with the step
    steps = [extract_data_csv_step]
    filename = "extract_data_csv"
    workflow = Workflow(steps, filename)

    # Run the workflow
    workflow.run()

    # Check for the existence of the output file
    outdir = Path("outdir")
    assert any(
        file.name == "smiles.txt" for file in outdir.rglob("*")
    ), "The file smiles.txt was not found."