diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..dace2a7 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,23 @@ +[bumpversion] +current_version = 0.1.1 +commit = False +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:VERSION] diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..6a1f564 --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +ignore = W503, E501 +max-line-length = 88 +extended-ignore = E203 diff --git a/.gitignore b/.gitignore index e968fa7..a07072c 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,23 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ +#.idea/ + +# vscode +.vscode + +# test data directory +data + +# local manifests +src/polus/plugins/_plugins/manifests/* + +# allow python scripts inside manifests dir +!src/polus/plugins/_plugins/manifests/*.py + +#macOS +*.DS_Store + + +#husky +node_modules diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 652e537..6e78c94 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ fail_fast: true repos: - - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: - id: check-added-large-files + exclude: (.*?)\.(h5)$ - id: check-case-conflict - id: check-json - id: pretty-format-json @@ -26,27 +26,41 @@ repos: args: ["--fix=lf"] description: Forces to replace line ending by the UNIX 'lf' character. - id: trailing-whitespace - exclude: '.bumpversion.cfg' + exclude: ".bumpversion.cfg" - id: check-merge-conflict - repo: https://github.com/psf/black - rev: '23.3.0' + rev: "23.3.0" hooks: - id: black language_version: python3.9 - exclude: ^src\/polus\/plugins\/_plugins\/models\/\w*Schema.py$ + exclude: | + (?x)( + ^src\/polus\/plugins\/_plugins\/models\/pydanticv1\/\w*Schema.py$| + ^src\/polus\/plugins\/_plugins\/models\/pydanticv2\/\w*Schema.py$ + ) - repo: https://github.com/charliermarsh/ruff-pre-commit # Ruff version. 
- rev: 'v0.0.274' + rev: "v0.0.274" hooks: - id: ruff - exclude: ^src\/polus\/plugins\/_plugins\/models\/\w*Schema.py$ + exclude: | + (?x)( + test_[a-zA-Z0-9]+.py$| + ^src\/polus\/plugins\/_plugins\/models\/pydanticv1\/\w*Schema.py$| + ^src\/polus\/plugins\/_plugins\/models\/pydanticv2\/\w*Schema.py$ + ) args: [--fix] - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.4.0' + rev: "v1.4.0" hooks: - id: mypy - exclude: ^src\/polus\/plugins\/_plugins\/models\/\w*Schema.py$ + exclude: | + (?x)( + test_[a-zA-Z0-9]+.py$| + ^src\/polus\/plugins\/_plugins\/models\/pydanticv1\/\w*Schema.py$| + ^src\/polus\/plugins\/_plugins\/models\/pydanticv2\/\w*Schema.py$ + ) additional_dependencies: [types-requests==2.31.0.1] diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..0192cba --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1 @@ +* @Nicholas-Schaub @NHotaling @hsidky diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..17e51c3 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.1 diff --git a/clustering/feature-subsetting-tool/.bumpversion.cfg b/clustering/feature-subsetting-tool/.bumpversion.cfg new file mode 100644 index 0000000..e576e44 --- /dev/null +++ b/clustering/feature-subsetting-tool/.bumpversion.cfg @@ -0,0 +1,28 @@ +[bumpversion] +current_version = 0.2.1-dev0 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] +[bumpversion:file:README.md] + +[bumpversion:file:VERSION] + +[bumpversion:file:src/polus/tabular/clustering/feature_subsetting/__init__.py] diff --git a/clustering/feature-subsetting-tool/Dockerfile b/clustering/feature-subsetting-tool/Dockerfile new file mode 100644 index 0000000..cb19bc8 --- /dev/null +++ b/clustering/feature-subsetting-tool/Dockerfile @@ -0,0 +1,21 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + + +ENTRYPOINT ["python3", "-m", "polus.tabular.clustering.feature_subsetting"] +CMD ["--help"] diff --git a/clustering/feature-subsetting-tool/README.md b/clustering/feature-subsetting-tool/README.md new file mode 100644 index 0000000..7bc8231 --- /dev/null +++ b/clustering/feature-subsetting-tool/README.md @@ -0,0 +1,58 @@ +# Feature Data Subset(v0.2.1-dev0) + +This WIPP plugin subsets data based on a given feature. It works in conjunction with the `polus-feature-extraction-plugin`, where the feature extraction plugin can be used to extract the features such as the mean intensity of every image in the input image collection. + +# Usage +The details and usage of the plugin inputs is provided in the section below. In addition to the subsetted data, the output directory also consists of a `summary.txt` file which has information as to what images were kept and their new filename if they were renamed. 
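For context, the tabular files in `tabularDir` are expected to pair image filenames with feature values, for example the mean intensity produced by the feature extraction plugin. A minimal sketch of such a file, assuming illustrative column names (`intensity_image`, `MEAN`) that must match the `imageFeature` and `tabularFeature` inputs described below:

```python
# Illustrative only: a tiny feature table the plugin could consume.
# Column names are assumptions; they must match --imageFeature / --tabularFeature.
import pandas as pd

features = pd.DataFrame(
    {
        "intensity_image": [
            "r1_t1_p01_z1_c1.ome.tif",
            "r1_t1_p01_z2_c1.ome.tif",
            "r1_t1_p01_z3_c1.ome.tif",
        ],
        "MEAN": [1236.6, 1153.8, 1537.3],  # one feature value per image
    }
)
features.to_csv("features.csv", index=False)  # place this file in --tabularDir
```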
+ +### Explanation of inputs +Some of the inputs are fairly straightforward and are used commonly across most WIPP plugins. This section provides some details and examples of the inputs that may be a little complicated. The image collection with the following pattern will be used as an example: `r{r+}_t{t+}_p{p+}_z{z+}_c{c+}.ome.tif`, where r, t, p, z, c stand for replicate, timepoint, position, z-position, and channel respectively. Consider we have 5 replicates, 3 timepoints, 50 positions, 10 z-planes and 4 channels. + +1. `inpDir` - This contains the path to the input image collection to subset data from. +2. `tabularDir` - This contains the path to the tabular files (`.csv`, `.arrow`, `.parquet`) containing the feature values for each image. This can be the output of the feature extraction or Nyxus plugin. +3. `filePattern` - Filepattern of the input images. +4. `imageFeature` - Column in the tabular data containing the image filenames. +5. `tabularFeature` - Tabular feature that will be used to filter images. +6. `groupVar` - This is a mandatory input across which to subset data. It can take either 1 or 2 variables as input; if 2 variables are provided, the second variable is treated as the minor grouping variable. In our example, if `z` is provided as input, then within a sub-collection the mean of the feature value is taken over all images with the same z. The z positions are then filtered based on the `percentile` and `removeDirection` inputs. If `z,c` is provided as input, then `c` is treated as the minor grouping variable, which means the mean is taken over all images with the same z for each channel. The plugin also ensures that the same z positions are selected across c. +7. `percentile` and `removeDirection` - These two variables define the criteria by which images are filtered. For example, if percentile is `0.1` and removeDirection is set to `Below`, then images with a feature value below the 10th percentile are removed. On the other hand, if removeDirection is set to `Above`, then all images with a feature value greater than the 10th percentile are removed. This enables data subsetting for both `brightfield` and `darkfield` microscopy images. + + **Optional Arguments** + +8. `sectionVar` - This is an optional input to segregate the input image collection into sub-collections. The analysis is done separately for each sub-collection. In our example, if the user enters `r,t` as the sectionVar, then we will have 15 sub-collections (5*3), one for each combination of replicate and timepoint. If the user enters `r` as sectionVar, then we will have 5 sub-collections, one for each replicate. If the user wants to consider the whole image collection as a single section, then no input is required. NOTE: As a post-processing step, the same number of images is subsetted across different sections. +9. `padding` - This is an optional variable with a default value of 0. A padding of 3 means that 3 additional planes are captured on either side of the subsetted data. This can be used as a sanity check to ensure that the subsetted data captures the images we want. For example, if z positions 5, 6, and 7 were initially retained by the filter, then a padding of 3 means the output dataset will have z positions 2 through 10, if all of them exist. +10. `writeOutput` - This is an optional argument with default value `True`.
If it is set to true, then both the output image collection and `summary.txt` file will be created. If it is set to false, then the output directory will only consist of summary.txt. This option enables the user to tune the hyperparameters such as percentile, removeDirecton, feature without actually creating the output image collection. + + + +Contact [Gauhar Bains](mailto:gauhar.bains@labshare.org) for more information. + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes eleven input arguments and one output argument: + +| Name | Description | I/O | Type | +| ------------------- | ----------------------------------------------------- | ------ | ------------- | +| `--inpDir` | Input image collection to be processed by this plugin | Input | collection | +| `--tabularDir` | Path to tabular data | Input | genericData | +| `--filePattern` | Filename pattern used to separate data | Input | string | +| `--imageFeature` | Feature in tabular data with image filenames | Input | string | +| `--tabularFeature` | Tabular feature to filter image files | Input | string | +| `--padding` | Number of images to capture outside the cutoff | Input | integer | +| `--groupVar` | variables to group by in a section | Input | string | +| `--percentile` | Percentile to remove | Input | float | +| `--removeDirection` | remove direction above or below percentile | Input | string | +| `--sectionVar` | variables to divide larger sections | Input | string | +| `--writeOutput` | write output image collection or not | Input | boolean | +| `--outDir` | Output collection | Output | genericData | +| `--preview` | Generate a JSON file with outputs | Output | JSON | diff --git a/clustering/feature-subsetting-tool/VERSION b/clustering/feature-subsetting-tool/VERSION new file mode 100644 index 0000000..6c0f6f4 --- /dev/null +++ b/clustering/feature-subsetting-tool/VERSION @@ -0,0 +1 @@ +0.2.1-dev0 diff --git a/clustering/feature-subsetting-tool/build-docker.sh b/clustering/feature-subsetting-tool/build-docker.sh new file mode 100644 index 0000000..d82557e --- /dev/null +++ b/clustering/feature-subsetting-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$( x00_y01_p01_c1.ome.tif +x00_y01_p03_c2.ome.tif -----> x00_y01_p01_c2.ome.tif +x00_y01_p03_c3.ome.tif -----> x00_y01_p01_c3.ome.tif +x00_y01_p03_c4.ome.tif -----> x00_y01_p01_c4.ome.tif +x00_y01_p03_c5.ome.tif -----> x00_y01_p01_c5.ome.tif +x00_y01_p04_c1.ome.tif -----> x00_y01_p02_c1.ome.tif +x00_y01_p04_c2.ome.tif -----> x00_y01_p02_c2.ome.tif +x00_y01_p04_c3.ome.tif -----> x00_y01_p02_c3.ome.tif +x00_y01_p04_c4.ome.tif -----> x00_y01_p02_c4.ome.tif +x00_y01_p04_c5.ome.tif -----> x00_y01_p02_c5.ome.tif diff --git a/clustering/feature-subsetting-tool/package-release.sh b/clustering/feature-subsetting-tool/package-release.sh new file mode 100644 index 0000000..1efde1b --- /dev/null +++ b/clustering/feature-subsetting-tool/package-release.sh @@ -0,0 +1,16 @@ +# This script is designed to help package a new version of a plugin + +# Get the new version +version=$(", + "Hamdah Shafqat abbasi " + ] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = 
">=3.9,<3.12" +filepattern = "^2.0.4" +typer = "^0.7.0" +tqdm = "^4.64.1" +vaex = "^4.17.0" + + +[tool.poetry.group.dev.dependencies] +pre-commit = "^3.3.3" +bump2version = "^1.0.1" +pytest = "^7.3.2" +pytest-xdist = "^3.3.1" +pytest-sugar = "^0.9.7" +ipykernel = "^6.28.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/clustering/feature-subsetting-tool/run-docker.sh b/clustering/feature-subsetting-tool/run-docker.sh new file mode 100644 index 0000000..0810b5c --- /dev/null +++ b/clustering/feature-subsetting-tool/run-docker.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +version=$( None: + """Generate preview of the plugin outputs.""" + shutil.copy( + Path(__file__).parents[4].joinpath("example/summary.txt"), + out_dir, + ) + + +@app.command() +def main( # noqa: PLR0913 + inp_dir: Path = typer.Option( + ..., + "--inpDir", + "-i", + help="Path to the collection of input images.", + ), + tabular_dir: Path = typer.Option( + ..., + "--tabularDir", + "-t", + help="Path to the collection of tabular files containing features.", + ), + file_pattern: Optional[str] = typer.Option( + ".*", + "--filePattern", + "-f", + help="Pattern use to parse filenames", + ), + image_feature: str = typer.Option( + None, + "--imageFeature", + "-if", + help="Image filenames feature in tabular data.", + ), + tabular_feature: str = typer.Option( + None, + "--tabularFeature", + "-tf", + help="Select tabular feature to subset data.", + ), + padding: Optional[int] = typer.Option( + 0, + "--padding", + "-p", + help="Number of images to capture outside the cutoff.", + ), + group_var: str = typer.Option( + ..., + "--groupVar", + "-g", + help="variables to group by in a section.", + ), + percentile: float = typer.Option( + None, + "--percentile", + "-pc", + help="Percentile to remove.", + ), + remove_direction: Optional[str] = typer.Option( + "Below", + "--removeDirection", + "-r", + help="Remove direction above or below percentile.", + ), + section_var: Optional[str] = typer.Option( + None, + "--sectionVar", + "-s", + help="Variables to divide larger sections.", + ), + write_output: Optional[bool] = typer.Option( + False, + "--writeOutput", + "-w", + help="Write output image collection or not.", + ), + out_dir: Path = typer.Option( + ..., + "--outDir", + "-o", + help="Output directory", + ), + preview: Optional[bool] = typer.Option( + False, + "--preview", + help="Output a JSON preview of files", + ), +) -> None: + """Subset data using a given feature.""" + logger.info(f"--inpDir = {inp_dir}") + logger.info(f"--tabularDir = {tabular_dir}") + logger.info(f"--imageFeature = {image_feature}") + logger.info(f"--tabularFeature = {tabular_feature}") + logger.info(f"--filePattern = {file_pattern}") + logger.info(f"--padding = {padding}") + logger.info(f"--groupVar = {group_var}") + logger.info(f"--percentile = {percentile}") + logger.info(f"--removeDirection = {remove_direction}") + logger.info(f"--sectionVar = {section_var}") + logger.info(f"--writeOutput = {write_output}") + logger.info(f"--outDir = {out_dir}") + + inp_dir = inp_dir.resolve() + out_dir = out_dir.resolve() + + assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again" + assert ( + out_dir.exists() + ), f"{out_dir} does not exist!! 
Please check output path again" + + if preview: + generate_preview(out_dir) + + else: + fs.feature_subset( + inp_dir, + tabular_dir, + out_dir, + file_pattern, + group_var, + percentile, + remove_direction, + section_var, + image_feature, + tabular_feature, + padding, + write_output, + ) + + +if __name__ == "__main__": + app() diff --git a/clustering/feature-subsetting-tool/src/polus/tabular/clustering/feature_subsetting/feature_subset.py b/clustering/feature-subsetting-tool/src/polus/tabular/clustering/feature_subsetting/feature_subset.py new file mode 100644 index 0000000..15e4b74 --- /dev/null +++ b/clustering/feature-subsetting-tool/src/polus/tabular/clustering/feature_subsetting/feature_subset.py @@ -0,0 +1,300 @@ +"""Feature Subsetting Tool.""" + +import logging +import os +import shutil +from pathlib import Path +from typing import Any + +import filepattern +import vaex +from tqdm import tqdm + +CHUNK_SIZE = 10000 + +logger = logging.getLogger(__name__) +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") + + +def filter_planes( + feature_dict: dict, + remove_direction: str, + percentile: float, +) -> set[Any]: + """Filter planes by the criteria specified by remove_direction and percentile. + + Args: + feature_dict : planes and respective feature value + remove_direction: remove above or below percentile + percentile : cutoff percentile + + Returns: + set: planes that fit the criteria + """ + planes = list(feature_dict.keys()) + feat_value = [feature_dict[i] for i in planes] + thresh = min(feat_value) + percentile * (max(feat_value) - min(feat_value)) + + # filter planes + if remove_direction == "Below": + keep_planes = [z for z in planes if feature_dict[z] >= thresh] + else: + keep_planes = [z for z in planes if feature_dict[z] <= thresh] + + return set(keep_planes) + + +def make_uniform(planes_dict: dict, uniques: list[int], padding: int) -> dict: + """Ensure each section has the same number of images. + + This function makes the output collection uniform in + the sense that it preserves same number of planes across + sections. It also captures additional planes based + on the value of the padding variable + + Args: + planes_dict: planes to keep in different sections + uniques : unique values for the major grouping variable + padding : additional images to capture outside cutoff + + Returns: + dictionary: dictionary containing planes to keep + """ + # max no. 
of planes + max_len = max([len(i) for i in planes_dict.values()]) + + # max planes that can be added on each side + min_ind = min([min(planes_dict[k]) for k in planes_dict]) + max_ind = max([max(planes_dict[k]) for k in planes_dict]) + max_add_left = uniques.index(min_ind) + max_add_right = len(uniques) - (uniques.index(max_ind) + 1) + + # add planes in each section based on padding and max number of planes + for section_id, planes in planes_dict.items(): + len_to_add = max_len - len(planes) + len_add_left = min(int(len_to_add) / 2 + padding, max_add_left) + len_add_right = min(len_to_add - len_add_left + padding, max_add_right) + left_ind = int(uniques.index(min(planes)) - len_add_left) + right_ind = int(uniques.index(max(planes)) + len_add_right) + 1 + planes_dict[section_id] = uniques[left_ind:right_ind] + return planes_dict + + +def feature_subset( # noqa : C901 + inp_dir: Path, + tabular_dir: Path, + out_dir: Path, + file_pattern: str, + group_var: str, + percentile: float, + remove_direction: str, + section_var: str, + image_feature: str, + tabular_feature: str, + padding: int, + write_output: bool, +) -> None: + """Subsetting images based on feature values. + + Args: + inp_dir: Path to the collection of input images + tabular_dir : Path to the tabular data directory + out_dir : Path to output directory + file_pattern : Pattern to parse image file names + group_var : variables to group by in a section + percentile : Percentile to remove + remove_direction : Remove direction above or below percentile + section_var : Variables to divide larger sections + image_feature: Image filenames feature in tabular data + tabular_feature : Select tabular feature to subset data + padding : additional images to capture outside cutoff + write_output : Write output image collection or not. + """ + tabular_dir_files = [ + f + for f in Path(tabular_dir).iterdir() + if f.is_file() + and "".join(f.suffixes) in [".csv", ".arrow", ".parquet", ".fits"] + ] + + if len(tabular_dir_files) == 0: + msg = f"No tabular files detected Please check {tabular_dir} again" + raise ValueError(msg) + + # Get the column headers + headers = [] + for in_file in tabular_dir_files: + df = vaex.open(in_file) + headers.append(list(df.columns)) + headers = list(set(headers[0]).intersection(*headers)) + logger.info("Merging the data along rows...") + + featuredf = [] + for in_file in tqdm( + tabular_dir_files, + total=len(tabular_dir_files), + desc="Vaex loading of file", + ): + if in_file.suffix == ".csv": + df = vaex.from_csv(in_file, chunk_size=100_000, convert=True) + else: + df = vaex.open(in_file) + df = df[list(headers)] + featuredf.append(df) + + feature_df = vaex.concat(featuredf) + + if feature_df.shape[0] == 0: + msg = f"tabular files are empty Please check {tabular_dir} again" + raise ValueError(msg) + + # store image name and its feature value + feature_dict = dict( + zip( + list(feature_df[image_feature].to_numpy()), + list(feature_df[tabular_feature].to_numpy()), + ), + ) + + # seperate filepattern variables into different categories + fps = filepattern.FilePattern(inp_dir, file_pattern) + if not len(fps) > 0: + msg = "No image files are detected. Please check filepattern again!" 
+ raise ValueError(msg) + + uniques = fps.get_unique_values() + var = fps.get_variables() + grouping_variables = group_var.split(",") + if len(grouping_variables) > 1: + min_grouping_var, maj_grouping_var = ( + grouping_variables[1], + grouping_variables[0], + ) + gp_by = [min_grouping_var, maj_grouping_var] + else: + gp_by = [group_var] + + if section_var is not None: + section_variables = section_var.split(",") + sub_section_variables = [ + v for v in var if v not in grouping_variables + section_variables + ] + else: + sub_section_variables = [v for v in var if v not in grouping_variables] + + logger.info("Iterating over sections...") + # single iteration of this loop gives all images in one section + + section_feat = [] + section_keep_planes = [] + keep_planes = {} + + for file in fps(group_by=gp_by): + section_feat_dict: dict[Any, Any] = {} + if section_var is not None: + section_id = tuple([file[0][i] for i in section_var.split(",")]) + else: + section_id = 1 + + # iterate over files in one section + + fm = file[1][0][0] + fname = file[1][0][1][0].name + + if min_grouping_var is None: + fm[min_grouping_var] = None + + if fm[min_grouping_var] not in section_feat_dict: + section_feat_dict[fm[min_grouping_var]] = {} + + if fm[maj_grouping_var] not in section_feat_dict[fm[min_grouping_var]]: + section_feat_dict[fm[min_grouping_var]][fm[maj_grouping_var]] = [] + + section_feat_dict[fm[min_grouping_var]][fm[maj_grouping_var]].append( + feature_dict[fname], + ) + + section_feat.append(section_feat_dict) + + sectionfeat: dict[Any, Any] = {} + for f in section_feat: + for k, v in f.items(): + if k not in sectionfeat: + sectionfeat[k] = {} + sectionfeat[k].update(v) + + # average feature value by grouping variable + + for key1 in sectionfeat: + for key2 in sectionfeat[key1]: + sectionfeat[key1][key2] = sum(sectionfeat[key1][key2]) / len( + sectionfeat[key1][key2], + ) + + # find planes to keep based on specified criteria + section_keep_planes.append( + filter_planes(sectionfeat[key1], remove_direction, percentile), + ) + + # keep same planes within a section, across the minor grouping variable + section_keep_planes = list(section_keep_planes[0].union(*section_keep_planes)) + section_keep_planes = [ + i + for i in range( # type: ignore + min(section_keep_planes), + max(section_keep_planes) + 1, # type: ignore + ) + if i in uniques[maj_grouping_var] + ] + keep_planes[section_id] = section_keep_planes + + # # keep same number of planes across different sections + keep_planes = make_uniform(keep_planes, list(uniques[maj_grouping_var]), padding) + + # start writing summary.txt + summary = Path.open(Path(out_dir, "summary.txt"), "w") + + summary.write("\n Files : \n \n") + # update summary.txt with section renaming info + + logger.info("renaming subsetted data") + + for file in fps(group_by=sub_section_variables + grouping_variables): + if section_var is not None: + section_id = tuple([file[0][i] for i in section_var.split(",")]) + else: + section_id = 1 + + section_keep_planes = keep_planes[section_id] + rename_map = dict(zip(keep_planes[section_id], uniques[maj_grouping_var])) + + if section_var is not None and section_var.strip(): + summary.write( + f"Section : {({k: file[0][k] for k in section_variables})} \n", + ) + logger.info( + "Renaming files from section : {} \n".format( + {k: file[0][k] for k in section_variables}, + ), + ) + fm = file[1][0][0] + fname = file[1][0][1][0].name + + if fm[maj_grouping_var] not in keep_planes[section_id]: + continue + + # old and new file name + old_file_name 
= fname + + file_name_dict = dict(fm.items()) + file_name_dict[maj_grouping_var] = rename_map[fm[maj_grouping_var]] + + new_file_name = fps.get_matching(**file_name_dict)[0][1][0].name + + # if write output collection + if write_output: + shutil.copy2(Path(inp_dir, old_file_name), Path(out_dir, new_file_name)) + + summary.write(f"{old_file_name} -----> {new_file_name} \n") + summary.close() diff --git a/clustering/feature-subsetting-tool/tests/__init__.py b/clustering/feature-subsetting-tool/tests/__init__.py new file mode 100644 index 0000000..00b38f2 --- /dev/null +++ b/clustering/feature-subsetting-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Feature Subsetting Tool.""" diff --git a/clustering/feature-subsetting-tool/tests/conftest.py b/clustering/feature-subsetting-tool/tests/conftest.py new file mode 100644 index 0000000..6aee03c --- /dev/null +++ b/clustering/feature-subsetting-tool/tests/conftest.py @@ -0,0 +1,58 @@ +"""Test fixtures. + +Set up all data used in tests. +""" +import tempfile +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture( + params=[ + (500, ".csv"), + ], +) +def get_params(request: pytest.FixtureRequest) -> tuple[int, str]: + """To get the parameter of the fixture.""" + return request.param + + +@pytest.fixture() +def generate_synthetic_data( + get_params: tuple[int, str], +) -> tuple[Path, Path, Path, str]: + """Generate tabular data.""" + nrows, file_extension = get_params + input_directory = Path(tempfile.mkdtemp(prefix="inpDir_", dir=Path.cwd())) + tabular_directory = Path(tempfile.mkdtemp(prefix="tabularDir_", dir=Path.cwd())) + output_directory = Path(tempfile.mkdtemp(prefix="out_", dir=Path.cwd())) + rng = np.random.default_rng() + channels = 5 + zpos = 4 + nrows = 3 + for c in range(channels): + for z in range(zpos): + file_name = Path(input_directory, f"x00_y01_p0{z}_c{c}.ome.tif") + Path.open(Path(file_name), "a").close() + + tabular_data = { + "intensity_image": [file_name.name] * nrows, + "MEAN": rng.random(nrows).tolist(), + "MEAN_ABSOLUTE_DEVIATION": rng.random(nrows).tolist(), + "MEDIAN": rng.random(nrows).tolist(), + "MODE": rng.random(nrows).tolist(), + } + outname = file_name.stem.split(".")[0] + + df = pd.DataFrame(tabular_data) + if file_extension == ".csv": + outpath = Path(tabular_directory, f"{outname}.csv") + df.to_csv(outpath, index=False) + if file_extension == ".arrow": + outpath = Path(tabular_directory, f"{outname}.arrow") + df.to_feather(outpath) + + return input_directory, tabular_directory, output_directory, file_extension diff --git a/clustering/feature-subsetting-tool/tests/test_cli.py b/clustering/feature-subsetting-tool/tests/test_cli.py new file mode 100644 index 0000000..aece6a2 --- /dev/null +++ b/clustering/feature-subsetting-tool/tests/test_cli.py @@ -0,0 +1,92 @@ +"""Test Command line Tool.""" + +from typer.testing import CliRunner +from polus.tabular.clustering.feature_subsetting.__main__ import app +import shutil +from pathlib import Path + + +def test_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: + """Test the command line.""" + inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data + file_pattern = "x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" + image_feature = "intensity_image" + tabular_feature = "MEAN" + padding = 0 + group_var = "p,c" + + runner = CliRunner() + result = runner.invoke( + app, + [ + "--inpDir", + inp_dir, + "--tabularDir", + tabular_dir, + "--filePattern", + file_pattern, + "--imageFeature", + image_feature, + "--tabularFeature", + 
tabular_feature, + "--padding", + padding, + "--groupVar", + group_var, + "--percentile", + 0.8, + "--removeDirection", + "Below", + "--writeOutput", + "--outDir", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + shutil.rmtree(tabular_dir) + + +def test_short_cli(generate_synthetic_data: tuple[Path, Path, Path, str]) -> None: + """Test short cli command line.""" + inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data + file_pattern = "x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" + image_feature = "intensity_image" + tabular_feature = "MEAN" + padding = 0 + group_var = "p,c" + + runner = CliRunner() + result = runner.invoke( + app, + [ + "-i", + inp_dir, + "-t", + tabular_dir, + "-f", + file_pattern, + "-if", + image_feature, + "-tf", + tabular_feature, + "-p", + padding, + "-g", + group_var, + "-pc", + 0.8, + "-r", + "Below", + "-w", + "-o", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + shutil.rmtree(tabular_dir) diff --git a/clustering/feature-subsetting-tool/tests/test_feature_subsetting.py b/clustering/feature-subsetting-tool/tests/test_feature_subsetting.py new file mode 100644 index 0000000..1675392 --- /dev/null +++ b/clustering/feature-subsetting-tool/tests/test_feature_subsetting.py @@ -0,0 +1,72 @@ +"""Test Feature Subsetting Plugin.""" + +import shutil +from pathlib import Path + +import polus.tabular.clustering.feature_subsetting.feature_subset as fs + + +def test_feature_subset( + generate_synthetic_data: tuple[Path, Path, Path, str], +) -> None: + """Test images subsetting based on feature values.""" + inp_dir, tabular_dir, out_dir, _ = generate_synthetic_data + file_pattern = "x{x+}_y{y+}_p{p+}_c{c+}.ome.tif" + image_feature = "intensity_image" + tabular_feature = "MEAN" + padding = 0 + percentile = 0.8 + remove_direction = "Below" + group_var = "p,c" + write_output = True + + fs.feature_subset( + inp_dir=inp_dir, + tabular_dir=tabular_dir, + out_dir=out_dir, + file_pattern=file_pattern, + group_var=group_var, + percentile=percentile, + remove_direction=remove_direction, + section_var=None, + image_feature=image_feature, + tabular_feature=tabular_feature, + padding=padding, + write_output=write_output, + ) + + out_ext = [Path(f.name).suffix for f in out_dir.iterdir()] + assert len(out_ext) != 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + shutil.rmtree(tabular_dir) + + +def test_filter_planes() -> None: + """Test filter planes.""" + feature_dict = { + 1: 1236.597914951989, + 2: 1153.754875685871, + 3: 1537.3429175240055, + 4: 1626.0415809327849, + } + + percentile = 0.1 + remove_direction = "Below" + fn = fs.filter_planes( + feature_dict=feature_dict, + remove_direction=remove_direction, + percentile=percentile, + ) + + assert type(fn) == set + + +def test_make_uniform() -> None: + """Test each section contain same number of images.""" + planes_dict = {1: [3, 4]} + uniques = [1, 2, 3, 4] + padding = 0 + fn = fs.make_uniform(planes_dict=planes_dict, uniques=uniques, padding=padding) + + assert len(fn) != 0 diff --git a/clustering/hdbscan-clustering-tool/.bumpversion.cfg b/clustering/hdbscan-clustering-tool/.bumpversion.cfg new file mode 100644 index 0000000..732b62d --- /dev/null +++ b/clustering/hdbscan-clustering-tool/.bumpversion.cfg @@ -0,0 +1,28 @@ +[bumpversion] +current_version = 0.4.8-dev1 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
+serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:VERSION] +[bumpversion:file:README.md] + +[bumpversion:file:src/polus/tabular/clustering/hdbscan_clustering/__init__.py] diff --git a/clustering/hdbscan-clustering-tool/.gitignore b/clustering/hdbscan-clustering-tool/.gitignore new file mode 100644 index 0000000..9ed1c37 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/.gitignore @@ -0,0 +1,23 @@ +# Jupyter Notebook +.ipynb_checkpoints +poetry.lock +../../poetry.lock +# Environments +.env +.myenv +.venv +env/ +venv/ +# test data directory +data +# yaml file +.pre-commit-config.yaml +# hidden files +.DS_Store +.ds_store +# flake8 +.flake8 +../../.flake8 +__pycache__ +.mypy_cache +requirements.txt diff --git a/clustering/hdbscan-clustering-tool/Dockerfile b/clustering/hdbscan-clustering-tool/Dockerfile new file mode 100644 index 0000000..69e7d18 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/Dockerfile @@ -0,0 +1,21 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_LOG="INFO" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + + +ENTRYPOINT ["python3", "-m", "polus.tabular.clustering.hdbscan_clustering"] +CMD ["--help"] diff --git a/clustering/hdbscan-clustering-tool/README.md b/clustering/hdbscan-clustering-tool/README.md new file mode 100644 index 0000000..37f1589 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/README.md @@ -0,0 +1,52 @@ +# Hierarchical Density-Based Spatial Clustering of Applications with Noise(HDBSCAN) Clustering (v0.4.8-dev1) + +The HDBSCAN Clustering plugin clusters the data using [HDBSCAN clustering](https://pypi.org/project/hdbscan/) library. The input and output for this plugin is a CSV file. Each observation (row) in the input CSV file is assigned to one of the clusters. The output CSV file contains the column `cluster` that identifies the cluster to which each observation belongs. A user can supply a regular expression with capture groups if they wish to cluster each group independently, or if they wish to average the numerical features across each group and treat them as a single observation. + +## Inputs: + +### Input directory: +This plugin supports the all [vaex](https://vaex.readthedocs.io/en/latest/guides/io.html) supported file formats. + +### Filename pattern: +This plugin uses [filepattern](https://filepattern2.readthedocs.io/en/latest/Home.html) python library to parse file names of tabular files to be processed by this plugin. + +### Grouping pattern: +The input for this parameter is a regular expression with capture group. This input splits the data into groups based on the matched pattern. A new column `group` is created in the output file that has the group based on the given pattern. Unless `averageGroups` is set to `true`, providing a grouping pattern will cluster each group independently. 
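As a rough sketch of the idea (the label values and pattern here are made up, not taken from the plugin), a capture-group regex maps each value of the label column to a group:

```python
# Illustrative only: derive a per-row group from a label column with a
# capture-group regex, mirroring the groupingPattern/labelCol inputs.
import re

labels = ["well_A01_field1", "well_A01_field2", "well_B03_field1"]
pattern = r"well_([A-Z]\d+)"  # the capture group extracts the well id

groups = [re.search(pattern, label).group(1) for label in labels]
print(groups)  # ['A01', 'A01', 'B03'] -> two groups to cluster or average
```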
+ +### Average groups: +Use this together with `groupingPattern` to average the numerical features and produce a single row per group, which is then clustered. The resulting cluster is assigned to all observations belonging to that group. + +### Label column: +This is the name of the column containing the labels to be used with `groupingPattern`. + +### Minimum cluster size: +This parameter defines the smallest number of points that should be considered as a cluster. This is a required parameter. The input should be an integer and the value should be greater than 1. + +### Increment outlier ID: +If enabled, this parameter sets the ID of the outlier cluster to `1` instead of the default `0`. This is useful for visualization purposes if the resulting cluster IDs are turned into image annotations. + +## Output: +The output is a tabular file containing the clustered data. + +## Building +To build the Docker image for this plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Options + +This plugin takes six input arguments and one output argument: + +| Name | Description | I/O | Type | +| ---------------------- | ------------------------------------------------------------------------------------------------ | ------ | ------------- | +| `--inpDir` | Input tabular data files. | Input | genericData | +| `--groupingPattern` | Regular expression to group rows. Clustering will be applied across capture groups by default. | Input | string | +| `--averageGroups` | Average data across groups. Requires capture groups. | Input | boolean | +| `--labelCol` | Name of the column containing labels for the grouping pattern. | Input | string | +| `--minClusterSize` | Minimum cluster size. | Input | number | +| `--incrementOutlierId` | Increments outlier ID to 1.
| Input | boolean | +| `--outDir` | Output collection | Output | genericData | +| `--preview` | Generate a JSON file with outputs | Output | JSON | diff --git a/clustering/hdbscan-clustering-tool/VERSION b/clustering/hdbscan-clustering-tool/VERSION new file mode 100644 index 0000000..5915443 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/VERSION @@ -0,0 +1 @@ +0.4.8-dev1 diff --git a/clustering/hdbscan-clustering-tool/build-docker.sh b/clustering/hdbscan-clustering-tool/build-docker.sh new file mode 100755 index 0000000..2e7dd18 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Hythem Sidky ", + "Hamdah Shafqat abbasi " + ] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.4" +typer = "^0.7.0" +tqdm = "^4.64.1" +preadator="0.4.0.dev2" +vaex = "^4.17.0" +hdbscan = "^0.8.34rc1" + + +[tool.poetry.group.dev.dependencies] +pre-commit = "^3.3.3" +bump2version = "^1.0.1" +pytest = "^7.3.2" +pytest-xdist = "^3.3.1" +pytest-sugar = "^0.9.7" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/clustering/hdbscan-clustering-tool/run-docker.sh b/clustering/hdbscan-clustering-tool/run-docker.sh new file mode 100755 index 0000000..9311151 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/run-docker.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +version=$( None: + """Cluster data using HDBSCAN.""" + logger.info(f"--inpDir = {inp_dir}") + logger.info(f"--filePattern = {file_pattern}") + # Regular expression for grouping. + logger.info(f"--groupingPattern = {grouping_pattern}") + # Whether to average data for each group. + logger.info(f"--averageGroups = {average_groups}") + # Name of column to use for grouping. + logger.info(f"--labelCol = {label_col}") + # Minimum cluster size for clustering using HDBSCAN. + logger.info(f"--minClusterSize = {min_cluster_size}") + # Set outlier cluster id as 1. + logger.info(f"--incrementOutlierId = {increment_outlier_id}") + logger.info(f"--outDir = {out_dir}") + + inp_dir = inp_dir.resolve() + out_dir = out_dir.resolve() + + assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again" + assert ( + out_dir.exists() + ), f"{out_dir} does not exist!! Please check output path again" + + num_workers = max([cpu_count(), 2]) + + files = fp.FilePattern(inp_dir, file_pattern) + + if files is None: + msg = f"No tabular files found. 
Please check {file_pattern} again" + raise ValueError(msg) + + if preview: + with Path.open(Path(out_dir, "preview.json"), "w") as jfile: + out_json: dict[str, Any] = { + "filepattern": file_pattern, + "outDir": [], + } + for file in files(): + out_name = file[1][0].name.replace( + "".join(file[1][0].suffixes), + f"_hdbscan{hd.POLUS_TAB_EXT}", + ) + out_json["outDir"].append(out_name) + json.dump(out_json, jfile, indent=2) + else: + with preadator.ProcessManager( + name="Cluster data using HDBSCAN", + num_processes=num_workers, + threads_per_process=2, + ) as pm: + for file in tqdm( + files(), + total=len(files()), + desc="Clustering data", + mininterval=5, + initial=0, + unit_scale=True, + colour="cyan", + ): + pm.submit_process( + hd.hdbscan_clustering, + file[1][0], + min_cluster_size, + out_dir, + grouping_pattern, + label_col, + average_groups, + increment_outlier_id, + ) + pm.join_processes() + + +if __name__ == "__main__": + app() diff --git a/clustering/hdbscan-clustering-tool/src/polus/tabular/clustering/hdbscan_clustering/hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/src/polus/tabular/clustering/hdbscan_clustering/hdbscan_clustering.py new file mode 100644 index 0000000..3940c28 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/src/polus/tabular/clustering/hdbscan_clustering/hdbscan_clustering.py @@ -0,0 +1,150 @@ +"""Hdbscan Clustering Plugin.""" +import logging +import os +import re +from itertools import chain +from pathlib import Path + +import hdbscan +import numpy as np +import vaex + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") +CHUNK_SIZE = 10000 + + +def hdbscan_model( + data: np.ndarray, + min_cluster_size: int, + increment_outlier_id: bool, +) -> np.ndarray: + """Cluster data using HDBSCAN. + + Args: + data: Data that need to be clustered. + min_cluster_size: Minimum cluster size. + increment_outlier_id : Increment outlier ID to unity. + + Returns: + Cluster labels for each row of data. + """ + clusters = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit(data) + labels = clusters.labels_.flatten().astype(np.uint16) + 1 + return labels + 1 if increment_outlier_id else labels + + +def hdbscan_clustering( # noqa: PLR0913 + file: Path, + min_cluster_size: int, + out_dir: Path, + grouping_pattern: str, + label_col: str, + average_groups: bool, + increment_outlier_id: bool, +) -> None: + """Cluster data using HDBSCAN. + + Args: + file: Path of a tabular file. + min_cluster_size: Smallest size grouping that should be considered as a cluster. + out_dir: Path to output directory. + grouping_pattern: Regular expression to caputure groups in a label_col. + label_col: Name of column containing labels. + average_groups:To average data across groups. + increment_outlier_id: Increment outlier ID to unity. + """ + if Path(file.name).suffix == ".csv": + df = vaex.from_csv(file, convert=True, chunk_size=CHUNK_SIZE) + else: + df = vaex.open(file) + # If user provided a regular expression. 
+ if grouping_pattern: + if label_col == "None": + msg = f"Please define label column to capture groups {label_col}" + raise ValueError(msg) + + # Create a column group with matching string + group = np.array( + [ + re.search(grouping_pattern, x).group(0) # type: ignore + for x in df[label_col].tolist() + if len(re.search(grouping_pattern, x).group(0)) != 0 # type: ignore + ], + ) + if len(group) == 0: + msg = f"Could not find group with pattern {grouping_pattern}" + raise ValueError(msg) + + # Create a column group with matching string + df["group"] = group + int_columns = [ + feature + for feature in df.get_column_names() + if df.data_type(feature) == int or df.data_type(feature) == float + ] + + # If we want to average features for each group. + if average_groups: + df_grouped = df.groupby( + "group", + agg=[vaex.agg.mean(x) for x in int_columns], + ) + # Cluster data using HDBSCAN clustering. + logger.info("Clustering the data") + cluster_ids = hdbscan_model( + df_grouped.values, + min_cluster_size, + increment_outlier_id, + ) + df_grouped["cluster"] = cluster_ids + df = df.join( + df_grouped["group", "cluster"], + left_on="group", + right_on="group", + ) + + else: + dfs = [] + for group, df_ss in df.groupby("group"): + # Cluster data using HDBSCAN clustering. + logger.info(f"Clustering data in group {group}") + + cluster_ids = hdbscan_model( + df_ss.values, + min_cluster_size, + increment_outlier_id, + ) + + dfs.append(cluster_ids) + cluster_ids = np.array(list(chain.from_iterable(dfs))) + df["cluster"] = cluster_ids + + # No grouping. Vanilla clustering. + else: + int_columns = [ + feature + for feature in df.get_column_names() + if df.data_type(feature) == int or df.data_type(feature) == float + ] + + # Cluster data using HDBSCAN clustering + logger.info("Clustering the data") + cluster_ids = hdbscan_model( + df[int_columns].values, + min_cluster_size, + increment_outlier_id, + ) + df["cluster"] = cluster_ids + + outname = Path(out_dir, f"{Path(file.name).stem}_hdbscan{POLUS_TAB_EXT}") + + if POLUS_TAB_EXT == ".arrow": + df.export_feather(outname) + logger.info(f"Saving outputs: {outname}") + else: + df.export_csv(path=outname, chunk_size=CHUNK_SIZE) + + logger.info("Finished all processes!") diff --git a/clustering/hdbscan-clustering-tool/tests/__init__.py b/clustering/hdbscan-clustering-tool/tests/__init__.py new file mode 100644 index 0000000..2f89ec8 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Hdbscan Clustering Plugin.""" diff --git a/clustering/hdbscan-clustering-tool/tests/conftest.py b/clustering/hdbscan-clustering-tool/tests/conftest.py new file mode 100644 index 0000000..a609d5b --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/conftest.py @@ -0,0 +1,48 @@ +"""Test fixtures. + +Set up all data used in tests. 
+""" +import tempfile +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture( + params=[(50000, ".csv"), (100000, ".arrow")], +) +def get_params(request: pytest.FixtureRequest) -> tuple[int, str]: + """To get the parameter of the fixture.""" + return request.param + + +@pytest.fixture() +def generate_synthetic_data(get_params: tuple[int, str]) -> tuple[Path, Path, str]: + """Generate tabular data.""" + nrows, file_extension = get_params + + input_directory = Path(tempfile.mkdtemp(prefix="inputs_")) + output_directory = Path(tempfile.mkdtemp(prefix="out_")) + rng = np.random.default_rng() + tabular_data = { + "sepal_length": rng.random(nrows).tolist(), + "sepal_width": rng.random(nrows).tolist(), + "petal_length": rng.random(nrows).tolist(), + "petal_width": rng.random(nrows).tolist(), + "species": rng.choice( + ["Iris-setosa", "Iris-versicolor", "Iris-virginica"], + nrows, + ).tolist(), + } + + df = pd.DataFrame(tabular_data) + if file_extension == ".csv": + outpath = Path(input_directory, "data.csv") + df.to_csv(outpath, index=False) + if file_extension == ".arrow": + outpath = Path(input_directory, "data.arrow") + df.to_feather(outpath) + + return input_directory, output_directory, file_extension diff --git a/clustering/hdbscan-clustering-tool/tests/test_cli.py b/clustering/hdbscan-clustering-tool/tests/test_cli.py new file mode 100644 index 0000000..11f46c0 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/test_cli.py @@ -0,0 +1,74 @@ +"""Test Command line Tool.""" + +from typer.testing import CliRunner +from polus.tabular.clustering.hdbscan_clustering.__main__ import app +import shutil +from pathlib import Path + + +def test_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test the command line.""" + inp_dir, out_dir, file_extension = generate_synthetic_data + pattern = r"\w+$" + file_pattern = f".*{file_extension}" + label = "species" + clustersize = 3 + + runner = CliRunner() + result = runner.invoke( + app, + [ + "--inpDir", + inp_dir, + "--filePattern", + file_pattern, + "--groupingPattern", + pattern, + "--averageGroups", + "--labelCol", + label, + "--minClusterSize", + clustersize, + "--incrementOutlierId", + "--outDir", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + + +def test_short_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test short command line.""" + inp_dir, out_dir, file_extension = generate_synthetic_data + pattern = r"\w+$" + file_pattern = f".*{file_extension}" + label = "species" + clustersize = 3 + + runner = CliRunner() + result = runner.invoke( + app, + [ + "-i", + inp_dir, + "-f", + file_pattern, + "-g", + pattern, + "-a", + "-l", + label, + "-m", + clustersize, + "-io", + "-o", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) diff --git a/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py new file mode 100644 index 0000000..eb34f80 --- /dev/null +++ b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py @@ -0,0 +1,49 @@ +"""Test Hdbscan Clustering Plugin.""" + +import shutil +from pathlib import Path + +import filepattern as fp +import polus.tabular.clustering.hdbscan_clustering.hdbscan_clustering as hd +import vaex + + +def test_hdbscan_clustering(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test hdbscan clustering of tabular 
data.""" + inp_dir, out_dir, file_extension = generate_synthetic_data + pattern = r"\w+$" + file_pattern = f".*{file_extension}" + files = fp.FilePattern(inp_dir, file_pattern) + for file in files(): + hd.hdbscan_clustering( + file=file[1][0], + min_cluster_size=3, + grouping_pattern=pattern, + label_col="species", + average_groups=True, + increment_outlier_id=True, + out_dir=out_dir, + ) + + out_ext = [Path(f.name).suffix for f in out_dir.iterdir()] + assert all(out_ext) is True + for f in out_dir.iterdir(): + df = vaex.open(f) + assert "cluster" in df.column_names + assert df["cluster"].values != 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + + +def test_hdbscan_model(generate_synthetic_data: tuple[Path, Path, str]) -> None: + """Test hdbscan model.""" + inp_dir, _, file_extension = generate_synthetic_data + file_pattern = f".*{file_extension}" + files = fp.FilePattern(inp_dir, file_pattern) + for file in files(): + df = vaex.open(file[1][0]) + data = df[df.column_names[:-1]].values + min_cluster_size = 3 + label = hd.hdbscan_model(data, min_cluster_size, True) + assert len(label) != 0 + shutil.rmtree(inp_dir) diff --git a/clustering/k-means-clustering-tool/.bumpversion.cfg b/clustering/k-means-clustering-tool/.bumpversion.cfg new file mode 100644 index 0000000..8664577 --- /dev/null +++ b/clustering/k-means-clustering-tool/.bumpversion.cfg @@ -0,0 +1,34 @@ +[bumpversion] +current_version = 0.3.5-dev1 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:CHANGELOG.md] +[bumpversion:file:ict.yaml] +[bumpversion:file:k-meansclustering.cwl] + +[bumpversion:file:src/polus/tabular/clustering/k_means/__init__.py] + diff --git a/clustering/k-means-clustering-tool/CHANGELOG.md b/clustering/k-means-clustering-tool/CHANGELOG.md new file mode 100644 index 0000000..07e3b83 --- /dev/null +++ b/clustering/k-means-clustering-tool/CHANGELOG.md @@ -0,0 +1,13 @@ +# K-Means Clustering(0.3.5-dev1) + +1. This plugin is updated only to the new plugin standards +2. Before plugin support only `.csv` as an input files supported `.csv` and `.feather` file formats. Now this plugin support other vaex supported file formats both as inputs and outputs. +3. Some additional input arguments added `filePattern`, `fileExtension` +4. Implemented latest updated filepattern package +5. This plugin is now installable with pip. +6. Argparse package is replaced with Typer package for command line arguments. +7. `baseCommand` added in a plugin manifiest. +8. `--preview` flag is added which shows outputs to be generated by this plugin. +9. Use `python -m python -m polus.plugins.clustering.k_means` to run plugin from command line. +10. No unnitests before and new pytests added for testing. +11. 
Implemented parallel processing diff --git a/clustering/k-means-clustering-tool/Dockerfile b/clustering/k-means-clustering-tool/Dockerfile new file mode 100644 index 0000000..6ad3068 --- /dev/null +++ b/clustering/k-means-clustering-tool/Dockerfile @@ -0,0 +1,20 @@ +FROM polusai/bfio:2.1.9 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".arrow" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +ENTRYPOINT ["python3", "-m", "polus.tabular.clustering.k_means"] +CMD ["--help"] diff --git a/clustering/k-means-clustering-tool/README.md b/clustering/k-means-clustering-tool/README.md new file mode 100644 index 0000000..931c055 --- /dev/null +++ b/clustering/k-means-clustering-tool/README.md @@ -0,0 +1,65 @@ +# K-Means Clustering(v0.3.5-dev1) + +The K-Means Clustering plugin clusters the data using Scikit-learn K-Means clustering algorithm and outputs csv file. Each instance(row) in the input csv file is assigned to one of the clusters. The output csv file contains the column 'Cluster' that shows which cluster the instance belongs to. + +## Inputs: + +### Input data: +The input tabular data that need to be clustered. This plugin supports `.csv` and `.arrow` file formats + +### Methods: +Choose any one of the method mentioned to determine the k-value and cluster the data. + +#### Elbow method +The elbow method runs k-means clustering for a range of values of k and for each k value it calculates the within cluster sum of squared errors (WSS). The idea behind this method is that SSE tends to decrease towards 0 as k-value increases. The goal here is to choose a k-value that has low WSS and the elbow represents where there is diminishing returns by increasing k. + +#### Calinski-Harabasz index +The Calinski-Harabasz index is defined as the ratio of the sum of between-cluster dispersion to the sum of within-cluster dispersion. To choose k, pick maximum number of clusters to be considered and then choose the value of k with the highest score. + +#### Davies-Bouldin index +The Davies-Bouldin index is defined as the average similarity measure of each cluster with its most similar one, where similarity is a ratio of within-cluster distances to between-cluster distances. To choose k value, pick maximum number of clusters to be considered and choose the value of k with lowest value for DB_index. + +### Manual +Select manual method only when you know the number of clusters required to cluster the data. + +### Minimum range: +Enter starting number of sequence in range function to determine k-value. This parameter is required only when elbow or Calinski Harabasz or Davies Bouldin methods are selected. + +### Maximum range: +Enter ending number of sequence in range function to determine k-value. This parameter is required only when elbow or Calinski Harabasz or Davies Bouldin methods are selected. + +### Number of clusters: +Enter k-value if you already know how many clusters are required. This parameter is required only when manual method is selected. + +## Note: +1. If 'Manual' method is selected, enter number of clusters required. +2. If 'Elbow' or 'CalinskiHarabasz' or 'DaviesBouldin' methods are selected, then you should enter values for both 'maximumrange' and 'minimumrange'. +3. The 'minimumrange'value should be >1. 
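For reference, the Calinski-Harabasz selection described above amounts to scoring every k in the chosen range and keeping the best one. A minimal sketch with scikit-learn on toy data (not the plugin's own code path; Davies-Bouldin would use the same loop with `davies_bouldin_score` and the minimum instead):

```python
# Toy sketch: choose k by maximizing the Calinski-Harabasz score over a range,
# analogous to scanning minimumrange..maximumrange as the plugin describes.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score

rng = np.random.default_rng(0)
data = rng.random((200, 4))  # stand-in for the numeric feature columns

scores = {}
for k in range(2, 6):  # minimumrange must be > 1
    labels = KMeans(n_clusters=k, random_state=9, n_init=10).fit_predict(data)
    scores[k] = calinski_harabasz_score(data, labels)

best_k = max(scores, key=scores.get)
print(f"best k = {best_k}")
```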
+ +## Output: +The output is a tabular file containing the cluster data to which each instance in the data belongs to. + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Options + +This plugin takes seven input arguments and one output argument: + +| Name | Description | I/O | Type | +| ---------------- | --------------------------------------------------------------------------- | ------ | ------------- | +| `--inpdir` | Input tabular data | Input | genericData | +| `--filePattern` | Pattern to parse tabular files | Input | string | +| `--methods` | Select either Elbow or Calinski Harabasz or Davies Bouldin or Manual method | Input | enum | +| `--minimumrange` | Enter minimum k-value | Input | integer | +| `--maximumrange` | Enter maximum k-value | Input | integer | +| `--numofclus` | Enter number of clusters | Input | integer | +| `--outdir` | Output collection | Output | genericData | +| `--preview` | Generate JSON file with outputs | Output | JSON | diff --git a/clustering/k-means-clustering-tool/VERSION b/clustering/k-means-clustering-tool/VERSION new file mode 100644 index 0000000..65543cf --- /dev/null +++ b/clustering/k-means-clustering-tool/VERSION @@ -0,0 +1 @@ +0.3.5-dev1 diff --git a/clustering/k-means-clustering-tool/build-docker.sh b/clustering/k-means-clustering-tool/build-docker.sh new file mode 100644 index 0000000..275ef59 --- /dev/null +++ b/clustering/k-means-clustering-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", +"Kelechi Nina Mezu ", +"Hamdah Shafqat Abbasi " +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = "^3.9" +filepattern = "^2.0.0" +typer = "^0.7.0" +nyxus = "^0.5.0" +vaex = "^4.7.0" +scikit_learn="^1.0.2" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.0.4" +black = "^23.1.0" +flake8 = "^6.0.0" +mypy = "^1.0.0" +pytest = "^7.2.1" +ipykernel = "^6.21.2" +requests = "^2.28.2" +pandas = "^2.0.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/clustering/k-means-clustering-tool/run-plugin.sh b/clustering/k-means-clustering-tool/run-plugin.sh new file mode 100644 index 0000000..18c8bea --- /dev/null +++ b/clustering/k-means-clustering-tool/run-plugin.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +version=$( None: + """K-means clustering plugin.""" + logger.info(f"inpDir = {inp_dir}") + logger.info(f"filePattern = {file_pattern}") + logger.info(f"minimumRange = {minimum_range}") + logger.info(f"maximumRange = {maximum_range}") + logger.info(f"numOfClus = {num_of_clus}") + logger.info(f"outDir = {out_dir}") + + assert inp_dir.exists(), f"{inp_dir} doesnot exist!! Please check input path again" + assert out_dir.exists(), f"{out_dir} doesnot exist!! Please check output path again" + assert file_pattern in [ + ".csv", + ".arrow", + ], f"{file_pattern} tabular files are not supported by this plugin" + + num_threads = max([cpu_count(), 2]) + + pattern = ".*" + file_pattern + fps = fp.FilePattern(inp_dir, pattern) + print(pattern) + + if not fps: + msg = f"No {file_pattern} files found." 
+ raise ValueError(msg) + + if preview: + with open(pathlib.Path(out_dir, "preview.json"), "w") as jfile: + out_json: dict[str, Any] = { + "filepattern": pattern, + "outDir": [], + } + for file in fps(): + out_name = str(file[1][0].stem) + POLUS_TAB_EXT + out_json["outDir"].append(out_name) + json.dump(out_json, jfile, indent=2) + + flist = [f[1][0] for f in fps()] + + with multiprocessing.Pool(processes=num_threads) as executor: + executor.map( + partial( + km.clustering, + file_pattern=pattern, + methods=methods, + minimum_range=minimum_range, + maximum_range=maximum_range, + num_of_clus=num_of_clus, + out_dir=out_dir, + ), + flist, + ) + executor.close() + executor.join() + + +if __name__ == "__main__": + app() diff --git a/clustering/k-means-clustering-tool/src/polus/tabular/clustering/k_means/k_means.py b/clustering/k-means-clustering-tool/src/polus/tabular/clustering/k_means/k_means.py new file mode 100644 index 0000000..ce2ad64 --- /dev/null +++ b/clustering/k-means-clustering-tool/src/polus/tabular/clustering/k_means/k_means.py @@ -0,0 +1,215 @@ +"""K_means clustering.""" +import logging +import os +import pathlib + +import numpy +import numpy as np +import numpy.matlib +import vaex +from sklearn.cluster import KMeans +from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score + +from .utils import Methods + +# Initialize the logger +logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +logger = logging.getLogger("main") +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") + + +def elbow(data_array: np.array, minimum_range: int, maximum_range: int) -> np.array: + """Determine k value and cluster data using elbow method. + + Args: + data_array : Input data. + minimum_range : Starting number of sequence in range function to determine k-value. + maximum_range : Ending number of sequence in range function to determine k-value. + + Returns: + Labeled data. 
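+
+    Note:
+        The elbow is located geometrically: the selected k corresponds to the
+        point on the SSE curve farthest from the straight line joining the
+        curve's first and last points.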
+ """ + sse = [] + label_value = [] + logger.info("Starting Elbow Method...") + K = range(minimum_range, maximum_range + 1) + for k in K: + kmeans = KMeans(n_clusters=k, random_state=9).fit(data_array) + centroids = kmeans.cluster_centers_ + pred_clusters = kmeans.predict(data_array) + curr_sse = 0 + + # calculate square of Euclidean distance of each point from its cluster center and add to current WSS + logger.info("Calculating Euclidean distance...") + for i in range(len(data_array)): + curr_center = centroids[pred_clusters[i]] + curr_sse += np.linalg.norm(data_array[i] - np.array(curr_center)) ** 2 + sse.append(curr_sse) + labels = kmeans.labels_ + label_value.append(labels) + + logger.info("Finding elbow point in curve...") + # Find the elbow point in the curve + points = len(sse) + # Get coordinates of all points + coord = np.vstack((range(points), sse)).T + # First point + f_point = coord[0] + # Vector between first and last point + linevec = coord[-1] - f_point + # Normalize the line vector + linevecn = linevec / np.sqrt(np.sum(linevec**2)) + # Vector between all point and first point + vecf = coord - f_point + # Parallel vector + prod = np.sum(vecf * numpy.matlib.repmat(linevecn, points, 1), axis=1) + vecfpara = np.outer(prod, linevecn) + # Perpendicular vector + vecline = vecf - vecfpara + # Distance from curve to line + dist = np.sqrt(np.sum(vecline**2, axis=1)) + # Maximum distance point + k_cluster = np.argmax(dist) + minimum_range + logger.info("k cluster: %s", k_cluster) + logger.info("label value: %s", label_value) + logger.info("Setting label_data") + label_data = label_value[k_cluster] + return label_data + + +def calinski_davies( + data_array: np.array, methods: Methods, minimum_range: int, maximum_range: int +) -> np.array: + """Determine k value and cluster data using Calinski Harabasz Index method or Davies Bouldin based on method selection. + + Args: + data: Input data. + methods: Select either Calinski Harabasz or Davies Bouldin method. + minimum_range: Starting number of sequence in range function to determine k-value. + maximum_range:Ending number of sequence in range function to determine k-value. + + Returns: + Labeled data. + """ + K = range(minimum_range, maximum_range + 1) + chdb = [] + label_value = [] + for k in K: + kmeans = KMeans(n_clusters=k, random_state=9).fit(data_array) + labels = kmeans.labels_ + label_value.append(labels) + if f"{methods}" == "CalinskiHarabasz": + ch_db = calinski_harabasz_score(data_array, labels) + else: + ch_db = davies_bouldin_score(data_array, labels) + chdb.append(ch_db) + if f"{methods}" == "CalinskiHarabasz": + score = max(chdb) + else: + score = min(chdb) + k_cluster = chdb.index(score) + label_data = label_value[k_cluster] + return label_data + + +def clustering( + file: pathlib.Path, + file_pattern: str, + methods: Methods, + minimum_range: int, + maximum_range: int, + num_of_clus: int, + out_dir: pathlib.Path, +): + """K-means clustering methods to find clusters of similar or more related objects. + + Args: + file: Input path. + file_pattern: Pattern to parse tabular files. + methods: Select either Calinski Harabasz or Davies Bouldin method or Manual. + minimum_range: Starting number of sequence in range function to determine k-value. + maximum_range:Ending number of sequence in range function to determine k-value. 
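+        num_of_clus: Number of clusters to use when the Manual method is selected.
+        out_dir: Path to the output directory.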
+ """ + # Get file name + filename = file.stem + logger.info("Started reading the file " + file.name) + with open(file, encoding="utf-8", errors="ignore") as fr: + ncols = len(fr.readline().split(",")) + chunk_size = max([2**24 // ncols, 1]) + + if f"{file_pattern}" == ".csv": + df = vaex.read_csv(file, convert=True, chunk_size=chunk_size) + else: + df = vaex.open(file) + # Get list of column names + cols = df.get_column_names() + + # Separate data by categorical and numerical data types + numerical = [] + categorical = [] + for col in cols: + if df[col].dtype == str: + categorical.append(col) + else: + numerical.append(col) + # Remove label field + if "label" in numerical: + numerical.remove("label") + + if numerical is None: + raise ValueError("There are no numerical features in the data.") + else: + data = df[numerical] + + if categorical: + cat_array = df[categorical] + else: + logger.info("No categorical features found in the data") + + if f"{methods}" != "Manual": + # Check whether minimum range and maximum range value is entered + if methods and not (minimum_range or maximum_range): + raise ValueError( + "Enter both minimumrange and maximumrange to determine k-value." + ) + if minimum_range <= 1: + raise ValueError("Minimumrange should be greater than 1.") + logger.info( + "Determining k-value using " + methods + " and clustering the data." + ) + if f"{methods}" == "CalinskiHarabasz": + label_data = calinski_davies(data, methods, minimum_range, maximum_range) + if f"{methods}" == "DaviesBouldin": + label_data = calinski_davies(data, methods, minimum_range, maximum_range) + if f"{methods}" == "Elbow": + label_data = elbow(data, minimum_range, maximum_range) + else: + # Check whether numofclus is entered + if not num_of_clus: + raise ValueError("Enter number of clusters") + kvalue = num_of_clus + kmeans = KMeans(n_clusters=kvalue).fit(data) + label_data = kmeans.labels_ + + # Cluster data using K-Means clustering + logger.info("Adding Cluster Data") + data["Cluster"] = label_data + + # Add Categorical Data back to data processed + if categorical: + logger.info("Adding categorical data") + for col in categorical: + data[col] = cat_array[col].values + + # Save dataframe to feather file or to csv file + out_file = pathlib.Path(out_dir, (filename + POLUS_TAB_EXT)) + + if f"{POLUS_TAB_EXT}" in [".feather", ".arrow"]: + data.export_feather(out_file) + else: + logger.info("Saving csv file") + data.export_csv(out_file, chunk_size=chunk_size) diff --git a/clustering/k-means-clustering-tool/src/polus/tabular/clustering/k_means/utils.py b/clustering/k-means-clustering-tool/src/polus/tabular/clustering/k_means/utils.py new file mode 100644 index 0000000..91bb81b --- /dev/null +++ b/clustering/k-means-clustering-tool/src/polus/tabular/clustering/k_means/utils.py @@ -0,0 +1,12 @@ +"""K_means clustering.""" +import enum + + +class Methods(str, enum.Enum): + """Clustering methods to determine k-value.""" + + ELBOW = "Elbow" + CALINSKIHARABASZ = "CalinskiHarabasz" + DAVIESBOULDIN = "DaviesBouldin" + MANUAL = "Manual" + Default = "Elbow" diff --git a/clustering/k-means-clustering-tool/tests/__init__.py b/clustering/k-means-clustering-tool/tests/__init__.py new file mode 100644 index 0000000..36f89f9 --- /dev/null +++ b/clustering/k-means-clustering-tool/tests/__init__.py @@ -0,0 +1 @@ +"""K_means clustering.""" diff --git a/clustering/k-means-clustering-tool/tests/conftest.py b/clustering/k-means-clustering-tool/tests/conftest.py new file mode 100644 index 0000000..58dce0f --- /dev/null +++ 
b/clustering/k-means-clustering-tool/tests/conftest.py @@ -0,0 +1,91 @@ +"""Test Fixtures.""" + +import pathlib +import shutil +import tempfile + +import numpy as np +import pandas as pd +import pytest + + +class Generatedata: + """Generate tabular data with several different file format.""" + + def __init__(self, file_pattern: str, size: int, outname: str) -> None: + """Define instance attributes.""" + self.dirpath = pathlib.Path.cwd() + self.inp_dir = tempfile.mkdtemp(dir=self.dirpath) + self.out_dir = tempfile.mkdtemp(dir=self.dirpath) + self.file_pattern = file_pattern + self.size = size + self.outname = outname + self.x = self.create_dataframe() + + def get_inp_dir(self) -> pathlib.Path: + """Get input directory.""" + return pathlib.Path(self.inp_dir) + + def get_out_dir(self) -> pathlib.Path: + """Get output directory.""" + return pathlib.Path(self.out_dir) + + def create_dataframe(self) -> pd.core.frame.DataFrame: + """Create Pandas dataframe.""" + rng = np.random.default_rng() + diction_1 = { + "A": np.linspace(0.0, 4.0, self.size, dtype="float32", endpoint=False), + "B": np.linspace(0.0, 6.0, self.size, dtype="float32", endpoint=False), + "C": np.linspace(0.0, 8.0, self.size, dtype="float32", endpoint=False), + "D": np.linspace(0.0, 10.0, self.size, dtype="float32", endpoint=False), + "label": rng.integers(low=1, high=4, size=self.size), + } + + return pd.DataFrame(diction_1) + + def csv_func(self) -> None: + """Convert pandas dataframe to csv file format.""" + self.x.to_csv(pathlib.Path(self.inp_dir, self.outname), index=False) + + def arrow_func(self) -> None: + """Convert pandas dataframe to Arrow file format.""" + self.x.to_feather(pathlib.Path(self.inp_dir, self.outname)) + + def __call__(self) -> None: + """To make a class callable.""" + data_ext = { + ".csv": self.csv_func, + ".arrow": self.arrow_func, + } + + return data_ext[self.file_pattern]() + + def clean_directories(self) -> None: + """Remove files.""" + for d in self.dirpath.iterdir(): + if d.is_dir() and d.name.startswith("tmp"): + shutil.rmtree(d) + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) + + +@pytest.fixture( + params=[ + ("CalinskiHarabasz", 500, ".csv", 2, 5), + ("DaviesBouldin", 250, ".arrow", 2, 7), + ("Elbow", 500, ".arrow", 2, 10), + ("Manual", 200, ".arrow", 2, 5), + ], +) +def get_params(request: pytest.FixtureRequest) -> pytest.FixtureRequest: + """To get the parameter of the fixture.""" + return request.param diff --git a/clustering/k-means-clustering-tool/tests/test_main.py b/clustering/k-means-clustering-tool/tests/test_main.py new file mode 100644 index 0000000..922a7ac --- /dev/null +++ b/clustering/k-means-clustering-tool/tests/test_main.py @@ -0,0 +1,142 @@ +"""K_means clustering.""" + +import shutil + +import filepattern as fp +import pytest +import vaex +from polus.tabular.clustering.k_means import k_means as km +from polus.tabular.clustering.k_means.__main__ import app +from typer.testing import CliRunner + +from .conftest import Generatedata + +runner = CliRunner() + + +@pytest.mark.parametrize( + ("ext", "minrange", "maxrange"), + [(".arrow", 2, 5), (".csv", 2, 7)], +) +@pytest.mark.skipif("not config.getoption('slow')") +def test_elbow(ext: str, minrange: int, maxrange: int) -> None: + """Testing elbow function.""" + d = Generatedata(ext, outname=f"data_1{ext}", size=10000) + d() + pattern = f".*{ext}" + fps = 
fp.FilePattern(d.get_inp_dir(), pattern) + + for file in fps(): + if f"{pattern}" == ".csv": + df = vaex.read_csv(file[1][0], convert=True) + else: + df = vaex.open(file[1][0]) + + label_data = km.elbow( + data_array=df[:, :4].values, + minimum_range=minrange, + maximum_range=maxrange, + ) + + assert label_data is not None + + d.clean_directories() + + +@pytest.mark.parametrize( + ("method", "datasize", "ext", "minrange", "maxrange"), + [ + ("CalinskiHarabasz", 500, ".arrow", 2, 5), + ("DaviesBouldin", 600, ".csv", 2, 7), + ], +) +@pytest.mark.skipif("not config.getoption('slow')") +def test_calinski_davies( + method: str, + datasize: int, + ext: str, + minrange: int, + maxrange: int, +) -> None: + """Testing calinski_davies and davies_bouldin methods.""" + d = Generatedata(ext, outname=f"data_1{ext}", size=datasize) + d() + pattern = f".*{ext}" + fps = fp.FilePattern(d.get_inp_dir(), pattern) + + for file in fps(): + if f"{pattern}" == ".csv": + df = vaex.read_csv(file[1][0], convert=True) + else: + df = vaex.open(file[1][0]) + + label_data = km.calinski_davies( + data_array=df[:, :4].values, + methods=method, + minimum_range=minrange, + maximum_range=maxrange, + ) + + assert label_data is not None + + d.clean_directories() + + +@pytest.mark.skipif("not config.getoption('slow')") +def test_clustering(get_params: pytest.FixtureRequest) -> None: + """Test clustering function.""" + method, datasize, ext, minrange, maxrange = get_params + d = Generatedata(ext, outname=f"data_1{ext}", size=datasize) + d() + pattern = f".*{ext}" + numclusters = 3 + fps = fp.FilePattern(d.get_inp_dir(), pattern) + for file in fps(): + km.clustering( + file=file[1][0], + file_pattern=ext, + methods=method, + minimum_range=minrange, + maximum_range=maxrange, + num_of_clus=numclusters, + out_dir=d.get_out_dir(), + ) + assert d.get_out_dir().joinpath("data_1.arrow") + df = vaex.open(d.get_out_dir().joinpath("data_1.arrow")) + assert "Cluster" in df.columns + d.clean_directories() + + +def test_cli(get_params: pytest.FixtureRequest) -> None: + """Test Cli.""" + method, data_size, inpext, minrange, maxrange = get_params + d = Generatedata(inpext, outname=f"data_1{inpext}", size=data_size) + d() + shutil.copy( + d.get_inp_dir().joinpath(f"data_1{inpext}"), + d.get_inp_dir().joinpath(f"data_2{inpext}"), + ) + numclusters = 3 + + result = runner.invoke( + app, + [ + "--inpDir", + d.get_inp_dir(), + "--filePattern", + inpext, + "--methods", + method, + "--minimumRange", + minrange, + "--maximumRange", + maxrange, + "--numOfClus", + numclusters, + "--outDir", + d.get_out_dir(), + ], + ) + assert result.exit_code == 0 + + d.clean_directories() diff --git a/clustering/outlier-removal-tool/.bumpversion.cfg b/clustering/outlier-removal-tool/.bumpversion.cfg new file mode 100644 index 0000000..72c49c4 --- /dev/null +++ b/clustering/outlier-removal-tool/.bumpversion.cfg @@ -0,0 +1,35 @@ +[bumpversion] +current_version = 0.2.7-dev1 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
+serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:plugin.json] + +[bumpversion:file:CHANGELOG.md] + +[bumpversion:file:ict.yaml] + +[bumpversion:file:outlierremoval.cwl] + +[bumpversion:file:src/polus/tabular/clustering/outlier_removal/__init__.py] diff --git a/clustering/outlier-removal-tool/.dockerignore b/clustering/outlier-removal-tool/.dockerignore new file mode 100644 index 0000000..7c603f8 --- /dev/null +++ b/clustering/outlier-removal-tool/.dockerignore @@ -0,0 +1,4 @@ +.venv +out +tests +__pycache__ diff --git a/clustering/outlier-removal-tool/.gitignore b/clustering/outlier-removal-tool/.gitignore new file mode 100644 index 0000000..9ed1c37 --- /dev/null +++ b/clustering/outlier-removal-tool/.gitignore @@ -0,0 +1,23 @@ +# Jupyter Notebook +.ipynb_checkpoints +poetry.lock +../../poetry.lock +# Environments +.env +.myenv +.venv +env/ +venv/ +# test data directory +data +# yaml file +.pre-commit-config.yaml +# hidden files +.DS_Store +.ds_store +# flake8 +.flake8 +../../.flake8 +__pycache__ +.mypy_cache +requirements.txt diff --git a/clustering/outlier-removal-tool/CHANGELOG.md b/clustering/outlier-removal-tool/CHANGELOG.md new file mode 100644 index 0000000..09a0c7f --- /dev/null +++ b/clustering/outlier-removal-tool/CHANGELOG.md @@ -0,0 +1,15 @@ +# [0.2.7-dev1] - 2024-01-12 + +## Added + +- Pytests to test this plugin +- This plugin is now installable with pip. +- Added support for arrow file format in addition to csv + +## Changed + +- Updated dependencies (bfio, filepattern, preadator) to latest +- Argparse package is replaced with Typer package for command line arguments +- Replaced docker base image with latest container image with pre-installed bfio +- Replaced pandas with vaex +- Seperating descriptive from numerical features for outlier detection if present in the tabular data diff --git a/clustering/outlier-removal-tool/Dockerfile b/clustering/outlier-removal-tool/Dockerfile new file mode 100644 index 0000000..9f88e9b --- /dev/null +++ b/clustering/outlier-removal-tool/Dockerfile @@ -0,0 +1,22 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".arrow" + + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache + + + +ENTRYPOINT ["python3", "-m", "polus.tabular.clustering.outlier_removal"] +CMD ["--help"] diff --git a/clustering/outlier-removal-tool/README.md b/clustering/outlier-removal-tool/README.md new file mode 100644 index 0000000..3a97455 --- /dev/null +++ b/clustering/outlier-removal-tool/README.md @@ -0,0 +1,52 @@ +# Outlier removal (v0.2.7-dev1) + +The outlier removal plugin removes the outliers from the data based on the method selected and outputs csv file. The output will have separate csv files for inliers and outliers. The input file should be in csv format. + +The plugin support vaex supported input csv file that need outliers to be removed. The file should be in csv format. This is a required parameter for the plugin. 
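+As a rough illustration of the inlier/outlier split this plugin produces, the sketch below applies scikit-learn's `IsolationForest` to a toy pandas table. It is an assumption-laden example (made-up column names, pandas instead of the plugin's vaex pipeline), not the plugin's actual code.
+
+```python
+# Hedged sketch: split a toy table into inliers and outliers with IsolationForest.
+# scikit-learn's label convention is 1 = inlier, -1 = outlier.
+import pandas as pd
+from sklearn.ensemble import IsolationForest
+
+df = pd.DataFrame(
+    {
+        "feature_a": [0.10, 0.20, 0.15, 8.00, 0.18],
+        "feature_b": [1.00, 1.10, 0.90, -9.00, 1.05],
+    }
+)
+
+clf = IsolationForest(n_estimators=200, random_state=0)
+df["anomaly"] = clf.fit_predict(df[["feature_a", "feature_b"]])
+
+inliers = df[df["anomaly"] == 1].drop(columns="anomaly")
+outliers = df[df["anomaly"] == -1].drop(columns="anomaly")
+# the `outputType` option selects whether the inlier, outlier, or combined table is written
+```
+
+The `IForest` option relies on the pyod implementation, which labels inliers 0 and outliers 1, so the split is inverted for that method.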
+ +## Methods + +Choose any one of the methods mentioned to remove outliers from the data. + +### Isolation Forest + +Ensemble-based unsupervised method for outlier detection. The algorithm isolates outliers instead of normal instances. It works based on the principle that outliers are few and different and hence, the outliers can be identified easier than the normal points. The score is calculated as the path length to isolate the observation. These two methods can be selected to detect outliers> + +1. `IsolationForest` Detect outliers globally that deviates significantly from the rest of the datapoints +2. `IForest` Detect local outliers that are distinct when compared to those of its neighbors. + + +### Global + + + +### Local + + + +## Outputs: + +Select the output file by passing value to `outputType`. User can select from following options `inlier`, `oulier` or `combined`. The combined file contains `anomaly` column which score each datapoint if it is inlier or outlier. + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh` + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Options + +This plugin takes three input arguments and one output argument: + +| Name | Description | I/O | Type | +| ----------- | ------------------------------------- | ------ | ------------- | +| `--inpDir` | Input directory containing tabular files | Input | genericData | +| `--filePattern` | Pattern to parse tabular file names | Input | string | +| `--methods` | Select methods for outlier removal | Input | enum | +| `--outputType` | Select type of output file | Input | enum | +| `--outdir` | Output collection | Output | genericData | +| `--preview` | Generate a JSON file with outputs | Output | JSON | diff --git a/clustering/outlier-removal-tool/VERSION b/clustering/outlier-removal-tool/VERSION new file mode 100644 index 0000000..7d24d15 --- /dev/null +++ b/clustering/outlier-removal-tool/VERSION @@ -0,0 +1 @@ +0.2.7-dev1 diff --git a/clustering/outlier-removal-tool/build-docker.sh b/clustering/outlier-removal-tool/build-docker.sh new file mode 100644 index 0000000..be64f72 --- /dev/null +++ b/clustering/outlier-removal-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Hamdah Shafqat abbasi " + ] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.4" +typer = "^0.7.0" +tqdm = "^4.64.1" +preadator="0.4.0.dev2" +vaex = "^4.17.0" +scikit-learn = "^1.3.2" +pyod = "^1.1.2" + + +[tool.poetry.group.dev.dependencies] +pre-commit = "^3.3.3" +bump2version = "^1.0.1" +pytest = "^7.3.2" +pytest-xdist = "^3.3.1" +pytest-sugar = "^0.9.7" +ipykernel = "^6.28.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/clustering/outlier-removal-tool/run-docker.sh b/clustering/outlier-removal-tool/run-docker.sh new file mode 100644 index 0000000..f2c3472 --- /dev/null +++ b/clustering/outlier-removal-tool/run-docker.sh @@ -0,0 +1,19 @@ +version=$( None: + """Remove outliers from the data.""" + logger.info(f"--inpDir = {inp_dir}") + logger.info(f"--filePattern = {file_pattern}") + logger.info(f"--method = {method}") + logger.info(f"--outputType = {output_type}") + logger.info(f"--outDir = 
{out_dir}") + + inp_dir = inp_dir.resolve() + out_dir = out_dir.resolve() + + assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again" + assert ( + out_dir.exists() + ), f"{out_dir} does not exist!! Please check output path again" + + files = fp.FilePattern(inp_dir, file_pattern) + + if preview: + with Path.open(Path(out_dir, "preview.json"), "w") as jfile: + out_json: dict[str, Any] = { + "filepattern": file_pattern, + "outDir": [], + } + for file in files(): + outname = file[1][0].name.replace( + "".join(file[1][0].suffixes), + f"_{output_type}{rm.POLUS_TAB_EXT}", + ) + + out_json["outDir"].append(outname) + json.dump(out_json, jfile, indent=2) + + else: + with preadator.ProcessManager( + name="Cluster data using HDBSCAN", + num_processes=num_workers, + threads_per_process=2, + ) as pm: + for file in files(): + pm.submit_process( + rm.outlier_detection, + file[1][0], + method, + output_type, + out_dir, + ) + pm.join_processes() + + +if __name__ == "__main__": + app() diff --git a/clustering/outlier-removal-tool/src/polus/tabular/clustering/outlier_removal/outlier_removal.py b/clustering/outlier-removal-tool/src/polus/tabular/clustering/outlier_removal/outlier_removal.py new file mode 100644 index 0000000..cb7364b --- /dev/null +++ b/clustering/outlier-removal-tool/src/polus/tabular/clustering/outlier_removal/outlier_removal.py @@ -0,0 +1,135 @@ +"""Outlier Removal Plugin.""" +import enum +import logging +import os +from pathlib import Path + +import numpy as np +import vaex +from pyod.models.iforest import IForest +from sklearn.ensemble import IsolationForest +from sklearn.preprocessing import StandardScaler + +logger = logging.getLogger(__name__) +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") + +CHUNK_SIZE = 10000 + + +class Methods(str, enum.Enum): + """Available outlier detection methods.""" + + ISOLATIONFOREST = "IsolationForest" + IFOREST = "IForest" + DEFAULT = "IsolationForest" + + +class Outputs(str, enum.Enum): + """Output Files.""" + + INLIER = "inlier" + OUTLIER = "outlier" + COMBINED = "combined" + DEFAULT = "inlier" + + +def write_outputs(data: vaex.DataFrame, outname: Path) -> None: + """Write outputs in either arrow or csv file formats. + + Args: + data: vaex dataframe. + outname: Name of output file. + """ + if POLUS_TAB_EXT == ".arrow": + data.export_feather(outname) + logger.info(f"Saving outputs: {outname}") + if POLUS_TAB_EXT == ".csv": + data.export_csv(outname, chunk_size=CHUNK_SIZE) + logger.info(f"Saving outputs: {outname}") + + +def isolationforest(data_set: np.ndarray, method: Methods) -> np.ndarray: + """Isolation Forest algorithm. + + Args: + data_set: Input data. + method: Type of method to remove outliers. + + Returns: + ndarray whether or not the data point should be considered as an inlier. + + """ + if method == Methods.ISOLATIONFOREST: + clf = IsolationForest(random_state=19, n_estimators=200) + + if method == Methods.IFOREST: + clf = IForest(random_state=10, n_estimators=200) + + if method == Methods.DEFAULT: + clf = IsolationForest(random_state=19, n_estimators=200) + + clf.fit(data_set) + return clf.predict(data_set) + + +def outlier_detection( + file: Path, + method: Methods, + output_type: Outputs, + out_dir: Path, +) -> None: + """Detects outliers using Isolation Forest algorithm. + + Args: + file: Input tabular data. + method: Select a method to remove outliers. + output_type: Select type of output file. + out_dir: Path to output directory. 
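+
+    Note:
+        scikit-learn's IsolationForest labels inliers 1 and outliers -1,
+        while pyod's IForest labels inliers 0 and outliers 1; the split of
+        the output tables accounts for both conventions.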
+ """ + if Path(file.name).suffix == ".csv": + data = vaex.from_csv(file, convert=True, chunk_size=CHUNK_SIZE) + else: + data = vaex.open(file) + + int_columns = [ + feature + for feature in data.get_column_names() + if data.data_type(feature) == int or data.data_type(feature) == float + ] + + if len(int_columns) == 0: + msg = "Features with integer datatype do not exist" + raise ValueError(msg) + + # Standardize the data + df = StandardScaler().fit_transform(data[int_columns]) + + # Detect outliers + logger.info("Detecting outliers using " + method) + rem_out = isolationforest(df, method) + + data["anomaly"] = rem_out + + if method == Methods.ISOLATIONFOREST or method == Methods.DEFAULT: + inliers = data[data["anomaly"] == 1] + outliers = data[data["anomaly"] == -1] + + if method == Methods.IFOREST: + inliers = data[data["anomaly"] == 0] + outliers = data[data["anomaly"] == 1] + + # Drop 'anomaly' column + inliers = inliers.drop("anomaly", inplace=True) + outliers = outliers.drop("anomaly", inplace=True) + + outname = Path(out_dir, f"{Path(file.name).stem}_{output_type}{POLUS_TAB_EXT}") + + if output_type == Outputs.INLIER: + write_outputs(inliers, outname) + if output_type == Outputs.OUTLIER: + write_outputs(outliers, outname) + if output_type == Outputs.COMBINED: + write_outputs(data, outname) + if output_type == Outputs.DEFAULT: + write_outputs(inliers, outname) diff --git a/clustering/outlier-removal-tool/tests/__init__.py b/clustering/outlier-removal-tool/tests/__init__.py new file mode 100644 index 0000000..727cdca --- /dev/null +++ b/clustering/outlier-removal-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Test Outlier Removal Plugin.""" diff --git a/clustering/outlier-removal-tool/tests/conftest.py b/clustering/outlier-removal-tool/tests/conftest.py new file mode 100644 index 0000000..1829c1a --- /dev/null +++ b/clustering/outlier-removal-tool/tests/conftest.py @@ -0,0 +1,54 @@ +"""Test fixtures. + +Set up all data used in tests. 
+""" +import tempfile +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture( + params=[ + (5000, ".csv", "IsolationForest", "combined"), + (100000, ".arrow", "IForest", "inlier"), + (500000, ".csv", "IsolationForest", "outlier"), + ], +) +def get_params(request: pytest.FixtureRequest) -> tuple[int, str]: + """To get the parameter of the fixture.""" + return request.param + + +@pytest.fixture() +def generate_synthetic_data( + get_params: tuple[int, str, str, str], +) -> tuple[Path, Path, str, str, str]: + """Generate tabular data.""" + nrows, file_extension, method, output_type = get_params + + input_directory = Path(tempfile.mkdtemp(prefix="inputs_")) + output_directory = Path(tempfile.mkdtemp(prefix="out_")) + rng = np.random.default_rng() + tabular_data = { + "sepal_length": rng.random(nrows).tolist(), + "sepal_width": rng.random(nrows).tolist(), + "petal_length": rng.random(nrows).tolist(), + "petal_width": rng.random(nrows).tolist(), + "species": rng.choice( + ["Iris-setosa", "Iris-versicolor", "Iris-virginica"], + nrows, + ).tolist(), + } + + df = pd.DataFrame(tabular_data) + if file_extension == ".csv": + outpath = Path(input_directory, "data.csv") + df.to_csv(outpath, index=False) + if file_extension == ".arrow": + outpath = Path(input_directory, "data.arrow") + df.to_feather(outpath) + + return input_directory, output_directory, file_extension, method, output_type diff --git a/clustering/outlier-removal-tool/tests/test_cli.py b/clustering/outlier-removal-tool/tests/test_cli.py new file mode 100644 index 0000000..c1c24e9 --- /dev/null +++ b/clustering/outlier-removal-tool/tests/test_cli.py @@ -0,0 +1,59 @@ +"""Test Command line Tool.""" +from typer.testing import CliRunner +from polus.tabular.clustering.outlier_removal.__main__ import app +import shutil +from pathlib import Path + + +def test_cli(generate_synthetic_data: tuple[Path, Path, str, str, str]) -> None: + """Test the command line.""" + inp_dir, out_dir, file_extension, method, output_type = generate_synthetic_data + file_pattern = f".*{file_extension}" + + runner = CliRunner() + result = runner.invoke( + app, + [ + "--inpDir", + inp_dir, + "--filePattern", + file_pattern, + "--method", + method, + "--outputType", + output_type, + "--outDir", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + + +def test_short_cli(generate_synthetic_data: tuple[Path, Path, str, str, str]) -> None: + """Test short command line.""" + inp_dir, out_dir, file_extension, method, output_type = generate_synthetic_data + file_pattern = f".*{file_extension}" + + runner = CliRunner() + result = runner.invoke( + app, + [ + "-i", + inp_dir, + "-f", + file_pattern, + "-m", + method, + "-ot", + output_type, + "-o", + out_dir, + ], + ) + + assert result.exit_code == 0 + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) diff --git a/clustering/outlier-removal-tool/tests/test_outlier_removal.py b/clustering/outlier-removal-tool/tests/test_outlier_removal.py new file mode 100644 index 0000000..68d2867 --- /dev/null +++ b/clustering/outlier-removal-tool/tests/test_outlier_removal.py @@ -0,0 +1,46 @@ +"""Test Outlier Removal Plugin.""" +import shutil +from pathlib import Path + +import filepattern as fp +import numpy as np +import polus.tabular.clustering.outlier_removal.outlier_removal as rm +import vaex + + +def test_outlier_detection( + generate_synthetic_data: tuple[Path, Path, str, str, str], +) -> None: + """Test outlier detection of tabular data.""" + 
inp_dir, out_dir, file_extension, method, output_type = generate_synthetic_data + + file_pattern = f".*{file_extension}" + files = fp.FilePattern(inp_dir, file_pattern) + for file in files(): + rm.outlier_detection( + file=file[1][0], + method=method, + output_type=output_type, + out_dir=out_dir, + ) + out_ext = [Path(f.name).suffix for f in out_dir.iterdir()] + assert all(out_ext) is True + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) + + +def test_isolationforest( + generate_synthetic_data: tuple[Path, Path, str, str, str], +) -> None: + """Test isolationforest method.""" + inp_dir, out_dir, file_extension, method, output_type = generate_synthetic_data + file_pattern = f".*{file_extension}" + files = fp.FilePattern(inp_dir, file_pattern) + for file in files(): + df = vaex.open(file[1][0]) + data = df[df.column_names[:-1]].values + prediction = rm.isolationforest(data, method) + assert len(prediction) != 0 + assert type(prediction) == np.ndarray + shutil.rmtree(inp_dir) + shutil.rmtree(out_dir) diff --git a/features/feature-segmentation-eval-tool/.bumpversion.cfg b/features/feature-segmentation-eval-tool/.bumpversion.cfg new file mode 100644 index 0000000..3d5ebbd --- /dev/null +++ b/features/feature-segmentation-eval-tool/.bumpversion.cfg @@ -0,0 +1,31 @@ +[bumpversion] +current_version = 0.2.6-dev1 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:ict.yaml] + +[bumpversion:file:src/polus/tabular/features/feature_segmentation_eval/__init__.py] diff --git a/features/feature-segmentation-eval-tool/Dockerfile b/features/feature-segmentation-eval-tool/Dockerfile new file mode 100644 index 0000000..c7130f7 --- /dev/null +++ b/features/feature-segmentation-eval-tool/Dockerfile @@ -0,0 +1,20 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +ENTRYPOINT ["python3", "-m", "polus.tabular.features.feature_segmentation_eval"] +CMD ["--help"] diff --git a/features/feature-segmentation-eval-tool/README.md b/features/feature-segmentation-eval-tool/README.md new file mode 100644 index 0000000..fca31b5 --- /dev/null +++ b/features/feature-segmentation-eval-tool/README.md @@ -0,0 +1,28 @@ +# Feature segmentation eval (v0.2.6-dev1) + +Plugin to generate evaluation metrics for feature comparison of ground truth and predicted images. Contact [Vishakha Goyal](mailto:vishakha.goyal@nih.gov), [Hamdah Shafqat Abbasi](mailto:hamdahshafqat.abbasi@nih.gov) for more information. + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. 
Paste the contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes six input arguments and one output argument: + +| Name | Description | I/O | Type | +|---------------|-------------------------|--------|--------| +| `--GTDir` | Ground truth feature collection to be processed by this plugin. | Input | genericData | +| `--PredDir` | Predicted feature collection to be processed by this plugin. | Input | genericData | +| `--filePattern` | Filename pattern to filter data. | Input | string | +| `--combineLabels`   | Boolean to calculate number of bins for histogram by combining GT and Predicted Labels | Input | boolean | +| `--singleOutFile`   | Boolean to save output file as a single file.| Input | boolean | +| `--outDir` | Output collection | Output | genericData | +| `--preview` | Generate a JSON file with outputs | Output | JSON | diff --git a/features/feature-segmentation-eval-tool/VERSION b/features/feature-segmentation-eval-tool/VERSION new file mode 100644 index 0000000..9073550 --- /dev/null +++ b/features/feature-segmentation-eval-tool/VERSION @@ -0,0 +1 @@ +0.2.6-dev1 diff --git a/features/feature-segmentation-eval-tool/build-docker.sh b/features/feature-segmentation-eval-tool/build-docker.sh new file mode 100644 index 0000000..9ba5a2d --- /dev/null +++ b/features/feature-segmentation-eval-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Hamdah Shafqat Abbasi " + ] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = "^3.9" +filepattern = "^2.0.1" +opencv_python = "^4.5.1.48" +scikit-learn="^1.4.0" +pandas = "^1.2.4" +scipy = "^1.6.2" +typer = "^0.7.0" +vaex = "^4.7.0" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.1.0" +black = "^23.1.0" +flake8 = "^6.0.0" +mypy = "^1.0.1" +pytest = "^7.2.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/features/feature-segmentation-eval-tool/run-plugin.sh b/features/feature-segmentation-eval-tool/run-plugin.sh new file mode 100644 index 0000000..36e7024 --- /dev/null +++ b/features/feature-segmentation-eval-tool/run-plugin.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +version=$( None: + """Generate evaluation metrics of ground truth and predicted images.""" + logger.info(f"GTDir: {gt_dir}") + logger.info(f"PredDir: {pred_dir}") + logger.info(f"filePattern: {file_pattern}") + logger.info(f"combineLabels: {combine_labels}") + logger.info(f"singleOutFile: {single_out_file}") + logger.info(f"outDir: {out_dir}") + + starttime = time.time() + + if not gt_dir.exists(): + msg = "Groundtruth directory does not exist" + raise ValueError(msg, gt_dir) + if not pred_dir.exists(): + msg = "Predicted directory does not exist" + raise ValueError(msg, pred_dir) + if not out_dir.exists(): + msg = "outDir does not exist" + raise ValueError(msg, out_dir) + + if preview: + logger.info(f"generating preview data in {out_dir}") + with Path.open(Path(out_dir, "preview.json"), "w") as jfile: + out_json: dict[str, Any] = { + "filepattern": file_pattern, + "outDir": [], + } + if single_out_file: + out_name = f"result{fs.POLUS_TAB_EXT}" + out_json["outDir"].append(out_name) + + fps = fp.FilePattern(gt_dir, file_pattern) + for file in fps(): + outname = file[1][0].name.split(".")[0] + out_name = f"{outname}{fs.POLUS_TAB_EXT}" + out_json["outDir"].append(out_name) + json.dump(out_json, jfile, indent=2) + + fs.feature_evaluation( + gt_dir, + pred_dir, + combine_labels, + file_pattern, 
+ single_out_file, + out_dir, + ) + + endtime = (time.time() - starttime) / 60 + logger.info(f"Total time taken for execution: {endtime:.4f} minutes") + + +if __name__ == "__main__": + app() diff --git a/features/feature-segmentation-eval-tool/src/polus/tabular/features/feature_segmentation_eval/feature_evaluation.py b/features/feature-segmentation-eval-tool/src/polus/tabular/features/feature_segmentation_eval/feature_evaluation.py new file mode 100644 index 0000000..5726dad --- /dev/null +++ b/features/feature-segmentation-eval-tool/src/polus/tabular/features/feature_segmentation_eval/feature_evaluation.py @@ -0,0 +1,468 @@ +"""Feature segmentation evaluation package.""" +import logging +import os +from pathlib import Path +from typing import Any +from typing import Optional +from typing import Union + +import cv2 +import filepattern +import numpy as np +import pandas as pd +import scipy.stats +import vaex +from scipy.spatial import distance + +from .metrics import evaluate_all + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") + +EXT = (".arrow", ".feather") +CHUNK_SIZE = 5_000_000 + +HEADER = [ + "Image", + "features", + "histogram intersection", + "correlation", + "chi square", + "bhattacharya distance", + "L1 score", + "L2 score", + "L infinity score", + "cosine distance", + "canberra distance", + "ks divergence", + "match distance", + "cvm distance", + "psi value", + "kl divergence", + "js divergence", + "wasserstein distance", + "Mean square error", + "Root mean square error", + "Normalized Root Mean Squared Error", + "Mean Error", + "Mean Absolute Error", + "Geometric Mean Absolute Error", + "Median Absolute Error", + "Mean Percentage Error", + "Mean Absolute Percentage Error", + "Median Absolute Percentage Error", + "Symmetric Mean Absolute Percentage Error", + "Symmetric Median Absolute Percentage Error", + "Mean Arctangent Absolute Percentage Error", + "Normalized Absolute Error", + "Normalized Absolute Percentage Error", + "Root Mean Squared Percentage Error", + "Root Median Squared Percentage Error", + "Integral Normalized Root Squared Error", + "Root Relative Squared Error", + "Relative Absolute Error (aka Approximation Error)", + "Mean Directional Accuracy", +] + + +def convert_vaex_dataframe(file_path: Path) -> vaex.dataframe.DataFrame: + """The vaex reading of tabular data with (".csv", ".feather", ".arrow") format. + + Args: + file_path: Path to tabular data. + + Returns: + A vaex dataframe. + """ + if file_path.name.endswith(".csv"): + return vaex.read_csv(Path(file_path), convert=True, chunk_size=CHUNK_SIZE) + if file_path.name.endswith(EXT): + return vaex.open(Path(file_path)) + return None + + +def write_outfile(x: vaex.dataframe.DataFrame, out_name: Path) -> None: + """Write an output in vaex supported tabular format.""" + if POLUS_TAB_EXT in [".feather", ".arrow"]: + x.export_feather(out_name) + else: + x.export_csv(path=out_name, chunk_size=CHUNK_SIZE) + + +def comparison( # noqa C901 + expected_array: np.ndarray, + actual_array: np.ndarray, + bin_count: int, +) -> tuple[ + float, + float, + float, + float, + float, + float, + float, + float, + float, + float, + float, + float, + Any, + Any, + float, + float, + Any, +]: + """Calculate the metrics for predicted and ground truth histograms. + + Args: + expected_array: numpy array of original values + actual_array: numpy array of predicted values + bin_count: number of bins provided as an input to calculate histogram. 
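+
+    Note:
+        Values in ``actual_array`` are clipped to the minimum and maximum of
+        ``expected_array`` before the predicted histogram is computed.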
+ + Returns: + All metrics + """ + count1, _ = np.histogram(expected_array, bins=bin_count) + pdf1 = count1 / sum(count1) + cdf1 = np.cumsum(pdf1) + + for i in range(0, len(actual_array)): + if actual_array[i] < expected_array.min(): + actual_array[i] = expected_array.min() + if actual_array[i] > expected_array.max(): + actual_array[i] = expected_array.max() + + count2, _ = np.histogram(actual_array, bins=bin_count) + pdf2 = count2 / sum(count2) + cdf2 = np.cumsum(pdf2) + expected_percents = pdf1 + actual_percents = pdf2 + + ### PDF input + def sub_psi(e_perc: Union[float, int], a_perc: Union[float, int]) -> float: + """Compute PSI Value.""" + if a_perc == 0: + a_perc = 0.0001 + if e_perc == 0: + e_perc = 0.0001 + + return (e_perc - a_perc) * np.log(e_perc / a_perc) + + def sub_kld(e_perc: Union[float, int], a_perc: Union[float, int]) -> float: + """Compute KL Divergence.""" + if a_perc == 0: + a_perc = 0.0001 + if e_perc == 0: + e_perc = 0.0001 + + return (e_perc) * np.log(e_perc / a_perc) + + def sub_jsd( + expected_percents: Union[float, int], + actual_percents: Union[float, int], + ) -> float: + """Compute JS Divergence.""" + p = np.array(expected_percents) + q = np.array(actual_percents) + m = (p + q) / 2 + # compute Jensen Shannon Divergence + divergence = (scipy.stats.entropy(p, m) + scipy.stats.entropy(q, m)) / 2 + # compute the Jensen Shannon Distance + return np.sqrt(divergence) + + def l1(pdf1: np.ndarray, pdf2: np.ndarray) -> float: + """Compute L1 Distance.""" + return np.sum(abs(pdf1 - pdf2)) + + def l2(pdf1: np.ndarray, pdf2: np.ndarray) -> float: + """Compute L2 Distance.""" + return np.sqrt(sum((pdf1 - pdf2) ** 2)) + + def linfinity(pdf1: np.ndarray, pdf2: np.ndarray) -> float: + """Compute L-infinity Distance.""" + return np.max(abs(pdf1 - pdf2)) + + def hist_intersect(pdf1: np.ndarray, pdf2: np.ndarray) -> float: + """Compute Histogram Intersection.""" + pdf1 = pdf1.astype(np.float32) + pdf2 = pdf2.astype(np.float32) + return cv2.compareHist(pdf1, pdf2, cv2.HISTCMP_INTERSECT) + + def cosine_d(pdf1: np.ndarray, pdf2: np.ndarray) -> float: + """Compute cosine distance.""" + return distance.cosine(pdf1, pdf2) + + def canberra(pdf1: np.ndarray, pdf2: np.ndarray) -> float: + """Compute Canberra distance.""" + return distance.canberra(pdf1, pdf2) + + def correlation(pdf1: np.ndarray, pdf2: np.ndarray) -> float: + """Compute Correlation.""" + pdf1 = pdf1.astype(np.float32) + pdf2 = pdf2.astype(np.float32) + return cv2.compareHist(pdf1, pdf2, cv2.HISTCMP_CORREL) + + def chi_square(pdf1: np.ndarray, pdf2: np.ndarray) -> float: + """Compute Chi Square.""" + pdf1 = pdf1.astype(np.float32) + pdf2 = pdf2.astype(np.float32) + return cv2.compareHist(pdf1, pdf2, cv2.HISTCMP_CHISQR) + + def bhattacharya(pdf1: np.ndarray, pdf2: np.ndarray) -> float: + """Compute Bhattacharya Distance.""" + pdf1 = pdf1.astype(np.float32) + pdf2 = pdf2.astype(np.float32) + return cv2.compareHist(pdf1, pdf2, cv2.HISTCMP_BHATTACHARYYA) + + ### CDF input + + def ks_divergence(cdf1: np.ndarray, cdf2: np.ndarray) -> float: + """Compute KS Divergence.""" + return np.max(abs(cdf1 - cdf2)) + + def match(cdf1: np.ndarray, cdf2: np.ndarray) -> float: + """Compute Match Distance.""" + return np.sum(abs(cdf1 - cdf2)) + + def cvm(cdf1: np.ndarray, cdf2: np.ndarray) -> float: + """Compute CVM Distance.""" + return np.sum((cdf1 - cdf2) ** 2) + + def ws_d(cdf1: np.ndarray, cdf2: np.ndarray) -> float: + """Compute Wasserstein Distance.""" + return scipy.stats.wasserstein_distance(cdf1, cdf2) + + ### metrics that take pdf 
input + psi_value = np.sum( + sub_psi(expected_percents[i], actual_percents[i]) + for i in range(0, len(expected_percents)) + ) + + kld_value = np.sum( + sub_kld(expected_percents[i], actual_percents[i]) + for i in range(0, len(expected_percents)) + ) + + jsd_value = sub_jsd(expected_percents, actual_percents) + + errors = evaluate_all(expected_percents, actual_percents) + + ### metrics that take cdf input + + wd_value = ws_d(cdf1, cdf2) + + return ( + hist_intersect(pdf1, pdf2), + correlation(pdf1, pdf2), + chi_square(pdf1, pdf2), + bhattacharya(pdf1, pdf2), + l1(pdf1, pdf2), + l2(pdf1, pdf2), + linfinity(pdf1, pdf2), + cosine_d(pdf1, pdf2), + canberra(pdf1, pdf2), + ks_divergence(cdf1, cdf2), + match(cdf1, cdf2), + cvm(cdf1, cdf2), + psi_value, + kld_value, + jsd_value, + wd_value, + errors, + ) + + +def feature_evaluation( # noqa C901 + gt_dir: Path, + pred_dir: Path, + combine_labels: Optional[bool], + file_pattern: str, + single_out_file: Optional[bool], + out_dir: Path, +) -> None: + """Generate evaluation metrics of ground truth and predicted images. + + Args: + gt_dir: Ground truth directory + pred_dir: Predicted directory + combine_labels: Calculate no of bins by combining GT and Predicted Labels + file_pattern: Pattern to parse data + single_out_file: Outputs in single combined or in separate files. + out_dir: Output directory. + """ + fp = filepattern.FilePattern(gt_dir, file_pattern) + + if single_out_file: + lst: list[Any] = [] + + header = [ + "Image", + "features", + "histogram intersection", + "correlation", + "chi square", + "bhattacharya distance", + "L1 score", + "L2 score", + "L infinity score", + "cosine distance", + "canberra distance", + "ks divergence", + "match distance", + "cvm distance", + "psi value", + "kl divergence", + "js divergence", + "wasserstein distance", + "Mean square error", + "Root mean square error", + "Normalized Root Mean Squared Error", + "Mean Error", + "Mean Absolute Error", + "Geometric Mean Absolute Error", + "Median Absolute Error", + "Mean Percentage Error", + "Mean Absolute Percentage Error", + "Median Absolute Percentage Error", + "Symmetric Mean Absolute Percentage Error", + "Symmetric Median Absolute Percentage Error", + "Mean Arctangent Absolute Percentage Error", + "Normalized Absolute Error", + "Normalized Absolute Percentage Error", + "Root Mean Squared Percentage Error", + "Root Median Squared Percentage Error", + "Integral Normalized Root Squared Error", + "Root Relative Squared Error", + "Relative Absolute Error (aka Approximation Error)", + "Mean Directional Accuracy", + ] + for file in fp(): + file_path = file[1][0] + file_name = file[1][0].name + if file[1][0].name.endswith((".csv", ".feather", ".arrow")): + df_gt = convert_vaex_dataframe(file_path) + + pred_fpath = Path(pred_dir, file_name) + if not pred_fpath.exists(): + continue + df_pred = convert_vaex_dataframe(pred_fpath) + + feature_list = [ + feature + for feature in df_gt.get_column_names() + if feature not in ["mask_image", "intensity_image", "label"] + if feature in df_pred.get_column_names() + ] + if not single_out_file: + lst = [] + + for feature in feature_list: + z_gt = df_gt[f"{feature}"].values + z_pred = df_pred[f"{feature}"].values + z_gt = np.array(z_gt, dtype=float) + z_pred = np.array(z_pred, dtype=float) + z_gt = z_gt[~np.isnan(z_gt)] + z_pred = z_pred[~np.isnan(z_pred)] + predsize = 0 + if z_pred.size > predsize and z_gt.size > predsize: + logger.info(f"evaluating feature {feature} for {file_name}") + expected_array = z_gt + actual_array = z_pred + if 
combine_labels: + combined = np.concatenate((actual_array, expected_array)) + q1 = np.quantile(combined, 0.25) + q3 = np.quantile(combined, 0.75) + iqr = q3 - q1 + bin_width = (2 * iqr) / (len(combined) ** (1 / 3)) + if bin_width == float(0.0) or np.isnan(bin_width): + continue + bin_count = np.ceil((combined.max() - combined.min()) / (bin_width)) + else: + q1 = np.quantile(expected_array, 0.25) + q3 = np.quantile(expected_array, 0.75) + iqr = q3 - q1 + bin_width = (2 * iqr) / (len(expected_array) ** (1 / 3)) + if bin_width == float(0.0) or np.isnan(bin_width): + continue + bin_count = np.ceil( + (expected_array.max() - expected_array.min()) / (bin_width), + ) + if bin_count > 2**16 or np.isnan(bin_count) or bin_count == 0: + continue + bin_count = int(bin_count) + + ( + hist_intersect, + correlation, + chi_square, + bhattacharya, + l1, + l2, + linfinity, + cosine_d, + canberra, + ks_divergence, + match, + cvm, + psi_value, + kld_value, + jsd_value, + wd_value, + errors, + ) = comparison(z_gt, z_pred, bin_count) + data_result = [ + file_name, + feature, + hist_intersect, + correlation, + chi_square, + bhattacharya, + l1, + l2, + linfinity, + cosine_d, + canberra, + ks_divergence, + match, + cvm, + psi_value, + kld_value, + jsd_value, + wd_value, + errors.get("mse"), + errors.get("rmse"), + errors.get("nrmse"), + errors.get("me"), + errors.get("mae"), + errors.get("gmae"), + errors.get("mdae"), + errors.get("mpe"), + errors.get("mape"), + errors.get("mdape"), + errors.get("smape"), + errors.get("smdape"), + errors.get("maape"), + errors.get("std_ae"), + errors.get("std_ape"), + errors.get("rmspe"), + errors.get("rmdspe"), + errors.get("inrse"), + errors.get("rrse"), + errors.get("rae"), + errors.get("mda"), + ] + lst.append(data_result) + + if not single_out_file: + df = vaex.from_pandas(pd.DataFrame(lst, columns=header)) + outname = file_name.split(".")[0] + POLUS_TAB_EXT + write_outfile(df, Path(out_dir, outname)) + + if single_out_file: + df = vaex.from_pandas(pd.DataFrame(lst, columns=header)) + outname = "result" + POLUS_TAB_EXT + write_outfile(df, Path(out_dir, outname)) + + logger.info("evaluation complete.") diff --git a/features/feature-segmentation-eval-tool/src/polus/tabular/features/feature_segmentation_eval/metrics.py b/features/feature-segmentation-eval-tool/src/polus/tabular/features/feature_segmentation_eval/metrics.py new file mode 100644 index 0000000..494f52e --- /dev/null +++ b/features/feature-segmentation-eval-tool/src/polus/tabular/features/feature_segmentation_eval/metrics.py @@ -0,0 +1,247 @@ +"""Feature segmentation evaluation package.""" +## Source: https://gist.github.com/bshishov/5dc237f59f019b26145648e2124ca1c9 + +import logging +from typing import Optional + +import numpy as np + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +EPSILON = 1e-10 + + +def _error(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Simple error.""" + return actual - predicted + + +def _percentage_error(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Percentage error. + + Note: result is NOT multiplied by 100. 
+ """ + return _error(actual, predicted) / (actual + EPSILON) + + +def _geometric_mean( + a: np.ndarray, + axis: Optional[int] = 0, + dtype: Optional[np.dtype] = None, +) -> np.ndarray: + """Geometric mean.""" + if not isinstance(a, np.ndarray): # if not an ndarray object attempt to convert it + log_a = np.log(np.array(a, dtype=dtype)) + elif dtype: # Must change the default dtype allowing array type + if isinstance(a, np.ma.MaskedArray): + log_a = np.log(np.ma.asarray(a, dtype=dtype)) + else: + log_a = np.log(np.asarray(a, dtype=dtype)) + else: + log_a = np.log(a) + return np.exp(log_a.mean(axis=axis)) + + +def mse(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Mean Squared Error.""" + return np.mean(np.square(_error(actual, predicted))) + + +def rmse(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Root Mean Squared Error.""" + return np.sqrt(mse(actual, predicted)) + + +def nrmse(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Normalized Root Mean Squared Error.""" + return rmse(actual, predicted) / (actual.max() - actual.min()) + + +def me(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Mean Error.""" + return np.mean(_error(actual, predicted)) + + +def mae(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Mean Absolute Error.""" + return np.mean(np.abs(_error(actual, predicted))) + + +mad = mae # Mean Absolute Deviation (it is the same as MAE) + + +def gmae(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Geometric Mean Absolute Error.""" + return _geometric_mean(np.abs(_error(actual, predicted))) + + +def mdae(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Median Absolute Error.""" + return np.median(np.abs(_error(actual, predicted))) + + +def mpe(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Mean Percentage Error.""" + return np.mean(_percentage_error(actual, predicted)) + + +def mape(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Mean Absolute Percentage Error. + + Properties: + + Easy to interpret + + Scale independent + - Biased, not symmetric + - Undefined when actual[t] == 0 + Note: result is NOT multiplied by 100. + """ + return np.mean(np.abs(_percentage_error(actual, predicted))) + + +def mdape(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Median Absolute Percentage Error. + + Note: result is NOT multiplied by 100. + """ + return np.median(np.abs(_percentage_error(actual, predicted))) + + +def smape(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Symmetric Mean Absolute Percentage Error. + + Note: result is NOT multiplied by 100. + """ + return np.mean( + 2.0 + * np.abs(actual - predicted) + / ((np.abs(actual) + np.abs(predicted)) + EPSILON), + ) + + +def smdape(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Symmetric Median Absolute Percentage Error. + + Note: result is NOT multiplied by 100. + """ + return np.median( + 2.0 + * np.abs(actual - predicted) + / ((np.abs(actual) + np.abs(predicted)) + EPSILON), + ) + + +def maape(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Mean Arctangent Absolute Percentage Error. + + Note: result is NOT multiplied by 100. 
+ """ + return np.mean(np.arctan(np.abs((actual - predicted) / (actual + EPSILON)))) + + +def std_ae(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Normalized Absolute Error.""" + __mae = mae(actual, predicted) + return np.sqrt( + np.sum(np.square(_error(actual, predicted) - __mae)) / (len(actual) - 1), + ) + + +def std_ape(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Normalized Absolute Percentage Error.""" + __mape = mape(actual, predicted) + return np.sqrt( + np.sum(np.square(_percentage_error(actual, predicted) - __mape)) + / (len(actual) - 1), + ) + + +def rmspe(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Root Mean Squared Percentage Error. + + Note: result is NOT multiplied by 100. + """ + return np.sqrt(np.mean(np.square(_percentage_error(actual, predicted)))) + + +def rmdspe(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Root Median Squared Percentage Error. + + Note: result is NOT multiplied by 100. + """ + return np.sqrt(np.median(np.square(_percentage_error(actual, predicted)))) + + +def inrse(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Integral Normalized Root Squared Error.""" + return np.sqrt( + np.sum(np.square(_error(actual, predicted))) + / np.sum(np.square(actual - np.mean(actual))), + ) + + +def rrse(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Root Relative Squared Error.""" + return np.sqrt( + np.sum(np.square(actual - predicted)) + / np.sum(np.square(actual - np.mean(actual))), + ) + + +def rae(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Relative Absolute Error (aka Approximation Error).""" + return np.sum(np.abs(actual - predicted)) / ( + np.sum(np.abs(actual - np.mean(actual))) + EPSILON + ) + + +def mda(actual: np.ndarray, predicted: np.ndarray) -> np.ndarray: + """Mean Directional Accuracy.""" + return np.mean( + ( + np.sign(actual[1:] - actual[:-1]) == np.sign(predicted[1:] - predicted[:-1]) + ).astype(int), + ) + + +METRICS = { + "mse": mse, + "rmse": rmse, + "nrmse": nrmse, + "me": me, + "mae": mae, + "mad": mad, + "gmae": gmae, + "mdae": mdae, + "mpe": mpe, + "mape": mape, + "mdape": mdape, + "smape": smape, + "smdape": smdape, + "maape": maape, + "std_ae": std_ae, + "std_ape": std_ape, + "rmspe": rmspe, + "rmdspe": rmdspe, + "inrse": inrse, + "rrse": rrse, + "rae": rae, + "mda": mda, +} + + +def evaluate(actual: np.ndarray, predicted: np.ndarray, metrics: dict) -> dict: + """Compute error metrics.""" + results = {} + for name in metrics: + try: + results[name] = METRICS[name](actual, predicted) + except ValueError as err: + results[name] = np.nan + logger.info(f"Unable to compute metric {name}: {err}") + return results + + +def evaluate_all(actual: np.ndarray, predicted: np.ndarray) -> dict: + """Compute all metrics.""" + return evaluate(actual, predicted, metrics=set(METRICS.keys())) # type: ignore diff --git a/features/feature-segmentation-eval-tool/tests/__init__.py b/features/feature-segmentation-eval-tool/tests/__init__.py new file mode 100644 index 0000000..51fdd37 --- /dev/null +++ b/features/feature-segmentation-eval-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Feature segmentation evaluation package.""" diff --git a/features/feature-segmentation-eval-tool/tests/conftest.py b/features/feature-segmentation-eval-tool/tests/conftest.py new file mode 100644 index 0000000..ff35427 --- /dev/null +++ b/features/feature-segmentation-eval-tool/tests/conftest.py @@ -0,0 +1,90 @@ +"""Test fixtures. + +Set up all data used in tests. 
+""" +import tempfile +from pathlib import Path +from typing import Union + +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture() +def gt_dir() -> Union[str, Path]: + """Create directory for groundtruth features data.""" + return Path(tempfile.mkdtemp(dir=Path.cwd())) + + +@pytest.fixture() +def pred_dir() -> Union[str, Path]: + """Create directory for predicted features data.""" + return Path(tempfile.mkdtemp(dir=Path.cwd())) + + +@pytest.fixture() +def output_directory() -> Union[str, Path]: + """Create output directory.""" + return Path(tempfile.mkdtemp(dir=Path.cwd())) + + +@pytest.fixture( + params=[ + (".csv", 500, True, True), + (".arrow", 100, True, False), + (".csv", 1000, False, True), + (".csv", 10000, True, False), + ], +) +def params(request: pytest.FixtureRequest) -> pytest.FixtureRequest: + """To get the parameter of the fixture.""" + return request.param + + +@pytest.fixture() +def generate_data( + gt_dir: Union[str, Path], + pred_dir: Union[str, Path], + params: pytest.FixtureRequest, +) -> tuple[Union[str, Path], Union[str, Path]]: + """Creating dataset for groundtruth and prediction.""" + file_ext, size, _, _ = params + df_size = size + rng = np.random.default_rng(42) + + diction_1 = { + "intensity_image": list(np.repeat("p0_y1_r19_c0.ome.tif", df_size)), + "mask_image": list(np.repeat("p0_y1_r19_c0.ome.tif", df_size)), + "label": list(range(1, df_size + 1)), + "INTEGRATED_INTENSITY": rng.uniform(0.0, 6480.0, size=df_size), + "MEAN": rng.uniform(0.0, 43108.5, size=df_size), + "UNIFORMITY": rng.normal(0.0, 1.0, size=df_size), + "P01": rng.integers(low=1, high=10, size=df_size), + "POLYGONALITY_AVE": list(np.repeat(0, df_size)), + } + df_size = round(size / 1.2) + + diction_2 = { + "intensity_image": list(np.repeat("p0_y1_r01_c0.ome.tif", df_size)), + "mask_image": list(np.repeat("p0_y1_r01_c0.ome.tif", df_size)), + "label": list(range(1, df_size + 1)), + "INTEGRATED_INTENSITY": rng.uniform(0.0, 8000.0, size=df_size), + "MEAN": rng.uniform(0.0, 6000.5, size=df_size), + "UNIFORMITY": rng.normal(0.0, 0.5, size=df_size), + "P01": rng.integers(low=1, high=20, size=df_size), + "POLYGONALITY_AVE": list(np.repeat(0, df_size)), + } + df1 = pd.DataFrame(diction_1) + df2 = pd.DataFrame(diction_2) + if file_ext == ".csv": + for i in range(5): + df1.to_csv(Path(gt_dir, f"p0_y1_r0{i}_c0.csv"), index=False) + df2.to_csv(Path(pred_dir, f"p0_y1_r0{i}_c0.csv"), index=False) + + if file_ext == ".arrow": + for i in range(5): + df1.to_feather(Path(gt_dir, f"p0_y1_r0{i}_c0.arrow")) + df2.to_feather(Path(pred_dir, f"p0_y1_r0{i}_c0.arrow")) + + return gt_dir, pred_dir diff --git a/features/feature-segmentation-eval-tool/tests/test_cli.py b/features/feature-segmentation-eval-tool/tests/test_cli.py new file mode 100644 index 0000000..e54bcfd --- /dev/null +++ b/features/feature-segmentation-eval-tool/tests/test_cli.py @@ -0,0 +1,41 @@ +"""Test Command line Tool.""" +import shutil +from pathlib import Path +from typing import Union + +from polus.tabular.features.feature_segmentation_eval.__main__ import app +from typer.testing import CliRunner + + +def clean_directories() -> None: + """Remove all temporary directories.""" + for d in Path(".").cwd().iterdir(): + if d.is_dir() and d.name.startswith("tmp"): + shutil.rmtree(d) + + +def test_cli( + generate_data: tuple[Union[Path, str], Union[Path, str]], + output_directory: Union[str, Path], +) -> None: + """Test the command line.""" + runner = CliRunner() + gt_dir, pred_dir = generate_data + result = runner.invoke( + app, + [ 
+ "--GTDir", + gt_dir, + "--PredDir", + pred_dir, + "--filePattern", + ".*.csv", + "--combineLabels", + "--singleOutFile", + "--outDir", + output_directory, + ], + ) + + assert result.exit_code == 0 + clean_directories() diff --git a/features/feature-segmentation-eval-tool/tests/test_feature_single.py b/features/feature-segmentation-eval-tool/tests/test_feature_single.py new file mode 100644 index 0000000..d80376f --- /dev/null +++ b/features/feature-segmentation-eval-tool/tests/test_feature_single.py @@ -0,0 +1,40 @@ +"""Test feature segmentation evaluation package.""" +import shutil +from pathlib import Path +from typing import Union + +import polus.tabular.features.feature_segmentation_eval.feature_evaluation as fs +import pytest +import vaex + + +def clean_directories() -> None: + """Remove all temporary directories.""" + for d in Path(".").cwd().iterdir(): + if d.is_dir() and d.name.startswith("tmp"): + shutil.rmtree(d) + + +def test_feature_evaluation( + generate_data: tuple[Path, Path], + output_directory: Union[str, Path], + params: pytest.FixtureRequest, +) -> None: + """Test calculating metrics for predicted and ground truth histograms.""" + _, _, combinelabels, single_outfile = params + gt_dir, pred_dir = generate_data + fs.feature_evaluation( + gt_dir=gt_dir, + pred_dir=pred_dir, + combine_labels=combinelabels, + file_pattern=".*", + single_out_file=single_outfile, + out_dir=output_directory, + ) + + for file in list(Path(output_directory).rglob("*")): + df = vaex.open(file) + num_columns = 39 + assert len(df.columns) == num_columns + assert (df.shape[0]) != 0 + clean_directories() diff --git a/features/polus-csv-statistics-plugin/Dockerfile b/features/polus-csv-statistics-plugin/Dockerfile new file mode 100644 index 0000000..d6b8f9f --- /dev/null +++ b/features/polus-csv-statistics-plugin/Dockerfile @@ -0,0 +1,17 @@ +FROM polusai/bfio:2.1.9 + +COPY VERSION / + +ARG EXEC_DIR="/opt/executables" +ARG DATA_DIR="/data" + +RUN mkdir -p ${EXEC_DIR} \ + && mkdir -p ${DATA_DIR}/inputs \ + && mkdir ${DATA_DIR}/outputs + +COPY src ${EXEC_DIR}/ +WORKDIR ${EXEC_DIR} + +RUN pip3 install -r ${EXEC_DIR}/requirements.txt + +ENTRYPOINT ["python3", "/opt/executables/main.py"] \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/README.md b/features/polus-csv-statistics-plugin/README.md new file mode 100644 index 0000000..51ac1c4 --- /dev/null +++ b/features/polus-csv-statistics-plugin/README.md @@ -0,0 +1,37 @@ +# CSV Statistics + +This WIPP plugin performs statistics on values in each column of a csv file if the data is numeric. Rows of data are grouped together by rows that have a matching value in a column with header named `file`. If no columns have the `file` header, then this plugin throws and error. + +Available statistics are: + +1. [mean (arithmetic mean)](https://en.wikipedia.org/wiki/Mean#Arithmetic_mean_(AM)) +2. [median](https://en.wikipedia.org/wiki/Median#The_sample_median) +3. [std (standard deviation)](https://en.wikipedia.org/wiki/Standard_deviation) +4. [var (variance)](https://en.wikipedia.org/wiki/Variance) +5. [skew (Fisher-Pearson skewness)](https://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm) +6. [kurt (excess kurtosis)](https://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm) +7. count (number of rows sampled) +8. [iqr (Interquartile_range)](https://en.wikipedia.org/wiki/Interquartile_range) + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). 
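+
+As a point of reference, the grouping-and-aggregation behaviour described above can be approximated with pandas. This is only an illustration, not the plugin's implementation (pandas is not a dependency of this plugin, and `features.csv` plus the chosen subset of statistics are hypothetical):
+
+```python
+"""Minimal sketch of grouped column statistics, assuming a csv with a `file` column."""
+import pandas as pd
+
+
+def iqr(s: pd.Series) -> float:
+    """Interquartile range of a numeric column."""
+    return s.quantile(0.75) - s.quantile(0.25)
+
+
+df = pd.read_csv("features.csv")  # hypothetical input file
+
+# Keep the grouping key plus the numeric columns, then aggregate per source csv file.
+numeric = df[["file"]].join(df.drop(columns=["file"]).select_dtypes("number"))
+summary = numeric.groupby("file").agg(["mean", "median", "std", "var", "count", iqr])
+summary.to_csv("csv_statistics.csv")
+```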
+ +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes two input argument and one output argument: + +| Name | Description | I/O | Type | +| --------------- | --------------------------------------------------- | ------ | ------------- | +| `--statistics` | Types of statistics to calculate | Input | array | +| `--inpDir` | Input csv collection to be processed by this plugin | Input | csvCollection | +| `--filePattern` | The filePattern of the images in represented in csv | Input | string | +| `--groupBy` | The variable(s) of how the images should be grouped | Input | string | +| `--outDir` | Output collection | Output | csvCollection | diff --git a/features/polus-csv-statistics-plugin/VERSION b/features/polus-csv-statistics-plugin/VERSION new file mode 100644 index 0000000..7dff5b8 --- /dev/null +++ b/features/polus-csv-statistics-plugin/VERSION @@ -0,0 +1 @@ +0.2.1 \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/build-docker.sh b/features/polus-csv-statistics-plugin/build-docker.sh new file mode 100755 index 0000000..ff8f13c --- /dev/null +++ b/features/polus-csv-statistics-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$( fcheck: + fcheck += 1 + logger.info('Unique Files parsed: {}'.format(fnum)) \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/src/requirements.txt b/features/polus-csv-statistics-plugin/src/requirements.txt new file mode 100644 index 0000000..6dd96c6 --- /dev/null +++ b/features/polus-csv-statistics-plugin/src/requirements.txt @@ -0,0 +1 @@ +filepattern==1.4.7 \ No newline at end of file diff --git a/formats/tabular-converter-tool/.bumpversion.cfg b/formats/tabular-converter-tool/.bumpversion.cfg new file mode 100644 index 0000000..e5eeb10 --- /dev/null +++ b/formats/tabular-converter-tool/.bumpversion.cfg @@ -0,0 +1,31 @@ +[bumpversion] +current_version = 0.1.2-dev1 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:README.md] + +[bumpversion:file:ict.yaml] + +[bumpversion:file:VERSION] + +[bumpversion:file:src/polus/tabular/formats/tabular_converter/__init__.py] diff --git a/formats/tabular-converter-tool/.gitignore b/formats/tabular-converter-tool/.gitignore new file mode 100644 index 0000000..e891280 --- /dev/null +++ b/formats/tabular-converter-tool/.gitignore @@ -0,0 +1,175 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +poetry.lock +../../poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/
+
+# vscode
+.vscode
+
+# test data directory
+data
+# yaml file
+.pre-commit-config.yaml
+
+# hidden files
+.DS_Store
+.ds_store
+# flake8
+.flake8
diff --git a/formats/tabular-converter-tool/Dockerfile b/formats/tabular-converter-tool/Dockerfile
new file mode 100644
index 0000000..f33ed02
--- /dev/null
+++ b/formats/tabular-converter-tool/Dockerfile
@@ -0,0 +1,20 @@
+FROM polusai/bfio:2.3.6
+
+# environment variables defined in polusai/bfio
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_IMG_EXT=".ome.tif"
+ENV POLUS_TAB_EXT=".csv"
+
+# Work directory defined in the base container
+WORKDIR ${EXEC_DIR}
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+RUN pip3 install --index-url https://test.pypi.org/simple/ filepattern==2.2.7
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+
+ENTRYPOINT ["python3", "-m", "polus.tabular.formats.tabular_converter"]
+CMD ["--help"]
diff --git a/formats/tabular-converter-tool/README.md b/formats/tabular-converter-tool/README.md
new file mode 100644
index 0000000..dc5197f
--- /dev/null
+++ b/formats/tabular-converter-tool/README.md
@@ -0,0 +1,41 @@
+# Tabular Converter (v0.1.2-dev1)
+
+This WIPP plugin converts tabular data to the `arrow` file format and vice versa. Currently, this plugin handles only vaex-supported file formats.
+This plugin supports the following file formats, which are convertible into the `arrow` file format:
+
+1. fcs
+2. csv
+3. hdf5
+4. fits
+5. parquet
+6. feather
+
+However, the `arrow` file format is convertible back to all of these formats except `fcs` and `fits`.
+Support for additional file formats will be added in the future.
+
+
+Contact [Kelechi Nina Mezu](mailto:nina.mezu@nih.gov), [Hamdah Shafqat Abbasi](mailto:hamdahshafqat.abbasi@nih.gov) for more information.
+
+For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp).
+
+## Building
+
+To build the Docker image for the conversion plugin, run
+`bash build-docker.sh`.
+
+## Install WIPP Plugin
+
+If WIPP is running, navigate to the plugins page and add a new plugin. Paste the
+contents of `plugin.json` into the pop-up window and submit.
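+
+## Conversion sketch
+
+Under the hood the conversion is vaex-based. The snippet below is only a rough sketch of the round trip the plugin performs (the file names are hypothetical; in practice the tool is driven by the CLI options listed below):
+
+```python
+"""Sketch of a csv -> arrow -> csv round trip with vaex."""
+import vaex
+
+# Tabular -> arrow: read the csv in chunks and export as a feather/arrow file.
+df = vaex.from_csv("example.csv", convert=True, chunk_size=5_000_000)
+df.export_feather("example.arrow")
+
+# Arrow -> tabular: open the arrow file and export to another supported format.
+round_trip = vaex.open("example.arrow")
+round_trip.export_csv("example_roundtrip.csv")
+```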
+ +## Options + +This plugin takes two input arguments and one output argument: + +| Name | Description | I/O | Type | +|---------------|-------------------------|--------|--------| +| `--inpDir` | Input generic data collection to be processed by this plugin | Input | genericData | +| `--filePattern` | Pattern to parse tabular files | Input | string | +| `--fileExtension` | Desired pattern to convert | Input | string | +| `--outDir` | Output collection | Output | genericData | +| `--preview` | Generate JSON file with outputs | Output | JSON | diff --git a/formats/tabular-converter-tool/VERSION b/formats/tabular-converter-tool/VERSION new file mode 100644 index 0000000..12fd03c --- /dev/null +++ b/formats/tabular-converter-tool/VERSION @@ -0,0 +1 @@ +0.1.2-dev1 diff --git a/formats/tabular-converter-tool/build-docker.sh b/formats/tabular-converter-tool/build-docker.sh new file mode 100644 index 0000000..fed7c5a --- /dev/null +++ b/formats/tabular-converter-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", +"hamshkhawar " +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.4" +typer = "^0.7.0" +tqdm = "^4.64.1" +pyarrow = "^11.0.0" +blake3 = "^0.3.3" +fcsparser = "^0.2.4" +vaex = "^4.7.0" + + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.1.0" +black = "^23.1.0" +flake8 = "^6.0.0" +mypy = "^1.0.1" +pytest = "^7.2.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/formats/tabular-converter-tool/run-plugin.sh b/formats/tabular-converter-tool/run-plugin.sh new file mode 100644 index 0000000..00d2e44 --- /dev/null +++ b/formats/tabular-converter-tool/run-plugin.sh @@ -0,0 +1,26 @@ +#!/bin/bash +version=$( None: + """Execute Main function.""" + logger.info(f"inpDir = {inp_dir}") + logger.info(f"outDir = {out_dir}") + logger.info(f"filePattern = {file_pattern}") + logger.info(f"fileExtension = {file_extension}") + + assert inp_dir.exists(), f"{inp_dir} doesnot exist!! Please check input path again" + assert out_dir.exists(), f"{out_dir} doesnot exist!! 
Please check output path again" + + file_pattern = ".*" + file_pattern + + fps = fp.FilePattern(inp_dir, file_pattern) + + if preview: + with open(pathlib.Path(out_dir, "preview.json"), "w") as jfile: + out_json: dict[str, Any] = { + "filepattern": file_pattern, + "outDir": [], + } + for file in fps: + out_name = str(file[1][0].stem) + file_pattern + out_json["outDir"].append(out_name) + json.dump(out_json, jfile, indent=2) + + processes = [] + with ProcessPoolExecutor(max_workers) as executor: + for files in fps: + file = files[1][0] + tab = tc.ConvertTabular(file, file_extension, out_dir) + if files[1][0].suffix == ".fcs": + processes.append(executor.submit(tab.fcs_to_arrow)) + elif files[1][0].suffix == ".arrow": + processes.append(executor.submit(tab.arrow_to_tabular)) + else: + processes.append(executor.submit(tab.df_to_arrow)) + + for f in tqdm( + as_completed(processes), + desc=f"converting tabular data to {file_pattern}", + total=len(processes), + ): + f.result() + + tab.remove_files() + + logger.info("Finished all processes!") + + +if __name__ == "__main__": + app() diff --git a/formats/tabular-converter-tool/src/polus/tabular/formats/tabular_converter/tabular_converter.py b/formats/tabular-converter-tool/src/polus/tabular/formats/tabular_converter/tabular_converter.py new file mode 100644 index 0000000..9303907 --- /dev/null +++ b/formats/tabular-converter-tool/src/polus/tabular/formats/tabular_converter/tabular_converter.py @@ -0,0 +1,158 @@ +"""Tabular Converter.""" +import enum +import logging +import os +import pathlib + +import fcsparser +import vaex + +logger = logging.getLogger(__name__) + +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") + + +class Extensions(str, enum.Enum): + """Extension types to be converted.""" + + FITS = ".fits" + FEATHER = ".feather" + PARQUET = ".parquet" + HDF = ".hdf5" + FCS = ".fcs" + CSV = ".csv" + ARROW = ".arrow" + Default = POLUS_TAB_EXT + + +class ConvertTabular: + """Convert vaex supported file formats into Arrow data format and vice versa. + + Args: + file: Path to input file. + file_extension : Desired ouput file extension. + out_dir: Path to save the output csv file. + """ + + def __init__( + self, file: pathlib.Path, file_extension: Extensions, out_dir: pathlib.Path + ): + """Define Instance attributes.""" + self.file = file + self.out_dir = out_dir + self.file_extension = file_extension + self.output_file = pathlib.Path( + self.out_dir, (self.file.stem + self.file_extension) + ) + + def csv_to_df(self) -> vaex.DataFrame: + """Convert csv into datafram or hdf5 file.""" + logger.info("csv_to_df: Copy csv file into out_dir for processing...") + logger.info("csv_to_df: Checking size of csv file...") + # Open csv file and count rows in file + with open(self.file, encoding="utf-8") as fr: + ncols = len(fr.readline().split(",")) + chunk_size = max([2**24 // ncols, 1]) + logger.info("csv_to_df: # of columns are: " + str(ncols)) + # Convert large csv files to hdf5 if more than 1,000,000 rows + logger.info("csv_to_df: converting file into hdf5 format") + df = vaex.from_csv(self.file, convert=True, chunk_size=chunk_size) + return df + + def binary_to_df(self) -> vaex.DataFrame: + """Convert any binary formats into vaex dataframe.""" + binary_patterns = [".fits", ".feather", ".parquet", ".hdf5", ".arrow"] + logger.info("binary_to_df: Scanning directory for binary file pattern... 
") + if self.file_extension in binary_patterns: + # convert hdf5 to vaex df + df = vaex.open(self.file) + return df + else: + raise FileNotFoundError( + "No supported binary file extensions were found in the directory. Please check file directory again." + ) + + def fcs_to_arrow(self) -> None: + """Convert fcs file to csv. Copied from polus-fcs-to-csv-converter plugin.""" + logger.info( + "fcs_to_feather : Begin parsing data out of .fcs file" + self.file.stem + ) + # Use fcsparser to parse data into python dataframe + _, data = fcsparser.parse(self.file, meta_data_only=False, reformat_meta=True) + + # Export the fcs data to vaex df + logger.info("fcs_to_feather: converting data to vaex dataframe...") + df = vaex.from_pandas(data) + logger.info("fcs_to_feather: writing file...") + logger.info( + "fcs_to_feather: Writing Vaex Dataframe to Feather File Format for:" + + self.file.stem + ) + df.export_feather(self.output_file) + + def df_to_arrow(self) -> None: + """Convert vaex dataframe to Arrow feather file.""" + logger.info("df_to_feather: Scanning input directory files... ") + if self.file_extension == ".csv": + # convert csv to vaex df or hdf5 + df = self.csv_to_df() + else: + df = self.binary_to_df() + + logger.info("df_to_arrow: writing file...") + logger.info( + "df_to_arrow: Writing Vaex Dataframe to Feather File Format for:" + + self.file.stem + ) + df.export_feather(self.output_file) + + def remove_files(self) -> None: + """Delete intermediate files other than arrow and json files from output directory.""" + for f in self.out_dir.iterdir(): + extension_list = [ + ".arrow", + ".json", + ".feather", + ".csv", + ".hdf5", + ".fits", + ".fcs", + ".parquet", + ] + if f.suffix not in extension_list: + os.remove(f) + + logger.info("Done") + + def arrow_to_tabular(self) -> None: + """Convert Arrow file into tabular file. + + This function uses vaex to open an arrow file and converts into other vaex supported formats. + Note: At the moment [.csv, parquet, hdf5, feather] file formats are supported. + """ + data = vaex.open(self.file) + logger.info("Arrow Conversion: Copy ${self.file} into outDir for processing...") + ncols = len(data) + chunk_size = max([2**24 // ncols, 1]) + logger.info("Arrow Conversion: checking for file format") + + if self.file_extension == ".csv": + logger.info("Arrow Conversion: Converting PyArrow Table into .csv file") + # Streaming contents of Arrow Table into csv + return data.export_csv(self.output_file, chunksize=chunk_size) + + elif self.file_extension == ".parquet": + logger.info("Arrow Conversion: Converting PyArrow Table into .parquet file") + return data.export_parquet(self.output_file) + + elif self.file_extension == ".hdf5": + logger.info("Arrow Conversion: Converting PyArrow Table into .hdf5") + return data.export_hdf5(self.output_file) + elif self.file_extension == ".feather": + logger.info("Arrow Conversion: Converting PyArrow Table into .hdf5") + return data.export_feather(self.output_file) + + else: + logger.error( + "Arrow Conversion Error: This format is not supported in this plugin" + ) diff --git a/formats/tabular-converter-tool/tabularconverter.cwl b/formats/tabular-converter-tool/tabularconverter.cwl new file mode 100644 index 0000000..9185bee --- /dev/null +++ b/formats/tabular-converter-tool/tabularconverter.cwl @@ -0,0 +1,32 @@ +class: CommandLineTool +cwlVersion: v1.2 +inputs: + fileExtension: + inputBinding: + prefix: --fileExtension + type: string + filePattern: + inputBinding: + prefix: --filePattern + type: string? 
+ inpDir: + inputBinding: + prefix: --inpDir + type: Directory + outDir: + inputBinding: + prefix: --outDir + type: Directory +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + DockerRequirement: + dockerPull: polusai/tabular-converter-tool:0.1.2-dev1 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/formats/tabular-converter-tool/tests/__init__.py b/formats/tabular-converter-tool/tests/__init__.py new file mode 100644 index 0000000..f8d42a1 --- /dev/null +++ b/formats/tabular-converter-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Testing of Tabular Converter plugin.""" diff --git a/formats/tabular-converter-tool/tests/test_main.py b/formats/tabular-converter-tool/tests/test_main.py new file mode 100644 index 0000000..67518ba --- /dev/null +++ b/formats/tabular-converter-tool/tests/test_main.py @@ -0,0 +1,173 @@ +"""Testing of Tabular Converter plugin.""" +import pathlib +import random +import shutil +import string +import tempfile + +import fcsparser +import filepattern as fp +import numpy as np +import pandas as pd +import pytest +import vaex +from astropy.table import Table +from polus.tabular.formats.tabular_converter import tabular_converter as tc + + +class Generatedata: + """Generate tabular data with several different file format.""" + + def __init__(self, file_pattern: str) -> None: + """Define instance attributes.""" + self.dirpath = pathlib.Path.cwd() + self.inp_dir = tempfile.mkdtemp(dir=self.dirpath) + self.out_dir = tempfile.mkdtemp(dir=self.dirpath) + self.file_pattern = file_pattern + self.x = self.create_dataframe() + + def get_inp_dir(self) -> str: + """Get input directory.""" + return self.inp_dir + + def get_out_dir(self) -> str: + """Get output directory.""" + return self.out_dir + + def create_dataframe(self) -> pd.core.frame.DataFrame: + """Create Pandas dataframe.""" + return pd.DataFrame( + { + "A": [random.choice(string.ascii_letters) for i in range(100)], + "B": np.random.randint(low=1, high=100, size=100), + "C": np.random.normal(0.0, 1.0, size=100), + }, + ) + + def fits_func(self) -> None: + """Convert pandas dataframe to fits file format.""" + ft = Table.from_pandas(self.x) + ft.write(pathlib.Path(self.inp_dir, "data.fits")) + + def fcs_func(self) -> None: + """Get the test example of fcs data.""" + fpath = fcsparser.test_sample_path + shutil.copy(fpath, self.inp_dir) + + def csv_func(self) -> None: + """Convert pandas dataframe to csv file format.""" + self.x.to_csv(pathlib.Path(self.inp_dir, "data.csv"), index=False) + + def parquet_func(self) -> None: + """Convert pandas dataframe to parquet file format.""" + self.x.to_parquet( + pathlib.Path(self.inp_dir, "data.parquet"), + engine="auto", + compression=None, + ) + + def feather_func(self) -> None: + """Convert pandas dataframe to feather file format.""" + self.x.to_feather(pathlib.Path(self.inp_dir, "data.feather")) + + def arrow_func(self) -> None: + """Convert pandas dataframe to Arrow file format.""" + self.x.to_feather(pathlib.Path(self.inp_dir, "data.arrow")) + + def hdf_func(self) -> None: + """Convert pandas dataframe to hdf5 file format.""" + v_df = vaex.from_pandas(self.x, copy_index=False) + v_df.export(pathlib.Path(self.inp_dir, "data.hdf5")) + + def __call__(self) -> None: + """To make a class callable.""" + data_ext = { + ".hdf5": self.hdf_func, + ".csv": self.csv_func, + ".parquet": self.parquet_func, + ".feather": self.feather_func, + ".fits": self.fits_func, 
+ ".fcs": self.fcs_func, + ".arrow": self.arrow_func, + } + + return data_ext[self.file_pattern]() + + def clean_directories(self): + """Remove files.""" + for d in self.dirpath.iterdir(): + if d.is_dir() and d.name.startswith("tmp"): + shutil.rmtree(d) + + +FILE_EXT = [[".hdf5", ".parquet", ".csv", ".feather", ".fits", ".fcs", ".arrow"]] + + +@pytest.fixture(params=FILE_EXT) +def poly(request): + """To get the parameter of the fixture.""" + return request.param + + +def test_tabular_coverter(poly): + """Testing of vaex supported inter conversion of tabular data.""" + for i in poly: + if i not in [".fcs", ".arrow"]: + d = Generatedata(i) + d() + pattern = f".*{i}" + fps = fp.FilePattern(d.get_inp_dir(), pattern) + for file in fps(): + print(file) + tab = tc.ConvertTabular(file[1][0], ".arrow", d.get_out_dir()) + tab.df_to_arrow() + + assert ( + all( + file[1][0].suffix + for file in fp.FilePattern(d.get_out_dir(), ".arrow") + ) + is True + ) + elif i == ".fcs": + d = Generatedata(".fcs") + d() + pattern = f".*{i}" + fps = fp.FilePattern(d.get_inp_dir(), pattern) + for file in fps(): + tab = tc.ConvertTabular(file[1][0], ".arrow", d.get_out_dir()) + tab.fcs_to_arrow() + + assert ( + all( + file[1][0].suffix + for file in fp.FilePattern(d.get_out_dir(), ".arrow") + ) + is True + ) + + elif i == ".arrow": + d = Generatedata(".arrow") + d() + pattern = f".*{i}" + fps = fp.FilePattern(d.get_inp_dir(), pattern) + extension_list = [ + ".feather", + ".csv", + ".hdf5", + ".parquet", + ] + for ext in extension_list: + for file in fps(): + tab = tc.ConvertTabular(file[1][0], ext, d.get_out_dir()) + tab.arrow_to_tabular() + + assert ( + all( + file[1][0].suffix + for file in fp.FilePattern(d.get_out_dir(), ext) + ) + is True + ) + + d.clean_directories() diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 0000000..dc282ac --- /dev/null +++ b/noxfile.py @@ -0,0 +1,26 @@ +"""Nox automation file.""" + +from nox import Session, session + +python_versions = ["3.9"] + + +@session(python=["3.9"]) +def export_ts(session: Session) -> None: + """Export Pydantic model as TypeScript object.""" + session.install("-r", "requirements-dev.txt") + + session.run( + "datamodel-codegen", + "--input", + "./polus/_plugins/models/PolusComputeSchema.json", + "--output", + "./polus/_plugins/models/PolusComputeSchema.py", + ) + session.run( + "pydantic2ts", + "--module", + "./polus/_plugins/models/PolusComputeSchema.py", + "--output", + "./polus/_plugins/models/PolusComputeSchema.ts", + ) diff --git a/package.json b/package.json new file mode 100644 index 0000000..78175fd --- /dev/null +++ b/package.json @@ -0,0 +1,30 @@ +{ + "name": "@polusai/tabular-tools", + "version": "0.1.1", + "description": "Monorepo for generic WIPP plugins", + "scripts": {}, + "repository": { + "type": "git", + "url": "git+https://github.com/polusAI/tabular-tools.git" + }, + "license": "MIT", + "bugs": { + "url": "https://github.com/polusAI/tabular-tools/issues" + }, + "homepage": "https://github.com/polusAI/tabular-tools#readme", + "devDependencies": { + "@commitlint/cli": "^8.2.0", + "@commitlint/config-conventional": "^8.2.0", + "husky": "^3.0.8" + }, + "husky": { + "hooks": { + "commit-msg": "commitlint -E HUSKY_GIT_PARAMS" + } + }, + "commitlint": { + "extends": [ + "@commitlint/config-conventional" + ] + } +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..2b40a69 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,63 @@ +[tool.poetry] +authors = ["Nicholas Schaub ", "Camilo 
Velez "] +description = "Python API to configure and run Polus Plugins." +license = "License :: OSI Approved :: MIT License" +maintainers = ["Camilo Velez "] +name = "polus-tabular" +packages = [{include = "polus", from = "src"}] +readme = "README.md" +repository = "https://github.com/PolusAI/tabular-tools" +version = "0.1.1" + +[tool.poetry.dependencies] +python = ">=3.9, <3.12" + +click = "^8.1.3" +cwltool = "^3.1.20230513155734" +fsspec = "^2023.6.0" +pydantic = ">=1.10.0" +pygithub = "^1.58.2" +python-on-whales = "^0.68.0" +pyyaml = "^6.0" +tqdm = "^4.65.0" +validators = "^0.22.0" +xmltodict = "^0.13.0" + +[tool.poetry.group.dev.dependencies] +python = ">=3.9, <3.12" + +black = "^23.3.0" +bump2version = "^1.0.1" +datamodel-code-generator = "^0.23.0" +flake8 = "^6.0.0" +fsspec = "^2023.1.0" +mypy = "^1.4.0" +nox = "^2022.11.21" +poetry = "^1.3.2" +pre-commit = "^3.3.3" +pydantic = ">=1.10" +pytest = "^7.3.2" +pytest-benchmark = "^4.0.0" +pytest-cov = "^4.1.0" +pytest-sugar = "^0.9.7" +pytest-xdist = "^3.3.1" +python-on-whales = "^0.68.0" +pyyaml = "^6.0" +ruff = "^0.0.274" +tqdm = "^4.64.1" +xmltodict = "^0.13.0" + +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.isort] +profile = "black" + +[tool.pytest.ini_options] +addopts = [ + "--import-mode=importlib", +] +markers = [ + "repo: marks tests that validate plugin.json files in local repo", +] diff --git a/ruff.toml b/ruff.toml index 20707d6..bf5e426 100644 --- a/ruff.toml +++ b/ruff.toml @@ -41,13 +41,15 @@ max-complexity = 12 [pydocstyle] convention = "google" -# Ignore `F401` (unused import violations) in all `__init__.py` files. [per-file-ignores] -"__init__.py" = ["F401"] -"__main__.py" = ["B008", "S101"] +"__init__.py" = ["F401"] # Unused import. +"__main__.py" = [ + "B008", + "S101", # Use of assert detected. +] "./**/tests/*.py" = [ "S101", # Use of assert detected. - "PLR2004", # Use of magic value in comparison. + "PLR2004", # Use of magic values detected. ] [isort] diff --git a/to_clt.py b/to_clt.py new file mode 100644 index 0000000..a2dd9b9 --- /dev/null +++ b/to_clt.py @@ -0,0 +1,108 @@ +# ruff: noqa +"""Script to convert all WIPP manifests to CLT. + +This script will first convert all WIPP manifests to ICT and then to CLT. +WIPP -> ICT -> CLT. 
+""" + +# pylint: disable=W0718, W1203 +import logging +from pathlib import Path + +import typer +from ict import ICT +from tqdm import tqdm + +app = typer.Typer(help="Convert WIPP manifests to ICT.") +ict_logger = logging.getLogger("ict") +fhandler = logging.FileHandler("clt_conversion.log") +fformat = logging.Formatter( + "%(asctime)s - %(levelname)s - %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p" +) +fhandler.setFormatter(fformat) +fhandler.setLevel("INFO") +ict_logger.setLevel("INFO") +ict_logger.addHandler(fhandler) +ict_logger.setLevel(logging.INFO) +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", +) +logger = logging.getLogger("wipp_to_clt") +logger.addHandler(fhandler) + +REPO_PATH = Path(__file__).parent +LOCAL_MANIFESTS = list(REPO_PATH.rglob("*plugin.json")) +logger.info(f"Found {len(LOCAL_MANIFESTS)} manifests in {REPO_PATH}") +IGNORE_LIST = ["cookiecutter", ".env", "Shared-Memory-OpenMP"] +# Shared-Memory-OpenMP ignored for now until version +# and container are fixed in the manifest +LOCAL_MANIFESTS = [ + x for x in LOCAL_MANIFESTS if not any(ig in str(x) for ig in IGNORE_LIST) +] + + +@app.command() +def main( + all_: bool = typer.Option( + False, + "--all", + "-a", + help="Convert all manifests in the repository.", + ), + name: str = typer.Option( + None, + "--name", + "-n", + help="Name of the plugin to convert.", + ), +) -> None: + """Convert WIPP manifests to ICT.""" + problems = {} + converted = 0 + if not all_ and name is None: + logger.error("Please provide a name if not converting all manifests.") + raise typer.Abort + if name is not None: + if all_: + logger.warning("Ignoring --all flag since a name was provided.") + logger.info(f"name: {name}") + all_ = False + logger.info(f"all: {all_}") + if all_: + n = len(LOCAL_MANIFESTS) + for manifest in tqdm(LOCAL_MANIFESTS): + try: + ict_ = ICT.from_wipp(manifest) + ict_name = ( + ict_.name.split("/")[-1].lower() + ".cwl" # pylint: disable=E1101 + ) + ict_.save_clt(manifest.with_name(ict_name)) + + converted += 1 + + except BaseException as e: + problems[Path(manifest).parts[4:-1]] = str(e) + if name is not None: + n = 1 + for manifest in [x for x in LOCAL_MANIFESTS if name in str(x)]: + try: + ict_ = ICT.from_wipp(manifest) + ict_name = ( + ict_.name.split("/")[-1].lower() + ".cwl" # pylint: disable=E1101 + ) + ict_.save_clt(manifest.with_name(ict_name)) + converted += 1 + + except BaseException as e: + problems[Path(manifest).parts[4:-1]] = str(e) + + logger.info(f"Converted {converted}/{n} plugins") + if len(problems) > 0: + logger.error(f"Problems: {problems}") + logger.info(f"There were {len(problems)} problems in {n} manifests.") + + +if __name__ == "__main__": + app() diff --git a/to_ict.py b/to_ict.py new file mode 100644 index 0000000..fcb858d --- /dev/null +++ b/to_ict.py @@ -0,0 +1,99 @@ +# ruff: noqa +"""Script to convert all WIPP manifests to ICT.""" + +# pylint: disable=W0718, W1203 +import logging +from pathlib import Path + +import typer +from ict import ICT, validate +from tqdm import tqdm + +app = typer.Typer(help="Convert WIPP manifests to ICT.") +ict_logger = logging.getLogger("ict") +fhandler = logging.FileHandler("ict_conversion.log") +fformat = logging.Formatter( + "%(asctime)s - %(levelname)s - %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p" +) +fhandler.setFormatter(fformat) +fhandler.setLevel("INFO") +ict_logger.setLevel("INFO") +ict_logger.addHandler(fhandler) +ict_logger.setLevel(logging.INFO) +logging.basicConfig( + 
level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", +) +logger = logging.getLogger("wipp_to_ict") +logger.addHandler(fhandler) + +REPO_PATH = Path(__file__).parent +LOCAL_MANIFESTS = list(REPO_PATH.rglob("*plugin.json")) +logger.info(f"Found {len(LOCAL_MANIFESTS)} manifests in {REPO_PATH}") +IGNORE_LIST = ["cookiecutter", ".env", "Shared-Memory-OpenMP"] +# Shared-Memory-OpenMP ignored for now until version +# and container are fixed in the manifest +LOCAL_MANIFESTS = [ + x for x in LOCAL_MANIFESTS if not any(ig in str(x) for ig in IGNORE_LIST) +] + + +@app.command() +def main( + all_: bool = typer.Option( + False, + "--all", + "-a", + help="Convert all manifests in the repository.", + ), + name: str = typer.Option( + None, + "--name", + "-n", + help="Name of the plugin to convert.", + ), +) -> None: + """Convert WIPP manifests to ICT.""" + problems = {} + converted = 0 + if not all_ and name is None: + logger.error("Please provide a name if not converting all manifests.") + raise typer.Abort + if name is not None: + if all_: + logger.warning("Ignoring --all flag since a name was provided.") + logger.info(f"name: {name}") + all_ = False + logger.info(f"all: {all_}") + if all_: + n = len(LOCAL_MANIFESTS) + for manifest in tqdm(LOCAL_MANIFESTS): + try: + ict_ = ICT.from_wipp(manifest) + yaml_path = ict_.save_yaml(manifest.with_name("ict.yaml")) + validate(yaml_path) + converted += 1 + + except BaseException as e: + problems[Path(manifest).parts[4:-1]] = str(e) + if name is not None: + n = 1 + for manifest in [x for x in LOCAL_MANIFESTS if name in str(x)]: + try: + ict_ = ICT.from_wipp(manifest) + yaml_path = ict_.save_yaml(manifest.with_name("ict.yaml")) + validate(yaml_path) + converted += 1 + + except BaseException as e: + problems[Path(manifest).parts[4:-1]] = str(e) + + logger.info(f"Converted {converted}/{n} plugins") + if len(problems) > 0: + logger.error(f"Problems: {problems}") + logger.info(f"There were {len(problems)} problems in {n} manifests.") + + +if __name__ == "__main__": + app() diff --git a/transforms/polus-csv-merger-plugin/README.md b/transforms/polus-csv-merger-plugin/README.md new file mode 100644 index 0000000..704f340 --- /dev/null +++ b/transforms/polus-csv-merger-plugin/README.md @@ -0,0 +1,35 @@ +# CSV Row Merger + +This WIPP plugin merges all csv files in a csv collection into one or more csv files using either row or column merging. + +**If row merging**, csv files are assumed to have headers (column titles) in the first row. If headers are not the same between all files, csv files that don't have a specific column header will have the column filled with 'NaN' values. A column titled `file` is created in the output file, and this contains the name of the original input csv file associated with the row of data. **This plugin creates a csvCollection with a single csv file.** + +**If column merging**, it is assumed that all files have a column titled `file` that is used to merge columns across csv files. If some files have a `file` column value that does not match another csv file, then a new row is generated with the specified value in `file` and missing column values are filled with `NaN` values. **This plugin creates a csvCollection with a single csv file.** + +**When column merging, if sameRows==true**, then no `file` column needs to be present. All files with the same number of columns will be merged into one csv file. 
**This plugin creates a csvCollection with as many csv files as there are unique numbers of rows in the csv collection.** + +If `stripExtension` is set to true, then the `.csv` file extension is removed from the file name in the `file` column. + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes two input argument and one output argument: + +| Name | Description | I/O | Type | +|--------------------|------------------------------------------------------------|--------|---------------| +| `--inpDir` | Input image collection to be processed by this plugin | Input | collection | +| `--stripExtension` | Should csv be removed from the filename in the output file | Input | boolean | +| `--dim` | Perform `rows` or `columns` merger | Input | string | +| `--sameRows` | Only merge csv files with the same number of rows? | Input | boolean | +| `--outDir` | Output csv file | Output | csvCollection | + diff --git a/transforms/polus-csv-merger-plugin/VERSION b/transforms/polus-csv-merger-plugin/VERSION new file mode 100644 index 0000000..60a2d3e --- /dev/null +++ b/transforms/polus-csv-merger-plugin/VERSION @@ -0,0 +1 @@ +0.4.0 \ No newline at end of file diff --git a/transforms/polus-csv-merger-plugin/build-docker.sh b/transforms/polus-csv-merger-plugin/build-docker.sh new file mode 100755 index 0000000..758b23c --- /dev/null +++ b/transforms/polus-csv-merger-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$( 0) { + for (i in 1:length(excludes)) { + if(!excludes[i] %in% colnames(dataset)) { + logwarn('column to exclude from %s is not found',file_name) + } + } + datasub <-dataset[ , !(names(dataset) %in% excludes)] + } + else if(length(excludes) == 0) { + datasub <-dataset + } + # Remove columns with all values as zero + datasub <- datasub[colSums(datasub) > 0] + + #Check whether predict column is present in dataframe + if(!(predictcolumn %in% colnames(datasub))) { + logwarn('predict column name is not found in %s',file_name) + next + } + + #Get column names without predict variable + drop_dep <- datasub[ , !(names(datasub) %in% predictcolumn)] + resp_var <- colnames(drop_dep) + + #Number of cores + num_of_cores = detectCores() + loginfo('Cores = %s', num_of_cores) + + #Chunk Size + chunk <- floor((nrow(datasub)/ncol(datasub))*num_of_cores) + + #Function to determine chunks + make.data<-function(formula,data,chunksize,...){ + n<-nrow(data) + cursor<-0 + datafun<-function(reset=FALSE){ + if (reset){ + cursor<<-0 + return(NULL) + } + if (cursor>=n) + return(NULL) + start<-cursor+1 + cursor<<-cursor+min(chunksize, n-cursor) + data[start:cursor,] + } + } + + #Convert to ffdf object + datasub_ff = as.ffdf(datasub) + + #Chunk data + chunk_data <-make.data(formula(paste(predictcolumn,paste(resp_var,collapse= "+"),sep="~")), datasub_ff, chunksize=chunk) + + if((modeltype == 'Gaussian') || (modeltype == 'Poisson') || (modeltype == 'Binomial') || (modeltype == 'Quasibinomial') || (modeltype == 'Quasipoisson') || (modeltype == 'Quasi')) { + modeltype <- tolower(modeltype) + } + + if (modeltype == 'NegativeBinomial') { + fit <- glm.nb(as.formula(paste(predictcolumn,1,sep="~")), data = datasub) + mu <- exp(coef(fit)) + 
val_pred<-eval(parse(text=paste('datasub',predictcolumn, sep = "$"))) + theta_val = theta.ml(val_pred, mu,nrow(datasub), limit = 22, eps = .Machine$double.eps^0.25, trace = FALSE) + } + + model_list <- c('gaussian','Gamma', 'binomial', 'poisson', 'quasi', 'quasibinomial', 'quasipoisson' ) + + model_data <- function(pred_var, data_model) { + if((modeltype %in% model_list)) { + reg_model <- bigglm(formula(paste(predictcolumn,paste(pred_var,collapse= "+"),sep="~")), data = data_model, family = eval(parse(text=paste(modeltype,"()", sep = ""))), chunksize = chunk) + } + else if(modeltype == 'NegativeBinomial') { + reg_model <- bigglm(formula(paste(predictcolumn,paste(pred_var,collapse= "+"),sep="~")), data = data_model, family = negative.binomial(theta= theta_val), chunksize=chunk) + } + else if(modeltype == 'Multinomial') { + reg_model <- multinom(formula(paste(paste("as.factor(",predictcolumn,")"),paste(pred_var,collapse= "+"),sep="~")), data = data_model, maxit=10, MaxNWts = 10000) + } + return(reg_model) + } + + #Model data based on the options selected + #Get only main effects of the variables + if (glmmethod == 'PrimaryFactors') { + if (modeltype != 'Multinomial') { + test_glm<- model_data(resp_var,chunk_data) + } + else if (modeltype == 'Multinomial') { + test_glm<- model_data(resp_var,datasub_ff) + } + } + #Get interaction values + else if (glmmethod == 'Interaction') { + datasub_pred <- datasub[ , !(names(datasub) %in% predictcolumn)] + #Get correlation between variables + tmp <- cor(datasub_pred) + tmp[upper.tri(tmp)] <- 0 + diag(tmp) <- 0 + + #Remove variables with no interaction + data_no_int <- which(tmp >= 0.1 | tmp < -0.1, arr.ind = TRUE) + data_frame<-data.frame(row = rownames(data_no_int), col = colnames(tmp)[data_no_int[, "col"]], + value = tmp[tmp >= 0.1 | tmp < -0.1]) + colnames(data_frame)<- c("variable1","variable2","coef") + + #Interaction variables + data_frame$variableint <- paste(data_frame$variable1, data_frame$variable2, sep="*") + data_list <- as.character(data_frame$variableint) + if (modeltype != 'Multinomial') { + test_glm<- model_data(data_list,chunk_data) + } + else if (modeltype == 'Multinomial') { + test_glm<- model_data(data_list, datasub_ff) + } + } + #Get second order polynomial values + else if (glmmethod == 'SecondOrder') { + var_resp <- paste('poly(',resp_var,',2)') + if (modeltype != 'Multinomial') { + test_glm<- model_data(var_resp,chunk_data) + } + else if (modeltype == 'Multinomial') { + test_glm<- model_data(var_resp,datasub_ff) + } + } + + #Set output directory + setwd(csvfile) + file_save <- paste0(file_name,".csv") + + #Convert summary of the analysis to a dataframe + tidy_summary <- tidy(test_glm) + + #Reorder the columns + tidy_final <- tidy_summary[c("term", "p.value", "estimate","std.error")] + colnames(tidy_final) <- c("Factors","P-Value","Estimate","Std.Error") + + #Write the dataframe to csv file + write.csv(tidy_final, file_save) + } +} \ No newline at end of file diff --git a/transforms/tabular-merger-tool/.bumpversion.cfg b/transforms/tabular-merger-tool/.bumpversion.cfg index def8097..90eed98 100644 --- a/transforms/tabular-merger-tool/.bumpversion.cfg +++ b/transforms/tabular-merger-tool/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.3-dev2 +current_version = 0.1.3-dev3 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
@@ -24,4 +24,6 @@ replace = version = "{new_version}" [bumpversion:file:VERSION] -[bumpversion:file:src/polus/images/transforms/tabular/tabular_merger/__init__.py] +[bumpversion:file:README.md] + +[bumpversion:file:src/polus/tabular/transforms/tabular_merger/__init__.py] diff --git a/transforms/tabular-merger-tool/Dockerfile b/transforms/tabular-merger-tool/Dockerfile index a851ae2..07c61b6 100755 --- a/transforms/tabular-merger-tool/Dockerfile +++ b/transforms/tabular-merger-tool/Dockerfile @@ -1,4 +1,4 @@ -FROM polusai/bfio:2.1.9 +FROM polusai/bfio:2.3.6 # environment variables defined in polusai/bfio ENV EXEC_DIR="/opt/executables" @@ -12,10 +12,9 @@ WORKDIR ${EXEC_DIR} COPY pyproject.toml ${EXEC_DIR} COPY VERSION ${EXEC_DIR} COPY README.md ${EXEC_DIR} -RUN pip3 install --index-url https://test.pypi.org/simple/ filepattern==2.2.7 COPY src ${EXEC_DIR}/src RUN pip3 install ${EXEC_DIR} --no-cache-dir -ENTRYPOINT ["python3", "-m", "polus.images.transforms.tabular.tabular_merger"] +ENTRYPOINT ["python3", "-m", "polus.tabular.transforms.tabular_merger"] CMD ["--help"] diff --git a/transforms/tabular-merger-tool/README.md b/transforms/tabular-merger-tool/README.md index 3d42224..bd034f9 100644 --- a/transforms/tabular-merger-tool/README.md +++ b/transforms/tabular-merger-tool/README.md @@ -1,4 +1,4 @@ -# Tabular Merger (v0.1.0) +# Tabular Merger (v0.1.3-dev3) This WIPP plugin merges all tabular files with vaex supported file formats into a combined file using either row or column merging. diff --git a/transforms/tabular-merger-tool/VERSION b/transforms/tabular-merger-tool/VERSION index df5163a..e28037e 100644 --- a/transforms/tabular-merger-tool/VERSION +++ b/transforms/tabular-merger-tool/VERSION @@ -1 +1 @@ -0.1.3-dev2 +0.1.3-dev3 diff --git a/transforms/tabular-merger-tool/plugin.json b/transforms/tabular-merger-tool/plugin.json index 60c401c..a665a9a 100644 --- a/transforms/tabular-merger-tool/plugin.json +++ b/transforms/tabular-merger-tool/plugin.json @@ -1,18 +1,18 @@ { "name": "Tabular Merger", - "version": "0.1.3-dev2", + "version": "0.1.3-dev3", "title": "Tabular Merger", "description": "Merge vaex supported tabular file format into a single merged file.", "author": "Nicholas Schaub (nick.schaub@nih.gov), Hamdah Shafqat Abbasi (hamdahshafqat.abbasi@nih.gov)", "institution": "National Center for Advancing Translational Sciences, National Institutes of Health", - "repository": "https://github.com/PolusAI/polus-plugins", + "repository": "https://github.com/PolusAI/tabular-tools", "website": "https://ncats.nih.gov/preclinical/core/informatics", "citation": "", - "containerId": "polusai/tabular-merger-tool:0.1.3-dev2", + "containerId": "polusai/tabular-merger-tool:0.1.3-dev3", "baseCommand": [ "python3", "-m", - "polus.images.transforms.tabular.tabular_merger" + "polus.tabular.transforms.tabular_merger" ], "inputs": [ { @@ -111,4 +111,4 @@ "description": "Column name use to merge files" } ] -} +} \ No newline at end of file diff --git a/transforms/tabular-merger-tool/pyproject.toml b/transforms/tabular-merger-tool/pyproject.toml index 34beb73..9d3ef0f 100644 --- a/transforms/tabular-merger-tool/pyproject.toml +++ b/transforms/tabular-merger-tool/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] -name = "polus-images-transforms-tabular-tabular-merger" -version = "0.1.3-dev2" +name = "polus-tabular-transforms-tabular-merger" +version = "0.1.3-dev3" description = "Merge vaex supported tabular file format into a single merged file." 
authors = [ "Nick Schaub ", @@ -11,14 +11,12 @@ packages = [{include = "polus", from = "src"}] [tool.poetry.dependencies] python = ">=3.9" -filepattern = "^2.0.0" typer = "^0.7.0" blake3 = "^0.3.3" llvmlite = "^0.39.1" -fastapi = "^0.92.0" -astropy = "5.2.1" vaex = "^4.17.0" tqdm = "^4.65.0" +filepattern = "^2.0.5" [tool.poetry.group.dev.dependencies] diff --git a/transforms/tabular-merger-tool/src/polus/images/transforms/tabular/tabular_merger/__init__.py b/transforms/tabular-merger-tool/src/polus/tabular/transforms/tabular_merger/__init__.py similarity index 65% rename from transforms/tabular-merger-tool/src/polus/images/transforms/tabular/tabular_merger/__init__.py rename to transforms/tabular-merger-tool/src/polus/tabular/transforms/tabular_merger/__init__.py index 0a87e66..77291e3 100644 --- a/transforms/tabular-merger-tool/src/polus/images/transforms/tabular/tabular_merger/__init__.py +++ b/transforms/tabular-merger-tool/src/polus/tabular/transforms/tabular_merger/__init__.py @@ -1,4 +1,4 @@ """Tabular Merger.""" -__version__ = "0.1.3-dev2" +__version__ = "0.1.3-dev3" from . import tabular_merger diff --git a/transforms/tabular-merger-tool/src/polus/images/transforms/tabular/tabular_merger/__main__.py b/transforms/tabular-merger-tool/src/polus/tabular/transforms/tabular_merger/__main__.py similarity index 95% rename from transforms/tabular-merger-tool/src/polus/images/transforms/tabular/tabular_merger/__main__.py rename to transforms/tabular-merger-tool/src/polus/tabular/transforms/tabular_merger/__main__.py index 648ad41..55bdbdf 100644 --- a/transforms/tabular-merger-tool/src/polus/images/transforms/tabular/tabular_merger/__main__.py +++ b/transforms/tabular-merger-tool/src/polus/tabular/transforms/tabular_merger/__main__.py @@ -9,7 +9,7 @@ import filepattern as fp import typer -from polus.images.transforms.tabular.tabular_merger import tabular_merger as tm +from polus.tabular.transforms.tabular_merger import tabular_merger as tm app = typer.Typer() @@ -18,7 +18,7 @@ format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", datefmt="%d-%b-%y %H:%M:%S", ) -logger = logging.getLogger("polus.images.transforms.tabular_merger") +logger = logging.getLogger("polus.tabular.transforms.tabular_merger") logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") diff --git a/transforms/tabular-merger-tool/src/polus/images/transforms/tabular/tabular_merger/tabular_merger.py b/transforms/tabular-merger-tool/src/polus/tabular/transforms/tabular_merger/tabular_merger.py similarity index 100% rename from transforms/tabular-merger-tool/src/polus/images/transforms/tabular/tabular_merger/tabular_merger.py rename to transforms/tabular-merger-tool/src/polus/tabular/transforms/tabular_merger/tabular_merger.py diff --git a/transforms/tabular-merger-tool/tests/test_main.py b/transforms/tabular-merger-tool/tests/test_main.py index 90c9f58..bf05f04 100644 --- a/transforms/tabular-merger-tool/tests/test_main.py +++ b/transforms/tabular-merger-tool/tests/test_main.py @@ -8,7 +8,7 @@ import pandas as pd import pytest import vaex -from polus.images.transforms.tabular.tabular_merger import tabular_merger as tm +from polus.tabular.transforms.tabular_merger import tabular_merger as tm class Generatedata: diff --git a/transforms/tabular-thresholding-tool/.bumpversion.cfg b/transforms/tabular-thresholding-tool/.bumpversion.cfg new file mode 100644 index 0000000..dffe7ad --- /dev/null +++ b/transforms/tabular-thresholding-tool/.bumpversion.cfg @@ -0,0 
+1,33 @@ +[bumpversion] +current_version = 0.1.6-dev1 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:README.md] + +[bumpversion:file:ict.yaml] + +[bumpversion:file:tabular-thresholding-plugin.cwl] + +[bumpversion:file:VERSION] + +[bumpversion:file:src/polus/tabular/transforms/tabular_thresholding/__init__.py] diff --git a/transforms/tabular-thresholding-tool/.gitignore b/transforms/tabular-thresholding-tool/.gitignore new file mode 100644 index 0000000..e891280 --- /dev/null +++ b/transforms/tabular-thresholding-tool/.gitignore @@ -0,0 +1,175 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +poetry.lock +../../poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# vscode +.vscode + +# test data directory +data +# yaml file +.pre-commit-config.yaml + +# hidden files +.DS_Store +.ds_store +# flake8 +.flake8 diff --git a/transforms/tabular-thresholding-tool/Dockerfile b/transforms/tabular-thresholding-tool/Dockerfile new file mode 100644 index 0000000..e17cf7e --- /dev/null +++ b/transforms/tabular-thresholding-tool/Dockerfile @@ -0,0 +1,20 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +RUN pip3 install --index-url https://test.pypi.org/simple/ filepattern==2.2.7 +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +ENTRYPOINT ["python3", "-m", "polus.tabular.transforms.tabular_thresholding"] +CMD ["--help"] diff --git a/transforms/tabular-thresholding-tool/README.md b/transforms/tabular-thresholding-tool/README.md new file mode 100644 index 0000000..6f7f169 --- /dev/null +++ b/transforms/tabular-thresholding-tool/README.md @@ -0,0 +1,47 @@ +# Tabular Thresholding Plugin (v0.1.6-dev1) +This plugin uses three [threshold methods](https://github.com/nishaq503/thresholding.git) to compute threshold values on a user-defined variable and then determines if each label (ROI) is above or below the calculated threshold value. A new feature column will be computed for selected threshold method with the values in binary format (0, 1) \ +*0* `negative or below threshold`\ +*1* `positive or above threshold` + +## Threshold methods + +### *1-* False Positive Rate +It estimates mean and standard deviation of `negControl` values based on the assumption that it follows a single guassian distribution and computes threshold such that the area to the right is equal to a user-defined `falsePositiverate`. Values must range between 0 and 1 + +### *2-* OTSU +It computes threshold by using `negControl` and `posControl` values to minimize the weighted variance of these two classes. `numBins` are number of bins to compute histogram of `negControl` and `posControl` values + +### *3-* MEAN+Sigma +It computes threshold by calculating mean and `n` number of standard deviations of `negControl` values. + +Contact [Hamdah Shafqat Abbasi](mailto: hamdah.abbasi@axleinfo.com) for more information. + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). 
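For a concrete sense of the false-positive-rate method described above, here is a minimal sketch that mirrors the logic of the `thresholding/custom_fpr.py` module added later in this diff; the negative-control values are synthetic and purely illustrative.

```python
"""Illustrative sketch of the FPR threshold (mirrors thresholding/custom_fpr.py)."""
import statistics

import numpy as np

# Synthetic negative-control values, assumed to follow a single gaussian distribution.
rng = np.random.default_rng(seed=0)
neg_control_values = rng.normal(loc=100.0, scale=15.0, size=10_000)

false_positive_rate = 0.01  # must lie strictly between 0 and 1

# Estimate mean and standard deviation, then place the threshold so that the
# area to its right under the fitted normal equals the false-positive rate.
mu = float(np.mean(neg_control_values))
sigma = float(np.std(neg_control_values))
threshold = statistics.NormalDist(mu, sigma).inv_cdf(1 - false_positive_rate)

print(f"FPR threshold: {threshold:.2f}")  # roughly mu + 2.33 * sigma for a 1% FPR
```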
+ +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the +contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes 10 input arguments and one output argument: + +| Name | Description | I/O | Type | +|-------------------------|-----------------------------------------------------|--------|---------------| +| `--inpDir` | Input directory containing tabular data CSVs | Input | genericData | +| `--filePattern` | Pattern to parse tabular files | Input | string | +| `--negControl` | FeatureName describing non treated wells/ROI | Input | string | +| `--posControl` | FeatureName describing treated wells/ROI | Input | string | +| `--varName` | FeatureName for thresholding | Input | string | +| `--thresholdType` | See above in README | Input | enum | +| `--falsePositiverate` | Area to the right of the threshold | Input | float | +| `--numBins` | Number of bins for histogram | Input | number | +| `--n` | Number of standard deviation | Input | number | +| `--outFormat` | Output file format | Input | enum | +| `--outDir` | Output collection | Output | genericData | diff --git a/transforms/tabular-thresholding-tool/VERSION b/transforms/tabular-thresholding-tool/VERSION new file mode 100644 index 0000000..aeb7350 --- /dev/null +++ b/transforms/tabular-thresholding-tool/VERSION @@ -0,0 +1 @@ +0.1.6-dev1 diff --git a/transforms/tabular-thresholding-tool/build-docker.sh b/transforms/tabular-thresholding-tool/build-docker.sh new file mode 100644 index 0000000..056da7e --- /dev/null +++ b/transforms/tabular-thresholding-tool/build-docker.sh @@ -0,0 +1,2 @@ +version=$(", +"Najib Ishaq " +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9" +filepattern = "^2.0.4" +typer = "^0.7.0" +tqdm = "^4.64.1" +vaex = "^4.17.0" + + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.1.0" +black = "^23.1.0" +flake8 = "^6.0.0" +mypy = "^1.0.1" +pytest = "^7.2.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/transforms/tabular-thresholding-tool/run-plugin.sh b/transforms/tabular-thresholding-tool/run-plugin.sh new file mode 100755 index 0000000..55e02a5 --- /dev/null +++ b/transforms/tabular-thresholding-tool/run-plugin.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# version=$( None: + """Calculate binary thresholds for tabular data.""" + starttime = time.time() + logger.info(f"inpDir = {inp_dir}") + logger.info(f"outDir = {out_dir}") + logger.info(f"filePattern = {file_pattern}") + logger.info(f"negControl = {neg_control}") + logger.info(f"posControl = {pos_control}") + logger.info(f"varName = {var_name}") + logger.info(f"thresholdType = {threshold_type}") + logger.info(f"falsePositiverate = {false_positive_rate}") + logger.info(f"numBins = {num_bins}") + logger.info(f"n = {n}") + logger.info(f"outFormat = {out_format}") + + inp_dir = inp_dir.resolve() + out_dir = out_dir.resolve() + + assert inp_dir.exists(), f"{inp_dir} doesnot exists!! Please check input path again" + assert ( + out_dir.exists() + ), f"{out_dir} doesnot exists!! 
Please check output path again" + # By default it ingests all input files if not file_pattern is defined + file_pattern = ".*" + file_pattern + + fps = fp.FilePattern(inp_dir, file_pattern) + + if preview: + with open(pathlib.Path(out_dir, "preview.json"), "w") as jfile: + out_json: dict[Union[str, List], Any] = { + "filepattern": file_pattern, + "outDir": [], + } + for file in fps: + out_name = str(file[1][0].name.split(".")[0]) + "_binary" + out_format + thr_json = str(file[1][0].name.split(".")[0]) + "_thresholds.json" + out_json["outDir"].append(out_name) + out_json["outDir"].append(thr_json) + + json.dump(out_json, jfile, indent=2) + + num_workers = max(multiprocessing.cpu_count() // 2, 2) + + flist = [f[1][0] for f in fps] + logger.info(f"Number of tabular files detected: {len(flist)}, filenames: {flist}") + assert len(flist) != 0, f"No tabular file is detected: {flist}" + + with multiprocessing.Pool(processes=num_workers) as executor: + executor.map( + partial( + tt.thresholding_func, + neg_control, + pos_control, + var_name, + threshold_type, + false_positive_rate, + num_bins, + n, + out_format, + out_dir, + ), + flist, + ) + executor.close() + executor.join() + + # Deleting intermediate files from input directory + for f in inp_dir.iterdir(): + if f.is_file() and file_pattern != ".*.hdf5": + if f.suffix in [".hdf5", ".yaml"]: + os.remove(f) + else: + if ".hdf5.hdf5" in f.name or f.suffix == ".yaml": + os.remove(f) + + endtime = round((time.time() - starttime) / 60, 3) + logger.info(f"Time taken to process binary threhold CSVs: {endtime} minutes!!!") + return + + +if __name__ == "__main__": + app() diff --git a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/tabular_thresholding.py b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/tabular_thresholding.py new file mode 100644 index 0000000..0033cf2 --- /dev/null +++ b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/tabular_thresholding.py @@ -0,0 +1,169 @@ +"""Tabular Thresholding.""" +import enum +import json +import logging +import os +import pathlib +import warnings +from typing import Dict, Optional, Union + +import numpy as np +import vaex + +from .thresholding import custom_fpr +from.thresholding import n_sigma +from .thresholding import otsu + +logger = logging.getLogger(__name__) + +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") + + +class Extensions(str, enum.Enum): + """File format of an output file.""" + + CSV = ".csv" + ARROW = ".arrow" + PARQUET = ".parquet" + HDF = ".hdf5" + FEATHER = ".feather" + Default = POLUS_TAB_EXT + + +class Methods(str, enum.Enum): + """Threshold methods.""" + + OTSU = "otsu" + NSIGMA = "n_sigma" + FPR = "fpr" + ALL = "all" + Default = "all" + + +def thresholding_func( + neg_control: str, + pos_control: str, + var_name: str, + threshold_type: Methods, + false_positive_rate: Optional[float], + num_bins: Optional[int], + n: Optional[int], + out_format: Extensions, + out_dir: pathlib.Path, + file: pathlib.Path, +) -> None: + """Compute variable threshold using negative or negative and positive control data. + + Computes the variable value of each ROI if above or below threshold. The control data used for computing threshold depends on the type of thresholding methods + https://github.com/nishaq503/thresholding.git. + Args: + file: Filename. + neg_control: Column name containing information of non treated wells. 
+ pos_control: Column name containing information of wells with the known treatment. + var_name: Column name for computing thresholds. + threshold_type: Name of threshold method. + out_format: Output file extension. + false_positive_rate: Tuning parameter. + num_bins: Number of bins. + n: Number of standard deviations away from the mean value. + + """ + chunk_size = 100_000 + if file.suffix == ".csv": + df = vaex.from_csv(file, convert=True, chunk_size=chunk_size) + else: + df = vaex.open(file, convert=True, progress=True) + + assert any( + item in [var_name, neg_control, pos_control] for item in list(df.columns) + ), f"None of the {var_name}, {neg_control}, {pos_control} columns were found in the tabular data file. Please check the column names again!" + + assert df.shape != ( + 0, + 0, + ), f"File {file} was not loaded properly! Please check the input files again" + + if pos_control is None: + logger.info( + "Otsu threshold will not be computed as it requires information of both neg_control & pos_control" + ) + + threshold_dict: Dict[str, Union[float, str]] = {} + plate = file.stem + threshold_dict["plate"] = plate + + if df[neg_control].unique() != [0.0, 1.0]: + warnings.warn("controls are missing. NaN values are computed for thresholds") + nan_value = np.nan * np.arange(0, len(df[neg_control].values), 1) + threshold_dict["fpr"] = np.nan + threshold_dict["otsu"] = np.nan + threshold_dict["nsigma"] = np.nan + df["fpr"] = nan_value + df["otsu"] = nan_value + df["nsigma"] = nan_value + + else: + pos_controls = df[df[pos_control] == 1][var_name].values + neg_controls = df[df[neg_control] == 1][var_name].values + + if threshold_type == "fpr": + threshold = custom_fpr.find_threshold( + neg_controls, false_positive_rate=false_positive_rate + ) + threshold_dict[threshold_type] = threshold + df[threshold_type] = df.func.where(df[var_name] <= threshold, 0, 1) + elif threshold_type == "otsu": + combine_array = np.append(neg_controls, pos_controls, axis=0) + threshold = otsu.find_threshold( + combine_array, num_bins=num_bins, normalize_histogram=False + ) + threshold_dict[threshold_type] = threshold + df[threshold_type] = df.func.where(df[var_name] <= threshold, 0, 1) + elif threshold_type == "nsigma": + threshold = n_sigma.find_threshold(neg_controls, n=n) + threshold_dict[threshold_type] = threshold + df[threshold_type] = df.func.where(df[var_name] <= threshold, 0, 1) + elif threshold_type == "all": + fpr_thr = custom_fpr.find_threshold( + neg_controls, false_positive_rate=false_positive_rate + ) + combine_array = np.append(neg_controls, pos_controls, axis=0) + + if len(pos_controls) == 0: + warnings.warn( + "controls are missing.
NaN value are computed for otsu thresholds" + ) + threshold_dict["otsu"] = np.nan + df["otsu"] = np.nan * np.arange(0, len(df[var_name].values), 1) + else: + otsu_thr = otsu.find_threshold( + combine_array, num_bins=num_bins, normalize_histogram=False + ) + threshold_dict["otsu"] = otsu_thr + df["otsu"] = df.func.where(df[var_name] <= otsu_thr, 0, 1) + + nsigma_thr = n_sigma.find_threshold(neg_controls, n=n) + threshold_dict["fpr"] = fpr_thr + threshold_dict["nsigma"] = nsigma_thr + df["fpr"] = df.func.where(df[var_name] <= fpr_thr, 0, 1) + df["nsigma"] = df.func.where(df[var_name] <= nsigma_thr, 0, 1) + + outjson = pathlib.Path(out_dir).joinpath(f"{plate}_thresholds.json") + with open(outjson, "w") as outfile: + json.dump(threshold_dict, outfile) + logger.info(f"Saving Thresholds in JSON fileformat {outjson}") + + if f"{out_format}" in [".feather", ".arrow"]: + outname = pathlib.Path(out_dir, f"{plate}_binary{out_format}") + df.export_feather(outname) + logger.info(f"Saving f'{plate}_binary{out_format}") + elif f"{out_format}" == ".csv": + outname = pathlib.Path(out_dir).joinpath(f"{plate}_binary{out_format}") + df.export_csv(path=outname, chunk_size=chunk_size) + else: + outname = pathlib.Path(out_dir).joinpath(f"{plate}_binary{out_format}") + df.export(outname, progress=True) + logger.info(f"Saving f'{plate}_binary{out_format}") + + return diff --git a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/__init__.py b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/__init__.py new file mode 100644 index 0000000..5e67d64 --- /dev/null +++ b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/__init__.py @@ -0,0 +1,6 @@ +"""Tabular Thresholding.""" +__version__ = "0.1.3" + +from . import custom_fpr +from . import n_sigma +from . import otsu diff --git a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/custom_fpr.py b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/custom_fpr.py new file mode 100644 index 0000000..4a0fd6d --- /dev/null +++ b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/custom_fpr.py @@ -0,0 +1,36 @@ +"""Tabular Thresholding.""" +import statistics + +import numpy + + +def find_threshold( + values: numpy.ndarray, + false_positive_rate: float, +) -> float: + """Compute a threshold value using a user-specified false positive rate. + + We assume that the `negative_values` follow a single gaussian distribution. + We estimate the mean and standard deviation of this distribution and + compute a threshold such that the area to the right of the threshold is + equal to the given `false_positive_rate`. + + Args: + values: drawn from a single gaussian distribution. + false_positive_rate: A user-defined tuning parameter. + + Returns: + The computed threshold value. + """ + if not (0 < false_positive_rate < 1): + raise ValueError( + f"`false_positive_rate` mut be in the range (0, 1). Got {false_positive_rate:.2e} instead." 
+ ) + + mu = float(numpy.mean(values)) + sigma = float(numpy.std(values)) + + distribution = statistics.NormalDist(mu, sigma) + threshold = distribution.inv_cdf(1 - false_positive_rate) + + return threshold diff --git a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/n_sigma.py b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/n_sigma.py new file mode 100644 index 0000000..6c72279 --- /dev/null +++ b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/n_sigma.py @@ -0,0 +1,18 @@ +"""Tabular Thresholding.""" +import numpy + + +def find_threshold(values: numpy.ndarray, n: int = 4) -> float: + """Compute the threshold as `mu + n * sigma`. + + Args: + values: 1d array of values over which tom compute the threshold. + n: number of standard deviations to go away from the mean. + + Returns: + The threshold value. + """ + mu = numpy.mean(values) + sigma = numpy.std(values) + threshold = mu + n * sigma + return float(threshold) diff --git a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/otsu.py b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/otsu.py new file mode 100644 index 0000000..8fc2281 --- /dev/null +++ b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/thresholding/otsu.py @@ -0,0 +1,45 @@ +"""Tabular Thresholding.""" +import numpy + + +def find_threshold( + values: numpy.ndarray, + num_bins: int = 256, + normalize_histogram: bool = False, +) -> float: + """Compute the otsu threshold for the given values. + + Args: + values: 1d array of values + num_bins: to use for a histogram + normalize_histogram: Whether to normalize the histogram by max + frequency. + Returns: + The calculated threshold value. + """ + # Get the image histogram + hist, bin_edges = numpy.histogram(values, bins=num_bins) + + # Get normalized histogram if it is required + if normalize_histogram: + hist = numpy.divide(hist.ravel(), hist.max(initial=0)) + + # Calculate centers of bins + bin_mids = (bin_edges[:-1] + bin_edges[1:]) / 2.0 + + # Iterate over all thresholds (indices) and get the probabilities w1(t), w2(t) + weight1 = numpy.cumsum(hist) + weight2 = numpy.cumsum(hist[::-1])[::-1] + + # Get the class means mu0(t) + mean1 = numpy.cumsum(hist * bin_mids) / weight1 + # Get the class means mu1(t) + mean2 = (numpy.cumsum((hist * bin_mids)[::-1]) / weight2[::-1])[::-1] + + inter_class_variance = weight1[:-1] * weight2[1:] * (mean1[:-1] - mean2[1:]) ** 2 + + # Maximize the inter_class_variance + index_of_max_var = numpy.argmax(inter_class_variance) + + threshold = bin_mids[:-1][index_of_max_var] + return float(threshold) diff --git a/transforms/tabular-thresholding-tool/tabular-thresholding-plugin.cwl b/transforms/tabular-thresholding-tool/tabular-thresholding-plugin.cwl new file mode 100644 index 0000000..fecde01 --- /dev/null +++ b/transforms/tabular-thresholding-tool/tabular-thresholding-plugin.cwl @@ -0,0 +1,60 @@ +class: CommandLineTool +cwlVersion: v1.2 +inputs: + falsePositiverate: + inputBinding: + prefix: --falsePositiverate + type: double? + filePattern: + inputBinding: + prefix: --filePattern + type: string? + inpDir: + inputBinding: + prefix: --inpDir + type: Directory + n: + inputBinding: + prefix: --n + type: double? 
+ negControl: + inputBinding: + prefix: --negControl + type: string + numBins: + inputBinding: + prefix: --numBins + type: double? + outDir: + inputBinding: + prefix: --outDir + type: Directory + outFormat: + inputBinding: + prefix: --outFormat + type: string + posControl: + inputBinding: + prefix: --posControl + type: string? + thresholdType: + inputBinding: + prefix: --thresholdType + type: string + varName: + inputBinding: + prefix: --varName + type: string +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + DockerRequirement: + dockerPull: polusai/tabular-thresholding-tool:0.1.6-dev1 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/transforms/tabular-thresholding-tool/tests/__init__.py b/transforms/tabular-thresholding-tool/tests/__init__.py new file mode 100644 index 0000000..6711b98 --- /dev/null +++ b/transforms/tabular-thresholding-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Testing of Tabular Thresholding.""" diff --git a/transforms/tabular-thresholding-tool/tests/test_main.py b/transforms/tabular-thresholding-tool/tests/test_main.py new file mode 100644 index 0000000..b2a543f --- /dev/null +++ b/transforms/tabular-thresholding-tool/tests/test_main.py @@ -0,0 +1,144 @@ +"""Tabular Thresholding.""" + +import pathlib +import random +import shutil +import string +import tempfile + +import filepattern as fp +import numpy as np +import pandas as pd +import pytest +import vaex +from polus.tabular.transforms.tabular_thresholding import ( + tabular_thresholding as tt, +) + + +class Generatedata: + """Generate tabular data with several different file format.""" + + def __init__(self, file_pattern: str, size: int, outname: str) -> None: + """Define instance attributes.""" + self.dirpath = pathlib.Path.cwd() + self.inp_dir = tempfile.mkdtemp(dir=self.dirpath) + self.out_dir = tempfile.mkdtemp(dir=self.dirpath) + self.file_pattern = file_pattern + self.size = size + self.outname = outname + self.x = self.create_dataframe() + + def get_inp_dir(self) -> pathlib.Path: + """Get input directory.""" + return pathlib.Path(self.inp_dir) + + def get_out_dir(self) -> pathlib.Path: + """Get output directory.""" + return pathlib.Path(self.out_dir) + + def create_dataframe(self) -> pd.core.frame.DataFrame: + """Create Pandas dataframe.""" + diction_1 = { + "A": list(range(self.size)), + "B": [random.choice(string.ascii_letters) for i in range(self.size)], + "C": np.random.randint(low=1, high=100, size=self.size), + "D": np.random.normal(0.0, 1.0, size=self.size), + "MEAN": np.linspace(1.0, 4000.0, self.size), + "neg_control": [random.choice("01") for i in range(self.size)], + "pos_neutral": [random.choice("01") for i in range(self.size)], + } + + df = pd.DataFrame(diction_1) + df["neg_control"] = df["neg_control"].astype(int) + df["pos_neutral"] = df["pos_neutral"].astype(int) + + return df + + def csv_func(self) -> None: + """Convert pandas dataframe to csv file format.""" + self.x.to_csv(pathlib.Path(self.inp_dir, self.outname), index=False) + + def parquet_func(self) -> None: + """Convert pandas dataframe to parquet file format.""" + self.x.to_parquet( + pathlib.Path(self.inp_dir, self.outname), + engine="auto", + compression=None, + ) + + def feather_func(self) -> None: + """Convert pandas dataframe to feather file format.""" + self.x.to_feather(pathlib.Path(self.inp_dir, self.outname)) + + def arrow_func(self) -> None: + """Convert pandas dataframe to Arrow file 
format.""" + self.x.to_feather(pathlib.Path(self.inp_dir, self.outname)) + + def hdf_func(self) -> None: + """Convert pandas dataframe to hdf5 file format.""" + v_df = vaex.from_pandas(self.x, copy_index=False) + v_df.export(pathlib.Path(self.inp_dir, self.outname)) + + def __call__(self) -> None: + """To make a class callable.""" + data_ext = { + ".hdf5": self.hdf_func, + ".csv": self.csv_func, + ".parquet": self.parquet_func, + ".feather": self.feather_func, + ".arrow": self.arrow_func, + } + + return data_ext[self.file_pattern]() + + def clean_directories(self): + """Remove files.""" + for d in self.dirpath.iterdir(): + if d.is_dir() and d.name.startswith("tmp"): + shutil.rmtree(d) + + +EXT = [[".csv", ".feather", ".arrow", ".parquet", ".hdf5"]] + + +@pytest.fixture(params=EXT) +def poly(request): + """To get the parameter of the fixture.""" + return request.param + + +def test_tabular_thresholding(poly): + """Testing of merging of tabular data by rows with equal number of rows.""" + for i in poly: + d = Generatedata(i, outname=f"data_1{i}", size=1000000) + d() + pattern = f".*{i}" + fps = fp.FilePattern(d.get_inp_dir(), pattern) + for file in fps(): + tt.thresholding_func( + neg_control="neg_control", + pos_control="pos_neutral", + var_name="MEAN", + threshold_type="all", + false_positive_rate=0.01, + num_bins=512, + n=4, + out_format=i, + out_dir=d.get_out_dir(), + file=file[1][0], + ) + + assert i in [f.suffix for f in d.get_out_dir().iterdir()] + + df = vaex.open( + pathlib.Path(d.get_out_dir(), file[1][0].stem + "_binary" + i), + ) + threshold_methods = ["fpr", "otsu", "nsigma"] + assert (all(item in list(df.columns) for item in threshold_methods)) is True + assert np.allclose(np.unique(df[threshold_methods]), [0, 1]) is True + assert file[1][0].stem + "_thresholds.json" in [ + f.name for f in d.get_out_dir().iterdir() + ] + + d.clean_directories() diff --git a/utils/filepattern-generator-plugin/Dockerfile b/utils/filepattern-generator-plugin/Dockerfile new file mode 100644 index 0000000..45ce46b --- /dev/null +++ b/utils/filepattern-generator-plugin/Dockerfile @@ -0,0 +1,9 @@ +FROM polusai/bfio:2.1.9 + +ENV EXEC_DIR="/opt/executables" +RUN mkdir -p ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY src ${EXEC_DIR}/ +RUN pip3 install -r ${EXEC_DIR}/requirements.txt --no-cache-dir + +ENTRYPOINT ["python3", "/opt/executables/main.py"] diff --git a/utils/filepattern-generator-plugin/README.md b/utils/filepattern-generator-plugin/README.md new file mode 100644 index 0000000..9fdc742 --- /dev/null +++ b/utils/filepattern-generator-plugin/README.md @@ -0,0 +1,36 @@ +# Filepattern Generator + + +Filepattern Generator plugin creates a json containing a number of new filepatterns, where each filepattern will subset the image data in the directory + +Contact [Nick Schaub , Hamdah Shafqat Abbasi](mailto:nick.schaub@nih.gov, hamdah.abbasi@axleinfo.com) for more information. + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the +contents of `plugin.json` into the pop-up window and submit. 
+ +## Options + +This plugin takes 5 input arguments and +1 output argument: + +| Name | Description | I/O | Type | +|---------------|---------------------------------------------------------------|--------|---------------| +| `--inpDir` | Input image directory | Input | collection | +| `--pattern` | Filepattern to parse image files | Input | string | +| `--chunkSize` | Number of images to generate collective filepattern | Input | number | +| `--groupBy` | Select a parameter to generate filepatterns in specific order | Input | string | +| `--outDir` | Output generic collection | Output | genericData | + + + + diff --git a/utils/filepattern-generator-plugin/VERSION b/utils/filepattern-generator-plugin/VERSION new file mode 100644 index 0000000..7dff5b8 --- /dev/null +++ b/utils/filepattern-generator-plugin/VERSION @@ -0,0 +1 @@ +0.2.1 \ No newline at end of file diff --git a/utils/filepattern-generator-plugin/build-docker.sh b/utils/filepattern-generator-plugin/build-docker.sh new file mode 100755 index 0000000..107f7e0 --- /dev/null +++ b/utils/filepattern-generator-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$( Tuple[str, int]: + + """This function produces the best combination of variables for a given chunksize + Args: + inpDir (Path): Path to Image files + pattern (str, optional): Regex to parse image files + groupBy (str, optional): Specify variable to group image filenames + chunk_size (str, optional): Number of images to generate collective filepattern + Returns: + variables for grouping image filenames, count + """ + + fp = filepattern.FilePattern(inpDir, pattern) + + # Get the number of unique values for each variable + counts = {k: len(v) for k, v in fp.uniques.items()} + + # Check to see if groupBy already gives a sufficient chunkSize + best_count = 0 + if groupBy is None: + for k, v in counts.items(): + if v <= chunkSize and v < best_count: + best_group, best_count = k, v + elif best_count == 0: + best_group, best_count = k, v + groupBy = best_group + + count = 1 + for v in groupBy: + count *= counts[v] + if count >= chunkSize: + return groupBy, count + best_group, best_count = groupBy, count + + # Search for a combination of `variables` that give a value close to the chunk_size + variables = [v for v in fp.variables if v not in groupBy] + for i in range(len(variables)): + groups = {best_group: best_count} + for p in combinations(variables, i): + group = groupBy + "".join("".join(c) for c in p) + count = 1 + for v in group: + count *= counts[v] + groups[group] = count + + # If all groups are over the chunk_size, then return just return the best_group + if all(v > chunkSize for k, v in groups.items()): + return best_group, best_count + + # Find the best_group + for k, v in groups.items(): + if v > chunkSize: + continue + if v > best_count: + best_group, best_count = k, v + return best_group, best_count + + +def save_generator_outputs(x: Dict[str, int], outDir: Path): + """Convert dictionary of filepatterns and number of image files which can be parsed with each filepattern to json file + Args: + x (Dict): A dictionary of filepatterns and number of image files which can be parsed with each filepattern + outDir (Path): Path to save the outputs + Returns: + json file with array of file patterns + """ + data = json.loads('{"filePatterns": []}') + with open(os.path.join(outDir, "file_patterns.json"), "w") as cwlout: + for key, value in x.items(): + data["filePatterns"].append(key) + json.dump(data, cwlout) + + return + + +def main( + inpDir: Path, + pattern: str, + 
chunkSize: int, + groupBy: str, + outDir: Path, +): + + starttime = time.time() + + # If the pattern isn't given, try to infer one + if pattern is None: + try: + pattern = filepattern.infer_pattern([f.name for f in inpDir.iterdir()]) + except ValueError: + logger.error( + "Could not infer a filepattern from the input files, " + + "and no filepattern was provided." + ) + raise + + assert inpDir.exists(), logger.info("Input directory does not exist") + + logger.info("Finding best grouping...") + groupBy, count = get_grouping(inpDir, pattern, groupBy, chunkSize) + + logger.info("Generating filepatterns...") + fp = filepattern.FilePattern(inpDir, pattern) + fps, counts = [], [] + for files in fp(group_by=groupBy): + fps.append(filepattern.infer_pattern([f["file"].name for f in files])) + fp_temp = filepattern.FilePattern(inpDir, fps[-1]) + counts.append(sum(len(f) for f in fp_temp)) + + assert sum(counts) == len([f for f in fp]) + + save_generator_outputs(dict(zip(fps, counts)), outDir) + + endtime = (time.time() - starttime) / 60 + logger.info(f"Total time taken to process all images: {endtime}") + + +if __name__ == "__main__": + + # Import environment variables + POLUS_LOG = getattr(logging, os.environ.get("POLUS_LOG", "INFO")) + + # Initialize the logger + logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", + ) + logger = logging.getLogger("main") + logger.setLevel(POLUS_LOG) + + # Argument parsing + logger.info("Parsing arguments...") + parser = argparse.ArgumentParser( + prog="main", description="Filepattern generator Plugin" + ) + # Input arguments + parser.add_argument( + "--inpDir", + dest="inpDir", + type=str, + help="Input image collection to be processed by this plugin", + required=True, + ) + parser.add_argument( + "--pattern", + dest="pattern", + type=str, + help="Filepattern regex used to parse image files", + required=False, + ) + parser.add_argument( + "--chunkSize", + dest="chunkSize", + type=int, + default=30, + help="Select chunksize for generating Filepattern from collective image set", + required=False, + ) + parser.add_argument( + "--groupBy", + dest="groupBy", + type=str, + help="Select a parameter to generate Filepatterns in specific order", + required=False, + ) + parser.add_argument( + "--outDir", dest="outDir", type=str, help="Output collection", required=True + ) + + # Parse the arguments + args = parser.parse_args() + inpDir = Path(args.inpDir) + + if inpDir.joinpath("images").is_dir(): + inpDir = inpDir.joinpath("images").absolute() + logger.info("inputDir = {}".format(inpDir)) + outDir = Path(args.outDir) + logger.info("outDir = {}".format(outDir)) + pattern = args.pattern + logger.info("pattern = {}".format(pattern)) + chunkSize = args.chunkSize + logger.info("chunkSize = {}".format(chunkSize)) + groupBy = args.groupBy + logger.info("groupBy = {}".format(groupBy)) + + main( + inpDir=inpDir, + pattern=pattern, + chunkSize=chunkSize, + groupBy=groupBy, + outDir=outDir, + ) diff --git a/utils/filepattern-generator-plugin/src/requirements.txt b/utils/filepattern-generator-plugin/src/requirements.txt new file mode 100644 index 0000000..aae7cb2 --- /dev/null +++ b/utils/filepattern-generator-plugin/src/requirements.txt @@ -0,0 +1 @@ +filepattern==1.4.7 diff --git a/utils/filepattern-generator-plugin/tests/test_main.py b/utils/filepattern-generator-plugin/tests/test_main.py new file mode 100644 index 0000000..caeae1f --- /dev/null +++ b/utils/filepattern-generator-plugin/tests/test_main.py @@ -0,0 
+1,43 @@ +from pathlib import Path +import os, sys + +dirpath = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(dirpath, "../")) +import unittest +from src.main import * +import json + +inpDir = Path(dirpath).parent.joinpath("images") +outDir = Path(dirpath).parent.joinpath("out") +pattern = "p0{r}_x{x+}_y{y+}_wx{t}_wy{p}_c{c}.ome.tif" +chunkSize = 9 +filename = "pattern_generator" +data = { + "p00_x01_y{rr}_wx0_wy0_c{t}.ome.tif": 30, + "p00_x01_y{rr}_wx0_wy1_c{t}.ome.tif": 30, + "p00_x01_y{rr}_wx0_wy2_c{t}.ome.tif": 30, +} + + +class Test_Filepattern_Generator(unittest.TestCase): + def setUp(self) -> None: + + self.inpDir = inpDir + self.pattern = pattern + self.chunkSize = chunkSize + self.filename = filename + self.outDir = outDir + self.data = data + + def test_generated_json_output(self): + save_generator_outputs(self.data, outDir) + with open(outDir.joinpath("file_patterns.json"), "r") as read_file: + data = json.load(read_file) + file_pattern = data["filePatterns"] + self.assertTrue(file_pattern[0] == "p00_x01_y{rr}_wx0_wy0_c{t}.ome.tif") + self.assertTrue(file_pattern[1] == "p00_x01_y{rr}_wx0_wy1_c{t}.ome.tif") + self.assertTrue(file_pattern[2] == "p00_x01_y{rr}_wx0_wy2_c{t}.ome.tif") + + +if __name__ == "__main__": + unittest.main() diff --git a/utils/filepattern-generator-plugin/tests/version_test.py b/utils/filepattern-generator-plugin/tests/version_test.py new file mode 100644 index 0000000..25f0ea2 --- /dev/null +++ b/utils/filepattern-generator-plugin/tests/version_test.py @@ -0,0 +1,46 @@ +import unittest, json +from pathlib import Path +import urllib.request as request + + +class VersionTest(unittest.TestCase): + """Verify VERSION is correct""" + + version_path = Path(__file__).parent.parent.joinpath("VERSION") + json_path = Path(__file__).parent.parent.joinpath("plugin.json") + url = "https://hub.docker.com/repository/docker/polusai/filepattern-generator-plugin/tags?page=1&ordering=last_updated" + + def test_plugin_manifest(self): + """Tests VERSION matches the version in the plugin manifest""" + + # Get the plugin version + with open(self.version_path, "r") as file: + version = file.readline() + + # Load the plugin manifest + with open(self.json_path, "r") as file: + plugin_json = json.load(file) + + self.assertEqual(plugin_json["version"], version) + self.assertTrue(plugin_json["containerId"].endswith(version)) + + def test_docker_hub(self): + """Tests VERSION matches the latest docker container tag""" + + # Get the plugin version + with open(self.version_path, "r") as file: + version = file.readline() + + response = json.load(request.urlopen(self.url)) + if len(response["results"]) == 0: + self.fail( + "Could not find repository or no containers are in the repository." 
+ ) + latest_tag = response["results"][0]["name"] + + self.assertEqual(latest_tag, version) + + +if __name__ == "__main__": + + unittest.main() diff --git a/utils/polus-csv-collection-merger/Dockerfile b/utils/polus-csv-collection-merger/Dockerfile new file mode 100644 index 0000000..9137b85 --- /dev/null +++ b/utils/polus-csv-collection-merger/Dockerfile @@ -0,0 +1,4 @@ +FROM alpine +COPY VERSION / +COPY script.sh script.sh +ENTRYPOINT ["sh", "script.sh"] \ No newline at end of file diff --git a/utils/polus-csv-collection-merger/README.md b/utils/polus-csv-collection-merger/README.md new file mode 100644 index 0000000..ea885e5 --- /dev/null +++ b/utils/polus-csv-collection-merger/README.md @@ -0,0 +1,45 @@ +# Polus CSV Collection Merger Plugin + +This plugin merges multiple CSV collections in WIPP into one collection for later analysis. + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +**This plugin is in development and is subject to change** + +## Options + +This plugin takes four input parameters and one output parameter: + +| Name | Description | I/O | WIPP Type | +|----------------------|------------------------------------------------|--------|---------------| +| `input-collection-a` | Input CSV collection A | Input | csvCollection | +| `input-collection-b` | Input CSV collection B | Input | csvCollection | +| `append-a` | Option to append collection ID to files from A | Input | boolean | +| `append-b` | Option to append collection ID to files from B | Input | boolean | +| `output` | Output CSV collection | Output | csvCollection | + +## Build the plugin + +```bash +docker build . -t labshare/polus-csv-collection-merger:0.1.1 +``` + + +## Run the plugin + +### Manually + +To test, create three folders: `<collection A>` and `<collection B>` should contain the csv collections you would like to merge, and `<output folder>` is the target folder which will contain the merged files. + +Run the docker container: +```bash +docker run -v <collection A>:/a \ + -v <collection B>:/b \ + -v <output folder>:/c \ + labshare/polus-csv-collection-merger:0.1.1 \ + --input-collection-a /a \ + --input-collection-b /b \ + --append-a 'true' \ + --append-b 'true' \ + --output /c +``` \ No newline at end of file diff --git a/utils/polus-csv-collection-merger/VERSION b/utils/polus-csv-collection-merger/VERSION new file mode 100644 index 0000000..6da28dd --- /dev/null +++ b/utils/polus-csv-collection-merger/VERSION @@ -0,0 +1 @@ +0.1.1 \ No newline at end of file diff --git a/utils/polus-csv-collection-merger/csvcollectionsmerger.cwl b/utils/polus-csv-collection-merger/csvcollectionsmerger.cwl new file mode 100644 index 0000000..fb0684d --- /dev/null +++ b/utils/polus-csv-collection-merger/csvcollectionsmerger.cwl @@ -0,0 +1,28 @@ +class: CommandLineTool +cwlVersion: v1.2 +inputs: + append-a: + inputBinding: + prefix: --append-a + type: boolean? + append-b: + inputBinding: + prefix: --append-b + type: boolean?
+ input-collection-a: + inputBinding: + prefix: --input-collection-a + type: Directory + input-collection-b: + inputBinding: + prefix: --input-collection-b + type: Directory + output: + inputBinding: + prefix: --output + type: Directory +outputs: + output: !!python/name:builtins.NotImplementedError '' +requirements: + DockerRequirement: + dockerPull: polusai/csv-collection-merger:0.1.2 diff --git a/utils/polus-csv-collection-merger/ict.yaml b/utils/polus-csv-collection-merger/ict.yaml new file mode 100644 index 0000000..b6b2e70 --- /dev/null +++ b/utils/polus-csv-collection-merger/ict.yaml @@ -0,0 +1,61 @@ +author: +- Konstantin taletskiy +contact: konstantin.taletskiy@labshare.org +container: polusai/csv-collection-merger:0.1.2 +description: Merge two csv collections. You have an option to prepend collection name + to avoid name conflicts. +entrypoint: '[python3, main.py]' +inputs: +- description: Input csv collection A. + format: + - input-collection-a + name: input-collection-a + required: true + type: path +- description: Append collection name to collection A. + format: + - append-a + name: append-a + required: false + type: boolean +- description: Input csv collection B. + format: + - input-collection-b + name: input-collection-b + required: true + type: path +- description: Append collection name to collection B. + format: + - append-b + name: append-b + required: false + type: boolean +name: polusai/CSVcollectionsmerger +outputs: +- description: Output csv collection for the plugin + format: + - output + name: output + required: true + type: path +repository: https://github.com/polusai/image-tools +specVersion: 1.0.0 +title: CSV collections merger +ui: +- description: Pick a collection... + key: inputs.input-collection-a + title: 'CSV Collection A: ' + type: path +- description: Pick an option... + key: inputs.append-a + title: 'Append collection name to filenames in A: ' + type: checkbox +- description: Pick a collection... + key: inputs.input-collection-b + title: 'CSV Collection B: ' + type: path +- description: Pick an option... + key: inputs.append-b + title: 'Append collection name to filenames in B: ' + type: checkbox +version: 0.1.2 diff --git a/utils/polus-csv-collection-merger/plugin.json b/utils/polus-csv-collection-merger/plugin.json new file mode 100644 index 0000000..d777c0c --- /dev/null +++ b/utils/polus-csv-collection-merger/plugin.json @@ -0,0 +1,61 @@ +{ + "name": "CSV collections merger", + "version": "0.1.2", + "title": "CSV collections merger", + "description": "Merge two csv collections. You have an option to prepend collection name to avoid name conflicts.", + "author": "Konstantin taletskiy (konstantin.taletskiy@labshare.org)", + "containerId": "polusai/csv-collection-merger:0.1.2", + "inputs": [ + { + "name": "input-collection-a", + "type": "csvCollection", + "description": "Input csv collection A." + }, + { + "name": "append-a", + "type": "boolean", + "required": "false", + "description": "Append collection name to collection A." + }, + { + "name": "input-collection-b", + "type": "csvCollection", + "description": "Input csv collection B." + }, + { + "name": "append-b", + "type": "boolean", + "required": "false", + "description": "Append collection name to collection B." + } + ], + "outputs": [ + { + "name": "output", + "type": "csvCollection", + "description": "Output csv collection for the plugin" + } + ], + "ui": [ + { + "key": "inputs.input-collection-a", + "title": "CSV Collection A: ", + "description": "Pick a collection..." 
+ }, + { + "key": "inputs.append-a", + "title": "Append collection name to filenames in A: ", + "description": "Pick an option..." + }, + { + "key": "inputs.input-collection-b", + "title": "CSV Collection B: ", + "description": "Pick a collection..." + }, + { + "key": "inputs.append-b", + "title": "Append collection name to filenames in B: ", + "description": "Pick an option..." + } + ] +} \ No newline at end of file diff --git a/utils/polus-csv-collection-merger/script.sh b/utils/polus-csv-collection-merger/script.sh new file mode 100644 index 0000000..646306d --- /dev/null +++ b/utils/polus-csv-collection-merger/script.sh @@ -0,0 +1,61 @@ +#!/bin/sh + +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + --input-collection-a) + INPUT_A="$2" + shift # past argument + shift # past value + ;; + --input-collection-b) + INPUT_B="$2" + shift # past argument + shift # past value + ;; + --append-a) + APPEND_A="$2" + shift # past argument + shift # past value + ;; + --append-b) + APPEND_B="$2" + shift # past argument + shift # past value + ;; + --output) + OUTPUT="$2" + shift # past argument + shift # past value + ;; +esac +done + +echo "INPUT COLLECTION A = ${INPUT_A}" +echo "INPUT COLLECTION B = ${INPUT_B}" +echo "APPEND A = ${APPEND_A}" +echo "APPEND B = ${APPEND_B}" +echo "OUTPUT = ${OUTPUT}" + +COLLECTION_A="$(basename $INPUT_A)" +COLLECTION_B="$(basename $INPUT_B)" +echo " " + +echo "Copying files from collection A ($COLLECTION_A):" +for f in $INPUT_A/*; do echo "$(basename $f)"; done +if [ "$APPEND_A" = "true" ]; then + for f in $INPUT_A/*; do cp "$f" "$OUTPUT"/"$COLLECTION_A"_"$(basename $f)"; done +else + for f in $INPUT_A/*; do cp "$f" "$OUTPUT"/"$(basename $f)"; done +fi +echo " " + +echo "Copying files from collection B ($COLLECTION_B):" +for f in $INPUT_B/*; do echo "$(basename $f)"; done +if [ "$APPEND_B" = "true" ]; then + for f in $INPUT_B/*; do cp "$f" "$OUTPUT"/"$COLLECTION_B"_"$(basename $f)"; done +else + for f in $INPUT_B/*; do cp "$f" "$OUTPUT"/"$(basename $f)"; done +fi \ No newline at end of file diff --git a/utils/polus-python-template/cookiecutter.json b/utils/polus-python-template/cookiecutter.json new file mode 100644 index 0000000..030f8cf --- /dev/null +++ b/utils/polus-python-template/cookiecutter.json @@ -0,0 +1,16 @@ +{ + "author": "Data Scientist", + "author_email": "data.scientist@labshare.org", + "plugin_name": "Awesome Plugin", + "plugin_package": "polus.plugins.package1.package2.awesome_function", + "plugin_description": "An awesome function.", + "plugin_version": "0.1.0", + + "package_folders": "{%set folders = cookiecutter.plugin_package.replace('.', '/') %}{{folders}}", + "package_name": "{% set packages = cookiecutter.plugin_package.split('.') %}{{ packages | last }}", + "project_name": "{% set project_name = cookiecutter.plugin_package.replace('_', '-').replace('.', '-') %}{{ project_name }}", + "plugin_slug": "{% set plugin_slug = cookiecutter.package_name.replace('_', '-') %}polus-{{plugin_slug}}-plugin", + "container_name": "{% set container_name = ('-').join(cookiecutter.plugin_slug.split('-')[1:])%}{{ container_name }}", + "container_id": "polusai/{{cookiecutter.container_name}}", + "container_version": "{{cookiecutter.plugin_version}}" +} diff --git a/utils/polus-stitching-vector-merger-plugin/Dockerfile b/utils/polus-stitching-vector-merger-plugin/Dockerfile new file mode 100644 index 0000000..2733ed1 --- /dev/null +++ b/utils/polus-stitching-vector-merger-plugin/Dockerfile @@ -0,0 +1,4 @@ +FROM python:3.6-alpine +COPY VERSION / +COPY main.py 
main.py +ENTRYPOINT ["python3", "main.py"] \ No newline at end of file diff --git a/utils/polus-stitching-vector-merger-plugin/README.md b/utils/polus-stitching-vector-merger-plugin/README.md new file mode 100644 index 0000000..0c7e53a --- /dev/null +++ b/utils/polus-stitching-vector-merger-plugin/README.md @@ -0,0 +1,29 @@ +# Polus Stitching Vector Collection Merger Plugin + +This WIPP plugin merges stitching vector collections together. It takes as input a minimum of 2 collections up to a maximum of 5 collections. + +Contact [Gauhar Bains](mailto:gauhar.bains@labshare.org) for more information. + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes five input arguments and one output argument: + +| Name | Description | I/O | Type | +| --------------------- | ------------------------------ | ------ | --------------- | +| `--VectorCollection1` | 1st stitchingVector Collection | Input | stitchingVector | +| `--VectorCollection2` | 2nd stitchingVector Collection | Input | stitchingVector | +| `--VectorCollection3` | 3rd stitchingVector Collection | Input | stitchingVector | +| `--VectorCollection4` | 4th stitchingVector Collection | Input | stitchingVector | +| `--VectorCollection5` | 5th stitchingVector Collection | Input | stitchingVector | +| `--outDir` | Output collection | Output | stitchingVector | diff --git a/utils/polus-stitching-vector-merger-plugin/VERSION b/utils/polus-stitching-vector-merger-plugin/VERSION new file mode 100644 index 0000000..84aa3a7 --- /dev/null +++ b/utils/polus-stitching-vector-merger-plugin/VERSION @@ -0,0 +1 @@ +0.1.8 \ No newline at end of file diff --git a/utils/polus-stitching-vector-merger-plugin/build-docker.sh b/utils/polus-stitching-vector-merger-plugin/build-docker.sh new file mode 100755 index 0000000..5515f86 --- /dev/null +++ b/utils/polus-stitching-vector-merger-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))?
+serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:ict.yaml] + +[bumpversion:file:downloadrxivtextdata.cwl] + +[bumpversion:file:plugin.json] + +[bumpversion:file:src/polus/tabular/utils/rxiv_download/__init__.py] diff --git a/utils/rxiv-download-tool/.dockerignore b/utils/rxiv-download-tool/.dockerignore new file mode 100644 index 0000000..7c603f8 --- /dev/null +++ b/utils/rxiv-download-tool/.dockerignore @@ -0,0 +1,4 @@ +.venv +out +tests +__pycache__ diff --git a/utils/rxiv-download-tool/.gitignore b/utils/rxiv-download-tool/.gitignore new file mode 100644 index 0000000..c9c7ae7 --- /dev/null +++ b/utils/rxiv-download-tool/.gitignore @@ -0,0 +1,2 @@ +poetry.lock +out diff --git a/utils/rxiv-download-tool/Dockerfile b/utils/rxiv-download-tool/Dockerfile new file mode 100644 index 0000000..73d25ec --- /dev/null +++ b/utils/rxiv-download-tool/Dockerfile @@ -0,0 +1,20 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_EXT=".xml" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +ENTRYPOINT ["python3", "-m", "polus.tabular.utils.rxiv_download"] +CMD ["--help"] diff --git a/utils/rxiv-download-tool/README.md b/utils/rxiv-download-tool/README.md new file mode 100644 index 0000000..fcb0d7b --- /dev/null +++ b/utils/rxiv-download-tool/README.md @@ -0,0 +1,30 @@ +# Rxiv Download (v0.1.0-dev1) + +This plugin allows to download data from open access archives. Currently this plugin supports downloading data from [arxiv](https://www.openarchives.org/). Later additional support for other archives will be added. + +## Building + +To build the Docker image for the download plugin, run +`bash build-docker.sh`. + +## Run the Docker image + +To execute the built docker image for the download plugin, run +`bash run-plugin.sh`. 
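For orientation only, the sketch below shows the kind of OAI-PMH request such a download boils down to. It is not the plugin's internal API: the endpoint URL matches the sample response in the test data, while the metadata prefix and date format are assumptions made for this sketch.

```python
"""Illustrative sketch only: fetch one page of arXiv records over OAI-PMH.

Not the plugin's internal API. The endpoint matches the sample response in the
test data; the metadata prefix and date format are assumptions.
"""
import requests

OAI_URL = "http://export.arxiv.org/oai2"
params = {
    "verb": "ListRecords",
    "metadataPrefix": "oai_dc",  # assumed; arXiv also serves other formats
    "from": "2023-02-16",        # corresponds to the --start argument
}

resp = requests.get(OAI_URL, params=params, timeout=60)
resp.raise_for_status()
print(resp.text[:500])  # raw XML, parsed downstream (e.g. with xmltodict)
```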
+ +## Options + +This plugin takes 2 input arguments and +1 output argument: + +| Name | Description | I/O | Type | +| --------------- | ------------------------------------------------------------ | ------ | ----------- | +| `--rxiv ` | Download data from open access archives | Input | String | +| `--start ` | Start date | Input | String | +| `--outDir` | Directory to store the downloaded data | Output | genericData | +| `--preview` | Generate a JSON file with outputs | Output | JSON | + + + +## Sample docker command: +```docker run -v /home/ec2-user/data/:/home/ec2-user/data/ polusai/rxiv-download-tool:0.1.0-dev1 --rxiv="arXiv" --start='2023-2-16' --outDir=/home/ec2-user/data/output``` diff --git a/utils/rxiv-download-tool/VERSION b/utils/rxiv-download-tool/VERSION new file mode 100644 index 0000000..6b1a238 --- /dev/null +++ b/utils/rxiv-download-tool/VERSION @@ -0,0 +1 @@ +0.1.0-dev1 diff --git a/utils/rxiv-download-tool/build-docker.sh b/utils/rxiv-download-tool/build-docker.sh new file mode 100644 index 0000000..a5b03c6 --- /dev/null +++ b/utils/rxiv-download-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$( + +2023-12-18T17:52:12Z +http://export.arxiv.org/oai2 + + +
+ oai:arXiv.org:1007.1025 + 2023-12-18 + cs + physics:nlin +
+ + + Inflection system of a language as a complex network + Fukś, Henryk + Computer Science - Computation and Language + Nonlinear Sciences - Adaptation and Self-Organizing Systems + We investigate inflection structure of a synthetic language using Latin as an +example. We construct a bipartite graph in which one group of vertices +correspond to dictionary headwords and the other group to inflected forms +encountered in a given text. Each inflected form is connected to its +corresponding headword, which in some cases in non-unique. The resulting sparse +graph decomposes into a large number of connected components, to be called word +groups. We then show how the concept of the word group can be used to construct +coverage curves of selected Latin texts. We also investigate a version of the +inflection graph in which all theoretically possible inflected forms are +included. Distribution of sizes of connected components of this graphs +resembles cluster distribution in a lattice percolation near the critical +point. + + Comment: 6 pages, 9 figures + 2010-07-06 + text + http://arxiv.org/abs/1007.1025 + Proceedings of 2009 IEEE Toronto International Conference - + Science and Technology for Humanity, IEEE, Toronto 2009, pp. 491-496 + doi:10.1109/TIC-STH.2009.5444449 + + +
+ +
+ oai:arXiv.org:1007.1026 + 2023-12-18 + physics:nlin +
+ + + On the calibration of neural networks for histological slide-level + classification + Kurz, Alexander + Mehrtens, Hendrik A. + Bucher, Tabea-Clara + Brinker, Titus J. + Electrical Engineering and Systems Science - Image and Video Processing + Computer Science - Computer Vision and Pattern Recognition + Deep Neural Networks have shown promising classification performance when +predicting certain biomarkers from Whole Slide Images in digital pathology. +However, the calibration of the networks' output probabilities is often not +evaluated. Communicating uncertainty by providing reliable confidence scores is +of high relevance in the medical context. In this work, we compare three neural +network architectures that combine feature representations on patch-level to a +slide-level prediction with respect to their classification performance and +evaluate their calibration. As slide-level classification task, we choose the +prediction of Microsatellite Instability from Colorectal Cancer tissue +sections. We observe that Transformers lead to good results in terms of +classification performance and calibration. When evaluating the classification +performance on a separate dataset, we observe that Transformers generalize +best. The investigation of reliability diagrams provides additional insights to +the Expected Calibration Error metric and we observe that especially +Transformers push the output probabilities to extreme values, which results in +overconfident predictions. + + Comment: 7 pages, 2 figures, 2 tables + 2023-12-15 + text + http://arxiv.org/abs/2312.09719 + + +
+ +
+ oai:arXiv.org:2312.09720 + 2023-12-18 + eess +
+ + + RIS-Enabled NLoS Near-Field Joint Position and Velocity Estimation under + User Mobility + Rahal, Moustafa + Denis, Benoit + Keskin, Musa Furkan + Uguen, Bernard + Wymeersch, Henk + Electrical Engineering and Systems Science - Signal Processing + In the context of single-base station (BS) non-line-of-sight (NLoS) +single-epoch localization with the aid of a reflective reconfigurable +intelligent surface (RIS), this paper introduces a novel three-step algorithm +that jointly estimates the position and velocity of a mobile user equipment +(UE), while compensating for the Doppler effects observed in near-field (NF) at +the RIS elements over the short transmission duration of a sequence of downlink +(DL) pilot symbols. First, a low-complexity initialization procedure is +proposed, relying in part on far-field (FF) approximation and a static user +assumption. Then, an alternating optimization procedure is designed to +iteratively refine the velocity and position estimates, as well as the channel +gain. The refinement routines leverage small angle approximations and the +linearization of the RIS response, accounting for both NF and mobility effects. +We evaluate the performance of the proposed algorithm through extensive +simulations under diverse operating conditions with regard to signal-to-noise +ratio (SNR), UE mobility, uncontrolled multipath and RIS-UE distance. Our +results reveal remarkable performance improvements over the state-of-the-art +(SoTA) mobility-agnostic benchmark algorithm, while indicating convergence of +the proposed algorithm to respective theoretical bounds on position and +velocity estimation. + + Comment: 11 pages, 9 figures, journal + 2023-12-15 + text + http://arxiv.org/abs/2312.09720 + + +
+6905935|1001 +
+
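The records above are a sample OAI-PMH `ListRecords` response from the arXiv endpoint: Dublin Core metadata for each paper followed by a resumption token (`6905935|1001`, i.e. the complete list size and the cursor of the next page). As a rough illustration only, and assuming the standard OAI-PMH 2.0 / `oai_dc` element names (they are not spelled out in this repository), a saved dump of this kind could be summarized with `xmltodict`, which the rxiv-download tool already depends on:

```python
"""Illustrative sketch: summarize a saved OAI-PMH dump with xmltodict.

Assumes the standard OAI-PMH 2.0 / oai_dc layout; the element names and the
file name below are assumptions, not taken from this repository.
"""
from pathlib import Path

import xmltodict


def summarize(xml_path: Path) -> None:
    """Print each record's identifier and title, then the resumption token."""
    doc = xmltodict.parse(xml_path.read_text(), attr_prefix="")
    list_records = doc["OAI-PMH"]["ListRecords"]

    records = list_records["record"]
    if isinstance(records, dict):  # xmltodict returns a dict when there is a single record
        records = [records]

    for rec in records:
        title = rec["metadata"]["oai_dc:dc"]["dc:title"]
        print(f'{rec["header"]["identifier"]}: {title}')

    token = list_records.get("resumptionToken")
    if isinstance(token, dict):  # the token may carry attributes such as completeListSize
        token = token.get("#text")
    # The token text looks like "6905935|1001": complete list size | cursor
    print("resumption token:", token)


if __name__ == "__main__":
    summarize(Path("arXiv_19000101_0.xml"))  # hypothetical file name
```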
diff --git a/utils/rxiv-download-tool/ict.yaml b/utils/rxiv-download-tool/ict.yaml new file mode 100644 index 0000000..6950894 --- /dev/null +++ b/utils/rxiv-download-tool/ict.yaml @@ -0,0 +1,51 @@ +author: +- Nick Schaub +- Hamdah Shafqat +contact: nick.schaub@nih.gov +container: polusai/rxiv-download-tool:0.1.0-dev1 +description: This plugin allows to download data from Rxiv website. +entrypoint: python3 -m polus.tabular.utils.rxiv_download +inputs: +- description: Pull records from open access archives. + format: + - rxiv + name: rxiv + required: true + type: string +- description: Start date. + format: + - start + name: start + required: false + type: string +- description: Generate an output preview. + format: + - preview + name: preview + required: false + type: boolean +name: polusai/DownloadRxivtextdata +outputs: +- description: Output collection. + format: + - outDir + name: outDir + required: true + type: path +repository: https://github.com/PolusAI/tabular-tools +specVersion: 1.0.0 +title: Download Rxiv text data +ui: +- description: Pull records from open access archives. + key: inputs.rxiv + title: rxiv + type: text +- description: Start date. + key: inputs.start + title: start + type: text +- description: Generate an output preview. + key: inputs.preview + title: Preview example output of this plugin + type: checkbox +version: 0.1.0-dev1 diff --git a/utils/rxiv-download-tool/plugin.json b/utils/rxiv-download-tool/plugin.json new file mode 100644 index 0000000..8f08081 --- /dev/null +++ b/utils/rxiv-download-tool/plugin.json @@ -0,0 +1,70 @@ +{ + "name": "Rxiv-Download", + "version": "0.1.0-dev1", + "title": "Download Rxiv text data", + "description": "This plugin allows to download data from Rxiv website.", + "author": "Nick Schaub (nick.schaub@nih.gov), Hamdah Shafqat Abbasi (hamdahshafqat.abbasi@nih.gov)", + "institution": "National Center for Advancing Translational Sciences, National Institutes of Health", + "repository": "https://github.com/PolusAI/tabular-tools", + "website": "https://ncats.nih.gov/preclinical/core/informatics", + "citation": "", + "containerId": "polusai/rxiv-download-tool:0.1.0-dev1", + "baseCommand": [ + "python3", + "-m", + "polus.tabular.utils.rxiv_download" + ], + "inputs": [ + { + "name": "rxiv", + "type": "string", + "title": "rxiv", + "description": "Pull records from open access archives.", + "required": "True" + }, + { + "name": "start", + "type": "string", + "title": "start", + "description": "Start date.", + "required": "False" + }, + { + "name": "preview", + "type": "boolean", + "title": "Preview", + "description": "Generate an output preview.", + "required": "False" + } + ], + "outputs": [ + { + "name": "outDir", + "type": "genericData", + "description": "Output collection." 
+ } + ], + "ui": [ + { + "key": "inputs.rxiv", + "type": "string", + "title": "rxiv", + "description": "Pull records from open access archives.", + "required": "True" + }, + { + "key": "inputs.start", + "type": "string", + "title": "start", + "description": "Start date.", + "required": "False" + }, + { + "key": "inputs.preview", + "type": "boolean", + "title": "Preview example output of this plugin", + "description": "Generate an output preview.", + "required": "False" + } + ] +} \ No newline at end of file diff --git a/utils/rxiv-download-tool/pyproject.toml b/utils/rxiv-download-tool/pyproject.toml new file mode 100644 index 0000000..c9dfcaf --- /dev/null +++ b/utils/rxiv-download-tool/pyproject.toml @@ -0,0 +1,41 @@ +[tool.poetry] +name = "polus-tabular-utils-rxiv-download" +version = "0.1.0-dev1" +description = "Fetch text data from rxiv" +authors = [ + "Nick Schaub ", + "Hamdah Shafqat abbasi " + ] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +typer = "^0.7.0" +requests = "^2.31.0" +rxiv-types = "^0.1.0" +tqdm = "^4.66.1" +xmltodict = "^0.13.0" +pydantic = "1.10.4" + + +[[tool.poetry.source]] +name = "test" +url = "https://test.pypi.org/simple/" +default = false +secondary = true + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +flake8 = "^6.0.0" +pre-commit = "^3.2.1" +flake8-docstrings = "^1.7.0" +black = "^23.3.0" +mypy = "^1.1.1" +pytest = "^7.2.2" +ruff = "^0.0.270" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/utils/rxiv-download-tool/run-plugin.sh b/utils/rxiv-download-tool/run-plugin.sh new file mode 100644 index 0000000..48c596a --- /dev/null +++ b/utils/rxiv-download-tool/run-plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +version=$( None: + """Download text records from an open-access archive and save them to the output directory.""" + logger.info(f"--rxiv = {rxiv}") + logger.info(f"--start = {start}") + logger.info(f"--outDir = {out_dir}") + + start_date = None + if start is not None: + start_date = datetime.strptime(start, "%Y-%m-%d").date() + + out_dir = out_dir.resolve() + + if not out_dir.exists(): + out_dir.mkdir(exist_ok=True) + + assert out_dir.exists(), f"{out_dir} does not exist!!
Please check input path again" + + model = ArxivDownload(path=out_dir, rxiv=rxiv, start=start_date) + model.fetch_and_save_records() + + if preview: + generate_preview(out_dir) + logger.info(f"generating preview data in {out_dir}") + + +if __name__ == "__main__": + app() diff --git a/utils/rxiv-download-tool/src/polus/tabular/utils/rxiv_download/fetch.py b/utils/rxiv-download-tool/src/polus/tabular/utils/rxiv_download/fetch.py new file mode 100644 index 0000000..b198627 --- /dev/null +++ b/utils/rxiv-download-tool/src/polus/tabular/utils/rxiv_download/fetch.py @@ -0,0 +1,217 @@ +"""Rxiv Download Plugin.""" +import json +import logging +import os +import shutil +from concurrent.futures import ProcessPoolExecutor +from datetime import datetime +from io import BytesIO +from pathlib import Path +from typing import Optional + +import requests +import xmltodict +from rxiv_types import arxiv_records +from rxiv_types.models.oai_pmh.org.openarchives.oai.pkg_2.resumption_token_type import ( + ResumptionTokenType, +) +from tqdm import tqdm +from xsdata.models.datatype import XmlDate + +logger = logging.getLogger(__name__) +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) + +POLUS_EXT = os.environ.get("POLUS_EXT", ".xml") + +RXIVS = { + "arXiv": {"url": "https://export.arxiv.org/oai2", "stride": 1000}, +} + + +def generate_preview( + path: Path, +) -> None: + """Generate preview of the plugin outputs.""" + prev_file = list( + Path().cwd().parents[4].joinpath("examples").rglob(f"*{POLUS_EXT}"), + )[0] + + shutil.copy(prev_file, path) + + +class ArxivDownload: + """Fetch OAI records from an API. + + Args: + rxiv: The rxiv to pull from. Must be one of ["arXiv"].str + token: A resumption token. Defaults to None. + start: Start date. Only used if `token=None`. + + Returns: + Raw XML bytes. + """ + + def __init__( + self, + path: Path, + rxiv: str, + start: Optional[datetime] = None, + ) -> None: + """Create a ArxivDownload.""" + self.path = path + self.rxiv = rxiv + self.start = start + + if self.rxiv not in RXIVS: + msg = f"{self.rxiv} is an invalid rxiv value. 
Must be one of {list(RXIVS)}" + raise ValueError( + msg, + ) + + if self.start is None and len(list(self.path.rglob(f"*{POLUS_EXT}"))) == 0: + self.start = datetime(1900, 1, 1) + + elif self.start is None and len(list(self.path.rglob(f"*{POLUS_EXT}"))) != 0: + self.start = self._resume_from() + + self.start = self.start + + self.params = {"verb": "ListRecords"} + + @staticmethod + def path_from_token( + path: Path, + rxiv: str, + start: Optional[datetime] = None, + token: Optional[ResumptionTokenType] = None, + ) -> Path: + """Creating output directory for saving records.""" + if start and token is not None: + file_path = path.joinpath( + f"{rxiv}_" + + f"{start.year}{str(start.month).zfill(2)}{str(start.day).zfill(0)}_" + + f"{int(token.cursor)}{POLUS_EXT}", + ) + + file_path.parent.mkdir(exist_ok=True, parents=True) + + return file_path + + def fetch_records(self) -> bytes: + """Fetch OAI records from an API.""" + # Configure parameters + if self.start is not None: + self.params.update( + { + "from": f"{self.start.year}-" + + f"{str(self.start.month).zfill(2)}-" + + f"{str(self.start.day).zfill(2)}", + "metadataPrefix": "oai_dc", + }, + ) + response = requests.get( + RXIVS["arXiv"]["url"], # type: ignore + params=self.params, + timeout=20, + ) + if response.ok: + logger.info( + f"Successfully hit url: {response.url}", + ) + else: + logger.info( + f"Error pulling data: {response.url} status {response.status_code}", + ) + + return response.content + + @staticmethod + def _get_latest(file: Path) -> datetime: + """Find the latest date to resume download files.""" + fixed_date = datetime(1900, 1, 1) + records = arxiv_records(str(file.absolute())) + if records.list_records is None: + msg = "Record list is empty!! Please download it again" + raise ValueError(msg) + for record in records.list_records.record: + if record.header is None: + msg = "Record header is empty!! Please download it again" + raise ValueError(msg) + if not isinstance(record.header.datestamp, XmlDate): + msg = "Record date is missing!!" 
+ raise ValueError(msg) + record_date = record.header.datestamp.to_datetime() + if record_date > fixed_date: + last = record_date + return last + + def _resume_from(self) -> datetime: + """Find the previous cursor and create a resume token.""" + if not self.path.exists(): + return datetime(1900, 1, 1) + files = [ + f + for f in self.path.iterdir() + if f.is_file() and f.name.startswith(self.rxiv) + ] + + with ProcessPoolExecutor() as executor: + dates = list(executor.map(self._get_latest, files)) + return max(dates) + + @staticmethod + def save_records(path: Path, record: bytes) -> None: + """Writing response content either in XML or JSON format.""" + if POLUS_EXT == ".xml": + with Path.open(path, "wb") as fw: + fw.write(record) + fw.close() + elif POLUS_EXT == ".json": + parsed_data = xmltodict.parse(record, attr_prefix="") + json_data = json.dumps(parsed_data, indent=2) + with Path.open(path, "w") as fw: + fw.write(json_data) + fw.close() + + def fetch_and_save_records(self) -> None: + """Fetch and save response contents.""" + response = self.fetch_records() + + records = arxiv_records(BytesIO(response)) + + if records.list_records is None: + msg = "Unable to download a record" + raise ValueError(msg) + + for record in records.list_records.record: + if record.header is not None and not isinstance( + record.header.datestamp, + XmlDate, + ): + msg = "Error with downloading a XML record" + raise ValueError(msg) + + logger.info("Getting token...") + token = records.list_records.resumption_token + key, _ = token.value.split("|") + index = token.cursor + + if token.complete_list_size is None: + msg = "Error with downloading a XML record" + raise ValueError(msg) + + logger.info(f"Resuming from date: {self.start}") + + for i in tqdm( + range(int(index), token.complete_list_size, 1000), + total=((token.complete_list_size - int(index)) // 1000 + 1), + ): + thread_token = ResumptionTokenType(value="|".join([key, str(i)]), cursor=i) + + file_path = self.path_from_token( + path=self.path, + rxiv=self.rxiv, + start=self.start, + token=thread_token, + ) + self.save_records(path=file_path, record=response) diff --git a/utils/rxiv-download-tool/tests/__init__.py b/utils/rxiv-download-tool/tests/__init__.py new file mode 100644 index 0000000..17974cd --- /dev/null +++ b/utils/rxiv-download-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Rxiv Download Plugin.""" diff --git a/utils/rxiv-download-tool/tests/conftest.py b/utils/rxiv-download-tool/tests/conftest.py new file mode 100644 index 0000000..b1448d5 --- /dev/null +++ b/utils/rxiv-download-tool/tests/conftest.py @@ -0,0 +1,41 @@ +"""Test fixtures. + +Set up all data used in tests. 
+""" + +import shutil +import tempfile +from pathlib import Path +from typing import Union + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) + + +def clean_directories() -> None: + """Remove all temporary directories.""" + for d in Path(".").cwd().iterdir(): + if d.is_dir() and d.name.startswith("tmp"): + shutil.rmtree(d) + + +@pytest.fixture() +def output_directory() -> Union[str, Path]: + """Create output directory.""" + return Path(tempfile.mkdtemp(dir=Path.cwd())) + + +@pytest.fixture(params=["2023-12-16", "2023-12-17"]) +def get_params(request: pytest.FixtureRequest) -> pytest.FixtureRequest: + """To get the parameter of the fixture.""" + return request.param diff --git a/utils/rxiv-download-tool/tests/test_cli.py b/utils/rxiv-download-tool/tests/test_cli.py new file mode 100644 index 0000000..807ea43 --- /dev/null +++ b/utils/rxiv-download-tool/tests/test_cli.py @@ -0,0 +1,51 @@ +"""Test Command line Tool.""" + +from typer.testing import CliRunner +from pathlib import Path +import pytest +from polus.tabular.utils.rxiv_download.__main__ import app +from .conftest import clean_directories +import time + + +def test_cli(output_directory: Path, get_params: pytest.FixtureRequest) -> None: + """Test the command line.""" + runner = CliRunner() + start = get_params + result = runner.invoke( + app, + [ + "--rxiv", + "arXiv", + "--start", + start, + "--outDir", + output_directory, + ], + ) + + assert result.exit_code == 0 + time.sleep(5) + clean_directories() + + +@pytest.mark.skipif("not config.getoption('slow')") +def test_short_cli(output_directory: Path, get_params: pytest.FixtureRequest) -> None: + """Test short cli command line.""" + runner = CliRunner() + start = get_params + result = runner.invoke( + app, + [ + "-r", + "arXiv", + "-s", + start, + "-o", + output_directory, + ], + ) + + assert result.exit_code == 0 + time.sleep(5) + clean_directories() diff --git a/utils/rxiv-download-tool/tests/test_fetch.py b/utils/rxiv-download-tool/tests/test_fetch.py new file mode 100644 index 0000000..92a17cd --- /dev/null +++ b/utils/rxiv-download-tool/tests/test_fetch.py @@ -0,0 +1,43 @@ +"""Test Command line Tool.""" + +from pathlib import Path +import polus.tabular.utils.rxiv_download.fetch as ft +from .conftest import clean_directories +import time +import pytest +from datetime import datetime + + +def test_fetch_and_save_records( + output_directory: Path, get_params: pytest.FixtureRequest +) -> None: + """Test record fetching and saving.""" + + start = datetime.strptime(get_params, "%Y-%m-%d").date() + + model = ft.ArxivDownload(path=output_directory, rxiv="arXiv", start=start) + model.fetch_and_save_records() + + out_ext = all([Path(f.name).suffix for f in output_directory.iterdir()]) + + assert out_ext == True + + out_date = [Path(f.name).stem.split("_")[1] for f in output_directory.iterdir()][0] + assert out_date == "".join(get_params.split("-")) + clean_directories() + time.sleep(5) + + +def test_fetch_records( + output_directory: Path, get_params: pytest.FixtureRequest +) -> None: + """Test fetch records.""" + + start = datetime.strptime(get_params, "%Y-%m-%d").date() + + model = ft.ArxivDownload(path=output_directory, rxiv="arXiv", start=start) + response = model.fetch_records() + + assert response != 0 + clean_directories() + time.sleep(5) diff --git a/visualization/polus-graph-pyramid-builder-plugin/Dockerfile 
b/visualization/polus-graph-pyramid-builder-plugin/Dockerfile new file mode 100644 index 0000000..d303a4f --- /dev/null +++ b/visualization/polus-graph-pyramid-builder-plugin/Dockerfile @@ -0,0 +1,24 @@ +# Get image containing bfio +FROM polusai/bfio:2.1.9 + +COPY VERSION / + +ARG EXEC_DIR="/opt/executables" +ARG DATA_DIR="/data" + +#Create folders +RUN mkdir -p ${EXEC_DIR} \ + && mkdir -p ${DATA_DIR}/inputs \ + && mkdir ${DATA_DIR}/outputs + +#Copy executable +COPY src ${EXEC_DIR}/ + +RUN pip3 install -r ${EXEC_DIR}/requirements.txt --no-cache-dir + +RUN python3 ${EXEC_DIR}/dl_fi.py + +WORKDIR ${EXEC_DIR} + +# Default command. Additional arguments are provided through the command line +ENTRYPOINT ["python3", "/opt/executables/main.py"] diff --git a/visualization/polus-graph-pyramid-builder-plugin/README.md b/visualization/polus-graph-pyramid-builder-plugin/README.md new file mode 100644 index 0000000..4b9a1f1 --- /dev/null +++ b/visualization/polus-graph-pyramid-builder-plugin/README.md @@ -0,0 +1,40 @@ +# Polus Graph Pyramid Builder Plugin + +This WIPP plugin imports a csv collection and builds a DeepZoom pyramid of graphs, where each graph is a heatmap of one column plotted against another. All n columns are plotted against each other, excluding transposed graphs and graphs where both axes use the same column. This gives a total of (n^2-n)/2 graphs. + +Two types of graphs will be produced: +1) Linear scaled graphs +2) Log scaled graphs + + The output will contain dzi and csv files for both the linear and log scaled outputs. + There will be two separate directories containing the pyramid images for the linear and log scaled outputs. + +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Building + +To build the Docker image for this plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit.
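As a quick check of the graph count stated above: one heatmap is drawn for every unordered pair of distinct numeric columns, which is (n^2-n)/2 pairs. A small standalone illustration (the column names are made up; this is not plugin code):

```python
from itertools import combinations

# Five hypothetical numeric columns -> (5**2 - 5) / 2 = 10 heatmaps
columns = ["area", "perimeter", "intensity", "eccentricity", "solidity"]
pairs = list(combinations(columns, 2))

assert len(pairs) == (len(columns) ** 2 - len(columns)) // 2  # 10 graphs for n = 5
print(pairs)
```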
+ +## Options + +This plugin takes one input argument and one output argument: + +| Name | Description | I/O | Type | +| -------- | ---------------------- | ------ | ---------------- | +| `inpDir` | Input CSV collection | Input | CSV Collection | +| `outDir` | Output pyramid | Output | Pyramid | + +## Run the plugin + +### Run the Docker Container + +```bash +docker run -v /path/to/data:/data graph-pyramid-builder \ + --inpDir /data/input \ + --outDir /data/output +``` diff --git a/visualization/polus-graph-pyramid-builder-plugin/VERSION b/visualization/polus-graph-pyramid-builder-plugin/VERSION new file mode 100644 index 0000000..e05cb33 --- /dev/null +++ b/visualization/polus-graph-pyramid-builder-plugin/VERSION @@ -0,0 +1 @@ +1.3.8 diff --git a/visualization/polus-graph-pyramid-builder-plugin/build-docker.sh b/visualization/polus-graph-pyramid-builder-plugin/build-docker.sh new file mode 100755 index 0000000..e96c755 --- /dev/null +++ b/visualization/polus-graph-pyramid-builder-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(' + +# Initialize the logger +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%d-%b-%y %H:%M:%S') +logger = logging.getLogger("main") +logger.setLevel(logging.INFO) + +def is_number(value): + """ This function checks to see if the value can be converted to a number """ + try: + float(value) + return True + except: + return False + +def load_csv(fpath): + """ Load a csv and select data + + Data is loaded from a csv, and data columns containing numeric values are + returned in a pandas Dataframe. The second row of the csv may contain + column classifiers, so the second row is first loaded and checked to + determine if the classifiers are present. + Inputs: + fpath - Path to csv file + Outputs: + data - A pandas Dataframe + cnames - Names of columns + """ + + # Check if the first row is column coding, and if it is then find valid columns + data = pandas.read_csv(fpath,nrows=1) + is_coded = True + cnames = [] + for ind,fname in zip(range(len(data.columns)),data.columns): + if data[fname][0] != 'F' and data[fname][0] != 'C': + is_coded = False + if is_number(data[fname][0]): + cnames.append([fname,ind]) + else: + logging.info('Column {} does not appear to contain numeric values. 
Not building graphs for this column.'.format(fname)) + elif data[fname][0] == 'F': + cnames.append([fname,ind]) + else: + logging.info('Skipping column {} for reason: one hot encodings'.format(fname)) + + # Load the data + if is_coded: + data = pandas.read_csv(fpath,skiprows=[1],usecols=[c[0] for c in cnames]) + + else: + data = pandas.read_csv(fpath,usecols=[c[0] for c in cnames]) + + return data, cnames + +def bin_data(data, bin_stats): + """ This function bins the data + Inputs: + data - pandas dataframe of data + bin_stats - stats of the data + Outputs: + bins - binned data ranging from (0, bincount) + graph_index - Numeric value of column index from original csv + graph_dict - a dictionary containing the indexes of graphs + """ + + column_names = data.columns + nfeats = data.shape[1] + nrows = data.shape[0] + + # Handle NaN values + data_ind = pandas.notnull(data) + data[~data_ind] = 255 + + data = data.astype(np.uint16) # cast to save memory + data[data>=bincount] = bincount - 1 # in case of numerical precision issues + + + if nrows < 2**8: + dtype = np.uint8 + elif nrows < 2**16: + dtype = np.uint16 + elif nrows < 2**32: + dtype = np.uint32 + else: + dtype = np.uint64 + + totalgraphs = int((nfeats**2 - nfeats)/2) + bins = np.zeros((totalgraphs, bincount, bincount), dtype=dtype) + graph_index = [] + graph_dict = {} + + # Create a linear index for feature bins + i = 0 + for feat1 in range(nfeats): + name1 = column_names[feat1] + feat1_tf = data[name1] * bincount + + for feat2 in range(feat1 + 1, nfeats): + graph_dict[(feat1, feat2)] = i + name2 = column_names[feat2] + + feat2_tf = data[name2] + feat2_tf = feat2_tf[data_ind[name1] & data_ind[name2]] + + if feat2_tf.size<=1: + continue + + # sort linear matrix indices + SortedFeats = np.sort(feat1_tf[data_ind[name1] & data_ind[name2]] + feat2_tf) + + # Do math to get the indices + ind2 = np.nonzero(np.diff(SortedFeats))[0] # nonzeros are cumulative sum of all bin values + ind2 = np.append(ind2,SortedFeats.size-1) + rows = (SortedFeats[ind2]/bincount).astype(np.uint8) # calculate row from linear index + cols = np.mod(SortedFeats[ind2],bincount) # calculate column from linear index + counts = np.diff(ind2) # calculate the number of values in each bin + + bins[i,rows[0],cols[0]] = ind2[0] + 1 + bins[i,rows[1:],cols[1:]] = counts + graph_index.append([feat1,feat2]) + i = i + 1 + + return bins, graph_index, graph_dict + +def transform_data(data,column_names, typegraph): + """ Bin the data + + Data from a pandas Dataframe is binned in two dimensions. Binning is performed by + binning data in one column along one axis and another column is binned along the + other axis. All combinations of columns are binned without repeats or transposition. + There are only bincount number of bins in each dimension, and each bin is 1/bincount the size of the + difference between the maximum and minimum of each column. 
+ If the data needs to be logarithmically scaled, then the data is transformed by the algorithm presented + in this paper: https://iopscience.iop.org/article/10.1088/0957-0233/24/2/027001 + Inputs: + data - A pandas Dataframe, with nfeats number of columns + column_names - Names of Dataframe columns + typegraph - Defines whether logarithmic or linear scaling is used + Outputs: + bins - A numpy matrix that has shape (int((nfeats**2 - nfeats)/2),bincount,bincount) + bin_stats - A dictionary containing the minimum, maximum, and bin width of each column + index - A list of [column 1, column 2] index pairs, one pair per graph + diction - a dictionary containing the indexes of graphs + """ + + nfeats = len(column_names) + + # If logarithmic, need to transform the data + # https://iopscience.iop.org/article/10.1088/0957-0233/24/2/027001 + # Adjusts for behavior near zero + + if typegraph == "log": + C = 1/np.log(10) # C = 1/ln(10), so the transform is approximately linear (slope 1) near zero + data = data.astype(np.float64) + data = np.sign(data) * np.log10(1 + (abs(data/C))) + + bin_stats = {'min': data.min(), + 'max': data.max(), + 'binwidth': (data.max()-data.min()+10**-6)/bincount} + + + # Transform data into bin positions for fast binning + data = ((data - bin_stats['min'])/bin_stats['binwidth']).apply(np.floor) + + bins, index, diction = bin_data(data, bin_stats) + return bins, bin_stats, index, diction + +""" 2. Plot Generation """ +def format_ticks(out): + """ Generate tick labels + Polus Plots uses D3 to generate the plots. This function tries to mimic the + formatting of tick labels. In place of using scientific notation, a scale + prefix is appended to the end of the number. See the _prefix comments for the + suffixes that are used. Numbers larger than 10**24 or smaller than 10**-24 + are not handled and may throw an error. Values outside + of this range do not currently have an agreed upon prefix in the measurement + science community.
+ + Inputs: + out - the values of the ticks used in graph + Outputs: + fticks - a list of strings containing formatted tick labels + """ + _prefix = { + -24: 'y', # yocto + -21: 'z', # zepto + -18: 'a', # atto + -15: 'f', # femto + -12: 'p', # pico + -9: 'n', # nano + -6: 'u', # micro + -3: 'm', # mili + 0: ' ', + 3: 'k', # kilo + 6: 'M', # mega + 9: 'G', # giga + 12: 'T', # tera + 15: 'P', # peta + 18: 'E', # exa + 21: 'Z', # zetta + 24: 'Y', # yotta + } + + fticks = [] + convertprefix = [] + for i in out: + formtick = "%#.3f" % i + decformtick = '%.2e' % Decimal(formtick) + convertexponent = float(decformtick[-3:]) + numbers = float(decformtick[:-4]) + if convertexponent > 0: + if convertexponent % 3 == 2: + movednum = round(numbers/10,2) + newprefix = _prefix[int(convertexponent + 1)] + formtick = str(movednum) + newprefix + elif convertexponent % 3 == 1: + movednum = round(numbers*10,1) + newprefix = _prefix[int(convertexponent - 1)] + formtick = str(movednum) + newprefix + else: + newprefix = _prefix[int(convertexponent)] + if i < 0: + formtick = str(decformtick[:5]) + newprefix + else: + formtick = str(decformtick[:4]) + newprefix + elif convertexponent < 0: + if convertexponent % -3 == -2: + movednum = round(numbers*10,1) + newprefix = _prefix[int(convertexponent - 1)] + formtick = str(movednum) + newprefix + elif convertexponent % -3 == -1: + movednum = round(numbers/10,2) + newprefix = _prefix[int(convertexponent + 1)] + formtick = str(movednum) + newprefix + else: + newprefix = _prefix[convertexponent] + if i < 0: + formtick = str(decformtick[:5]) + newprefix + else: + formtick = str(decformtick[:4]) + newprefix + else: + if i < 0: + formtick = str(decformtick[:5]) + _prefix[int(convertexponent)] + else: + formtick = str(decformtick[:4]) + _prefix[int(convertexponent)] + convertprefix.append(int(convertexponent)) + fticks.append(formtick) + + return fticks + +# Create a custom colormap to mimick Polus Plots +def get_cmap(): + + cmap_values = [[1.0,1.0,1.0,1.0]] + cmap_values.extend([[r/255,g/255,b/255,1] for r,g,b in zip(np.arange(0,255,2), + np.arange(153,255+1/128,102/126), + np.arange(34+1/128,0,-34/126))]) + cmap_values.extend([[r/255,g/255,b/255,1] for r,g,b in zip(np.arange(255,136-1/128,-119/127), + np.arange(255,0,-2), + np.arange(0,68+1/128,68/127))]) + cmap = ListedColormap(cmap_values) + + return cmap + +def gen_plot(col1, + col2, + indexdict, + column_names, + bin_stats, + fig, + ax, + data, + typegraph): + """ Generate a heatmap + Generate a heatmap of data for column 1 against column 2. 
+ Inputs: + col1 - the column plotted on the y-axis + col2 - column plotted on the x-axis + indexdict - a dictionary containing the indexes of graphs + column_names - list of column names + bin_stats - a list containing the min,max values of each column + fig - pregenerated figure + ax - pregenerated axis + data - p regenerated heatmap bbox artist + typegraph - specifies whether the data is log scaled or linearly scaled + Outputs: + hmap - A numpy array containing pixels of the heatmap + """ + def keepdecreasing(labeltexts0, decreasefont, bbxtext): + """ This function decreases the size of the labels if its too big """ + labeltexts0.set_fontsize(decreasefont) + bbxtext = labeltexts0.get_window_extent(renderer = fig.canvas.renderer) + decreasefont = decreasefont - 1 + return bbxtext, decreasefont + + def calculateticks(ticks, bin_width, fmin, typegraph): + """ This functio n calculates the tick values for the graphs """ + + if typegraph == "linear": + tick_vals = [t for t in ticks*bin_width+fmin] + if typegraph == "log": + C = 1/np.log(10) + tick_vals = [np.sign(t)*C*(-1+(10**abs(t))) for t in ticks*bin_width+fmin] + return tick_vals + + if col2>col1: + d = np.squeeze(bins[indexdict[col1, col2],:,:]) + r = col1 + c = col2 + elif col2 CHUNK_SIZE) or (bbxtext.y0 < 0 or bbxtext.y1 > (CHUNK_SIZE*.075)): + bbxtext, decreasefont = keepdecreasing(axlabel.texts[0], decreasefont, bbxtext) + + # This is to decrease the size of the title labels if the name is too large (Y AXIS LABEL) + if len(aylabel.texts) == 0: + aylabel.text(0.5, 0.5, "\n".join(wrap(cname_r, 60)), va = 'center', ha = 'center', fontsize = sizefont, rotation = 90, wrap = True) + else: + aylabeltext0 = aylabel.texts[0] + aylabeltext0.set_text("\n".join(wrap(cname_r, 60))) + aylabeltext0.set_fontsize(sizefont) + + bbytext = (aylabel.texts[0]).get_window_extent(renderer = fig.canvas.renderer) + decreasefont = sizefont - 1 + while (bbytext.y0 < 0 or bbytext.y1 > CHUNK_SIZE) or (bbytext.x0 < 0 or bbytext.x1 > (CHUNK_SIZE*.075)): + bbytext, decreasefont = keepdecreasing(aylabel.texts[0], decreasefont, bbytext) + + while len(ax.lines) > 0: + ax.lines[-1].remove() + + # Calculating the value of each tick in the graph (fixed width) + fmin_c = bin_stats['min'][cname_c] + fmax_c = bin_stats['max'][cname_c] + binwidth_c = bin_stats['binwidth'][cname_c] + tick_vals_c= calculateticks(ax.get_xticks(), binwidth_c, fmin_c, typegraph) + if fmin_c < 0: # draw x=0 + ax.axvline(x=abs(fmin_c)/binwidth_c) + ax.set_xticklabels(format_ticks(tick_vals_c), rotation=45, fontsize = 5, ha='right') + + # Calculating the value of each tick in the graph (fixed width) + fmin_r = bin_stats['min'][cname_r] + fmax_r = bin_stats['max'][cname_r] + binwidth_r = bin_stats['binwidth'][cname_r] + tick_vals_r = calculateticks(ax.get_yticks(), binwidth_r, fmin_r, typegraph) + if fmin_r < 0: # draw y=0 + ax.axhline(y=abs(fmin_r)/binwidth_r) + ax.set_yticklabels(format_ticks(tick_vals_r), fontsize=5, ha='right') + + fig.canvas.draw() + hmap = np.array(fig.canvas.renderer.buffer_rgba()) + + return hmap + +def get_default_fig(cmap): + """ Generate a default figure, axis, and heatmap artist + Generate a figure and draw an empty graph with useful settings for repeated + drawing of new figures. By passing the existing figure, axis, and heatmap + artist to the plot generator, many things do not need to be drawn from + scratch. This decreases the plot drawing time by a factor of 2-3 times. 
+ Inputs: + cmap - the heatmap colormap + Outputs: + fig - A reference to the figure object + ax - A reference to the axis object + data - A reference to the heatmap artist + """ + fig, ax = plt.subplots(dpi=int(CHUNK_SIZE/4),figsize=(4,4),tight_layout={'h_pad':1,'w_pad':1}) + datacolor = ax.pcolorfast(np.zeros((bincount, bincount),np.uint64),cmap=cmap) + ticks = [t for t in range(0, bincount+1, int(bincount/(10)))] + + ax.set_xlim(0,bincount) + ax.set_ylim(0,bincount) + ax.set_xticks(ticks) + ax.set_yticks(ticks) + ax.set_xlabel(" ") + ax.set_ylabel(" ") + + ax.set_xticklabels(ticks, rotation = 45) + ax.set_yticklabels(ticks) + + fig.canvas.draw() + + axlabel = fig.add_axes([.075, 0, 1, .075], frameon = False, alpha = .5, facecolor = 'b') + axlabel.set_xticks([]) + axlabel.set_yticks([]) + axlabel.set_clip_on(True) + aylabel = fig.add_axes([0, .075, .075, 1], frameon = False, alpha = .5, facecolor = 'b') + aylabel.set_xticks([]) + aylabel.set_yticks([]) + aylabel.set_clip_on(True) + + return fig, ax, datacolor + +""" 3. Pyramid generation functions """ + +def _avg2(image): + """ Average pixels with optical field of 2x2 and stride 2 """ + + # Convert 32-bit pixels to prevent overflow during averaging + image = image.astype(np.uint32) + imageshape0 = image.shape[0] + imageshape1 = image.shape[1] + # Get the height and width of each image to the nearest even number + y_max = imageshape0 - imageshape0 % 2 + x_max = imageshape1 - imageshape1 % 2 + + # Perform averaging + avg_img = np.zeros(np.ceil([image.shape[0]/2,image.shape[1]/2,image.shape[2]]).astype(np.uint32)) + for z in range(4): + avg_img[0:int(y_max/2),0:int(x_max/2),z]= (image[0:y_max-1:2,0:x_max-1:2,z] + \ + image[1:y_max:2,0:x_max-1:2,z] + \ + image[0:y_max-1:2,1:x_max:2,z] + \ + image[1:y_max:2,1:x_max:2,z]) / 4 + + # The next if statements handle edge cases if the height or width of the + # image has an odd number of pixels + if y_max != imageshape0: + for z in range(3): + avg_img[-1,:int(x_max/2),z] = (image[-1,0:x_max-1:2,z] + \ + image[-1,1:x_max:2,z]) / 2 + if x_max != imageshape1: + for z in range(4): + avg_img[:int(y_max/2),-1,z] = (image[0:y_max-1:2,-1,z] + \ + image[1:y_max:2,-1,z]) / 2 + if y_max != imageshape0 and x_max != imageshape1: + for z in range(4): + avg_img[-1,-1,z] = image[-1,-1,z] + return avg_img + +def metadata_to_graph_info(outPath,outFile, ngraphs): + + # Create an output path object for the info file + op = Path(outPath).joinpath("{}.dzi".format(outFile)) + + # create an output path for the images + of = Path(outPath).joinpath('{}_files'.format(outFile)) + of.mkdir(exist_ok=True) + + # Get metadata info from the bfio reader + rows = np.ceil(np.sqrt(ngraphs)) + cols = np.round(np.sqrt(ngraphs)) + sizes = [cols*CHUNK_SIZE,rows*CHUNK_SIZE] + + # Calculate the number of pyramid levels + num_scales = np.ceil(np.log2(rows*CHUNK_SIZE)).astype(np.uint8) + + # create a scales template, use the full resolution + scales = { + "size":sizes, + "key": num_scales + } + + # initialize the json dictionary + info = { + "scales": [scales], # Will build scales belows + "rows": rows, + "cols": cols + } + + # create the information for each scale + for i in range(1,num_scales+1): + previous_scale = info['scales'][-1] + current_scale = copy.deepcopy(previous_scale) + current_scale['key'] = str(num_scales - i) + current_scale['size'] = [int(np.ceil(previous_scale['size'][0]/2)),int(np.ceil(previous_scale['size'][1]/2))] + info['scales'].append(current_scale) + + # write the dzi file + with open(op,'w') as writer: + 
writer.write(DZI.format(int(info['cols']*CHUNK_SIZE),int(info['rows']*CHUNK_SIZE))) + + return info + + +def _get_higher_res(S,info,cnames, outpath,out_file,indexscale,indexdict,binstats, typegraph, X=None,Y=None): + """ + The following function builds the image pyramid at scale S by building up only + the necessary information at high resolution layers of the pyramid. So, if 0 is + the original resolution of the image, getting a tile at scale 2 will generate + only the necessary information at layers 0 and 1 to create the desired tile at + layer 2. This function is recursive and can be parallelized. + Inputs: + S - current scale + info - dictionary of scale information + outpath - directory for all outputs + out_file - directory for current dataset + indexscale - index of the graph + binstats - stats for the binned data + typegraph - specifies whether the data is linear or logarithmically scaled + Outputs: + DeepZoom format of images. + """ + + # Get the scale info + num_scales = len(info['scales']) + scale_info = info['scales'][num_scales-S-1] + + if scale_info==None: + raise ValueError("No scale information for resolution {}.".format(S)) + if X == None: + X = [0,scale_info['size'][0]] + if Y == None: + Y = [0,scale_info['size'][1]] + + # Modify upper bound to stay within resolution dimensions + if X[1] > scale_info['size'][0]: + X[1] = scale_info['size'][0] + if Y[1] > scale_info['size'][1]: + Y[1] = scale_info['size'][1] + + # Initialize the output + image = np.zeros((int(Y[1]-Y[0]),int(X[1]-X[0]),4),dtype=np.uint8) + + # If requesting from the lowest scale, then just generate the graph + if S==num_scales-1: + index = int((int(Y[0]/CHUNK_SIZE) + int(X[0]/CHUNK_SIZE) * info['rows'])) + if index>=len(indexscale): + image = np.ones((CHUNK_SIZE,CHUNK_SIZE,4),dtype=np.uint8) * (bincount + 55) + else: + image = gen_plot(col1=indexscale[index][0], + col2=indexscale[index][1], + indexdict=indexdict, + column_names=cnames, + bin_stats=binstats, + fig=fig, + ax=ax, + data=datacolor, + typegraph=typegraph) + + else: + # Set the subgrid dimensions + subgrid_dimX = list(np.arange(2*X[0], 2*X[1], CHUNK_SIZE).astype('int')) + subgrid_dimX.append(2*X[1]) + subgrid_dimY = list(np.arange(2*Y[0], 2*Y[1], CHUNK_SIZE).astype('int')) + subgrid_dimY.append(2*Y[1]) + + + for y in range(0,len(subgrid_dimY)-1): + subgrid_Y_ind0 = np.ceil((subgrid_dimY[y] - subgrid_dimY[0])/2).astype('int') + subgrid_Y_ind1 = np.ceil((subgrid_dimY[y+1] - subgrid_dimY[0])/2).astype('int') + for x in range(0,len(subgrid_dimX)-1): + subgrid_X_ind0 = np.ceil((subgrid_dimX[x] - subgrid_dimX[0])/2).astype('int') + subgrid_X_ind1 = np.ceil((subgrid_dimX[x+1] - subgrid_dimX[0])/2).astype('int') + if S==(num_scales - 6): #to use multiple processors to compute faster. 
+ sub_image = _get_higher_res_par(S=S+1, + info=info, + cnames=cnames, + outpath=outpath, + out_file=out_file, + indexscale=indexscale, + indexdict=indexdict, + binstats=binstats, + typegraph=typegraph, + X=subgrid_dimX[x:x+2], + Y=subgrid_dimY[y:y+2]) + else: + sub_image = _get_higher_res(S=S+1, + info=info, + cnames=cnames, + outpath=outpath, + out_file=out_file, + indexscale=indexscale, + indexdict=indexdict, + binstats=binstats, + typegraph=typegraph, + X=subgrid_dimX[x:x+2], + Y=subgrid_dimY[y:y+2]) + + image[subgrid_Y_ind0:subgrid_Y_ind1, subgrid_X_ind0:subgrid_X_ind1,:] = _avg2(sub_image) + del sub_image + + # Write the chunk + outpath = Path(outpath).joinpath('{}_files'.format(out_file),str(S)) + outpath.mkdir(exist_ok=True) + imageio.imwrite(outpath.joinpath('{}_{}.png'.format(int(X[0]/CHUNK_SIZE),int(Y[0]/CHUNK_SIZE))),image,format='PNG-FI',compression=1) + logger.info('Finished building tile (scale,X,Y): ({},{},{})'.format(S,int(X[0]/CHUNK_SIZE),int(Y[0]/CHUNK_SIZE))) + return image + +# This function performs the same operation as _get_highe_res, except it uses multiprocessing to grab higher +# resolution layers at a specific layer. +def _get_higher_res_par(S,info, cnames, outpath,out_file,indexscale, indexdict, binstats, typegraph, X=None,Y=None): + # Get the scale info + num_scales = len(info['scales']) + scale_info = info['scales'][num_scales-S-1] + + if scale_info==None: + ValueError("No scale information for resolution {}.".format(S)) + + if X == None: + X = [0,scale_info['size'][0]] + if Y == None: + Y = [0,scale_info['size'][1]] + + # Modify upper bound to stay within resolution dimensions + if X[1] > scale_info['size'][0]: + X[1] = scale_info['size'][0] + if Y[1] > scale_info['size'][1]: + Y[1] = scale_info['size'][1] + + # Initialize the output + image = np.zeros((Y[1]-Y[0],X[1]-X[0],4),dtype=np.uint8) + # If requesting from the lowest scale, then just generate the graph + if S==int(info['scales'][0]['key']): + index = (int(Y[0]/CHUNK_SIZE) + int(X[0]/CHUNK_SIZE) * info['rows']) + if index>=len(indexscale): + image = np.ones((CHUNK_SIZE,CHUNK_SIZE,4),dtype=np.uint8) * (bincount + 55) + else: + image = gen_plot(col1=indexscale[index][0], + col2=indexscale[index][1], + indexdict=indexdict, + column_names=cnames, + bin_stats=binstats, + fig=fig, + ax=ax, + data=datacolor, + typegraph=typegraph) + + else: + # Set the subgrid dimensions + subgrid_dimX = list(np.arange(2*X[0], 2*X[1], CHUNK_SIZE).astype('int')) + subgrid_dimX.append(2*X[1]) + subgrid_dimY = list(np.arange(2*Y[0], 2*Y[1], CHUNK_SIZE).astype('int')) + subgrid_dimY.append(2*Y[1]) + + subgrid_images = [] + + with Pool(processes=np.min(4,initial=multiprocessing.cpu_count())) as pool: + for y in range(0,len(subgrid_dimY)-1): + subgrid_Y_ind0 = np.ceil((subgrid_dimY[y] - subgrid_dimY[0])/2).astype('int') + subgrid_Y_ind1 = np.ceil((subgrid_dimY[y+1] - subgrid_dimY[0])/2).astype('int') + for x in range(0,len(subgrid_dimX)-1): + subgrid_X_ind0 = np.ceil((subgrid_dimX[x] - subgrid_dimX[0])/2).astype('int') + subgrid_X_ind1 = np.ceil((subgrid_dimX[x+1] - subgrid_dimX[0])/2).astype('int') + subgrid_images.append(pool.apply_async(_get_higher_res,(S+1, + info, + cnames, + outpath, + out_file, + indexscale, + indexdict, + binstats, + typegraph, + subgrid_dimX[x:x+2], + subgrid_dimY[y:y+2]))) + image[subgrid_Y_ind0:subgrid_Y_ind1,subgrid_X_ind0:subgrid_X_ind1,:] = _avg2((subgrid_images[y*(len(subgrid_dimX)-1) + x]).get()) + + del subgrid_images + + # Write the chunk + outpath = 
Path(outpath).joinpath('{}_files'.format(out_file),str(S)) + outpath.mkdir(exist_ok=True) + imageio.imwrite(outpath.joinpath('{}_{}.png'.format(int(X[0]/CHUNK_SIZE),int(Y[0]/CHUNK_SIZE))),image,format='PNG-FI',compression=1) + logger.info('Finished building tile (scale,X,Y): ({},{},{})'.format(S,int(X[0]/CHUNK_SIZE),int(Y[0]/CHUNK_SIZE))) + return image + +def write_csv(cnames,index,f_info,out_path,out_file): + """ This function writes the csv file necessary for the Deep Zoom format """ + + header = 'dataset_id, x_axis_id, y_axis_id, x_axis_name, y_axis_name, title, length, width, global_row, global_col\n' + line = '{:d}, {:d}, {:d}, {:s}, {:s}, default title, {:d}, {:d}, {:d}, {:d}\n' + l_ind = 0 + with open(str(Path(out_path).joinpath(out_file+'.csv').absolute()),'w') as writer: + writer.write(header) + for ind in index: + ind1 = ind[1] + ind0 = ind[0] + writer.write(line.format(1, + cnames[ind1][1], + cnames[ind0][1], + cnames[ind1][0], + cnames[ind0][0], + CHUNK_SIZE, + CHUNK_SIZE, + int(np.mod(l_ind,f_info['rows'])), + int(l_ind/f_info['rows']))) + l_ind += 1 + +if __name__=="__main__": + + + """ Initialize argument parser """ + logger.info("Parsing arguments...") + parser = argparse.ArgumentParser(prog='main', description='Build an image pyramid from data in a csv file.') + + """ Define the arguments """ + parser.add_argument('--inpDir', dest='inpDir', type=str, + help='Path to input images.', required=True) + + parser.add_argument('--outDir', dest='outDir', type=str, + help='Path to output images.', required=True) + + parser.add_argument('--bincount', dest='bin_count', type=int, + help='Number of bins', required=True) + + parser.add_argument('--scale', dest='scale', type=str, + help='Linear, Log, or Both', required=False) + + """ Get the input arguments """ + args = parser.parse_args() + + input_path = args.inpDir + output_path = Path(args.outDir) + bincount = args.bin_count + scales = [args.scale.lower()] + all_scales = ['linear','log'] + if scales[0] not in all_scales: + scales = all_scales + + logger.info('inpDir = {}'.format(input_path)) + logger.info('outDir = {}'.format(output_path)) + + # Set up the logger for each scale + loggers = {} + for scale in scales: + loggers[scale] = logging.getLogger("main.{}".format(scale.upper())) + loggers[scale].setLevel(logging.INFO) + + # Get the path to each csv file in the collection + input_files = [str(f.absolute()) for f in Path(input_path).iterdir() if ''.join(f.suffixes)=='.csv'] + + # Generate the default figure components + logger.info('Generating colormap and default figure...') + cmap = get_cmap() + fig, ax, datacolor = get_default_fig(cmap) + logger.info('Done!') + + for f in input_files: + + logger.info('Loading csv: {}'.format(f)) + data, cnames = load_csv(f) + column_names = [c[0] for c in cnames] + + for scale in scales: + + # Set the file path folder + folder_name = Path(f).name.replace('.csv','_{}'.format(scale)) + + # Process for current scale + loggers[scale].info('Processing: {}'.format(folder_name)) + + # Bin the data + loggers[scale].info('Binning data for {} {} features...'.format(len(column_names),scale.upper())) + bins, bin_stats, data_index, data_dict = transform_data(data,column_names, scale) + + # Generate the dzi file + loggers[scale].info('Generating pyramid {} metadata...'.format(scale.upper())) + ngraphs = len(data_index) + info_data = metadata_to_graph_info(output_path,folder_name, ngraphs) + loggers[scale].info('Done!') + + loggers[scale].info('Writing {} layout file...!'.format(scale.upper())) + 
write_csv(cnames,data_index,info_data,output_path,folder_name) + loggers[scale].info('Done!') + + # Create the pyramid + loggers[scale].info('Building {} pyramids...'.format(scale.upper())) + image_data = _get_higher_res(0, info_data,column_names, output_path,folder_name,data_index, data_dict, bin_stats, scale) + loggers[scale].info('Done!') diff --git a/visualization/polus-graph-pyramid-builder-plugin/src/requirements.txt b/visualization/polus-graph-pyramid-builder-plugin/src/requirements.txt new file mode 100644 index 0000000..da4cf76 --- /dev/null +++ b/visualization/polus-graph-pyramid-builder-plugin/src/requirements.txt @@ -0,0 +1,4 @@ +pandas>=0.25.1 +matplotlib>=3.1.1 +numpy>=1.21.0 +imageio==2.5.0
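For reference, the log option of the pyramid builder rescales values with the bi-symmetric log transform cited in `transform_data` (https://iopscience.iop.org/article/10.1088/0957-0233/24/2/027001): y = sign(x) * log10(1 + |x|/C) with C = 1/ln(10), which is approximately linear near zero and logarithmic for large |x|; `calculateticks` applies the inverse when labelling axes. A standalone sketch of the forward/inverse pair (illustration only, not plugin code):

```python
import numpy as np

C = 1 / np.log(10)  # chosen so the transform has slope ~1 at the origin


def bisymlog(x: np.ndarray) -> np.ndarray:
    """Forward transform applied to the data before binning on the log scale."""
    return np.sign(x) * np.log10(1 + np.abs(x) / C)


def bisymlog_inv(y: np.ndarray) -> np.ndarray:
    """Inverse transform, used when converting bin positions back to tick values."""
    return np.sign(y) * C * (10 ** np.abs(y) - 1)


x = np.array([-1000.0, -1.0, 0.0, 1.0, 1000.0])
assert np.allclose(bisymlog_inv(bisymlog(x)), x)  # round-trips up to float error
```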