diff --git a/docs/conf.py b/docs/conf.py index f8893ee..b28bab8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -81,3 +81,11 @@ def setup(app): examples_path.mkdir(exist_ok=True) shutil.copy("../examples/GettingStarted.ipynb", "_examples/GettingStarted.ipynb") +shutil.copy("../examples/ConvertingData.ipynb", "_examples/ConvertingData.ipynb") +shutil.copy("../examples/VisualiseData.ipynb", "_examples/VisualiseData.ipynb") +shutil.copy("../examples/DoseMetrics.ipynb", "_examples/DoseMetrics.ipynb") +shutil.copy("../examples/Radiomics.ipynb", "_examples/Radiomics.ipynb") +shutil.copy( + "../examples/DatasetPreparation.ipynb", "_examples/DatasetPreparation.ipynb" +) +shutil.copy("../examples/WorkingWithData.ipynb", "_examples/WorkingWithData.ipynb") diff --git a/docs/index.rst b/docs/index.rst index 44f2c7e..04dbc30 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,6 +6,18 @@ :hidden: _examples/GettingStarted + _examples/ConvertingData + _examples/VisualiseData + _examples/Radiomics + _examples/DoseMetrics + _examples/DatasetPreparation + +.. toctree:: + :caption: Guides + :maxdepth: 2 + :hidden: + + _examples/WorkingWithData .. toctree:: :caption: Developers diff --git a/examples/ConvertingData.ipynb b/examples/ConvertingData.ipynb new file mode 100644 index 0000000..1b3513c --- /dev/null +++ b/examples/ConvertingData.ipynb @@ -0,0 +1,233 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Converting Data\n", + "\n", + "In this example, the preprocessing and conversion of DICOM data is demonstrated. These are\n", + "essential first steps before data can be analysed using PyDicer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " from pydicer import PyDicer\n", + "except ImportError:\n", + " !pip install pydicer\n", + " from pydicer import PyDicer\n", + "\n", + "from pathlib import Path\n", + "\n", + "from pydicer.input.test import TestInput" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup PyDicer\n", + "\n", + "As in the `Getting Started` example, we must first define a working directory for our dataset. We\n", + "also create a `PyDicer` object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "directory = Path(\"./working\")\n", + "pydicer = PyDicer(directory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch some data\n", + "\n", + "A TestInput class is provided in pydicer to download some sample data to work with. Several other\n", + "input classes exist if you'd like to retrieve DICOM data for conversion from somewhere else. See \n", + "the [docs for information](https://australiancancerdatanetwork.github.io/pydicer/html/input.html)\n", + "on how the PyDicer input classes work.\n", + "\n", + "Most commonly, if you have DICOM files stored within a folder on your file system you can simply\n", + "pass the path to your DICOM directory to the `pydicer.add_input()` function." 
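To make that last point concrete, here is a minimal sketch of pointing PyDicer at an existing folder of DICOM files instead of using `TestInput`; the `./my_dicom_files` path is hypothetical and stands in for wherever your own DICOM data lives:

```python
from pathlib import Path

from pydicer import PyDicer

# Initialise PyDicer with a working directory
pydicer = PyDicer(Path("./working"))

# add_input accepts a str or pathlib.Path pointing at a directory of DICOM files
pydicer.add_input("./my_dicom_files")  # hypothetical path to your own data
```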
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dicom_directory = directory.joinpath(\"dicom\")\n", + "test_input = TestInput(dicom_directory)\n", + "test_input.fetch_data()\n", + "\n", + "# Add the input DICOM location to the pydicer object\n", + "pydicer.add_input(dicom_directory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocess\n", + "\n", + "With some DICOM data ready to work with, we must first use the PyDicer `preprocess` module. This\n", + "module will crawl over all available DICOM data and index all information required for\n", + "conversion of the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydicer.preprocess()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect Preprocessed Data\n", + "\n", + "Here we load the data that was indexed during preprocessing and output the first rows. This data\n", + "will be used by the following step of data conversion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_preprocessed = pydicer.read_preprocessed_data()\n", + "df_preprocessed.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convert Data\n", + "\n", + "With the DICOM data having been indexed during preprocessing, we are now ready to convert this data\n", + "into NIfTI format, which will be stored within the PyDicer standard directory structure.\n", + "\n", + "Running the following cell will begin the conversion process. While this cell is running, take a\n", + "look inside the `working/data` directory to see how the converted data is being stored.\n", + "\n", + "Notice the `converted.csv` file stored for each patient. This tracks each converted data object.\n", + "This will be loaded as a Pandas DataFrame for use throughout PyDicer.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydicer.convert.convert()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Converted DataFrame\n", + "\n", + "Once data is converted, we can load a Pandas DataFrame which contains a description of each\n", + "converted object.\n", + "\n", + "The most useful columns in the DataFrame for working with this data in PyDicer are:\n", + "- `hashed_uid`: This is a 6-character hexadecimal hash of the associated DICOM SeriesInstanceUID.\n", + " PyDicer refers to objects using this hashed identifier for a more concise representation.\n", + "- `modality`: The modality of the data object.\n", + "- `patient_id`: The ID of the patient this data object belongs to.\n", + "- `path`: The path within the working directory where files for this data object are stored." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pydicer.read_converted_data()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Quarantine\n", + "\n", + "If anything goes wrong while converting a DICOM object during either the preprocess step or the\n", + "conversion step, the problematic DICOM data will be copied to the `working/quarantine` directory.\n", + "\n", + "It's a good idea to regularly check your quarantine directory to ensure that no critical data\n", + "objects are being quarantined. 
If any are, you may want to consider rectifying the issue and running the\n", + "preprocess and conversion steps again.\n", + "\n", + "As can be seen by running the cell below, several DICOM objects were moved to the quarantine\n", + "for our test dataset. This was due to there being multiple slices at the same location with\n", + "differing pixel data in one CT image series." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_quarantine = pydicer.read_quarantined_data()\n", + "df_quarantine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "814af119db7f8f2860617be3dcd1d37c560587d11c65bd58c45b1679d3ee6ea4" + }, + "kernelspec": { + "display_name": "Python 3.8.0 64-bit ('pydicer': pyenv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/DatasetPreparation.ipynb b/examples/DatasetPreparation.ipynb new file mode 100644 index 0000000..7cc81c7 --- /dev/null +++ b/examples/DatasetPreparation.ipynb @@ -0,0 +1,404 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset Preparation\n", + "\n", + "When working with real-world DICOM datasets, you will often need to tackle the task of cleaning the\n", + "dataset. Often you will have several image series, structure sets and even dose grids for each\n", + "patient. However, you typically want to select one relevant DICOM object in each category.\n", + "\n", + "To help solve this, PyDicer provides a `dataset preparation` module which can be used to extract\n", + "a subset of data from your overall set. Two example use cases where this might be useful are:\n", + "\n", + "**Analysing dose to structures for a radiotherapy treatment**\n", + "You will want to extract the dose grid which was calculated from the plan used to treat the\n", + "patient, as well as the linked structure set and planning CT image.\n", + "\n", + "**Validating an Auto-segmentation tool**\n", + "A structure set may have been prepared for the purposes of validation and saved off with a specific\n", + "`SeriesDescription`. You select the latest structure set with that description as well as the\n", + "linked image series to perform the auto-segmentation validation.\n", + "\n", + "As you will see in the examples below, you can provide your own logic to extract subsets of data\n", + "using PyDicer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " from pydicer import PyDicer\n", + "except ImportError:\n", + " !pip install pydicer\n", + " from pydicer import PyDicer\n", + "\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "\n", + "from pydicer.utils import fetch_converted_test_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup PyDicer\n", + "\n", + "As in some other examples, we will use the HNSCC data which has been pre-prepared and is\n", + "downloaded into the `testdata_hnscc` directory. We also set up our `PyDicer` object."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "working_directory = fetch_converted_test_data(\"./testdata_hnscc\", dataset=\"HNSCC\")\n", + "\n", + "pydicer = PyDicer(working_directory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore data\n", + "\n", + "When we use the `read_converted_data` function, by default it will return all data which has been\n", + "converted and is stored in the `testdata_hnscc/data` directory.\n", + "\n", + "Let's use this function and output the entire DataFrame of converted data to see what we have\n", + "available in this dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pydicer.read_converted_data()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare dose data\n", + "\n", + "Here we use the `dataset preparation` module to extract the latest dose grid by date. We refer to\n", + "this subset of data as `dose_project`.\n", + "\n", + "We use the built-in data extraction function named [`rt_latest_dose`](https://australiancancerdatanetwork.github.io/pydicer/dataset.html#pydicer.dataset.functions.rt_latest_dose)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dose_project_name = \"dose_project\"\n", + "pydicer.dataset.prepare(dose_project_name, \"rt_latest_dose\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the cell above has finished running, the dataset has been prepared. You can explore the\n", + "dataset in the `testdata_hnscc/dose_project` directory. Take notice of two things:\n", + "- The `converted.csv` file stored for each patient now only includes the data objects which have\n", + " been selected as part of this subset of data.\n", + "- The data object folders are not actual folders, but symbolic links to the original data found in\n", + " the `testdata_hnscc/data` directory. This way, data isn't duplicated but the folder structure\n", + " remains easy to navigate.\n", + "\n", + "> Note: Symbolic links are supported on Unix-based (Linux, macOS) operating systems only. These\n", + "> won't work on Windows; however, you can still use the prepared dataset, which is tracked in the\n", + "> converted CSV files." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load prepared Dataset\n", + "\n", + "By supplying the `dataset_name` to the `read_converted_data` function, we obtain a DataFrame\n", + "containing only the data objects which are part of that subset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_dose_project = pydicer.read_converted_data(dataset_name=dose_project_name)\n", + "df_dose_project" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that we now only have one of each data object modality in our `dose_project` subset. We are\n", + "now ready to work with that subset (e.g. extract dose metrics)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Structure Dataset\n", + "\n", + "In the next example, we only want to extract structure sets and their associated images. 
This\n", + "might be useful when training or validating an auto-segmentation model.\n", + "\n", + "In this example, we not only select the latest structure set by date, but we also specify the\n", + "`StudyDescription` values of the DICOM metadata of the data objects we want to select. To achieve\n", + "this, we use the built-in [`rt_latest_struct`](https://australiancancerdatanetwork.github.io/pydicer/dataset.html#pydicer.dataset.functions.rt_latest_struct) function.\n", + "\n", + "Observe the output of the following cell and explore the `testdata_hnscc` directory. We now have\n", + "one structure set and the linked image for each patient." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the dataset name and the study description values to match\n", + "structure_project_name = \"structure_project\"\n", + "study_descriptions = [\n", + " \"RT SIMULATION\"\n", + "]\n", + "\n", + "\n", + "# Prepare the subset of data\n", + "pydicer.dataset.prepare(\n", + " structure_project_name,\n", + " \"rt_latest_struct\",\n", + " StudyDescription=study_descriptions\n", + ")\n", + "\n", + "# Load the data subset and display the DataFrame\n", + "df_structure_project = pydicer.read_converted_data(dataset_name=structure_project_name)\n", + "df_structure_project" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Dataset from DataFrame\n", + "\n", + "In some scenarios, you may want to simply perform some filtering on the DataFrame returned by the\n", + "`read_converted_data` function and generate a subset of data based on that.\n", + "\n", + "In the following cell, a subset of data named `image_project` is generated by filtering the\n", + "DataFrame to keep only `CT` images.\n", + "\n", + "After running the following cell, explore the `testdata_hnscc/image_project` directory to confirm\n", + "that only image objects were selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read the converted DataFrame and filter only CT images\n", + "df = pydicer.read_converted_data()\n", + "df_ct = df[df.modality==\"CT\"]\n", + "\n", + "# Prepare a data subset using this filtered DataFrame\n", + "image_project_name = \"image_project\"\n", + "pydicer.dataset.prepare_from_dataframe(image_project_name, df_ct)\n", + "\n", + "# Load the data subset and display the DataFrame\n", + "df_image_project = pydicer.read_converted_data(dataset_name=image_project_name)\n", + "df_image_project\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Custom Preparation Function\n", + "\n", + "In more complex use cases you may want to define your own logic for extracting data objects into a\n", + "subset. For example, you may have an additional DataFrame containing treatment start dates of\n", + "patients, and you would like to select the dose grid, structure set and image series which are\n", + "closest to that date.\n", + "\n", + "In the following cell, we prepare a `clinical_project` subset of data. We create a dummy set of\n", + "clinical tabular data `df_clinical`. This stores each patient's stage and RT start date.\n", + "\n", + "We use the information in `df_clinical` to select patients who are stage 1-3 along with the data\n", + "objects where the dose grid date is nearest to their treatment start date."
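Before working through the full clinical example below, it may help to see the minimal contract a custom preparation function must satisfy: it receives the converted DataFrame and returns the filtered rows to keep. A short sketch of that contract (the function name and the CT-only filter are illustrative, not part of the example dataset):

```python
def keep_ct_only(df_data):
    # Receive the converted DataFrame, return only the rows to keep in the subset
    return df_data[df_data.modality == "CT"]

# Any extra keyword arguments passed to prepare() are forwarded to the function
pydicer.dataset.prepare("ct_only_project", keep_ct_only)
```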
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define some dummy clinical data\n", + "df_clinical = pd.DataFrame([\n", + " {\n", + " \"patient_id\": \"HNSCC-01-0199\",\n", + " \"stage\": 2,\n", + " \"rt_start_date\": \"2002-10-28\",\n", + " },\n", + " {\n", + " \"patient_id\": \"HNSCC-01-0176\",\n", + " \"stage\": 1,\n", + " \"rt_start_date\": \"2009-03-02\",\n", + " },\n", + " {\n", + " \"patient_id\": \"HNSCC-01-0019\",\n", + " \"stage\": 4,\n", + " \"rt_start_date\": \"1998-07-10\",\n", + " },\n", + "])\n", + "\n", + "# Convert date to a datetime object\n", + "df_clinical['rt_start_date'] = pd.to_datetime(df_clinical['rt_start_date'], format='%Y-%m-%d')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import some pydicer utility functions that we'll need\n", + "from pydicer.utils import load_object_metadata, determine_dcm_datetime\n", + "\n", + "# Define a function which accepts the converted DataFrame as input and returns a filtered DataFrame\n", + "# of objects to keep in the data subset. This function also takes the clinical DataFrame as input.\n", + "def extract_clinical_data(df_data, df_clinical):\n", + "\n", + " # Merge the clinical data with our data objects\n", + " df = pd.merge(df_data, df_clinical, on=\"patient_id\", how=\"outer\")\n", + "\n", + " # Filter out patients who aren't stage 1-3\n", + " df = df[(df.stage >= 1) & (df.stage <= 3)]\n", + "\n", + " # Determine the date of each data object\n", + " df[\"obj_date\"] = df.apply(lambda row: determine_dcm_datetime(load_object_metadata(row)), axis=1)\n", + "\n", + " # List to track row indices we will keep\n", + " keep_rows = []\n", + "\n", + " # Sort the data objects by date in descending order, so we can select the first (latest)\n", + " # dose grid and link the structure set and image series to use for the data subset.\n", + " df = df.sort_values(\"obj_date\", ascending=False)\n", + "\n", + " # Loop the data by patient to select the data objects\n", + " for patient_id, df_pat in df.groupby(\"patient_id\"):\n", + "\n", + " df_doses = df_pat[df_pat.modality==\"RTDOSE\"]\n", + "\n", + " # If there is no dose grid, we skip this patient\n", + " if len(df_doses) == 0:\n", + " continue\n", + "\n", + " # Otherwise, we select the first dose grid (which is the latest since they are sorted) \n", + " # to keep\n", + " dose_row = df_doses.iloc[0]\n", + "\n", + " df_linked_structs = pydicer.get_structures_linked_to_dose(dose_row)\n", + "\n", + " # Skip patient if no linked structure sets are found\n", + " if len(df_linked_structs) == 0:\n", + " continue\n", + "\n", + " # Finally, find the image linked to the structure set\n", + " struct_row = df_linked_structs.iloc[0]\n", + "\n", + " df_linked_images = df[df.sop_instance_uid==struct_row.referenced_sop_instance_uid]\n", + "\n", + " # Skip if no images found\n", + " if len(df_linked_images) == 0:\n", + " continue\n", + "\n", + " image_row = df_linked_images.iloc[0]\n", + "\n", + " # Store the indices of these data objects\n", + " keep_rows.append(image_row.name)\n", + " keep_rows.append(struct_row.name)\n", + " keep_rows.append(dose_row.name)\n", + "\n", + " # Return only the rows of the data objects we want to keep in the data subset\n", + " return df_data.loc[keep_rows]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clinical_project_name = \"clinical_project\"\n", + "\n", + "# Prepare the subset of 
data using our custom function\n", + "pydicer.dataset.prepare(\n", + " clinical_project_name,\n", + " extract_clinical_data,\n", + " df_clinical=df_clinical\n", + ")\n", + "\n", + "# Load the data subset and display the DataFrame\n", + "df_clinical_project = pydicer.read_converted_data(dataset_name=clinical_project_name)\n", + "df_clinical_project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "814af119db7f8f2860617be3dcd1d37c560587d11c65bd58c45b1679d3ee6ea4" + }, + "kernelspec": { + "display_name": "Python 3.8.0 64-bit ('pydicer': pyenv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/DoseMetrics.ipynb b/examples/DoseMetrics.ipynb new file mode 100644 index 0000000..e364024 --- /dev/null +++ b/examples/DoseMetrics.ipynb @@ -0,0 +1,172 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dose Metrics\n", + "\n", + "In this example notebook we will compute Dose Volume Histograms (DVH) for our `RTDOSE` objects\n", + "across structures found in `RTSTRUCT` objects in our dataset. We use\n", + "[HNSCC](https://wiki.cancerimagingarchive.net/display/Public/HNSCC) data from the Cancer Imaging\n", + "Archive which has already been converted using PyDicer for demonstration purposes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " from pydicer import PyDicer\n", + "except ImportError:\n", + " !pip install pydicer\n", + " from pydicer import PyDicer\n", + "\n", + "from pathlib import Path\n", + "\n", + "from pydicer.utils import fetch_converted_test_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch data\n", + "\n", + "HNSCC data prepared for this example are downloaded and stored into a `testdata_hnscc` directory.\n", + "We will use this for our PyDicer working directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "working_directory = fetch_converted_test_data(\"./testdata_hnscc\", dataset=\"HNSCC\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialise PyDicer object\n", + "\n", + "Using the working directory containing the test data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydicer = PyDicer(working_directory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute DVH\n", + "\n", + "Before we can extract dose metrics, we must compute Dose Volume Histograms for all dose objects and\n", + "structure sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydicer.analyse.compute_dvh()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inspect DVH\n", + "\n", + "DVHs computed are stored in the respective dose object directories on the file system. Inspect a\n", + "dose object directory (e.g. `testdata_hnscc/data/HNSCC-01-0019/doses/309e1a`). 
Here you will find\n", + "a `.png` file which plots the DVH for each of the linked structures. In addition, a `.csv` file \n", + "stores the raw DVH values.\n", + "\n", + "The DVHs for this dataset can be loaded into a pandas DataFrame with the\n", + "`get_all_dvhs_for_dataset` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_dvh = pydicer.analyse.get_all_dvhs_for_dataset()\n", + "df_dvh.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute Dose Metrics\n", + "\n", + "The `compute_dose_metrics` function in the `analyse` module can compute **D**, **V** and **Dcc**\n", + "metrics. Specify the points at which to compute those values. For example, the following cell\n", + "computes the **D95**, **D50**, **V5** and **Dcc10**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_dose_metrics = pydicer.analyse.compute_dose_metrics(\n", + " d_point=[95, 50],\n", + " v_point=[5],\n", + " d_cc_point=[10]\n", + ")\n", + "df_dose_metrics.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "814af119db7f8f2860617be3dcd1d37c560587d11c65bd58c45b1679d3ee6ea4" + }, + "kernelspec": { + "display_name": "Python 3.8.0 64-bit ('pydicer': pyenv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/Radiomics.ipynb b/examples/Radiomics.ipynb new file mode 100644 index 0000000..29b121a --- /dev/null +++ b/examples/Radiomics.ipynb @@ -0,0 +1,247 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compute Radiomics\n", + "\n", + "In this example notebook we use [PyRadiomics](https://github.com/AIM-Harvard/pyradiomics) to\n", + "compute various types of radiomics features. We use some\n", + "[LCTSC](https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=24284539) data from the\n", + "Cancer Imaging Archive which has already been converted using PyDicer for demonstration purposes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " from pydicer import PyDicer\n", + "except ImportError:\n", + " !pip install pydicer\n", + " from pydicer import PyDicer\n", + "\n", + "from pathlib import Path\n", + "\n", + "from pydicer.utils import fetch_converted_test_data\n", + "\n", + "from pydicer.utils import load_object_metadata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch data\n", + "\n", + "LCTSC data prepared for this example are downloaded and stored into a `testdata_lctsc` directory.\n", + "We will use this for our PyDicer working directory."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "working_directory = fetch_converted_test_data(\"./testdata_lctsc\", dataset=\"LCTSC\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialise PyDicer object\n", + "\n", + "Using the working directory containing the LCTSC test data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydicer = PyDicer(working_directory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute Default Radiomics\n", + "\n", + "By default, PyDicer will compute only first-order radiomics features. Radiomics are computed for\n", + "each structure available in the dataset using the image data of the images linked to those structures." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydicer.analyse.compute_radiomics()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch computed Radiomics\n", + "\n", + "Use the `get_all_computed_radiomics_for_dataset` function to fetch all radiomics features computed\n", + "in the last step.\n", + "\n", + "The `.head()` function on a Pandas DataFrame outputs the first 5 rows for inspection." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the DataFrame of radiomics computed\n", + "df_radiomics = pydicer.analyse.get_all_computed_radiomics_for_dataset()\n", + "df_radiomics.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specify Radiomics to Compute\n", + "\n", + "PyDicer uses the popular `pyradiomics` library to compute radiomics. So, you may specify any\n", + "radiomics features provided in that library to be computed. See the [pyradiomics documentation for\n", + "a list of radiomics features\n", + "available](https://pyradiomics.readthedocs.io/en/latest/features.html).\n", + "\n", + "In this example, we specify all `shape` features as well as `first-order` features to be computed."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import pyradiomics feature modules\n", + "from radiomics import (\n", + " firstorder,\n", + " shape,\n", + ")\n", + "\n", + "# Prepare a dict of features to compute grouped by class. getFeatureNames returns a dict\n", + "# mapping each feature name to a deprecation flag, so we keep only non-deprecated features.\n", + "first_order_features = firstorder.RadiomicsFirstOrder.getFeatureNames()\n", + "shape_features = shape.RadiomicsShape.getFeatureNames()\n", + "compute_radiomics = {\n", + " \"firstorder\": [f for f in first_order_features if not first_order_features[f]],\n", + " \"shape\": [f for f in shape_features if not shape_features[f]],\n", + "}\n", + "\n", + "# Pass the dict to compute the radiomics\n", + "pydicer.analyse.compute_radiomics(radiomics=compute_radiomics)\n", + "\n", + "# Fetch the computed radiomics and output the first few rows\n", + "df_radiomics = pydicer.analyse.get_all_computed_radiomics_for_dataset()\n", + "df_radiomics.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also set a specific subset of features like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Specify features to compute grouped by class\n", + "compute_radiomics = {\n", + " \"firstorder\": [\"Maximum\", \"Minimum\", \"Mean\", \"Median\"],\n", + " \"shape\": [\"SurfaceArea\", \"VoxelVolume\"],\n", + "}\n", + "\n", + "# Pass the dict to compute the radiomics\n", + "pydicer.analyse.compute_radiomics(radiomics=compute_radiomics)\n", + "\n", + "df_radiomics = pydicer.analyse.get_all_computed_radiomics_for_dataset()\n", + "df_radiomics.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Track Metadata\n", + "\n", + "When analysing your radiomic features, it may be useful to have certain metadata available from\n", + "either the image or structure set. You can specify which DICOM header tags to extract metadata for\n", + "and these will be stored alongside the radiomic feature values.\n", + "\n", + "In the cell below, we recompute our radiomics and store the `PatientSex` header value from the\n", + "image series and the `StudyDate` value from the structure set."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the radiomics specifying the meta data to keep\n", + "pydicer.analyse.compute_radiomics(\n", + " radiomics=compute_radiomics,\n", + " image_meta_data=[\"PatientSex\"],\n", + " structure_meta_data=[\"StudyDate\"]\n", + ")\n", + "\n", + "# Fetch the results and display the first rows\n", + "df_radiomics = pydicer.analyse.get_all_computed_radiomics_for_dataset()\n", + "df_radiomics.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "814af119db7f8f2860617be3dcd1d37c560587d11c65bd58c45b1679d3ee6ea4" + }, + "kernelspec": { + "display_name": "Python 3.8.0 64-bit ('pydicer': pyenv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/VisualiseData.ipynb b/examples/VisualiseData.ipynb new file mode 100644 index 0000000..d139f5f --- /dev/null +++ b/examples/VisualiseData.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualise Data\n", + "\n", + "PyDicer's `visualise` module will produce cross-sections of data objects and store them\n", + "in `.png` format within the data object directory. This is particularly useful for fast inspection\n", + "of the data to ensure that nothing has gone wrong during conversion.\n", + "\n", + "The `visualise` module can be run at any time after conversion. If you are using advanced features\n", + "of PyDicer, such as `auto-segmentation inference` and `object generation`, you can run the\n", + "visualise module following the generation of the new data objects to produce the cross-section\n", + "`.png` files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " from pydicer import PyDicer\n", + "except ImportError:\n", + " !pip install pydicer\n", + " from pydicer import PyDicer\n", + "\n", + "from pathlib import Path\n", + "\n", + "from pydicer.utils import fetch_converted_test_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup PyDicer\n", + "\n", + "HNSCC data prepared for this example are downloaded and stored into a `testdata_hnscc` directory.\n", + "We will use this for our PyDicer working directory. We also initialise our PyDicer object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "working_directory = fetch_converted_test_data(\"./testdata_hnscc\", dataset=\"HNSCC\")\n", + "\n", + "pydicer = PyDicer(working_directory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualise Data\n", + "\n", + "We simply call the `visualise()` function of the `visualise` module to produce the cross-sections.\n", + "\n", + "Inspect some of the data object directories in `testdata_hnscc/data` and look for the `.png`\n", + "cross-sections. The `{hashed_uid}` in files named `vis_{hashed_uid}.png` refers to a UID hash\n", + "linking to the image being visualised. 
Visualisations are produced for:\n", + "- Images\n", + "- RT Structure Sets\n", + "- RT Dose Grids" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydicer.visualise.visualise()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run for a single patient\n", + "\n", + "You can run the visualisation for only a single patient (or list of specific patients) by providing\n", + "the `patient` argument." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydicer.visualise.visualise(patient=\"HNSCC-01-0199\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Avoid Re-generating Visualisations\n", + "\n", + "If you've added more data to your dataset and want to avoid re-generating visualisations, set the\n", + "`force` argument to `False`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pydicer.visualise.visualise(force=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "814af119db7f8f2860617be3dcd1d37c560587d11c65bd58c45b1679d3ee6ea4" + }, + "kernelspec": { + "display_name": "Python 3.8.0 64-bit ('pydicer': pyenv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/WorkingWithData.ipynb b/examples/WorkingWithData.ipynb new file mode 100644 index 0000000..6142459 --- /dev/null +++ b/examples/WorkingWithData.ipynb @@ -0,0 +1,227 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Working with Data\n", + "\n", + "Here we present some useful tips & tricks to help you work with data which has been converted\n", + "using PyDicer. As you will see, working with data in PyDicer is heavily oriented around DataFrames\n", + "provided by the Pandas library. If you aren't familiar with Pandas, we recommend working through \n", + "the [Pandas Getting Started Tutorials](https://pandas.pydata.org/docs/getting_started/index.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " from pydicer import PyDicer\n", + "except ImportError:\n", + " !pip install pydicer\n", + " from pydicer import PyDicer\n", + "\n", + "from pathlib import Path\n", + "\n", + "import SimpleITK as sitk\n", + "\n", + "from pydicer.utils import fetch_converted_test_data\n", + "\n", + "from pydicer.utils import load_object_metadata, determine_dcm_datetime" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup PyDicer\n", + "\n", + "Here we load the LCTSC data which has already been converted. This is downloaded into the\n", + "`testdata_lctsc` directory. We also initialise a `PyDicer` object."
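Since everything in PyDicer is DataFrame-based, standard Pandas idioms apply once the `pydicer` object is set up in the next cell. As a small sketch (using only the documented `patient_id` and `modality` columns), you can summarise what a converted dataset contains like this:

```python
# Count the converted data objects per patient and modality
df = pydicer.read_converted_data()
print(df.groupby(["patient_id", "modality"]).size())
```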
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "working_directory = fetch_converted_test_data(\"./testdata_lctsc\", dataset=\"LCTSC\")\n", + "\n", + "pydicer = PyDicer(working_directory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Converted Data\n", + "\n", + "To obtain a DataFrame of the converted data, use the `read_converted_data` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pydicer.read_converted_data()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Iterating Over Objects\n", + "\n", + "If you want to perform some operation on (for example) all images in your dataset, you can iterate\n", + "over each image row like this. Within each loop we load each image as a `SimpleITK` image (just\n", + "for demonstration purposes)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for idx, ct_row in df[df.modality==\"CT\"].iterrows():\n", + "\n", + " print(f\"Loading image with hashed UID: {ct_row.hashed_uid}...\", end=\"\")\n", + "\n", + " img_path = Path(ct_row.path).joinpath(\"CT.nii.gz\")\n", + " img = sitk.ReadImage(str(img_path))\n", + "\n", + " print(\" Complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading Object Metadata\n", + "\n", + "The metadata from the DICOM headers is stored by PyDicer and can be easily loaded using the\n", + "`load_object_metadata` function. Simply pass a row from the converted DataFrame into this function\n", + "to load the metadata for that object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "first_row = df.iloc[0]\n", + "ds = load_object_metadata(first_row)\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Keep only specific header tags\n", + "\n", + "Loading object metadata can be slow, especially when doing this for many objects at once. So, you\n", + "can specify the `keep_tags` argument if you know which header attributes you want to use. This\n", + "speeds up loading metadata significantly.\n", + "\n", + "Here we load only the `StudyDate`, `PatientSex` and `Manufacturer` tags.\n", + "\n", + "> Tip: These tags are defined by the DICOM standard, and we use `pydicom` to load this metadata. In\n", + "> fact, the metadata returned is a `pydicom` Dataset. Check out the [`pydicom` documentation](https://pydicom.github.io/pydicom/dev/old/pydicom_user_guide.html) for more information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = load_object_metadata(first_row, keep_tags=[\"StudyDate\", \"PatientSex\", \"Manufacturer\"])\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading metadata for all data objects\n", + "\n", + "You can use the Pandas `apply` function to load metadata for all rows and add it as a column to the\n", + "converted DataFrame."
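The next cell shows the basic pattern. One caveat worth noting: a `pydicom` Dataset raises `AttributeError` when an accessed tag is absent, so if some objects might lack the tag you are extracting, a defensive variant of the same pattern (a sketch, assuming `None` is an acceptable fill value) looks like:

```python
df["StudyDescription"] = df.apply(
    lambda row: getattr(
        load_object_metadata(row, keep_tags="StudyDescription"),
        "StudyDescription",
        None,  # fall back to None when the tag is missing
    ),
    axis=1,
)
```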
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"StudyDescription\"] = df.apply(lambda row: load_object_metadata(row, keep_tags=\"StudyDescription\").StudyDescription, axis=1)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Determine Date of Object\n", + "\n", + "There are several DICOM header tags which could define the date of an object. The DICOM standard\n", + "doesn't require all of these to be set within the metadata. PyDicer provides the \n", + "`determine_dcm_datetime` function to extract the date from the DICOM header." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = load_object_metadata(first_row)\n", + "obj_datetime = determine_dcm_datetime(ds)\n", + "print(obj_datetime)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "814af119db7f8f2860617be3dcd1d37c560587d11c65bd58c45b1679d3ee6ea4" + }, + "kernelspec": { + "display_name": "Python 3.8.0 64-bit ('pydicer': pyenv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pydicer/analyse/data.py b/pydicer/analyse/data.py index 5f4e5b0..fb467e3 100644 --- a/pydicer/analyse/data.py +++ b/pydicer/analyse/data.py @@ -549,10 +549,10 @@ def compute_dvh( already exists. Defaults to True. bin_width (float, optional): The bin width of the Dose Volume Histogram. structure_meta_data_cols (list, optional): A list of DICOM tags which will be extracted - from the structure DICOM headers and included in the resulting table of radiomics. + from the structure DICOM headers and included in the resulting table of DVHs. Defaults to None. dose_meta_data_cols (list, optional): A list of DICOM tags which will be extracted from - the Dose DICOM headers and included in the resulting table of radiomics. + the Dose DICOM headers and included in the resulting table of DVHs. Defaults to None. 
Raises: diff --git a/pydicer/convert/data.py b/pydicer/convert/data.py index cf3ae2d..e12be3a 100644 --- a/pydicer/convert/data.py +++ b/pydicer/convert/data.py @@ -16,7 +16,7 @@ from pydicer.convert.rtstruct import convert_rtstruct, write_nrrd_from_mask_directory from pydicer.convert.headers import convert_dicom_headers from pydicer.utils import hash_uid, read_preprocessed_data, get_iterator -from pydicer.quarantine.treat import copy_file_to_quarantine +from pydicer.quarantine import copy_file_to_quarantine from pydicer.constants import ( CONVERTED_DIR_NAME, @@ -117,7 +117,9 @@ def handle_missing_slice(files, ignore_duplicates=False): df_duplicated = df_files[df_files["slice_location"].duplicated()] if len(df_duplicated) > 0: check_slice_location = df_duplicated.iloc[0].slice_location - df_check_duplicates = df_files[df_files["slice_location"] == check_slice_location] + df_check_duplicates = df_files[ + df_files["slice_location"] == check_slice_location + ] pix_array = None for _, row in df_check_duplicates.iterrows(): @@ -146,7 +148,9 @@ def handle_missing_slice(files, ignore_duplicates=False): # check to see if any slice thickness exceed 2% tolerance # this is conservative as missing slices would produce 100% differences - slice_thickness_variations = ~np.isclose(slice_location_diffs, expected_slice_diff, rtol=0.02) + slice_thickness_variations = ~np.isclose( + slice_location_diffs, expected_slice_diff, rtol=0.02 + ) if np.any(slice_thickness_variations): logger.warning("Missing DICOM slices found") @@ -155,7 +159,9 @@ def handle_missing_slice(files, ignore_duplicates=False): missing_indices = np.where(slice_thickness_variations)[0] for missing_index in missing_indices: - num_missing_slices = int(slice_location_diffs[missing_index] / expected_slice_diff) - 1 + num_missing_slices = ( + int(slice_location_diffs[missing_index] / expected_slice_diff) - 1 + ) # locate nearest DICOM files to the missing slices prior_dcm_file = df_files.iloc[missing_index]["file_path"] @@ -179,12 +185,18 @@ def handle_missing_slice(files, ignore_duplicates=False): ) # write a copy to a temporary DICOM file - working_dcm.PixelData = interp_array.astype(prior_dcm.pixel_array.dtype).tobytes() + working_dcm.PixelData = interp_array.astype( + prior_dcm.pixel_array.dtype + ).tobytes() # compute spatial information - image_orientation = np.array(prior_dcm.ImageOrientationPatient, dtype=float) + image_orientation = np.array( + prior_dcm.ImageOrientationPatient, dtype=float + ) - image_plane_normal = np.cross(image_orientation[:3], image_orientation[3:]) + image_plane_normal = np.cross( + image_orientation[:3], image_orientation[3:] + ) image_position_patient = np.array( np.array(prior_dcm.ImagePositionPatient) @@ -236,7 +248,9 @@ def link_via_frame_of_reference(for_uid, df_preprocess): df_linked_series = df_linked_series[df_linked_series.modality.isin(modality_prefs)] df_linked_series.loc[:, "modality"] = df_linked_series.modality.astype("category") - df_linked_series.modality = df_linked_series.modality.cat.set_categories(modality_prefs) + df_linked_series.modality = df_linked_series.modality.cat.set_categories( + modality_prefs + ) df_linked_series.sort_values(["modality"], inplace=True) return df_linked_series @@ -328,7 +342,9 @@ def convert(self, patient=None, force=True): patient_directory = self.output_directory.joinpath(patient_id) - patient_logger = PatientLogger(patient_id, self.output_directory, force=False) + patient_logger = PatientLogger( + patient_id, self.output_directory, force=False + ) # Grab the 
sop_class_uid, modality and for_uid (should be the same for all files in # series) @@ -364,7 +380,9 @@ def convert(self, patient=None, force=True): if config.get_config("interp_missing_slices"): series_files = handle_missing_slice( df_files, - ignore_duplicates=config.get_config("ignore_duplicate_slices"), + ignore_duplicates=config.get_config( + "ignore_duplicate_slices" + ), ) else: # TODO Handle inconsistent slice spacing @@ -413,14 +431,18 @@ def convert(self, patient=None, force=True): # If not linked via referenced UID, then try to link via FOR if len(df_linked_series) == 0: for_uid = rt_struct_file.referenced_for_uid - df_linked_series = link_via_frame_of_reference(for_uid, df_preprocess) + df_linked_series = link_via_frame_of_reference( + for_uid, df_preprocess + ) # Check that the linked series is available # TODO handle rendering the masks even if we don't have an image series it's # linked to if len(df_linked_series) == 0: error_log = "Series Referenced by RTSTRUCT not found" - patient_logger.log_module_error("convert", sop_instance_hash, error_log) + patient_logger.log_module_error( + "convert", sop_instance_hash, error_log + ) if not output_dir.exists() or force: # Only convert if it doesn't already exist or if force is True @@ -447,11 +469,15 @@ def convert(self, patient=None, force=True): if config.get_config("generate_nrrd"): nrrd_file = output_dir.joinpath("STRUCTURE_SET.nrrd") - logger.info("Saving structures in nrrd format: %s", nrrd_file) + logger.info( + "Saving structures in nrrd format: %s", nrrd_file + ) write_nrrd_from_mask_directory( output_dir, nrrd_file, - matplotlib.colormaps.get_cmap(config.get_config("nrrd_colormap")), + matplotlib.colormaps.get_cmap( + config.get_config("nrrd_colormap") + ), ) # Save JSON @@ -513,7 +539,9 @@ def convert(self, patient=None, force=True): sop_instance_hash = hash_uid(rt_plan_file.sop_instance_uid) # Update the output directory for this plan - output_dir = patient_directory.joinpath(object_type, sop_instance_hash) + output_dir = patient_directory.joinpath( + object_type, sop_instance_hash + ) if not output_dir.exists() or force: # Only convert if it doesn't already exist or if force is True @@ -525,8 +553,12 @@ def convert(self, patient=None, force=True): entry["sop_instance_uid"] = rt_plan_file.sop_instance_uid entry["hashed_uid"] = sop_instance_hash - entry["referenced_sop_instance_uid"] = rt_plan_file.referenced_uid - entry["path"] = str(output_dir.relative_to(self.working_directory)) + entry[ + "referenced_sop_instance_uid" + ] = rt_plan_file.referenced_uid + entry["path"] = str( + output_dir.relative_to(self.working_directory) + ) self.add_entry(entry) patient_logger.eval_module_process("convert", sop_instance_hash) @@ -540,7 +572,9 @@ def convert(self, patient=None, force=True): sop_instance_hash = hash_uid(rt_dose_file.sop_instance_uid) # Update the output directory for this plan - output_dir = patient_directory.joinpath(object_type, sop_instance_hash) + output_dir = patient_directory.joinpath( + object_type, sop_instance_hash + ) if not output_dir.exists() or force: # Only convert if it doesn't already exist or if force is True @@ -550,7 +584,9 @@ def convert(self, patient=None, force=True): nifti_file.parent.mkdir(exist_ok=True, parents=True) logger.debug("Writing RTDOSE to: %s", nifti_file) convert_rtdose( - rt_dose_file.file_path, force=True, dose_output_path=nifti_file + rt_dose_file.file_path, + force=True, + dose_output_path=nifti_file, ) json_file = output_dir.joinpath("metadata.json") @@ -562,8 +598,12 @@ def 
convert(self, patient=None, force=True): entry["sop_instance_uid"] = rt_dose_file.sop_instance_uid entry["hashed_uid"] = sop_instance_hash - entry["referenced_sop_instance_uid"] = rt_dose_file.referenced_uid - entry["path"] = str(output_dir.relative_to(self.working_directory)) + entry[ + "referenced_sop_instance_uid" + ] = rt_dose_file.referenced_uid + entry["path"] = str( + output_dir.relative_to(self.working_directory) + ) self.add_entry(entry) @@ -578,7 +618,9 @@ def convert(self, patient=None, force=True): # Broad except ok here, since we will put these file into a # quarantine location for further inspection. logger.error( - "Unable to convert series for patient: %s with UID: %s", patient_id, series_uid + "Unable to convert series for patient: %s with UID: %s", + patient_id, + series_uid, ) logger.exception(e) @@ -588,7 +630,9 @@ def convert(self, patient=None, force=True): for f in df_files.file_path.tolist(): logger.error( - "Error parsing file %s: %s. Placing file into Quarantine folder...", f, e + "Error parsing file %s: %s. Placing file into Quarantine folder...", + f, + e, ) copy_file_to_quarantine(Path(f), self.working_directory, e) patient_logger.log_module_error("convert", sop_instance_hash, e) diff --git a/pydicer/dataset/preparation.py b/pydicer/dataset/preparation.py index d1f3fd9..23d15bc 100644 --- a/pydicer/dataset/preparation.py +++ b/pydicer/dataset/preparation.py @@ -47,7 +47,10 @@ def add_object_to_dataset(self, dataset_name: str, data_object_row: pd.Series): symlink_path = dataset_dir.joinpath(object_path.relative_to(CONVERTED_DIR_NAME)) rel_part = os.sep.join( - [".." for _ in symlink_path.parent.relative_to(self.working_directory).parts] + [ + ".." + for _ in symlink_path.parent.relative_to(self.working_directory).parts + ] ) src_path = Path(f"{rel_part}{os.sep}{object_path}") @@ -67,7 +70,10 @@ def add_object_to_dataset(self, dataset_name: str, data_object_row: pd.Series): df_converted = pd.read_csv(pat_converted_csv, index_col=0, dtype=col_types) # Check if this object already exists in the converted dataframe - if len(df_converted[df_converted.hashed_uid == data_object_row.hashed_uid]) == 0: + if ( + len(df_converted[df_converted.hashed_uid == data_object_row.hashed_uid]) + == 0 + ): # If not add it df_pat = pd.concat([df_converted, df_pat]) else: @@ -92,6 +98,9 @@ def prepare_from_dataframe(self, dataset_name: str, df_prepare: pd.DataFrame): "remove the existing directory" ) + # Create a copy of df_prepare + df_prepare = df_prepare.copy() + # Remove the working directory part for when we re-save off the filtered converted csv df_prepare.path = df_prepare.path.apply( lambda p: str(Path(p).relative_to(self.working_directory)) @@ -101,7 +110,9 @@ def prepare_from_dataframe(self, dataset_name: str, df_prepare: pd.DataFrame): for _, row in df_prepare.iterrows(): self.add_object_to_dataset(dataset_name, row) - def prepare(self, dataset_name: str, preparation_function: Callable, patients=None, **kwargs): + def prepare( + self, dataset_name: str, preparation_function: Callable, patients=None, **kwargs + ): """Calls upon an appropriate preparation function to generate a clean dataset ready for use. Additional keyword arguments are passed through to the preparation_function. 
@@ -124,7 +135,11 @@ def prepare(self, dataset_name: str, preparation_function: Callable, patients=No "preparation_function must be a function or a str defined in pydicer.dataset" ) - logger.info("Preparing dataset %s using function: %s", dataset_name, preparation_function) + logger.info( + "Preparing dataset %s using function: %s", + dataset_name, + preparation_function, + ) # Grab the DataFrame containing all the converted data df_converted = read_converted_data(self.working_directory, patients=patients) diff --git a/pydicer/preprocess/data.py b/pydicer/preprocess/data.py index 4770ca8..cccccbf 100644 --- a/pydicer/preprocess/data.py +++ b/pydicer/preprocess/data.py @@ -15,7 +15,7 @@ RT_STRUCTURE_STORAGE_UID, CT_IMAGE_STORAGE_UID, ) -from pydicer.quarantine.treat import copy_file_to_quarantine +from pydicer.quarantine import copy_file_to_quarantine from pydicer.utils import read_preprocessed_data, get_iterator logger = logging.getLogger(__name__) @@ -51,7 +51,6 @@ def scan_file(self, file): ds = pydicom.read_file(file, force=True) try: - dicom_type_uid = ds.SOPClassUID res_dict = { @@ -68,7 +67,6 @@ def scan_file(self, file): res_dict["for_uid"] = ds.FrameOfReferenceUID if dicom_type_uid == RT_STRUCTURE_STORAGE_UID: - try: referenced_series_uid = ( ds.ReferencedFrameOfReferenceSequence[0] @@ -84,15 +82,16 @@ def scan_file(self, file): # Check other tags for a linked DICOM # e.g. ds.ReferencedFrameOfReferenceSequence[0].FrameOfReferenceUID # Potentially, we should check each referenced - referenced_frame_of_reference_uid = ds.ReferencedFrameOfReferenceSequence[ - 0 - ].FrameOfReferenceUID + referenced_frame_of_reference_uid = ( + ds.ReferencedFrameOfReferenceSequence[0].FrameOfReferenceUID + ) res_dict["referenced_for_uid"] = referenced_frame_of_reference_uid except AttributeError: - logger.warning("Unable to determine Referenced Frame of Reference UID") + logger.warning( + "Unable to determine Referenced Frame of Reference UID" + ) elif dicom_type_uid == RT_PLAN_STORAGE_UID: - try: referenced_sop_instance_uid = ds.ReferencedStructureSetSequence[ 0 @@ -102,7 +101,6 @@ def scan_file(self, file): logger.warning("Unable to determine Reference Series UID") elif dicom_type_uid == RT_DOSE_STORAGE_UID: - try: referenced_sop_instance_uid = ds.ReferencedRTPlanSequence[ 0 @@ -112,18 +110,21 @@ def scan_file(self, file): logger.warning("Unable to determine Reference Series UID") elif dicom_type_uid in (CT_IMAGE_STORAGE_UID, PET_IMAGE_STORAGE_UID): - image_position = np.array(ds.ImagePositionPatient, dtype=float) image_orientation = np.array(ds.ImageOrientationPatient, dtype=float) - image_plane_normal = np.cross(image_orientation[:3], image_orientation[3:]) + image_plane_normal = np.cross( + image_orientation[:3], image_orientation[3:] + ) slice_location = (image_position * image_plane_normal)[2] res_dict["slice_location"] = slice_location else: - raise ValueError(f"Could not determine DICOM type {ds.Modality} {dicom_type_uid}.") + raise ValueError( + f"Could not determine DICOM type {ds.Modality} {dicom_type_uid}." 
diff --git a/pydicer/quarantine/treat.py b/pydicer/quarantine.py
similarity index 74%
rename from pydicer/quarantine/treat.py
rename to pydicer/quarantine.py
index 0a7eb92..1123a84 100644
--- a/pydicer/quarantine/treat.py
+++ b/pydicer/quarantine.py
@@ -20,11 +20,14 @@ def copy_file_to_quarantine(file, working_directory, error_msg):

     # Attempt to get some header information from the DICOM object to write into the summary
-    summary_dict = {"file": file, "error": error_msg, "quarantine_dttm": datetime.datetime.now()}
+    summary_dict = {
+        "file": file,
+        "error": error_msg,
+        "quarantine_dttm": datetime.datetime.now(),
+    }

     ds = pydicom.read_file(file, force=True)
     for k in QUARATINE_DICOM_KEYS:
-
         val = None
         if k in ds:
             val = ds[k].value
@@ -58,13 +61,20 @@ def copy_file_to_quarantine(file, working_directory, error_msg):
     df_summary.to_csv(summary_file)


-class TreatImages:
-    """
-    Class to treat the quarantined images and prepare it for further processing
+def read_quarantined_data(working_directory: Path):
+    """A function to read the data from the quarantine summary.

     Args:
-        quaran_directory (Path): path to the quarantine directory
+        working_directory (pathlib.Path): The PyDicer working directory
+
+    Returns:
+        pd.DataFrame: A DataFrame summarising the contents of the quarantine.
     """

-    def __init__(self, quaran_directory):
-        self.quaran_directory = quaran_directory
+    quarantine_dir = Path(working_directory).joinpath("quarantine")
+
+    summary_file = quarantine_dir.joinpath("summary.csv")
+
+    df_summary = pd.read_csv(summary_file, index_col=0)
+
+    return df_summary
diff --git a/pydicer/quarantine/__init__.py b/pydicer/quarantine/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/pydicer/tool.py b/pydicer/tool.py
index edf52dd..07d47f7 100644
--- a/pydicer/tool.py
+++ b/pydicer/tool.py
@@ -16,7 +16,14 @@
 from pydicer.dataset.preparation import PrepareDataset
 from pydicer.analyse.data import AnalyseData

-from pydicer.utils import read_converted_data, add_structure_name_mapping, copy_doc
+from pydicer.utils import (
+    read_preprocessed_data,
+    read_converted_data,
+    add_structure_name_mapping,
+    get_structures_linked_to_dose,
+    copy_doc,
+)
+from pydicer.quarantine import read_quarantined_data

 from pydicer.generate.object import add_object, add_structure_object, add_dose_object
 from pydicer.generate.segmentation import (
@@ -42,7 +49,6 @@ class PyDicer:
     """

     def __init__(self, working_directory="."):
-
         self.working_directory = Path(working_directory)
         self.pydicer_directory = self.working_directory.joinpath(PYDICER_DIR_NAME)

@@ -111,14 +117,18 @@ def update_logging(self):
             maxBytes=100 * 1024 * 1024,  # Max 100 MB per log file before rotating
             backupCount=100,  # Keep up to 100 log files in history
         )
-        file_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+        file_formatter = logging.Formatter(
+            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        )
         file_handler.setFormatter(file_formatter)
         file_handler.setLevel(logging.DEBUG)
         logger.addHandler(file_handler)

         if verbosity > 0:
             console_handler = logging.StreamHandler(sys.stdout)
-            console_formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
+            console_formatter = logging.Formatter(
+                "%(name)s - %(levelname)s - %(message)s"
+            )
             console_handler.setFormatter(console_formatter)
             console_handler.setLevel(verbosity)
             logger.addHandler(console_handler)
@@ -145,7 +155,9 @@ def add_input(self, input_obj):
             self.dicom_directories.append(dicom_path)
             logger.debug("Added DICOM input path: %s", dicom_path)
         else:
-            raise ValueError("input_obj must be of type str, pathlib.Path or inherit InputBase")
+            raise ValueError(
+                "input_obj must be of type str, pathlib.Path or inherit InputBase"
+            )

     def preprocess(self, force=True):
         """Preprocess the DICOM data in preparation for conversion
@@ -156,7 +168,9 @@ def preprocess(self, force=True):
         """

         if len(self.dicom_directories) == 0:
-            raise ValueError("No DICOM input locations set. Add one using the add_input function.")
+            raise ValueError(
+                "No DICOM input locations set. Add one using the add_input function."
+            )

         preprocess_data = PreprocessData(self.working_directory)
         preprocess_data.preprocess(self.dicom_directories, force=force)
@@ -182,62 +196,72 @@ def run_pipeline(self, patient=None, force=True):
         self.analyse.compute_radiomics(
             dataset_name=CONVERTED_DIR_NAME, patient=patient, force=force
         )
-        self.analyse.compute_dvh(dataset_name=CONVERTED_DIR_NAME, patient=patient, force=force)
+        self.analyse.compute_dvh(
+            dataset_name=CONVERTED_DIR_NAME, patient=patient, force=force
+        )

     @copy_doc(add_structure_name_mapping, remove_args=["working_directory"])
     def add_structure_name_mapping(  # pylint: disable=missing-function-docstring
         self, *args, **kwargs
     ) -> pd.DataFrame:
-
         return add_structure_name_mapping(
             *args, working_directory=self.working_directory, **kwargs
         )

+    @copy_doc(read_preprocessed_data, remove_args=["working_directory"])
+    def read_preprocessed_data(  # pylint: disable=missing-function-docstring
+        self,
+    ) -> pd.DataFrame:
+        return read_preprocessed_data(working_directory=self.working_directory)
+
     @copy_doc(read_converted_data, remove_args=["working_directory"])
     def read_converted_data(  # pylint: disable=missing-function-docstring
         self, *_, **kwargs
     ) -> pd.DataFrame:
-
         return read_converted_data(working_directory=self.working_directory, **kwargs)

+    @copy_doc(read_quarantined_data, remove_args=["working_directory"])
+    # pylint: disable=missing-function-docstring
+    def read_quarantined_data(self) -> pd.DataFrame:
+        return read_quarantined_data(working_directory=self.working_directory)
+
+    @copy_doc(get_structures_linked_to_dose, remove_args=["working_directory"])
+    # pylint: disable=missing-function-docstring
+    def get_structures_linked_to_dose(self, *args, **kwargs) -> pd.DataFrame:
+        return get_structures_linked_to_dose(self.working_directory, *args, **kwargs)
+
     @copy_doc(add_object, remove_args=["working_directory"])
     def add_object(  # pylint: disable=missing-function-docstring
         self, *args, **kwargs
     ) -> pd.DataFrame:
-
         return add_object(self.working_directory, *args, **kwargs)

     @copy_doc(add_structure_object, remove_args=["working_directory"])
     def add_structure_object(  # pylint: disable=missing-function-docstring
         self, *args, **kwargs
     ) -> pd.DataFrame:
-
         return add_structure_object(self.working_directory, *args, **kwargs)

     @copy_doc(add_dose_object, remove_args=["working_directory"])
     def add_dose_object(  # pylint: disable=missing-function-docstring
         self, *args, **kwargs
     ) -> pd.DataFrame:
-
         return add_dose_object(self.working_directory, *args, **kwargs)

     @copy_doc(read_all_segmentation_logs, remove_args=["working_directory"])
     def read_all_segmentation_logs(  # pylint: disable=missing-function-docstring
         self, *args, **kwargs
     ) -> pd.DataFrame:
-
         return read_all_segmentation_logs(self.working_directory, *args, **kwargs)

     @copy_doc(segment_image, remove_args=["working_directory"])
     def segment_image(  # pylint: disable=missing-function-docstring
         self, *args, **kwargs
     ) -> pd.DataFrame:
-
         return segment_image(self.working_directory, *args, **kwargs)

     @copy_doc(segment_dataset, remove_args=["working_directory"])
     def segment_dataset(  # pylint: disable=missing-function-docstring
         self, *args, **kwargs
     ) -> pd.DataFrame:
-
         return segment_dataset(self.working_directory, *args, **kwargs)
diff --git a/pydicer/utils.py b/pydicer/utils.py
index 426a25a..23c3b0d 100644
--- a/pydicer/utils.py
+++ b/pydicer/utils.py
@@ -382,7 +382,9 @@ def map_structure_name(struct_name, struct_map_dict):
         str: the mapped structure name
     """
     # Check if the structure name needs to be mapped
-    mapped_struct_name_set = {i for i in struct_map_dict if struct_name in struct_map_dict[i]}
+    mapped_struct_name_set = {
+        i for i in struct_map_dict if struct_name in struct_map_dict[i]
+    }

     # If not true, then either the structure name is already in mapped form, or the structure name
     # is not being captured in the specific mapping dictionary
@@ -392,7 +394,9 @@ def map_structure_name(struct_name, struct_map_dict):
     return struct_name


-def get_structures_linked_to_dose(working_directory: Path, dose_row: pd.Series) -> pd.DataFrame:
+def get_structures_linked_to_dose(
+    working_directory: Path, dose_row: pd.Series
+) -> pd.DataFrame:
     """Get the structure sets which are linked to a dose object.

     Args:
@@ -423,7 +427,8 @@ def get_structures_linked_to_dose(working_directory: Path, dose_row: pd.Series)

     # Also link via Frame of Reference
     df_for_linked = df_converted[
-        (df_converted["modality"] == "RTSTRUCT") & (df_converted["for_uid"] == dose_row.for_uid)
+        (df_converted["modality"] == "RTSTRUCT")
+        & (df_converted["for_uid"] == dose_row.for_uid)
     ]

     if df_linked_struct is None:
@@ -475,7 +480,9 @@ def add_structure_name_mapping(
     if structure_set_row is not None:
         # Mapping for specific structure set
         logger.info(
-            "Adding mapping %s for structure set %s", mapping_id, structure_set_row.hashed_uid
+            "Adding mapping %s for structure set %s",
+            mapping_id,
+            structure_set_row.hashed_uid,
         )

         mapping_path_base = Path(structure_set_row.path)
@@ -576,7 +583,7 @@ def fetch_converted_test_data(working_directory=None, dataset="HNSCC"):
         zip_url = "https://zenodo.org/record/8237552/files/HNSCC_pydicer.zip"
         working_name = "testdata"
     elif dataset == "LCTSC":
-        zip_url = "https://zenodo.org/records/10005835/files/LCTSC_pydicer.zip"
+        zip_url = "https://zenodo.org/records/10254078/files/LCTSC_pydicer.zip"
         working_name = "LCTSC"
     else:
         raise ValueError(f"Unknown dataset {dataset}")
@@ -618,7 +625,11 @@ def wrapped(func):

         for arg in remove_args:
             func.__doc__ = "\n".join(
-                [line for line in func.__doc__.split("\n") if not line.strip().startswith(arg)]
+                [
+                    line
+                    for line in func.__doc__.split("\n")
+                    if not line.strip().startswith(arg)
+                ]
             )

         return func
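Taken together, the new `PyDicer` wrappers above can be exercised as below. This is a hedged sketch: the `file` and `error` columns follow the summary keys written by `copy_file_to_quarantine`, and it assumes converted data and a quarantine summary already exist under `./working`.

```python
from pydicer import PyDicer

pydicer = PyDicer("./working")

# Review anything sent to quarantine during preprocessing or conversion.
df_quarantine = pydicer.read_quarantined_data()
print(df_quarantine[["file", "error"]])

# For each RTDOSE object, list the structure sets linked to it, either via
# the referenced plan or via a shared Frame of Reference.
df = pydicer.read_converted_data()
for _, dose_row in df[df["modality"] == "RTDOSE"].iterrows():
    df_structs = pydicer.get_structures_linked_to_dose(dose_row)
    print(dose_row.hashed_uid, df_structs.hashed_uid.tolist())
```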