Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of nf-gpt into crisprseq pipeline. #193

Open
wants to merge 24 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1c830ed
added nf-gpt plugin to nextflow.config
Sep 2, 2024
96074ea
introduced new boolean parameter gpt_interpretation that controls nf-…
Sep 3, 2024
c34bcc9
changed min nxf version to 24.03.0-edge
Sep 4, 2024
e30179b
added nf-gpt parameters to nextflow.config
Sep 4, 2024
6211a12
gpt plugin can now be parsed with individual data from drugZ, bagel2 …
Sep 5, 2024
fbc2de6
gpt questions can now be specified in custom config.
Sep 8, 2024
da32fa1
Data parser refactoring. Now 1 parser can handle data from different …
Sep 9, 2024
2d20407
merge gpt changes to dev branch
Sep 11, 2024
a936cfb
adjusted some formating
Sep 11, 2024
78c2c74
Added rra support. Also, gpt interpretation now only works when modul…
Sep 11, 2024
615618f
grouped nf-gpt related parameters in nextflow_schema.json.
Sep 12, 2024
295045c
added more checks before nf-gpt calling
Sep 17, 2024
e8c2b56
Merge branch 'dev' into dev
LaurenceKuhl Sep 18, 2024
05c4d56
Merge branch 'dev' into dev
LaurenceKuhl Sep 19, 2024
0aee7db
Parsing process now selects column by index not by name. Adjusted max…
Sep 29, 2024
cb0442b
Merge branch 'dev' of github.com:LeonHornich/crisprseq into dev
Sep 29, 2024
f4ce2ec
Updated schema json. Ran code formater.
Sep 29, 2024
eecb6f4
updated default gpt question for each module, removing request for re…
Oct 5, 2024
71b745e
updated schema file with new default values.
Oct 5, 2024
c55bec1
Merge branch 'dev' into dev
LaurenceKuhl Oct 18, 2024
e37c4d1
Merge branch 'dev' into dev
LaurenceKuhl Nov 11, 2024
4e2b551
Update nextflow.config
LaurenceKuhl Nov 11, 2024
ef44d1d
Merge branch 'nf-core:dev' into dev
LeonHornich Feb 22, 2025
f3fd5f3
Merge branch 'nf-core:dev' into dev
LeonHornich Mar 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
strategy:
matrix:
NXF_VER:
- "23.04.0"
- "24.03.0-edge"
- "latest-everything"
ANALYSIS:
- "test_screening"
Expand Down
15 changes: 15 additions & 0 deletions modules/local/gpt_prepare_query.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Prepare a plain-text GPT query file from a tab-separated per-gene results table.
// The 'generateGptQuery.py' template ranks genes by the given column, keeps the
// top 'count' of them, and writes the question followed by one gene id per line.
process GPT_PREPARE_QUERY {
input:
path data // tab-separated results file; gene ids in the first column
val source // label used in the output file name (e.g. "drugZ", "mle")
val column // header name of the column to rank genes by
val count // number of top genes to keep
val mode // "low" (ascending) or "high" (descending) sort of the target column
val question // question text written as the first line of the query file

output:
path "gpt_${source}_query.txt", emit: query

script:
template 'generateGptQuery.py'
}
11 changes: 0 additions & 11 deletions modules/local/prepare_gpt_input.nf

This file was deleted.

24 changes: 24 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ params {
overrepresented = false
umi_clustering = false
skip_clonality = false
gpt_interpretation = null

// Nf-gpt parameters
gpt_drugz_gene_amount = 100
gpt_drugz_question = "Which of the following genes enhance or supress drug activity?"
gpt_mle_gene_amount = 100
gpt_mle_question = "What genes are known to have pan-effects on cancer?"
gpt_bagel2_gene_amount = 100
gpt_bagel_question = "What can you tell me about these genes in the context of functional genomics?"
gpt_rra_gene_amount = 100
gpt_rra_question = "What genes are known to have pan-effects on cancer?"

// UMI parameters
umi_bin_size = 1
Expand Down Expand Up @@ -97,6 +108,18 @@ params {

}

// nf-gpt plugin settings (read by the nf-gpt plugin declared in the plugins block)
gpt {
// OpenAI API key; the user must provide a functioning key — there is no default
apiKey = null
// Models available in nf-gpt 0.4.0: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4, and gpt-3.5-turbo
model = "gpt-4o-mini"
// Maximum number of tokens per response; the limit depends on the chosen model: https://platform.openai.com/docs/models
maxTokens = 10000
// Sampling temperature (lower = more deterministic, higher = more varied output)
temperature = 0.7
}

// Load base.config by default for all pipelines
includeConfig 'conf/base.config'

Expand Down Expand Up @@ -235,6 +258,7 @@ singularity.registry = 'quay.io'
// Nextflow plugins
plugins {
id '[email protected]' // Validation of pipeline parameters and creation of an input channel from a sample sheet
id '[email protected]' // Allows access to nf-gpt functionality
}

// Load igenomes.config if required
Expand Down
48 changes: 47 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,10 @@
"description": "Essential genes to remove from the drugZ modules",
"pattern": "\\\\S+"
},
"gpt_interpretation": {
"type": "string",
"description": "Comma-separated list of analysis modules (drugz, mle, bagel2, rra) whose results should be interpreted with the nf-gpt plugin"
},
"hitselection": {
"type": "boolean",
"description": "Specify to run the Hitselection algorithm"
Expand Down Expand Up @@ -542,5 +546,47 @@
{
"$ref": "#/definitions/generic_options"
}
]
],
"properties": {
"gpt_drugz_gene_amount": {
"type": "integer",
"default": 100,
"description": "Number of top genes to be selected from drugZ."
},
"gpt_drugz_question": {
"type": "string",
"default": "Which of the following genes enhance or supress drug activity?",
"description": "Question passed to GPT together with the drugZ data."
},
"gpt_mle_gene_amount": {
"type": "integer",
"default": 100,
"description": "Number of top genes to be selected from mle."
},
"gpt_mle_question": {
"type": "string",
"default": "What genes are known to have pan-effects on cancer?",
"description": "Question passed to GPT together with the MAGeCK MLE data."
},
"gpt_bagel2_gene_amount": {
"type": "integer",
"default": 100,
"description": "Number of top genes to be selected from bagel2."
},
"gpt_bagel_question": {
"type": "string",
"default": "What can you tell me about these genes in the context of functional genomics?",
"description": "Question passed to GPT together with the BAGEL2 data."
},
"gpt_rra_gene_amount": {
"type": "integer",
"default": 100,
"description": "Number of top genes to be selected from rra."
},
"gpt_rra_question": {
"type": "string",
"default": "What genes are known to have pan-effects on cancer?",
"description": "Question passed to GPT together with the MAGeCK RRA data."
}
}
}
51 changes: 51 additions & 0 deletions templates/generateGptQuery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

# Nextflow template script: the dollar-brace placeholders below are substituted
# by Nextflow with the process inputs before the script is executed.
#
# Builds a GPT query file: the question on the first line, followed by the ids
# of the top-ranked genes, one per line.

import sys


def read_gene_values(path, column_name):
    """Parse a tab-separated results file.

    Returns a list of (gene_id, value) tuples, taking gene ids from the first
    column and values from `column_name`. Rows that are too short or whose
    value cannot be parsed as a float (e.g. 'NA') are skipped.

    Exits the script with an error if `column_name` is not in the header.
    """
    with open(path, "r") as handle:
        header = handle.readline().strip().split("\t")
        if column_name not in header:
            # Abort instead of continuing with a missing column
            sys.exit(
                "Error: The specified column '" + column_name + "' was not found in the data file!"
            )
        column_index = header.index(column_name)
        records = []
        for line in handle:
            row = line.strip().split("\t")
            if len(row) <= column_index:
                continue  # skip malformed/short rows
            try:
                records.append((row[0], float(row[column_index])))
            except ValueError:
                continue  # skip non-numeric values such as 'NA'
    return records


def select_top_genes(records, count, sort_mode):
    """Return the ids of the top `count` genes ranked by value.

    sort_mode 'low' ranks ascending (smallest values first); 'high' ranks
    descending. Any other mode aborts the script with an error.
    """
    if sort_mode == "low":
        ordered = sorted(records, key=lambda record: record[1])
    elif sort_mode == "high":
        ordered = sorted(records, key=lambda record: record[1], reverse=True)
    else:
        # Abort instead of continuing with undefined ordering
        sys.exit("Error: Please provide either 'low' or 'high' as mode.")
    return [gene_id for gene_id, _ in ordered[:count]]


if __name__ == "__main__":
    # Process input variables substituted by Nextflow
    data_path = "${data}"
    target_column = "${column}"
    num_genes = int("${count}")
    mode = "${mode}"
    question = "${question}"

    top_gene_ids = select_top_genes(
        read_gene_values(data_path, target_column), num_genes, mode
    )

    # Write the question followed by one gene id per line
    with open("gpt_${source}_query.txt", "w") as query_file:
        query_file.write(question + "\n")
        for gene_id in top_gene_ids:
            query_file.write(gene_id + "\n")
109 changes: 109 additions & 0 deletions workflows/crisprseq_screening.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ include { MAGECK_FLUTEMLE } from '../modules/local/
include { MAGECK_FLUTEMLE as MAGECK_FLUTEMLE_CONTRASTS } from '../modules/local/mageck/flutemle'
include { MAGECK_FLUTEMLE as MAGECK_FLUTEMLE_DAY0 } from '../modules/local/mageck/flutemle'
include { VENNDIAGRAM } from '../modules/local/venndiagram'
include { GPT_PREPARE_QUERY as GPT_PREPARE_BAGEL2_QUERY} from '../modules/local/gpt_prepare_query'
include { GPT_PREPARE_QUERY as GPT_PREPARE_DRUGZ_QUERY } from '../modules/local/gpt_prepare_query'
include { GPT_PREPARE_QUERY as GPT_PREPARE_MLE_QUERY } from '../modules/local/gpt_prepare_query'
include { GPT_PREPARE_QUERY as GPT_PREPARE_RRA_QUERY } from '../modules/local/gpt_prepare_query'

// nf-core modules
include { FASTQC } from '../modules/nf-core/fastqc/main'
include { CUTADAPT as CUTADAPT_THREE_PRIME } from '../modules/nf-core/cutadapt/main'
Expand All @@ -35,6 +40,7 @@ include { BOWTIE2_ALIGN } from '../modules/nf-cor
// Local subworkflows
include { INITIALISATION_CHANNEL_CREATION_SCREENING } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline'
// Functions
include { gptPromptForText } from 'plugin/nf-gpt'
include { paramsSummaryMap } from 'plugin/nf-validation'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
Expand Down Expand Up @@ -357,6 +363,109 @@ workflow CRISPRSEQ_SCREENING {

}

//
// Calling of nf-gpt plugin on drugZ, MAGeCK MLE, BAGEL2 or MAGeCK RRA results
//
// Guard against the default (null) value before splitting: calling split()
// on null would fail every run that does not set --gpt_interpretation.
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('drugz')) {
    if(params.drugz) {
        // Strip the meta map; only the per-gene results table is needed
        def gpt_drugz_data = DRUGZ.out.per_gene_results.map { meta, genes -> genes }
        def gpt_drugz_source = "drugZ"
        // NOTE(review): ranks by highest pval_supp — confirm this selects the intended genes
        def gpt_drugz_target_column = "pval_supp"
        def gpt_drugz_mode = "high"
        GPT_PREPARE_DRUGZ_QUERY(
            gpt_drugz_data,
            gpt_drugz_source,
            gpt_drugz_target_column,
            params.gpt_drugz_gene_amount,
            gpt_drugz_mode,
            params.gpt_drugz_question
        )

        // Send the prepared query text to the GPT model and save the answer
        GPT_PREPARE_DRUGZ_QUERY.out.query.map {
            it -> it.text
        }
        .collect()
        .flatMap { it -> gptPromptForText(it[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_drugz_output.txt", newLine: true, sort: false )
    } else {
        error "You specified DrugZ for gpt interpretation, but DrugZ is not running."
    }
}
// Guard against the default (null) value before splitting: calling split()
// on null would fail every run that does not set --gpt_interpretation.
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('mle')) {
    if(params.mle) {
        // Strip the meta map; only the gene summary table is needed
        def gpt_mle_data = MAGECK_MLE.out.gene_summary.map { meta, genes -> genes }
        def gpt_mle_source = "mle"
        // NOTE(review): column name assumes a 'control_vs_treatment' contrast and
        // ranks by highest p-value — confirm both against the actual MLE output
        def gpt_mle_target_column = "control_vs_treatment|p-value"
        def gpt_mle_mode = "high"
        GPT_PREPARE_MLE_QUERY(
            gpt_mle_data,
            gpt_mle_source,
            gpt_mle_target_column,
            params.gpt_mle_gene_amount,
            gpt_mle_mode,
            params.gpt_mle_question
        )

        // Send the prepared query text to the GPT model and save the answer
        GPT_PREPARE_MLE_QUERY.out.query.map {
            it -> it.text
        }
        .collect()
        .flatMap { it -> gptPromptForText(it[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_mle_output.txt", newLine: true, sort: false )
    } else {
        error "You specified MAGeCK MLE for gpt interpretation, but MAGeCK MLE is not running."
    }
}
// Guard against the default (null) value before splitting: calling split()
// on null would fail every run that does not set --gpt_interpretation.
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('bagel2')) {
    if(params.bagel2) {
        // Strip the meta map; only the Bayes-factor table is needed
        def gpt_bagel2_data = BAGEL2_BF.out.bf.map { meta, genes -> genes }
        def gpt_bagel2_source = "bagel2"
        // Highest Bayes factor first
        def gpt_bagel2_target_column = "BF"
        def gpt_bagel2_mode = "high"
        GPT_PREPARE_BAGEL2_QUERY(
            gpt_bagel2_data,
            gpt_bagel2_source,
            gpt_bagel2_target_column,
            params.gpt_bagel2_gene_amount,
            gpt_bagel2_mode,
            params.gpt_bagel_question
        )

        // Send the prepared query text to the GPT model and save the answer
        GPT_PREPARE_BAGEL2_QUERY.out.query.map {
            it -> it.text
        }
        .collect()
        .flatMap { it -> gptPromptForText(it[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_bagel2_output.txt", newLine: true, sort: false )
    } else {
        error "You specified BAGEL2 for gpt interpretation, but BAGEL2 is not running."
    }
}
// Guard against the default (null) value before splitting: calling split()
// on null would fail every run that does not set --gpt_interpretation.
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('rra')) {
    if(params.rra) {
        // Strip the meta map; only the gene summary table is needed
        def gpt_rra_data = MAGECK_TEST.out.gene_summary.map { meta, genes -> genes }
        def gpt_rra_source = "rra"
        // Best (lowest) negative-selection rank first
        def gpt_rra_target_column = "neg|rank"
        def gpt_rra_mode = "low"
        GPT_PREPARE_RRA_QUERY(
            gpt_rra_data,
            gpt_rra_source,
            gpt_rra_target_column,
            params.gpt_rra_gene_amount,
            gpt_rra_mode,
            params.gpt_rra_question
        )

        // Send the prepared query text to the GPT model and save the answer
        GPT_PREPARE_RRA_QUERY.out.query.map {
            it -> it.text
        }
        .collect()
        .flatMap { it -> gptPromptForText(it[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_rra_output.txt", newLine: true, sort: false )
    } else {
        error "You specified MAGeCK RRA for gpt interpretation, but MAGeCK RRA is not running."
    }
}

if(params.mle && params.bagel2) {
ch_venndiagram = BAGEL2_PR.out.pr.join(MAGECK_MLE.out.gene_summary)
Expand Down