Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of nf-gpt into crisprseq pipeline. #193

Open
wants to merge 24 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1c830ed
added nf-gpt plugin to nextflow.config
Sep 2, 2024
96074ea
introduced new boolean parameter gpt_interpretation that controls nf-…
Sep 3, 2024
c34bcc9
changed min nxf version to 24.03.0-edge
Sep 4, 2024
e30179b
added nf-gpt parameters to nextflow.config
Sep 4, 2024
6211a12
gpt plugin can now be parsed with individual data from drugZ, bagel2 …
Sep 5, 2024
fbc2de6
gpt questions can now be specified in custom config.
Sep 8, 2024
da32fa1
Data parser refactoring. Now 1 parser can handle data from different …
Sep 9, 2024
2d20407
merge gpt changes to dev branch
Sep 11, 2024
a936cfb
adjusted some formating
Sep 11, 2024
78c2c74
Added rra support. Also, gpt interpretation now only works when modul…
Sep 11, 2024
615618f
grouped nf-gpt related parameters in nextflow_schema.json.
Sep 12, 2024
295045c
added more checks before nf-gpt calling
Sep 17, 2024
e8c2b56
Merge branch 'dev' into dev
LaurenceKuhl Sep 18, 2024
05c4d56
Merge branch 'dev' into dev
LaurenceKuhl Sep 19, 2024
0aee7db
Parsing process now selects column by index not by name. Adjusted max…
Sep 29, 2024
cb0442b
Merge branch 'dev' of github.com:LeonHornich/crisprseq into dev
Sep 29, 2024
f4ce2ec
Updated schema json. Ran code formater.
Sep 29, 2024
eecb6f4
updated default gpt question for each module, removing request for re…
Oct 5, 2024
71b745e
updated schema file with new default values.
Oct 5, 2024
c55bec1
Merge branch 'dev' into dev
LaurenceKuhl Oct 18, 2024
e37c4d1
Merge branch 'dev' into dev
LaurenceKuhl Nov 11, 2024
4e2b551
Update nextflow.config
LaurenceKuhl Nov 11, 2024
ef44d1d
Merge branch 'nf-core:dev' into dev
LeonHornich Feb 22, 2025
f3fd5f3
Merge branch 'nf-core:dev' into dev
LeonHornich Mar 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
strategy:
matrix:
NXF_VER:
- "23.04.0"
- "24.03.0-edge"
- "latest-everything"
ANALYSIS:
- "test_screening"
Expand Down
15 changes: 15 additions & 0 deletions modules/local/gpt_prepare_query.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Prepare a plain-text GPT query file from a tab-separated per-gene results table.
// The 'generateGptQuery.py' template ranks genes by the given column, keeps the
// top 'count' of them, and writes the question followed by one gene id per line.
process GPT_PREPARE_QUERY {
input:
path data // tab-separated results file; gene ids in the first column
val source // label used in the output file name (e.g. "drugZ", "mle")
val column // header name of the column to rank genes by
val count // number of top genes to keep
val mode // "low" (ascending) or "high" (descending) sort of the target column
val question // question text written as the first line of the query file

output:
path "gpt_${source}_query.txt", emit: query

script:
template 'generateGptQuery.py'
}
11 changes: 0 additions & 11 deletions modules/local/prepare_gpt_input.nf

This file was deleted.

24 changes: 24 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ params {
overrepresented = false
umi_clustering = false
skip_clonality = false
gpt_interpretation = null

// Nf-gpt parameters
gpt_drugz_gene_amount = 100
gpt_drugz_question = "Which of the following genes enhance or supress drug activity?"
gpt_mle_gene_amount = 100
gpt_mle_question = "What genes are known to have pan-effects on cancer?"
gpt_bagel2_gene_amount = 100
gpt_bagel_question = "What can you tell me about these genes in the context of functional genomics?"
gpt_rra_gene_amount = 100
gpt_rra_question = "What genes are known to have pan-effects on cancer?"

// UMI parameters
umi_bin_size = 1
Expand Down Expand Up @@ -97,6 +108,18 @@ params {

}

// nf-gpt plugin settings (read by the nf-gpt plugin declared in the plugins block)
gpt {
// OpenAI API key; the user must provide a functioning key — there is no default
apiKey = null
// Models available in nf-gpt 0.4.0: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4, and gpt-3.5-turbo
model = "gpt-4o-mini"
// Maximum number of tokens per response; the limit depends on the chosen model: https://platform.openai.com/docs/models
maxTokens = 10000
// Sampling temperature (lower = more deterministic, higher = more varied output)
temperature = 0.7
}

// Load base.config by default for all pipelines
includeConfig 'conf/base.config'

Expand Down Expand Up @@ -235,6 +258,7 @@ singularity.registry = 'quay.io'
// Nextflow plugins
plugins {
id '[email protected]' // Validation of pipeline parameters and creation of an input channel from a sample sheet
id '[email protected]' // Allows access to nf-gpt functionality
}

// Load igenomes.config if required
Expand Down
48 changes: 47 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,10 @@
"description": "Essential genes to remove from the drugZ modules",
"pattern": "\\\\S+"
},
"gpt_interpretation": {
"type": "string",
"description": "Comma-separated list of analysis modules (drugz, mle, bagel2, rra) whose results should be interpreted with the nf-gpt plugin"
},
"hitselection": {
"type": "boolean",
"description": "Specify to run the Hitselection algorithm"
Expand Down Expand Up @@ -542,5 +546,47 @@
{
"$ref": "#/definitions/generic_options"
}
]
],
"properties": {
"gpt_drugz_gene_amount": {
"type": "integer",
"default": 100,
"description": "Number of top genes to be selected from drugZ."
},
"gpt_drugz_question": {
"type": "string",
"default": "Which of the following genes enhance or supress drug activity?",
"description": "Question passed to GPT together with the drugZ data."
},
"gpt_mle_gene_amount": {
"type": "integer",
"default": 100,
"description": "Number of top genes to be selected from mle."
},
"gpt_mle_question": {
"type": "string",
"default": "What genes are known to have pan-effects on cancer?",
"description": "Question passed to GPT together with the MAGeCK MLE data."
},
"gpt_bagel2_gene_amount": {
"type": "integer",
"default": 100,
"description": "Number of top genes to be selected from bagel2."
},
"gpt_bagel_question": {
"type": "string",
"default": "What can you tell me about these genes in the context of functional genomics?",
"description": "Question passed to GPT together with the BAGEL2 data."
},
"gpt_rra_gene_amount": {
"type": "integer",
"default": 100,
"description": "Number of top genes to be selected from rra."
},
"gpt_rra_question": {
"type": "string",
"default": "What genes are known to have pan-effects on cancer?",
"description": "Question passed to GPT together with the MAGeCK RRA data."
}
}
}
51 changes: 51 additions & 0 deletions templates/generateGptQuery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

# Nextflow template script: the dollar-brace placeholders below are substituted
# by Nextflow with the process inputs before the script is executed.
#
# Builds a GPT query file: the question on the first line, followed by the ids
# of the top-ranked genes, one per line.

import sys


def read_gene_values(path, column_name):
    """Parse a tab-separated results file.

    Returns a list of (gene_id, value) tuples, taking gene ids from the first
    column and values from `column_name`. Rows that are too short or whose
    value cannot be parsed as a float (e.g. 'NA') are skipped.

    Exits the script with an error if `column_name` is not in the header.
    """
    with open(path, "r") as handle:
        header = handle.readline().strip().split("\t")
        if column_name not in header:
            # Abort instead of continuing with a missing column
            sys.exit(
                "Error: The specified column '" + column_name + "' was not found in the data file!"
            )
        column_index = header.index(column_name)
        records = []
        for line in handle:
            row = line.strip().split("\t")
            if len(row) <= column_index:
                continue  # skip malformed/short rows
            try:
                records.append((row[0], float(row[column_index])))
            except ValueError:
                continue  # skip non-numeric values such as 'NA'
    return records


def select_top_genes(records, count, sort_mode):
    """Return the ids of the top `count` genes ranked by value.

    sort_mode 'low' ranks ascending (smallest values first); 'high' ranks
    descending. Any other mode aborts the script with an error.
    """
    if sort_mode == "low":
        ordered = sorted(records, key=lambda record: record[1])
    elif sort_mode == "high":
        ordered = sorted(records, key=lambda record: record[1], reverse=True)
    else:
        # Abort instead of continuing with undefined ordering
        sys.exit("Error: Please provide either 'low' or 'high' as mode.")
    return [gene_id for gene_id, _ in ordered[:count]]


if __name__ == "__main__":
    # Process input variables substituted by Nextflow
    data_path = "${data}"
    target_column = "${column}"
    num_genes = int("${count}")
    mode = "${mode}"
    question = "${question}"

    top_gene_ids = select_top_genes(
        read_gene_values(data_path, target_column), num_genes, mode
    )

    # Write the question followed by one gene id per line
    with open("gpt_${source}_query.txt", "w") as query_file:
        query_file.write(question + "\n")
        for gene_id in top_gene_ids:
            query_file.write(gene_id + "\n")
109 changes: 109 additions & 0 deletions workflows/crisprseq_screening.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ include { MAGECK_FLUTEMLE } from '../modules/local/
include { MAGECK_FLUTEMLE as MAGECK_FLUTEMLE_CONTRASTS } from '../modules/local/mageck/flutemle'
include { MAGECK_FLUTEMLE as MAGECK_FLUTEMLE_DAY0 } from '../modules/local/mageck/flutemle'
include { VENNDIAGRAM } from '../modules/local/venndiagram'
include { GPT_PREPARE_QUERY as GPT_PREPARE_BAGEL2_QUERY} from '../modules/local/gpt_prepare_query'
include { GPT_PREPARE_QUERY as GPT_PREPARE_DRUGZ_QUERY } from '../modules/local/gpt_prepare_query'
include { GPT_PREPARE_QUERY as GPT_PREPARE_MLE_QUERY } from '../modules/local/gpt_prepare_query'
include { GPT_PREPARE_QUERY as GPT_PREPARE_RRA_QUERY } from '../modules/local/gpt_prepare_query'

// nf-core modules
include { FASTQC } from '../modules/nf-core/fastqc/main'
include { CUTADAPT as CUTADAPT_THREE_PRIME } from '../modules/nf-core/cutadapt/main'
Expand All @@ -35,6 +40,7 @@ include { BOWTIE2_ALIGN } from '../modules/nf-cor
// Local subworkflows
include { INITIALISATION_CHANNEL_CREATION_SCREENING } from '../subworkflows/local/utils_nfcore_crisprseq_pipeline'
// Functions
include { gptPromptForText } from 'plugin/nf-gpt'
include { paramsSummaryMap } from 'plugin/nf-validation'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
Expand Down Expand Up @@ -357,6 +363,109 @@ workflow CRISPRSEQ_SCREENING {

}

//
// Calling of nf-gpt plugin on drugZ, MAGeCK MLE, BAGEL2 or MAGeCK RRA results
//
// Guard against the default (null) value before splitting: calling split()
// on null would fail every run that does not set --gpt_interpretation.
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('drugz')) {
    if(params.drugz) {
        // Strip the meta map; only the per-gene results table is needed
        def gpt_drugz_data = DRUGZ.out.per_gene_results.map { meta, genes -> genes }
        def gpt_drugz_source = "drugZ"
        // NOTE(review): ranks by highest pval_supp — confirm this selects the intended genes
        def gpt_drugz_target_column = "pval_supp"
        def gpt_drugz_mode = "high"
        GPT_PREPARE_DRUGZ_QUERY(
            gpt_drugz_data,
            gpt_drugz_source,
            gpt_drugz_target_column,
            params.gpt_drugz_gene_amount,
            gpt_drugz_mode,
            params.gpt_drugz_question
        )

        // Send the prepared query text to the GPT model and save the answer
        GPT_PREPARE_DRUGZ_QUERY.out.query.map {
            it -> it.text
        }
        .collect()
        .flatMap { it -> gptPromptForText(it[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_drugz_output.txt", newLine: true, sort: false )
    } else {
        error "You specified DrugZ for gpt interpretation, but DrugZ is not running."
    }
}
// Guard against the default (null) value before splitting: calling split()
// on null would fail every run that does not set --gpt_interpretation.
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('mle')) {
    if(params.mle) {
        // Strip the meta map; only the gene summary table is needed
        def gpt_mle_data = MAGECK_MLE.out.gene_summary.map { meta, genes -> genes }
        def gpt_mle_source = "mle"
        // NOTE(review): column name assumes a 'control_vs_treatment' contrast and
        // ranks by highest p-value — confirm both against the actual MLE output
        def gpt_mle_target_column = "control_vs_treatment|p-value"
        def gpt_mle_mode = "high"
        GPT_PREPARE_MLE_QUERY(
            gpt_mle_data,
            gpt_mle_source,
            gpt_mle_target_column,
            params.gpt_mle_gene_amount,
            gpt_mle_mode,
            params.gpt_mle_question
        )

        // Send the prepared query text to the GPT model and save the answer
        GPT_PREPARE_MLE_QUERY.out.query.map {
            it -> it.text
        }
        .collect()
        .flatMap { it -> gptPromptForText(it[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_mle_output.txt", newLine: true, sort: false )
    } else {
        error "You specified MAGeCK MLE for gpt interpretation, but MAGeCK MLE is not running."
    }
}
// Guard against the default (null) value before splitting: calling split()
// on null would fail every run that does not set --gpt_interpretation.
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('bagel2')) {
    if(params.bagel2) {
        // Strip the meta map; only the Bayes-factor table is needed
        def gpt_bagel2_data = BAGEL2_BF.out.bf.map { meta, genes -> genes }
        def gpt_bagel2_source = "bagel2"
        // Highest Bayes factor first
        def gpt_bagel2_target_column = "BF"
        def gpt_bagel2_mode = "high"
        GPT_PREPARE_BAGEL2_QUERY(
            gpt_bagel2_data,
            gpt_bagel2_source,
            gpt_bagel2_target_column,
            params.gpt_bagel2_gene_amount,
            gpt_bagel2_mode,
            params.gpt_bagel_question
        )

        // Send the prepared query text to the GPT model and save the answer
        GPT_PREPARE_BAGEL2_QUERY.out.query.map {
            it -> it.text
        }
        .collect()
        .flatMap { it -> gptPromptForText(it[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_bagel2_output.txt", newLine: true, sort: false )
    } else {
        error "You specified BAGEL2 for gpt interpretation, but BAGEL2 is not running."
    }
}
// Guard against the default (null) value before splitting: calling split()
// on null would fail every run that does not set --gpt_interpretation.
if(params.gpt_interpretation && params.gpt_interpretation.split(',').contains('rra')) {
    if(params.rra) {
        // Strip the meta map; only the gene summary table is needed
        def gpt_rra_data = MAGECK_TEST.out.gene_summary.map { meta, genes -> genes }
        def gpt_rra_source = "rra"
        // Best (lowest) negative-selection rank first
        def gpt_rra_target_column = "neg|rank"
        def gpt_rra_mode = "low"
        GPT_PREPARE_RRA_QUERY(
            gpt_rra_data,
            gpt_rra_source,
            gpt_rra_target_column,
            params.gpt_rra_gene_amount,
            gpt_rra_mode,
            params.gpt_rra_question
        )

        // Send the prepared query text to the GPT model and save the answer
        GPT_PREPARE_RRA_QUERY.out.query.map {
            it -> it.text
        }
        .collect()
        .flatMap { it -> gptPromptForText(it[0]) }
        .collectFile( name: "${params.outdir}/gpt/gpt_rra_output.txt", newLine: true, sort: false )
    } else {
        error "You specified MAGeCK RRA for gpt interpretation, but MAGeCK RRA is not running."
    }
}

if(params.mle && params.bagel2) {
ch_venndiagram = BAGEL2_PR.out.pr.join(MAGECK_MLE.out.gene_summary)
Expand Down