# ScanExitron.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ===============================================================================
__version__ = "v1.3.1beta"
import sys
import os
import argparse
from pyfaidx import Fasta
import numpy as np
import subprocess
import shutil
import secrets
from collections import OrderedDict
from io import BytesIO
import configparser
from tempfile import TemporaryDirectory


def remove(infile):
    """Delete ``infile`` if it is an existing regular file; otherwise do nothing."""
    if not os.path.isfile(infile):
        return
    os.remove(infile)


def status_message(msg):
    """Print ``msg`` on standard output and flush the stream right away."""
    print(msg, flush=True)


def run_cmd(cmd, msg=None):
    """Run ``cmd`` through the shell and report progress via ``status_message``.

    Args:
        cmd: Shell command line; it is echoed before being executed.
        msg: Optional progress text. ``"begin,finish"`` prints ``begin`` before
            the command runs and ``finish`` after it succeeds. Text without a
            comma is used as the finish message only. ``None`` prints neither.

    Returns:
        ``(True, output)`` on success, where ``output`` is the captured
        stdout and stderr as bytes, or ``(False, None)`` when the command
        exits with a non-zero status (the error is printed, not raised).
    """
    status_message(cmd)
    finish = msg
    if msg is not None and "," in msg:
        # Split on the first comma only, so the finish text may contain commas.
        begin, finish = msg.split(",", 1)
        status_message(begin)
    try:
        result = subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT, stdin=subprocess.PIPE
        )
    except subprocess.CalledProcessError as err:
        status_message("Error happend!: {}\n{}".format(err, err.output))
        return False, None
    if finish is not None:
        status_message(finish)
    return True, result


def config_getter(config_file="config.ini"):
    """Load the hg19/hg38 reference, annotation and CDS paths from an INI file.

    ``config_file`` is looked up next to this script first (an absolute path
    is used as given) and then relative to the current working directory.
    Environment variables are supplied as parser defaults, so values in the
    file may refer to them through ``%(name)s`` interpolation.

    Args:
        config_file: File name or path of the INI file.

    Returns:
        dict with the keys ``hg38_ref``, ``hg19_ref``, ``hg38_anno``,
        ``hg19_anno``, ``hg38_cds`` and ``hg19_cds``.

    Raises:
        FileNotFoundError: if the file exists in neither location.
        configparser.Error: if a required section or option is missing.
    """
    this_dir = os.path.dirname(os.path.realpath(__file__))
    candidates = (os.path.join(this_dir, config_file), config_file)
    config_path = next((path for path in candidates if os.path.isfile(path)), None)
    if config_path is None:
        # config.read() silently skips missing files, which would otherwise
        # surface later as a confusing "no section" error.
        raise FileNotFoundError(
            "Config file not found: {}".format(" or ".join(candidates))
        )

    # ConfigParser lower-cases option names, so environment variables that
    # differ only by case (e.g. http_proxy / HTTP_PROXY) may be rejected as
    # duplicates by the strict parser. Keep the first occurrence of each.
    env_defaults = {}
    for key, value in os.environ.items():
        env_defaults.setdefault(key.lower(), value)

    config = configparser.ConfigParser(env_defaults)
    config.read(config_path)

    sections = (("fasta", "ref"), ("annotation", "anno"), ("cds", "cds"))
    return {
        f"{build}_{suffix}": config.get(section, build)
        for section, suffix in sections
        for build in ("hg38", "hg19")
    }


# Labels of the nuclear chromosomes without the "chr" prefix: 1..22, X, Y.
_nuclear_labels = [str(number) for number in range(1, 23)] + ["X", "Y"]

# "chr"-prefixed names of every chromosome except the mitochondrial one.
non_mito_chrms = {"chr" + label for label in _nuclear_labels}

# Every accepted "chr"-prefixed name: the nuclear set plus chrM.
chrms = non_mito_chrms | {"chrM"}

# Unprefixed name -> "chr"-prefixed name; the mitochondrial genome maps MT -> chrM.
chrms_dict = {label: "chr" + label for label in _nuclear_labels}
chrms_dict["MT"] = "chrM"

# Inverse lookup: "chr"-prefixed name -> unprefixed name.
reverse_chrms_dict = {prefixed: plain for plain, prefixed in chrms_dict.items()}


def BED_handler(inbed, tmp_dir):
    """Keep only canonical chromosomes and convert b37/b38 names to hg19/hg38.

    The filtered records are written to a randomly named file in ``tmp_dir``
    which then replaces ``inbed``. Lines whose first column is neither in
    ``chrms`` nor a key of ``chrms_dict`` are dropped.

    Args:
        inbed: Path of the tab-separated BED file to rewrite in place.
        tmp_dir: Directory for the intermediate file.

    Returns:
        Absolute path of the rewritten ``inbed``.
    """
    rnd_id = secrets.token_hex(16)
    tmp_file = f"{tmp_dir}/tmp.{rnd_id}.txt"
    # Both handles are managed by ``with`` so they are closed even if a line
    # fails to process; the input is opened first so that a missing input
    # does not leave an empty temporary file behind.
    with open(inbed, "r") as src, open(tmp_file, "w") as dst:
        for line in src:
            fields = line.rstrip("\n").split("\t")
            chrom = fields[0]
            if chrom in chrms_dict:
                fields[0] = chrms_dict[chrom]
            elif chrom not in chrms:
                continue
            dst.write("\t".join(fields) + "\n")
    shutil.move(tmp_file, inbed)
    return os.path.abspath(inbed)


def junction_caller(
    bam_file, ref="hg38", strand=1, out_name=None, config=None, tmp_dir=None
):
    """Call and annotate splice junctions with regtools.

    Runs ``regtools junctions extract`` on ``bam_file``, normalises the
    chromosome names of the resulting BED file with ``BED_handler`` and then
    runs ``regtools junctions annotate`` to produce ``<out_name>.janno``.
    If ``<out_name>.janno.done`` already exists the work is skipped.

    Args:
        bam_file: Input BAM file.
        ref: Reference build, ``"hg19"`` or ``"hg38"``.
        strand: Value passed to the ``-s`` option of ``regtools junctions extract``.
        out_name: Output prefix; defaults to the BAM file name without extension.
        config: Mapping with ``<ref>_ref`` and ``<ref>_anno`` entries.
        tmp_dir: Directory in which the intermediate BED file is written.

    Returns:
        The path ``<out_name>.janno`` on success, ``False`` if either regtools
        step fails. Exits with status 1 when ``config`` is empty or ``ref`` is
        not supported.
    """
    if not config:
        sys.stderr.write("No config file was found!\n")
        sys.exit(1)
    if ref not in ("hg19", "hg38"):
        # Previously an unknown build left ``fasta``/``gtf`` undefined.
        sys.stderr.write(f"Unsupported reference: {ref}\n")
        sys.exit(1)
    fasta = config[f"{ref}_ref"]
    gtf = config[f"{ref}_anno"]

    prefix = os.path.splitext(os.path.basename(bam_file))[0]

    if not out_name:
        out_name = prefix
    janno = f"{out_name}.janno"

    if os.path.exists(f"{janno}.done"):
        status_message(f"{out_name}.janno found, skip junction identification.\n")
        return janno

    cmd = f"regtools junctions extract -s {strand} -i 5 -I 10000000 {bam_file} -o {tmp_dir}/{prefix}.bed"

    bed_flag, _ = run_cmd(cmd, "Calling junctions start,Calling junctions finished!")
    if not bed_flag:
        # Without the extracted BED there is nothing to annotate.
        return False
    bed = BED_handler(f"{tmp_dir}/{prefix}.bed", tmp_dir)

    cmd = f"regtools junctions annotate {bed} {fasta} {gtf} -o {janno}"

    janno_flag, _ = run_cmd(cmd, f"{out_name}.janno generated!")
    if not janno_flag:
        return False
    status_message("{}.janno generated!".format(out_name))
    os.remove(bed)
    done_file(janno)
    return janno


def junction_overlap_CDS_to_position_BED(
    janno, ao_cutoff=3, ref="hg38", tmp_dir=None, config=None
):
    """Intersect junctions with annotated CDS to search for exitrons.

    Steps:
      1. From ``janno`` keep junctions whose column 11 is ``"N"``, whose strand
         is ``+`` or ``-`` and whose splice-site dinucleotides (read from the
         reference genome) are GT-AG, GC-AG or AT-AC.
      2. Intersect them strand-specifically with the CDS annotation using
         ``bedtools intersect -s -wo``.
      3. Keep junctions lying strictly inside one CDS interval, on a
         non-mitochondrial chromosome, with at least ``ao_cutoff`` reads.
      4. Write the kept junctions to ``<janno basename>.src`` and the 1-bp
         intervals at their start, end and midpoint to
         ``<janno basename>.position.bed`` (both in the working directory).

    Args:
        janno: Junction annotation file produced by ``junction_caller``.
        ao_cutoff: Minimum number of junction-supporting reads.
        ref: Reference build, ``"hg19"`` or ``"hg38"``.
        tmp_dir: Directory for intermediate files.
        config: Mapping with ``<ref>_cds`` and ``<ref>_ref`` entries.

    Returns:
        ``(src_exitron_file, position_bed_file)``, or ``(None, None)`` when no
        exitron is found or the intersection step fails. Exits with status 1
        when ``config`` is empty or ``ref`` is not supported.
    """
    if not config:
        sys.stderr.write("No config file was found!\n")
        sys.exit(1)
    if ref not in ("hg19", "hg38"):
        # Previously an unknown build left ``cds`` undefined.
        sys.stderr.write(f"Unsupported reference: {ref}\n")
        sys.exit(1)
    cds = config[f"{ref}_cds"]

    genome_seq = seq_dict(ref=ref, config=config)

    print("Reading {}".format(janno))

    # write all the novel junctions with canonical splicing sites to file (junction.bed)
    rnd_id = secrets.token_hex(16)
    junction_bed = f"{tmp_dir}/{rnd_id}.junction.bed"
    total_junctions = 0
    with open(janno, "r") as f, open(junction_bed, "w") as out:
        f.readline()  # the first line is discarded (presumably the column header)
        for line in f:
            l = line.rstrip().split("\t")
            # Column 5 is summed over every junction, not only the kept ones.
            total_junctions += int(l[4])
            chrm = l[0]
            start = int(l[1])
            end = int(l[2])
            stats = l[10]
            strand = l[5]
            if stats != "N":
                continue
            if strand == "+":
                left_site = genome_seq[chrm][start : start + 2].seq
                right_site = genome_seq[chrm][end - 3 : end - 1].seq
            elif strand == "-":
                left_site = genome_seq[chrm][end - 3 : end - 1].reverse.complement.seq
                right_site = genome_seq[chrm][
                    start : start + 2
                ].reverse.complement.seq
            else:
                # "?" or any other strand: the splice site cannot be oriented.
                continue
            l[6] = "{}-{}".format(left_site, right_site)
            if l[6].upper() in {"GT-AG", "GC-AG", "AT-AC"}:
                out.write("{}\n".format("\t".join(l[:7])))

    overlap_file = f"{tmp_dir}/{rnd_id}.overlap.bed"
    cmd = f"bedtools intersect -s -wo -a {junction_bed} -b {cds} > {overlap_file}"

    intersect_ok, _ = run_cmd(
        cmd, "Junctions intersect with CDS,Junctions intersect with CDS finished!"
    )

    # The intersection failed or produced no file: nothing can be reported.
    if not intersect_ok or not os.path.isfile(overlap_file):
        remove(overlap_file)
        remove(junction_bed)
        sys.stderr.write(
            "Intersecting {} with CDS failed; no exitron reported.\n".format(janno)
        )
        return None, None

    # no overlap in CDS and junctions file
    if os.path.getsize(overlap_file) == 0:
        remove(overlap_file)
        remove(junction_bed)
        print("No overlaps found in {} and gencode CDS".format(janno))
        return None, None

    # overlaps found in CDS and junctions file
    exitrons = OrderedDict()
    with open(overlap_file) as f:
        for line in f:
            l = line.rstrip().split("\t")
            # Columns 1-7 are the junction record written above; the columns
            # after them come from the CDS file and the last one is the
            # overlap length appended by ``-wo``.
            chrm = l[0]
            length = int(l[-1])
            junc_start = int(l[1])
            junc_end = int(l[2])
            junc_id = l[3]
            junc_read_no = l[4]
            strand = l[5]
            splice_site = l[6]
            ref_start = int(l[8])
            ref_end = int(l[9])
            gene_name = l[11]
            gene_id = l[10]
            pos_key = "{}:{}-{}".format(chrm, junc_start, junc_end)
            if (
                length == junc_end - junc_start
                and junc_start > ref_start
                and junc_end < ref_end
                and chrm in non_mito_chrms
                and int(junc_read_no) >= ao_cutoff
                and pos_key not in exitrons
            ):
                # Only the first CDS hit of each junction is kept.
                exitrons[pos_key] = (
                    chrm,
                    junc_start,
                    junc_end,
                    junc_id,
                    junc_read_no,
                    strand,
                    gene_name,
                    length - 1,
                    splice_site,
                    gene_id,
                    total_junctions,
                )
    remove(overlap_file)
    remove(junction_bed)

    if not exitrons:
        print("No exitron found in {}".format(janno))
        return None, None

    # exitrons found
    base = os.path.splitext(os.path.basename(janno))[0]
    position_bed_file = base + ".position.bed"
    src_exitron_file = base + ".src"

    position_set = set()
    with open(position_bed_file, "w") as out, open(src_exitron_file, "w") as output:
        for record in exitrons.values():
            output.write("\t".join(str(field) for field in record) + "\n")

            chrm, start, end = record[:3]
            middle_point = int(np.median([start, end]))

            # One 1-bp interval per distinct position (start, end, midpoint).
            for pos in (start, end, middle_point):
                key = "{}\t{}".format(chrm, pos)
                if key not in position_set:
                    out.write("{}\t{}\t{}\n".format(chrm, pos - 1, pos))
                    position_set.add(key)

    return src_exitron_file, position_bed_file


def percent_spliced_out(
    bam_file, src_exitron_file, position_bed_file, ao_cutoff, pso_cutoff, mapq, out
):
    """Compute PSO/PSI for each candidate exitron and write the passing ones.

    Read depth at the start, end and midpoint of every junction is obtained
    with ``samtools bedcov`` on the 1-bp intervals in ``position_bed_file``.
    For each record of ``src_exitron_file``::

        ave_dp = mean(depth - ao at the 5' end, 3' end and midpoint)
        pso    = ao / (ave_dp + ao)
        psi    = 1 - pso rounded to 3 significant digits
        dp     = int(ao / pso)

    Records with ``ao >= ao_cutoff`` and ``pso >= pso_cutoff`` are written to
    ``out`` as the source columns with ``pso``, ``psi`` and ``dp`` inserted
    before the last column. Records whose 5' or 3' depth minus ``ao`` is
    negative are skipped.

    Side effects: removes ``src_exitron_file`` and ``position_bed_file`` and
    closes ``out``.

    Args:
        bam_file: BAM file whose coverage is measured.
        src_exitron_file: Tab-separated candidate exitrons.
        position_bed_file: 1-bp intervals to measure.
        ao_cutoff: Minimum number of junction-supporting reads.
        pso_cutoff: Minimum PSO.
        mapq: Value passed to ``samtools bedcov -Q``.
        out: Writable text stream for the result rows.

    Raises:
        RuntimeError: if ``samtools bedcov`` fails.
    """
    print("Reading BAM file: {}".format(bam_file))
    cmd = "samtools bedcov {0} {1} -Q {2}".format(position_bed_file, bam_file, mapq)
    depth_flag, result = run_cmd(cmd, "Calculate PSO and PSI.")

    if not depth_flag:
        # Without depths every lookup below would fail with a bare KeyError.
        raise RuntimeError("samtools bedcov failed for {}".format(bam_file))

    depth_dict = {}
    for line in result.decode("utf-8").split("\n"):
        if line:
            chrm, _, pos, depth = line.rstrip().split()
            depth_dict["{}\t{}".format(chrm, pos)] = int(depth)

    with open(src_exitron_file) as f:
        for line in f:
            l = line.rstrip("\n").split("\t")
            chrm = l[0]
            start = int(l[1])
            end = int(l[2])
            ao = int(l[4])
            strand = l[5]
            middle_point = int(np.median([start, end]))

            # The 5' end is the start on the plus strand and the end on the minus strand.
            if strand == "+":
                five_pos, three_pos = start, end
            elif strand == "-":
                five_pos, three_pos = end, start
            else:
                continue

            five_prime_reads = depth_dict["{}\t{}".format(chrm, five_pos)] - ao
            three_prime_reads = depth_dict["{}\t{}".format(chrm, three_pos)] - ao
            middle_reads = depth_dict["{}\t{}".format(chrm, middle_point)] - ao
            if five_prime_reads < 0 or three_prime_reads < 0:
                continue
            ave_dp = (five_prime_reads + three_prime_reads + middle_reads) / 3.0
            try:
                pso = float(ao) / (ave_dp + ao)
            except ZeroDivisionError:
                print(f"Error in {chrm} {start} {end}")
                pso = 0
            psi = 1.0 - float("{:.3g}".format(pso))
            # A zero PSO would otherwise raise a second, unhandled ZeroDivisionError.
            dp = int(ao / pso) if pso else 0
            if ao >= ao_cutoff and pso >= pso_cutoff:
                out.write(
                    "{}\t{:.3g}\t{}\t{}\t{}\n".format(
                        "\t".join(l[:-1]), pso, psi, dp, l[-1]
                    )
                )
    os.remove(src_exitron_file)
    os.remove(position_bed_file)
    out.close()
    print("Finished reading BAM file: {}".format(bam_file))


def external_tool_checking():
    """Check that the required command-line tools are installed.

    Looks up ``regtools``, ``bedtools`` and ``samtools`` on ``PATH`` and prints
    where each one was found. If one is missing, an error is printed to stderr
    and the program exits with status 1 so callers can detect the failure.
    """
    for tool in ("regtools", "bedtools", "samtools"):
        # shutil.which avoids spawning the external ``which`` command.
        path = shutil.which(tool)
        if path is None:
            print(
                "Checking for '" + tool + "': ERROR - could not find '" + tool + "'",
                file=sys.stderr,
            )
            print("Exiting.", file=sys.stderr)
            sys.exit(1)
        print("Checking for '" + tool + "': found " + path)


def done_file(name):
    """Create the marker file ``<name>.done`` containing the text ``done!``.

    Other steps test for this marker to decide whether ``name`` was already
    produced and can be reused.
    """
    # ``with`` guarantees the handle is closed even if the write fails.
    with open(name + ".done", "w") as marker:
        marker.write("done!")


def MAPQ_filter(in_bam, threads=6, mapq=50):
    """Write ``<prefix>.hq.bam`` with the reads that pass ``samtools view -q mapq``.

    ``<prefix>`` is the base name of ``in_bam`` without its extension, so the
    output is created in the working directory and then indexed. When
    ``<prefix>.hq.bam.done`` already exists the filtering is skipped.

    Returns the output file name, or ``False`` if the samtools command fails.
    """
    stem = os.path.splitext(os.path.basename(in_bam))[0]
    hq_bam = f"{stem}.hq.bam"

    if os.path.exists(hq_bam + ".done"):
        status_message(f"{stem}.hq.bam found, skip MAPQ filtering!\n")
        return hq_bam

    cmd = (
        f"samtools view -q {mapq} -@ {threads} -O BAM -o {hq_bam} {in_bam}"
        f" && samtools index {hq_bam}"
    )
    succeeded, _ = run_cmd(cmd, "BAM filtering begins, BAM filtering finished.")
    if not succeeded:
        return False

    done_file(hq_bam)
    return hq_bam


def seq_dict(ref="hg38", config=None):
    """Open the reference genome FASTA for ``ref`` with pyfaidx.

    Args:
        ref: Reference build, ``"hg19"`` or ``"hg38"``.
        config: Mapping with ``hg19_ref`` / ``hg38_ref`` FASTA paths.

    Returns:
        A ``Fasta`` object created with ``sequence_always_upper=True``.
        Exits with status 1 when ``config`` is empty or ``ref`` is not
        supported.
    """
    if not config:
        sys.stderr.write("No config file was found!\n")
        sys.exit(1)
    if ref not in ("hg19", "hg38"):
        # Previously an unknown build left ``fasta`` undefined.
        sys.stderr.write(f"Unsupported reference: {ref}\n")
        sys.exit(1)

    return Fasta(config[f"{ref}_ref"], sequence_always_upper=True)


def parse_args():
    """Define the ScanExitron command-line options and parse ``sys.argv``.

    Returns the ``argparse.Namespace`` with the attributes ``input``, ``ao``,
    ``pso``, ``mapq``, ``strand``, ``threads``, ``config`` and ``ref``.
    """
    parser = argparse.ArgumentParser(
        description="%(prog)s -i input_rna_seq_bam_file -r [hg38/hg19] -m mapping_quality",
        epilog="ScanExitron: detecting exitron splicing events using RNA-Seq data",
    )

    # (flags, keyword arguments) in the order they appear in the help text.
    # The destination of each option is derived from its long flag.
    option_table = [
        (
            ("-i", "--input"),
            {"help": "Input BAM/CRAM file along with BAI/CRAI file", "required": True},
        ),
        (
            ("-a", "--ao"),
            {"type": int, "help": "AO cutoff (default: %(default)s)", "default": 3},
        ),
        (
            ("-p", "--pso"),
            {
                "type": float,
                "help": "PSO cutoff (default: %(default)s)",
                "default": 0.05,
            },
        ),
        (
            ("-m", "--mapq"),
            {
                "type": int,
                "help": "consider reads with MAPQ >= cutoff (default: %(default)s)",
                "default": 50,
            },
        ),
        (
            ("-s", "--strand"),
            {
                "type": int,
                "help": "Strand specificity of RNA library preparation (0 = unstranded, 1 = first-strand/RF, 2, = second-strand/FR) (default: %(default)s)",
                "default": 1,
            },
        ),
        (
            ("-t", "--threads"),
            {
                "type": int,
                "help": "number of threads (default: %(default)s)",
                "default": 1,
            },
        ),
        (
            ("-c", "--config"),
            {
                "type": str,
                "help": "config file (default: %(default)s)",
                "default": "config.ini",
            },
        ),
        (
            ("-r", "--ref"),
            {
                "help": "reference (default: %(default)s)",
                "choices": ["hg19", "hg38"],
                "default": "hg38",
            },
        ),
    ]
    for flags, settings in option_table:
        parser.add_argument(*flags, action="store", **settings)

    parser.add_argument(
        "-v", "--version", action="version", version="%(prog)s {}".format(__version__)
    )
    return parser.parse_args()


def main():
    """Run the ScanExitron pipeline for the BAM file given on the command line.

    Steps: check the external tools, parse the arguments, load the config,
    filter the BAM by MAPQ, call and annotate junctions on the filtered BAM,
    select exitron candidates inside CDS and compute PSO/PSI on the original
    input BAM. Results are written to ``<input basename>.exitron`` in the
    working directory; the header line is written even when no exitron is
    found.

    Exits with status 1 if MAPQ filtering or junction calling fails.
    """
    external_tool_checking()
    args = parse_args()
    config = config_getter(args.config)

    out_bam = MAPQ_filter(in_bam=args.input, threads=args.threads, mapq=args.mapq)

    prefix = os.path.splitext(os.path.basename(args.input))[0]
    outfile = prefix + ".exitron"

    # Context managers release the temporary directory and the output stream
    # on every exit path, including errors.
    with TemporaryDirectory() as tmp_dir_name, open(outfile, "w") as outstream:
        outstream.write(
            "chrom\tstart\tend\tname\tao\tstrand\tgene_symbol\tlength\tsplice_site\tgene_id\tpso\tpsi\tdp\ttotal_junctions\n"
        )

        if not out_bam:
            sys.stderr.write("MAPQ filtering failed for {}\n".format(args.input))
            sys.exit(1)

        janno_file = junction_caller(
            bam_file=out_bam,
            ref=args.ref,
            strand=args.strand,
            config=config,
            tmp_dir=tmp_dir_name,
        )
        if not janno_file:
            # junction_caller returns False on failure; passing that on would
            # end up in open(False), i.e. reading from file descriptor 0.
            sys.stderr.write("Junction calling failed for {}\n".format(out_bam))
            sys.exit(1)

        src_exitron_file, position_bed_file = junction_overlap_CDS_to_position_BED(
            janno_file,
            ao_cutoff=args.ao,
            ref=args.ref,
            tmp_dir=tmp_dir_name,
            config=config,
        )

        if src_exitron_file is not None and position_bed_file is not None:
            percent_spliced_out(
                bam_file=args.input,
                src_exitron_file=src_exitron_file,
                position_bed_file=position_bed_file,
                ao_cutoff=args.ao,
                pso_cutoff=args.pso,
                mapq=args.mapq,
                out=outstream,
            )


if __name__ == "__main__":
    # Script entry point: Ctrl-C ends the run with a short note on stderr and
    # exit status 1.
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt me ^_^ \n")
        raise SystemExit(1)