Add package content

PhyloSofS-Team · Jan 15, 2019 · 7ea04e6 · 7ea04e6
1 parent 9cc3472
commit 7ea04e6
Show file tree

Hide file tree

Showing 10 changed files with 1,725 additions and 2 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2019 MASSIV
+Copyright (c) 2019, Diego Javier Zea, Hugues Richard, Elodie Laine
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/exonhomology/subexons/subexon_alignment.py b/exonhomology/subexons/subexon_alignment.py
@@ -0,0 +1,375 @@
+"""
+subexon_alignment: Module to create the subexon MSA with MAFFT.
+===============================================================
+
+This module creates a MSA of subexons using MAFFT.
+"""
+
+import subprocess
+import tempfile
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn
+from Bio import AlignIO
+from collections import OrderedDict
+
+# List from `orthokeep` in `EnsemblRESTTranscriptQueries.py`
+# Order from NCBI Taxonomy/CommonTree, but using Human as 1
+sp_order = {
+    'homo_sapiens': 1,
+    'mus_musculus': 6,
+    'macaca_mulatta': 3,
+    'danio_rerio': 12,
+    'xenopus_tropicalis': 11,
+    'caenorhabditis_elegans': 14,
+    'gallus_gallus': 10,
+    'rattus_norvegicus': 5,
+    'bos_taurus': 7,
+    'monodelphis_domestica': 4,
+    'ornithorhynchus_anatinus': 9,
+    'drosophila_melanogaster': 13,
+    'gorilla_gorilla': 2,
+    'sus_scrofa': 8
+}
+
+
+def _create_subexon_index(subexon_table):
+    """
+    Return a pandas' DataFrame with subexon information.
+    """
+
+    # NOTE : Subexon ID is the same for subexons with the same sequence
+    # taking phases into account. Being more specific with the subset
+    # columns may cause duplicated subexons in the chimeric sequence.
+    subset_columns = ['Subexon ID', 'Gene stable ID']
+    unique_subexons = subexon_table.drop_duplicates(subset=subset_columns)
+
+    unique_subexons['Order'] = [
+        row['Subexon genomic coding start']
+        if row['Strand'] == 1 else (-1 * row['Subexon genomic coding end'])
+        for _, row in unique_subexons.iterrows()
+    ]
+
+    unique_subexons = unique_subexons.sort_values(by=['Order'])
+
+    unique_subexons = unique_subexons.loc[:, [
+        'Subexon ID', 'Subexon genomic coding start',
+        'Subexon genomic coding end', 'Exon protein sequence',
+        'Subexon rank in transcript'
+    ]]
+
+    unique_subexons['Subexon Index'] = list(range(0, unique_subexons.shape[0]))
+    output = subexon_table.merge(unique_subexons)
+
+    return output
+
+
+def _create_transcript_index(subexon_table):
+    """
+    Return a pandas' DataFrame with the gene and transcript ids.
+    """
+    transcript_id_columns = ['Gene stable ID', 'Transcript stable ID']
+    unique_transcripts = subexon_table.drop_duplicates(
+        subset=transcript_id_columns)
+
+    unique_transcripts = unique_transcripts.loc[:, transcript_id_columns]
+
+    unique_transcripts['Transcript Index'] = list(range(0,
+                                                   unique_transcripts.shape[0]))
+    output = subexon_table.merge(unique_transcripts)
+
+    return output
+
+
+def create_subexon_matrix(subexon_table):
+    """
+    Return a binary matrix showing subexon presence in transcripts.
+    """
+
+    # _create_subexon_index and _create_transcript_index change
+    # subexon_table with the last merge before return:
+    subexon_table = _create_subexon_index(subexon_table)
+    subexon_table = _create_transcript_index(subexon_table)
+
+    subexon_table = subexon_table.sort_values(
+        by=['Transcript Index', 'Subexon Index'])
+
+    n_subexons = len(subexon_table['Subexon Index'].unique())
+    n_transcripts = len(subexon_table['Transcript Index'].unique())
+
+    subexon_matrix = np.zeros((n_transcripts, n_subexons), dtype=np.bool)
+
+    for _, row in subexon_table.iterrows():
+        subexon_matrix[row['Transcript Index'], row['Subexon Index']] = True
+
+    return subexon_table, subexon_matrix
+
+
+def _get_sequence(subexon_seqs,
+                  subexon_column,
+                  padding,
+                  sequence_column='Exon protein sequence'):
+    """
+    Return the sequence of the subexon as a string.
+
+    This function takes the subexon_seqs pandas' DataFrame as input, that has
+    'Subexon Index' as the DataFrame index.
+
+    This replaces termination codons (*) with the padding.
+    """
+    seq = str(subexon_seqs.loc[subexon_column, sequence_column])
+    return seq.replace('*', padding)
+
+
+def _does_it_need_padding(transcript_matrix, col_index):
+    """
+    Return True if the column doesn't share any transcript with the previous.
+    """
+    answer = not np.any(
+        np.logical_and(transcript_matrix[:, col_index - 1],
+                       transcript_matrix[:, col_index]))
+    return answer
+
+
+def create_chimeric_sequences(subexon_table,
+                              subexon_matrix,
+                              padding='XXXXXXXXXX'):
+    """
+    Create chimeric sequence for MAFFT.
+
+    It returns a Dict from 'Gene stable ID' to a tuple with the chimeric
+    sequence and a Dict from 'Subexon Index' to
+    """
+    chimerics = {}
+    for gene_id, gene_df in subexon_table.groupby('Gene stable ID'):
+
+        # DataFrame to get a subexon sequence using its 'Subexon Index'
+        subexon_seq_columns = ['Subexon Index', 'Exon protein sequence']
+        subexon_seqs = gene_df.loc[:, subexon_seq_columns]
+        # NOTE: It make copies to not delete subexons inplace:
+        subexon_seqs = subexon_seqs.drop_duplicates(subset=subexon_seq_columns)
+        subexon_seqs = subexon_seqs.set_index('Subexon Index')
+        subexon_seqs = subexon_seqs.sort_index()
+
+        transcript_index = sorted(gene_df['Transcript Index'].unique())
+
+        subexon_index = gene_df['Subexon Index'].unique()
+        subexon_index.sort()
+
+        transcript_matrix = subexon_matrix[transcript_index,:]
+        transcript_matrix = transcript_matrix[:, subexon_index]
+
+        subexon = subexon_index[0]
+        chimeric = _get_sequence(subexon_seqs, subexon, padding)
+        breaks = {subexon: len(chimeric)}
+        for col_index in range(1, len(subexon_index)):
+            subexon = subexon_index[col_index]
+            chimeric += _get_sequence(subexon_seqs, subexon, padding)
+            if _does_it_need_padding(
+                    transcript_matrix,
+                    col_index) and chimeric[-len(padding):] != padding:
+                chimeric += padding
+
+            breaks.update({subexon: len(chimeric)})
+
+        chimerics[gene_id] = (chimeric, breaks)
+
+    return chimerics
+
+
+def _print_temporal_fasta(chimerics):
+    """
+    Save the chimeric sequences in a temporal fasta file and return its name.
+    """
+    with tempfile.NamedTemporaryFile(
+            suffix='.fasta', delete=False) as tmp_fasta:
+        for (key, value) in chimerics.items():
+            tmp_fasta.write('>%s\n%s\n' % (key, value[0]))
+
+    return tmp_fasta.name
+
+
+def run_mafft(chimerics,
+              output_path='alignment.fasta',
+              mafft_path='mafft',
+              mafft_arguments=None):
+    """
+    Run MAFFT in the chimeric sequences and return the output file.
+
+    mafft_arguments should be a list of str arguments for subprocess, e.g.:
+    ['--maxiterate', '100']
+    """
+    input_fasta = _print_temporal_fasta(chimerics)
+
+    command = [mafft_path, input_fasta]
+    if mafft_arguments is not None:
+        command.extend(mafft_arguments)
+
+    with open(output_path, 'w') as outfile:
+        process = subprocess.Popen(command, stdout=subprocess.PIPE)
+        for line in process.stdout:
+            outfile.write(line)
+        process.wait()
+
+    return output_path
+
+
+def sort_species(chimerics, transcript_info, species_order=sp_order):
+    """
+    Sort `chimerics` dict using `transcript_info` and `sp_order`
+    """
+    gene2sp = pd.Series(
+        transcript_info.Species.values,
+        index=transcript_info['Gene stable ID']).to_dict()
+    return OrderedDict(
+        sorted(list(chimerics.items()), key=lambda x: species_order[gene2sp[x[0]]]))
+
+
+def create_msa_matrix(chimerics, msa_file):  # pylint: disable=too-many-locals
+    """
+    Convert a msa_file from chimerics to a matrix.
+
+    Each cell has the subexon number (Index) or nan for gaps and padding.
+    """
+    msa = AlignIO.read(msa_file, 'fasta')
+    n_seq = len(msa[:, 0])
+    assert n_seq > 1, 'There are few sequences in %s' % msa_file
+    n_col = len(msa[0])
+    assert n_col > 1, 'There are few columns in %s' % msa_file
+
+    msa_matrix = np.zeros((n_seq, n_col))
+    msa_matrix.fill(np.nan)
+
+    gene_ids = []
+    for seq_index in range(0, n_seq):
+        record = msa[seq_index]
+        gene_ids.append(record.id)
+        subexon2len = chimerics[record.id][1]
+        subexons = sorted(subexon2len, key=subexon2len.get)
+        seq_len = 0
+        subexon_index = 0
+        for col_index in range(0, n_col):
+            residue = record.seq[col_index]
+            if residue != '-':
+                seq_len += 1
+
+            if seq_len > subexon2len[subexons[subexon_index]]:
+                subexon_index += 1
+
+            if residue in {'-', 'X'}:
+                value = np.nan
+            else:
+                value = subexons[subexon_index]
+
+            msa_matrix[seq_index, col_index] = value
+
+    return gene_ids, msa_matrix
+
+
+def plot_msa_subexons(gene_ids,
+                      msa_matrix,
+                      subexon_table=None,
+                      outfile='alignment.png'):
+    """
+    Save a plot from the msa matrix where cells are subexons.
+
+    If `subexon_table` is `None`, the binary plot with the constitutive
+    exons is not generated. Otherwise, it'll be in the `_constitutive.png`
+    file. The `subexon_table` also allows the coloring by Subexon ID for
+    the `_subexon_id.png` file.
+    """
+    # stack...: stop-seaborn-plotting-multiple-figures-on-top-of-one-another
+    plt.figure()
+
+    subexon_number = int(msa_matrix[~np.isnan(msa_matrix)].max()) + 1
+    cmap = seaborn.color_palette('tab10', subexon_number)
+
+    msa_matrix_df = pd.DataFrame(msa_matrix)
+    msa_matrix_df['Gene ID'] = gene_ids
+    msa_matrix_df.set_index('Gene ID', inplace=True)
+    sns_plot = seaborn.heatmap(msa_matrix_df, cmap=cmap)
+
+    # This depends on the seaborn version: seaborn-0.8.1
+    sns_plot.get_figure().savefig(outfile, bbox_inches='tight')
+    # stackoverflow: second-y-axis-label-getting-cut-off
+
+    if subexon_table is not None:
+        msa_matrix_df_copy = msa_matrix_df.copy()
+
+        id2fraction = pd.Series(
+            subexon_table['Transcript fraction'].values,
+            index=subexon_table['Subexon Index']).to_dict()
+
+        subexon_subdf = subexon_table.loc[:, [
+            'Number of transcripts for subexon', 'Transcripts in gene',
+            'Subexon Index', 'Subexon ID'
+        ]]
+
+        subexon_subdf = subexon_subdf.drop_duplicates().set_index(
+            'Subexon Index')
+
+        nrow, ncol = msa_matrix_df.shape
+
+        for i in range(0, nrow):
+            for j in range(0, ncol):
+                subexon_index = msa_matrix_df_copy.iloc[i, j]
+                if not np.isnan(subexon_index):
+                    subexon_index = int(subexon_index)
+                    fraction = id2fraction[subexon_index]
+                    if fraction == 1.0:
+                        msa_matrix_df_copy.iloc[i, j] = 1.0
+                    else:
+                        nt = subexon_subdf.loc[subexon_index][
+                            'Number of transcripts for subexon']
+                        n = subexon_subdf.loc[subexon_index][
+                            'Transcripts in gene']
+                        if n == 1:
+                            msa_matrix_df_copy.iloc[i, j] = 0.33
+                        elif n - 1 == nt:
+                            msa_matrix_df_copy.iloc[i, j] = 0.66
+                        else:
+                            msa_matrix_df_copy.iloc[i, j] = 0.0
+
+        plt.figure()
+        outfile_1 = outfile.replace('.png', '_constitutive.png')
+        sns_const_plot1 = seaborn.heatmap(msa_matrix_df_copy, cmap="RdYlGn")
+        sns_const_plot1.get_figure().savefig(outfile_1, bbox_inches='tight')
+
+        msa_matrix_df_copy = msa_matrix_df.copy()
+        for i in range(0, nrow):
+            for j in range(0, ncol):
+                subexon_index = msa_matrix_df_copy.iloc[i, j]
+                if not np.isnan(subexon_index):
+                    subexon_index = int(subexon_index)
+                    msa_matrix_df_copy.iloc[i, j] = subexon_subdf.loc[
+                        subexon_index]['Subexon ID']
+        plt.figure()
+        outfile2 = outfile.replace('.png', '_subexon_id.png')
+        sns_const_plot2 = seaborn.heatmap(msa_matrix_df_copy, cmap="Spectral")
+        sns_const_plot2.get_figure().savefig(outfile2, bbox_inches='tight')
+
+    return sns_plot
+
+
+def minimal_regions(msa_matrix):
+    """
+    Return minimal homologous regions (columns) in msa_matrix.
+    """
+    non_full_nan_columns = np.where(~np.all(np.isnan(msa_matrix), axis=0))[0]
+    msa_matrix = msa_matrix[:, non_full_nan_columns]
+    msa_matrix[np.isnan(msa_matrix)] = -1.0
+    n_seq, n_col = msa_matrix.shape
+
+    column = msa_matrix[:, 0]
+    clusters = [list(column)]
+    for col_index in range(1, n_col):
+        previous_column = msa_matrix[:, col_index - 1]
+        column = msa_matrix[:, col_index]
+        if np.any(column != previous_column) and not (
+                sum(column) == -1 * n_seq
+                or sum(previous_column) == -1 * n_seq):
+            clusters.append(list(column))
+
+    return clusters