-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
1,725 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,375 @@ | ||
""" | ||
subexon_alignment: Module to create the subexon MSA with MAFFT. | ||
=============================================================== | ||
This module creates a MSA of subexons using MAFFT. | ||
""" | ||
|
||
import subprocess | ||
import tempfile | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import matplotlib.pyplot as plt | ||
import seaborn | ||
from Bio import AlignIO | ||
from collections import OrderedDict | ||
|
||
# List from `orthokeep` in `EnsemblRESTTranscriptQueries.py` | ||
# Order from NCBI Taxonomy/CommonTree, but using Human as 1 | ||
sp_order = { | ||
'homo_sapiens': 1, | ||
'mus_musculus': 6, | ||
'macaca_mulatta': 3, | ||
'danio_rerio': 12, | ||
'xenopus_tropicalis': 11, | ||
'caenorhabditis_elegans': 14, | ||
'gallus_gallus': 10, | ||
'rattus_norvegicus': 5, | ||
'bos_taurus': 7, | ||
'monodelphis_domestica': 4, | ||
'ornithorhynchus_anatinus': 9, | ||
'drosophila_melanogaster': 13, | ||
'gorilla_gorilla': 2, | ||
'sus_scrofa': 8 | ||
} | ||
|
||
|
||
def _create_subexon_index(subexon_table): | ||
""" | ||
Return a pandas' DataFrame with subexon information. | ||
""" | ||
|
||
# NOTE : Subexon ID is the same for subexons with the same sequence | ||
# taking phases into account. Being more specific with the subset | ||
# columns may cause duplicated subexons in the chimeric sequence. | ||
subset_columns = ['Subexon ID', 'Gene stable ID'] | ||
unique_subexons = subexon_table.drop_duplicates(subset=subset_columns) | ||
|
||
unique_subexons['Order'] = [ | ||
row['Subexon genomic coding start'] | ||
if row['Strand'] == 1 else (-1 * row['Subexon genomic coding end']) | ||
for _, row in unique_subexons.iterrows() | ||
] | ||
|
||
unique_subexons = unique_subexons.sort_values(by=['Order']) | ||
|
||
unique_subexons = unique_subexons.loc[:, [ | ||
'Subexon ID', 'Subexon genomic coding start', | ||
'Subexon genomic coding end', 'Exon protein sequence', | ||
'Subexon rank in transcript' | ||
]] | ||
|
||
unique_subexons['Subexon Index'] = list(range(0, unique_subexons.shape[0])) | ||
output = subexon_table.merge(unique_subexons) | ||
|
||
return output | ||
|
||
|
||
def _create_transcript_index(subexon_table): | ||
""" | ||
Return a pandas' DataFrame with the gene and transcript ids. | ||
""" | ||
transcript_id_columns = ['Gene stable ID', 'Transcript stable ID'] | ||
unique_transcripts = subexon_table.drop_duplicates( | ||
subset=transcript_id_columns) | ||
|
||
unique_transcripts = unique_transcripts.loc[:, transcript_id_columns] | ||
|
||
unique_transcripts['Transcript Index'] = list(range(0, | ||
unique_transcripts.shape[0])) | ||
output = subexon_table.merge(unique_transcripts) | ||
|
||
return output | ||
|
||
|
||
def create_subexon_matrix(subexon_table): | ||
""" | ||
Return a binary matrix showing subexon presence in transcripts. | ||
""" | ||
|
||
# _create_subexon_index and _create_transcript_index change | ||
# subexon_table with the last merge before return: | ||
subexon_table = _create_subexon_index(subexon_table) | ||
subexon_table = _create_transcript_index(subexon_table) | ||
|
||
subexon_table = subexon_table.sort_values( | ||
by=['Transcript Index', 'Subexon Index']) | ||
|
||
n_subexons = len(subexon_table['Subexon Index'].unique()) | ||
n_transcripts = len(subexon_table['Transcript Index'].unique()) | ||
|
||
subexon_matrix = np.zeros((n_transcripts, n_subexons), dtype=np.bool) | ||
|
||
for _, row in subexon_table.iterrows(): | ||
subexon_matrix[row['Transcript Index'], row['Subexon Index']] = True | ||
|
||
return subexon_table, subexon_matrix | ||
|
||
|
||
def _get_sequence(subexon_seqs, | ||
subexon_column, | ||
padding, | ||
sequence_column='Exon protein sequence'): | ||
""" | ||
Return the sequence of the subexon as a string. | ||
This function takes the subexon_seqs pandas' DataFrame as input, that has | ||
'Subexon Index' as the DataFrame index. | ||
This replaces termination codons (*) with the padding. | ||
""" | ||
seq = str(subexon_seqs.loc[subexon_column, sequence_column]) | ||
return seq.replace('*', padding) | ||
|
||
|
||
def _does_it_need_padding(transcript_matrix, col_index): | ||
""" | ||
Return True if the column doesn't share any transcript with the previous. | ||
""" | ||
answer = not np.any( | ||
np.logical_and(transcript_matrix[:, col_index - 1], | ||
transcript_matrix[:, col_index])) | ||
return answer | ||
|
||
|
||
def create_chimeric_sequences(subexon_table, | ||
subexon_matrix, | ||
padding='XXXXXXXXXX'): | ||
""" | ||
Create chimeric sequence for MAFFT. | ||
It returns a Dict from 'Gene stable ID' to a tuple with the chimeric | ||
sequence and a Dict from 'Subexon Index' to | ||
""" | ||
chimerics = {} | ||
for gene_id, gene_df in subexon_table.groupby('Gene stable ID'): | ||
|
||
# DataFrame to get a subexon sequence using its 'Subexon Index' | ||
subexon_seq_columns = ['Subexon Index', 'Exon protein sequence'] | ||
subexon_seqs = gene_df.loc[:, subexon_seq_columns] | ||
# NOTE: It make copies to not delete subexons inplace: | ||
subexon_seqs = subexon_seqs.drop_duplicates(subset=subexon_seq_columns) | ||
subexon_seqs = subexon_seqs.set_index('Subexon Index') | ||
subexon_seqs = subexon_seqs.sort_index() | ||
|
||
transcript_index = sorted(gene_df['Transcript Index'].unique()) | ||
|
||
subexon_index = gene_df['Subexon Index'].unique() | ||
subexon_index.sort() | ||
|
||
transcript_matrix = subexon_matrix[transcript_index,:] | ||
transcript_matrix = transcript_matrix[:, subexon_index] | ||
|
||
subexon = subexon_index[0] | ||
chimeric = _get_sequence(subexon_seqs, subexon, padding) | ||
breaks = {subexon: len(chimeric)} | ||
for col_index in range(1, len(subexon_index)): | ||
subexon = subexon_index[col_index] | ||
chimeric += _get_sequence(subexon_seqs, subexon, padding) | ||
if _does_it_need_padding( | ||
transcript_matrix, | ||
col_index) and chimeric[-len(padding):] != padding: | ||
chimeric += padding | ||
|
||
breaks.update({subexon: len(chimeric)}) | ||
|
||
chimerics[gene_id] = (chimeric, breaks) | ||
|
||
return chimerics | ||
|
||
|
||
def _print_temporal_fasta(chimerics): | ||
""" | ||
Save the chimeric sequences in a temporal fasta file and return its name. | ||
""" | ||
with tempfile.NamedTemporaryFile( | ||
suffix='.fasta', delete=False) as tmp_fasta: | ||
for (key, value) in chimerics.items(): | ||
tmp_fasta.write('>%s\n%s\n' % (key, value[0])) | ||
|
||
return tmp_fasta.name | ||
|
||
|
||
def run_mafft(chimerics, | ||
output_path='alignment.fasta', | ||
mafft_path='mafft', | ||
mafft_arguments=None): | ||
""" | ||
Run MAFFT in the chimeric sequences and return the output file. | ||
mafft_arguments should be a list of str arguments for subprocess, e.g.: | ||
['--maxiterate', '100'] | ||
""" | ||
input_fasta = _print_temporal_fasta(chimerics) | ||
|
||
command = [mafft_path, input_fasta] | ||
if mafft_arguments is not None: | ||
command.extend(mafft_arguments) | ||
|
||
with open(output_path, 'w') as outfile: | ||
process = subprocess.Popen(command, stdout=subprocess.PIPE) | ||
for line in process.stdout: | ||
outfile.write(line) | ||
process.wait() | ||
|
||
return output_path | ||
|
||
|
||
def sort_species(chimerics, transcript_info, species_order=sp_order): | ||
""" | ||
Sort `chimerics` dict using `transcript_info` and `sp_order` | ||
""" | ||
gene2sp = pd.Series( | ||
transcript_info.Species.values, | ||
index=transcript_info['Gene stable ID']).to_dict() | ||
return OrderedDict( | ||
sorted(list(chimerics.items()), key=lambda x: species_order[gene2sp[x[0]]])) | ||
|
||
|
||
def create_msa_matrix(chimerics, msa_file): # pylint: disable=too-many-locals | ||
""" | ||
Convert a msa_file from chimerics to a matrix. | ||
Each cell has the subexon number (Index) or nan for gaps and padding. | ||
""" | ||
msa = AlignIO.read(msa_file, 'fasta') | ||
n_seq = len(msa[:, 0]) | ||
assert n_seq > 1, 'There are few sequences in %s' % msa_file | ||
n_col = len(msa[0]) | ||
assert n_col > 1, 'There are few columns in %s' % msa_file | ||
|
||
msa_matrix = np.zeros((n_seq, n_col)) | ||
msa_matrix.fill(np.nan) | ||
|
||
gene_ids = [] | ||
for seq_index in range(0, n_seq): | ||
record = msa[seq_index] | ||
gene_ids.append(record.id) | ||
subexon2len = chimerics[record.id][1] | ||
subexons = sorted(subexon2len, key=subexon2len.get) | ||
seq_len = 0 | ||
subexon_index = 0 | ||
for col_index in range(0, n_col): | ||
residue = record.seq[col_index] | ||
if residue != '-': | ||
seq_len += 1 | ||
|
||
if seq_len > subexon2len[subexons[subexon_index]]: | ||
subexon_index += 1 | ||
|
||
if residue in {'-', 'X'}: | ||
value = np.nan | ||
else: | ||
value = subexons[subexon_index] | ||
|
||
msa_matrix[seq_index, col_index] = value | ||
|
||
return gene_ids, msa_matrix | ||
|
||
|
||
def plot_msa_subexons(gene_ids, | ||
msa_matrix, | ||
subexon_table=None, | ||
outfile='alignment.png'): | ||
""" | ||
Save a plot from the msa matrix where cells are subexons. | ||
If `subexon_table` is `None`, the binary plot with the constitutive | ||
exons is not generated. Otherwise, it'll be in the `_constitutive.png` | ||
file. The `subexon_table` also allows the coloring by Subexon ID for | ||
the `_subexon_id.png` file. | ||
""" | ||
# stack...: stop-seaborn-plotting-multiple-figures-on-top-of-one-another | ||
plt.figure() | ||
|
||
subexon_number = int(msa_matrix[~np.isnan(msa_matrix)].max()) + 1 | ||
cmap = seaborn.color_palette('tab10', subexon_number) | ||
|
||
msa_matrix_df = pd.DataFrame(msa_matrix) | ||
msa_matrix_df['Gene ID'] = gene_ids | ||
msa_matrix_df.set_index('Gene ID', inplace=True) | ||
sns_plot = seaborn.heatmap(msa_matrix_df, cmap=cmap) | ||
|
||
# This depends on the seaborn version: seaborn-0.8.1 | ||
sns_plot.get_figure().savefig(outfile, bbox_inches='tight') | ||
# stackoverflow: second-y-axis-label-getting-cut-off | ||
|
||
if subexon_table is not None: | ||
msa_matrix_df_copy = msa_matrix_df.copy() | ||
|
||
id2fraction = pd.Series( | ||
subexon_table['Transcript fraction'].values, | ||
index=subexon_table['Subexon Index']).to_dict() | ||
|
||
subexon_subdf = subexon_table.loc[:, [ | ||
'Number of transcripts for subexon', 'Transcripts in gene', | ||
'Subexon Index', 'Subexon ID' | ||
]] | ||
|
||
subexon_subdf = subexon_subdf.drop_duplicates().set_index( | ||
'Subexon Index') | ||
|
||
nrow, ncol = msa_matrix_df.shape | ||
|
||
for i in range(0, nrow): | ||
for j in range(0, ncol): | ||
subexon_index = msa_matrix_df_copy.iloc[i, j] | ||
if not np.isnan(subexon_index): | ||
subexon_index = int(subexon_index) | ||
fraction = id2fraction[subexon_index] | ||
if fraction == 1.0: | ||
msa_matrix_df_copy.iloc[i, j] = 1.0 | ||
else: | ||
nt = subexon_subdf.loc[subexon_index][ | ||
'Number of transcripts for subexon'] | ||
n = subexon_subdf.loc[subexon_index][ | ||
'Transcripts in gene'] | ||
if n == 1: | ||
msa_matrix_df_copy.iloc[i, j] = 0.33 | ||
elif n - 1 == nt: | ||
msa_matrix_df_copy.iloc[i, j] = 0.66 | ||
else: | ||
msa_matrix_df_copy.iloc[i, j] = 0.0 | ||
|
||
plt.figure() | ||
outfile_1 = outfile.replace('.png', '_constitutive.png') | ||
sns_const_plot1 = seaborn.heatmap(msa_matrix_df_copy, cmap="RdYlGn") | ||
sns_const_plot1.get_figure().savefig(outfile_1, bbox_inches='tight') | ||
|
||
msa_matrix_df_copy = msa_matrix_df.copy() | ||
for i in range(0, nrow): | ||
for j in range(0, ncol): | ||
subexon_index = msa_matrix_df_copy.iloc[i, j] | ||
if not np.isnan(subexon_index): | ||
subexon_index = int(subexon_index) | ||
msa_matrix_df_copy.iloc[i, j] = subexon_subdf.loc[ | ||
subexon_index]['Subexon ID'] | ||
plt.figure() | ||
outfile2 = outfile.replace('.png', '_subexon_id.png') | ||
sns_const_plot2 = seaborn.heatmap(msa_matrix_df_copy, cmap="Spectral") | ||
sns_const_plot2.get_figure().savefig(outfile2, bbox_inches='tight') | ||
|
||
return sns_plot | ||
|
||
|
||
def minimal_regions(msa_matrix): | ||
""" | ||
Return minimal homologous regions (columns) in msa_matrix. | ||
""" | ||
non_full_nan_columns = np.where(~np.all(np.isnan(msa_matrix), axis=0))[0] | ||
msa_matrix = msa_matrix[:, non_full_nan_columns] | ||
msa_matrix[np.isnan(msa_matrix)] = -1.0 | ||
n_seq, n_col = msa_matrix.shape | ||
|
||
column = msa_matrix[:, 0] | ||
clusters = [list(column)] | ||
for col_index in range(1, n_col): | ||
previous_column = msa_matrix[:, col_index - 1] | ||
column = msa_matrix[:, col_index] | ||
if np.any(column != previous_column) and not ( | ||
sum(column) == -1 * n_seq | ||
or sum(previous_column) == -1 * n_seq): | ||
clusters.append(list(column)) | ||
|
||
return clusters |
Oops, something went wrong.