Skip to content

Commit

Permalink
Add package content
Browse files Browse the repository at this point in the history
  • Loading branch information
diegozea committed Jan 15, 2019
1 parent 9cc3472 commit 7ea04e6
Show file tree
Hide file tree
Showing 10 changed files with 1,725 additions and 2 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2019 MASSIV
Copyright (c) 2019, Diego Javier Zea, Hugues Richard, Elodie Laine

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
375 changes: 375 additions & 0 deletions exonhomology/subexons/subexon_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,375 @@
"""
subexon_alignment: Module to create the subexon MSA with MAFFT.
===============================================================
This module creates a MSA of subexons using MAFFT.
"""

import subprocess
import tempfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from Bio import AlignIO
from collections import OrderedDict

# List from `orthokeep` in `EnsemblRESTTranscriptQueries.py`
# Order from NCBI Taxonomy/CommonTree, but using Human as 1
sp_order = {
'homo_sapiens': 1,
'mus_musculus': 6,
'macaca_mulatta': 3,
'danio_rerio': 12,
'xenopus_tropicalis': 11,
'caenorhabditis_elegans': 14,
'gallus_gallus': 10,
'rattus_norvegicus': 5,
'bos_taurus': 7,
'monodelphis_domestica': 4,
'ornithorhynchus_anatinus': 9,
'drosophila_melanogaster': 13,
'gorilla_gorilla': 2,
'sus_scrofa': 8
}


def _create_subexon_index(subexon_table):
"""
Return a pandas' DataFrame with subexon information.
"""

# NOTE : Subexon ID is the same for subexons with the same sequence
# taking phases into account. Being more specific with the subset
# columns may cause duplicated subexons in the chimeric sequence.
subset_columns = ['Subexon ID', 'Gene stable ID']
unique_subexons = subexon_table.drop_duplicates(subset=subset_columns)

unique_subexons['Order'] = [
row['Subexon genomic coding start']
if row['Strand'] == 1 else (-1 * row['Subexon genomic coding end'])
for _, row in unique_subexons.iterrows()
]

unique_subexons = unique_subexons.sort_values(by=['Order'])

unique_subexons = unique_subexons.loc[:, [
'Subexon ID', 'Subexon genomic coding start',
'Subexon genomic coding end', 'Exon protein sequence',
'Subexon rank in transcript'
]]

unique_subexons['Subexon Index'] = list(range(0, unique_subexons.shape[0]))
output = subexon_table.merge(unique_subexons)

return output


def _create_transcript_index(subexon_table):
"""
Return a pandas' DataFrame with the gene and transcript ids.
"""
transcript_id_columns = ['Gene stable ID', 'Transcript stable ID']
unique_transcripts = subexon_table.drop_duplicates(
subset=transcript_id_columns)

unique_transcripts = unique_transcripts.loc[:, transcript_id_columns]

unique_transcripts['Transcript Index'] = list(range(0,
unique_transcripts.shape[0]))
output = subexon_table.merge(unique_transcripts)

return output


def create_subexon_matrix(subexon_table):
"""
Return a binary matrix showing subexon presence in transcripts.
"""

# _create_subexon_index and _create_transcript_index change
# subexon_table with the last merge before return:
subexon_table = _create_subexon_index(subexon_table)
subexon_table = _create_transcript_index(subexon_table)

subexon_table = subexon_table.sort_values(
by=['Transcript Index', 'Subexon Index'])

n_subexons = len(subexon_table['Subexon Index'].unique())
n_transcripts = len(subexon_table['Transcript Index'].unique())

subexon_matrix = np.zeros((n_transcripts, n_subexons), dtype=np.bool)

for _, row in subexon_table.iterrows():
subexon_matrix[row['Transcript Index'], row['Subexon Index']] = True

return subexon_table, subexon_matrix


def _get_sequence(subexon_seqs,
subexon_column,
padding,
sequence_column='Exon protein sequence'):
"""
Return the sequence of the subexon as a string.
This function takes the subexon_seqs pandas' DataFrame as input, that has
'Subexon Index' as the DataFrame index.
This replaces termination codons (*) with the padding.
"""
seq = str(subexon_seqs.loc[subexon_column, sequence_column])
return seq.replace('*', padding)


def _does_it_need_padding(transcript_matrix, col_index):
"""
Return True if the column doesn't share any transcript with the previous.
"""
answer = not np.any(
np.logical_and(transcript_matrix[:, col_index - 1],
transcript_matrix[:, col_index]))
return answer


def create_chimeric_sequences(subexon_table,
subexon_matrix,
padding='XXXXXXXXXX'):
"""
Create chimeric sequence for MAFFT.
It returns a Dict from 'Gene stable ID' to a tuple with the chimeric
sequence and a Dict from 'Subexon Index' to
"""
chimerics = {}
for gene_id, gene_df in subexon_table.groupby('Gene stable ID'):

# DataFrame to get a subexon sequence using its 'Subexon Index'
subexon_seq_columns = ['Subexon Index', 'Exon protein sequence']
subexon_seqs = gene_df.loc[:, subexon_seq_columns]
# NOTE: It make copies to not delete subexons inplace:
subexon_seqs = subexon_seqs.drop_duplicates(subset=subexon_seq_columns)
subexon_seqs = subexon_seqs.set_index('Subexon Index')
subexon_seqs = subexon_seqs.sort_index()

transcript_index = sorted(gene_df['Transcript Index'].unique())

subexon_index = gene_df['Subexon Index'].unique()
subexon_index.sort()

transcript_matrix = subexon_matrix[transcript_index,:]
transcript_matrix = transcript_matrix[:, subexon_index]

subexon = subexon_index[0]
chimeric = _get_sequence(subexon_seqs, subexon, padding)
breaks = {subexon: len(chimeric)}
for col_index in range(1, len(subexon_index)):
subexon = subexon_index[col_index]
chimeric += _get_sequence(subexon_seqs, subexon, padding)
if _does_it_need_padding(
transcript_matrix,
col_index) and chimeric[-len(padding):] != padding:
chimeric += padding

breaks.update({subexon: len(chimeric)})

chimerics[gene_id] = (chimeric, breaks)

return chimerics


def _print_temporal_fasta(chimerics):
"""
Save the chimeric sequences in a temporal fasta file and return its name.
"""
with tempfile.NamedTemporaryFile(
suffix='.fasta', delete=False) as tmp_fasta:
for (key, value) in chimerics.items():
tmp_fasta.write('>%s\n%s\n' % (key, value[0]))

return tmp_fasta.name


def run_mafft(chimerics,
output_path='alignment.fasta',
mafft_path='mafft',
mafft_arguments=None):
"""
Run MAFFT in the chimeric sequences and return the output file.
mafft_arguments should be a list of str arguments for subprocess, e.g.:
['--maxiterate', '100']
"""
input_fasta = _print_temporal_fasta(chimerics)

command = [mafft_path, input_fasta]
if mafft_arguments is not None:
command.extend(mafft_arguments)

with open(output_path, 'w') as outfile:
process = subprocess.Popen(command, stdout=subprocess.PIPE)
for line in process.stdout:
outfile.write(line)
process.wait()

return output_path


def sort_species(chimerics, transcript_info, species_order=sp_order):
"""
Sort `chimerics` dict using `transcript_info` and `sp_order`
"""
gene2sp = pd.Series(
transcript_info.Species.values,
index=transcript_info['Gene stable ID']).to_dict()
return OrderedDict(
sorted(list(chimerics.items()), key=lambda x: species_order[gene2sp[x[0]]]))


def create_msa_matrix(chimerics, msa_file): # pylint: disable=too-many-locals
"""
Convert a msa_file from chimerics to a matrix.
Each cell has the subexon number (Index) or nan for gaps and padding.
"""
msa = AlignIO.read(msa_file, 'fasta')
n_seq = len(msa[:, 0])
assert n_seq > 1, 'There are few sequences in %s' % msa_file
n_col = len(msa[0])
assert n_col > 1, 'There are few columns in %s' % msa_file

msa_matrix = np.zeros((n_seq, n_col))
msa_matrix.fill(np.nan)

gene_ids = []
for seq_index in range(0, n_seq):
record = msa[seq_index]
gene_ids.append(record.id)
subexon2len = chimerics[record.id][1]
subexons = sorted(subexon2len, key=subexon2len.get)
seq_len = 0
subexon_index = 0
for col_index in range(0, n_col):
residue = record.seq[col_index]
if residue != '-':
seq_len += 1

if seq_len > subexon2len[subexons[subexon_index]]:
subexon_index += 1

if residue in {'-', 'X'}:
value = np.nan
else:
value = subexons[subexon_index]

msa_matrix[seq_index, col_index] = value

return gene_ids, msa_matrix


def plot_msa_subexons(gene_ids,
msa_matrix,
subexon_table=None,
outfile='alignment.png'):
"""
Save a plot from the msa matrix where cells are subexons.
If `subexon_table` is `None`, the binary plot with the constitutive
exons is not generated. Otherwise, it'll be in the `_constitutive.png`
file. The `subexon_table` also allows the coloring by Subexon ID for
the `_subexon_id.png` file.
"""
# stack...: stop-seaborn-plotting-multiple-figures-on-top-of-one-another
plt.figure()

subexon_number = int(msa_matrix[~np.isnan(msa_matrix)].max()) + 1
cmap = seaborn.color_palette('tab10', subexon_number)

msa_matrix_df = pd.DataFrame(msa_matrix)
msa_matrix_df['Gene ID'] = gene_ids
msa_matrix_df.set_index('Gene ID', inplace=True)
sns_plot = seaborn.heatmap(msa_matrix_df, cmap=cmap)

# This depends on the seaborn version: seaborn-0.8.1
sns_plot.get_figure().savefig(outfile, bbox_inches='tight')
# stackoverflow: second-y-axis-label-getting-cut-off

if subexon_table is not None:
msa_matrix_df_copy = msa_matrix_df.copy()

id2fraction = pd.Series(
subexon_table['Transcript fraction'].values,
index=subexon_table['Subexon Index']).to_dict()

subexon_subdf = subexon_table.loc[:, [
'Number of transcripts for subexon', 'Transcripts in gene',
'Subexon Index', 'Subexon ID'
]]

subexon_subdf = subexon_subdf.drop_duplicates().set_index(
'Subexon Index')

nrow, ncol = msa_matrix_df.shape

for i in range(0, nrow):
for j in range(0, ncol):
subexon_index = msa_matrix_df_copy.iloc[i, j]
if not np.isnan(subexon_index):
subexon_index = int(subexon_index)
fraction = id2fraction[subexon_index]
if fraction == 1.0:
msa_matrix_df_copy.iloc[i, j] = 1.0
else:
nt = subexon_subdf.loc[subexon_index][
'Number of transcripts for subexon']
n = subexon_subdf.loc[subexon_index][
'Transcripts in gene']
if n == 1:
msa_matrix_df_copy.iloc[i, j] = 0.33
elif n - 1 == nt:
msa_matrix_df_copy.iloc[i, j] = 0.66
else:
msa_matrix_df_copy.iloc[i, j] = 0.0

plt.figure()
outfile_1 = outfile.replace('.png', '_constitutive.png')
sns_const_plot1 = seaborn.heatmap(msa_matrix_df_copy, cmap="RdYlGn")
sns_const_plot1.get_figure().savefig(outfile_1, bbox_inches='tight')

msa_matrix_df_copy = msa_matrix_df.copy()
for i in range(0, nrow):
for j in range(0, ncol):
subexon_index = msa_matrix_df_copy.iloc[i, j]
if not np.isnan(subexon_index):
subexon_index = int(subexon_index)
msa_matrix_df_copy.iloc[i, j] = subexon_subdf.loc[
subexon_index]['Subexon ID']
plt.figure()
outfile2 = outfile.replace('.png', '_subexon_id.png')
sns_const_plot2 = seaborn.heatmap(msa_matrix_df_copy, cmap="Spectral")
sns_const_plot2.get_figure().savefig(outfile2, bbox_inches='tight')

return sns_plot


def minimal_regions(msa_matrix):
"""
Return minimal homologous regions (columns) in msa_matrix.
"""
non_full_nan_columns = np.where(~np.all(np.isnan(msa_matrix), axis=0))[0]
msa_matrix = msa_matrix[:, non_full_nan_columns]
msa_matrix[np.isnan(msa_matrix)] = -1.0
n_seq, n_col = msa_matrix.shape

column = msa_matrix[:, 0]
clusters = [list(column)]
for col_index in range(1, n_col):
previous_column = msa_matrix[:, col_index - 1]
column = msa_matrix[:, col_index]
if np.any(column != previous_column) and not (
sum(column) == -1 * n_seq
or sum(previous_column) == -1 * n_seq):
clusters.append(list(column))

return clusters
Loading

0 comments on commit 7ea04e6

Please sign in to comment.