Repo jbei org changes #5

Open · wants to merge 6 commits into base: master
Changes from 5 commits
22 changes: 0 additions & 22 deletions diva_seq_opt/__init__.py
@@ -1,22 +0,0 @@
import matplotlib.pyplot as plt
import numpy as np

def predict_loading_concentration(spectra,model,low=5,high=20,n=1000,output_file=None):
    cluster_densities = []
    loading_concentrations = np.linspace(low,high,n)
    for lc in loading_concentrations:
        X = np.append(lc,spectra).reshape(1,-1)
        y = model.predict(X)
        cluster_densities.append(y[0])

    #print(cluster_densities)
    plt.plot(loading_concentrations,cluster_densities)
    plt.title('Loading Concentration vs Predicted Cluster Density')
    plt.xlabel('Loading Concentration [pM]')
    plt.ylabel('Predicted Cluster Density [k/mm^2]')
    plt.xlim([low,high])

    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file,dpi=600)
39 changes: 0 additions & 39 deletions diva_seq_opt/predict_loading_concentration.py

This file was deleted.

94 changes: 57 additions & 37 deletions diva_seq_opt/utility.py
@@ -27,29 +27,29 @@

def aproximate_sequence(df,n=100,start=0,stop=15000):
    """Reduce the feature space by estimating a curve with its integral."""

    #Create even spacing
    int_lims = np.linspace(start,stop,n+1)

    #Create interpolation function
    abundance_curve = interp1d(df['base pairs'],df['Value'],fill_value='extrapolate')

    #Approximate the function using an integral over each bin
    state = [quad(abundance_curve,a,b,limit=2000)[0] for a,b in zip(int_lims[:-1],int_lims[1:])]
    bps = int_lims[:-1]

    return state,bps
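As a standalone aside (not part of the diff), the binning idea in aproximate_sequence can be sanity-checked on a synthetic curve; everything below is illustrative, with made-up data:

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from scipy.integrate import quad

# Synthetic electropherogram: a single Gaussian peak centred near 600 bp
bp = np.linspace(0, 15000, 2000)
value = np.exp(-((bp - 600) ** 2) / (2 * 150 ** 2))
df = pd.DataFrame({'base pairs': bp, 'Value': value})

# Same idea as aproximate_sequence: integrate the interpolated curve over n evenly spaced bins
n, start, stop = 10, 0, 12000
lims = np.linspace(start, stop, n + 1)
curve = interp1d(df['base pairs'], df['Value'], fill_value='extrapolate')
state = [quad(curve, a, b, limit=2000)[0] for a, b in zip(lims[:-1], lims[1:])]

print(len(state), lims[:-1])  # 10 integral features and their left bin edges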


def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3000,7000,10380]):
    """Find the peaks of a ladder gel. Return a list of peak times and the
    base pairs that correspond to those times.
    """

    for threshold in [0.30,0.20,0.15,0.10,0.05]:
        peaks = peakutils.indexes(df['Value'].values, thres=threshold, min_dist=10)
        peak_times = df['Time'].values[peaks[1:-1]]

        if test:
            print('testing!')
            plt.figure()
@@ -58,7 +58,7 @@ def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3
            plt.show()
            print(len(peaks),len(peak_bps))
            print(len(peak_times),len(peak_bps))

        if len(peak_times) == len(peak_bps):
            break

@@ -67,53 +67,53 @@ def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3

def normalize_peak(df,start_bp=500,end_bp=11000):
    '''Normalize BioAnalyzer Curve so its total area is 1.'''

    #Subtract out Noise Floor
    noise_floor = np.mean(df['Value'].tail(20))
    df['Value'] -= noise_floor
    df.loc[df['Value']<0,'Value']=0

    #Select only important segment
    LIBRARY_SEGMENT = (df['base pairs'] > start_bp) & (df['base pairs'] < end_bp)
    df = df.loc[LIBRARY_SEGMENT]

    #Normalize Values to have an area under the curve of 1
    area = np.trapz(df['Value'],x=df['base pairs'])
    df.loc[:,'Value'] /= area

    return df


def prepare_data(ladder_file,bioanalyzer_file):
    '''Convert a Ladder File and BioAnalyzer File into a machine learning data point'''

    #Load & Parse Ladder
    ladder_df = pd.read_csv(ladder_file,skiprows=17)[:-1]
    ladder_df = ladder_df.apply(np.vectorize(float))
    peak_times,peak_bps = find_peaks(ladder_df)

    #Check to make sure the right number of peaks were found in the ladder
    if len(peak_times) != len(peak_bps):
        Warning('Could Not Find Peaks in Ladder File! Bug Zak to Increase the Robustness of the Peak Finding Algorithm')
        return None

    time_to_bps = interp1d(peak_times,peak_bps,fill_value='extrapolate',kind='slinear')

    #Read Bioanalyzer File
    sample_df = pd.read_csv(bioanalyzer_file,skiprows=17)[:-2]

    #Only Grab Relevant Columns
    sample_df = sample_df.loc[:,sample_df.columns.isin(['Time','Value'])]

    #Convert Values to Floats
    sample_df = sample_df.apply(np.vectorize(float))

    #Rescale From Retention Time to Base Pairs
    sample_df['base pairs'] = sample_df['Time'].apply(time_to_bps)

    #Normalize Curve
    sample_df = normalize_peak(sample_df)

    return sample_df
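As a usage note (illustrative only; the CSV file names are placeholders, not files from this PR), prepare_data is meant to hand back an area-normalised curve indexed by estimated base pairs:

from diva_seq_opt import utility

# Placeholder file names; BioAnalyzer exports with 17 header rows are assumed, as in prepare_data
sample_df = utility.prepare_data('ladder.csv', 'sample.csv')
if sample_df is not None:
    # Columns after preparation: 'Time', 'Value' (area-normalised), 'base pairs' (interpolated from the ladder)
    print(sample_df[['Time', 'base pairs', 'Value']].head())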


@@ -137,16 +137,16 @@ def get_file_pairs(data_dir,
                   ladder_file_re='.*ladder',
                   bioanalyzer_fila_re='sample'
                   ):

    #Get list of all files in the training directory
    file_paths = glob.glob(data_dir+'/*')

    #Find File Pairs
    file_pairs = {}
    for file_path in file_paths:
        match = re.match(file_id_re,file_path).group(0)
        file_pairs.setdefault(match,[]).append(file_path)

    #create training file pairs
    training_files = []
    for _,file_pair in file_pairs.items():
@@ -162,43 +162,43 @@ def get_file_pairs(data_dir,
def load_training_data(training_dir,plot=False):
    """load bioanalyzer training data from training directory."""
    training_files = get_training_files(training_dir)

    df = pd.DataFrame()
    for ladder_file,bioanalyzer_file in training_files:
        sample_df = prepare_data(ladder_file,bioanalyzer_file)
        llc,cluster_density = get_sample_metadata(bioanalyzer_file)

        if (sample_df is None) or (llc is None):
            continue

        #Plot Base Pairs Vs Values
        if plot:
            sample_df.plot('base pairs','Value')
            plt.show()

        state,bps = aproximate_sequence(sample_df,n=10,stop=12000)

        #Create df
        columns = ['Library Loading Concentration','Cluster Density'] + [str(int(bp)) for bp in bps]
        data = [[llc,cluster_density] + state]
        df = pd.concat([df,pd.DataFrame(data,columns=columns)])

    return df


def fit_model(training_df,output_file):
    X = training_df.loc[:,training_df.columns != 'Cluster Density']
    y = training_df['Cluster Density']
    model = RandomForestRegressor().fit(X,y)

    with open(output_file,'wb') as fp:
        pickle.dump(model,fp)

    return model


def plot_model_performance(model,training_df):
    '''Create a plot of the model's cross-validated performance.'''

    X = training_df.loc[:,training_df.columns != 'Cluster Density']
    y = training_df['Cluster Density']
    y_p = cross_val_predict(model,X,y)
@@ -228,4 +228,24 @@ def plot_model_performance(model,training_df):
    plt.tight_layout()

    plt.savefig('../figures/ModelAccuracy.pdf',dpi=600)
    plt.show()
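For orientation (a sketch, not code from this PR), the helpers above could be chained into a retraining run roughly as follows; the training_data/ directory and the output pickle name are assumptions:

from diva_seq_opt import utility

# Placeholder paths, for illustration only
training_df = utility.load_training_data('training_data/', plot=False)   # one feature row per ladder/sample pair
model = utility.fit_model(training_df, 'model/model_new.pkl')            # fit and pickle a RandomForestRegressor
utility.plot_model_performance(model, training_df)                       # cross-validated prediction plot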

def predict_loading_concentration(spectra,model,low=5,high=20,n=1000,output_file=None):
    cluster_densities = []
    loading_concentrations = np.linspace(low,high,n)
    for lc in loading_concentrations:
        X = np.append(lc,spectra).reshape(1,-1)
        y = model.predict(X)
        cluster_densities.append(y[0])

    #print(cluster_densities)
    plt.plot(loading_concentrations,cluster_densities)
    plt.title('Loading Concentration vs Predicted Cluster Density')
    plt.xlabel('Loading Concentration [pM]')
    plt.ylabel('Predicted Cluster Density [k/mm^2]')
    plt.xlim([low,high])

    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file,dpi=600)
6 changes: 6 additions & 0 deletions install.sh
@@ -0,0 +1,6 @@
#!/bin/sh

# Installs the pipenv for running the tool.
Reviewer comment: There should be instructions for this in README.


python3 -m pipenv --rm
python3 -m pipenv install
44 changes: 23 additions & 21 deletions predict_loading_concentration.py
@@ -4,35 +4,37 @@ def warn(*args, **kwargs):
import warnings
warnings.warn = warn

#Replaces the old diva_dna_seq imports (aproximate_sequence, prepare_data, predict_loading_concentration)
from diva_seq_opt import utility
import argparse
import pickle


def main():
    #Allow Matplotlib to be used on the command line
    import matplotlib as mpl
    mpl.use('Agg')

    #Parse Inputs
    parser = argparse.ArgumentParser(description='Use BioAnalyzer Data to Predict Library Loading Concentration for Library Barcoding')
    parser.add_argument('BAF',type=str, help='BioAnalyzer CSV')
    parser.add_argument('LAD',type=str, help='Ladder CSV')
    parser.add_argument('OF' ,type=str, help='Prediction Plot output file, *.png')
    args = parser.parse_args()
    #Load Model
    with open('./diva_seq_opt/model/model30.pkl','rb') as fp:    # previously '../model/model30.pkl'

Reviewer comment: Later on we should change this model name to e.g. model_latest.pkl and always have a copy of the latest model. For example, at the end of the year the model directory will contain model_2018.pkl, model_05_2021.pkl, model_12_2021.pkl, and model_latest.pkl, where the last two would be the same.

Author reply: I very much agree for the future!

Reviewer comment: Can you please include a TODO about that before this line?

Reviewer comment: I think the path for models should not be among the code but in a separate directory at the same level as notebooks.

        model = pickle.load(fp)

    #Prepare Data
    sample_df = utility.prepare_data(args.LAD,args.BAF)

    #Create Features
    state,bps = utility.aproximate_sequence(sample_df,n=10,stop=12000)

    #Predict
    utility.predict_loading_concentration(state,model,output_file=args.OF)

if __name__ == "__main__":
    main()
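Following up on the model-path discussion above, a minimal sketch of the suggested TODO and a model_latest.pkl lookup might look like the following; MODEL_DIR, the environment variable, and load_model are assumptions drawn from the review comments, not code in this PR:

import os
import pickle

# Assumption from the review thread: models live outside the package, e.g. next to notebooks/
MODEL_DIR = os.environ.get('DIVA_SEQ_MODEL_DIR', './model')

def load_model(name='model_latest.pkl'):
    # TODO: keep dated copies (model_2018.pkl, model_12_2021.pkl, ...) and point
    # model_latest.pkl at the newest one, per the review discussion.
    with open(os.path.join(MODEL_DIR, name), 'rb') as fp:
        return pickle.load(fp)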