diff --git a/README.md b/README.md index 1f3466d..2645e0c 100755 --- a/README.md +++ b/README.md @@ -4,12 +4,11 @@ This package is a repository of tools used by the DIVA DNA Seq team at the Joint ## Installation -The tool can be installed using pip with - -```bash -pip install diva_dna_seq +The dependencies of this tool can be installed with: +``` +pip install -r requirements.txt ``` ## Use -This tool can then be used either in a Jupyter notebook or on the command line. A bioanalyzer ladder file and result file are needed. The tool generates a plot that relates library loading concentration for your library to \ No newline at end of file +This tool can then be used either in a Jupyter notebook or on the command line. A bioanalyzer ladder file and result file are needed. The tool generates a plot that relates library loading concentration for your library to the MiSeq flow cell cluster density. diff --git a/diva_seq_opt/__init__.py b/diva_seq_opt/__init__.py index d26c1db..e69de29 100644 --- a/diva_seq_opt/__init__.py +++ b/diva_seq_opt/__init__.py @@ -1,22 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np - -def predict_loading_concentration(spectra,model,low=5,high=20,n=1000,output_file=None): - cluster_densities = [] - loading_concentrations = np.linspace(low,high,n) - for lc in loading_concentrations: - X = np.append(lc,spectra).reshape(1,-1) - y = model.predict(X) - cluster_densities.append(y[0]) - - #print(cluster_densities) - plt.plot(loading_concentrations,cluster_densities) - plt.title('Loading Concentration vs Predicted Cluster Density') - plt.xlabel('Loading Concentration [pM]') - plt.ylabel('Predicted Cluster Density [k/mm^2]') - plt.xlim([low,high]) - - if output_file is None: - plt.show() - else: - plt.savefig(output_file,dpi=600) \ No newline at end of file diff --git a/diva_seq_opt/predict_loading_concentration.py b/diva_seq_opt/predict_loading_concentration.py deleted file mode 100755 index a0f06a1..0000000 --- a/diva_seq_opt/predict_loading_concentration.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python -from diva_seq_opt.utility import aproximate_sequence,prepare_data -from diva_seq_opt import predict_loading_concentration -import diva_seq_opt -import argparse -import pickle -import numpy as np -import os - -#Allow Matplotlib to be used on commandline -import matplotlib as mpl -mpl.use('Agg') - - - -#Handle inputs -def main(): - #Parse Inputs - parser = argparse.ArgumentParser(description='Use BioAnalyzer Data to Predict Library Loading Concentration for Library Barcoding') - - parser.add_argument('BAF',type=str, help='BioAnalyzer CSV') - parser.add_argument('LAD',type=str, help='Ladder CSV') - parser.add_argument('OF' ,type=str, help='Prediction Plot output file, *.png') - args = parser.parse_args() - - #Load Model - model_path = os.path.dirname(os.path.realpath(diva_seq_opt.__file__)) - model_path = os.path.join(model_path,'model/model30.pkl') - with open(model_path,'rb') as fp: - model = pickle.load(fp) - - #Prepare Data - sample_df = prepare_data(args.LAD,args.BAF) - - #Create Features - state,bps = aproximate_sequence(sample_df,n=10,stop=12000) - - #Predict - predict_loading_concentration(state,model,output_file=args.OF) \ No newline at end of file diff --git a/diva_seq_opt/utility.py b/diva_seq_opt/utility.py index 727dedc..c419633 100755 --- a/diva_seq_opt/utility.py +++ b/diva_seq_opt/utility.py @@ -27,29 +27,29 @@ def aproximate_sequence(df,n=100,start=0,stop=15000): """Reduce the Feature Space by Estimating a Curve with Its Integral""" - + #Create even spacing int_lims = np.linspace(start,stop,n+1) - + #Create Interpolation function abundance_curve = interp1d(df['base pairs'],df['Value'],fill_value='extrapolate') - + #Aproximate Function using an integral state = [quad(abundance_curve,a,b,limit=2000)[0] for a,b in zip(int_lims[:-1],int_lims[1:])] bps = int_lims[:-1] - + return state,bps - + def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3000,7000,10380]): - """Find the peaks of a ladder gel. Return a list of peak times and the + """Find the peaks of a ladder gel. Return a list of peak times and the base pairs that correspond to those times. """ - + for threshold in [0.30,0.20,0.15,0.10,0.05]: peaks = peakutils.indexes(df['Value'].values, thres=threshold, min_dist=10) peak_times = df['Time'].values[peaks[1:-1]] - + if test: print('testing!') plt.figure() @@ -58,7 +58,7 @@ def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3 plt.show() print(len(peaks),len(peak_bps)) print(len(peak_times),len(peak_bps)) - + if len(peak_times) == len(peak_bps): break @@ -67,53 +67,53 @@ def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3 def normalize_peak(df,start_bp=500,end_bp=11000): '''Normalize BioAnalyzer Curve so its total area is 1.''' - + #Subtract out Noise Floor noise_floor = np.mean(df['Value'].tail(20)) df['Value'] -= noise_floor df.loc[df['Value']<0,'Value']=0 - + #Select only important segment LIBRARY_SEGMENT = (df['base pairs'] > start_bp) & (df['base pairs'] < end_bp) df = df.loc[LIBRARY_SEGMENT] - + #Normalize Values to have an area under the curve of 1 area = np.trapz(df['Value'],x=df['base pairs']) df.loc[:,'Value'] /= area - + return df def prepare_data(ladder_file,bioanalyzer_file): '''Convert a Ladder File and BioAnalyzer File into a machine learning data point''' - - #Load & Parse Ladder + + #Load & Parse Ladder ladder_df = pd.read_csv(ladder_file,skiprows=17)[:-1] ladder_df = ladder_df.apply(np.vectorize(float)) peak_times,peak_bps = find_peaks(ladder_df) - + #Check to make sure the right number of peaks were found in the ladder if len(peak_times) != len(peak_bps): Warning('Could Not Find Peaks in Ladder File! Bug Zak to Increase the Robustness of the Peak Finding Algorithm') return None - + time_to_bps = interp1d(peak_times,peak_bps,fill_value='extrapolate',kind='slinear') - + #Read Bioanalyzer File sample_df = pd.read_csv(bioanalyzer_file,skiprows=17)[:-2] - + #Only Grab Relevant Columns sample_df = sample_df.loc[:,sample_df.columns.isin(['Time','Value'])] - + #Convert Values to Floats sample_df = sample_df.apply(np.vectorize(float)) - + #Rescale From Retention Time to Base Pairs sample_df['base pairs'] = sample_df['Time'].apply(time_to_bps) - + #Normalize Curve sample_df = normalize_peak(sample_df) - + return sample_df @@ -137,16 +137,16 @@ def get_file_pairs(data_dir, ladder_file_re='.*ladder', bioanalyzer_fila_re='sample' ): - + #Get list of all files in the training directory file_paths = glob.glob(data_dir+'/*') - + #Find File Pairs file_pairs = {} for file_path in file_paths: match = re.match(file_id_re,file_path).group(0) file_pairs.setdefault(match,[]).append(file_path) - + #create training file pairs training_files = [] for _,file_pair in file_pairs.items(): @@ -162,26 +162,26 @@ def get_file_pairs(data_dir, def load_training_data(training_dir,plot=False): """load bioanalyzer training data from training directory.""" training_files = get_training_files(training_dir) - + df = pd.DataFrame() for ladder_file,bioanalyzer_file in training_files: sample_df = prepare_data(ladder_file,bioanalyzer_file) llc,cluster_density = get_sample_metadata(bioanalyzer_file) - + if (sample_df is None) or (llc is None): continue - + #Plot Base Pairs Vs Values if plot: sample_df.plot('base pairs','Value') plt.show() - - state,bps = aproximate_sequence(sample_df,n=10,stop=12000) - + + state,bps = aproximate_sequence(sample_df,n=10,stop=12000) + #Create df columns = ['Library Loading Concentration','Cluster Density'] + [str(int(bp)) for bp in bps] data = [[llc,cluster_density] + state] - df = pd.concat([df,pd.DataFrame(data,columns=columns)]) + df = pd.concat([df,pd.DataFrame(data,columns=columns)]) return df @@ -189,16 +189,16 @@ def fit_model(training_df,output_file): X = training_df.loc[:,df.columns != 'Cluster Density'] y = training_df['Cluster Density'] model = RandomForestRegressor().fit(X,y) - + with open(output_file,'wb') as fp: pickle.dump(model,fp) - + return model def plot_model_performance(model,training_df): '''Create a Plot of the Models Cross Validated Performance''' - + X = training_df.loc[:,df.columns != 'Cluster Density'] y = training_df['Cluster Density'] y_p = cross_val_predict(model,X,y) @@ -228,4 +228,24 @@ def plot_model_performance(model,training_df): plt.tight_layout() plt.savefig('../figures/ModelAccuracy.pdf',dpi=600) - plt.show() \ No newline at end of file + plt.show() + +def predict_loading_concentration(spectra,model,low=5,high=20,n=1000,output_file=None): + cluster_densities = [] + loading_concentrations = np.linspace(low,high,n) + for lc in loading_concentrations: + X = np.append(lc,spectra).reshape(1,-1) + y = model.predict(X) + cluster_densities.append(y[0]) + + #print(cluster_densities) + plt.plot(loading_concentrations,cluster_densities) + plt.title('Loading Concentration vs Predicted Cluster Density') + plt.xlabel('Loading Concentration [pM]') + plt.ylabel('Predicted Cluster Density [k/mm^2]') + plt.xlim([low,high]) + + if output_file is None: + plt.show() + else: + plt.savefig(output_file,dpi=600) diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..5ee4b3c --- /dev/null +++ b/install.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +# Installs the pipenv for running the tool. + +python3 -m pipenv --rm +python3 -m pipenv install diff --git a/predict_loading_concentration.py b/predict_loading_concentration.py index 4d70d05..ff56abb 100755 --- a/predict_loading_concentration.py +++ b/predict_loading_concentration.py @@ -4,35 +4,37 @@ def warn(*args, **kwargs): import warnings warnings.warn = warn -from diva_dna_seq.utility import aproximate_sequence,prepare_data -from diva_dna_seq import predict_loading_concentration +from diva_seq_opt import utility import argparse import pickle +import matplotlib as mpl +def main(): + #Allow Matplotlib to be used on commandline + mpl.use('Agg') -#Allow Matplotlib to be used on commandline -import matplotlib as mpl -mpl.use('Agg') + #Handle inputs + #Parse Inputs + parser = argparse.ArgumentParser(description='Use BioAnalyzer Data to Predict Library Loading Concentration for Library Barcoding') -#Handle inputs -#Parse Inputs -parser = argparse.ArgumentParser(description='Use BioAnalyzer Data to Predict Library Loading Concentration for Library Barcoding') + parser.add_argument('BAF',type=str, help='BioAnalyzer CSV') + parser.add_argument('LAD',type=str, help='Ladder CSV') + parser.add_argument('OF' ,type=str, help='Prediction Plot output file, *.png') + args = parser.parse_args() -parser.add_argument('BAF',type=str, help='BioAnalyzer CSV') -parser.add_argument('LAD',type=str, help='Ladder CSV') -parser.add_argument('OF' ,type=str, help='Prediction Plot output file, *.png') -args = parser.parse_args() + #Load Model + with open('./diva_seq_opt/model/model30.pkl','rb') as fp: + model = pickle.load(fp) -#Load Model -with open('../model/model30.pkl','rb') as fp: - model = pickle.load(fp) + #Prepare Data + sample_df = utility.prepare_data(args.LAD,args.BAF) -#Prepare Data -sample_df = prepare_data(args.LAD,args.BAF) + #Create Features + state,bps = utility.aproximate_sequence(sample_df,n=10,stop=12000) -#Create Features -state,bps = aproximate_sequence(sample_df,n=10,stop=12000) + #Predict + utility.predict_loading_concentration(state,model,output_file=args.OF) -#Predict -predict_loading_concentration(state,model,output_file=args.OF) \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 40cd3da..fe8b8ef 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='diva_seq_opt', - version='0.0.3', + version='0.0.4', description='Predict the library loading concentration to use for mi-seq runs based on bioanalyzer data.', url='https://github.com/JBEI/library_loading_concentration_prediction', author='Zak Costello', @@ -9,11 +9,11 @@ license='BSD', packages=['diva_seq_opt'], entry_points = { - 'console_scripts': ['predict_loading_concentration=diva_seq_opt.predict_loading_concentration:main'], + 'console_scripts': ['predict_loading_concentration=predict_loading_concentration:main'], }, install_requires=[ 'pandas','numpy','scipy','matplotlib', 'peakutils','sklearn','tpot' ], include_package_data=True, - zip_safe=False) \ No newline at end of file + zip_safe=False)