Repo jbei org changes #5

Open · wants to merge 6 commits into base: master
Changes from 5 commits
22 changes: 0 additions & 22 deletions diva_seq_opt/__init__.py
@@ -1,22 +0,0 @@
import matplotlib.pyplot as plt
import numpy as np

def predict_loading_concentration(spectra,model,low=5,high=20,n=1000,output_file=None):
    cluster_densities = []
    loading_concentrations = np.linspace(low,high,n)
    for lc in loading_concentrations:
        X = np.append(lc,spectra).reshape(1,-1)
        y = model.predict(X)
        cluster_densities.append(y[0])

    #print(cluster_densities)
    plt.plot(loading_concentrations,cluster_densities)
    plt.title('Loading Concentration vs Predicted Cluster Density')
    plt.xlabel('Loading Concentration [pM]')
    plt.ylabel('Predicted Cluster Density [k/mm^2]')
    plt.xlim([low,high])

    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file,dpi=600)
39 changes: 0 additions & 39 deletions diva_seq_opt/predict_loading_concentration.py

This file was deleted.

94 changes: 57 additions & 37 deletions diva_seq_opt/utility.py
@@ -27,29 +27,29 @@

def aproximate_sequence(df,n=100,start=0,stop=15000):
    """Reduce the feature space by estimating a curve with its integral."""

    #Create even spacing
    int_lims = np.linspace(start,stop,n+1)

    #Create interpolation function
    abundance_curve = interp1d(df['base pairs'],df['Value'],fill_value='extrapolate')

    #Approximate the function using an integral over each bin
    state = [quad(abundance_curve,a,b,limit=2000)[0] for a,b in zip(int_lims[:-1],int_lims[1:])]
    bps = int_lims[:-1]

    return state,bps
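As a standalone aside (not part of the diff), the binning idea in aproximate_sequence can be sanity-checked on a synthetic curve; everything below is illustrative, with made-up data:

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from scipy.integrate import quad

# Synthetic electropherogram: a single Gaussian peak centred near 600 bp
bp = np.linspace(0, 15000, 2000)
value = np.exp(-((bp - 600) ** 2) / (2 * 150 ** 2))
df = pd.DataFrame({'base pairs': bp, 'Value': value})

# Same idea as aproximate_sequence: integrate the interpolated curve over n evenly spaced bins
n, start, stop = 10, 0, 12000
lims = np.linspace(start, stop, n + 1)
curve = interp1d(df['base pairs'], df['Value'], fill_value='extrapolate')
state = [quad(curve, a, b, limit=2000)[0] for a, b in zip(lims[:-1], lims[1:])]

print(len(state), lims[:-1])  # 10 integral features and their left bin edges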


def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3000,7000,10380]):
    """Find the peaks of a ladder gel. Return a list of peak times and the
    base pairs that correspond to those times.
    """

    for threshold in [0.30,0.20,0.15,0.10,0.05]:
        peaks = peakutils.indexes(df['Value'].values, thres=threshold, min_dist=10)
        peak_times = df['Time'].values[peaks[1:-1]]

        if test:
            print('testing!')
            plt.figure()
@@ -58,7 +58,7 @@ def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3
            plt.show()
            print(len(peaks),len(peak_bps))
            print(len(peak_times),len(peak_bps))

        if len(peak_times) == len(peak_bps):
            break

@@ -67,53 +67,53 @@ def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3

def normalize_peak(df,start_bp=500,end_bp=11000):
    '''Normalize BioAnalyzer Curve so its total area is 1.'''

    #Subtract out Noise Floor
    noise_floor = np.mean(df['Value'].tail(20))
    df['Value'] -= noise_floor
    df.loc[df['Value']<0,'Value']=0

    #Select only important segment
    LIBRARY_SEGMENT = (df['base pairs'] > start_bp) & (df['base pairs'] < end_bp)
    df = df.loc[LIBRARY_SEGMENT]

    #Normalize Values to have an area under the curve of 1
    area = np.trapz(df['Value'],x=df['base pairs'])
    df.loc[:,'Value'] /= area

    return df


def prepare_data(ladder_file,bioanalyzer_file):
    '''Convert a Ladder File and BioAnalyzer File into a machine learning data point'''

    #Load & Parse Ladder
    ladder_df = pd.read_csv(ladder_file,skiprows=17)[:-1]
    ladder_df = ladder_df.apply(np.vectorize(float))
    peak_times,peak_bps = find_peaks(ladder_df)

    #Check to make sure the right number of peaks were found in the ladder
    if len(peak_times) != len(peak_bps):
        Warning('Could Not Find Peaks in Ladder File! Bug Zak to Increase the Robustness of the Peak Finding Algorithm')
        return None

    time_to_bps = interp1d(peak_times,peak_bps,fill_value='extrapolate',kind='slinear')

    #Read Bioanalyzer File
    sample_df = pd.read_csv(bioanalyzer_file,skiprows=17)[:-2]

    #Only Grab Relevant Columns
    sample_df = sample_df.loc[:,sample_df.columns.isin(['Time','Value'])]

    #Convert Values to Floats
    sample_df = sample_df.apply(np.vectorize(float))

    #Rescale From Retention Time to Base Pairs
    sample_df['base pairs'] = sample_df['Time'].apply(time_to_bps)

    #Normalize Curve
    sample_df = normalize_peak(sample_df)

    return sample_df
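As a usage note (illustrative only; the CSV file names are placeholders, not files from this PR), prepare_data is meant to hand back an area-normalised curve indexed by estimated base pairs:

from diva_seq_opt import utility

# Placeholder file names; BioAnalyzer exports with 17 header rows are assumed, as in prepare_data
sample_df = utility.prepare_data('ladder.csv', 'sample.csv')
if sample_df is not None:
    # Columns after preparation: 'Time', 'Value' (area-normalised), 'base pairs' (interpolated from the ladder)
    print(sample_df[['Time', 'base pairs', 'Value']].head())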


@@ -137,16 +137,16 @@ def get_file_pairs(data_dir,
                   ladder_file_re='.*ladder',
                   bioanalyzer_fila_re='sample'
                   ):

    #Get list of all files in the training directory
    file_paths = glob.glob(data_dir+'/*')

    #Find File Pairs
    file_pairs = {}
    for file_path in file_paths:
        match = re.match(file_id_re,file_path).group(0)
        file_pairs.setdefault(match,[]).append(file_path)

    #create training file pairs
    training_files = []
    for _,file_pair in file_pairs.items():
@@ -162,43 +162,43 @@ def get_file_pairs(data_dir,
def load_training_data(training_dir,plot=False):
    """load bioanalyzer training data from training directory."""
    training_files = get_training_files(training_dir)

    df = pd.DataFrame()
    for ladder_file,bioanalyzer_file in training_files:
        sample_df = prepare_data(ladder_file,bioanalyzer_file)
        llc,cluster_density = get_sample_metadata(bioanalyzer_file)

        if (sample_df is None) or (llc is None):
            continue

        #Plot Base Pairs Vs Values
        if plot:
            sample_df.plot('base pairs','Value')
            plt.show()

        state,bps = aproximate_sequence(sample_df,n=10,stop=12000)

        #Create df
        columns = ['Library Loading Concentration','Cluster Density'] + [str(int(bp)) for bp in bps]
        data = [[llc,cluster_density] + state]
        df = pd.concat([df,pd.DataFrame(data,columns=columns)])

    return df


def fit_model(training_df,output_file):
    X = training_df.loc[:,training_df.columns != 'Cluster Density']
    y = training_df['Cluster Density']
    model = RandomForestRegressor().fit(X,y)

    with open(output_file,'wb') as fp:
        pickle.dump(model,fp)

    return model


def plot_model_performance(model,training_df):
    '''Create a plot of the model's cross-validated performance.'''

    X = training_df.loc[:,training_df.columns != 'Cluster Density']
    y = training_df['Cluster Density']
    y_p = cross_val_predict(model,X,y)
@@ -228,4 +228,24 @@ def plot_model_performance(model,training_df):
    plt.tight_layout()

    plt.savefig('../figures/ModelAccuracy.pdf',dpi=600)
    plt.show()
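For orientation (a sketch, not code from this PR), the helpers above could be chained into a retraining run roughly as follows; the training_data/ directory and the output pickle name are assumptions:

from diva_seq_opt import utility

# Placeholder paths, for illustration only
training_df = utility.load_training_data('training_data/', plot=False)   # one feature row per ladder/sample pair
model = utility.fit_model(training_df, 'model/model_new.pkl')            # fit and pickle a RandomForestRegressor
utility.plot_model_performance(model, training_df)                       # cross-validated prediction plot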

def predict_loading_concentration(spectra,model,low=5,high=20,n=1000,output_file=None):
    cluster_densities = []
    loading_concentrations = np.linspace(low,high,n)
    for lc in loading_concentrations:
        X = np.append(lc,spectra).reshape(1,-1)
        y = model.predict(X)
        cluster_densities.append(y[0])

    #print(cluster_densities)
    plt.plot(loading_concentrations,cluster_densities)
    plt.title('Loading Concentration vs Predicted Cluster Density')
    plt.xlabel('Loading Concentration [pM]')
    plt.ylabel('Predicted Cluster Density [k/mm^2]')
    plt.xlim([low,high])

    if output_file is None:
        plt.show()
    else:
        plt.savefig(output_file,dpi=600)
6 changes: 6 additions & 0 deletions install.sh
@@ -0,0 +1,6 @@
#!/bin/sh

# Installs the pipenv for running the tool.
Reviewer comment: There should be instructions for this in README.


python3 -m pipenv --rm
python3 -m pipenv install
44 changes: 23 additions & 21 deletions predict_loading_concentration.py
@@ -4,35 +4,37 @@ def warn(*args, **kwargs):
import warnings
warnings.warn = warn

#Replaces the old diva_dna_seq imports (aproximate_sequence, prepare_data, predict_loading_concentration)
from diva_seq_opt import utility
import argparse
import pickle


def main():
    #Allow Matplotlib to be used on the command line
    import matplotlib as mpl
    mpl.use('Agg')

    #Parse Inputs
    parser = argparse.ArgumentParser(description='Use BioAnalyzer Data to Predict Library Loading Concentration for Library Barcoding')
    parser.add_argument('BAF',type=str, help='BioAnalyzer CSV')
    parser.add_argument('LAD',type=str, help='Ladder CSV')
    parser.add_argument('OF' ,type=str, help='Prediction Plot output file, *.png')
    args = parser.parse_args()
    #Load Model
    with open('./diva_seq_opt/model/model30.pkl','rb') as fp:    # previously '../model/model30.pkl'

Reviewer comment: Later on we should change this model name to e.g. model_latest.pkl and always have a copy of the latest model. For example, at the end of the year the model directory will contain model_2018.pkl, model_05_2021.pkl, model_12_2021.pkl, and model_latest.pkl, where the last two would be the same.

Author reply: I very much agree for the future!

Reviewer comment: Can you please include a TODO about that before this line?

Reviewer comment: I think the path for models should not be among the code but in a separate directory at the same level as notebooks.

        model = pickle.load(fp)

    #Prepare Data
    sample_df = utility.prepare_data(args.LAD,args.BAF)

    #Create Features
    state,bps = utility.aproximate_sequence(sample_df,n=10,stop=12000)

    #Predict
    utility.predict_loading_concentration(state,model,output_file=args.OF)

if __name__ == "__main__":
    main()
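Following up on the model-path discussion above, a minimal sketch of the suggested TODO and a model_latest.pkl lookup might look like the following; MODEL_DIR, the environment variable, and load_model are assumptions drawn from the review comments, not code in this PR:

import os
import pickle

# Assumption from the review thread: models live outside the package, e.g. next to notebooks/
MODEL_DIR = os.environ.get('DIVA_SEQ_MODEL_DIR', './model')

def load_model(name='model_latest.pkl'):
    # TODO: keep dated copies (model_2018.pkl, model_12_2021.pkl, ...) and point
    # model_latest.pkl at the newest one, per the review discussion.
    with open(os.path.join(MODEL_DIR, name), 'rb') as fp:
        return pickle.load(fp)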