From 64a3d928ff9f29238a4945336a4bef18761ba034 Mon Sep 17 00:00:00 2001
From: Joshua McCauley <JDMcCauley@lbl.gov>
Date: Mon, 12 Apr 2021 17:12:46 -0700
Subject: [PATCH 1/6] Initial commit to get working

---
 Pipfile                          | 20 ++++++++++++++++++++
 README.md                        | 23 ++++++++++++-----------
 diva_seq_opt/utility.py          |  2 +-
 install.sh                       |  6 ++++++
 predict_loading_concentration.py |  8 ++++----
 run.sh                           |  4 ++++
 6 files changed, 47 insertions(+), 16 deletions(-)
 create mode 100644 Pipfile
 create mode 100755 install.sh
 create mode 100755 run.sh

diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..2085279
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,20 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+pandas = "==0.22.0"
+numpy = "==1.14.2"
+matplotlib = "==2.0.2"
+scipy = "==1.0.0"
+ipython = "==6.4.0"
+seaborn = "==0.8.1"
+scikit-learn = "==0.19.1"
+PeakUtils = "==1.1.0"
+TPOT = "==0.9.2"
+
+[dev-packages]
+
+[requires]
+python_version = "3.9"
diff --git a/README.md b/README.md
index 1f3466d..7e26d86 100755
--- a/README.md
+++ b/README.md
@@ -1,15 +1,16 @@
-# DIVA DNA Seq Utilities
+# predict_loading_concentration
 
-This package is a repository of tools used by the DIVA DNA Seq team at the Joint BioEnergy Institute (JBEI) to predict the library loading concentration for their MiSeq runs.  It employs a machine learning model to determine the relationship between library fragment sizes, loading conentration and cluster density.
+## Dependencies:
+* `python3`
+* `pipenv`
 
-## Installation 
-
-The tool can be installed using pip with 
-
-```bash
-pip install diva_dna_seq
+## Installation:
+`./install.sh`
+Note this has the following output as of right now:
 ```
+pipenv.patched.notpip._internal.exceptions.InstallationError: Command errored out with exit status 1: /Users/jdmccauley/.local/share/virtualenvs/library_loading_concentration_prediction-8ryN5LMN/bin/python /usr/local/lib/python3.9/site-packages/pipenv/patched/notpip install --ignore-installed --no-user --prefix /private/var/folders/22/d1_56bp93jq_z9m0qy5ql0z80000gq/T/pip-build-env-tdi9o8gr/overlay --no-warn-script-location --no-binary :none: --only-binary :none: -i https://pypi.org/simple -- wheel setuptools Cython 'numpy==1.9.3; python_version=='"'"'3.5'"'"'' 'numpy==1.12.1; python_version=='"'"'3.6'"'"'' 'numpy==1.13.1; python_version>='"'"'3.7'"'"'' Check the logs for full command output.
+```
+This is fine, just move on.
 
-## Use
-
-This tool can then be used either in a Jupyter notebook or on the command line. A bioanalyzer ladder file and result file are needed. The tool generates a plot that relates library loading concentration for your library to 
\ No newline at end of file
+## Running:
+`./run.sh sample_file.csv ladder_file.csv output_file.pdf`
diff --git a/diva_seq_opt/utility.py b/diva_seq_opt/utility.py
index 727dedc..6d5faf8 100755
--- a/diva_seq_opt/utility.py
+++ b/diva_seq_opt/utility.py
@@ -228,4 +228,4 @@ def plot_model_performance(model,training_df):
     plt.tight_layout()
 
     plt.savefig('../figures/ModelAccuracy.pdf',dpi=600)
-    plt.show()
\ No newline at end of file
+    plt.show()
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..5ee4b3c
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Installs the pipenv for running the tool.
+
+python3 -m pipenv --rm
+python3 -m pipenv install
diff --git a/predict_loading_concentration.py b/predict_loading_concentration.py
index 4d70d05..4e3d64a 100755
--- a/predict_loading_concentration.py
+++ b/predict_loading_concentration.py
@@ -4,8 +4,8 @@ def warn(*args, **kwargs):
 import warnings
 warnings.warn = warn
 
-from diva_dna_seq.utility import aproximate_sequence,prepare_data
-from diva_dna_seq import predict_loading_concentration
+from diva_seq_opt.utility import aproximate_sequence,prepare_data
+from diva_seq_opt import predict_loading_concentration
 import argparse
 import pickle
 
@@ -25,7 +25,7 @@ def warn(*args, **kwargs):
 args = parser.parse_args()
 
 #Load Model
-with open('../model/model30.pkl','rb') as fp:
+with open('./diva_seq_opt/model/model30.pkl','rb') as fp:
     model = pickle.load(fp)
 
 #Prepare Data
@@ -35,4 +35,4 @@ def warn(*args, **kwargs):
 state,bps = aproximate_sequence(sample_df,n=10,stop=12000)    
 
 #Predict
-predict_loading_concentration(state,model,output_file=args.OF)
\ No newline at end of file
+predict_loading_concentration(state,model,output_file=args.OF)
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..47035a2
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+# Runs the program
+python3 -m pipenv run predict_loading_concentration $@

From 81be32680513c35a9a6860309c1fef68181c512b Mon Sep 17 00:00:00 2001
From: William Morrell <WCMorrell@lbl.gov>
Date: Mon, 26 Apr 2021 14:13:39 -0700
Subject: [PATCH 2/6] Remove self-import

The predict_loading_concentration module was importing itself, which is
probably not what we want it doing.
---
 diva_seq_opt/predict_loading_concentration.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/diva_seq_opt/predict_loading_concentration.py b/diva_seq_opt/predict_loading_concentration.py
index a0f06a1..b48e96d 100755
--- a/diva_seq_opt/predict_loading_concentration.py
+++ b/diva_seq_opt/predict_loading_concentration.py
@@ -1,6 +1,5 @@
 #!/usr/bin/python
 from diva_seq_opt.utility import aproximate_sequence,prepare_data
-from diva_seq_opt import predict_loading_concentration
 import diva_seq_opt
 import argparse
 import pickle
@@ -36,4 +35,4 @@ def main():
     state,bps = aproximate_sequence(sample_df,n=10,stop=12000)    
     
     #Predict
-    predict_loading_concentration(state,model,output_file=args.OF)
\ No newline at end of file
+    predict_loading_concentration(state,model,output_file=args.OF)

From c1c0d36ab2b61f246083c5b3e339695b2eff9d6c Mon Sep 17 00:00:00 2001
From: William Morrell <WCMorrell@lbl.gov>
Date: Mon, 26 Apr 2021 14:14:17 -0700
Subject: [PATCH 3/6] Wrap top-level predict_loading_concentration in main

---
 predict_loading_concentration.py | 54 +++++++++++++++++---------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/predict_loading_concentration.py b/predict_loading_concentration.py
index 4e3d64a..39462f9 100755
--- a/predict_loading_concentration.py
+++ b/predict_loading_concentration.py
@@ -10,29 +10,33 @@ def warn(*args, **kwargs):
 import pickle
 
 
+def main():
+	#Allow Matplotlib to be used on commandline
+	import matplotlib as mpl
+	mpl.use('Agg')
+	
+	#Handle inputs
+	#Parse Inputs
+	parser = argparse.ArgumentParser(description='Use BioAnalyzer Data to Predict Library Loading Concentration for Library Barcoding')
+	
+	parser.add_argument('BAF',type=str, help='BioAnalyzer CSV')
+	parser.add_argument('LAD',type=str, help='Ladder CSV')
+	parser.add_argument('OF' ,type=str, help='Prediction Plot output file, *.png')
+	args = parser.parse_args()
+	
+	#Load Model
+	with open('./diva_seq_opt/model/model30.pkl','rb') as fp:
+	    model = pickle.load(fp)
+	
+	#Prepare Data
+	sample_df = prepare_data(args.LAD,args.BAF)
+	
+	#Create Features
+	state,bps = aproximate_sequence(sample_df,n=10,stop=12000)    
+	
+	#Predict
+	predict_loading_concentration(state,model,output_file=args.OF)
+
+if __name__ == "__main__":
+    main()
 
-#Allow Matplotlib to be used on commandline
-import matplotlib as mpl
-mpl.use('Agg')
-
-#Handle inputs
-#Parse Inputs
-parser = argparse.ArgumentParser(description='Use BioAnalyzer Data to Predict Library Loading Concentration for Library Barcoding')
-
-parser.add_argument('BAF',type=str, help='BioAnalyzer CSV')
-parser.add_argument('LAD',type=str, help='Ladder CSV')
-parser.add_argument('OF' ,type=str, help='Prediction Plot output file, *.png')
-args = parser.parse_args()
-
-#Load Model
-with open('./diva_seq_opt/model/model30.pkl','rb') as fp:
-    model = pickle.load(fp)
-
-#Prepare Data
-sample_df = prepare_data(args.LAD,args.BAF)
-
-#Create Features
-state,bps = aproximate_sequence(sample_df,n=10,stop=12000)    
-
-#Predict
-predict_loading_concentration(state,model,output_file=args.OF)

From 6174548b1ab4fe393f587c46289301ef8d8a1848 Mon Sep 17 00:00:00 2001
From: Joshua McCauley <JDMcCauley@lbl.gov>
Date: Wed, 5 May 2021 17:15:57 -0700
Subject: [PATCH 4/6] Revert README and delete files

Remove the run.sh and Pipfile for this repo since they are not used. Also, revert the README to its previous state in commit 651b2c381357deb.
---
 Pipfile   | 20 --------------------
 README.md | 23 +++++++++++------------
 run.sh    |  4 ----
 3 files changed, 11 insertions(+), 36 deletions(-)
 delete mode 100644 Pipfile
 delete mode 100755 run.sh

diff --git a/Pipfile b/Pipfile
deleted file mode 100644
index 2085279..0000000
--- a/Pipfile
+++ /dev/null
@@ -1,20 +0,0 @@
-[[source]]
-url = "https://pypi.org/simple"
-verify_ssl = true
-name = "pypi"
-
-[packages]
-pandas = "==0.22.0"
-numpy = "==1.14.2"
-matplotlib = "==2.0.2"
-scipy = "==1.0.0"
-ipython = "==6.4.0"
-seaborn = "==0.8.1"
-scikit-learn = "==0.19.1"
-PeakUtils = "==1.1.0"
-TPOT = "==0.9.2"
-
-[dev-packages]
-
-[requires]
-python_version = "3.9"
diff --git a/README.md b/README.md
index 7e26d86..1f3466d 100755
--- a/README.md
+++ b/README.md
@@ -1,16 +1,15 @@
-# predict_loading_concentration
+# DIVA DNA Seq Utilities
 
-## Dependencies:
-* `python3`
-* `pipenv`
+This package is a repository of tools used by the DIVA DNA Seq team at the Joint BioEnergy Institute (JBEI) to predict the library loading concentration for their MiSeq runs.  It employs a machine learning model to determine the relationship between library fragment sizes, loading conentration and cluster density.
 
-## Installation:
-`./install.sh`
-Note this has the following output as of right now:
-```
-pipenv.patched.notpip._internal.exceptions.InstallationError: Command errored out with exit status 1: /Users/jdmccauley/.local/share/virtualenvs/library_loading_concentration_prediction-8ryN5LMN/bin/python /usr/local/lib/python3.9/site-packages/pipenv/patched/notpip install --ignore-installed --no-user --prefix /private/var/folders/22/d1_56bp93jq_z9m0qy5ql0z80000gq/T/pip-build-env-tdi9o8gr/overlay --no-warn-script-location --no-binary :none: --only-binary :none: -i https://pypi.org/simple -- wheel setuptools Cython 'numpy==1.9.3; python_version=='"'"'3.5'"'"'' 'numpy==1.12.1; python_version=='"'"'3.6'"'"'' 'numpy==1.13.1; python_version>='"'"'3.7'"'"'' Check the logs for full command output.
+## Installation 
+
+The tool can be installed using pip with 
+
+```bash
+pip install diva_dna_seq
 ```
-This is fine, just move on.
 
-## Running:
-`./run.sh sample_file.csv ladder_file.csv output_file.pdf`
+## Use
+
+This tool can then be used either in a Jupyter notebook or on the command line. A bioanalyzer ladder file and result file are needed. The tool generates a plot that relates library loading concentration for your library to 
\ No newline at end of file
diff --git a/run.sh b/run.sh
deleted file mode 100755
index 47035a2..0000000
--- a/run.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-# Runs the program
-python3 -m pipenv run predict_loading_concentration $@

From cc561bdf36c9e81911bdd249a4f7ccdc132003e0 Mon Sep 17 00:00:00 2001
From: Joshua McCauley <JDMcCauley@lbl.gov>
Date: Thu, 13 May 2021 16:04:18 -0700
Subject: [PATCH 5/6] Found the real predict_loading_concentration

The real predict_loading_concentration function that was called in the two predict_loading_concentration.py files was located in __init__.py.
This real function was moved to diva_seq_opt/utility.
The duplicated predict_loading_concentration.py in diva_seq_opt was deleted, since the same actions are done by the parent directory's predict_loading_concentration.py.
The __init__.py was cleared of all text, so it serves only to make diva_seq_opt a module.
The main predict_loading_concentration.py was modified to make the call to predict_loading_concentration() in utility.py.
---
 diva_seq_opt/__init__.py                      | 22 -----
 diva_seq_opt/predict_loading_concentration.py | 38 --------
 diva_seq_opt/utility.py                       | 92 +++++++++++--------
 predict_loading_concentration.py              | 22 ++---
 4 files changed, 66 insertions(+), 108 deletions(-)
 delete mode 100755 diva_seq_opt/predict_loading_concentration.py

diff --git a/diva_seq_opt/__init__.py b/diva_seq_opt/__init__.py
index d26c1db..e69de29 100644
--- a/diva_seq_opt/__init__.py
+++ b/diva_seq_opt/__init__.py
@@ -1,22 +0,0 @@
-import matplotlib.pyplot as plt
-import numpy as np
-
-def predict_loading_concentration(spectra,model,low=5,high=20,n=1000,output_file=None):
-    cluster_densities = []
-    loading_concentrations = np.linspace(low,high,n)
-    for lc in loading_concentrations:
-        X = np.append(lc,spectra).reshape(1,-1)
-        y = model.predict(X)
-        cluster_densities.append(y[0])
-    
-    #print(cluster_densities)
-    plt.plot(loading_concentrations,cluster_densities)
-    plt.title('Loading Concentration vs Predicted Cluster Density')
-    plt.xlabel('Loading Concentration [pM]')
-    plt.ylabel('Predicted Cluster Density [k/mm^2]')
-    plt.xlim([low,high])
-    
-    if output_file is None:
-        plt.show()
-    else:
-        plt.savefig(output_file,dpi=600)
\ No newline at end of file
diff --git a/diva_seq_opt/predict_loading_concentration.py b/diva_seq_opt/predict_loading_concentration.py
deleted file mode 100755
index b48e96d..0000000
--- a/diva_seq_opt/predict_loading_concentration.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/python
-from diva_seq_opt.utility import aproximate_sequence,prepare_data
-import diva_seq_opt
-import argparse
-import pickle
-import numpy as np
-import os
-
-#Allow Matplotlib to be used on commandline
-import matplotlib as mpl
-mpl.use('Agg')
-
-
-
-#Handle inputs
-def main():    
-    #Parse Inputs
-    parser = argparse.ArgumentParser(description='Use BioAnalyzer Data to Predict Library Loading Concentration for Library Barcoding')
-    
-    parser.add_argument('BAF',type=str, help='BioAnalyzer CSV')
-    parser.add_argument('LAD',type=str, help='Ladder CSV')
-    parser.add_argument('OF' ,type=str, help='Prediction Plot output file, *.png')
-    args = parser.parse_args()
-    
-    #Load Model
-    model_path = os.path.dirname(os.path.realpath(diva_seq_opt.__file__))
-    model_path = os.path.join(model_path,'model/model30.pkl')
-    with open(model_path,'rb') as fp:
-        model = pickle.load(fp)
-    
-    #Prepare Data
-    sample_df = prepare_data(args.LAD,args.BAF)
-    
-    #Create Features
-    state,bps = aproximate_sequence(sample_df,n=10,stop=12000)    
-    
-    #Predict
-    predict_loading_concentration(state,model,output_file=args.OF)
diff --git a/diva_seq_opt/utility.py b/diva_seq_opt/utility.py
index 6d5faf8..c419633 100755
--- a/diva_seq_opt/utility.py
+++ b/diva_seq_opt/utility.py
@@ -27,29 +27,29 @@
 
 def aproximate_sequence(df,n=100,start=0,stop=15000):
     """Reduce the Feature Space by Estimating a Curve with Its Integral"""
-    
+
     #Create even spacing
     int_lims = np.linspace(start,stop,n+1)
-    
+
     #Create Interpolation function
     abundance_curve = interp1d(df['base pairs'],df['Value'],fill_value='extrapolate')
-    
+
     #Aproximate Function using an integral
     state = [quad(abundance_curve,a,b,limit=2000)[0] for a,b in zip(int_lims[:-1],int_lims[1:])]
     bps = int_lims[:-1]
-    
+
     return state,bps
-    
+
 
 def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3000,7000,10380]):
-    """Find the peaks of a ladder gel. Return a list of peak times and the 
+    """Find the peaks of a ladder gel. Return a list of peak times and the
     base pairs that correspond to those times.
     """
-    
+
     for threshold in [0.30,0.20,0.15,0.10,0.05]:
         peaks = peakutils.indexes(df['Value'].values, thres=threshold, min_dist=10)
         peak_times = df['Time'].values[peaks[1:-1]]
-    
+
         if test:
             print('testing!')
             plt.figure()
@@ -58,7 +58,7 @@ def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3
             plt.show()
             print(len(peaks),len(peak_bps))
             print(len(peak_times),len(peak_bps))
-            
+
         if len(peak_times) == len(peak_bps):
             break
 
@@ -67,53 +67,53 @@ def find_peaks(df,test=False,peak_bps=[35,50,150,300,400,500,600,700,1000,2000,3
 
 def normalize_peak(df,start_bp=500,end_bp=11000):
     '''Normalize BioAnalyzer Curve so its total area is 1.'''
-    
+
     #Subtract out Noise Floor
     noise_floor = np.mean(df['Value'].tail(20))
     df['Value'] -= noise_floor
     df.loc[df['Value']<0,'Value']=0
-    
+
     #Select only important segment
     LIBRARY_SEGMENT = (df['base pairs'] > start_bp) & (df['base pairs'] < end_bp)
     df = df.loc[LIBRARY_SEGMENT]
-    
+
     #Normalize Values to have an area under the curve of 1
     area = np.trapz(df['Value'],x=df['base pairs'])
     df.loc[:,'Value'] /= area
-    
+
     return df
 
 
 def prepare_data(ladder_file,bioanalyzer_file):
     '''Convert a Ladder File and BioAnalyzer File into a machine learning data point'''
-    
-    #Load & Parse Ladder  
+
+    #Load & Parse Ladder
     ladder_df = pd.read_csv(ladder_file,skiprows=17)[:-1]
     ladder_df = ladder_df.apply(np.vectorize(float))
     peak_times,peak_bps = find_peaks(ladder_df)
-    
+
     #Check to make sure the right number of peaks were found in the ladder
     if len(peak_times) != len(peak_bps):
         Warning('Could Not Find Peaks in Ladder File! Bug Zak to Increase the Robustness of the Peak Finding Algorithm')
         return None
-    
+
     time_to_bps = interp1d(peak_times,peak_bps,fill_value='extrapolate',kind='slinear')
-    
+
     #Read Bioanalyzer File
     sample_df = pd.read_csv(bioanalyzer_file,skiprows=17)[:-2]
-    
+
     #Only Grab Relevant Columns
     sample_df = sample_df.loc[:,sample_df.columns.isin(['Time','Value'])]
-    
+
     #Convert Values to Floats
     sample_df = sample_df.apply(np.vectorize(float))
-    
+
     #Rescale From Retention Time to Base Pairs
     sample_df['base pairs'] = sample_df['Time'].apply(time_to_bps)
-    
+
     #Normalize Curve
     sample_df = normalize_peak(sample_df)
-    
+
     return sample_df
 
 
@@ -137,16 +137,16 @@ def get_file_pairs(data_dir,
                    ladder_file_re='.*ladder',
                    bioanalyzer_fila_re='sample'
                   ):
-    
+
     #Get list of all files in the training directory
     file_paths = glob.glob(data_dir+'/*')
-    
+
     #Find File Pairs
     file_pairs = {}
     for file_path in file_paths:
         match = re.match(file_id_re,file_path).group(0)
         file_pairs.setdefault(match,[]).append(file_path)
-    
+
     #create training file pairs
     training_files = []
     for _,file_pair in file_pairs.items():
@@ -162,26 +162,26 @@ def get_file_pairs(data_dir,
 def load_training_data(training_dir,plot=False):
     """load bioanalyzer training data from training directory."""
     training_files = get_training_files(training_dir)
-    
+
     df = pd.DataFrame()
     for ladder_file,bioanalyzer_file in training_files:
         sample_df = prepare_data(ladder_file,bioanalyzer_file)
         llc,cluster_density = get_sample_metadata(bioanalyzer_file)
-    
+
         if (sample_df is None) or (llc is None):
             continue
-            
+
         #Plot Base Pairs Vs Values
         if plot:
             sample_df.plot('base pairs','Value')
             plt.show()
-        
-        state,bps = aproximate_sequence(sample_df,n=10,stop=12000)    
-    
+
+        state,bps = aproximate_sequence(sample_df,n=10,stop=12000)
+
         #Create df
         columns = ['Library Loading Concentration','Cluster Density'] + [str(int(bp)) for bp in bps]
         data = [[llc,cluster_density] + state]
-        df = pd.concat([df,pd.DataFrame(data,columns=columns)]) 
+        df = pd.concat([df,pd.DataFrame(data,columns=columns)])
     return df
 
 
@@ -189,16 +189,16 @@ def fit_model(training_df,output_file):
     X = training_df.loc[:,df.columns != 'Cluster Density']
     y = training_df['Cluster Density']
     model = RandomForestRegressor().fit(X,y)
-    
+
     with open(output_file,'wb') as fp:
         pickle.dump(model,fp)
-    
+
     return model
 
 
 def plot_model_performance(model,training_df):
     '''Create a Plot of the Models Cross Validated Performance'''
-    
+
     X = training_df.loc[:,df.columns != 'Cluster Density']
     y = training_df['Cluster Density']
     y_p = cross_val_predict(model,X,y)
@@ -229,3 +229,23 @@ def plot_model_performance(model,training_df):
 
     plt.savefig('../figures/ModelAccuracy.pdf',dpi=600)
     plt.show()
+
+def predict_loading_concentration(spectra,model,low=5,high=20,n=1000,output_file=None):
+    cluster_densities = []
+    loading_concentrations = np.linspace(low,high,n)
+    for lc in loading_concentrations:
+        X = np.append(lc,spectra).reshape(1,-1)
+        y = model.predict(X)
+        cluster_densities.append(y[0])
+
+    #print(cluster_densities)
+    plt.plot(loading_concentrations,cluster_densities)
+    plt.title('Loading Concentration vs Predicted Cluster Density')
+    plt.xlabel('Loading Concentration [pM]')
+    plt.ylabel('Predicted Cluster Density [k/mm^2]')
+    plt.xlim([low,high])
+
+    if output_file is None:
+        plt.show()
+    else:
+        plt.savefig(output_file,dpi=600)
diff --git a/predict_loading_concentration.py b/predict_loading_concentration.py
index 39462f9..456021c 100755
--- a/predict_loading_concentration.py
+++ b/predict_loading_concentration.py
@@ -4,8 +4,7 @@ def warn(*args, **kwargs):
 import warnings
 warnings.warn = warn
 
-from diva_seq_opt.utility import aproximate_sequence,prepare_data
-from diva_seq_opt import predict_loading_concentration
+from diva_seq_opt import utility
 import argparse
 import pickle
 
@@ -14,29 +13,28 @@ def main():
 	#Allow Matplotlib to be used on commandline
 	import matplotlib as mpl
 	mpl.use('Agg')
-	
+
 	#Handle inputs
 	#Parse Inputs
 	parser = argparse.ArgumentParser(description='Use BioAnalyzer Data to Predict Library Loading Concentration for Library Barcoding')
-	
+
 	parser.add_argument('BAF',type=str, help='BioAnalyzer CSV')
 	parser.add_argument('LAD',type=str, help='Ladder CSV')
 	parser.add_argument('OF' ,type=str, help='Prediction Plot output file, *.png')
 	args = parser.parse_args()
-	
+
 	#Load Model
 	with open('./diva_seq_opt/model/model30.pkl','rb') as fp:
 	    model = pickle.load(fp)
-	
+
 	#Prepare Data
-	sample_df = prepare_data(args.LAD,args.BAF)
-	
+	sample_df = utility.prepare_data(args.LAD,args.BAF)
+
 	#Create Features
-	state,bps = aproximate_sequence(sample_df,n=10,stop=12000)    
-	
+	state,bps = utility.aproximate_sequence(sample_df,n=10,stop=12000)
+
 	#Predict
-	predict_loading_concentration(state,model,output_file=args.OF)
+	utility.predict_loading_concentration(state,model,output_file=args.OF)
 
 if __name__ == "__main__":
     main()
-

From 56d43df959c386fca6fe7acbd24fad908e295a70 Mon Sep 17 00:00:00 2001
From: Joshua McCauley <JDMcCauley@lbl.gov>
Date: Fri, 21 May 2021 14:49:24 -0700
Subject: [PATCH 6/6] Move import, update setup/README

Moved an import statement in predict_loading_concentration:main from main() to the top.
Edited setup.py to call the new predict_loading_concentration.
Iterated version.
Updated REAMDE to show dependency installation instructions.
Finish the last sentence of the README.
---
 README.md                        | 9 ++++-----
 predict_loading_concentration.py | 2 +-
 setup.py                         | 6 +++---
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 1f3466d..2645e0c 100755
--- a/README.md
+++ b/README.md
@@ -4,12 +4,11 @@ This package is a repository of tools used by the DIVA DNA Seq team at the Joint
 
 ## Installation 
 
-The tool can be installed using pip with 
-
-```bash
-pip install diva_dna_seq
+The dependencies of this tool can be installed with:
+```
+pip install -r requirements.txt
 ```
 
 ## Use
 
-This tool can then be used either in a Jupyter notebook or on the command line. A bioanalyzer ladder file and result file are needed. The tool generates a plot that relates library loading concentration for your library to 
\ No newline at end of file
+This tool can then be used either in a Jupyter notebook or on the command line. A bioanalyzer ladder file and result file are needed. The tool generates a plot that relates library loading concentration for your library to the MiSeq flow cell cluster density.
diff --git a/predict_loading_concentration.py b/predict_loading_concentration.py
index 456021c..ff56abb 100755
--- a/predict_loading_concentration.py
+++ b/predict_loading_concentration.py
@@ -7,11 +7,11 @@ def warn(*args, **kwargs):
 from diva_seq_opt import utility
 import argparse
 import pickle
+import matplotlib as mpl
 
 
 def main():
 	#Allow Matplotlib to be used on commandline
-	import matplotlib as mpl
 	mpl.use('Agg')
 
 	#Handle inputs
diff --git a/setup.py b/setup.py
index 40cd3da..fe8b8ef 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='diva_seq_opt',
-      version='0.0.3',
+      version='0.0.4',
       description='Predict the library loading concentration to use for mi-seq runs based on bioanalyzer data.',
       url='https://github.com/JBEI/library_loading_concentration_prediction',
       author='Zak Costello',
@@ -9,11 +9,11 @@
       license='BSD',
       packages=['diva_seq_opt'],      
       entry_points = {
-        'console_scripts': ['predict_loading_concentration=diva_seq_opt.predict_loading_concentration:main'],
+        'console_scripts': ['predict_loading_concentration=predict_loading_concentration:main'],
       },
       install_requires=[
           'pandas','numpy','scipy','matplotlib',
           'peakutils','sklearn','tpot'
       ],
       include_package_data=True,
-      zip_safe=False)
\ No newline at end of file
+      zip_safe=False)