From 5ade1ffe60d3c9a8e5a02728b691063b29b24720 Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Wed, 13 Nov 2024 12:17:30 -0500
Subject: [PATCH 01/11] add psflib functions

---
 spaceKLIP/__init__.py |   1 +
 spaceKLIP/psflib.py   | 421 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 422 insertions(+)
 create mode 100644 spaceKLIP/psflib.py

diff --git a/spaceKLIP/__init__.py b/spaceKLIP/__init__.py
index dcd1595f..034a16db 100644
--- a/spaceKLIP/__init__.py
+++ b/spaceKLIP/__init__.py
@@ -8,6 +8,7 @@
 from . import mast
 from . import plotting
 from . import psf
+from . import psflib
 from . import pyklippipeline
 from . import utils
 
diff --git a/spaceKLIP/psflib.py b/spaceKLIP/psflib.py
new file mode 100644
index 00000000..b2aa2de3
--- /dev/null
+++ b/spaceKLIP/psflib.py
@@ -0,0 +1,421 @@
+################################################################################
+# This script takes a directory of JWST calints files as input and builds a    #
+# database of reference info for each file, which can then be read via the     #
+# pipeline script.                                                             #
+#                                                                              #
+# Basically it will hold the target-specific info needed by the pipeline, as   #
+# well as info needed to choose reference observations for a given science     #
+# target.                                                                      #
+#                                                                              #
+# Written 2024-07-10 by Ellis Bogat                                            #
+################################################################################
+
+
+# imports
+import os
+import warnings
+import glob
+import re
+import mocapy
+import pandas as pd
+from astropy.io import fits
+
+from astroquery.simbad import Simbad
+import numpy as np
+
+sp_class_letters = ['O','B','A','F','G','K','M','T','Y']
+
+# Helper functions for logic with database series
+def isnone(series):
+    try:
+        return np.isnan(series)
+    except:
+        return (series == '') | (series == None) 
+
+
+def not_isnone(series):
+    
+    return ~ isnone(series)
+
+
+def load_refdb(fpath):
+    """
+    Reads the database of target- and observation-specific reference info for 
+    each observation in the PSF library.
+
+    Args:
+        fpath (str or os.path): Path to the .csv file containing the reference
+            database.
+
+    Returns:
+        pandas.DataFrame: database containing target- and observation-specific 
+         info for each file.
+    """
+
+    refdb = pd.read_csv(fpath)
+    refdb.set_index('TARGNAME',inplace=True)
+
+    return refdb
+
+
+def decode_simbad_sptype(input_sptypes):
+
+    if isinstance(input_sptypes,str):
+        input_sptypes = [input_sptypes]
+
+    
+    sp_classes = []
+    sp_subclasses = []
+    sp_lclasses = []
+
+    for simbad_spectype in input_sptypes:
+        
+        m = re.search(r'([OBAFGKMTY])(\d\.*\d*)[-+/]*(\d\.*\d*)*[-+/]*(I*V*I*)[-/]*(I*V*I*)',simbad_spectype)
+        
+        if m is None:
+            sp_classes.append('')
+            sp_subclasses.append('')
+            sp_lclasses.append('')
+        else:
+            res = m.group(1,2,3,4,5)
+            
+            sp_classes.append(res[0])
+            sp_subclasses.append(res[1])
+            if 'V' in res[3:] or res[3:] == ('',''):
+                sp_lclasses.append('V')
+
+            elif res[4] != '':
+                sp_lclasses.append(res[4])
+
+            else:
+                sp_lclasses.append(res[3])
+
+    return (sp_classes,sp_subclasses,sp_lclasses)
+
+
+def update_db_sptypes(refdb):
+    simbad_sptypes = refdb.SPTYPE.values
+
+    sp_classes, sp_subclasses, sp_lclasses = decode_simbad_sptype(simbad_sptypes)
+
+    refdb_copy = refdb.copy()
+    refdb_copy['SP_CLASS'] = sp_classes
+    refdb_copy['SP_SUBCLASS'] = sp_subclasses
+    refdb_copy['SP_LCLASS'] = sp_lclasses
+
+    return refdb_copy
+
+
+def build_refdb(idir,odir='.',uncal=False,overwrite=False):
+    """
+    Constructs a database of target-specific reference info for each
+    calints file in the input directory.
+
+    Args:
+        idir (path): Path to pre-processed (stage 2) JWST images to be added 
+         to the database
+        odir (path, optional): Location to save the database. Defaults to '.'.
+        uncal (bool, optional): Toggle using uncal files as inputs instead of
+         calints. Defaults to None.
+        overwrite (bool, optional): If true, overwrite the existing caldb.
+
+    Returns:
+        pandas.DataFrame: database containing target- and observation-specific 
+         info for each file.
+    """
+    
+    # TODO:
+    # - describe each column & its units
+    # - write tests for build_refdb()
+    #       - nonexistent input directory
+    #       - empty input directory 
+    #       - no calints files in input directory
+    #       - header kw missing
+    #       - duplicate science target with different program names
+    #       - calints vs uncal toggle 
+    #           - (warning if no calints present but uncals are
+    #             'did you mean to set uncal = True?')
+    
+    # Check that you won't accidentally overwrite an existing csv.
+    outpath = os.path.join(odir,'ref_lib.csv')
+    if os.path.exists(outpath):
+
+        if overwrite:
+            msg = f'\nThis operation will overwrite {outpath}. \nIf this is not what you want, abort now!'
+            warnings.warn(msg)
+
+        else:
+            raise Exception(f'This operation is trying to overwrite {outpath}.\nIf this is what you want, set overwrite=True.')
+
+    # Read in uncal files 
+    print('Reading input files...')
+    suffix = 'uncal' if uncal else 'calints'
+    fpaths = sorted(glob.glob(os.path.join(idir,f"*_{suffix}.fits")))
+
+    # Start a dataframe with the header info we want from each file
+    csv_list = []
+    fits_cols = [
+        'TARGPROP',
+        'TARGNAME', # Save 2MASS ID also
+        'FILENAME',
+        'DATE-OBS',
+        'TIME-OBS',
+        'DURATION', # Total exposure time 
+        'TARG_RA',
+        'TARG_DEC',
+        'TARGURA', # RA uncertainty
+        'TARGUDEC', # Dec uncertainty
+        'MU_RA', # Proper motion
+        'MU_DEC',                
+        'MU_EPOCH',
+        'INSTRUME',
+        'DETECTOR',
+        'MODULE',
+        'CHANNEL',
+        'FILTER',
+        'PUPIL',
+    ]
+    for fpath in fpaths:
+        row = []
+        hdr = fits.getheader(fpath)
+        for col in fits_cols:
+            row.append(hdr[col])
+        csv_list.append(row)
+
+    df = pd.DataFrame(csv_list,columns=fits_cols)
+
+    # Make a df with only one entry for each unique target
+    targnames = np.unique(df['TARGNAME']) 
+    df_unique = pd.DataFrame(np.transpose([targnames]),columns=['TARGNAME'])
+
+    # Get 2MASS IDs
+    print('Collecting 2MASS IDs...')
+    twomass_ids = []
+    for targname in targnames:
+        result_table = Simbad.query_objectids(targname)
+        if result_table is None:
+            raise Exception(f'No SIMBAD object found for targname {targname}.')    
+        tmids_found = []
+        for name in list(result_table['ID']):
+            if name[:6] == '2MASS ':
+                twomass_ids.append(name)
+                tmids_found.append(name) 
+        if len(tmids_found) < 1:
+            raise Exception(f'No 2MASS ID found for targname {targname}.')
+        elif len(tmids_found) > 1:
+            raise Exception(f'Multiple 2MASS ID found for targname {targname}: {tmids_found}')
+    df_unique['2MASS_ID'] = twomass_ids
+    df_unique.set_index('2MASS_ID',inplace=True)
+
+    # Query SIMBAD
+    print('Querying SIMBAD...')
+
+    customSimbad = Simbad()
+    customSimbad.add_votable_fields('sptype', 
+                                    'flux(K)', 'flux_error(K)', 
+                                    'plx', 'plx_error')
+    simbad_list = list(df_unique.index)
+    scistar_simbad_table = customSimbad.query_objects(simbad_list)
+
+    # Convert to pandas df and make 2MASS IDs the index
+    df_simbad = scistar_simbad_table.to_pandas()
+    df_simbad['2MASS_ID'] = simbad_list
+    df_simbad.set_index('2MASS_ID',inplace=True)
+
+    # Rename some columns
+    simbad_cols = { # Full column list here: http://simbad.u-strasbg.fr/Pages/guide/sim-fscript.htx 
+        'SPTYPE': 'SP_TYPE', # maybe use 'simple_spt' or 'complete_spt'?
+        'KMAG': 'FLUX_K', # 'kmag'
+        'KMAG_ERR': 'FLUX_ERROR_K', # 'ekmag'
+        'PLX': 'PLX_VALUE', # 'plx'
+        'PLX_ERR': 'PLX_ERROR', # 'eplx'
+        }
+    for col,simbad_col in simbad_cols.items():
+        df_simbad[col] = list(df_simbad[simbad_col])
+
+    # Add the values we want to df_unique
+    df_unique = pd.concat([df_unique,df_simbad.loc[:,simbad_cols.keys()]],axis=1)
+
+    # Query mocadb.ca for extra info
+    print('Querying MOCADB (this may take a minute)...')
+    names_df = pd.DataFrame(list(df_unique.index),columns=['designation'])
+    moca = mocapy.MocaEngine()
+    mdf = moca.query("SELECT tt.designation AS input_designation, sam.* FROM tmp_table AS tt LEFT JOIN mechanics_all_designations AS mad ON(mad.designation LIKE tt.designation) LEFT JOIN summary_all_objects AS sam ON(sam.moca_oid=mad.moca_oid)", tmp_table=names_df)
+    mdf.set_index('input_designation',inplace=True)
+
+    moca_cols = {
+        'SPTYPE': 'spt', # maybe use 'simple_spt' or 'complete_spt'?
+        'PLX': 'plx', # 'plx'
+        'PLX_ERR': 'eplx', # 'eplx'
+        'AGE': 'age', # 'age'
+        'AGE_ERR': 'eage', # 'eage'
+    }
+
+    # Update the column names for consistency
+    for col,moca_col in moca_cols.items():
+        # print(col, list(mdf[moca_col]))
+        mdf[col] = list(mdf[moca_col])
+
+    # Fill in values missing from SIMBAD with MOCA
+
+    df_unique['COMMENTS'] = ''
+
+    # Sort all the dfs by index so they match up
+    df_unique.sort_index(inplace=True)   
+    df_simbad.sort_index(inplace=True)   
+    mdf.sort_index(inplace=True)   
+
+    # Replace values and update comments
+    cols_overlap = list(set(list(simbad_cols.keys())).intersection(list(moca_cols.keys())))
+    for col in cols_overlap:
+        df_unique.loc[isnone(df_simbad[col]) & ~isnone(mdf[col]),'COMMENTS'] += f"{col} adopted from MOCA. "
+        df_unique.loc[isnone(df_simbad[col]) & ~isnone(mdf[col]),col] = mdf
+
+    for col in ['AGE','AGE_ERR']:
+        df_unique[col] = mdf[col]
+        df_unique.loc[~isnone(mdf[col]),'COMMENTS'] += f"{col} adopted from MOCA. "
+
+    # # Replace values
+    # df_unique.loc[df_unique['SPTYPE']=='','SPTYPE'] = None
+    # df_unique_replaced = df_unique.loc[:,cols_overlap].combine_first(mdf.loc[:,cols_overlap])
+    # df_unique.loc[:,cols_overlap] = df_unique_replaced.loc[:,cols_overlap]
+    # cols_overlap = ['SPTYPE','PLX','PLX_ERR']
+    # df_unique.loc[isnone(df_unique.loc[:,cols_overlap]),cols_overlap] = mdf.loc[isnone(df_unique.loc[:,cols_overlap]),cols_overlap]
+    # df_unique.loc[:,cols_overlap].where(not_isnone,other=mdf,inplace=True)
+
+    # Calculate distances from plx in mas
+    df_unique['DIST'] = 1. / (df_unique['PLX'] / 1000)
+    df_unique['DIST_ERR'] = df_unique['PLX_ERR'] / 1000 / ((df_unique['PLX'] / 1000)**2)
+
+    # Decode spectral types
+    df_unique = update_db_sptypes(df_unique)
+
+    # Add empty columns
+    empty_cols = [
+        'FLAGS',
+        'HAS_DISK',
+        'HAS_CANDS']
+    for col in empty_cols:
+        df_unique[col] = ''
+
+    # Apply dataframe of unique targets to the original file list
+    df.set_index('TARGNAME',inplace=True)
+    df_unique.reset_index(inplace=True)
+    df_unique.set_index('TARGNAME',inplace=True)
+    df_unique = df_unique.reindex(df.index)
+    df_out = pd.concat([df,df_unique],axis=1)
+    
+    # Save dataframe
+    df_out.to_csv(outpath)
+    print(f'Database saved to {outpath}')
+    print(f'Done.')
+
+    return df_out
+
+
+def get_sciref_files(sci_target, refdb, idir=None, spt_choice=None, filters=None, exclude_disks=False):
+    """Construct a list of science files and reference files to input to a PSF subtraction routine.
+
+    Args:
+        sci_target (str): 
+            name of the science target to be PSF subtracted. Can be the proposal target name, 
+            JWST resolved target name, or 2MASS ID.
+        refdb (pandas.DataFrame or str): 
+            pandas dataframe or filepath to csv containing the reference database generated by 
+            the build_refdb() function.
+        idir (str):
+            path to directory of input data, to be appended to file names.
+        spt_choice (str, optional): 
+            None (default): use all spectral types.
+            'near' : use only references with the same spectral class letter.
+            'nearer' : use only refs within +- 1 spectral type. ie. M3-5 for an M4 science target.
+            'nearest' : use only refs with the exact same spectral type.
+        filters (str or list, optional): 
+            None (default) : include all filters.
+            'F444W' or other filter name: include only that filter.
+            ['filt1','filt2']: include only filt1 and filt2
+        exclude_disks (bool, optional): Exclude references that are known to have disks. Defaults to False.
+
+    Returns:
+        list: filenames of science observations.
+        list: filenames of reference observations.
+    """
+
+    # TODO:
+        # - filter out flags?
+        # - sptype filters 
+        # - tests:
+            # - sci_target in index
+            # - sci_target in 2MASS_ID
+            # - sci_target in TARGPROP
+            # - sci_target not in refdb 
+            # - exceptions if 0 science fnames are returned
+            # - warnings if 0 reference observations are returned
+
+    if isinstance(refdb,str):
+        refdb = load_refdb(refdb)
+
+    # Locate input target 2MASS ID 
+    # (input name could be in index, TARGPROP, or 2MASS_ID column)
+    if sci_target in refdb['2MASS_ID'].to_list():
+        targ_2mass = sci_target
+
+    elif sci_target in refdb.index.to_list():
+        targ_2mass = refdb.loc[sci_target,'2MASS_ID'].to_list()[0]
+        
+    elif sci_target in refdb['TARGPROP'].to_list():
+        refdb_temp = refdb.reset_index()
+        refdb_temp.set_index('TARGPROP',inplace=True)
+        targ_2mass = refdb_temp.loc[sci_target,'2MASS_ID'].to_list()[0]
+    
+    else:
+        raise Exception(f'Science target {sci_target} not found in reference database.')
+    
+    refdb_temp = refdb.reset_index()
+    refdb_temp.set_index('FILENAME',inplace=True)
+
+    # Collect all the science files
+    sci_fnames = refdb_temp.index[refdb_temp['2MASS_ID'] == targ_2mass].to_list()
+
+    # Start list of reference files
+    ref_fnames = refdb_temp.index[refdb_temp['2MASS_ID'] != targ_2mass].to_list()
+
+    # Collect the reference files
+    if spt_choice != None:
+        raise Exception('Only spt_choice=None is currently supported.')
+
+    # Remove observations with disks flagged
+    if exclude_disks:
+        disk_fnames = refdb_temp.index[refdb_temp['HAS_DISK'] == True].to_list()
+        ref_fnames = list(set(ref_fnames) - set(disk_fnames))
+    
+    if filters != None:
+        if isinstance('filter',str):
+            filters = [filters]
+        filter_fnames = []
+        for filter in filters:
+            filter_fnames.extend(refdb_temp.index[refdb_temp['FILTER'] == filter].to_list())
+        if len(filter_fnames) == 0:
+            raise Warning(f'No observations found with filters {filters}.')
+        
+        sci_fnames = list(set(sci_fnames).intersection(filter_fnames))
+        ref_fnames = list(set(ref_fnames).intersection(filter_fnames))
+
+    # Make sure no observations are in both sci_fnames and ref_fnames
+    if len(set(sci_fnames).intersection(ref_fnames)) > 0:
+        raise Exception("One or more filenames exists in both the science and reference file list. Something is wrong.")
+
+    if not idir is None:
+        sci_fpaths = [os.path.join(idir,sci_fname) for sci_fname in sci_fnames]
+        ref_fpaths = [os.path.join(idir,ref_fname) for ref_fname in ref_fnames]
+    else:
+        sci_fpaths = sci_fnames
+        ref_fpaths = ref_fnames
+        
+    return [sci_fpaths, ref_fpaths]
+
+# # TESTING
+# idir = 'DATA/NANREPLACED_v0'
+# ref_db = build_refdb(idir,overwrite=True)
+
+# print('Done Done.')
\ No newline at end of file

From 155d805b988a2207dce549ae4470af77f745e966 Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Mon, 25 Nov 2024 08:51:31 -0500
Subject: [PATCH 02/11] update build_refdb to accept generic input file
 suffixes

---
 spaceKLIP/psflib.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/spaceKLIP/psflib.py b/spaceKLIP/psflib.py
index b2aa2de3..ea4643fb 100644
--- a/spaceKLIP/psflib.py
+++ b/spaceKLIP/psflib.py
@@ -106,7 +106,7 @@ def update_db_sptypes(refdb):
     return refdb_copy
 
 
-def build_refdb(idir,odir='.',uncal=False,overwrite=False):
+def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
     """
     Constructs a database of target-specific reference info for each
     calints file in the input directory.
@@ -115,8 +115,7 @@ def build_refdb(idir,odir='.',uncal=False,overwrite=False):
         idir (path): Path to pre-processed (stage 2) JWST images to be added 
          to the database
         odir (path, optional): Location to save the database. Defaults to '.'.
-        uncal (bool, optional): Toggle using uncal files as inputs instead of
-         calints. Defaults to None.
+        suffix (str, optional): Input filename suffix, e.g. 'uncal' or 'calints'. Defaults to 'calints'.
         overwrite (bool, optional): If true, overwrite the existing caldb.
 
     Returns:
@@ -147,9 +146,9 @@ def build_refdb(idir,odir='.',uncal=False,overwrite=False):
         else:
             raise Exception(f'This operation is trying to overwrite {outpath}.\nIf this is what you want, set overwrite=True.')
 
-    # Read in uncal files 
+    # Read input files 
     print('Reading input files...')
-    suffix = 'uncal' if uncal else 'calints'
+    suffix = suffix.strip('_')
     fpaths = sorted(glob.glob(os.path.join(idir,f"*_{suffix}.fits")))
 
     # Start a dataframe with the header info we want from each file
@@ -175,6 +174,7 @@ def build_refdb(idir,odir='.',uncal=False,overwrite=False):
         'FILTER',
         'PUPIL',
     ]
+
     for fpath in fpaths:
         row = []
         hdr = fits.getheader(fpath)
@@ -313,7 +313,10 @@ def build_refdb(idir,odir='.',uncal=False,overwrite=False):
     return df_out
 
 
-def get_sciref_files(sci_target, refdb, idir=None, spt_choice=None, filters=None, exclude_disks=False):
+def get_sciref_files(sci_target, refdb, idir=None, 
+                     spt_choice=None, 
+                     filters=None, 
+                     exclude_disks=False):
     """Construct a list of science files and reference files to input to a PSF subtraction routine.
 
     Args:

From 1bdbc90fdf49963466f605927f0d8e36fe8dd7c6 Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Mon, 25 Nov 2024 09:07:09 -0500
Subject: [PATCH 03/11] set default refdb column values to 'unknown' if manual
 flagging is needed

---
 spaceKLIP/psflib.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/spaceKLIP/psflib.py b/spaceKLIP/psflib.py
index ea4643fb..61ca09d9 100644
--- a/spaceKLIP/psflib.py
+++ b/spaceKLIP/psflib.py
@@ -131,9 +131,7 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
     #       - no calints files in input directory
     #       - header kw missing
     #       - duplicate science target with different program names
-    #       - calints vs uncal toggle 
-    #           - (warning if no calints present but uncals are
-    #             'did you mean to set uncal = True?')
+    # - logic for if 'HAS_DISK','HAS_CANDS' have a mix of 'unknown' and bool values
     
     # Check that you won't accidentally overwrite an existing csv.
     outpath = os.path.join(odir,'ref_lib.csv')
@@ -150,6 +148,8 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
     print('Reading input files...')
     suffix = suffix.strip('_')
     fpaths = sorted(glob.glob(os.path.join(idir,f"*_{suffix}.fits")))
+    if len(fpaths) == 0:
+        raise Exception(f'No "{suffix}" files found in input directory {idir} .')
 
     # Start a dataframe with the header info we want from each file
     csv_list = []
@@ -258,7 +258,7 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
 
     # Fill in values missing from SIMBAD with MOCA
 
-    df_unique['COMMENTS'] = ''
+    df_unique['COMMENTS'] = 'unknown'
 
     # Sort all the dfs by index so they match up
     df_unique.sort_index(inplace=True)   
@@ -291,12 +291,13 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
     df_unique = update_db_sptypes(df_unique)
 
     # Add empty columns
-    empty_cols = [
+    manual_cols = [
         'FLAGS',
         'HAS_DISK',
         'HAS_CANDS']
-    for col in empty_cols:
-        df_unique[col] = ''
+    
+    for col in manual_cols:
+        df_unique[col] = 'unknown'
 
     # Apply dataframe of unique targets to the original file list
     df.set_index('TARGNAME',inplace=True)
@@ -312,7 +313,6 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
 
     return df_out
 
-
 def get_sciref_files(sci_target, refdb, idir=None, 
                      spt_choice=None, 
                      filters=None, 

From a5bf15c03d761104955b84b48a14dc652a0bc9a6 Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Tue, 26 Nov 2024 09:04:30 -0500
Subject: [PATCH 04/11] add ability to download ref_db files from MAST

---
 spaceKLIP/psflib.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/spaceKLIP/psflib.py b/spaceKLIP/psflib.py
index 61ca09d9..0bdfc785 100644
--- a/spaceKLIP/psflib.py
+++ b/spaceKLIP/psflib.py
@@ -19,6 +19,7 @@
 import mocapy
 import pandas as pd
 from astropy.io import fits
+from spaceKLIP import mast
 
 from astroquery.simbad import Simbad
 import numpy as np
@@ -157,6 +158,7 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
         'TARGPROP',
         'TARGNAME', # Save 2MASS ID also
         'FILENAME',
+        'OBS_ID',
         'DATE-OBS',
         'TIME-OBS',
         'DURATION', # Total exposure time 
@@ -313,6 +315,7 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
 
     return df_out
 
+
 def get_sciref_files(sci_target, refdb, idir=None, 
                      spt_choice=None, 
                      filters=None, 
@@ -417,8 +420,16 @@ def get_sciref_files(sci_target, refdb, idir=None,
         
     return [sci_fpaths, ref_fpaths]
 
-# # TESTING
-# idir = 'DATA/NANREPLACED_v0'
-# ref_db = build_refdb(idir,overwrite=True)
 
-# print('Done Done.')
\ No newline at end of file
+def download_mast(ref_db,token=None,
+                  overwrite=False,exists_ok=True,
+                  progress=False, verbose=False,
+                  base_dir=os.path.join('DATA','MAST_DOWNLOAD')):
+    
+    for fname in list(ref_db.FILENAME):
+        mast.get_mast_filename(fname,
+                               outputdir=base_dir,
+                               overwrite=overwrite, exists_ok=exists_ok,
+                               progress=progress, verbose=verbose,
+                               mast_api_token=token)
+

From 02a5896f3cb6fe04bec3dfaecccdcb76e67a4b82 Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Tue, 26 Nov 2024 09:15:39 -0500
Subject: [PATCH 05/11] add mocapy to requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 5e82e331..1dc5a506 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,4 @@ webbpsf_ext>=1.2.0
 pyklip>=2.7.1
 ipywidgets>=8.1.5
 lmfit>1.2.2
-
+mocapy @ git+https://github.com/jgagneastro/mocapy.git

From 7fd38d58d4f51aee123987163aa32f6860968709 Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Wed, 4 Dec 2024 14:58:48 -0500
Subject: [PATCH 06/11] add tqdm import

---
 spaceKLIP/mast.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spaceKLIP/mast.py b/spaceKLIP/mast.py
index 57e3e3c6..d35bef84 100644
--- a/spaceKLIP/mast.py
+++ b/spaceKLIP/mast.py
@@ -9,6 +9,7 @@
 import astropy, astropy.table
 from astroquery.mast import Mast
 import requests
+import tqdm
 
 import logging
 log = logging.getLogger(__name__)

From a0896655157ce21d56d42fdf96fef8f94037245c Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Wed, 4 Dec 2024 14:59:18 -0500
Subject: [PATCH 07/11] add mast download for specific file suffixes

---
 spaceKLIP/psflib.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/spaceKLIP/psflib.py b/spaceKLIP/psflib.py
index 0bdfc785..57a76e06 100644
--- a/spaceKLIP/psflib.py
+++ b/spaceKLIP/psflib.py
@@ -424,9 +424,24 @@ def get_sciref_files(sci_target, refdb, idir=None,
 def download_mast(ref_db,token=None,
                   overwrite=False,exists_ok=True,
                   progress=False, verbose=False,
+                  suffix=None, # e.g. 'calints'
                   base_dir=os.path.join('DATA','MAST_DOWNLOAD')):
     
-    for fname in list(ref_db.FILENAME):
+    fnames = list(ref_db.FILENAME)
+
+    # Update file suffix if provided
+    if not suffix == None:
+            new_suffix = suffix.strip('_')
+
+            for ff,fname in enumerate(fnames):
+                fname_split = fname.split('_')
+                new_fname = '_'.join(fname_split[:-1]) + f'_{new_suffix}.fits'
+
+                fnames[ff] = new_fname    
+        
+    # Download each file
+    for fname in fnames:
+        
         mast.get_mast_filename(fname,
                                outputdir=base_dir,
                                overwrite=overwrite, exists_ok=exists_ok,

From 34a6e71b37873eee749ecb8de42ff0cde77ba58f Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Wed, 4 Dec 2024 15:11:40 -0500
Subject: [PATCH 08/11] convert print statements to log statements

---
 spaceKLIP/psflib.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/spaceKLIP/psflib.py b/spaceKLIP/psflib.py
index 57a76e06..13e47500 100644
--- a/spaceKLIP/psflib.py
+++ b/spaceKLIP/psflib.py
@@ -24,6 +24,10 @@
 from astroquery.simbad import Simbad
 import numpy as np
 
+import logging
+log = logging.getLogger(__name__)
+log.setLevel(logging.INFO)
+
 sp_class_letters = ['O','B','A','F','G','K','M','T','Y']
 
 # Helper functions for logic with database series
@@ -146,7 +150,7 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
             raise Exception(f'This operation is trying to overwrite {outpath}.\nIf this is what you want, set overwrite=True.')
 
     # Read input files 
-    print('Reading input files...')
+    log.info('Reading input files...')
     suffix = suffix.strip('_')
     fpaths = sorted(glob.glob(os.path.join(idir,f"*_{suffix}.fits")))
     if len(fpaths) == 0:
@@ -191,7 +195,7 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
     df_unique = pd.DataFrame(np.transpose([targnames]),columns=['TARGNAME'])
 
     # Get 2MASS IDs
-    print('Collecting 2MASS IDs...')
+    log.info('Collecting 2MASS IDs...')
     twomass_ids = []
     for targname in targnames:
         result_table = Simbad.query_objectids(targname)
@@ -210,7 +214,7 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
     df_unique.set_index('2MASS_ID',inplace=True)
 
     # Query SIMBAD
-    print('Querying SIMBAD...')
+    log.info('Querying SIMBAD...')
 
     customSimbad = Simbad()
     customSimbad.add_votable_fields('sptype', 
@@ -239,7 +243,7 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
     df_unique = pd.concat([df_unique,df_simbad.loc[:,simbad_cols.keys()]],axis=1)
 
     # Query mocadb.ca for extra info
-    print('Querying MOCADB (this may take a minute)...')
+    log.info('Querying MOCADB (this may take a minute)...')
     names_df = pd.DataFrame(list(df_unique.index),columns=['designation'])
     moca = mocapy.MocaEngine()
     mdf = moca.query("SELECT tt.designation AS input_designation, sam.* FROM tmp_table AS tt LEFT JOIN mechanics_all_designations AS mad ON(mad.designation LIKE tt.designation) LEFT JOIN summary_all_objects AS sam ON(sam.moca_oid=mad.moca_oid)", tmp_table=names_df)
@@ -255,7 +259,6 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
 
     # Update the column names for consistency
     for col,moca_col in moca_cols.items():
-        # print(col, list(mdf[moca_col]))
         mdf[col] = list(mdf[moca_col])
 
     # Fill in values missing from SIMBAD with MOCA
@@ -310,9 +313,8 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
     
     # Save dataframe
     df_out.to_csv(outpath)
-    print(f'Database saved to {outpath}')
-    print(f'Done.')
-
+    log.info(f'Database saved to {outpath}')
+    
     return df_out
 
 

From 6cbd7ad93127709d5d32e4caf97d3a4a5983f525 Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Wed, 4 Dec 2024 15:35:00 -0500
Subject: [PATCH 09/11] add tqdm to spaceKLIP requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 1dc5a506..f20f0a1e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,5 @@ webbpsf_ext>=1.2.0
 pyklip>=2.7.1
 ipywidgets>=8.1.5
 lmfit>1.2.2
+tqdm
 mocapy @ git+https://github.com/jgagneastro/mocapy.git

From 78146fc7dd0b1ac21f1ea8d808fe55f854cc36ea Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Thu, 5 Dec 2024 10:39:55 -0500
Subject: [PATCH 10/11] fix syntax warning

---
 spaceKLIP/mast.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spaceKLIP/mast.py b/spaceKLIP/mast.py
index d35bef84..222fc472 100644
--- a/spaceKLIP/mast.py
+++ b/spaceKLIP/mast.py
@@ -193,7 +193,7 @@ def query_coron_datasets(inst,
 
     if return_filenames:
 
-        if level is not None and level.lower() is not 'cal' and level.lower() is not '2b':
+        if level is not None and level.lower() != 'cal' and level.lower() != '2b':
             # Transform filenames to either rate or uncal files
             # This may not be robust to all possible scenarios yet...
             if level.lower()=='rate' or level.lower()=='2a':

From fc67e23ba2d9a9c6db98e300a755dda56e7cc7d1 Mon Sep 17 00:00:00 2001
From: Ell Bogat <ell.bogat@nasa.gov>
Date: Thu, 5 Dec 2024 10:40:24 -0500
Subject: [PATCH 11/11] add spectral type filters

---
 spaceKLIP/psflib.py | 150 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 134 insertions(+), 16 deletions(-)

diff --git a/spaceKLIP/psflib.py b/spaceKLIP/psflib.py
index 13e47500..81f0b976 100644
--- a/spaceKLIP/psflib.py
+++ b/spaceKLIP/psflib.py
@@ -32,6 +32,16 @@
 
 # Helper functions for logic with database series
 def isnone(series):
+    """Helper function to determine which elements of a series
+    are NaN, None, or empty strings.
+
+    Args:
+        series (list-like): input data to check
+
+    Returns:
+        boolean array: True values where input array is NaN/None/''
+    """
+
     try:
         return np.isnan(series)
     except:
@@ -64,6 +74,19 @@ def load_refdb(fpath):
 
 
 def decode_simbad_sptype(input_sptypes):
+    """Decodes the complicated SIMBAD spectral type string into simplified 
+    spectral type components.
+
+    Args:
+        input_sptypes (str or list): SIMBAD spectral type string (or list of strings)
+
+    Returns:
+        tuple: list of spectral class letters, list of subclass numbers, 
+            and list of luminosity class numerals for each input string.
+    """
+
+    # TODO:
+        # - test this!
 
     if isinstance(input_sptypes,str):
         input_sptypes = [input_sptypes]
@@ -98,7 +121,56 @@ def decode_simbad_sptype(input_sptypes):
     return (sp_classes,sp_subclasses,sp_lclasses)
 
 
+def adjust_spttype(spt_tup):
+    # TODO: 
+    # - Tests:
+        # - input hotter than O0
+        # - input colder than Y9
+        # - subclass not int                
+        # - known transitions: A10 -> F0, M-3 -> F7
+
+    spt_class, spt_subclass = spt_tup
+
+    spt_subclass = int(spt_subclass)
+
+    if not spt_class in sp_class_letters:
+        raise Exception(f'Invalid spectral class letter: {spt_class}')
+
+    i_class = sp_class_letters.index(spt_class)
+
+    while spt_subclass > 9:
+
+        if spt_class == sp_class_letters[-1]:
+            return (spt_class,9)
+
+        i_class += 1
+        spt_class = sp_class_letters[i_class]
+        spt_subclass -= 10
+
+    while spt_subclass < 0:
+
+        if spt_class == sp_class_letters[0]:
+            return (spt_class,0)
+        
+        i_class -= 1
+        spt_class = sp_class_letters[i_class]
+        spt_subclass += 10
+
+    return (spt_class,spt_subclass)
+
+
 def update_db_sptypes(refdb):
+    """
+    Separate the spectral type column into a column each for the
+    spectral class letter, subclass number, and luminosity class numeral.
+
+    Args:
+        refdb (pandas.DataFrame): PSF reference dataframe
+
+    Returns:
+        pandas.DataFrame: updated PSF reference dataframe
+    """
+
     simbad_sptypes = refdb.SPTYPE.values
 
     sp_classes, sp_subclasses, sp_lclasses = decode_simbad_sptype(simbad_sptypes)
@@ -319,7 +391,7 @@ def build_refdb(idir,odir='.',suffix='calints',overwrite=False):
 
 
 def get_sciref_files(sci_target, refdb, idir=None, 
-                     spt_choice=None, 
+                     spt_tolerance=None, 
                      filters=None, 
                      exclude_disks=False):
     """Construct a list of science files and reference files to input to a PSF subtraction routine.
@@ -333,11 +405,12 @@ def get_sciref_files(sci_target, refdb, idir=None,
             the build_refdb() function.
         idir (str):
             path to directory of input data, to be appended to file names.
-        spt_choice (str, optional): 
+        spt_tolerance (str or int, optional): 
             None (default): use all spectral types.
-            'near' : use only references with the same spectral class letter.
-            'nearer' : use only refs within +- 1 spectral type. ie. M3-5 for an M4 science target.
-            'nearest' : use only refs with the exact same spectral type.
+            'exact' : use only refs with the exact same spectral type.
+            'class' : use only references with the same spectral class letter.
+            'subclass' : use only references with the same spectral class letter and subclass number. (ignores luminosity class)
+            int : use only refs within +- N spectral subclasses, e.g. M3-5 for an M4 science target if spt_tolerance = 1.
         filters (str or list, optional): 
             None (default) : include all filters.
             'F444W' or other filter name: include only that filter.
@@ -350,15 +423,7 @@ def get_sciref_files(sci_target, refdb, idir=None,
     """
 
     # TODO:
-        # - filter out flags?
-        # - sptype filters 
-        # - tests:
-            # - sci_target in index
-            # - sci_target in 2MASS_ID
-            # - sci_target in TARGPROP
-            # - sci_target not in refdb 
-            # - exceptions if 0 science fnames are returned
-            # - warnings if 0 reference observations are returned
+        # - filter out manual flags
 
     if isinstance(refdb,str):
         refdb = load_refdb(refdb)
@@ -377,6 +442,7 @@ def get_sciref_files(sci_target, refdb, idir=None,
         targ_2mass = refdb_temp.loc[sci_target,'2MASS_ID'].to_list()[0]
     
     else:
+        log.error(f'Science target {sci_target} not found in reference database.')
         raise Exception(f'Science target {sci_target} not found in reference database.')
     
     refdb_temp = refdb.reset_index()
@@ -384,13 +450,65 @@ def get_sciref_files(sci_target, refdb, idir=None,
 
     # Collect all the science files
     sci_fnames = refdb_temp.index[refdb_temp['2MASS_ID'] == targ_2mass].to_list()
+    first_scifile = sci_fnames[0]
 
     # Start list of reference files
     ref_fnames = refdb_temp.index[refdb_temp['2MASS_ID'] != targ_2mass].to_list()
 
     # Collect the reference files
-    if spt_choice != None:
-        raise Exception('Only spt_choice=None is currently supported.')
+    if spt_tolerance != None:
+
+        # Consider handling float subclasses (e.g. M4.5) better. Take floor for now.
+        refdb_temp['SP_SUBCLASS'] = np.floor(refdb_temp['SP_SUBCLASS'])
+
+        targ_sp_class = refdb_temp.loc[first_scifile,'SP_CLASS']
+        targ_sp_subclass = refdb_temp.loc[first_scifile,'SP_SUBCLASS']
+        targ_sp_lclass = refdb_temp.loc[first_scifile,'SP_LCLASS']
+        
+        if isinstance(spt_tolerance,str):
+            if spt_tolerance.lower() == 'exact':
+
+                spt_fnames = refdb_temp.index[(refdb_temp['SP_CLASS'] == targ_sp_class) & 
+                                                (refdb_temp['SP_SUBCLASS'] == targ_sp_subclass) & 
+                                                (refdb_temp['SP_LCLASS'] == targ_sp_lclass)
+                                                ].to_list()
+                
+            elif spt_tolerance.lower() == 'class':
+
+                spt_fnames = refdb_temp.index[(refdb_temp['SP_CLASS'] == targ_sp_class)
+                                                ].to_list()
+
+            elif spt_tolerance.lower() == 'subclass':
+
+                spt_fnames = refdb_temp.index[(refdb_temp['SP_CLASS'] == targ_sp_class) & 
+                                                (refdb_temp['SP_SUBCLASS'] == targ_sp_subclass)
+                                                ].to_list()
+            
+            else:
+                raise Exception(f'spt_tolerance {spt_tolerance} not configured.')
+        
+        else:
+
+            assert isinstance(spt_tolerance,int)
+
+            spt_fnames = []
+            
+            for i in range(-spt_tolerance,spt_tolerance+1):
+
+                spt_tup = (targ_sp_class,targ_sp_subclass+i)
+            
+                # Carry over spectral classes and subclasses correctly
+                spt_tup = adjust_spttype(spt_tup)
+            
+                spt_fnames.extend(refdb_temp.index[(refdb_temp['SP_CLASS'] == spt_tup[0]) & 
+                                                   (refdb_temp['SP_SUBCLASS'] == spt_tup[1])
+                                                  ].to_list())
+            
+        if len(spt_fnames) == 0:
+            raise Warning(f'No observations found with specified spectral type filter.')
+        
+        sci_fnames = list(set(sci_fnames).intersection(spt_fnames))
+        ref_fnames = list(set(ref_fnames).intersection(spt_fnames))
 
     # Remove observations with disks flagged
     if exclude_disks: