Merge branch 'master' into update-weights-varlss-mod
andreicuceu committed Jan 17, 2025
2 parents 6e9ec38 + 7add2b7 commit 3a18ed2
Showing 10 changed files with 106 additions and 32 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 9.8.0
current_version = 9.11.0
commit = True

[bumpversion:file:py/picca/_version.py]
4 changes: 2 additions & 2 deletions .github/workflows/bump_version.yml
@@ -13,7 +13,7 @@ jobs:
bump_version:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
token: ${{ secrets.PAT_MW_TOKEN_ACTION }}

@@ -23,7 +23,7 @@ jobs:
git config --global user.name "Michael Walther (bot)"
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5

- name: Install bump2version
run: pip install bump2version
7 changes: 4 additions & 3 deletions .github/workflows/publish-to-test-pypi.yml
@@ -14,9 +14,9 @@ jobs:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.x"

@@ -37,7 +37,8 @@ jobs:
- name: Publish distribution 📦 to Test PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
repository_url: https://test.pypi.org/legacy/
attestations: false

- name: Publish distribution 📦 to PyPI
if: startsWith(github.ref, 'refs/tags')
4 changes: 2 additions & 2 deletions .github/workflows/pylint.yml
@@ -15,9 +15,9 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python 3.12
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: 3.12
- name: Install dependencies
4 changes: 2 additions & 2 deletions .github/workflows/python_package.yml
@@ -29,9 +29,9 @@ jobs:
# OPENBLAS_NUM_THREADS: 1
# NUMEXPR_NUM_THREADS: 1
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
60 changes: 56 additions & 4 deletions bin/picca_Pk1D.py
@@ -97,6 +97,16 @@ def process_all_files(index_file_args):
if np.sum(delta.exposures_diff)==0:
continue

if linear_binning:
max_num_pixels_forest_theoretical=(1180-1050)*(delta.z_qso+1)/pixel_step #currently no min/max restframe values defined, so hard coding for the moment
else:
max_num_pixels_forest_theoretical=(np.log10(1180)-np.log10(1050))/pixel_step
# minimum number of pixels in a part of forest
if args.nb_pixel_min is not None:
min_num_pixels = args.nb_pixel_min
else:
min_num_pixels = int(args.nb_pixel_frac_min*max_num_pixels_forest_theoretical) #this makes the minimum spectrum length scale with z, but might be problematic for SBs

if args.parts_in_redshift:
# define chunks on a fixed redshift grid
z_grid = np.array(args.z_parts)
@@ -105,11 +115,12 @@
delta.delta,
delta.exposures_diff,
delta.ivar,
min_num_pixels=args.nb_pixel_min,
min_num_pixels=min_num_pixels,
reso_matrix=(delta.resolution_matrix
if reso_correction == 'matrix' else None),
linear_binning=linear_binning)
num_parts = len(split_array[0])

if reso_correction == 'matrix':
(mean_z_array, lambda_array, delta_array, exposures_diff_array,
ivar_array, reso_matrix_array) = split_array
@@ -194,7 +205,15 @@ def process_all_files(index_file_args):
pixel_step, log_lambda_array[part_index],
delta_array[part_index], exposures_diff_array[part_index],
ivar_array[part_index], args.no_apply_filling)
if num_masked_pixels > args.nb_pixel_masked_max:

if args.nb_pixel_masked_max is not None:
max_num_masked_pixels = args.nb_pixel_masked_max
elif linear_binning:
max_num_masked_pixels = int(args.nb_pixel_masked_frac_max*(np.max(lambda_new)-np.min(lambda_new))/pixel_step) #this only accounts for masking inside the spectrum, not at the edges
else:
max_num_masked_pixels = int(args.nb_pixel_masked_frac_max*(np.max(log_lambda_new)-np.min(log_lambda_new))/pixel_step)

if num_masked_pixels > max_num_masked_pixels:
continue

# Compute pk_raw, needs uniform binning
@@ -388,16 +407,30 @@ def main(cmdargs):

parser.add_argument('--nb-pixel-min',
type=int,
default=75,
default=None,
required=False,
help='Minimal number of pixels in a part of forest')


parser.add_argument('--nb-pixel-frac-min',
type=float,
default=None,
required=False,
help='Minimal fraction of pixels in a part of forest, relative to the theoretical forest length (for reference, a z=2.5 QSO has ~577 pixels in the 1050-1180A forest, so the previous default of 75 pixels corresponds to ~0.13)')

parser.add_argument(
'--nb-pixel-masked-max',
type=int,
default=40,
default=None,  # previously 40
required=False,
help='Maximal number of masked pixels in a part of forest')

parser.add_argument(
'--nb-pixel-masked-frac-max',
type=float,
default=None,
required=False,
help='Maximal fraction of masked pixels in a part of forest (a value of 0.21 of the forest length is similar to the previous default of 40 at z=2.5 for a 3 chunk spectrum and 1050-1180A)')

parser.add_argument('--no-apply-filling',
action='store_true',
@@ -503,6 +536,25 @@ def main(cmdargs):
#create output dir if it does not exist
os.makedirs(args.out_dir, exist_ok=True)

if args.nb_pixel_min is None and args.nb_pixel_frac_min is None:
# could make this fraction a new default, but that would probably cause trouble for SBs
# args.nb_pixel_frac_min = 0.13 #this is a suggestion for a new default
args.nb_pixel_min = 75 #this is the previous default
elif not (args.nb_pixel_frac_min is None or args.nb_pixel_min is None):
print("both nb_pixel_frac_min and nb_pixel_min were set, using the latter")
args.nb_pixel_frac_min=None

if args.nb_pixel_masked_frac_max is None and args.nb_pixel_masked_max is None:
# could make this, i.e. 21% of the estimated forest length, the new default
# args.nb_pixel_masked_frac_max = 0.21 #this is a suggestion for a new default
args.nb_pixel_masked_max = 40 #this is the previous default
elif not (args.nb_pixel_masked_frac_max is None or args.nb_pixel_masked_max is None):
print("both nb_pixel_masked_frac_max and nb_pixel_masked_max were set, using the latter")
args.nb_pixel_masked_frac_max=None


print([[i, f] for i, f in enumerate(files)])

if args.num_processors > 1:
pool = Pool(args.num_processors)
index_file_args = [(i, f, args) for i, f in enumerate(files)]
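As a rough illustration of the new chunk-length cuts above, the following minimal numpy sketch mirrors how an explicit --nb-pixel-min takes precedence and how a fractional cut scales with redshift; the 0.8 A pixel step, the z=2.5 quasar and the 0.13 fraction are assumed example values here (0.13 is only the suggestion left commented out in main(), not an adopted default).

import numpy as np

def resolve_min_num_pixels(z_qso, pixel_step, linear_binning,
                           nb_pixel_min=None, nb_pixel_frac_min=0.13):
    # Theoretical forest length in pixels for the hard-coded 1050-1180 A
    # rest-frame range, as computed in process_all_files().
    if linear_binning:
        max_num_pixels = (1180 - 1050) * (z_qso + 1) / pixel_step
    else:
        max_num_pixels = (np.log10(1180) - np.log10(1050)) / pixel_step
    # An explicit absolute cut wins; otherwise the cut is a fraction of the
    # theoretical forest length and therefore grows with redshift.
    if nb_pixel_min is not None:
        return nb_pixel_min
    return int(nb_pixel_frac_min * max_num_pixels)

# A z=2.5 quasar with an assumed 0.8 A linear pixel has ~569 forest pixels,
# so a 0.13 fraction gives a ~73-pixel minimum, close to the old fixed 75.
print(resolve_min_num_pixels(2.5, 0.8, linear_binning=True))

The same precedence applies to --nb-pixel-masked-max versus --nb-pixel-masked-frac-max, except that the fractional version is evaluated against the wavelength span of the chunk actually being processed.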
2 changes: 1 addition & 1 deletion py/picca/_version.py
@@ -1 +1 @@
__version__ = '9.8.0'
__version__ = '9.11.0'
35 changes: 22 additions & 13 deletions py/picca/pk1d/postproc_pk1d.py
@@ -797,7 +797,11 @@ def compute_average_pk_redshift(
snr_bins = (snr_bin_edges[:-1] + snr_bin_edges[1:]) / 2

data_values = p1d_table[col][select]
data_values = data_values[mask_nan_p1d_values]
data_snr = p1d_table["forest_snr"][select]
data_snr, data_values = (
data_snr[mask_nan_p1d_values],
data_values[mask_nan_p1d_values],
)
# Fit function to observed dispersion in col:
standard_dev_col, _, _ = binned_statistic(
data_snr, data_values, statistic="std", bins=snr_bin_edges
@@ -820,15 +824,20 @@
variance_estimated_col = fitfunc_variance_pk1d(data_snr, *coef_col)
weights_col = 1.0 / variance_estimated_col
if apply_z_weights:
weights *= redshift_weights
mean = np.average(data_values, weights=weights)
mean = np.average(data_values, weights=weights * redshift_weights)
else:
mean = np.average(data_values, weights=weights)
if apply_z_weights:
# Analytic expression for the re-weighted average:
error = np.sqrt(np.sum(weights_col * redshift_weights)) / np.sum(
weights_col
)
else:
error = np.sqrt(1.0 / np.sum(weights_col))
# Variance estimator derived by Jean-Marc; we keep the estimated one.
# error = np.sqrt(((np.sum(weights)**2 / np.sum(weights**2)) - 1 )**(-1) * (
# ( np.sum(weights**2 * data_values**2) / np.sum(weights**2) ) - (
# np.sum(weights * data_values)/ np.sum(weights) )**2 ))
if col == "Pk":
standard_dev = np.concatenate(
[
@@ -1142,7 +1151,7 @@ def compute_groups_for_one_forest(nbins_k, p1d_los):
if number_in_bins != 0:
weight = p1d_los["weight"][mask_ikbin][0]
p1d_weights_id[ikbin] = weight
covariance_weights_id[ikbin] = np.sqrt(weight / number_in_bins)
covariance_weights_id[ikbin] = weight / number_in_bins
p1d_groups_id[ikbin] = np.nansum(
p1d_los["pk"][mask_ikbin] * covariance_weights_id[ikbin]
)
@@ -1199,11 +1208,11 @@ def compute_cov(
mean_pk_product = np.outer(mean_pk, mean_pk)

sum_p1d_weights = np.nansum(p1d_weights, axis=0)
weights_sum_of_product = np.outer(sum_p1d_weights, sum_p1d_weights)
weights_sum_product = np.outer(sum_p1d_weights, sum_p1d_weights)

p1d_groups_product_sum = np.zeros((nbins_k, nbins_k))
covariance_weights_product_of_sum = np.zeros((nbins_k, nbins_k))
weights_product_of_sum = np.zeros((nbins_k, nbins_k))
covariance_weights_product_sum = np.zeros((nbins_k, nbins_k))
weights_product_sum = np.zeros((nbins_k, nbins_k))

for i, p1d_group in enumerate(p1d_groups):
# The summation is done with np.nansum instead of simple addition to not
@@ -1212,21 +1221,21 @@
p1d_groups_product_sum = np.nansum(
[p1d_groups_product_sum, np.outer(p1d_group, p1d_group)], axis=0
)
covariance_weights_product_of_sum = np.nansum(
covariance_weights_product_sum = np.nansum(
[
covariance_weights_product_of_sum,
covariance_weights_product_sum,
np.outer(covariance_weights[i], covariance_weights[i]),
],
axis=0,
)
weights_product_of_sum = np.nansum(
[weights_product_of_sum, np.outer(p1d_weights[i], p1d_weights[i])], axis=0
weights_product_sum = np.nansum(
[weights_product_sum, np.outer(p1d_weights[i], p1d_weights[i])], axis=0
)

del p1d_groups, covariance_weights, p1d_weights

covariance_matrix = (weights_product_of_sum / weights_sum_of_product) * (
(p1d_groups_product_sum / covariance_weights_product_of_sum) - mean_pk_product
covariance_matrix = ((weights_sum_product / weights_product_sum) - 1)**(-1) * (
(p1d_groups_product_sum / covariance_weights_product_sum) - mean_pk_product
)

# For fit_snr method, due to the SNR fitting scheme used for weighting,
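The reworked compute_cov() expression has the form of an estimator for the covariance of the weighted-mean P1D over forests. The numpy sketch below shows that shape only; it assumes a single weight per forest and k-bin and leaves out the separate per-bin covariance weights (weight divided by pixel count) and the NaN handling of the real code.

import numpy as np

def covariance_of_weighted_mean(pk, weights):
    # pk, weights: arrays of shape (n_forests, nbins_k), one row per forest.
    sum_weights = weights.sum(axis=0)
    weights_sum_product = np.outer(sum_weights, sum_weights)         # S_i * S_j
    weights_product_sum = np.einsum("fi,fj->ij", weights, weights)   # sum_f w_fi * w_fj
    mean_pk = np.average(pk, axis=0, weights=weights)
    second_moment = (np.einsum("fi,fj,fi,fj->ij", weights, weights, pk, pk)
                     / weights_product_sum)
    prefactor = 1.0 / (weights_sum_product / weights_product_sum - 1.0)
    return prefactor * (second_moment - np.outer(mean_pk, mean_pk))

# For unit weights the prefactor is 1/(n_forests - 1) and the result reduces
# to the sample covariance of the forests divided by n_forests, i.e. the
# covariance of the mean rather than of an individual measurement.
rng = np.random.default_rng(0)
pk = rng.normal(size=(500, 4))
print(np.allclose(covariance_of_weighted_mean(pk, np.ones_like(pk)),
                  np.cov(pk, rowvar=False) / 500))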
20 changes: 16 additions & 4 deletions py/picca/raw_io.py
@@ -76,8 +76,14 @@ def read_transmission_file(filename, num_bins, objs_thingid, tracer='F_LYA', lam
if np.in1d(thingid, objs_thingid).sum() == 0:
hdul.close()
return
ra = hdul['METADATA']['RA'][:].astype(np.float64) * np.pi / 180.
dec = hdul['METADATA']['DEC'][:].astype(np.float64) * np.pi / 180.
if 'RA' in hdul['METADATA'].get_colnames() and 'DEC' in hdul['METADATA'].get_colnames():
ra = hdul['METADATA']['RA'][:].astype(np.float64) * np.pi / 180.
dec = hdul['METADATA']['DEC'][:].astype(np.float64) * np.pi / 180.
elif 'TARGET_RA' in hdul['METADATA'].get_colnames() and 'TARGET_DEC' in hdul['METADATA'].get_colnames():
ra = hdul['METADATA']['TARGET_RA'][:].astype(np.float64) * np.pi / 180.
dec = hdul['METADATA']['TARGET_DEC'][:].astype(np.float64) * np.pi / 180.
else:
raise ValueError('RA and DEC columns not found in transmission files.')
z = hdul['METADATA']['Z'][:]

# Use "lambda_array" to store either lambda or log lambda
@@ -314,8 +320,14 @@ def convert_transmission_to_deltas(obj_path, out_dir, in_dir=None, in_filenames=

w = hdu[z_key][:] > max(0., lambda_min / lambda_max_rest_frame - 1.)
w &= hdu[z_key][:] < max(0., lambda_max / lambda_min_rest_frame - 1.)
objs_ra = hdu['RA'][:][w].astype('float64') * np.pi / 180.
objs_dec = hdu['DEC'][:][w].astype('float64') * np.pi / 180.
if 'RA' in key_val and 'DEC' in key_val:
objs_ra = hdu['RA'][:][w].astype('float64') * np.pi / 180.
objs_dec = hdu['DEC'][:][w].astype('float64') * np.pi / 180.
elif 'TARGET_RA' in key_val and 'TARGET_DEC' in key_val:
objs_ra = hdu['TARGET_RA'][:][w].astype('float64') * np.pi / 180.
objs_dec = hdu['TARGET_DEC'][:][w].astype('float64') * np.pi / 180.
else:
raise ValueError('RA and DEC columns not found in objects catalog.')
objs_thingid = objs_thingid[w]
hdul.close()
userprint('INFO: Found {} quasars'.format(objs_ra.size))
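Both readers now fall back to TARGET_RA/TARGET_DEC when plain RA/DEC columns are missing. A minimal sketch of that lookup is shown below; it works on a plain mapping from column name to array rather than on the fitsio HDU objects used in raw_io.py, so the dictionary and its contents are illustrative only.

import numpy as np

def get_radec_radians(columns):
    # columns: mapping from column name to array of coordinates in degrees,
    # e.g. built from the METADATA HDU of a transmission file or from an
    # objects catalog.
    for ra_key, dec_key in (("RA", "DEC"), ("TARGET_RA", "TARGET_DEC")):
        if ra_key in columns and dec_key in columns:
            ra = np.asarray(columns[ra_key], dtype=np.float64) * np.pi / 180.0
            dec = np.asarray(columns[dec_key], dtype=np.float64) * np.pi / 180.0
            return ra, dec
    raise ValueError("RA and DEC columns not found.")

# Example: a mock catalog that only carries the TARGET_* variants.
catalog = {"TARGET_RA": [150.1, 210.4], "TARGET_DEC": [2.2, -0.5]}
ra, dec = get_radec_radians(catalog)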
Binary file modified py/picca/tests/data/test_Pk1D/meanPk1D_covariance.fits.gz
