Merge branch 'features' of https://github.com/kujaku11/mth5 into features
kujaku11 · Feb 4, 2025 · 5450735 · 5450735
2 parents 2ba6552 + 3e46a59
commit 5450735
Show file tree

Hide file tree

Showing 2 changed files with 250 additions and 114 deletions.
diff --git a/mth5/data/make_mth5_from_asc.py b/mth5/data/make_mth5_from_asc.py
@@ -8,17 +8,27 @@
  that originally came from EMTF -- test1.asc and test2.asc.  Each ascii file
  represents five channels of data sampled at 1Hz at a synthetic station.
 
-Mirroring the original ascii files are:
-data/test1.h5
-data/test2.h5
-data/test12rr.h5
+TODO: Separate the handling of legacy EMTF data files, such as
+ reading into a dataframe from oddly delimited data, as well as flipping polarities of
+ the electric channels (possibly due to a baked in sign convention error in the legacy
+ data), so that a simple dataframe can be passed.  That will make the methods here more
+  easily generalize to work with other dataframes.  That would be useful in future when
+  we are creating synthetic data at arbitrary sample rates.
+
+Development Notes:
+ Mirroring the original ascii files are:
+ data/test1.h5
+ data/test2.h5
+ data/test12rr.h5
+
+ Also created are some files with the same data but other channel_nomenclature schemes:
+ data/test12rr_LEMI34.h5
+ data/test1_LEMI12.h5
+
+ - 20231103: Added an 8Hz up-sampled version of test1.  No spectral content was added
+ so the band between the old and new Nyquist frequencies is bogus.
 
-Also created are some files with the same data but other channel_nomenclature schemes:
-data/test12rr_LEMI34.h5
-data/test1_LEMI12.h5
 
-- 20231103: Added an 8Hz up-sampled version of test1.  No spectral content was added
-so the band between the old and new Nyquist frequencies is bogus.
 
 """
 # =============================================================================
@@ -38,17 +48,20 @@
 from mth5.data.station_config import make_station_03
 from mth5.data.station_config import make_station_04
 from mth5.data.station_config import SyntheticRun
+from mth5.data.station_config import SyntheticStation
 from mth5.mth5 import MTH5
 from mth5.timeseries import ChannelTS, RunTS
 from mth5.utils.helpers import add_filters
 from mt_metadata.transfer_functions.processing.aurora import (
     ChannelNomenclature,
 )
+from mt_metadata.transfer_functions.processing.aurora.channel_nomenclature import SupportedNomenclature
+
 from mt_metadata.timeseries import Electric
 from mt_metadata.timeseries import Magnetic
 from mt_metadata.timeseries import Survey
 
-from typing import Optional, Union
+from typing import List, Literal, Optional, Union
 
 
 # =============================================================================
@@ -59,22 +72,24 @@
 
 
 def create_run_ts_from_synthetic_run(
-    run: SyntheticRun, df: pd.DataFrame, channel_nomenclature: str = "default"
+    run: SyntheticRun,
+    df: pd.DataFrame,
+    channel_nomenclature: SupportedNomenclature = "default"
 ):
     """
     Loop over channels of synthetic data in df and make ChannelTS objects.
 
-    :type run: mth5.data.station_config.SyntheticRun
     :param run: One-off data structure with information mth5 needs to initialize. Specifically sample_rate, filters.
-    :type df: pandas.DataFrame
+    :type run: mth5.data.station_config.SyntheticRun
     :param df: time series data in columns labelled from ["ex", "ey", "hx", "hy", "hz"]
-    :type channel_nomenclature : string
+    :type df: pandas.DataFrame
     :param channel_nomenclature : Keyword corresponding to channel nomenclature mapping
     in CHANNEL_MAPS variable from channel_nomenclature.py module in mt_metadata.
     Supported values include ['default', 'lemi12', 'lemi34', 'phoenix123']
+    :type channel_nomenclature : string
 
-    :rtype runts: RunTS
     :return runts: MTH5 run time series object, data and metadata bound into one.
+    :rtype runts: RunTS
 
     """
 
@@ -126,7 +141,7 @@ def create_run_ts_from_synthetic_run(
 
 def get_time_series_dataframe(
     run: SyntheticRun,
-    source_folder: Optional[Union[pathlib.Path, str]],
+    source_folder: Union[pathlib.Path, str],
     add_nan_values: Optional[bool] = False
 ) -> pd.DataFrame:
     """
@@ -136,10 +151,13 @@ def get_time_series_dataframe(
     Only tested for 8, to make 8Hz data for testing.  If run.sample_rate is default (1.0)
     then no up-sampling takes place.
 
+    TODO: Move noise, and nan addition out of this method.
+
     :type run: mth5.data.station_config.SyntheticRun
     :param run: Information needed to define/create the run
     :type source_folder: Optional[Union[pathlib.Path, str]]
-    :param source_folder: Where to load the ascii time series from
+    :param source_folder: Where to load the ascii time series from.  This overwrites any
+    previous value that may have been stored in the SyntheticRun
     :type add_nan_values: bool
     :param add_nan_values: If True, add some NaN, if False, do not add Nan.
     :rtype df: pandas.DataFrame
@@ -150,31 +168,14 @@ def get_time_series_dataframe(
     if source_folder:
         run.raw_data_path = source_folder.joinpath(run.raw_data_path.name)
 
-    # read in data
-    df = pd.read_csv(run.raw_data_path, names=run.channels, sep="\s+")
-
-    # Invert electric channels to fix phase swap due to modeling coordinates.
-    df[df.columns[-2]] = -df[df.columns[-2]]  #  df["ex"] = -df["ex"]
-    df[df.columns[-1]] = -df[df.columns[-1]]  #  df["ey"] = -df["ey"]
-    #  Column indices are used to avoid handling channel nomenclature here.
-
-    # upsample data if requested,
-    if run.run_metadata.sample_rate != 1.0:
-        df_orig = df.copy(deep=True)
-        new_data_dict = {}
-        for i_ch, ch in enumerate(run.channels):
-            data = df_orig[ch].to_numpy()
-            new_data_dict[ch] = ssig.resample(
-                data, int(run.run_metadata.sample_rate) * len(df_orig)
-            )
-        df = pd.DataFrame(data=new_data_dict)
-
-    # add noise
+    df = run._get_timeseries_dataframe()
+
+    # add noise if requested
     for col in run.channels:
         if run.noise_scalars[col]:
             df[col] += run.noise_scalars[col] * np.random.randn(len(df))
 
-    # add nan
+    # add nan if requested
     if add_nan_values:
         for col in run.channels:
             for [ndx, num_nan] in run.nan_indices[col]:
@@ -183,51 +184,55 @@ def get_time_series_dataframe(
 
 
 def create_mth5_synthetic_file(
-    station_cfgs: list,
+    station_cfgs: List[SyntheticStation],
     mth5_name: Union[pathlib.Path, str],
     target_folder: Optional[Union[pathlib.Path, str]] = "",
-    source_folder: Optional[Union[pathlib.Path, str]] = "",
-    plot: Optional[bool] = False,
-    add_nan_values: Optional[bool] = False,
-    file_version: Optional[str] = "0.1.0",
-    channel_nomenclature: Optional[str] = "default",
-    force_make_mth5: Optional[bool] = True,
-    survey_metadata: Optional[Union[Survey, None]] = None,
+    source_folder: Union[pathlib.Path, str] = "",
+    plot: bool = False,
+    add_nan_values: bool = False,
+    file_version: Literal["0.1.0", "0.2.0"] = "0.1.0",
+    channel_nomenclature: SupportedNomenclature = "default",
+    force_make_mth5: bool = True,
+    survey_metadata: Optional[Survey] = None,
 ):
     """
-    Creates an MTH5 from synthetic data
+    Creates an MTH5 from synthetic data.
+
+    Development Notes:
+     20250203: This function could be made more general, so that it operates on dataframes and legacy emtf ascii files.
 
-    :type station_cfgs: list
-    :param station_cfgs: Elements of the list are each dicts. The dicts are one-off
+    :param station_cfgs: Iterable of objects of type SyntheticStation. These are one-off
     data structure used to hold information mth5 needs to initialize, specifically
     sample_rate, filters, etc.
-    :type mth5_name: Union[pathlib.Path, str]
+    :type station_cfgs: List[SyntheticStation]
     :param mth5_name: Where the mth5 will be stored.  This is generated by the station_config,
     but may change in this method based on add_nan_values or channel_nomenclature
-    :type target_folder: Optional[Union[pathlib.Path, str]]
+    :type mth5_name: Union[pathlib.Path, str]
     :param target_folder: Where the mth5 file will be stored
-    :type source_folder: Optional[Union[pathlib.Path, str]] = "",
+    :type target_folder: Optional[Union[pathlib.Path, str]]
     :param source_folder:  Where the ascii source data are stored
-    :type plot: bool
+    :type source_folder: Union[pathlib.Path, str]
     :param plot: Set to false unless you want to look at a plot of the time series
-    :type add_nan_values: bool
+    :type plot: bool
     :param add_nan_values: If true, some np.nan are sprinkled into the time series.  Intended to be used for tests.
-    :type file_version: str
-    :param file_version: One of ["0.1.0", "0.2.0"], corresponding to the version of mth5 to create
-    :type channel_nomenclature: str
-    :param channel_nomenclature: Keyword corresponding to channel nomenclature mapping in CHANNEL_MAPS variable
-    from channel_nomenclature.py module in mt_metadata. Supported values are ['default', 'lemi12', 'lemi34', 'phoenix123']
+    :type add_nan_values: bool
+    :param file_version: One of the supported mth5 file versions.  This is the version of mth5 to create.
+    :type file_version: Literal["0.1.0", "0.2.0"]
+    :param channel_nomenclature: Keyword corresponding to channel nomenclature mapping in CHANNEL_MAPS variable,
+    for example ['default', 'lemi12', 'lemi34', 'phoenix123']
     A full list is in mt_metadata/transfer_functions/processing/aurora/standards/channel_nomenclatures.json
-    :type force_make_mth5: bool
-    :param force_make_mth5: str
+    :type channel_nomenclature: SupportedNomenclature
     :param force_make_mth5: If set to true, the file will be made, even if it already exists.
     If false, and file already exists, skip the make job.
-    :type survey_metadata: Survey
+    :type force_make_mth5: bool
     :param survey_metadata: Option to provide survey metadata, otherwise it will be created.
-    :rtype: mth5_path: pathlib.Path
+    :type survey_metadata: Survey
     :return: The path to the stored h5 file.
+    :rtype: mth5_path: pathlib.Path
+
     """
 
+    # Handle path and file name conventions
     if not target_folder:
         msg = f"No target folder provided for making {mth5_name}"
         logger.warning(msg)
@@ -238,15 +243,16 @@ def create_mth5_synthetic_file(
     try:
         target_folder.mkdir(exist_ok=True, parents=True)
     except OSError:
-        msg = "Aurora maybe installed on a read-only file system"
-        msg = f"{msg}: try setting target_path argument when calling create_mth5_synthetic_file"
+        msg = "MTH5 maybe installed on a read-only file system"
+        msg = f"{msg}: try setting `target_folder` argument when calling create_mth5_synthetic_file"
         logger.error(msg)
 
     mth5_path = target_folder.joinpath(mth5_name)
     mth5_path = _update_mth5_path(
         mth5_path, add_nan_values, channel_nomenclature
     )
 
+    # Only create file if needed
     if not force_make_mth5:
         if mth5_path.exists():
             return mth5_path
@@ -266,12 +272,16 @@ def create_mth5_synthetic_file(
             station_group = m.add_station(station_cfg.id, survey=survey_id)
 
             for run in station_cfg.runs:
+                # run is object of type SyntheticRun
                 df = get_time_series_dataframe(
                     run=run,
                     source_folder=source_folder,
                     add_nan_values=add_nan_values
                 )
 
+                # TODO: Add handling for noise, nan, and upsampling here
+                #  (They don't belong in get_time_series_dataframe().)
+
                 # cast to run_ts
                 runts = create_run_ts_from_synthetic_run(
                     run, df, channel_nomenclature=channel_nomenclature
@@ -302,8 +312,8 @@ def create_test1_h5(
     """
     Creates an MTH5 file for a single station named "test1".
 
-    :type file_version: str
     :param file_version: One of ["0.1.0", "0.2.0"], corresponding to the version of mth5 to create
+    :type file_version: str
     :type channel_nomenclature: Optional[str]
     :param channel_nomenclature: Keyword corresponding to channel nomenclature mapping in CHANNEL_MAPS variable
     from channel_nomenclature.py module in mt_metadata. Supported values are ['default', 'lemi12', 'lemi34', 'phoenix123']
@@ -584,7 +594,8 @@ def _update_mth5_path(
     add_nan_values: bool,
     channel_nomenclature: str
 ) -> pathlib.Path:
-    """set name for output h5 file"""
+    """ Modify the name of output h5 file based on whether or not nan-data are included
+     as well as channel_nomenclature if not default. """
     path_str = mth5_path.__str__()
     if add_nan_values:
         path_str = path_str.replace(".h5", "_nan.h5")