Skip to content

Commit

Permalink
Merge branch 'features' of https://github.com/kujaku11/mth5 into feat…
Browse files Browse the repository at this point in the history
…ures
  • Loading branch information
kujaku11 committed Feb 4, 2025
2 parents 2ba6552 + 3e46a59 commit 5450735
Show file tree
Hide file tree
Showing 2 changed files with 250 additions and 114 deletions.
145 changes: 78 additions & 67 deletions mth5/data/make_mth5_from_asc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,27 @@
that originally came from EMTF -- test1.asc and test2.asc. Each ascii file
represents five channels of data sampled at 1Hz at a synthetic station.
Mirroring the original ascii files are:
data/test1.h5
data/test2.h5
data/test12rr.h5
TODO: Separate the handling of legacy EMTF data files, such as
reading into a dataframe from oddly delimited data, as well as flipping polarities of
the electric channels (possibly due to a baked in sign convention error in the legacy
data), so that a simple dataframe can be passed. That will make the methods here more
easily generalize to work with other dataframes. That would be useful in future when
we are creating synthetic data at an arbitrary sample rate.
Development Notes:
Mirroring the original ascii files are:
data/test1.h5
data/test2.h5
data/test12rr.h5
Also created are some files with the same data but other channel_nomenclature schemes:
data/test12rr_LEMI34.h5
data/test1_LEMI12.h5
- 20231103: Added an 8Hz up-sampled version of test1. No spectral content was added
so the band between the old and new Nyquist frequencies is bogus.
Also created are some files with the same data but other channel_nomenclature schemes:
data/test12rr_LEMI34.h5
data/test1_LEMI12.h5
- 20231103: Added an 8Hz up-sampled version of test1. No spectral content was added
so the band between the old and new Nyquist frequencies is bogus.
"""
# =============================================================================
Expand All @@ -38,17 +48,20 @@
from mth5.data.station_config import make_station_03
from mth5.data.station_config import make_station_04
from mth5.data.station_config import SyntheticRun
from mth5.data.station_config import SyntheticStation
from mth5.mth5 import MTH5
from mth5.timeseries import ChannelTS, RunTS
from mth5.utils.helpers import add_filters
from mt_metadata.transfer_functions.processing.aurora import (
ChannelNomenclature,
)
from mt_metadata.transfer_functions.processing.aurora.channel_nomenclature import SupportedNomenclature

from mt_metadata.timeseries import Electric
from mt_metadata.timeseries import Magnetic
from mt_metadata.timeseries import Survey

from typing import Optional, Union
from typing import List, Literal, Optional, Union


# =============================================================================
Expand All @@ -59,22 +72,24 @@


def create_run_ts_from_synthetic_run(
run: SyntheticRun, df: pd.DataFrame, channel_nomenclature: str = "default"
run: SyntheticRun,
df: pd.DataFrame,
channel_nomenclature: SupportedNomenclature = "default"
):
"""
Loop over channels of synthetic data in df and make ChannelTS objects.
:type run: mth5.data.station_config.SyntheticRun
:param run: One-off data structure with information mth5 needs to initialize. Specifically sample_rate, filters.
:type df: pandas.DataFrame
:type run: mth5.data.station_config.SyntheticRun
:param df: time series data in columns labelled from ["ex", "ey", "hx", "hy", "hz"]
:type channel_nomenclature : string
:type df: pandas.DataFrame
:param channel_nomenclature : Keyword corresponding to channel nomenclature mapping
in CHANNEL_MAPS variable from channel_nomenclature.py module in mt_metadata.
Supported values include ['default', 'lemi12', 'lemi34', 'phoenix123']
:type channel_nomenclature : string
:rtype runts: RunTS
:return runts: MTH5 run time series object, data and metadata bound into one.
:rtype runts: RunTS
"""

Expand Down Expand Up @@ -126,7 +141,7 @@ def create_run_ts_from_synthetic_run(

def get_time_series_dataframe(
run: SyntheticRun,
source_folder: Optional[Union[pathlib.Path, str]],
source_folder: Union[pathlib.Path, str],
add_nan_values: Optional[bool] = False
) -> pd.DataFrame:
"""
Expand All @@ -136,10 +151,13 @@ def get_time_series_dataframe(
Only tested for 8, to make 8Hz data for testing. If run.sample_rate is default (1.0)
then no up-sampling takes place.
TODO: Move noise, and nan addition out of this method.
:type run: mth5.data.station_config.SyntheticRun
:param run: Information needed to define/create the run
:type source_folder: Optional[Union[pathlib.Path, str]]
:param source_folder: Where to load the ascii time series from
:param source_folder: Where to load the ascii time series from. This overwrites any
previous value that may have been stored in the SyntheticRun
:type add_nan_values: bool
:param add_nan_values: If True, add some NaN, if False, do not add Nan.
:rtype df: pandas.DataFrame
Expand All @@ -150,31 +168,14 @@ def get_time_series_dataframe(
if source_folder:
run.raw_data_path = source_folder.joinpath(run.raw_data_path.name)

# read in data
df = pd.read_csv(run.raw_data_path, names=run.channels, sep="\s+")

# Invert electric channels to fix phase swap due to modeling coordinates.
df[df.columns[-2]] = -df[df.columns[-2]] # df["ex"] = -df["ex"]
df[df.columns[-1]] = -df[df.columns[-1]] # df["ey"] = -df["ey"]
# Column indices are used to avoid handling channel nomenclature here.

# upsample data if requested,
if run.run_metadata.sample_rate != 1.0:
df_orig = df.copy(deep=True)
new_data_dict = {}
for i_ch, ch in enumerate(run.channels):
data = df_orig[ch].to_numpy()
new_data_dict[ch] = ssig.resample(
data, int(run.run_metadata.sample_rate) * len(df_orig)
)
df = pd.DataFrame(data=new_data_dict)

# add noise
df = run._get_timeseries_dataframe()

# add noise if requested
for col in run.channels:
if run.noise_scalars[col]:
df[col] += run.noise_scalars[col] * np.random.randn(len(df))

# add nan
# add nan if requested
if add_nan_values:
for col in run.channels:
for [ndx, num_nan] in run.nan_indices[col]:
Expand All @@ -183,51 +184,55 @@ def get_time_series_dataframe(


def create_mth5_synthetic_file(
station_cfgs: list,
station_cfgs: List[SyntheticStation],
mth5_name: Union[pathlib.Path, str],
target_folder: Optional[Union[pathlib.Path, str]] = "",
source_folder: Optional[Union[pathlib.Path, str]] = "",
plot: Optional[bool] = False,
add_nan_values: Optional[bool] = False,
file_version: Optional[str] = "0.1.0",
channel_nomenclature: Optional[str] = "default",
force_make_mth5: Optional[bool] = True,
survey_metadata: Optional[Union[Survey, None]] = None,
source_folder: Union[pathlib.Path, str] = "",
plot: bool = False,
add_nan_values: bool = False,
file_version: Literal["0.1.0", "0.2.0"] = "0.1.0",
channel_nomenclature: SupportedNomenclature = "default",
force_make_mth5: bool = True,
survey_metadata: Optional[Survey] = None,
):
"""
Creates an MTH5 from synthetic data
Creates an MTH5 from synthetic data.
Development Notes:
20250203: This function could be made more general, so that it operates on dataframes and legacy emtf ascii files.
:type station_cfgs: list
:param station_cfgs: Elements of the list are each dicts. The dicts are one-off
:param station_cfgs: Iterable of objects of type SyntheticStation. These are one-off
data structure used to hold information mth5 needs to initialize, specifically
sample_rate, filters, etc.
:type mth5_name: Union[pathlib.Path, str]
:type station_cfgs: List[SyntheticStation]
:param mth5_name: Where the mth5 will be stored. This is generated by the station_config,
but may change in this method based on add_nan_values or channel_nomenclature
:type target_folder: Optional[Union[pathlib.Path, str]]
:type mth5_name: Union[pathlib.Path, str]
:param target_folder: Where the mth5 file will be stored
:type source_folder: Optional[Union[pathlib.Path, str]] = "",
:type target_folder: Optional[Union[pathlib.Path, str]]
:param source_folder: Where the ascii source data are stored
:type plot: bool
:type source_folder: Optional[Union[pathlib.Path, str]] = "",
:param plot: Set to false unless you want to look at a plot of the time series
:type add_nan_values: bool
:type plot: bool
:param add_nan_values: If true, some np.nan are sprinkled into the time series. Intended to be used for tests.
:type file_version: str
:param file_version: One of ["0.1.0", "0.2.0"], corresponding to the version of mth5 to create
:type channel_nomenclature: str
:param channel_nomenclature: Keyword corresponding to channel nomenclature mapping in CHANNEL_MAPS variable
from channel_nomenclature.py module in mt_metadata. Supported values are ['default', 'lemi12', 'lemi34', 'phoenix123']
:type add_nan_values: bool
:param file_version: One of the supported mth5 file versions. This is the version of mth5 to create.
:type file_version: Literal["0.1.0", "0.2.0"] = "0.1.0",
:param channel_nomenclature: Keyword corresponding to channel nomenclature mapping in CHANNEL_MAPS variable,
for example ['default', 'lemi12', 'lemi34', 'phoenix123']
A full list is in mt_metadata/transfer_functions/processing/aurora/standards/channel_nomenclatures.json
:type force_make_mth5: bool
:param force_make_mth5: str
:type channel_nomenclature: SupportedNomenclature
:param force_make_mth5: If set to true, the file will be made, even if it already exists.
If false, and file already exists, skip the make job.
:type survey_metadata: Survey
:type force_make_mth5: bool
:param survey_metadata: Option to provide survey metadata, otherwise it will be created.
:rtype: mth5_path: pathlib.Path
:type survey_metadata: Survey
:return: The path to the stored h5 file.
:rtype: mth5_path: pathlib.Path
"""

# Handle path and file name conventions
if not target_folder:
msg = f"No target folder provided for making {mth5_name}"
logger.warning(msg)
Expand All @@ -238,15 +243,16 @@ def create_mth5_synthetic_file(
try:
target_folder.mkdir(exist_ok=True, parents=True)
except OSError:
msg = "Aurora maybe installed on a read-only file system"
msg = f"{msg}: try setting target_path argument when calling create_mth5_synthetic_file"
msg = "MTH5 maybe installed on a read-only file system"
msg = f"{msg}: try setting `target_folder` argument when calling create_mth5_synthetic_file"
logger.error(msg)

mth5_path = target_folder.joinpath(mth5_name)
mth5_path = _update_mth5_path(
mth5_path, add_nan_values, channel_nomenclature
)

# Only create file if needed
if not force_make_mth5:
if mth5_path.exists():
return mth5_path
Expand All @@ -266,12 +272,16 @@ def create_mth5_synthetic_file(
station_group = m.add_station(station_cfg.id, survey=survey_id)

for run in station_cfg.runs:
# run is object of type SyntheticRun
df = get_time_series_dataframe(
run=run,
source_folder=source_folder,
add_nan_values=add_nan_values
)

# TODO: Add handling for noise, nan, and upsampling here
# (they don't belong in get_time_series_dataframe())

# cast to run_ts
runts = create_run_ts_from_synthetic_run(
run, df, channel_nomenclature=channel_nomenclature
Expand Down Expand Up @@ -302,8 +312,8 @@ def create_test1_h5(
"""
Creates an MTH5 file for a single station named "test1".
:type file_version: str
:param file_version: One of ["0.1.0", "0.2.0"], corresponding to the version of mth5 to create
:type file_version: str
:type channel_nomenclature: Optional[str]
:param channel_nomenclature: Keyword corresponding to channel nomenclature mapping in CHANNEL_MAPS variable
from channel_nomenclature.py module in mt_metadata. Supported values are ['default', 'lemi12', 'lemi34', 'phoenix123']
Expand Down Expand Up @@ -584,7 +594,8 @@ def _update_mth5_path(
add_nan_values: bool,
channel_nomenclature: str
) -> pathlib.Path:
"""set name for output h5 file"""
""" Modify the name of output h5 file based on whether or not nan-data are included
as well as channel_nomenclature if not default. """
path_str = mth5_path.__str__()
if add_nan_values:
path_str = path_str.replace(".h5", "_nan.h5")
Expand Down
Loading

0 comments on commit 5450735

Please sign in to comment.