lhotse-speech · desh2608 · Apr 6, 2023 · Apr 6, 2023 · Apr 6, 2023 · Apr 6, 2023
diff --git a/lhotse/cut/data.py b/lhotse/cut/data.py
@@ -710,7 +710,7 @@ def pad(
         duration: Seconds = None,
         num_frames: int = None,
         num_samples: int = None,
-        pad_feat_value: float = LOG_EPSILON,
+        pad_feat_value: Optional[float] = None,
         direction: str = "right",
         preserve_id: bool = False,
         pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
@@ -725,7 +725,8 @@ def pad(
         :param num_frames: The cut's total number of frames after padding.
         :param num_samples: The cut's total number of samples after padding.
         :param pad_feat_value: A float value that's used for padding the features.
-            By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
+            By default, we will use the value defined in the `FeatureExtractor.padding_value`
+            for the feature type.
         :param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
             the cut.
         :param preserve_id: When ``True``, preserves the cut ID before padding.

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
@@ -534,7 +534,7 @@ def pad(
         duration: Seconds = None,
         num_frames: int = None,
         num_samples: int = None,
-        pad_feat_value: float = LOG_EPSILON,
+        pad_feat_value: Optional[float] = None,
         direction: str = "right",
         preserve_id: bool = False,
         pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
@@ -549,7 +549,8 @@ def pad(
         :param num_frames: The cut's total number of frames after padding.
         :param num_samples: The cut's total number of samples after padding.
         :param pad_feat_value: A float value that's used for padding the features.
-            By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
+            By default, we will use the value defined in the `FeatureExtractor.padding_value`
+            for the feature type.
         :param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
             the cut.
         :param preserve_id: When ``True``, preserves the cut ID from before padding.
@@ -870,6 +871,7 @@ def load_features(self, mixed: bool = True) -> Optional[np.ndarray]:
                 feats = reference_feats  # manual caching to avoid duplicated I/O
             else:
                 feats = track.cut.load_features()
+
             mixer.add_to_mix(
                 feats=feats,
                 snr=track.snr,

diff --git a/lhotse/cut/padding.py b/lhotse/cut/padding.py
@@ -41,7 +41,7 @@ class PaddingCut(Cut):
     id: str
     duration: Seconds
     sampling_rate: int
-    feat_value: float
+    feat_value: Optional[float] = None
 
     # For frequency domain
     num_frames: Optional[int] = None
@@ -187,7 +187,7 @@ def pad(
         duration: Seconds = None,
         num_frames: int = None,
         num_samples: int = None,
-        pad_feat_value: float = LOG_EPSILON,
+        pad_feat_value: Optional[float] = None,
         direction: str = "right",
         preserve_id: bool = False,
         pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
@@ -202,7 +202,8 @@ def pad(
         :param num_frames: The cut's total number of frames after padding.
         :param num_samples: The cut's total number of samples after padding.
         :param pad_feat_value: A float value that's used for padding the features.
-            By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
+            By default, we will use the value defined in the `FeatureExtractor.padding_value`
+            for the feature type.
         :param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
             the cut.
         :param preserve_id: When ``True``, preserves the cut ID from before padding.
@@ -388,6 +389,7 @@ def compute_and_store_features(
         """
         return fastcopy(
             self,
+            feat_value=extractor.padding_value,
             num_features=extractor.feature_dim(self.sampling_rate),
             num_frames=compute_num_frames(
                 duration=self.duration,

diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
@@ -41,7 +41,11 @@
 from lhotse.cut.multi import MultiCut
 from lhotse.cut.padding import PaddingCut
 from lhotse.features import FeatureExtractor, Features, FeatureSet
-from lhotse.features.base import StatsAccumulator, compute_global_stats
+from lhotse.features.base import (
+    StatsAccumulator,
+    compute_global_stats,
+    create_default_feature_extractor,
+)
 from lhotse.features.io import FeaturesWriter, LilcomChunkyWriter
 from lhotse.lazy import AlgorithmMixin
 from lhotse.serialization import Serializable
@@ -1365,7 +1369,7 @@ def pad(
         duration: Seconds = None,
         num_frames: int = None,
         num_samples: int = None,
-        pad_feat_value: float = LOG_EPSILON,
+        pad_feat_value: Optional[float] = None,
         direction: str = "right",
         preserve_id: bool = False,
         pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
@@ -1384,7 +1388,8 @@ def pad(
         :param num_frames: The cut's total number of frames after padding.
         :param num_samples: The cut's total number of samples after padding.
         :param pad_feat_value: A float value that's used for padding the features.
-            By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
+            By default, we will use the value defined in the `FeatureExtractor.padding_value`
+            for the feature type.
         :param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added
             before or after the cut.
         :param preserve_id: When ``True``, preserves the cut ID from before padding.
@@ -2659,7 +2664,7 @@ def pad(
     duration: Seconds = None,
     num_frames: int = None,
     num_samples: int = None,
-    pad_feat_value: float = LOG_EPSILON,
+    pad_feat_value: Optional[float] = None,
     direction: str = "right",
     preserve_id: bool = False,
     pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
@@ -2675,7 +2680,8 @@ def pad(
     :param num_frames: The cut's total number of frames after padding.
     :param num_samples: The cut's total number of samples after padding.
     :param pad_feat_value: A float value that's used for padding the features.
-        By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
+        By default, we will use the value defined in the `FeatureExtractor.padding_value`
+        for the feature type.
     :param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
         the cut.
     :param preserve_id: When ``True``, preserves the cut ID before padding.
@@ -2763,6 +2769,16 @@ def pad(
             else None
         )
 
+    # If the user has not specified a feature value for padding, we will use the default value
+    # from the feature extractor.
+    if pad_feat_value is None and cut.has_features:
+        if isinstance(cut, PaddingCut):
+            pad_feat_value = cut.feat_value
+        else:
+            pad_feat_value = create_default_feature_extractor(
+                cut.features_type
+            ).padding_value
+
     padding_cut = PaddingCut(
         id=str(uuid4()),
         duration=round(duration - cut.duration, ndigits=8),

diff --git a/lhotse/features/base.py b/lhotse/features/base.py
@@ -52,6 +52,9 @@ class FeatureExtractor(metaclass=ABCMeta):
     * ``compute_energy``, and
     * ``mix``.
 
+    They should also implement the ``padding_value`` property, which will be used for padding features
+    during mixing. See :class:`~lhotse.features.FeatureMixer` for more details.
+
     By itself, the ``FeatureExtractor`` offers the following high-level methods
     that are not intended for overriding:
 
@@ -90,6 +93,10 @@ def frame_shift(self) -> Seconds:
     def feature_dim(self, sampling_rate: int) -> int:
         ...
 
+    @property
+    def padding_value(self) -> Optional[float]:
+        return None
+
     @property
     def device(self) -> Union[str, torch.device]:
         return "cpu"

diff --git a/lhotse/features/fbank.py b/lhotse/features/fbank.py
@@ -54,6 +54,10 @@ def _feature_fn(self, *args, **kwargs):
     def feature_dim(self, sampling_rate: int) -> int:
         return self.config.num_mel_bins
 
+    @property
+    def padding_value(self) -> float:
+        return -1000.0
+
     @staticmethod
     def mix(
         features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float

diff --git a/lhotse/features/kaldi/extractors.py b/lhotse/features/kaldi/extractors.py
@@ -80,6 +80,11 @@ def to(self, device: str):
         self.config.device = device
         self.extractor.to(device)
 
+    @property
+    def padding_value(self) -> float:
+        """Return the value that should be used to pad these features."""
+        return -1000.0
+
     def feature_dim(self, sampling_rate: int) -> int:
         return self.config.num_filters
 
@@ -205,6 +210,11 @@ def device(self) -> Union[str, torch.device]:
     def frame_shift(self) -> Seconds:
         return self.config.frame_shift
 
+    @property
+    def padding_value(self) -> float:
+        """Return the value that should be used to pad these features."""
+        return -1000.0
+
     def feature_dim(self, sampling_rate: int) -> int:
         return self.config.num_ceps
 
@@ -301,6 +311,11 @@ def device(self) -> Union[str, torch.device]:
     def frame_shift(self) -> Seconds:
         return self.config.frame_shift
 
+    @property
+    def padding_value(self) -> float:
+        """Return the value that should be used to pad these features."""
+        return EPSILON
+
     def feature_dim(self, sampling_rate: int) -> int:
         return self.config.num_ceps
 

diff --git a/lhotse/features/kaldifeat.py b/lhotse/features/kaldifeat.py
@@ -193,6 +193,10 @@ def __init__(self, config: Optional[KaldifeatFbankConfig] = None) -> None:
     def feature_dim(self, sampling_rate: int) -> int:
         return self.config.mel_opts.num_bins
 
+    @property
+    def padding_value(self) -> float:
+        return -1000.0 if self.config.use_log_fbank else EPSILON
+
     @staticmethod
     def mix(
         features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float
@@ -259,3 +263,7 @@ def __init__(self, config: Optional[KaldifeatMfccConfig] = None) -> None:
 
     def feature_dim(self, sampling_rate: int) -> int:
         return self.config.num_ceps
+
+    @property
+    def padding_value(self) -> float:
+        return -1000.0
diff --git a/lhotse/features/librosa_fbank.py b/lhotse/features/librosa_fbank.py
@@ -42,7 +42,7 @@ def pad_or_truncate_features(
     feats: np.ndarray,
     expected_num_frames: int,
     abs_tol: int = 1,
-    pad_value: float = LOG_EPSILON,
+    pad_value: float = -1000,
 ):
     frames_diff = feats.shape[0] - expected_num_frames
 
@@ -53,7 +53,7 @@ def pad_or_truncate_features(
             feats,
             ((0, -frames_diff), (0, 0)),
             mode="constant",
-            constant_values=LOG_EPSILON,
+            constant_values=pad_value,
         )
     elif abs(frames_diff) > abs_tol:
         raise ValueError(
@@ -148,6 +148,10 @@ class LibrosaFbank(FeatureExtractor):
     def frame_shift(self) -> Seconds:
         return self.config.hop_size / self.config.sampling_rate
 
+    @property
+    def padding_value(self) -> float:
+        return -1000.0
+
     def feature_dim(self, sampling_rate: int) -> int:
         return self.config.num_mel_bins
 

diff --git a/lhotse/features/mfcc.py b/lhotse/features/mfcc.py
@@ -53,3 +53,7 @@ def _feature_fn(self, *args, **kwargs):
 
     def feature_dim(self, sampling_rate: int) -> int:
         return self.config.num_ceps
+
+    @property
+    def padding_value(self) -> float:
+        return -1000.0
diff --git a/lhotse/features/mixer.py b/lhotse/features/mixer.py
@@ -29,7 +29,7 @@ def __init__(
         feature_extractor: FeatureExtractor,
         base_feats: np.ndarray,
         frame_shift: Seconds,
-        padding_value: float = -1000.0,
+        padding_value: Optional[float] = None,
         reference_energy: Optional[float] = None,
     ):
         """
@@ -40,8 +40,7 @@ def __init__(
             in terms of energy and offset for all features mixed into them.
         :param frame_shift: Required to correctly compute offset and padding during the mix.
         :param padding_value: The value used to pad the shorter features during the mix.
-            This value is adequate only for log space features. For non-log space features,
-            e.g. energies, use either 0 or a small positive value like 1e-5.
+            If not provided, we will default to ``feature_extractor.padding_value``.
         :param reference_energy: Optionally pass a reference energy value to compute SNRs against.
             This might be required when ``base_feats`` correspond to padding energies.
         """
@@ -50,7 +49,18 @@ def __init__(
         self.num_channels = 1 if base_feats.ndim == 2 else base_feats.shape[-1]
         self.gains = []
         self.frame_shift = frame_shift
-        self.padding_value = padding_value
+        # Compare against None explicitly: 0.0 is a valid padding value for non-log
+        # features, and a truthiness test (``or``) would silently discard it.
+        self.padding_value = (
+            padding_value
+            if padding_value is not None
+            else feature_extractor.padding_value
+        )
+        if self.padding_value is None:
+            raise ValueError(
+                f"FeatureMixer requires `padding_value` to be set, since {feature_extractor}"
+                f" does not define a default one."
+            )
         self.dtype = self.tracks[0].dtype
 
         # Keep a pre-computed energy value of the features that we initialize the Mixer with;

diff --git a/lhotse/features/opensmile.py b/lhotse/features/opensmile.py
@@ -5,7 +5,7 @@
 import numpy as np
 
 from lhotse.features.base import FeatureExtractor, register_extractor
-from lhotse.utils import Seconds, compute_num_frames, is_module_available
+from lhotse.utils import EPSILON, Seconds, compute_num_frames, is_module_available
 
 
 @dataclass
@@ -121,6 +121,10 @@ def frame_shift(self) -> Seconds:
                 f"frame_shift is not defined for Functionals feature level or for non default feature set. Defined featureset: {self.config.feature_set}"
             )
 
+    @property
+    def padding_value(self) -> float:
+        return EPSILON
+
     def feature_dim(self, sampling_rate: int) -> int:
         return len(self.feature_names)
 

diff --git a/lhotse/features/spectrogram.py b/lhotse/features/spectrogram.py
@@ -53,6 +53,10 @@ def feature_dim(self, sampling_rate: int) -> int:
             else window_size
         )
 
+    @property
+    def padding_value(self) -> float:
+        return EPSILON
+
     @staticmethod
     def mix(
         features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float

diff --git a/lhotse/features/ssl.py b/lhotse/features/ssl.py
@@ -68,6 +68,11 @@ def __init__(self, config: Optional[Any] = None):
     def frame_shift(self) -> Seconds:
         return self.config.frame_shift
 
+    @property
+    def padding_value(self) -> float:
+        """Return the value that should be used to pad these features."""
+        return EPSILON
+
     @property
     def sampling_rate(self) -> int:
         return self.config.sampling_rate