Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add padding_value attribute to features #1020

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
5 changes: 3 additions & 2 deletions lhotse/cut/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@ def pad(
duration: Seconds = None,
num_frames: int = None,
num_samples: int = None,
pad_feat_value: float = LOG_EPSILON,
pad_feat_value: Optional[float] = None,
direction: str = "right",
preserve_id: bool = False,
pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
Expand All @@ -725,7 +725,8 @@ def pad(
:param num_frames: The cut's total number of frames after padding.
:param num_samples: The cut's total number of samples after padding.
:param pad_feat_value: A float value that's used for padding the features.
By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
By default, we will use the value defined in the `FeatureExtractor.padding_value`
for the feature type.
:param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
the cut.
:param preserve_id: When ``True``, preserves the cut ID before padding.
Expand Down
6 changes: 4 additions & 2 deletions lhotse/cut/mixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ def pad(
duration: Seconds = None,
num_frames: int = None,
num_samples: int = None,
pad_feat_value: float = LOG_EPSILON,
pad_feat_value: Optional[float] = None,
direction: str = "right",
preserve_id: bool = False,
pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
Expand All @@ -549,7 +549,8 @@ def pad(
:param num_frames: The cut's total number of frames after padding.
:param num_samples: The cut's total number of samples after padding.
:param pad_feat_value: A float value that's used for padding the features.
By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
By default, we will use the value defined in the `FeatureExtractor.padding_value`
for the feature type.
:param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
the cut.
:param preserve_id: When ``True``, preserves the cut ID from before padding.
Expand Down Expand Up @@ -870,6 +871,7 @@ def load_features(self, mixed: bool = True) -> Optional[np.ndarray]:
feats = reference_feats # manual caching to avoid duplicated I/O
else:
feats = track.cut.load_features()

mixer.add_to_mix(
feats=feats,
snr=track.snr,
Expand Down
8 changes: 5 additions & 3 deletions lhotse/cut/padding.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class PaddingCut(Cut):
id: str
duration: Seconds
sampling_rate: int
feat_value: float
feat_value: Optional[float] = None

# For frequency domain
num_frames: Optional[int] = None
Expand Down Expand Up @@ -187,7 +187,7 @@ def pad(
duration: Seconds = None,
num_frames: int = None,
num_samples: int = None,
pad_feat_value: float = LOG_EPSILON,
pad_feat_value: Optional[float] = None,
direction: str = "right",
preserve_id: bool = False,
pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
Expand All @@ -202,7 +202,8 @@ def pad(
:param num_frames: The cut's total number of frames after padding.
:param num_samples: The cut's total number of samples after padding.
:param pad_feat_value: A float value that's used for padding the features.
By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
By default, we will use the value defined in the `FeatureExtractor.padding_value`
for the feature type.
:param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
the cut.
:param preserve_id: When ``True``, preserves the cut ID from before padding.
Expand Down Expand Up @@ -388,6 +389,7 @@ def compute_and_store_features(
"""
return fastcopy(
self,
feat_value=extractor.padding_value,
num_features=extractor.feature_dim(self.sampling_rate),
num_frames=compute_num_frames(
duration=self.duration,
Expand Down
26 changes: 21 additions & 5 deletions lhotse/cut/set.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,11 @@
from lhotse.cut.multi import MultiCut
from lhotse.cut.padding import PaddingCut
from lhotse.features import FeatureExtractor, Features, FeatureSet
from lhotse.features.base import StatsAccumulator, compute_global_stats
from lhotse.features.base import (
StatsAccumulator,
compute_global_stats,
create_default_feature_extractor,
)
from lhotse.features.io import FeaturesWriter, LilcomChunkyWriter
from lhotse.lazy import AlgorithmMixin
from lhotse.serialization import Serializable
Expand Down Expand Up @@ -1365,7 +1369,7 @@ def pad(
duration: Seconds = None,
num_frames: int = None,
num_samples: int = None,
pad_feat_value: float = LOG_EPSILON,
pad_feat_value: Optional[float] = None,
direction: str = "right",
preserve_id: bool = False,
pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
Expand All @@ -1384,7 +1388,8 @@ def pad(
:param num_frames: The cut's total number of frames after padding.
:param num_samples: The cut's total number of samples after padding.
:param pad_feat_value: A float value that's used for padding the features.
By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
By default, we will use the value defined in the `FeatureExtractor.padding_value`
for the feature type.
:param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added
before or after the cut.
:param preserve_id: When ``True``, preserves the cut ID from before padding.
Expand Down Expand Up @@ -2659,7 +2664,7 @@ def pad(
duration: Seconds = None,
num_frames: int = None,
num_samples: int = None,
pad_feat_value: float = LOG_EPSILON,
pad_feat_value: Optional[float] = None,
direction: str = "right",
preserve_id: bool = False,
pad_value_dict: Optional[Dict[str, Union[int, float]]] = None,
Expand All @@ -2675,7 +2680,8 @@ def pad(
:param num_frames: The cut's total number of frames after padding.
:param num_samples: The cut's total number of samples after padding.
:param pad_feat_value: A float value that's used for padding the features.
By default we assume a log-energy floor of approx. -23 (1e-10 after exp).
By default, we will use the value defined in the `FeatureExtractor.padding_value`
for the feature type.
:param direction: string, 'left', 'right' or 'both'. Determines whether the padding is added before or after
the cut.
:param preserve_id: When ``True``, preserves the cut ID before padding.
Expand Down Expand Up @@ -2763,6 +2769,16 @@ def pad(
else None
)

# If the user has not specified a feature value for padding, we will use the default value
# from the feature extractor.
if pad_feat_value is None and cut.has_features:
if isinstance(cut, PaddingCut):
pad_feat_value = cut.feat_value
else:
pad_feat_value = create_default_feature_extractor(
cut.features_type
).padding_value

padding_cut = PaddingCut(
id=str(uuid4()),
duration=round(duration - cut.duration, ndigits=8),
Expand Down
7 changes: 7 additions & 0 deletions lhotse/features/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class FeatureExtractor(metaclass=ABCMeta):
* ``compute_energy``, and
* ``mix``.

They should also implement the ``padding_value`` property, which will be used for padding features
during mixing. See :class:`~lhotse.features.FeatureMixer` for more details.

By itself, the ``FeatureExtractor`` offers the following high-level methods
that are not intended for overriding:

Expand Down Expand Up @@ -90,6 +93,10 @@ def frame_shift(self) -> Seconds:
def feature_dim(self, sampling_rate: int) -> int:
...

@property
def padding_value(self) -> Optional[float]:
# Default value used to pad features of this type (e.g. during mixing).
# The base implementation declares no default (None); concrete extractors
# override it. FeatureMixer raises ValueError when neither an explicit
# padding value nor an extractor default is available.
return None

@property
def device(self) -> Union[str, torch.device]:
return "cpu"
Expand Down
4 changes: 4 additions & 0 deletions lhotse/features/fbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ def _feature_fn(self, *args, **kwargs):
def feature_dim(self, sampling_rate: int) -> int:
return self.config.num_mel_bins

@property
def padding_value(self) -> float:
# Value used to pad these features. -1000.0 mirrors the constant that
# FeatureMixer previously hard-coded as its default for log-space features.
return -1000.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to choose -1000.0 instead of LOG_EPSILON or just 0?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check the discussion in the main thread.


@staticmethod
def mix(
features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float
Expand Down
15 changes: 15 additions & 0 deletions lhotse/features/kaldi/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ def to(self, device: str):
self.config.device = device
self.extractor.to(device)

@property
def padding_value(self) -> float:
"""Return the value that should be used to pad these features."""
# -1000.0 mirrors the constant that FeatureMixer previously hard-coded
# as its default padding for log-space features.
return -1000.0

def feature_dim(self, sampling_rate: int) -> int:
return self.config.num_filters

Expand Down Expand Up @@ -205,6 +210,11 @@ def device(self) -> Union[str, torch.device]:
def frame_shift(self) -> Seconds:
return self.config.frame_shift

@property
def padding_value(self) -> float:
"""Return the value that should be used to pad these features."""
# -1000.0 mirrors the constant that FeatureMixer previously hard-coded
# as its default padding for log-space features.
return -1000.0

def feature_dim(self, sampling_rate: int) -> int:
return self.config.num_ceps

Expand Down Expand Up @@ -301,6 +311,11 @@ def device(self) -> Union[str, torch.device]:
def frame_shift(self) -> Seconds:
return self.config.frame_shift

@property
def padding_value(self) -> float:
"""Return the value that should be used to pad these features."""
# NOTE(review): EPSILON is presumably the small positive floor from
# lhotse.utils, i.e. padding meant for non-log features -- confirm that
# this matches the feature type of the enclosing extractor.
return EPSILON

def feature_dim(self, sampling_rate: int) -> int:
return self.config.num_ceps

Expand Down
8 changes: 8 additions & 0 deletions lhotse/features/kaldifeat.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,10 @@ def __init__(self, config: Optional[KaldifeatFbankConfig] = None) -> None:
def feature_dim(self, sampling_rate: int) -> int:
return self.config.mel_opts.num_bins

@property
def padding_value(self) -> float:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@csukuangfj could you check if this looks correct?

return -1000.0 if self.config.use_log_fbank else EPSILON
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should -1000 be replaced with the following?

LOG_EPSILON = math.log(EPSILON)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, ideally it should, but I was basing this on the default value used in the FeatureMixer.


@staticmethod
def mix(
features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float
Expand Down Expand Up @@ -259,3 +263,7 @@ def __init__(self, config: Optional[KaldifeatMfccConfig] = None) -> None:

def feature_dim(self, sampling_rate: int) -> int:
return self.config.num_ceps

@property
def padding_value(self) -> float:
# Value used to pad these features. -1000.0 mirrors the constant that
# FeatureMixer previously hard-coded as its default padding value.
return -1000.0
8 changes: 6 additions & 2 deletions lhotse/features/librosa_fbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def pad_or_truncate_features(
feats: np.ndarray,
expected_num_frames: int,
abs_tol: int = 1,
pad_value: float = LOG_EPSILON,
pad_value: float = -1000,
):
frames_diff = feats.shape[0] - expected_num_frames

Expand All @@ -53,7 +53,7 @@ def pad_or_truncate_features(
feats,
((0, -frames_diff), (0, 0)),
mode="constant",
constant_values=LOG_EPSILON,
# Honor the caller-supplied ``pad_value`` (which defaults to the same
# constant) instead of hard-coding it, so the parameter is not ignored.
constant_values=pad_value,
)
elif abs(frames_diff) > abs_tol:
raise ValueError(
Expand Down Expand Up @@ -148,6 +148,10 @@ class LibrosaFbank(FeatureExtractor):
def frame_shift(self) -> Seconds:
return self.config.hop_size / self.config.sampling_rate

@property
def padding_value(self) -> float:
# Value used to pad these features. -1000.0 is the same constant that
# pad_or_truncate_features in this module uses as its padding default.
return -1000.0

def feature_dim(self, sampling_rate: int) -> int:
return self.config.num_mel_bins

Expand Down
4 changes: 4 additions & 0 deletions lhotse/features/mfcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,7 @@ def _feature_fn(self, *args, **kwargs):

def feature_dim(self, sampling_rate: int) -> int:
return self.config.num_ceps

@property
def padding_value(self) -> float:
# Value used to pad these features. -1000.0 mirrors the constant that
# FeatureMixer previously hard-coded as its default padding value.
return -1000.0
12 changes: 8 additions & 4 deletions lhotse/features/mixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def __init__(
feature_extractor: FeatureExtractor,
base_feats: np.ndarray,
frame_shift: Seconds,
padding_value: float = -1000.0,
padding_value: Optional[float] = None,
reference_energy: Optional[float] = None,
):
"""
Expand All @@ -40,8 +40,7 @@ def __init__(
in terms of energy and offset for all features mixed into them.
:param frame_shift: Required to correctly compute offset and padding during the mix.
:param padding_value: The value used to pad the shorter features during the mix.
This value is adequate only for log space features. For non-log space features,
e.g. energies, use either 0 or a small positive value like 1e-5.
If not provided, we will default to ``feature_extractor.padding_value``.
:param reference_energy: Optionally pass a reference energy value to compute SNRs against.
This might be required when ``base_feats`` correspond to padding energies.
"""
Expand All @@ -50,7 +49,12 @@ def __init__(
self.num_channels = 1 if base_feats.ndim == 2 else base_feats.shape[-1]
self.gains = []
self.frame_shift = frame_shift
self.padding_value = padding_value
# Fall back to the extractor's default only when the caller passed nothing.
# An explicit ``is None`` check is required here: with ``or``, a legitimate
# padding value of 0 / 0.0 (e.g. for non-log features) is falsy and would be
# silently replaced by the extractor's default.
self.padding_value = (
    padding_value
    if padding_value is not None
    else feature_extractor.padding_value
)
if self.padding_value is None:
    # Neither the caller nor the extractor provided a value to pad with.
    raise ValueError(
        f"FeatureMixer requires `padding_value` to be set, since {feature_extractor}"
        f" does not define a default one."
    )
self.dtype = self.tracks[0].dtype

# Keep a pre-computed energy value of the features that we initialize the Mixer with;
Expand Down
6 changes: 5 additions & 1 deletion lhotse/features/opensmile.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np

from lhotse.features.base import FeatureExtractor, register_extractor
from lhotse.utils import Seconds, compute_num_frames, is_module_available
from lhotse.utils import EPSILON, Seconds, compute_num_frames, is_module_available


@dataclass
Expand Down Expand Up @@ -121,6 +121,10 @@ def frame_shift(self) -> Seconds:
f"frame_shift is not defined for Functionals feature level or for non default feature set. Defined featureset: {self.config.feature_set}"
)

@property
def padding_value(self) -> float:
# Value used to pad these features. EPSILON is imported from lhotse.utils;
# presumably a small positive floor suited to non-log features -- TODO confirm.
return EPSILON

def feature_dim(self, sampling_rate: int) -> int:
return len(self.feature_names)

Expand Down
4 changes: 4 additions & 0 deletions lhotse/features/spectrogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ def feature_dim(self, sampling_rate: int) -> int:
else window_size
)

@property
def padding_value(self) -> float:
# Value used to pad these features.
# NOTE(review): EPSILON is presumably the small positive floor from
# lhotse.utils, chosen because these features are not in log space -- confirm.
return EPSILON

@staticmethod
def mix(
features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float
Expand Down
5 changes: 5 additions & 0 deletions lhotse/features/ssl.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ def __init__(self, config: Optional[Any] = None):
def frame_shift(self) -> Seconds:
return self.config.frame_shift

@property
def padding_value(self) -> float:
"""Return the value that should be used to pad these features."""
# NOTE(review): EPSILON is presumably the small positive floor from
# lhotse.utils -- confirm it is a sensible pad value for these features.
return EPSILON

@property
def sampling_rate(self) -> int:
return self.config.sampling_rate
Expand Down