Skip to content

Commit

Permalink
correction for cmvnw function
Browse files Browse the repository at this point in the history
  • Loading branch information
astorfi committed Nov 26, 2017
1 parent 2862414 commit e9f7cb6
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 54 deletions.
63 changes: 28 additions & 35 deletions docs/source/epilogue/test.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,34 +10,33 @@ The test example can be seen in ``test/test_package.py`` as below:

.. code-block:: python
import scipy.io.wavfile as wav
import numpy as np
import speechpy
import os
file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
fs, signal = wav.read(file_name)
signal = signal[:,0]
# Example of stacking frames
frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
zero_padding=True)
# Example of extracting power spectrum
power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
print('power spectrum shape=', power_spectrum.shape)
############# Extract MFCC features #############
mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
mfcc_cmvn = speechpy.cmvnw(mfcc,win_size=301,variance_normalization=True)
mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc)
mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
print('mfcc feature cube shape=', mfcc_feature_cube.shape)
############# Extract logenergy features #############
logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy)
logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
print('logenergy features=', logenergy.shape)
# Example of stacking frames
signal = speechpy.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
zero_padding=True)
-----------
Test Local
-----------
Expand All @@ -47,39 +46,33 @@ The local test example can be found in ``test/test_package.py`` as follows:

.. code-block:: python
import scipy.io.wavfile as wav
import numpy as np
import os
import sys
lib_path = os.path.abspath(os.path.join('..'))
print(lib_path)
sys.path.append(lib_path)
import speechpy
import os
file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
fs, signal = wav.read(file_name)
signal = signal[:,0]
# Example of stacking frames
frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
zero_padding=True)
# Example of extracting power spectrum
power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
print('power spectrum shape=', power_spectrum.shape)
############# Extract MFCC features #############
mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
mfcc_cmvn = speechpy.cmvnw(mfcc,win_size=301,variance_normalization=True)
mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc)
mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
print('mfcc feature cube shape=', mfcc_feature_cube.shape)
############# Extract logenergy features #############
logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy)
logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
print('logenergy features=', logenergy.shape)
# Example of stacking frames
signal = speechpy.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
zero_padding=True)
To extract the features, the signal samples are first stacked into frames. The features are then computed for each frame in the stacked frames collection.
Expand Down
18 changes: 9 additions & 9 deletions speechpy/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,39 +77,39 @@ def stack_frames(sig, sampling_frequency, frame_length=0.020, frame_stride=0.020
return Extracted_Frames


def fft_spectrum(frames, fft_length=512):
def fft_spectrum(frames, fft_points=512):
"""This function computes the one-dimensional n-point discrete Fourier Transform (DFT) of a real-valued
array by means of an efficient algorithm called the Fast Fourier Transform (FFT). Please refer to
https://docs.scipy.org/doc/numpy/reference/generated/numpy.fft.rfft.html for further details.
:param frames: The frame array in which each row is a frame.
:param fft_length: The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
:param fft_points: The length of FFT. If fft_points is greater than frame_len, the frames will be zero-padded.
:param num_keep_coefficients: The number of coefficients that is kept.
:returns: If frames is an num_frames x sample_per_frame matrix, output will be num_frames x FFT_LENGTH.
"""
SPECTRUM_VECTOR = np.fft.rfft(frames, n=fft_length, axis=-1, norm=None)
SPECTRUM_VECTOR = np.fft.rfft(frames, n=fft_points, axis=-1, norm=None)
return np.absolute(SPECTRUM_VECTOR)


def power_spectrum(frames, fft_length=512):
def power_spectrum(frames, fft_points=512):
"""Power spectrum of each frame.
:param frames: The frame array in which each row is a frame.
:param fft_length: The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
:param fft_points: The length of FFT. If fft_points is greater than frame_len, the frames will be zero-padded.
:returns: If frames is an num_frames x sample_per_frame matrix, output will be num_frames x fft_length.
"""
return 1.0 / fft_length * np.square(fft_spectrum(frames, fft_length))
return 1.0 / fft_points * np.square(fft_spectrum(frames, fft_points))


def log_power_spectrum(frames, fft_length=512, normalize=True):
def log_power_spectrum(frames, fft_points=512, normalize=True):
"""Log power spectrum of each frame in frames.
:param frames: The frame array in which each row is a frame.
:param fft_length: The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
:param fft_points: The length of FFT. If fft_points is greater than frame_len, the frames will be zero-padded.
:param normalize: If True, the log power spectrum will be normalized.
:returns: If frames is an num_frames x sample_per_frame matrix, output will be num_frames x fft_length.
"""
power_spec = power_spectrum(frames, fft_length)
power_spec = power_spectrum(frames, fft_points)
power_spec[power_spec <= 1e-20] = 1e-20
log_power_spec = 10 * np.log10(power_spec)
if normalize:
Expand Down
3 changes: 2 additions & 1 deletion tests/test_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
zero_padding=True)

# Example of extracting power spectrum
frames = speechpy.processing.power_spectrum(frames, fft_length=512)
power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
print('power spectrum shape=', power_spectrum.shape)

############# Extract MFCC features #############
mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
Expand Down
22 changes: 13 additions & 9 deletions tests/test_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,28 @@
fs, signal = wav.read(file_name)
signal = signal[:,0]

# Example of stacking frames
frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
zero_padding=True)

# Example of extracting power spectrum
power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
print('power spectrum shape=', power_spectrum.shape)

############# Extract MFCC features #############
mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
mfcc_cmvn = speechpy.cmvnw(mfcc,win_size=301,variance_normalization=True)
mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)

mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc)
mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
print('mfcc feature cube shape=', mfcc_feature_cube.shape)

############# Extract logenergy features #############
logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy)
logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
print('logenergy features=', logenergy.shape)

# Example of stacking frames
signal = speechpy.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
zero_padding=True)



0 comments on commit e9f7cb6

Please sign in to comment.