correction for cmvnw function

astorfi · Nov 26, 2017 · e9f7cb6 · e9f7cb6
1 parent 2862414
commit e9f7cb6
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 54 deletions.
diff --git a/docs/source/epilogue/test.rst b/docs/source/epilogue/test.rst
@@ -10,34 +10,33 @@ The test example can be seen in ``test/test_package.py`` as below:
 
 .. code-block:: python
 
-    import scipy.io.wavfile as wav
-    import numpy as np
-    import speechpy
-    import os
-
     file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
     fs, signal = wav.read(file_name)
     signal = signal[:,0]
 
+    # Example of stacking frames
+    frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
+             zero_padding=True)
+
+    # Example of extracting power spectrum
+    power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
+    print('power spectrum shape=', power_spectrum.shape)
+
     ############# Extract MFCC features #############
-    mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
-               num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
-    mfcc_cmvn = speechpy.cmvnw(mfcc,win_size=301,variance_normalization=True)
+    mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
+                 num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
+    mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
     print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
 
-    mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc)
+    mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
     print('mfcc feature cube shape=', mfcc_feature_cube.shape)
 
     ############# Extract logenergy features #############
-    logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
-               num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
-    logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy)
+    logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
+                 num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
+    logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
     print('logenergy features=', logenergy.shape)
 
-    # Example of staching frames
-    signal = speechpy.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
-           zero_padding=True)
-
 -----------
 Test Local
 -----------
@@ -47,39 +46,33 @@ The local test example can be found in ``test/test_package.py`` as follows:
 
 .. code-block:: python
 
-    import scipy.io.wavfile as wav
-    import numpy as np
-    import os
-    import sys
-    lib_path = os.path.abspath(os.path.join('..'))
-    print(lib_path)
-    sys.path.append(lib_path)
-    import speechpy
-    import os
-
     file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
     fs, signal = wav.read(file_name)
     signal = signal[:,0]
 
+    # Example of stacking frames
+    frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
+             zero_padding=True)
+
+    # Example of extracting power spectrum
+    power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
+    print('power spectrum shape=', power_spectrum.shape)
+
     ############# Extract MFCC features #############
-    mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
+    mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
                  num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
-    mfcc_cmvn = speechpy.cmvnw(mfcc,win_size=301,variance_normalization=True)
+    mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
     print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
 
-    mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc)
+    mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
     print('mfcc feature cube shape=', mfcc_feature_cube.shape)
 
     ############# Extract logenergy features #############
-    logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
+    logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
                  num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
-    logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy)
+    logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
     print('logenergy features=', logenergy.shape)
 
-    # Example of staching frames
-    signal = speechpy.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
-             zero_padding=True)
-
 
 
 To extract the features, the signal samples are first stacked into frames. The features are then computed for each frame in the stacked frames collection.

diff --git a/speechpy/processing.py b/speechpy/processing.py
@@ -77,39 +77,39 @@ def stack_frames(sig, sampling_frequency, frame_length=0.020, frame_stride=0.020
     return Extracted_Frames
 
 
-def fft_spectrum(frames, fft_length=512):
+def fft_spectrum(frames, fft_points=512):
     """This function computes the one-dimensional n-point discrete Fourier Transform (DFT) of a real-valued
     array by means of an efficient algorithm called the Fast Fourier Transform (FFT). Please refer to
     https://docs.scipy.org/doc/numpy/reference/generated/numpy.fft.rfft.html for further details.
 
     :param frames: The frame array in which each row is a frame.
-    :param fft_length: The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
+    :param fft_points: The length of FFT. If fft_points is greater than frame_len, the frames will be zero-padded.
     :param num_keep_coefficients: The number of coefficients that is kept.
     :returns: If frames is an num_frames x sample_per_frame matrix, output will be num_frames x (fft_points // 2 + 1), because only the non-negative frequency terms of the real-input FFT are returned.
     """
-    SPECTRUM_VECTOR = np.fft.rfft(frames, n=fft_length, axis=-1, norm=None)
+    SPECTRUM_VECTOR = np.fft.rfft(frames, n=fft_points, axis=-1, norm=None)
     return np.absolute(SPECTRUM_VECTOR)
 
 
-def power_spectrum(frames, fft_length=512):
+def power_spectrum(frames, fft_points=512):
     """Power spectrum of each frame.
 
     :param frames: The frame array in which each row is a frame.
-    :param fft_length: The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
+    :param fft_points: The length of FFT. If fft_points is greater than frame_len, the frames will be zero-padded.
     :returns: If frames is an num_frames x sample_per_frame matrix, output will be num_frames x (fft_points // 2 + 1).
     """
-    return 1.0 / fft_length * np.square(fft_spectrum(frames, fft_length))
+    return 1.0 / fft_points * np.square(fft_spectrum(frames, fft_points))
 
 
-def log_power_spectrum(frames, fft_length=512, normalize=True):
+def log_power_spectrum(frames, fft_points=512, normalize=True):
     """Log power spectrum of each frame in frames.
 
     :param frames: The frame array in which each row is a frame.
-    :param fft_length: The length of FFT. If fft_length is greater than frame_len, the frames will be zero-padded.
+    :param fft_points: The length of FFT. If fft_points is greater than frame_len, the frames will be zero-padded.
     :param norm: If norm=1, the log power spectrum will be normalized.
     :returns: If frames is an num_frames x sample_per_frame matrix, output will be num_frames x fft_length.
     """
-    power_spec = power_spectrum(frames, fft_length)
+    power_spec = power_spectrum(frames, fft_points)
     power_spec[power_spec <= 1e-20] = 1e-20
     log_power_spec = 10 * np.log10(power_spec)
     if normalize:

diff --git a/tests/test_local.py b/tests/test_local.py
@@ -17,7 +17,8 @@
          zero_padding=True)
 
 # Example of extracting power spectrum
-frames = speechpy.processing.power_spectrum(frames, fft_length=512)
+power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
+print('power spectrum shape=', power_spectrum.shape)
 
 ############# Extract MFCC features #############
 mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,

diff --git a/tests/test_package.py b/tests/test_package.py
@@ -7,24 +7,28 @@
 fs, signal = wav.read(file_name)
 signal = signal[:,0]
 
+# Example of stacking frames
+frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
+         zero_padding=True)
+
+# Example of extracting power spectrum
+power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
+print('power spectrum shape=', power_spectrum.shape)
+
 ############# Extract MFCC features #############
-mfcc = speechpy.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
+mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
              num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
-mfcc_cmvn = speechpy.cmvnw(mfcc,win_size=301,variance_normalization=True)
+mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
 print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
 
-mfcc_feature_cube = speechpy.extract_derivative_feature(mfcc)
+mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
 print('mfcc feature cube shape=', mfcc_feature_cube.shape)
 
 ############# Extract logenergy features #############
-logenergy = speechpy.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
+logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
              num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
-logenergy_feature_cube = speechpy.extract_derivative_feature(logenergy)
+logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
 print('logenergy features=', logenergy.shape)
 
-# Example of staching frames
-signal = speechpy.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, Filter=lambda x: np.ones((x,)),
-         zero_padding=True)
-