Source code for aeneas.mfcc

#!/usr/bin/env python
# coding=utf-8

# aeneas is a Python/C library and a set of tools
# to automagically synchronize audio and text (aka forced alignment)
#
# Copyright (C) 2012-2013, Alberto Pettarin (www.albertopettarin.it)
# Copyright (C) 2013-2015, ReadBeyond Srl   (www.readbeyond.it)
# Copyright (C) 2015-2017, Alberto Pettarin (www.albertopettarin.it)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
This module contains the following classes:

* :class:`~aeneas.mfcc.MFCC`, computing Mel-frequency cepstral coefficients (MFCCs).

This file is a modified version of the ``mfcc.py`` file
by David Huggins-Daines from the CMU Sphinx-III project.
You can find the original file in the ``thirdparty/`` directory.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy

from aeneas.logger import Loggable
from aeneas.runtimeconfiguration import RuntimeConfiguration


[docs]class MFCC(Loggable): """ A class for computing Mel-frequency cepstral coefficients (MFCCs). :param rconf: a runtime configuration :type rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration` :param logger: the logger object :type logger: :class:`~aeneas.logger.Logger` """ CUTOFF = 0.00001 """ Cut-off threshold """ MEL_10 = 2595.0 """ Base Mel frequency """ TAG = u"MFCC" def __init__(self, rconf=None, logger=None): super(MFCC, self).__init__(rconf=rconf, logger=logger) # store parameters in local attributes self.filter_bank_size = self.rconf[RuntimeConfiguration.MFCC_FILTERS] self.mfcc_size = self.rconf[RuntimeConfiguration.MFCC_SIZE] self.fft_order = self.rconf[RuntimeConfiguration.MFCC_FFT_ORDER] self.lower_frequency = self.rconf[RuntimeConfiguration.MFCC_LOWER_FREQUENCY] self.upper_frequency = self.rconf[RuntimeConfiguration.MFCC_UPPER_FREQUENCY] self.emphasis_factor = self.rconf[RuntimeConfiguration.MFCC_EMPHASIS_FACTOR] self.window_length = self.rconf[RuntimeConfiguration.MFCC_WINDOW_LENGTH] self.window_shift = self.rconf[RuntimeConfiguration.MFCC_WINDOW_SHIFT] # initialize DCT matrix self._create_dct_matrix() # initialized later by compute_from_data() self.data = None self.sample_rate = None self.filters = None self.hamming_window = None @classmethod def _hz2mel(cls, frequency): """ Convert the given frequency in Hz to the Mel scale. :param float frequency: the Hz frequency to convert :rtype: float """ return cls.MEL_10 * math.log10(1.0 + (frequency / 700.0)) @classmethod def _mel2hz(cls, mel): """ Convert the given Mel value to Hz frequency. :param float mel: the Mel value to convert :rtype: float """ return 700.0 * (10 ** (mel / cls.MEL_10) - 1) def _create_dct_matrix(self): """ Create the not-quite-DCT matrix as used by Sphinx, and store it in ```self.s2dct```. """ self.s2dct = numpy.zeros((self.mfcc_size, self.filter_bank_size)) for i in range(0, self.mfcc_size): freq = numpy.pi * float(i) / self.filter_bank_size self.s2dct[i] = numpy.cos(freq * numpy.arange(0.5, 0.5 + self.filter_bank_size, 1.0, 'float64')) self.s2dct[:, 0] *= 0.5 self.s2dct = self.s2dct.transpose() def _create_mel_filter_bank(self): """ Create the Mel filter bank, and store it in ``self.filters``. Note that it is a function of the audio sample rate, so it cannot be created in the class initializer, but only later in :func:`aeneas.mfcc.MFCC.compute_from_data`. """ self.filters = numpy.zeros((1 + (self.fft_order // 2), self.filter_bank_size), 'd') dfreq = float(self.sample_rate) / self.fft_order nyquist_frequency = self.sample_rate / 2 if self.upper_frequency > nyquist_frequency: self.log_exc(u"Upper frequency %f exceeds Nyquist frequency %f" % (self.upper_frequency, nyquist_frequency), None, True, ValueError) melmax = MFCC._hz2mel(self.upper_frequency) melmin = MFCC._hz2mel(self.lower_frequency) dmelbw = (melmax - melmin) / (self.filter_bank_size + 1) filt_edge = MFCC._mel2hz(melmin + dmelbw * numpy.arange(self.filter_bank_size + 2, dtype='d')) # TODO can this code be written more numpy-style? # (the performance loss is negligible, it is just ugly to see) for whichfilt in range(0, self.filter_bank_size): # int() casts to native int instead of working with numpy.float64 leftfr = int(round(filt_edge[whichfilt] / dfreq)) centerfr = int(round(filt_edge[whichfilt + 1] / dfreq)) rightfr = int(round(filt_edge[whichfilt + 2] / dfreq)) fwidth = (rightfr - leftfr) * dfreq height = 2.0 / fwidth if centerfr != leftfr: leftslope = height / (centerfr - leftfr) else: leftslope = 0 freq = leftfr + 1 while freq < centerfr: self.filters[freq, whichfilt] = (freq - leftfr) * leftslope freq = freq + 1 # the next if should always be true! if freq == centerfr: self.filters[freq, whichfilt] = height freq = freq + 1 if centerfr != rightfr: rightslope = height / (centerfr - rightfr) while freq < rightfr: self.filters[freq, whichfilt] = (freq - rightfr) * rightslope freq = freq + 1 def _pre_emphasis(self): """ Pre-emphasize the entire signal at once by self.emphasis_factor, overwriting ``self.data``. """ self.data = numpy.append(self.data[0], self.data[1:] - self.emphasis_factor * self.data[:-1])
[docs] def compute_from_data(self, data, sample_rate): """ Compute MFCCs for the given audio data. The audio data must be a 1D :class:`numpy.ndarray`, that is, it must represent a monoaural (single channel) array of ``float64`` values in ``[-1.0, 1.0]``. :param data: the audio data :type data: :class:`numpy.ndarray` (1D) :param int sample_rate: the sample rate of the audio data, in samples/s (Hz) :raises: ValueError: if the data is not a 1D :class:`numpy.ndarray` (i.e., not mono), or if the data is empty :raises: ValueError: if the upper frequency defined in the ``rconf`` is larger than the Nyquist frequenct (i.e., half of ``sample_rate``) """ def _process_frame(self, frame): """ Process each frame, returning the log(power()) of it. """ # apply Hamming window frame *= self.hamming_window # compute RFFT fft = numpy.fft.rfft(frame, self.fft_order) # equivalent to power = fft.real * fft.real + fft.imag * fft.imag power = numpy.square(numpy.absolute(fft)) # # return the log(power()) of the transformed vector # v1 # COMMENTED logspec = numpy.log(numpy.dot(power, self.filters).clip(self.CUTOFF, numpy.inf)) # COMMENTED return numpy.dot(logspec, self.s2dct) / self.filter_bank_size # v2 return numpy.log(numpy.dot(power, self.filters).clip(self.CUTOFF, numpy.inf)) if len(data.shape) != 1: self.log_exc(u"The audio data must be a 1D numpy array (mono).", None, True, ValueError) if len(data) < 1: self.log_exc(u"The audio data must not be empty.", None, True, ValueError) self.data = data self.sample_rate = sample_rate # number of samples in the audio data_length = len(self.data) # frame length in number of samples frame_length = int(self.window_length * self.sample_rate) # frame length must be at least equal to the FFT order frame_length_padded = max(frame_length, self.fft_order) # frame shift in number of samples frame_shift = int(self.window_shift * self.sample_rate) # number of MFCC vectors (one for each frame) # this number includes the last shift, # where the data will be padded with zeros # if the remaining samples are less than frame_length_padded number_of_frames = int((1.0 * data_length) / frame_shift) # create Hamming window self.hamming_window = numpy.hamming(frame_length_padded) # build Mel filter bank self._create_mel_filter_bank() # pre-emphasize the entire audio data self._pre_emphasis() # allocate the MFCCs matrix # v1 # COMMENTED mfcc = numpy.zeros((number_of_frames, self.mfcc_size), 'float64') # v2 mfcc = numpy.zeros((number_of_frames, self.filter_bank_size), 'float64') # compute MFCCs one frame at a time for frame_index in range(number_of_frames): # COMMENTED print("Computing frame %d / %d" % (frame_index, number_of_frames)) # get the start and end indices for this frame, # do not overrun the data length frame_start = frame_index * frame_shift frame_end = min(frame_start + frame_length_padded, data_length) # frame is zero-padded if the remaining samples # are less than its length frame = numpy.zeros(frame_length_padded) frame[0:(frame_end - frame_start)] = self.data[frame_start:frame_end] # process the frame mfcc[frame_index] = _process_frame(self, frame) # v1 # COMMENTED return mfcc # v2 # return the dot product with the DCT matrix return numpy.dot(mfcc, self.s2dct) / self.filter_bank_size