Source code for aeneas.vad

#!/usr/bin/env python
# coding=utf-8

# aeneas is a Python/C library and a set of tools
# to automagically synchronize audio and text (aka forced alignment)
# Copyright (C) 2012-2013, Alberto Pettarin (www.albertopettarin.it)
# Copyright (C) 2013-2015, ReadBeyond Srl   (www.readbeyond.it)
# Copyright (C) 2015-2017, Alberto Pettarin (www.albertopettarin.it)
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
This module contains the following classes:

* :class:`~aeneas.vad.VAD`,
  a simple voice activity detector
  based on the energy of the 0-th MFCC.

Given an energy vector representing an audio file,
it will return a boolean mask
with elements set to ``True`` where speech is,
and ``False`` where nonspeech occurs.

.. versionadded:: 1.0.4
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy

from aeneas.logger import Loggable
from aeneas.runtimeconfiguration import RuntimeConfiguration

class VAD(Loggable):
    """
    The voice activity detector (VAD).

    :param rconf: a runtime configuration
    :type rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type logger: :class:`~aeneas.logger.Logger`
    """

    TAG = u"VAD"

    def run_vad(
        self,
        wave_energy,
        log_energy_threshold=None,
        min_nonspeech_length=None,
        extend_before=None,
        extend_after=None
    ):
        """
        Compute the time intervals containing speech and nonspeech,
        and return a boolean mask with speech frames set to ``True``,
        and nonspeech frames set to ``False``.

        The last four parameters might be ``None``:
        in this case, the corresponding RuntimeConfiguration values
        are applied.

        A frame is a speech candidate when its energy is at least
        ``min(wave_energy) + log_energy_threshold``.
        Only runs of at least ``min_nonspeech_length`` consecutive
        non-candidate frames are labelled as nonspeech.

        Edge cases:

        * an empty ``wave_energy`` yields an empty mask;
        * a ``min_nonspeech_length`` smaller than one frame
          is treated as one frame;
        * if ``wave_energy`` is shorter than ``min_nonspeech_length``,
          no nonspeech interval can exist and all frames are speech.

        :param wave_energy: the energy vector of the audio file (0-th MFCC)
        :type wave_energy: :class:`numpy.ndarray` (1D)
        :param float log_energy_threshold: the minimum log energy threshold to consider a frame as speech
        :param int min_nonspeech_length: the minimum length, in frames, of a nonspeech interval
        :param int extend_before: extend each speech interval by this number of frames to the left (before)
        :param int extend_after: extend each speech interval by this number of frames to the right (after)
        :rtype: :class:`numpy.ndarray` (1D)
        """
        self.log(u"Computing VAD for wave")
        mfcc_window_shift = self.rconf.mws
        self.log([u"MFCC window shift (s): %.3f", mfcc_window_shift])

        # parameters left to None fall back to the runtime configuration;
        # time values given in seconds are converted to a number of frames
        if log_energy_threshold is None:
            log_energy_threshold = self.rconf[RuntimeConfiguration.VAD_LOG_ENERGY_THRESHOLD]
            self.log([u"Log energy threshold: %.3f", log_energy_threshold])
        if min_nonspeech_length is None:
            min_nonspeech_length = int(self.rconf[RuntimeConfiguration.VAD_MIN_NONSPEECH_LENGTH] / mfcc_window_shift)
            self.log([u"Min nonspeech length (s): %.3f", self.rconf[RuntimeConfiguration.VAD_MIN_NONSPEECH_LENGTH]])
        if extend_before is None:
            extend_before = int(self.rconf[RuntimeConfiguration.VAD_EXTEND_SPEECH_INTERVAL_BEFORE] / mfcc_window_shift)
            self.log([u"Extend speech before (s): %.3f", self.rconf[RuntimeConfiguration.VAD_EXTEND_SPEECH_INTERVAL_BEFORE]])
        if extend_after is None:
            extend_after = int(self.rconf[RuntimeConfiguration.VAD_EXTEND_SPEECH_INTERVAL_AFTER] / mfcc_window_shift)
            self.log([u"Extend speech after (s): %.3f", self.rconf[RuntimeConfiguration.VAD_EXTEND_SPEECH_INTERVAL_AFTER]])

        # a window of zero (or negative) width would match everywhere,
        # labelling the whole audio as nonspeech: use at least one frame
        min_nonspeech_length = max(1, min_nonspeech_length)

        wave_energy = numpy.asarray(wave_energy)
        energy_length = len(wave_energy)
        if energy_length == 0:
            # numpy.min() raises on an empty array: nothing to label
            self.log(u"Empty energy vector, returning empty mask")
            return numpy.ones(0, dtype="bool")

        energy_threshold = numpy.min(wave_energy) + log_energy_threshold
        self.log([u"Min nonspeech length (frames): %d", min_nonspeech_length])
        self.log([u"Extend speech before (frames): %d", extend_before])
        self.log([u"Extend speech after (frames): %d", extend_after])
        self.log([u"Energy vector length (frames): %d", energy_length])
        self.log([u"Energy threshold (log): %.3f", energy_threshold])

        # using windows to be sure we have at least
        # min_nonspeech_length consecutive frames with nonspeech
        self.log(u"Determining initial labels...")
        mask = wave_energy >= energy_threshold
        windows = self._rolling_window(mask, min_nonspeech_length)
        # a window summing to zero contains no speech candidate at all;
        # consecutive indices of such windows are grouped into runs
        nonspeech_runs = self._compute_runs((numpy.where(numpy.sum(windows, axis=1) == 0))[0])
        self.log(u"Determining initial labels... done")

        # initially, everything is marked as speech
        # we remove the nonspeech intervals as needed,
        # possibly extending the adjacent speech interval
        # if requested by the user
        self.log(u"Determining final labels...")
        mask = numpy.ones(energy_length, dtype="bool")
        for ns in nonspeech_runs:
            start = ns[0]
            if (extend_after > 0) and (start > 0):
                # speech precedes this run: let it last a bit longer
                start += extend_after
            # the last window of the run starts at ns[-1],
            # hence the run ends (exclusive) at ns[-1] + window width
            stop = ns[-1] + min_nonspeech_length
            # NOTE(review): stop is an exclusive bound, so this test skips
            # the case of exactly one trailing speech frame; kept as is
            if (extend_before > 0) and (stop < energy_length - 1):
                # speech follows this run: let it begin a bit earlier
                stop -= extend_before
            # the extensions might consume the whole run; in particular
            # stop might become negative, and slicing with it would wrap
            # around and wrongly clear frames counted from the end
            if stop > start:
                mask[start:stop] = False
        self.log(u"Determining final labels... done")
        return mask

    @classmethod
    def _compute_runs(cls, array):
        """
        Compute runs as a list of arrays,
        each containing the indices of a contiguous run.

        The input is split wherever two adjacent values
        do not differ by exactly one.
        An empty input yields an empty list.

        :param array: the data array
        :type array: numpy 1D array
        :rtype: list of numpy 1D arrays
        """
        if len(array) < 1:
            return []
        return numpy.split(array, numpy.where(numpy.diff(array) != 1)[0] + 1)

    @classmethod
    def _rolling_window(cls, array, size):
        """
        Compute rolling windows of width ``size`` of the given array.

        Return a numpy 2D stride array,
        where rows are the windows, each of ``size`` elements.
        Consecutive windows are shifted by one element,
        and they share memory with ``array``.

        If ``size`` is greater than the length of ``array``,
        no window fits and an empty ``(0, size)`` array is returned.

        :param array: the data array
        :type array: numpy 1D array (n)
        :param int size: the width of each window
        :rtype: numpy 2D stride array (n - size + 1, size)
        """
        num_windows = array.shape[-1] - size + 1
        if num_windows < 1:
            # as_strided() would raise on a negative dimension
            return numpy.zeros(array.shape[:-1] + (0, size), dtype=array.dtype)
        shape = array.shape[:-1] + (num_windows, size)
        strides = array.strides + (array.strides[-1],)
        return numpy.lib.stride_tricks.as_strided(array, shape=shape, strides=strides)