# Source code for aeneas.ttswrappers.espeakttswrapper

#!/usr/bin/env python
# coding=utf-8

# aeneas is a Python/C library and a set of tools
# to automagically synchronize audio and text (aka forced alignment)
#
# Copyright (C) 2012-2013, Alberto Pettarin (www.albertopettarin.it)
# Copyright (C) 2013-2015, ReadBeyond Srl   (www.readbeyond.it)
# Copyright (C) 2015-2017, Alberto Pettarin (www.albertopettarin.it)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
This module contains the following classes:

* :class:`~aeneas.ttswrappers.espeakttswrapper.ESPEAKTTSWrapper`,
  a wrapper for the ``eSpeak`` TTS engine.

Please refer to
http://espeak.sourceforge.net/
for further details.
"""

from __future__ import absolute_import
from __future__ import print_function

from aeneas.exacttiming import TimeValue
from aeneas.language import Language
from aeneas.runtimeconfiguration import RuntimeConfiguration
from aeneas.ttswrappers.basettswrapper import BaseTTSWrapper
import aeneas.globalfunctions as gf


class ESPEAKTTSWrapper(BaseTTSWrapper):
    """
    A wrapper for the ``eSpeak`` TTS engine.

    This wrapper is the default TTS engine for ``aeneas``.

    This wrapper supports calling the TTS engine
    via ``subprocess`` or via Python C extension.

    In abstract terms, it performs one or more calls like ::

        $ espeak -v voice_code -w /tmp/output_file.wav < text

    To use this TTS engine, specify ::

        "tts=espeak"

    in the ``RuntimeConfiguration`` object.
    (You can omit this, since eSpeak is the default TTS engine.)
    To execute from a non-default location: ::

        "tts=espeak|tts_path=/path/to/espeak"

    To run the ``cew`` Python C extension
    in a separate process via
    :class:`~aeneas.cewsubprocess.CEWSubprocess`, use ::

        "cew_subprocess_enabled=True|cew_subprocess_path=/path/to/python"

    in the ``rconf`` object.

    See :class:`~aeneas.ttswrappers.basettswrapper.BaseTTSWrapper`
    for the available functions.
    Below are listed the languages supported by this wrapper.

    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    """

    AFR = Language.AFR
    """ Afrikaans """

    ARG = Language.ARG
    """ Aragonese (not tested) """

    BOS = Language.BOS
    """ Bosnian (not tested) """

    BUL = Language.BUL
    """ Bulgarian """

    CAT = Language.CAT
    """ Catalan """

    CES = Language.CES
    """ Czech """

    CMN = Language.CMN
    """ Mandarin Chinese (not tested) """

    CYM = Language.CYM
    """ Welsh """

    DAN = Language.DAN
    """ Danish """

    DEU = Language.DEU
    """ German """

    ELL = Language.ELL
    """ Greek (Modern) """

    ENG = Language.ENG
    """ English """

    EPO = Language.EPO
    """ Esperanto (not tested) """

    EST = Language.EST
    """ Estonian """

    FAS = Language.FAS
    """ Persian """

    FIN = Language.FIN
    """ Finnish """

    FRA = Language.FRA
    """ French """

    GLE = Language.GLE
    """ Irish """

    GRC = Language.GRC
    """ Greek (Ancient) """

    HIN = Language.HIN
    """ Hindi (not tested) """

    HRV = Language.HRV
    """ Croatian """

    HUN = Language.HUN
    """ Hungarian """

    HYE = Language.HYE
    """ Armenian (not tested) """

    IND = Language.IND
    """ Indonesian (not tested) """

    ISL = Language.ISL
    """ Icelandic """

    ITA = Language.ITA
    """ Italian """

    JBO = Language.JBO
    """ Lojban (not tested) """

    KAN = Language.KAN
    """ Kannada (not tested) """

    KAT = Language.KAT
    """ Georgian (not tested) """

    KUR = Language.KUR
    """ Kurdish (not tested) """

    LAT = Language.LAT
    """ Latin """

    LAV = Language.LAV
    """ Latvian """

    LFN = Language.LFN
    """ Lingua Franca Nova (not tested) """

    LIT = Language.LIT
    """ Lithuanian """

    MAL = Language.MAL
    """ Malayalam (not tested) """

    MKD = Language.MKD
    """ Macedonian (not tested) """

    MSA = Language.MSA
    """ Malay (not tested) """

    NEP = Language.NEP
    """ Nepali (not tested) """

    NLD = Language.NLD
    """ Dutch """

    NOR = Language.NOR
    """ Norwegian """

    PAN = Language.PAN
    """ Panjabi (not tested) """

    POL = Language.POL
    """ Polish """

    POR = Language.POR
    """ Portuguese """

    RON = Language.RON
    """ Romanian """

    RUS = Language.RUS
    """ Russian """

    SLK = Language.SLK
    """ Slovak """

    SPA = Language.SPA
    """ Spanish """

    SQI = Language.SQI
    """ Albanian (not tested) """

    SRP = Language.SRP
    """ Serbian """

    SWA = Language.SWA
    """ Swahili """

    SWE = Language.SWE
    """ Swedish """

    TAM = Language.TAM
    """ Tamil (not tested) """

    TUR = Language.TUR
    """ Turkish """

    UKR = Language.UKR
    """ Ukrainian """

    VIE = Language.VIE
    """ Vietnamese (not tested) """

    YUE = Language.YUE
    """ Yue Chinese (not tested) """

    ZHO = Language.ZHO
    """ Chinese (not tested) """

    ENG_GBR = "eng-GBR"
    """ English (GB) """

    ENG_SCT = "eng-SCT"
    """ English (Scotland) (not tested) """

    ENG_USA = "eng-USA"
    """ English (USA) """

    SPA_ESP = "spa-ESP"
    """ Spanish (Castillan) """

    FRA_BEL = "fra-BEL"
    """ French (Belgium) (not tested) """

    FRA_FRA = "fra-FRA"
    """ French (France) """

    POR_BRA = "por-bra"
    """ Portuguese (Brazil) (not tested) """

    POR_PRT = "por-prt"
    """ Portuguese (Portugal) """

    AF = "af"
    """ Afrikaans """

    AN = "an"
    """ Aragonese (not tested) """

    BG = "bg"
    """ Bulgarian """

    BS = "bs"
    """ Bosnian (not tested) """

    CA = "ca"
    """ Catalan """

    CS = "cs"
    """ Czech """

    CY = "cy"
    """ Welsh """

    DA = "da"
    """ Danish """

    DE = "de"
    """ German """

    EL = "el"
    """ Greek (Modern) """

    EN = "en"
    """ English """

    EN_GB = "en-gb"
    """ English (GB) """

    EN_SC = "en-sc"
    """ English (Scotland) (not tested) """

    EN_UK_NORTH = "en-uk-north"
    """ English (Northern) (not tested) """

    EN_UK_RP = "en-uk-rp"
    """ English (Received Pronunciation) (not tested) """

    EN_UK_WMIDS = "en-uk-wmids"
    """ English (Midlands) (not tested) """

    EN_US = "en-us"
    """ English (USA) """

    EN_WI = "en-wi"
    """ English (West Indies) (not tested) """

    EO = "eo"
    """ Esperanto (not tested) """

    ES = "es"
    """ Spanish (Castillan) """

    ES_LA = "es-la"
    """ Spanish (Latin America) (not tested) """

    ET = "et"
    """ Estonian """

    FA = "fa"
    """ Persian """

    FA_PIN = "fa-pin"
    """ Persian (Pinglish) """

    FI = "fi"
    """ Finnish """

    FR = "fr"
    """ French """

    FR_BE = "fr-be"
    """ French (Belgium) (not tested) """

    FR_FR = "fr-fr"
    """ French (France) """

    GA = "ga"
    """ Irish """

    # NOTE already defined
    # COMMENTED GRC = "grc"
    # COMMENTED """ Greek (Ancient) """

    HI = "hi"
    """ Hindi (not tested) """

    HR = "hr"
    """ Croatian """

    HU = "hu"
    """ Hungarian """

    HY = "hy"
    """ Armenian (not tested) """

    HY_WEST = "hy-west"
    """ Armenian (West) (not tested) """

    ID = "id"
    """ Indonesian (not tested) """

    IS = "is"
    """ Icelandic """

    IT = "it"
    """ Italian """

    # NOTE already defined
    # COMMENTED JBO = "jbo"
    # COMMENTED """ Lojban (not tested) """

    KA = "ka"
    """ Georgian (not tested) """

    KN = "kn"
    """ Kannada (not tested) """

    KU = "ku"
    """ Kurdish (not tested) """

    LA = "la"
    """ Latin """

    # NOTE already defined
    # COMMENTED LFN = "lfn"
    # COMMENTED """ Lingua Franca Nova (not tested) """

    LT = "lt"
    """ Lithuanian """

    LV = "lv"
    """ Latvian """

    MK = "mk"
    """ Macedonian (not tested) """

    ML = "ml"
    """ Malayalam (not tested) """

    MS = "ms"
    """ Malay (not tested) """

    NE = "ne"
    """ Nepali (not tested) """

    NL = "nl"
    """ Dutch """

    NO = "no"
    """ Norwegian """

    PA = "pa"
    """ Panjabi (not tested) """

    PL = "pl"
    """ Polish """

    PT = "pt"
    """ Portuguese """

    PT_BR = "pt-br"
    """ Portuguese (Brazil) (not tested) """

    PT_PT = "pt-pt"
    """ Portuguese (Portugal) """

    RO = "ro"
    """ Romanian """

    RU = "ru"
    """ Russian """

    SQ = "sq"
    """ Albanian (not tested) """

    SK = "sk"
    """ Slovak """

    SR = "sr"
    """ Serbian """

    SV = "sv"
    """ Swedish """

    SW = "sw"
    """ Swahili """

    TA = "ta"
    """ Tamil (not tested) """

    TR = "tr"
    """ Turkish """

    UK = "uk"
    """ Ukrainian """

    VI = "vi"
    """ Vietnamese (not tested) """

    VI_HUE = "vi-hue"
    """ Vietnamese (hue) (not tested) """

    VI_SGN = "vi-sgn"
    """ Vietnamese (sgn) (not tested) """

    ZH = "zh"
    """ Mandarin Chinese (not tested) """

    ZH_YUE = "zh-yue"
    """ Yue Chinese (not tested) """

    # Maps every supported language code to its human-readable description.
    CODE_TO_HUMAN = {
        AFR: u"Afrikaans",
        ARG: u"Aragonese (not tested)",
        BOS: u"Bosnian (not tested)",
        BUL: u"Bulgarian",
        CAT: u"Catalan",
        CES: u"Czech",
        CMN: u"Mandarin Chinese (not tested)",
        CYM: u"Welsh",
        DAN: u"Danish",
        DEU: u"German",
        ELL: u"Greek (Modern)",
        ENG: u"English",
        EPO: u"Esperanto (not tested)",
        EST: u"Estonian",
        FAS: u"Persian",
        FIN: u"Finnish",
        FRA: u"French",
        GLE: u"Irish",
        GRC: u"Greek (Ancient)",
        HIN: u"Hindi (not tested)",
        HRV: u"Croatian",
        HUN: u"Hungarian",
        HYE: u"Armenian (not tested)",
        IND: u"Indonesian (not tested)",
        ISL: u"Icelandic",
        ITA: u"Italian",
        JBO: u"Lojban (not tested)",
        KAN: u"Kannada (not tested)",
        KAT: u"Georgian (not tested)",
        KUR: u"Kurdish (not tested)",
        LAT: u"Latin",
        LAV: u"Latvian",
        LFN: u"Lingua Franca Nova (not tested)",
        LIT: u"Lithuanian",
        MAL: u"Malayalam (not tested)",
        MKD: u"Macedonian (not tested)",
        MSA: u"Malay (not tested)",
        NEP: u"Nepali (not tested)",
        NLD: u"Dutch",
        NOR: u"Norwegian",
        PAN: u"Panjabi (not tested)",
        POL: u"Polish",
        POR: u"Portuguese",
        RON: u"Romanian",
        RUS: u"Russian",
        SLK: u"Slovak",
        SPA: u"Spanish",
        SQI: u"Albanian (not tested)",
        SRP: u"Serbian",
        SWA: u"Swahili",
        SWE: u"Swedish",
        TAM: u"Tamil (not tested)",
        TUR: u"Turkish",
        UKR: u"Ukrainian",
        VIE: u"Vietnamese (not tested)",
        YUE: u"Yue Chinese (not tested)",
        ZHO: u"Chinese (not tested)",
        ENG_GBR: u"English (GB)",
        ENG_SCT: u"English (Scotland) (not tested)",
        ENG_USA: u"English (USA)",
        SPA_ESP: u"Spanish (Castillan)",
        FRA_BEL: u"French (Belgium) (not tested)",
        FRA_FRA: u"French (France)",
        POR_BRA: u"Portuguese (Brazil) (not tested)",
        POR_PRT: u"Portuguese (Portugal)",
        AF: u"Afrikaans",
        AN: u"Aragonese (not tested)",
        BG: u"Bulgarian",
        BS: u"Bosnian (not tested)",
        CA: u"Catalan",
        CS: u"Czech",
        CY: u"Welsh",
        DA: u"Danish",
        DE: u"German",
        EL: u"Greek (Modern)",
        EN: u"English",
        EN_GB: u"English (GB)",
        EN_SC: u"English (Scotland) (not tested)",
        EN_UK_NORTH: u"English (Northern) (not tested)",
        EN_UK_RP: u"English (Received Pronunciation) (not tested)",
        EN_UK_WMIDS: u"English (Midlands) (not tested)",
        EN_US: u"English (USA)",
        EN_WI: u"English (West Indies) (not tested)",
        EO: u"Esperanto (not tested)",
        ES: u"Spanish (Castillan)",
        ES_LA: u"Spanish (Latin America) (not tested)",
        ET: u"Estonian",
        FA: u"Persian",
        FA_PIN: u"Persian (Pinglish)",
        FI: u"Finnish",
        FR: u"French",
        FR_BE: u"French (Belgium) (not tested)",
        FR_FR: u"French (France)",
        GA: u"Irish",
        HI: u"Hindi (not tested)",
        HR: u"Croatian",
        HU: u"Hungarian",
        HY: u"Armenian (not tested)",
        HY_WEST: u"Armenian (West) (not tested)",
        ID: u"Indonesian (not tested)",
        IS: u"Icelandic",
        IT: u"Italian",
        KA: u"Georgian (not tested)",
        KN: u"Kannada (not tested)",
        KU: u"Kurdish (not tested)",
        LA: u"Latin",
        LT: u"Lithuanian",
        LV: u"Latvian",
        MK: u"Macedonian (not tested)",
        ML: u"Malayalam (not tested)",
        MS: u"Malay (not tested)",
        NE: u"Nepali (not tested)",
        NL: u"Dutch",
        NO: u"Norwegian",
        PA: u"Panjabi (not tested)",
        PL: u"Polish",
        PT: u"Portuguese",
        PT_BR: u"Portuguese (Brazil) (not tested)",
        PT_PT: u"Portuguese (Portugal)",
        RO: u"Romanian",
        RU: u"Russian",
        SQ: u"Albanian (not tested)",
        SK: u"Slovak",
        SR: u"Serbian",
        SV: u"Swedish",
        SW: u"Swahili",
        TA: u"Tamil (not tested)",
        TR: u"Turkish",
        UK: u"Ukrainian",
        VI: u"Vietnamese (not tested)",
        VI_HUE: u"Vietnamese (hue) (not tested)",
        VI_SGN: u"Vietnamese (sgn) (not tested)",
        ZH: u"Mandarin Chinese (not tested)",
        ZH_YUE: u"Yue Chinese (not tested)",
    }

    # Sorted "code<TAB>description" strings built from CODE_TO_HUMAN.
    CODE_TO_HUMAN_LIST = sorted([u"%s\t%s" % (k, v) for k, v in CODE_TO_HUMAN.items()])

    # Maps every supported language code to the eSpeak voice code
    # passed to the engine.
    LANGUAGE_TO_VOICE_CODE = {
        AF: "af",
        AN: "an",
        BG: "bg",
        BS: "bs",
        CA: "ca",
        CS: "cs",
        CY: "cy",
        DA: "da",
        DE: "de",
        EL: "el",
        EN: "en",
        EN_GB: "en-gb",
        EN_SC: "en-sc",
        EN_UK_NORTH: "en-uk-north",
        EN_UK_RP: "en-uk-rp",
        EN_UK_WMIDS: "en-uk-wmids",
        EN_US: "en-us",
        EN_WI: "en-wi",
        EO: "eo",
        ES: "es",
        ES_LA: "es-la",
        ET: "et",
        FA: "fa",
        FA_PIN: "fa-pin",
        FI: "fi",
        FR: "fr",
        FR_BE: "fr-be",
        FR_FR: "fr-fr",
        GA: "ga",
        # COMMENTED GRC: "grc",
        HI: "hi",
        HR: "hr",
        HU: "hu",
        HY: "hy",
        HY_WEST: "hy-west",
        ID: "id",
        IS: "is",
        IT: "it",
        # COMMENTED JBO: "jbo",
        KA: "ka",
        KN: "kn",
        KU: "ku",
        LA: "la",
        # COMMENTED LFN: "lfn",
        LT: "lt",
        LV: "lv",
        MK: "mk",
        ML: "ml",
        MS: "ms",
        NE: "ne",
        NL: "nl",
        NO: "no",
        PA: "pa",
        PL: "pl",
        PT: "pt",
        PT_BR: "pt-br",
        PT_PT: "pt-pt",
        RO: "ro",
        RU: "ru",
        SQ: "sq",
        SK: "sk",
        SR: "sr",
        SV: "sv",
        SW: "sw",
        TA: "ta",
        TR: "tr",
        UK: "ru",   # NOTE mocking support for Ukrainian with Russian voice
        VI: "vi",
        VI_HUE: "vi-hue",
        VI_SGN: "vi-sgn",
        ZH: "zh",
        ZH_YUE: "zh-yue",
        AFR: "af",
        ARG: "an",
        BOS: "bs",
        BUL: "bg",
        CAT: "ca",
        CES: "cs",
        CMN: "zh",
        CYM: "cy",
        DAN: "da",
        DEU: "de",
        ELL: "el",
        ENG: "en",
        EPO: "eo",
        EST: "et",
        FAS: "fa",
        FIN: "fi",
        FRA: "fr",
        GLE: "ga",
        GRC: "grc",
        HIN: "hi",
        HRV: "hr",
        HUN: "hu",
        HYE: "hy",
        IND: "id",
        ISL: "is",
        ITA: "it",
        JBO: "jbo",
        KAN: "kn",
        KAT: "ka",
        KUR: "ku",
        LAT: "la",
        LAV: "lv",
        LFN: "lfn",
        LIT: "lt",
        MAL: "ml",
        MKD: "mk",
        MSA: "ms",
        NEP: "ne",
        NLD: "nl",
        NOR: "no",
        PAN: "pa",
        POL: "pl",
        POR: "pt",
        RON: "ro",
        RUS: "ru",
        SLK: "sk",
        SPA: "es",
        SQI: "sq",
        SRP: "sr",
        SWA: "sw",
        SWE: "sv",
        TAM: "ta",
        TUR: "tr",
        UKR: "ru",  # NOTE mocking support for Ukrainian with Russian voice
        VIE: "vi",
        YUE: "zh-yue",
        ZHO: "zh",
        ENG_GBR: "en-gb",
        ENG_SCT: "en-sc",
        ENG_USA: "en-us",
        SPA_ESP: "es-es",
        FRA_BEL: "fr-be",
        FRA_FRA: "fr-fr",
        POR_BRA: "pt-br",
        POR_PRT: "pt-pt"
    }

    DEFAULT_LANGUAGE = ENG

    DEFAULT_TTS_PATH = "espeak"

    OUTPUT_AUDIO_FORMAT = ("pcm_s16le", 1, 22050)

    HAS_SUBPROCESS_CALL = True

    HAS_C_EXTENSION_CALL = True

    C_EXTENSION_NAME = "cew"

    TAG = u"ESPEAKTTSWrapper"

    def __init__(self, rconf=None, logger=None):
        super(ESPEAKTTSWrapper, self).__init__(rconf=rconf, logger=logger)
        # command line template used when eSpeak is called via subprocess:
        # espeak -v <voice code> -w <wave path>, with the text sent on stdin
        self.set_subprocess_arguments([
            self.tts_path,
            u"-v",
            self.CLI_PARAMETER_VOICE_CODE_STRING,
            u"-w",
            self.CLI_PARAMETER_WAVE_PATH,
            self.CLI_PARAMETER_TEXT_STDIN
        ])

    def _synthesize_multiple_c_extension(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple text fragments, using the cew extension.

        If ``cew_subprocess_enabled`` is set in the runtime configuration,
        the synthesis is first attempted via
        :class:`~aeneas.cewsubprocess.CEWSubprocess`;
        if that attempt fails (or is not enabled),
        ``aeneas.cew.cew`` is called directly.

        Return a tuple (anchors, total_time, num_chars).

        On success, return ``(True, (anchors, total_time, num_chars))``;
        if calling ``aeneas.cew.cew`` directly raises, the error is logged
        and ``(False, None)`` is returned.

        :param text_file: the text file whose ``fragments`` are synthesized
        :param output_file_path: the path passed to the extension as output file
        :param quit_after: value converted to ``float`` and passed to the
                           extension; ``None`` is passed as ``0.0``
        :param bool backwards: if ``True``, pass ``1`` as backwards flag
                               to the extension and pair the returned
                               intervals with the fragments in reverse order
        :rtype: (bool, (list, :class:`~aeneas.exacttiming.TimeValue`, int))
        """
        self.log(u"Synthesizing using C extension...")

        # convert parameters from Python values to C values
        try:
            c_quit_after = float(quit_after)
        except TypeError:
            # quit_after is None (or not convertible): pass 0.0 instead
            c_quit_after = 0.0
        c_backwards = 0
        if backwards:
            c_backwards = 1
        self.log([u"output_file_path: %s", output_file_path])
        self.log([u"c_quit_after:     %.3f", c_quit_after])
        self.log([u"c_backwards:      %d", c_backwards])
        self.log(u"Preparing u_text...")
        u_text = []
        fragments = text_file.fragments
        for fragment in fragments:
            f_lang = fragment.language
            f_text = fragment.filtered_text
            if f_lang is None:
                f_lang = self.DEFAULT_LANGUAGE
            f_voice_code = self._language_to_voice_code(f_lang)
            if f_text is None:
                f_text = u""
            u_text.append((f_voice_code, f_text))
        self.log(u"Preparing u_text... done")

        # call C extension
        sr = None
        sf = None
        intervals = None
        if self.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED]:
            self.log(u"Using cewsubprocess to call aeneas.cew")
            try:
                self.log(u"Importing aeneas.cewsubprocess...")
                from aeneas.cewsubprocess import CEWSubprocess
                self.log(u"Importing aeneas.cewsubprocess... done")
                self.log(u"Calling aeneas.cewsubprocess...")
                cewsub = CEWSubprocess(rconf=self.rconf, logger=self.logger)
                sr, sf, intervals = cewsub.synthesize_multiple(output_file_path, c_quit_after, c_backwards, u_text)
                self.log(u"Calling aeneas.cewsubprocess... done")
            except Exception as exc:
                self.log_exc(u"An unexpected error occurred while running cewsubprocess", exc, False, None)
                # NOTE not critical, try calling aeneas.cew directly
                # COMMENTED return (False, None)

        # sr is still None if cewsubprocess was not enabled or it failed
        if sr is None:
            self.log(u"Preparing c_text...")
            if gf.PY2:
                # Python 2 => pass byte strings
                c_text = [(gf.safe_bytes(t[0]), gf.safe_bytes(t[1])) for t in u_text]
            else:
                # Python 3 => pass Unicode strings
                c_text = [(gf.safe_unicode(t[0]), gf.safe_unicode(t[1])) for t in u_text]
            self.log(u"Preparing c_text... done")

            self.log(u"Calling aeneas.cew directly")
            try:
                self.log(u"Importing aeneas.cew...")
                import aeneas.cew.cew
                self.log(u"Importing aeneas.cew... done")
                self.log(u"Calling aeneas.cew...")
                sr, sf, intervals = aeneas.cew.cew.synthesize_multiple(
                    output_file_path,
                    c_quit_after,
                    c_backwards,
                    c_text
                )
                self.log(u"Calling aeneas.cew... done")
            except Exception as exc:
                self.log_exc(u"An unexpected error occurred while running cew", exc, False, None)
                return (False, None)

        self.log([u"sr: %d", sr])
        self.log([u"sf: %d", sf])

        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0
        if backwards:
            fragments = fragments[::-1]
        for i in range(sf):
            # get the correct fragment
            fragment = fragments[i]
            # store for later output
            anchors.append([
                TimeValue(intervals[i][0]),
                fragment.identifier,
                fragment.filtered_text
            ])
            # increase the character counter
            num_chars += fragment.characters
            # update current_time
            current_time = TimeValue(intervals[i][1])

        # return output
        # NOTE anchors do not make sense if backwards == True
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Synthesizing using C extension... done")
        return (True, (anchors, current_time, num_chars))