from malaya_speech.utils.text import (
convert_to_ascii,
collapse_whitespace,
put_spacing_num,
tts_encode,
TextIDS,
)
from malaya_speech.supervised import tts
import numpy as np
import logging
from typing import Callable
logger = logging.getLogger('malaya_speech.tts')
_tacotron2_availability = {
'male': {
'Size (MB)': 104,
'Quantized Size (MB)': 26.3,
'Combined loss': 0.1838,
'Understand punctuation': True,
'Is lowercase': True,
},
'female': {
'Size (MB)': 104,
'Quantized Size (MB)': 26.3,
'Combined loss': 0.1887,
'Understand punctuation': True,
'Is lowercase': True,
},
'husein': {
'Size (MB)': 104,
'Quantized Size (MB)': 26.3,
'Combined loss': 0.1165,
'Understand punctuation': True,
'Is lowercase': True,
},
'haqkiem': {
'Size (MB)': 104,
'Quantized Size (MB)': 26.3,
'Combined loss': 0.1375,
'Understand punctuation': True,
'Is lowercase': True,
},
'female-singlish': {
'Size (MB)': 104,
'Quantized Size (MB)': 26.3,
'Combined loss': 0.0923,
'Understand punctuation': True,
'Is lowercase': True,
},
'yasmin': {
'Size (MB)': 104,
'Quantized Size (MB)': 26.3,
'Combined loss': 0.06874,
'Understand punctuation': True,
'Is lowercase': False,
},
'osman': {
'Size (MB)': 104,
'Quantized Size (MB)': 26.3,
'Combined loss': 0.06911,
'Understand punctuation': True,
'Is lowercase': False,
},
}
_fastspeech2_availability = {
'male': {
'Size (MB)': 125,
'Quantized Size (MB)': 31.7,
'Combined loss': 1.8,
'Understand punctuation': True,
'Is lowercase': True,
},
'female': {
'Size (MB)': 125,
'Quantized Size (MB)': 31.7,
'Combined loss': 1.932,
'Understand punctuation': True,
'Is lowercase': True,
},
'husein': {
'Size (MB)': 125,
'Quantized Size (MB)': 31.7,
'Combined loss': 0.5832,
'Understand punctuation': True,
'Is lowercase': True,
},
'haqkiem': {
'Size (MB)': 125,
'Quantized Size (MB)': 31.7,
'Combined loss': 0.5663,
'Understand punctuation': True,
'Is lowercase': True,
},
'female-singlish': {
'Size (MB)': 125,
'Quantized Size (MB)': 31.7,
'Combined loss': 0.5112,
'Understand punctuation': True,
'Is lowercase': True,
},
'yasmin': {
'Size (MB)': 125,
'Quantized Size (MB)': 31.7,
'Combined loss': 0.7212,
'Understand punctuation': True,
'Is lowercase': False,
},
'yasmin-small': {
'Size (MB)': 32.9,
'Quantized Size (MB)': 8.5,
'Combined loss': 0.7994,
'Understand punctuation': True,
'Is lowercase': False,
},
'osman': {
'Size (MB)': 125,
'Quantized Size (MB)': 31.7,
'Combined loss': 0.7341,
'Understand punctuation': True,
'Is lowercase': False,
},
'osman-small': {
'Size (MB)': 32.9,
'Quantized Size (MB)': 8.5,
'Combined loss': 0.8182,
'Understand punctuation': True,
'Is lowercase': False,
},
}
_fastpitch_availability = {
'male': {
'Size (MB)': 123,
'Quantized Size (MB)': 31.1,
'Combined loss': 1.614,
'Understand punctuation': True,
'Is lowercase': True,
},
'female': {
'Size (MB)': 123,
'Quantized Size (MB)': 31.1,
'Combined loss': 1.669,
'Understand punctuation': True,
'Is lowercase': True,
},
'husein': {
'Size (MB)': 123,
'Quantized Size (MB)': 31.1,
'Combined loss': 0.52515,
'Understand punctuation': True,
'Is lowercase': True,
},
'haqkiem': {
'Size (MB)': 123,
'Quantized Size (MB)': 31.1,
'Combined loss': 0.5186,
'Understand punctuation': True,
'Is lowercase': True,
},
}
_glowtts_availability = {
'male': {
'Size (MB)': 119,
'Quantized Size (MB)': 27.6,
'Combined loss': -1.429,
'Understand punctuation': True,
'Is lowercase': True,
},
'female': {
'Size (MB)': 119,
'Quantized Size (MB)': 27.6,
'Combined loss': -1.464,
'Understand punctuation': True,
'Is lowercase': True,
},
'haqkiem': {
'Size (MB)': 119,
'Quantized Size (MB)': 27.6,
'Combined loss': -1.649,
'Understand punctuation': True,
'Is lowercase': True,
},
'female-singlish': {
'Size (MB)': 119,
'Quantized Size (MB)': 27.6,
'Combined loss': -1.728,
'Understand punctuation': True,
'Is lowercase': True,
},
'yasmin': {
'Size (MB)': 119,
'Quantized Size (MB)': 27.6,
'Combined loss': -1.908,
'Understand punctuation': True,
'Is lowercase': False,
},
'osman': {
'Size (MB)': 119,
'Quantized Size (MB)': 27.6,
'Combined loss': -1.908,
'Understand punctuation': True,
'Is lowercase': False,
},
'multispeaker': {
'Size (MB)': 404,
'Quantized Size (MB)': 79.9,
'Combined loss': -1.882,
'Understand punctuation': True,
'Is lowercase': True,
}
}
_lightspeech_availability = {
'yasmin': {
'Size (MB)': 39.9,
'Quantized Size (MB)': 10.2,
'Combined loss': 0.7541,
'Understand punctuation': True,
'Is lowercase': False,
},
'osman': {
'Size (MB)': 39.9,
'Quantized Size (MB)': 10.2,
'Combined loss': 0.7541,
'Understand punctuation': True,
'Is lowercase': False,
},
}
[docs]def available_tacotron2():
"""
List available Tacotron2, Text to Mel models.
"""
from malaya_speech.utils import describe_availability
return describe_availability(
_tacotron2_availability,
text='`husein`, `haqkiem` and `female-singlish` combined loss from training set',
)
[docs]def available_fastspeech2():
"""
List available FastSpeech2, Text to Mel models.
"""
from malaya_speech.utils import describe_availability
return describe_availability(
_fastspeech2_availability,
text='`husein`, `haqkiem` and `female-singlish` combined loss from training set',
)
[docs]def available_fastpitch():
"""
List available FastPitch, Text to Mel models.
"""
from malaya_speech.utils import describe_availability
return describe_availability(
_fastpitch_availability,
text='`husein` and `haqkiem` combined loss from training set',
)
[docs]def available_glowtts():
"""
List available GlowTTS, Text to Mel models.
"""
from malaya_speech.utils import describe_availability
return describe_availability(
_glowtts_availability,
text='`haqkiem` and `female-singlish` combined loss from training set',
)
[docs]def available_lightspeech():
"""
List available LightSpeech, Text to Mel models.
"""
from malaya_speech.utils import describe_availability
return describe_availability(
_lightspeech_availability,
)
[docs]def load_text_ids(
pad_to: int = 8,
understand_punct: bool = True,
is_lower: bool = True,
**kwargs,
):
"""
Load text normalizer module use by Malaya-Speech TTS.
"""
try:
import malaya
from packaging import version
except BaseException:
raise ModuleNotFoundError(
'malaya not installed. Please install it by `pip install malaya` and try again.'
)
if version.parse(malaya.__version__) < version.parse('4.7.5'):
logger.warning('To get better speech synthesis, make sure Malaya version >= 4.7.5')
normalizer = malaya.normalize.normalizer()
sentence_tokenizer = malaya.text.function.split_into_sentences
return TextIDS(
pad_to=pad_to,
understand_punct=understand_punct,
is_lower=is_lower,
normalizer=normalizer,
sentence_tokenizer=sentence_tokenizer,
)
[docs]def tacotron2(
model: str = 'yasmin',
quantized: bool = False,
pad_to: int = 8,
**kwargs
):
"""
Load Tacotron2 TTS model.
Parameters
----------
model : str, optional (default='yasmin')
Model architecture supported. Allowed values:
* ``'female'`` - Tacotron2 trained on female voice.
* ``'male'`` - Tacotron2 trained on male voice.
* ``'husein'`` - Tacotron2 trained on Husein voice, https://www.linkedin.com/in/husein-zolkepli/
* ``'haqkiem'`` - Tacotron2 trained on Haqkiem voice, https://www.linkedin.com/in/haqkiem-daim/
* ``'yasmin'`` - Tacotron2 trained on female Yasmin voice.
* ``'osman'`` - Tacotron2 trained on male Osman voice.
* ``'female-singlish'`` - Tacotron2 trained on female Singlish voice, https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus
quantized : bool, optional (default=False)
if True, will load 8-bit quantized model.
Quantized model not necessary faster, totally depends on the machine.
pad_to : int, optional (default=8)
size of pad character with 0. Increase can stable up prediction on short sentence, we trained on 8.
Returns
-------
result : malaya_speech.model.synthesis.Tacotron class
"""
model = model.lower()
if model not in _tacotron2_availability:
raise ValueError(
'model not supported, please check supported models from `malaya_speech.tts.available_tacotron2()`.'
)
selected_model = _tacotron2_availability[model]
text_ids = load_text_ids(
pad_to=pad_to,
understand_punct=selected_model['Understand punctuation'],
is_lower=selected_model['Is lowercase'],
quantized=quantized,
**kwargs
)
return tts.tacotron_load(
model=model,
module='text-to-speech-tacotron',
normalizer=text_ids,
quantized=quantized,
**kwargs
)
[docs]def fastspeech2(
model: str = 'male',
quantized: bool = False,
pad_to: int = 8,
**kwargs
):
"""
Load Fastspeech2 TTS model.
Parameters
----------
model : str, optional (default='male')
Model architecture supported. Allowed values:
* ``'female'`` - Fastspeech2 trained on female voice.
* ``'male'`` - Fastspeech2 trained on male voice.
* ``'husein'`` - Fastspeech2 trained on Husein voice, https://www.linkedin.com/in/husein-zolkepli/
* ``'haqkiem'`` - Fastspeech2 trained on Haqkiem voice, https://www.linkedin.com/in/haqkiem-daim/
* ``'yasmin'`` - Fastspeech2 trained on female Yasmin voice.
* ``'osman'`` - Fastspeech2 trained on male Osman voice.
* ``'female-singlish'`` - Fastspeech2 trained on female Singlish voice, https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus
quantized : bool, optional (default=False)
if True, will load 8-bit quantized model.
Quantized model not necessary faster, totally depends on the machine.
pad_to : int, optional (default=8)
size of pad character with 0. Increase can stable up prediction on short sentence, we trained on 8.
Returns
-------
result : malaya_speech.model.synthesis.Fastspeech class
"""
model = model.lower()
if model not in _fastspeech2_availability:
raise ValueError(
'model not supported, please check supported models from `malaya_speech.tts.available_fastspeech2()`.'
)
selected_model = _fastspeech2_availability[model]
text_ids = load_text_ids(
pad_to=pad_to,
understand_punct=selected_model['Understand punctuation'],
is_lower=selected_model['Is lowercase'],
quantized=quantized,
**kwargs
)
return tts.fastspeech_load(
model=model,
module='text-to-speech-fastspeech',
normalizer=text_ids,
quantized=quantized,
**kwargs
)
[docs]def fastpitch(
model: str = 'male',
quantized: bool = False,
pad_to: int = 8,
**kwargs
):
"""
Load Fastspitch TTS model.
Parameters
----------
model : str, optional (default='male')
Model architecture supported. Allowed values:
* ``'female'`` - Fastpitch trained on female voice.
* ``'male'`` - Fastpitch trained on male voice.
* ``'husein'`` - Fastpitch trained on Husein voice, https://www.linkedin.com/in/husein-zolkepli/
* ``'haqkiem'`` - Fastpitch trained on Haqkiem voice, https://www.linkedin.com/in/haqkiem-daim/
quantized : bool, optional (default=False)
if True, will load 8-bit quantized model.
Quantized model not necessary faster, totally depends on the machine.
pad_to : int, optional (default=8)
size of pad character with 0. Increase can stable up prediction on short sentence, we trained on 8.
Returns
-------
result : malaya_speech.model.synthesis.Fastpitch class
"""
model = model.lower()
if model not in _fastpitch_availability:
raise ValueError(
'model not supported, please check supported models from `malaya_speech.tts.available_fastpitch()`.'
)
selected_model = _fastpitch_availability[model]
text_ids = load_text_ids(
pad_to=pad_to,
understand_punct=selected_model['Understand punctuation'],
is_lower=selected_model['Is lowercase'],
quantized=quantized,
**kwargs
)
return tts.fastpitch_load(
model=model,
module='text-to-speech-fastpitch',
normalizer=text_ids,
quantized=quantized,
**kwargs
)
[docs]def glowtts(model: str = 'yasmin',
quantized: bool = False,
pad_to: int = 2,
**kwargs):
"""
Load GlowTTS TTS model.
Parameters
----------
model : str, optional (default='yasmin')
Model architecture supported. Allowed values:
* ``'female'`` - GlowTTS trained on female voice.
* ``'male'`` - GlowTTS trained on male voice.
* ``'haqkiem'`` - GlowTTS trained on Haqkiem voice, https://www.linkedin.com/in/haqkiem-daim/
* ``'female-singlish'`` - GlowTTS trained on female Singlish voice, https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus
* ``'yasmin'`` - GlowTTS trained on female Yasmin voice.
* ``'osman'`` - GlowTTS trained on male Osman voice.
* ``'multispeaker'`` - Multispeaker GlowTTS trained on male, female, husein and haqkiem voices, also able to do voice conversion.
quantized : bool, optional (default=False)
if True, will load 8-bit quantized model.
Quantized model not necessary faster, totally depends on the machine.
pad_to : int, optional (default=2)
size of pad character with 0. Increase can stable up prediction on short sentence, we trained on 2.
Returns
-------
result : malaya_speech.model.synthesis.GlowTTS class
"""
model = model.lower()
if model not in _glowtts_availability:
raise ValueError(
'model not supported, please check supported models from `malaya_speech.tts.available_glowtts()`.'
)
selected_model = _glowtts_availability[model]
text_ids = load_text_ids(
pad_to=pad_to,
understand_punct=selected_model['Understand punctuation'],
is_lower=selected_model['Is lowercase'],
quantized=quantized,
**kwargs
)
return tts.glowtts_load(
model=model,
module='text-to-speech-glowtts',
normalizer=text_ids,
quantized=quantized,
**kwargs
)
[docs]def lightspeech(
model: str = 'male',
quantized: bool = False,
pad_to: int = 8,
**kwargs
):
"""
Load LightSpeech TTS model.
Parameters
----------
model : str, optional (default='male')
Model architecture supported. Allowed values:
* ``'yasmin'`` - LightSpeech trained on female Yasmin voice.
* ``'osman'`` - LightSpeech trained on male Osman voice.
quantized : bool, optional (default=False)
if True, will load 8-bit quantized model.
Quantized model not necessary faster, totally depends on the machine.
pad_to : int, optional (default=8)
size of pad character with 0. Increase can stable up prediction on short sentence, we trained on 8.
Returns
-------
result : malaya_speech.model.synthesis.Fastspeech class
"""
model = model.lower()
if model not in _lightspeech_availability:
raise ValueError(
'model not supported, please check supported models from `malaya_speech.tts.available_lightspeech()`.'
)
selected_model = _lightspeech_availability[model]
text_ids = load_text_ids(
pad_to=pad_to,
understand_punct=selected_model['Understand punctuation'],
is_lower=selected_model['Is lowercase'],
quantized=quantized,
**kwargs
)
return tts.fastspeech_load(
model=model,
module='text-to-speech-lightspeech',
normalizer=text_ids,
quantized=quantized,
**kwargs
)