Source code for malaya_speech.model.synthesis

from malaya_speech.model.frame import Frame
from malaya_speech.utils.padding import (
    sequence_1d,
)
from malaya_speech.utils.astype import float_to_int
from malaya_speech.utils.featurization import universal_mel
from malaya_speech.model.abstract import Abstract, TTS
from malaya_speech.utils.constant import MEL_MEAN, MEL_STD
from typing import Callable


[docs]class Vocoder(Abstract, TTS):
    def __init__(self, input_nodes, output_nodes, sess, model, name):
        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

[docs]    def predict(self, inputs):
        """
        Change Mel to Waveform.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
        Returns
        -------
        result: List
        """
        inputs = [
            input.array if isinstance(input, Frame) else input
            for input in inputs
        ]
        padded, lens = sequence_1d(inputs, return_len=True)

        r = self._execute(
            inputs=[padded],
            input_labels=['Placeholder'],
            output_labels=['logits'],
        )
        return r['logits'][:, :, 0]

    def __call__(self, input):
        return self.predict([input])[0]


[docs]class Tacotron(Abstract, TTS):
    def __init__(
        self, input_nodes, output_nodes, normalizer, stats, sess, model, name
    ):
        TTS.__init__(self)

        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._normalizer = normalizer
        self._stats = stats
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

[docs]    def predict(self, string, **kwargs):
        """
        Change string to Mel.

        Parameters
        ----------
        string: str

        Returns
        -------
        result: Dict[string, decoder-output, mel-output, universal-output, alignment]
        """

        t, ids = self._normalizer.normalize(string, **kwargs)
        r = self._execute(
            inputs=[[ids], [len(ids)]],
            input_labels=['Placeholder', 'Placeholder_1'],
            output_labels=[
                'decoder_output',
                'post_mel_outputs',
                'alignment_histories',
            ],
        )
        v = r['post_mel_outputs'][0] * self._stats[1] + self._stats[0]
        v = (v - MEL_MEAN) / MEL_STD
        return {
            'string': t,
            'ids': ids,
            'decoder-output': r['decoder_output'][0],
            'mel-output': r['post_mel_outputs'][0],
            'universal-output': v,
            'alignment': r['alignment_histories'][0],
        }

    def __call__(self, input):
        return self.predict(input)


[docs]class Fastspeech(Abstract, TTS):
    def __init__(
        self, input_nodes, output_nodes, normalizer, stats, sess, model, name
    ):
        TTS.__init__(self)

        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._normalizer = normalizer
        self._stats = stats
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

[docs]    def predict(
        self,
        string,
        speed_ratio: float = 1.0,
        f0_ratio: float = 1.0,
        energy_ratio: float = 1.0,
        **kwargs,
    ):
        """
        Change string to Mel.

        Parameters
        ----------
        string: str
        speed_ratio: float, optional (default=1.0)
            Increase this variable will increase time voice generated.
        f0_ratio: float, optional (default=1.0)
            Increase this variable will increase frequency, low frequency will generate more deeper voice.
        energy_ratio: float, optional (default=1.0)
            Increase this variable will increase loudness.

        Returns
        -------
        result: Dict[string, decoder-output, mel-output, universal-output]
        """
        t, ids = self._normalizer.normalize(string, **kwargs)
        r = self._execute(
            inputs=[[ids], [speed_ratio], [f0_ratio], [energy_ratio]],
            input_labels=[
                'Placeholder',
                'speed_ratios',
                'f0_ratios',
                'energy_ratios',
            ],
            output_labels=['decoder_output', 'post_mel_outputs'],
        )
        v = r['post_mel_outputs'][0] * self._stats[1] + self._stats[0]
        v = (v - MEL_MEAN) / MEL_STD
        return {
            'string': t,
            'ids': ids,
            'decoder-output': r['decoder_output'][0],
            'mel-output': r['post_mel_outputs'][0],
            'universal-output': v,
        }

    def __call__(self, input, **kwargs):
        return self.predict(input, **kwargs)


[docs]class FastspeechSDP(Abstract, TTS):
    def __init__(
        self, input_nodes, output_nodes, normalizer, stats, sess, model, name
    ):
        TTS.__init__(self)

        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._normalizer = normalizer
        self._stats = stats
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

[docs]    def predict(
        self,
        string,
        speed_ratio: float = 1.0,
        f0_ratio: float = 1.0,
        energy_ratio: float = 1.0,
        temperature_durator: float = 0.6666,
        **kwargs,
    ):
        """
        Change string to Mel.

        Parameters
        ----------
        string: str
        speed_ratio: float, optional (default=1.0)
            Increase this variable will increase time voice generated.
        f0_ratio: float, optional (default=1.0)
            Increase this variable will increase frequency, low frequency will generate more deeper voice.
        energy_ratio: float, optional (default=1.0)
            Increase this variable will increase loudness.
        temperature_durator: float, optional (default=0.66666)
            Durator trying to predict alignment with random.normal() * temperature_durator.

        Returns
        -------
        result: Dict[string, decoder-output, mel-output, universal-output]
        """
        t, ids = self._normalizer.normalize(string, **kwargs)
        r = self._execute(
            inputs=[[ids], speed_ratio, [f0_ratio], [energy_ratio], temperature_durator],
            input_labels=[
                'Placeholder',
                'speed_ratios',
                'f0_ratios',
                'energy_ratios',
                'noise_scale_w',
            ],
            output_labels=['decoder_output', 'post_mel_outputs'],
        )
        v = r['post_mel_outputs'][0] * self._stats[1] + self._stats[0]
        v = (v - MEL_MEAN) / MEL_STD
        return {
            'string': t,
            'ids': ids,
            'decoder-output': r['decoder_output'][0],
            'mel-output': r['post_mel_outputs'][0],
            'universal-output': v,
        }

    def __call__(self, input, **kwargs):
        return self.predict(input, **kwargs)


[docs]class E2E_FastSpeech(Abstract, TTS):
    def __init__(
        self, input_nodes, output_nodes, normalizer, stats, sess, model, name
    ):
        TTS.__init__(self, e2e=True)

        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._normalizer = normalizer
        self._stats = stats
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

[docs]    def predict(
        self,
        string,
        speed_ratio: float = 1.0,
        f0_ratio: float = 1.0,
        energy_ratio: float = 1.0,
        temperature_durator: float = 0.6666,
        **kwargs,
    ):
        """
        Change string to Mel.

        Parameters
        ----------
        string: str
        speed_ratio: float, optional (default=1.0)
            Increase this variable will increase time voice generated.
        f0_ratio: float, optional (default=1.0)
            Increase this variable will increase frequency, low frequency will generate more deeper voice.
        energy_ratio: float, optional (default=1.0)
            Increase this variable will increase loudness.
        temperature_durator: float, optional (default=0.66666)
            Durator trying to predict alignment with random.normal() * temperature_durator.

        Returns
        -------
        result: Dict[string, decoder-output, y]
        """
        t, ids = self._normalizer.normalize(string, **kwargs)
        r = self._execute(
            inputs=[[ids], speed_ratio, [f0_ratio], [energy_ratio], temperature_durator],
            input_labels=[
                'Placeholder',
                'speed_ratios',
                'f0_ratios',
                'energy_ratios',
                'noise_scale_w',
            ],
            output_labels=['y_hat'],
        )
        return {
            'string': t,
            'ids': ids,
            'y': r['y_hat'],
        }

    def __call__(self, input, **kwargs):
        return self.predict(input, **kwargs)


[docs]class FastVC(Abstract):
    def __init__(
        self,
        input_nodes,
        output_nodes,
        speaker_vector,
        magnitude,
        sess,
        model,
        name,
    ):
        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._speaker_vector = speaker_vector
        self._magnitude = magnitude
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

[docs]    def predict(self, original_audio, target_audio):
        """
        Change original voice audio to follow targeted voice.

        Parameters
        ----------
        original_audio: np.array or malaya_speech.model.frame.Frame
        target_audio: np.array or malaya_speech.model.frame.Frame

        Returns
        -------
        result: Dict[decoder-output, mel-output]
        """
        original_audio = (
            input.array if isinstance(original_audio, Frame) else original_audio
        )
        target_audio = (
            input.array if isinstance(target_audio, Frame) else target_audio
        )

        original_mel = universal_mel(original_audio)
        target_mel = universal_mel(target_audio)

        original_v = self._magnitude(self._speaker_vector([original_audio])[0])
        target_v = self._magnitude(self._speaker_vector([target_audio])[0])

        r = self._execute(
            inputs=[
                [original_mel],
                [original_v],
                [target_v],
                [len(original_mel)],
            ],
            input_labels=[
                'mel',
                'ori_vector',
                'target_vector',
                'mel_lengths',
            ],
            output_labels=['mel_before', 'mel_after'],
        )
        return {
            'decoder-output': r['mel_before'][0],
            'mel-output': r['mel_after'][0],
        }

    def __call__(self, original_audio, target_audio):
        return self.predict(original_audio, target_audio)


[docs]class Fastpitch(Abstract, TTS):
    def __init__(
        self, input_nodes, output_nodes, normalizer, stats, sess, model, name
    ):
        TTS.__init__(self)
        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._normalizer = normalizer
        self._stats = stats
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

[docs]    def predict(
        self,
        string,
        speed_ratio: float = 1.0,
        pitch_ratio: float = 1.0,
        pitch_addition: float = 0.0,
        **kwargs,
    ):
        """
        Change string to Mel.

        Parameters
        ----------
        string: str
        speed_ratio: float, optional (default=1.0)
            Increase this variable will increase time voice generated.
        pitch_ratio: float, optional (default=1.0)
            pitch = pitch * pitch_ratio, amplify existing pitch contour.
        pitch_addition: float, optional (default=0.0)
            pitch = pitch + pitch_addition, change pitch contour.

        Returns
        -------
        result: Dict[string, decoder-output, mel-output, pitch-output, universal-output]
        """
        t, ids = self._normalizer.normalize(string, **kwargs)
        r = self._execute(
            inputs=[[ids], [speed_ratio], [pitch_ratio], [pitch_addition]],
            input_labels=[
                'Placeholder',
                'speed_ratios',
                'pitch_ratios',
                'pitch_addition',
            ],
            output_labels=['decoder_output', 'post_mel_outputs', 'pitch_outputs'],
        )
        v = r['post_mel_outputs'][0] * self._stats[1] + self._stats[0]
        v = (v - MEL_MEAN) / MEL_STD
        return {
            'string': t,
            'ids': ids,
            'decoder-output': r['decoder_output'][0],
            'mel-output': r['post_mel_outputs'][0],
            'pitch-output': r['pitch_outputs'][0],
            'universal-output': v,
        }

    def __call__(self, input, **kwargs):
        return self.predict(input, **kwargs)


class GlowTTS(Abstract, TTS):
    def __init__(
        self, input_nodes, output_nodes, normalizer, stats, sess, model, name, **kwargs
    ):
        TTS.__init__(self)
        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._normalizer = normalizer
        self._stats = stats
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

    def predict(
        self,
        string,
        temperature: float = 0.3333,
        length_ratio: float = 1.0,
        **kwargs,
    ):
        """
        Change string to Mel.

        Parameters
        ----------
        string: str
        temperature: float, optional (default=0.3333)
            Decoder model trying to decode with encoder(text) + random.normal() * temperature.
        length_ratio: float, optional (default=1.0)
            Increase this variable will increase time voice generated.

        Returns
        -------
        result: Dict[string, ids, mel-output, alignment, universal-output]
        """
        t, ids = self._normalizer.normalize(string, **kwargs)
        r = self._execute(
            inputs=[[ids], [len(ids)], [temperature], [length_ratio]],
            input_labels=[
                'input_ids',
                'lens',
                'temperature',
                'length_ratio',
            ],
            output_labels=['mel_output', 'alignment_histories'],
        )
        v = r['mel_output'][0] * self._stats[1] + self._stats[0]
        v = (v - MEL_MEAN) / MEL_STD
        return {
            'string': t,
            'ids': ids,
            'mel-output': r['mel_output'][0],
            'alignment': r['alignment_histories'][0].T,
            'universal-output': v,
        }

    def __call__(self, input, **kwargs):
        return self.predict(input, **kwargs)


class GlowTTS_MultiSpeaker(Abstract, TTS):
    def __init__(
        self, input_nodes, output_nodes, normalizer, speaker_vector, stats, sess, model, name
    ):
        TTS.__init__(self)
        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._normalizer = normalizer
        self._speaker_vector = speaker_vector
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

    def _predict(self,
                 string, left_audio, right_audio,
                 temperature: float = 0.3333,
                 length_ratio: float = 1.0,
                 **kwargs):
        t, ids = self._normalizer.normalize(string, **kwargs)
        left_v = self._speaker_vector([left_audio])
        right_v = self._speaker_vector([right_audio])
        r = self._execute(
            inputs=[[ids], [len(ids)], [temperature], [length_ratio], left_v, right_v],
            input_labels=[
                'input_ids',
                'lens',
                'temperature',
                'length_ratio',
                'speakers',
                'speakers_right',
            ],
            output_labels=['mel_output', 'alignment_histories'],
        )
        return {
            'string': t,
            'ids': ids,
            'alignment': r['alignment_histories'][0].T,
            'universal-output': r['mel_output'][0][:-8],
        }

    def predict(
        self,
        string,
        audio,
        temperature: float = 0.3333,
        length_ratio: float = 1.0,
        **kwargs,
    ):
        """
        Change string to Mel.

        Parameters
        ----------
        string: str
        audio: np.array
            np.array or malaya_speech.model.frame.Frame, must in 16k format.
            We only trained on `female`, `male`, `husein` and `haqkiem` speakers.
        temperature: float, optional (default=0.3333)
            Decoder model trying to decode with encoder(text) + random.normal() * temperature.
        length_ratio: float, optional (default=1.0)
            Increase this variable will increase time voice generated.

        Returns
        -------
        result: Dict[string, ids, alignment, universal-output]
        """
        return self._predict(string=string,
                             left_audio=audio, right_audio=audio,
                             temperature=temperature, length_ratio=length_ratio, **kwargs)

    def voice_conversion(self, string, original_audio, target_audio,
                         temperature: float = 0.3333,
                         length_ratio: float = 1.0,
                         **kwargs,):
        """
        Change string to Mel.

        Parameters
        ----------
        string: str
        original_audio: np.array
            original speaker to encode speaking style, must in 16k format.
        target_audio: np.array
            target speaker to follow speaking style from `original_audio`, must in 16k format.
        temperature: float, optional (default=0.3333)
            Decoder model trying to decode with encoder(text) + random.normal() * temperature.
        length_ratio: float, optional (default=1.0)
            Increase this variable will increase time voice generated.

        Returns
        -------
        result: Dict[string, ids, alignment, universal-output]
        """
        return self._predict(string=string,
                             left_audio=original_audio, right_audio=target_audio,
                             temperature=temperature, length_ratio=length_ratio, **kwargs)

    def __call__(self, input, **kwargs):
        return self.predict(input, **kwargs)


class VITS(Abstract, TTS):
    def __init__(
        self, input_nodes, output_nodes, normalizer, sess, model, name
    ):
        TTS.__init__(self, e2e=True)

        self._input_nodes = input_nodes
        self._output_nodes = output_nodes
        self._normalizer = normalizer
        self._sess = sess
        self.__model__ = model
        self.__name__ = name

    def predict(
        self,
        string,
        temperature: float = 0.5,
        temperature_durator: float = 0.5,
        length_ratio: float = 1.0,
        **kwargs,
    ):
        """
        Change string to waveform.

        Parameters
        ----------
        string: str
        temperature: float, optional (default=0.5)
            Decoder model trying to decode with encoder(text) + random.normal() * temperature.
        temperature_durator: float, optional (default=1.0)
            Durator trying to predict alignment with random.normal() * temperature_durator.
            Only useful for SDP-based models.
        length_ratio: float, optional (default=1.0)
            Increase this variable will increase time voice generated.

        Returns
        -------
        result: Dict[string, ids, mel-output, alignment, y]
        """
        t, ids = self._normalizer.normalize(string, **kwargs)
        inputs = [[ids], [len(ids)], [temperature], [length_ratio]]
        input_labels = [
            'input_ids',
            'lens',
            'temperature',
            'length_ratio',
        ]
        if 'sdp' in self.__model__:
            inputs.append([temperature_durator])
            input_labels.append('noise_scale_w')
        r = self._execute(
            inputs=inputs,
            input_labels=input_labels,
            output_labels=['mel_output', 'alignment_histories', 'y_hat'],
        )
        return {
            'string': t,
            'ids': ids,
            'mel-output': r['mel_output'],
            'alignment': r['alignment_histories'][0].T,
            'y': r['y_hat'],
        }

    def __call__(self, input, **kwargs):
        return self.predict(input, **kwargs)