API#

malaya_speech#

malaya_speech.augmentation.spectrogram#

malaya_speech.augmentation.spectrogram.mask_frequency(features, n_freq_mask=2, width_freq_mask=8, random_band=True)[source]#

Mask frequency.

Parameters
  • features (np.array) –

  • n_freq_mask (int, optional (default=2)) – number of frequency masks to apply.

  • width_freq_mask (int, optional (default=8)) – masking size.

Returns

result

Return type

np.array

malaya_speech.augmentation.spectrogram.mask_time(features, n_time_mask=2, width_time_mask=8, random_band=True)[source]#

Mask time.

Parameters
  • features (np.array) –

  • n_time_mask (int, optional (default=2)) – number of time masks to apply.

  • width_time_mask (int, optional (default=8)) – masking size.

Returns

result

Return type

np.array
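
Example (a minimal sketch of SpecAugment-style masking; the feature shape and axis ordering below are placeholder assumptions, not part of the documented API):

>>> import numpy as np
>>> from malaya_speech.augmentation.spectrogram import mask_frequency, mask_time
>>> features = np.random.rand(200, 80)  # placeholder melspectrogram
>>> masked = mask_frequency(features, n_freq_mask=2, width_freq_mask=8)
>>> masked = mask_time(masked, n_time_mask=2, width_time_mask=8)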

malaya_speech.augmentation.spectrogram.tf_mask_frequency(features, n_freq_mask=2, F=27)[source]#

Mask frequency using Tensorflow.

Parameters
  • features (np.array) –

  • F (size of mask for frequency) –

malaya_speech.augmentation.spectrogram.tf_mask_time(features, n_time_mask=2, T=80)[source]#

Mask time using Tensorflow.

Parameters
  • features (np.array) –

  • T (size of mask for time) –

malaya_speech.extra.rttm#

malaya_speech.extra.rttm.load(file)[source]#

Load RTTM file.

Parameters

file (str) –

Returns

result

Return type

Dict[str, malaya_speech.model.annotation.Annotation]
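
Example (a minimal sketch; 'diarization.rttm' is a hypothetical file path):

>>> from malaya_speech.extra.rttm import load
>>> annotations = load('diarization.rttm')
>>> list(annotations.keys())  # one Annotation per recording ID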

malaya_speech.extra.visualization#

malaya_speech.extra.visualization.visualize_vad(signal, preds, sample_rate=16000, figsize=(15, 3), ax=None, **kwargs)[source]#

Visualize signal given VAD labels. Green indicates voice activity, red indicates no voice activity.

Parameters
  • signal (list / np.array) –

  • preds (List[Tuple[Frame, bool]]) –

  • sample_rate (int, optional (default=16000)) –

  • figsize (Tuple[int, int], optional (default=(15, 3))) – matplotlib figure size.
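
Example (a minimal sketch; it assumes y is a loaded audio signal and preds is VAD output in the documented List[Tuple[Frame, bool]] format):

>>> import matplotlib.pyplot as plt
>>> from malaya_speech.extra.visualization import visualize_vad
>>> visualize_vad(y, preds, sample_rate=16000)
>>> plt.show()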

malaya_speech.extra.visualization.plot_classification(preds, description, ax=None, fontsize_text=14, x_text=0.05, y_text=0.2, ylim=(0.1, 0.9), figsize=(15, 3), **kwargs)[source]#

Visualize probability / boolean.

Parameters
  • preds (List[Tuple[Frame, label]]) –

  • description (str) –

  • ax (ax, optional (default = None)) –

  • fontsize_text (int, optional (default = 14)) –

  • x_text (float, optional (default = 0.05)) –

  • y_text (float, optional (default = 0.2)) –

malaya_speech.model.classification.Speakernet#

class malaya_speech.model.classification.Speakernet[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – array of shape [B, D].

Return type

np.array
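
Example (a minimal sketch; 'speaker.wav' is a hypothetical file and it assumes the speaker vector loader returns a model exposing this vectorize method):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.speaker_vector.deep_model(model='vggvox-v2')
>>> y, sr = load('speaker.wav', sr=16000)
>>> embedding = model.vectorize([y])  # np.array of shape [1, D]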

malaya_speech.model.classification.Speaker2Vec#

class malaya_speech.model.classification.Speaker2Vec[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – array of shape [B, D].

Return type

np.array

malaya_speech.model.classification.SpeakernetClassification#

class malaya_speech.model.classification.SpeakernetClassification[source]#
predict_proba(inputs)[source]#

Predict inputs, will return probability.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – probability array of shape [B, D].

Return type

np.array

predict(inputs)[source]#

Predict inputs, will return labels.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – labels of length B.

Return type

List[str]

malaya_speech.model.classification.Classification#

class malaya_speech.model.classification.Classification[source]#
predict_proba(inputs)[source]#

Predict inputs, will return probability.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – probability array of shape [B, D].

Return type

np.array

predict(inputs)[source]#

Predict inputs, will return labels.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – labels of length B.

Return type

List[str]
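
Example (a minimal sketch; 'speech.wav' is a hypothetical file and it assumes the emotion loader returns a classification model exposing these methods):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.emotion.deep_model(model='vggvox-v2')
>>> y, sr = load('speech.wav', sr=16000)
>>> model.predict_proba([y])  # np.array of shape [B, D]
>>> model.predict([y])        # List[str] of labels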

malaya_speech.model.clustering.AgglomerativeClustering#

class malaya_speech.model.clustering.AgglomerativeClustering(min_clusters, max_clusters, metric='cosine', threshold=0.25, method='centroid')[source]#
fit_predict(X)[source]#

Fit predict.

Parameters

X (np.array) – inputs with size of [batch_size, embedding size]

Returns

result

Return type

np.array
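
Example (a minimal sketch; the embeddings here are random placeholders standing in for speaker vectors):

>>> import numpy as np
>>> from malaya_speech.model.clustering import AgglomerativeClustering
>>> X = np.random.rand(10, 512)  # [batch_size, embedding size]
>>> clustering = AgglomerativeClustering(min_clusters=2, max_clusters=5)
>>> labels = clustering.fit_predict(X)  # one cluster label per row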

malaya_speech.model.clustering.HiddenMarkovModelClustering#

class malaya_speech.model.clustering.HiddenMarkovModelClustering(min_clusters, max_clusters, metric='cosine', covariance_type='diag', threshold=0.25, single_cluster_detection_quantile=0.05, single_cluster_detection_threshold=1.15)[source]#
fit_predict(X)[source]#

Fit predict.

Parameters

X (np.array) – inputs with size of [batch_size, embedding size]

Returns

result

Return type

np.array

malaya_speech.model.huggingface.HuggingFace_CTC#

class malaya_speech.model.huggingface.HuggingFace_CTC[source]#
greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict(inputs)[source]#

Predict logits from inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

gradio(record_mode=True, lm_func=None, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode; otherwise, file upload mode.

  • lm_func (Callable, optional (default=None)) – if not None, logits with shape [T, D] will be passed to this function.

  • **kwargs (keyword arguments for iface.launch.) –
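
Example (a minimal sketch; 'speech.wav' is a hypothetical file and it assumes the model loaded from malaya_speech.stt.huggingface exposes these methods):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.stt.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed')
>>> y, sr = load('speech.wav', sr=16000)
>>> model.greedy_decoder([y])  # List[str]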

malaya_speech.model.huggingface.HuggingFace_Aligner#

class malaya_speech.model.huggingface.HuggingFace_Aligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Transcribe input, will return a string.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.model.splitter.Split_Wav#

class malaya_speech.model.splitter.Split_Wav[source]#
predict(input)[source]#

Split an audio into 4 different speakers.

Parameters

input (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

np.array

malaya_speech.model.splitter.Split_Mel#

class malaya_speech.model.splitter.Split_Mel[source]#
predict(input)[source]#

Split an audio into 4 different speakers.

Parameters

input (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

np.array

malaya_speech.model.splitter.FastSpeechSplit#

class malaya_speech.model.splitter.FastSpeechSplit[source]#
predict(original_audio, target_audio, modes=['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'])[source]#

Change original voice audio to follow targeted voice.

Parameters
  • original_audio (np.array or malaya_speech.model.frame.Frame) –

  • target_audio (np.array or malaya_speech.model.frame.Frame) –

  • modes (List[str], optional (default = ['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'])) –

    R denotes rhythm, F denotes pitch target, U denotes speaker target (vector).

    • 'R' - maintain original_audio F and U on target_audio R.

    • 'F' - maintain original_audio R and U on target_audio F.

    • 'U' - maintain original_audio R and F on target_audio U.

    • 'RF' - maintain original_audio U on target_audio R and F.

    • 'RU' - maintain original_audio F on target_audio R and U.

    • 'FU' - maintain original_audio R on target_audio F and U.

    • 'RFU' - no conversion happened, just do encoder-decoder on target_audio

Returns

result

Return type

Dict[modes]
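
Example (a minimal sketch; the wav file names are hypothetical and it assumes malaya_speech.speechsplit_conversion.deep_conversion returns this class, as documented later in this page):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.speechsplit_conversion.deep_conversion(model='fastspeechsplit-v2-vggvox-v2')
>>> original, _ = load('original.wav')
>>> target, _ = load('target.wav')
>>> out = model.predict(original, target, modes=['R', 'F', 'U'])
>>> sorted(out.keys())  # one output per requested mode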

malaya_speech.model.synthesis.TTS#

class malaya_speech.model.synthesis.TTS[source]#
gradio(vocoder=None, **kwargs)[source]#

Text-to-Speech on Gradio interface.

Parameters
  • vocoder (Callable, optional (default=None)) – vocoder object that has a predict method, preferably from malaya_speech itself. Not required when using an End-to-End TTS model such as VITS.

  • **kwargs (keyword arguments for predict and iface.launch.) –

malaya_speech.model.synthesis.Vocoder#

class malaya_speech.model.synthesis.Vocoder[source]#
predict(inputs)[source]#

Change Mel to Waveform.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List

malaya_speech.model.synthesis.Tacotron#

class malaya_speech.model.synthesis.Tacotron[source]#
predict(string, **kwargs)[source]#

Change string to Mel.

Parameters

string (str) –

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output, alignment]

malaya_speech.model.synthesis.Fastspeech#

class malaya_speech.model.synthesis.Fastspeech[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this variable will increase the frequency; a lower frequency generates a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this variable will increase loudness.

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output]
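
Example (a minimal sketch; the sentence is arbitrary and it assumes malaya_speech.tts.fastspeech2 returns this class, as documented later in this page):

>>> import malaya_speech
>>> model = malaya_speech.tts.fastspeech2(model='osman')
>>> r = model.predict('selamat pagi malaysia', speed_ratio=1.0)
>>> r['mel-output']  # melspectrogram to feed into a vocoder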

malaya_speech.model.synthesis.FastspeechSDP#

class malaya_speech.model.synthesis.FastspeechSDP[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, temperature_durator=0.6666, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this variable will increase the frequency; a lower frequency generates a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this variable will increase loudness.

  • temperature_durator (float, optional (default=0.6666)) – The durator predicts alignment with random.normal() * temperature_durator.

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output]

malaya_speech.model.synthesis.E2E_FastSpeech#

class malaya_speech.model.synthesis.E2E_FastSpeech[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, temperature_durator=0.6666, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this variable will increase the frequency; a lower frequency generates a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this variable will increase loudness.

  • temperature_durator (float, optional (default=0.6666)) – The durator predicts alignment with random.normal() * temperature_durator.

Returns

result

Return type

Dict[string, decoder-output, y]

malaya_speech.model.synthesis.FastVC#

class malaya_speech.model.synthesis.FastVC[source]#
predict(original_audio, target_audio)[source]#

Change original voice audio to follow targeted voice.

Parameters
  • original_audio (np.array or malaya_speech.model.frame.Frame) –

  • target_audio (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

Dict[decoder-output, mel-output]

malaya_speech.model.synthesis.Fastpitch#

class malaya_speech.model.synthesis.Fastpitch[source]#
predict(string, speed_ratio=1.0, pitch_ratio=1.0, pitch_addition=0.0, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

  • pitch_ratio (float, optional (default=1.0)) – pitch = pitch * pitch_ratio, amplify existing pitch contour.

  • pitch_addition (float, optional (default=0.0)) – pitch = pitch + pitch_addition, change pitch contour.

Returns

result

Return type

Dict[string, decoder-output, mel-output, pitch-output, universal-output]

malaya_speech.model.transducer.Transducer#

class malaya_speech.model.transducer.Transducer[source]#
predict_alignment(input, combined=True)[source]#

Transcribe input and get timestamps; only supports greedy decoder.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • combined (bool, optional (default=True)) – If True, will combine subwords into words.

Returns

result

Return type

List[Dict[text, start, end]]

greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

beam_decoder(inputs, beam_width=5, temperature=0.0, score_norm=True)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=5)) – beam size for beam decoder.

  • temperature (float, optional (default=0.0)) – apply a temperature function to the logits, which can help in certain cases, logits += -np.log(-np.log(uniform_noise_shape_logits)) * temperature

  • score_norm (bool, optional (default=True)) – sort beams in descending order by score / length of the decoded sequence.

Returns

result

Return type

List[str]

beam_decoder_lm(inputs, language_model, beam_width=5, token_min_logp=- 20.0, beam_prune_logp=- 5.0, temperature=0.0, score_norm=True)[source]#

Transcribe inputs using beam decoder + KenLM.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • language_model (pyctcdecode.language_model.LanguageModel) – pyctcdecode language model, load from LanguageModel(kenlm_model, alpha = alpha, beta = beta).

  • beam_width (int, optional (default=5)) – beam size for beam decoder.

  • token_min_logp (float, optional (default=-20.0)) – minimum log probability to select a token.

  • beam_prune_logp (float, optional (default=-5.0)) – filter candidates >= max score lm + beam_prune_logp.

  • temperature (float, optional (default=0.0)) – apply a temperature function to the logits, which can help in certain cases, logits += -np.log(-np.log(uniform_noise_shape_logits)) * temperature

  • score_norm (bool, optional (default=True)) – sort beams in descending order by score / length of the decoded sequence.

Returns

result

Return type

List[str]
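
Example (a minimal sketch; 'speech.wav', alpha and beta are placeholder values, and it assumes the kenlm and pyctcdecode packages are installed):

>>> import kenlm
>>> import malaya_speech
>>> from pyctcdecode.language_model import LanguageModel
>>> from malaya_speech.utils.read import load
>>> lm_path = malaya_speech.language_model.kenlm(model='dump-combined')
>>> language_model = LanguageModel(kenlm.Model(lm_path), alpha=0.5, beta=1.0)
>>> model = malaya_speech.stt.deep_transducer(model='conformer')
>>> y, sr = load('speech.wav', sr=16000)
>>> model.beam_decoder_lm([y], language_model, beam_width=5)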

predict(inputs)[source]#

Transcribe inputs using greedy decoder, will return list of strings.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

gradio(record_mode=True, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode; otherwise, file upload mode.

  • **kwargs (keyword arguments for beam decoder and iface.launch.) –

malaya_speech.model.transducer.TransducerAligner#

class malaya_speech.model.transducer.TransducerAligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Transcribe input, will return a string.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[words_alignment, subwords_alignment, subwords, alignment]
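
Example (a minimal sketch; the audio file and transcription are placeholders, and it assumes malaya_speech.force_alignment.deep_transducer returns this class, as documented later in this page):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.force_alignment.deep_transducer(model='conformer-transducer')
>>> y, sr = load('speech.wav', sr=16000)
>>> result = model.predict(y, 'contoh transkripsi audio')
>>> result['words_alignment']  # word-level timestamps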

malaya_speech.model.unet.UNET#

class malaya_speech.model.unet.UNET[source]#
predict(inputs)[source]#

Enhance inputs, will return melspectrogram.

Parameters

inputs (List[np.array]) –

Returns

result

Return type

List

malaya_speech.model.unet.UNETSTFT#

class malaya_speech.model.unet.UNETSTFT[source]#
predict(input)[source]#

Enhance inputs, will return waveform.

Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame.

Returns

result

Return type

Dict

malaya_speech.model.unet.UNET1D#

class malaya_speech.model.unet.UNET1D[source]#
predict(input)[source]#

Enhance inputs, will return waveform.

Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame.

Returns

result

Return type

np.array

malaya_speech.model.wav2vec.Wav2Vec2_CTC#

class malaya_speech.model.wav2vec.Wav2Vec2_CTC[source]#
greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

beam_decoder(inputs, beam_width=100, **kwargs)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=100)) – beam size for beam decoder.

Returns

result

Return type

List[str]

predict(inputs)[source]#

Predict logits from inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

gradio(record_mode=True, lm_func=None, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode; otherwise, file upload mode.

  • lm_func (Callable, optional (default=None)) – if not None, logits with shape [T, D] will be passed to this function.

  • **kwargs (keyword arguments for beam decoder and iface.launch.) –

malaya_speech.model.wav2vec.Wav2Vec2_Aligner#

class malaya_speech.model.wav2vec.Wav2Vec2_Aligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Transcribe input, will return a string.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.model.webrtc.WebRTC#

class malaya_speech.model.webrtc.WebRTC(vad, sample_rate=16000, minimum_amplitude=100)[source]#

malaya_speech.torch_model.super_resolution.VoiceFixer#

class malaya_speech.torch_model.super_resolution.VoiceFixer[source]#
predict(input, remove_higher_frequency=True)[source]#
Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame, must be audio with a 44100 sampling rate.

  • remove_higher_frequency (bool, optional (default = True)) – Remove high frequency before neural upsampling.

Returns

result

Return type

np.array with 44100 sampling rate

forward(input, remove_higher_frequency=True)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

malaya_speech.torch_model.super_resolution.NVSR#

class malaya_speech.torch_model.super_resolution.NVSR[source]#
predict(input)[source]#
Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame, must be audio with a 44100 sampling rate.

Returns

result

Return type

np.array with 44100 sampling rate

malaya_speech.torch_model.synthesis.VITS#

class malaya_speech.torch_model.synthesis.VITS[source]#
predict(string, temperature=0.6666, temperature_durator=0.6666, length_ratio=1.0, **kwargs)[source]#

Change string to waveform.

Parameters
  • string (str) –

  • temperature (float, optional (default=0.6666)) – The decoder decodes with encoder(text) + random.normal() * temperature.

  • temperature_durator (float, optional (default=0.6666)) – The durator predicts alignment with random.normal() * temperature_durator.

  • length_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

Returns

result

Return type

Dict[string, ids, alignment, y]
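
Example (a minimal sketch; it assumes model is an already-loaded VITS instance, since the loader itself is not shown in this excerpt):

>>> r = model.predict('selamat pagi malaysia', temperature=0.6666)
>>> r['y']  # generated waveform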

malaya_speech.pipeline#

class malaya_speech.pipeline.Pipeline[source]#
visualize(filename='pipeline.png', **kwargs)[source]#

Render the computation of this object’s task graph using graphviz.

Requires graphviz to be installed.

Parameters
  • filename (str, optional) – The name of the file to write to disk.

  • kwargs – Graph attributes to pass to graphviz like rankdir="LR"

batching = <function batching>#
flatten = <function flatten>#
foreach_map = <function foreach_map>#
map = <function map>#
partition = <function partition>#
sliding_window = <function sliding_window>#
zip = <function zip>#

malaya_speech.pipeline.map#

class malaya_speech.pipeline.map[source]#

apply a function / method to the pipeline

Examples

>>> source = Pipeline()
>>> source.map(lambda x: x + 1).map(print)
>>> source.emit(1)
2

malaya_speech.pipeline.batching#

class malaya_speech.pipeline.batching[source]#

Batching stream into tuples

Examples

>>> source = Pipeline()
>>> source.batching(2).map(print)
>>> source.emit([1,2,3,4,5])
([1, 2], [3, 4], [5])

malaya_speech.pipeline.partition#

class malaya_speech.pipeline.partition[source]#

Partition stream into tuples of equal size

Examples

>>> source = Pipeline()
>>> source.partition(3).map(print)
>>> for i in range(10):
...     source.emit(i)
(0, 1, 2)
(3, 4, 5)
(6, 7, 8)

malaya_speech.pipeline.sliding_window#

class malaya_speech.pipeline.sliding_window[source]#

Produce overlapping tuples of size n

Parameters

return_partial (bool) – If True, yield tuples as soon as any events come in, each tuple being smaller or equal to the window size. If False, only start yielding tuples once a full window has accrued.

Examples

>>> source = Pipeline()
>>> source.sliding_window(3, return_partial=False).map(print)
>>> for i in range(8):
...     source.emit(i)
(0, 1, 2)
(1, 2, 3)
(2, 3, 4)
(3, 4, 5)
(4, 5, 6)
(5, 6, 7)

malaya_speech.pipeline.foreach_map#

class malaya_speech.pipeline.foreach_map[source]#

Apply a function to every element in a tuple in the stream.

Parameters
  • func (callable) –

  • method (str, optional (default='sync')) –

    method to process each elements.

    • 'sync' - process elements one-by-one in a loop.

    • 'async' - process all elements asynchronously at the same time.

    • 'thread' - use multithreading to process all elements at the same time.

      Default is 1 worker. Override worker_size=n to increase.

    • 'process' - use multiprocessing to process all elements at the same time.

      Default is 1 worker. Override worker_size=n to increase.

  • *args – The arguments to pass to the function.

  • **kwargs – Keyword arguments to pass to func.

Examples

>>> source = Pipeline()
>>> source.foreach_map(lambda x: 2*x).map(print)
>>> for i in range(3):
...     source.emit((i, i))
(0, 0)
(2, 2)
(4, 4)

malaya_speech.pipeline.flatten#

class malaya_speech.pipeline.flatten[source]#

Flatten streams of lists or iterables into a stream of elements

Examples

>>> source = Pipeline()
>>> source.flatten().map(print)
>>> source.emit([[1, 2, 3], [4, 5], [6, 7, 7]])
[1, 2, 3, 4, 5, 6, 7, 7]

malaya_speech.pipeline.zip#

class malaya_speech.pipeline.zip[source]#

Combine 2 branches into 1 branch.

Examples

>>> source = Pipeline()
>>> left = source.map(lambda x: x + 1, name = 'left')
>>> right = source.map(lambda x: x + 10, name = 'right')
>>> left.zip(right).map(sum).map(print)
>>> source.emit(2)
15
pack_literals(tup)[source]#

Fill buffers for literals whenever we empty them

malaya_speech.streaming#

malaya_speech.streaming.record(vad, asr_model=None, classification_model=None, device=None, input_rate=16000, sample_rate=16000, blocks_per_second=50, padding_ms=300, ratio=0.75, min_length=0.1, filename=None, spinner=False)[source]#

Record audio using the pyaudio library. This recording interface requires a VAD model.

Parameters
  • vad (object) – vad model / pipeline.

  • asr_model (object) – ASR model / pipeline, will transcribe each subsamples realtime.

  • classification_model (object) – classification pipeline, will classify each subsamples realtime.

  • device (None) – device parameter for pyaudio, check available devices from sounddevice.query_devices().

  • input_rate (int, optional (default = 16000)) – sample rate from input device, this will auto resampling.

  • sample_rate (int, optional (default = 16000)) – output sample rate.

  • blocks_per_second (int, optional (default = 50)) – size of frame returned from pyaudio, frame size = sample rate / (blocks_per_second / 2). 50 is good for WebRTC, 30 or less is good for Malaya Speech VAD.

  • padding_ms (int, optional (default = 300)) – size of queue to store frames, size = padding_ms // (1000 * blocks_per_second // sample_rate)

  • ratio (float, optional (default = 0.75)) – if 75% of the queue is positive, it is assumed to be voice activity.

  • min_length (float, optional (default=0.1)) – minimum length (s) to accept a subsample.

  • filename (str, optional (default=None)) – if None, will auto generate name based on timestamp.

  • spinner (bool, optional (default=False)) – if True, will use spinner object from halo library.

Returns

result

Return type

[filename, samples]
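
Example (a minimal sketch; it assumes vad is an already-loaded VAD model or pipeline and that a microphone device is available):

>>> import malaya_speech
>>> filename, samples = malaya_speech.streaming.record(vad, min_length=0.1, spinner=True)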

malaya_speech.utils.aligner#

class malaya_speech.utils.aligner.Point(token_index, time_index, score)[source]#
class malaya_speech.utils.aligner.Segment(label, start, end, score)[source]#
malaya_speech.utils.aligner.put_comma(alignment, min_threshold=0.5)[source]#

Put comma in alignment from force alignment model.

Parameters
  • alignment (List[Dict[text, start, end]]) –

  • min_threshold (float, optional (default=0.5)) – minimum threshold in seconds to insert a comma.

Returns

result

Return type

List[str]

malaya_speech.utils.aligner.plot_alignments(alignment, subs_alignment, words_alignment, waveform, separator=' ', sample_rate=16000, figsize=(16, 9), plot_score_char=False, plot_score_word=True)[source]#

plot alignment.

Parameters
  • alignment (np.array) – usually alignment output.

  • subs_alignment (list) – usually chars_alignment or subwords_alignment output.

  • words_alignment (list) – usually words_alignment output.

  • waveform (np.array) – input audio.

  • separator (str, optional (default=' ')) – separator between words, only useful if subs_alignment is character based.

  • sample_rate (int, optional (default=16000)) –

  • figsize (tuple, optional (default=(16, 9))) – figure size for matplotlib figsize.

  • plot_score_char (bool, optional (default=False)) – plot score on top of character plots.

  • plot_score_word (bool, optional (default=True)) – plot score on top of word plots.

malaya_speech.utils.astype#

malaya_speech.utils.astype.to_ndarray(array)[source]#

Change list / tuple / bytes into np.array

Parameters

array (list / tuple / bytes) –

Returns

result

Return type

np.array

malaya_speech.utils.astype.to_byte(array)[source]#

Change list / tuple / np.array into bytes

Parameters

array (list / tuple / np.array) –

Returns

result

Return type

bytes

malaya_speech.utils.astype.float_to_int(array, type=<class 'numpy.int16'>)[source]#

Change np.array float32 / float64 into np.int16

Parameters
  • array (np.array) –

  • type (np.int16) –

Returns

result

Return type

np.array

malaya_speech.utils.astype.int_to_float(array, type=<class 'numpy.float32'>)[source]#

Change np.array int16 into np.float32

Parameters
  • array (np.array) –

  • type (np.float32) –

Returns

result

Return type

np.array

malaya_speech.utils.char#

malaya_speech.utils.char.strip_ids(ids, ids_to_strip)[source]#

Strip ids_to_strip from the end ids.

malaya_speech.utils.char.generate_vocab(strings)[source]#

Generate character vocab sorted based on frequency.

Parameters

strings (List[str]) –

Returns

result

Return type

List[str]

malaya_speech.utils.char.encode(string, add_eos=True, add_blank=False, lookup=None)[source]#

Encode string to integer representation based on ascii table or lookup variable.

Parameters
  • string (str) –

  • add_eos (bool, optional (default=True)) – add EOS token at the end of encoded.

  • add_blank (bool, optional (default=False)) – add a BLANK token at the start of the encoded sequence; this is for transducer / transformer based models.

  • lookup (List[str], optional (default=None)) – list of unique strings.

Returns

result

Return type

List[int]

malaya_speech.utils.char.decode(ids, lookup=None)[source]#

Decode integer representation to string based on ascii table or lookup variable.

Parameters
  • ids (List[int]) –

  • lookup (List[str], optional (default=None)) – list of unique strings.

Returns

result

Return type

str

malaya_speech.utils.combine#

malaya_speech.utils.combine.without_silent(frames, threshold_to_stop=0.1, silent_trail=500)[source]#

Group multiple frames based on label and threshold to stop.

Parameters
  • frames (List[Tuple[Frame, label]]) – Output from VAD.

  • threshold_to_stop (float, optional (default = 0.1)) – If threshold_to_stop is 0.1, samples with the same label must be at least 0.1 second long.

  • silent_trail (int, optional (default = 500)) – if silence is detected, will append the first N frames and the last N frames.

Returns

result

Return type

np.array

malaya_speech.utils.featurization#

malaya_speech.utils.featurization.normalize_signal(signal, gain=None)[source]#

Normalize float32 signal to [-1, 1] range

malaya_speech.utils.featurization.extract_pitch(y, hop_size=256, sr=22050, bad_f0=5.0, zero_value=- 10.0)[source]#

Originally from https://github.com/yl4579/PitchExtractor/blob/main/meldataset.py

malaya_speech.utils.generator#

malaya_speech.utils.generator.frames(audio, frame_duration_ms=30, sample_rate=16000, append_ending_trail=True)[source]#

Generates audio frames from audio. Takes the desired frame duration in milliseconds, the audio, and the sample rate.

Parameters
  • audio (np.array) –

  • frame_duration_ms (int, optional (default=30)) –

  • sample_rate (int, optional (default=16000)) –

  • append_ending_trail (bool, optional (default=True)) – if True, will append the last trailing frame, which might not be the same length as frame_duration_ms.

Returns

result

Return type

List[malaya_speech.model.frame.Frame]
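
Example (a minimal sketch; 'speech.wav' is a hypothetical file):

>>> from malaya_speech.utils.read import load
>>> from malaya_speech.utils.generator import frames
>>> y, sr = load('speech.wav', sr=16000)
>>> chunks = frames(y, frame_duration_ms=30, sample_rate=sr)
>>> len(chunks)  # List[malaya_speech.model.frame.Frame]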

malaya_speech.utils.generator.mel_sampling(audio, frame_duration_ms=1200, overlap_ms=200, sample_rate=16000)[source]#

Generates audio frames from audio. This is for melspectrogram generative model. Takes the desired frame duration in milliseconds, the audio, and the sample rate.

Parameters
  • audio (np.array) –

  • frame_duration_ms (int, optional (default=1200)) –

  • overlap_ms (int, optional (default=200)) –

  • sample_rate (int, optional (default=16000)) –

Returns

result

Return type

List[np.array]

malaya_speech.utils.generator.combine_mel_sampling(samples, overlap_ms=200, sample_rate=16000, padded_ms=50)[source]#

To combine results from mel_sampling, output from melspectrogram generative model.

Parameters
  • samples (List[np.array]) –

  • overlap_ms (int, optional (default=200)) –

  • sample_rate (int, optional (default=16000)) –

Returns

result

Return type

List[np.array]

malaya_speech.utils.griffin_lim#

malaya_speech.utils.griffin_lim.from_mel(mel_, sr=16000, n_fft=2048, n_iter=32, win_length=1000, hop_length=100)[source]#

Change melspectrogram into waveform using Librosa.

Parameters

mel_ (np.array) –

Returns

result

Return type

np.array

malaya_speech.utils.griffin_lim.from_mel_vocoder(mel, sr=22050, n_fft=1024, n_mels=256, fmin=80, fmax=7600, n_iter=32, win_length=None, hop_length=256)[source]#

Change melspectrogram into waveform using Librosa.

Parameters

mel (np.array) –

Returns

result

Return type

np.array

malaya_speech.utils.group#

malaya_speech.utils.group.combine_frames(frames)[source]#

Combine multiple frames into one frame.

Parameters

frames (List[Frame]) –

Returns

result

Return type

Frame

malaya_speech.utils.group.group_frames(frames)[source]#

Group multiple frames based on label.

Parameters

frames (List[Tuple[Frame, label]]) –

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.utils.group.group_frames_threshold(frames, threshold_to_stop=0.3)[source]#

Group multiple frames based on label and threshold to stop.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • threshold_to_stop (float, optional (default = 0.3)) – If threshold_to_stop is 0.3, samples with the same label must be at least 0.3 second long.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.utils.padding#

malaya_speech.utils.padding.sequence_1d(seq, maxlen=None, padding='post', pad_int=0, return_len=False)[source]#

Pad a sequence of 1D arrays into a 2D array.

Parameters
  • seq (List[List[int]]) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If 'pre', pad at the start; otherwise pad at the end.

  • pad_int (int, optional (default=0)) – padding value.

Returns

result

Return type

np.array
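
Example (a minimal sketch with toy integer sequences):

>>> from malaya_speech.utils.padding import sequence_1d
>>> padded = sequence_1d([[1, 2, 3], [4, 5]], padding='post', pad_int=0)
>>> padded.shape  # (2, 3), the shorter row padded with 0 at the end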

malaya_speech.utils.padding.sequence_nd(seq, maxlen=None, padding='post', pad_val=0.0, dim=1, return_len=False)[source]#

Pad a sequence of N-dimensional arrays into an (N+1)-dimensional array.

Parameters
  • seq (list of nd array) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If 'pre', pad at the start; otherwise pad at the end.

  • pad_val (float, optional (default=0.0)) – padding value.

  • dim (int, optional (default=1)) –

Returns

result

Return type

np.array

malaya_speech.utils.padding.tf_sequence_nd(seq, maxlen=None, padding='post', pad_val=0.0, dim=1, return_len=False)[source]#

Pad a sequence of N-dimensional arrays into an (N+1)-dimensional array.

Parameters
  • seq (list of nd array) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If 'pre', pad at the start; otherwise pad at the end.

  • pad_val (float, optional (default=0.0)) – padding value.

  • dim (int, optional (default=1)) –

Returns

result

Return type

np.array

malaya_speech.utils.read#

malaya_speech.utils.read.resample(data, old_samplerate, new_samplerate)[source]#

Resample signal.

Parameters
  • data (np.array) –

  • old_samplerate (int) – old sample rate.

  • new_samplerate (int) – new sample rate.

Returns

result

Return type

data

malaya_speech.utils.read.load(file, sr=16000, scale=True)[source]#

Read sound file, any format supported by soundfile.read

Parameters
  • file (str) –

  • sr (int, (default=16000)) – new sample rate. If input sample rate is not same, will resample automatically.

  • scale (bool, (default=True)) – Scale to -1 and 1.

Returns

result

Return type

(y, sr)
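
Example (a minimal sketch; 'speech.wav' is a hypothetical file):

>>> from malaya_speech.utils.read import load, resample
>>> y, sr = load('speech.wav', sr=16000)
>>> y_8k = resample(y, old_samplerate=sr, new_samplerate=8000)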

malaya_speech.utils.split#

malaya_speech.utils.split.split_vad(frames, n=3, negative_threshold=0.1)[source]#

Split a sample into multiple samples based on n negative VAD frames.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • n (int, optional (default=3)) – number of negative VAD frames to allow in one subsample.

  • negative_threshold (float, optional (default = 0.1)) – If negative_threshold is 0.1, negative samples must be at least 0.1 second long.

Returns

result

Return type

List[Frame]

malaya_speech.utils.split.split_vad_duration(frames, max_duration=5.0, negative_threshold=0.1)[source]#

Split a sample into multiple samples based on the maximum duration of voice activities.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • max_duration (float, optional (default = 5.0)) – Maximum duration to assume one sample combined from voice activities.

  • negative_threshold (float, optional (default = 0.1)) – If negative_threshold is 0.1, negative samples must be at least 0.1 second long.

Returns

result

Return type

List[Frame]

malaya_speech.utils.subword#

malaya_speech.utils.subword.generate_tokenizer(strings, target_vocab_size=1024, max_subword_length=4, max_corpus_chars=None, reserved_tokens=None)[source]#

Build a subword dictionary.

malaya_speech.utils.subword.save(tokenizer, path)[source]#

Save subword dictionary to a text file.

malaya_speech.utils.subword.load(path)[source]#

Load text file into subword dictionary.

malaya_speech.utils.subword.encode(tokenizer, string, add_blank=False)[source]#

Encode string to integer representation using the tokenizer vocab.

Parameters
  • tokenizer (object) – tokenizer object

  • string (str) –

  • add_blank (bool, optional (default=False)) – add a BLANK token at the start of the encoded sequence; this is for transducer / transformer based models.

Returns

result

Return type

List[int]

malaya_speech.utils.subword.decode(tokenizer, ids)[source]#

Decode integer representation to string based on tokenizer vocab.

Parameters
  • tokenizer (object) – tokenizer object

  • ids (List[int]) –

Returns

result

Return type

str

malaya_speech.utils.subword.decode_multilanguage(tokenizers, ids)[source]#

Decode integer representation to string using list of tokenizer objects.

Parameters
  • tokenizers (List[object]) – List of tokenizer objects.

  • ids (List[int]) –

Returns

result

Return type

str

malaya_speech.utils.tf_featurization#

malaya_speech.age_detection#

malaya_speech.age_detection.available_model()[source]#

List available age detection deep models.

malaya_speech.age_detection.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load age detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.age_detection.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.diarization#

malaya_speech.diarization.speaker_similarity(vad_results, speaker_vector, similarity_threshold=0.8, norm_function=None, return_embedding=False)[source]#

Speaker diarization using L2-Norm similarity.

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_vector (callable) – speaker vector object.

  • similarity_threshold (float, optional (default=0.8)) – if the current voice activity sample is at least 80% similar, it is assumed to come from the same speaker.

  • norm_function (Callable, optional(default=None)) – normalize function for speaker vectors.

  • speaker_change_threshold (float, optional (default=0.5)) – one voice activity sample can contain more than one speaker; split it using this threshold.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.diarization.clustering(vad_results, speaker_vector, model, norm_function=<function l2_normalize>, log_distance_metric=None, return_embedding=False)[source]#

Speaker diarization using any clustering model.

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_vector (callable) – speaker vector object.

  • model (callable) – Any unsupervised clustering model. Required fit_predict or apply or predict method.

  • norm_function (Callable, optional(default=malaya_speech.utils.dist.l2_normalize)) – normalize function for speaker vectors.

  • log_distance_metric (str, optional (default=None)) – post distance norm in log scale metrics; this parameter is necessary for models that require a square array input. Common values are 'cosine' or 'angular'.

Returns

result

Return type

List[Tuple[Frame, label]]
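
Example (a minimal sketch; it assumes vad_results is VAD output in the documented List[Tuple[Frame, label]] format):

>>> import malaya_speech
>>> from malaya_speech.model.clustering import AgglomerativeClustering
>>> speaker_vector = malaya_speech.speaker_vector.deep_model(model='vggvox-v2')
>>> clustering = AgglomerativeClustering(min_clusters=2, max_clusters=10)
>>> result = malaya_speech.diarization.clustering(vad_results, speaker_vector, clustering)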

malaya_speech.emotion#

malaya_speech.emotion.available_model()[source]#

List available emotion detection deep models.

malaya_speech.emotion.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load emotion detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.emotion.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.force_alignment#

malaya_speech.force_alignment.available_transducer()[source]#

List available Encoder-Transducer Aligner models.

malaya_speech.force_alignment.available_ctc()[source]#

List available Encoder-CTC Aligner models.

malaya_speech.force_alignment.available_huggingface()[source]#

List available HuggingFace Malaya-Speech Aligner models.

malaya_speech.force_alignment.deep_transducer(model='conformer-transducer', quantized=False, **kwargs)[source]#

Load Encoder-Transducer Aligner model.

Parameters
  • model (str, optional (default='conformer-transducer')) – Check available models at malaya_speech.force_alignment.available_transducer().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.transducer.TransducerAligner class

malaya_speech.force_alignment.deep_ctc(model='hubert-conformer', quantized=False, **kwargs)[source]#

Load Encoder-CTC Aligner model.

Parameters
  • model (str, optional (default='hubert-conformer')) – Check available models at malaya_speech.force_alignment.available_ctc().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.wav2vec.Wav2Vec2_Aligner class

malaya_speech.force_alignment.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed')[source]#

Load Finetuned models from HuggingFace.

Parameters

model (str, optional (default='mesolitica/wav2vec2-xls-r-300m-mixed')) – Check available models at malaya_speech.force_alignment.available_huggingface().

Returns

result

Return type

malaya_speech.model.huggingface.CTC class

malaya_speech.gender#

malaya_speech.gender.available_model()[source]#

List available gender detection deep models.

malaya_speech.gender.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load gender detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.gender.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.language_detection#

malaya_speech.language_detection.available_model()[source]#

List available language detection deep models.

malaya_speech.language_detection.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load language detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.language_detection.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.language_model#

malaya_speech.language_model.available_kenlm()[source]#

List available KenLM Language Model.

malaya_speech.language_model.available_gpt2()[source]#

List available GPT2 Language Model.

malaya_speech.language_model.available_mlm()[source]#

List available MLM Language Model.

malaya_speech.language_model.kenlm(model='dump-combined', **kwargs)[source]#

Load KenLM language model.

Parameters

model (str, optional (default='dump-combined')) – Check available models at malaya_speech.language_model.available_kenlm().

Returns

result

Return type

str

malaya_speech.language_model.gpt2(model='mesolitica/gpt2-117m-bahasa-cased', force_check=True, **kwargs)[source]#

Load GPT2 language model.

Parameters
  • model (str, optional (default='mesolitica/gpt2-117m-bahasa-cased')) – Check available models at malaya_speech.language_model.available_gpt2().

  • force_check (bool, optional (default=True)) – Force check model one of malaya model. Set to False if you have your own huggingface model.

Returns

result

Return type

malaya.torch_model.gpt2_lm.LM class

malaya_speech.language_model.mlm(model='mesolitica/bert-base-standard-bahasa-cased', force_check=True, **kwargs)[source]#

Load Masked language model.

Parameters
  • model (str, optional (default='mesolitica/bert-base-standard-bahasa-cased')) – Check available models at malaya_speech.language_model.available_mlm().

  • force_check (bool, optional (default=True)) – Force check model one of malaya model. Set to False if you have your own huggingface model.

Returns

result

Return type

malaya_speech.torch_model.mask_lm.LM class

malaya_speech.multispeaker_separation#

malaya_speech.multispeaker_separation.available_deep_wav()[source]#

List available FastSep models trained on raw 8k wav.

malaya_speech.multispeaker_separation.deep_wav(model='fastsep-4', quantized=False, **kwargs)[source]#

Load FastSep model, trained on raw 8k wav using SISNR PIT loss.

Parameters
  • model (str, optional (default='fastsep-4')) – Check available models at malaya_speech.multispeaker_separation.available_deep_wav().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.tf.Split class

malaya_speech.noise_reduction#

malaya_speech.noise_reduction.available_model()[source]#

List available Noise Reduction deep learning models.

malaya_speech.noise_reduction.deep_model(model='resnet-unet', quantized=False, **kwargs)[source]#

Load Noise Reduction deep learning model.

Parameters
  • model (str, optional (default='resnet-unet')) – Check available models at malaya_speech.noise_reduction.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.tf.UNET_STFT class

malaya_speech.speaker_change#

malaya_speech.speaker_change.available_model()[source]#

List available speaker change deep models.

malaya_speech.speaker_change.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load speaker change deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_change.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_change.split_activities(vad_results, speaker_change_results, speaker_change_threshold=0.5, sr=16000, ignore_not_activity=True)[source]#

Split VAD results based on speaker change threshold; worst-case O(N^2).

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_change_results (List[Tuple[Frame, float]], optional (default=None)) – results from speaker change module, must be float results.

  • speaker_change_threshold (float, optional (default=0.5)) – one voice activity sample can contain more than one speaker; split it using this threshold.

  • sr (int, optional (default=16000)) – sample rate; classification models in malaya-speech use 16k.

  • ignore_not_activity (bool, optional (default=True)) – If True, will ignore frames where the VAD result is False; otherwise will try to split them.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.speaker_overlap#

malaya_speech.speaker_overlap.available_model()[source]#

List available speaker overlap deep models.

malaya_speech.speaker_overlap.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load speaker overlap deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_overlap.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_vector#

malaya_speech.speaker_vector.available_model()[source]#

List available speaker vector deep models.

malaya_speech.speaker_vector.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load Speaker2Vec model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_vector.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speech_enhancement#

malaya_speech.speech_enhancement.available_deep_masking()[source]#

List available Speech Enhancement STFT masking deep learning model.

malaya_speech.speech_enhancement.available_deep_enhance()[source]#

List available Speech Enhancement UNET Waveform sampling deep learning model.

malaya_speech.speech_enhancement.deep_masking(model='resnet-unet', quantized=False, **kwargs)[source]#

Load Speech Enhancement STFT UNET masking deep learning model.

Parameters
  • model (str, optional (default='resnet-unet')) – Check available models at malaya_speech.speech_enhancement.available_deep_masking().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.unet.UNETSTFT class

malaya_speech.speech_enhancement.deep_enhance(model='unet', quantized=False, **kwargs)[source]#

Load Speech Enhancement UNET Waveform sampling deep learning model.

Parameters
  • model (str, optional (default='unet')) – Check available models at malaya_speech.speech_enhancement.available_deep_enhance().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.unet.UNET1D class

malaya_speech.speechsplit_conversion#

malaya_speech.speechsplit_conversion.available_deep_conversion(f0_mode='pysptk')[source]#

List available Voice Conversion models.

Parameters

f0_mode (str, optional (default='pysptk')) –

F0 conversion supported. Allowed values:

malaya_speech.speechsplit_conversion.deep_conversion(model='fastspeechsplit-v2-vggvox-v2', f0_mode='pysptk', quantized=False, **kwargs)[source]#

Load Voice Conversion model.

Parameters
  • model (str, optional (default='fastspeechsplit-v2-vggvox-v2')) – Check available models at malaya_speech.speechsplit_conversion.available_deep_conversion(f0_mode = ‘{f0_mode}’)

  • f0_mode (str, optional (default='pysptk')) –

    F0 conversion supported. Allowed values:

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.splitter.FastSpeechSplit class

malaya_speech.stack#

malaya_speech.stack.classification_stack(models)[source]#

Stacking for classification models. All models should be from the same classification domain.

Parameters

models (List[Callable]) – list of models.

Returns

result

Return type

malaya_speech.stack.Stack class

malaya_speech.model.stack.Stack#

class malaya_speech.stack.Stack[source]#
predict_proba(inputs, aggregate=<function gmean>)[source]#

Stacking for predictive models, will return probability.

Parameters
  • inputs (List[np.array]) –

  • aggregate (Callable, optional (default=scipy.stats.mstats.gmean)) – aggregate function.

Returns

result

Return type

np.array

predict(inputs, aggregate=<function gmean>)[source]#

Stacking for predictive models, will return labels.

Parameters
  • inputs (List[np.array]) –

  • aggregate (Callable, optional (default=scipy.stats.mstats.gmean)) – aggregate function.

Returns

result

Return type

List[str]
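
Example (a minimal sketch; it assumes model_a and model_b are two loaded classification models from the same domain and y is a loaded audio signal):

>>> import malaya_speech
>>> stacked = malaya_speech.stack.classification_stack([model_a, model_b])
>>> stacked.predict_proba([y])  # aggregated probabilities
>>> stacked.predict([y])        # aggregated labels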

malaya_speech.stt#

malaya_speech.stt.available_ctc()[source]#

List available Encoder-CTC ASR models.

malaya_speech.stt.available_transducer()[source]#

List available Encoder-Transducer ASR models.

malaya_speech.stt.available_huggingface()[source]#

List available HuggingFace Malaya-Speech ASR models.

malaya_speech.stt.deep_ctc(model='hubert-conformer', quantized=False, **kwargs)[source]#

Load Encoder-CTC ASR model.

Parameters
  • model (str, optional (default='hubert-conformer')) – Check available models at malaya_speech.stt.available_ctc().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.wav2vec.Wav2Vec2_CTC class

malaya_speech.stt.deep_transducer(model='conformer', quantized=False, **kwargs)[source]#

Load Encoder-Transducer ASR model.

Parameters
  • model (str, optional (default='conformer')) – Check available models at malaya_speech.stt.available_transducer().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.transducer.Transducer class

malaya_speech.stt.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed', **kwargs)[source]#

Load finetuned models from HuggingFace. Requires Tensorflow >= 2.0.

Parameters

model (str, optional (default='mesolitica/wav2vec2-xls-r-300m-mixed')) – Check available models at malaya_speech.stt.available_huggingface().

Returns

result

Return type

malaya_speech.model.huggingface.CTC class

malaya_speech.super_resolution#

malaya_speech.super_resolution.available_unet()[source]#

List available Super Resolution 4x deep learning UNET models.

malaya_speech.super_resolution.available_tfgan()[source]#

List available Super Resolution deep learning UNET + TFGAN Vocoder models.

malaya_speech.super_resolution.available_audio_diffusion()[source]#

List available Super Resolution audio diffusion models.

malaya_speech.super_resolution.unet(model='srgan-256', quantized=False, **kwargs)[source]#

Load Super Resolution 4x deep learning UNET model.

Parameters
  • model (str, optional (default='srgan-256')) – Check available models at malaya_speech.super_resolution.available_unet().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.tf.UNET1D class

malaya_speech.super_resolution.tfgan(model='voicefixer', **kwargs)[source]#

Load TFGAN based Speech Resolution.

Parameters

model (str, optional (default='voicefixer')) – Check available models at malaya_speech.super_resolution.available_tfgan().

Returns

result

Return type

malaya_speech.torch_model.super_resolution.VoiceFixer

malaya_speech.super_resolution.audio_diffusion(model='nuwave2', **kwargs)[source]#

Load audio diffusion based Speech Resolution.

Parameters

model (str, optional (default='nuwave2')) – Check available models at malaya_speech.super_resolution.available_audio_diffusion().

Returns

result

Return type

malaya_speech.torch_model.super_resolution.NuWave2
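
A sketch for the diffusion-based loader, under the same assumptions as the UNET example (placeholder path, assumed predict entry point).

```python
import malaya_speech

model = malaya_speech.super_resolution.audio_diffusion(model='nuwave2')
y, sr = malaya_speech.load('speech/low-rate.wav')  # placeholder path

# assumed inference entry point; diffusion sampling is iterative, so expect
# noticeably slower inference than the UNET models, especially on CPU.
upsampled = model.predict(y)
```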

malaya_speech.tts#

malaya_speech.tts.available_tacotron2()[source]#

List available Tacotron2, Text to Mel models.

malaya_speech.tts.available_fastspeech2()[source]#

List available FastSpeech2, Text to Mel models.

malaya_speech.tts.available_fastpitch()[source]#

List available FastPitch, Text to Mel models.

malaya_speech.tts.available_glowtts()[source]#

List available GlowTTS, Text to Mel models.

malaya_speech.tts.available_lightspeech()[source]#

List available LightSpeech, Text to Mel models.

malaya_speech.tts.available_vits()[source]#

List available VITS, End-to-End models.

malaya_speech.tts.available_e2e_fastspeech2()[source]#

List available FastSpeech2, End-to-End models.

malaya_speech.tts.load_text_ids(pad_to=8, understand_punct=True, is_lower=True, **kwargs)[source]#

Load the text normalizer module used by Malaya-Speech TTS.

malaya_speech.tts.tacotron2(model='yasmin', quantized=False, pad_to=8, **kwargs)[source]#

Load Tacotron2 Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='yasmin')) – Check available models at malaya_speech.tts.available_tacotron2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Tacotron class
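
A usage sketch for a Text-to-Mel loader. The predict call and the mel key name are assumptions about how the synthesis objects expose their outputs; inspect the returned dict on your installed version.

```python
import malaya_speech

model = malaya_speech.tts.tacotron2(model='yasmin')

# assumed entry point: Text-to-Mel models take a string and return a dict of
# outputs; the exact keys vary by model, so inspect r.keys() on your version.
r = model.predict('nama saya husein bin zolkepli')

# an assumed mel key; the mel spectrogram is then fed to a vocoder
# (see malaya_speech.vocoder below).
mel = r['mel-output']
```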

malaya_speech.tts.fastspeech2(model='osman', quantized=False, pad_to=8, **kwargs)[source]#

Load Fastspeech2 Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='osman')) – Check available models at malaya_speech.tts.available_fastspeech2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastspeech class

malaya_speech.tts.fastpitch(model='male', quantized=False, pad_to=8, **kwargs)[source]#

Load FastPitch Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='male')) – Check available models at malaya_speech.tts.available_fastpitch().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastpitch class

malaya_speech.tts.glowtts(model='yasmin', quantized=False, pad_to=2, **kwargs)[source]#

Load GlowTTS Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='yasmin')) – Check available models at malaya_speech.tts.available_glowtts().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=2)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 2.

Returns

result

Return type

malaya_speech.model.synthesis.GlowTTS class

malaya_speech.tts.lightspeech(model='male', quantized=False, pad_to=8, **kwargs)[source]#

Load LightSpeech Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='male')) – Check available models at malaya_speech.tts.available_lightspeech().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastspeech class

malaya_speech.tts.vits(model='mesolitica/VITS-osman', **kwargs)[source]#

Load VITS End-to-End TTS model.

Parameters

model (str, optional (default='mesolitica/VITS-osman')) – Check available models at malaya_speech.tts.available_vits().

Returns

result

Return type

malaya_speech.torch_model.synthesis.VITS class
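
A sketch for the end-to-end VITS loader. The predict call and the 'y' waveform key are assumptions; the point of the example is only that no separate vocoder step is expected for end-to-end models.

```python
import malaya_speech

model = malaya_speech.tts.vits(model='mesolitica/VITS-osman')

# assumed entry point: because VITS is end-to-end, the returned dict is
# expected to already contain a waveform; the key name 'y' is an assumption.
r = model.predict('selamat pagi, apa khabar?')
audio = r['y']
```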

malaya_speech.tts.e2e_fastspeech2(model='osman', quantized=False, pad_to=8, **kwargs)[source]#

Load FastSpeech2 End-to-End TTS model.

Parameters
  • model (str, optional (default='osman')) – Check available models at malaya_speech.tts.available_e2e_fastspeech2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.E2E_FastSpeech class

malaya_speech.vad#

malaya_speech.vad.available_model()[source]#

List available VAD deep models.

malaya_speech.vad.webrtc(aggressiveness=3, sample_rate=16000, minimum_amplitude=100)[source]#

Load WebRTC VAD model.

Parameters
  • aggressiveness (int, optional (default=3)) – an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.

  • sample_rate (int, optional (default=16000)) – sample rate of the input samples.

  • minimum_amplitude (int, optional (default=100)) – a sample must have abs(amplitude) above minimum_amplitude to be considered voice activity; otherwise it is automatically labelled False.

Returns

result

Return type

malaya_speech.model.webrtc.WebRTC class
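
A usage sketch for the WebRTC VAD. The audio path, the frame generator call, and the per-frame boolean call are assumptions about how the pieces fit together; the frame length and any dtype conversion should be checked against your version.

```python
import malaya_speech

vad = malaya_speech.vad.webrtc(aggressiveness=3, sample_rate=16000)
y, sr = malaya_speech.load('speech/example.wav')  # placeholder path

# assumed frame generator: split the waveform into 30 ms frames, then call the
# WebRTC object per frame to get a boolean. WebRTC VAD expects 16-bit PCM, so
# a dtype conversion may be required depending on how the audio was loaded.
frames = malaya_speech.generator.frames(y, 30, sr)
preds = [(frame, vad(frame)) for frame in frames]

# the (Frame, bool) pairs can be visualised with
# malaya_speech.extra.visualization.visualize_vad(y, preds, sample_rate=sr).
```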

malaya_speech.vad.deep_model(model='marblenet-factor1', quantized=False, **kwargs)[source]#

Load VAD model.

Parameters
  • model (str, optional (default='marblenet-factor1')) – Check available models at malaya_speech.vad.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function
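
A sketch for the deep VAD loader, under the same assumptions as the WebRTC example above (placeholder path, assumed frame generator, and an assumed batch predict following the classification interface documented earlier).

```python
import malaya_speech

model = malaya_speech.vad.deep_model(model='marblenet-factor1')
y, sr = malaya_speech.load('speech/example.wav')  # placeholder path

# assumed: frames are generated the same way as in the WebRTC sketch, and the
# deep model predicts a label per frame over the whole batch.
frames = malaya_speech.generator.frames(y, 30, sr)
preds = list(zip(frames, model.predict(frames)))
```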

malaya_speech.vocoder#

malaya_speech.vocoder.available_melgan()[source]#

List available MelGAN Mel-to-Speech models.

malaya_speech.vocoder.available_mbmelgan()[source]#

List available Multiband MelGAN Mel-to-Speech models.

malaya_speech.vocoder.available_hifigan()[source]#

List available HiFiGAN Mel-to-Speech models.

malaya_speech.vocoder.melgan(model='universal-1024', quantized=False, **kwargs)[source]#

Load MelGAN Vocoder model.

Parameters
  • model (str, optional (default='universal-1024')) – Check available models at malaya_speech.vocoder.available_melgan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class

malaya_speech.vocoder.mbmelgan(model='female', quantized=False, **kwargs)[source]#

Load Multiband MelGAN Vocoder model.

Parameters
  • model (str, optional (default='female')) – Check available models at malaya_speech.vocoder.available_mbmelgan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class

malaya_speech.vocoder.hifigan(model='universal-768', quantized=False, **kwargs)[source]#

Load HiFiGAN Vocoder model.

Parameters
  • model (str, optional (default='universal-768')) – Check available models at malaya_speech.vocoder.available_hifigan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class
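
A sketch of the usual Text-to-Mel plus vocoder pipeline. The TTS predict call, the mel key name, and the vocoder predict call are assumptions about how the two objects are wired together; verify the dict keys returned by your TTS model.

```python
import malaya_speech

tts = malaya_speech.tts.fastspeech2(model='osman')
vocoder = malaya_speech.vocoder.hifigan(model='universal-768')

# assumed pipeline: the TTS predict output carries a mel spectrogram (the key
# name below is an assumption), and the Vocoder object converts a batch of
# mels into waveforms via predict.
r = tts.predict('selamat pagi')
audio = vocoder.predict([r['universal-output']])[0]
```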

malaya_speech.voice_conversion#

malaya_speech.voice_conversion.available_deep_conversion()[source]#

List available Voice Conversion models.

malaya_speech.voice_conversion.deep_conversion(model='fastvc-32-vggvox-v2', quantized=False, **kwargs)[source]#

Load Voice Conversion model.

Parameters
  • model (str, optional (default='fastvc-32-vggvox-v2')) – Check available models at malaya_speech.voice_conversion.available_deep_conversion().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.FastVC class
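
A usage sketch for voice conversion. The audio paths are placeholders and the predict(original, target) call is an assumption about the FastVC interface; the converted output is expected to be a mel that still needs a vocoder.

```python
import malaya_speech

model = malaya_speech.voice_conversion.deep_conversion(model='fastvc-32-vggvox-v2')

# placeholder paths: `original` carries the speech content, `target` carries
# the desired speaker identity.
original, sr = malaya_speech.load('speech/source-speaker.wav')
target, _ = malaya_speech.load('speech/target-speaker.wav')

# assumed entry point: FastVC returns a dict whose mel output would then be
# passed to a vocoder (see malaya_speech.vocoder above).
r = model.predict(original, target)
```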