API#

malaya_speech#

malaya_speech.augmentation.spectrogram#

malaya_speech.augmentation.spectrogram.mask_frequency(features, n_freq_mask=2, width_freq_mask=8, random_band=True)[source]#

Mask frequency.

Parameters
  • features (np.array) –

  • n_freq_mask (int, optional (default=2)) – loop size for masking.

  • width_freq_mask (int, optional (default=8)) – masking size.

Returns

result

Return type

np.array

malaya_speech.augmentation.spectrogram.mask_time(features, n_time_mask=2, width_time_mask=8, random_band=True)[source]#

Mask time.

Parameters
  • features (np.array) –

  • n_time_mask (int, optional (default=2)) – loop size for masking.

  • width_time_mask (int, optional (default=8)) – masking size.

Returns

result

Return type

np.array
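
A minimal SpecAugment-style sketch combining both masks; the random [time, freq] array below is purely illustrative and stands in for a real mel spectrogram.

>>> import numpy as np
>>> from malaya_speech.augmentation.spectrogram import mask_frequency, mask_time
>>> features = np.random.rand(200, 80)  # hypothetical mel spectrogram
>>> augmented = mask_frequency(features, n_freq_mask=2, width_freq_mask=8)
>>> augmented = mask_time(augmented, n_time_mask=2, width_time_mask=8)
>>> augmented.shape  # masking zeroes out bands, the shape is unchanged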

malaya_speech.augmentation.spectrogram.tf_mask_frequency(features, n_freq_mask=2, F=27)[source]#

Mask frequency using Tensorflow.

Parameters
  • features (np.array) –

  • F (size of mask for frequency) –

malaya_speech.augmentation.spectrogram.tf_mask_time(features, n_time_mask=2, T=80)[source]#

Mask time using Tensorflow.

Parameters
  • features (np.array) –

  • T (size of mask for time) –

malaya_speech.extra.rttm#

malaya_speech.extra.rttm.load(file)[source]#

Load RTTM file.

Parameters

file (str) –

Returns

result

Return type

Dict[str, malaya_speech.model.annotation.Annotation]
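
A small usage sketch; the RTTM file path below is hypothetical.

>>> from malaya_speech.extra import rttm
>>> annotations = rttm.load('diarization.rttm')  # hypothetical RTTM file
>>> list(annotations.keys())  # one Annotation object per uri in the file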

malaya_speech.extra.visualization#

malaya_speech.extra.visualization.visualize_vad(signal, preds, sample_rate=16000, figsize=(15, 3), ax=None, **kwargs)[source]#

Visualize signal given VAD labels. Green indicates voice activity, red indicates no voice activity.

Parameters
  • signal (list / np.array) –

  • preds (List[Tuple[Frame, bool]]) –

  • sample_rate (int, optional (default=16000)) –

  • figsize (Tuple[int, int], optional (default=(15, 3))) – matplotlib figure size.

malaya_speech.extra.visualization.plot_classification(preds, description, ax=None, fontsize_text=14, x_text=0.05, y_text=0.2, ylim=(0.1, 0.9), figsize=(15, 3), **kwargs)[source]#

Visualize probability / boolean.

Parameters
  • preds (List[Tuple[Frame, label]]) –

  • description (str) –

  • ax (ax, optional (default = None)) –

  • fontsize_text (int, optional (default = 14)) –

  • x_text (float, optional (default = 0.05)) –

  • y_text (float, optional (default = 0.2)) –

malaya_speech.model.classification.Speakernet#

class malaya_speech.model.classification.Speakernet[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array
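
A vectorization sketch, assuming `model` is an already-loaded speaker-vector model that returns this class and speaker.wav is a hypothetical 16 kHz recording.

>>> from malaya_speech.utils.read import load
>>> y, sr = load('speaker.wav', sr=16000)  # hypothetical audio file
>>> embedding = model.vectorize([y])       # `model` assumed loaded beforehand
>>> embedding.shape                        # [B, D]; D depends on the model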

malaya_speech.model.classification.Speaker2Vec#

class malaya_speech.model.classification.Speaker2Vec[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

malaya_speech.model.classification.SpeakernetClassification#

class malaya_speech.model.classification.SpeakernetClassification[source]#
predict_proba(inputs)[source]#

Predict probabilities for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

predict(inputs)[source]#

Predict labels for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B].

Return type

List[str]

malaya_speech.model.classification.Classification#

class malaya_speech.model.classification.Classification[source]#
predict_proba(inputs)[source]#

Predict probabilities for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

predict(inputs)[source]#

Predict labels for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B].

Return type

List[str]

malaya_speech.model.clustering.AgglomerativeClustering#

class malaya_speech.model.clustering.AgglomerativeClustering(min_clusters, max_clusters, metric='cosine', threshold=0.25, method='centroid')[source]#
fit_predict(X)[source]#

Fit predict.

Parameters

X (np.array) – inputs with size of [batch_size, embedding size]

Returns

result

Return type

np.array
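
A minimal clustering sketch; the random embeddings stand in for real speaker vectors.

>>> import numpy as np
>>> from malaya_speech.model.clustering import AgglomerativeClustering
>>> X = np.random.rand(10, 512)  # hypothetical [batch_size, embedding size]
>>> clustering = AgglomerativeClustering(min_clusters=2, max_clusters=5)
>>> labels = clustering.fit_predict(X)  # one cluster label per row of X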

malaya_speech.model.clustering.HiddenMarkovModelClustering#

class malaya_speech.model.clustering.HiddenMarkovModelClustering(min_clusters, max_clusters, metric='cosine', covariance_type='diag', threshold=0.25, single_cluster_detection_quantile=0.05, single_cluster_detection_threshold=1.15)[source]#
fit_predict(X)[source]#

Fit predict.

Parameters

X (np.array) – inputs with size of [batch_size, embedding size]

Returns

result

Return type

np.array

malaya_speech.model.clustering.StreamingKMeansMaxCluster#

class malaya_speech.model.clustering.StreamingKMeansMaxCluster[source]#

malaya_speech.model.clustering.StreamingKMeans#

class malaya_speech.model.clustering.StreamingKMeans[source]#

malaya_speech.model.clustering.StreamingSpeakerSimilarity#

class malaya_speech.model.clustering.StreamingSpeakerSimilarity(similarity_threshold=0.8, agg_function=<function mean>)[source]#

malaya_speech.model.splitter.Split_Wav#

class malaya_speech.model.splitter.Split_Wav[source]#
predict(input)[source]#

Split an audio into 4 different speakers.

Parameters

input (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

np.array

malaya_speech.model.splitter.Split_Mel#

class malaya_speech.model.splitter.Split_Mel[source]#
predict(input)[source]#

Split an audio into 4 different speakers.

Parameters

input (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

np.array

malaya_speech.model.splitter.FastSpeechSplit#

class malaya_speech.model.splitter.FastSpeechSplit[source]#
predict(original_audio, target_audio, modes=['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'])[source]#

Change original voice audio to follow targeted voice.

Parameters
  • original_audio (np.array or malaya_speech.model.frame.Frame) –

  • target_audio (np.array or malaya_speech.model.frame.Frame) –

  • modes (List[str], optional (default = ['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'])) –

    R denotes rhythm, F denotes pitch target, U denotes speaker target (vector).

    • 'R' - maintain original_audio F and U on target_audio R.

    • 'F' - maintain original_audio R and U on target_audio F.

    • 'U' - maintain original_audio R and F on target_audio U.

    • 'RF' - maintain original_audio U on target_audio R and F.

    • 'RU' - maintain original_audio F on target_audio R and U.

    • 'FU' - maintain original_audio R on target_audio F and U.

    • 'RFU' - no conversion happened, just do encoder-decoder on target_audio

Returns

result

Return type

Dict[modes]

malaya_speech.model.synthesis.TTS#

class malaya_speech.model.synthesis.TTS[source]#
gradio(vocoder=None, **kwargs)[source]#

Text-to-Speech on Gradio interface.

Parameters
  • vocoder (Callable, optional (default=None)) – vocoder object that has predict method, prefer from malaya_speech itself. Not required if using End-to-End TTS model such as VITS.

  • **kwargs (keyword arguments for predict and iface.launch.) –

malaya_speech.model.synthesis.Vocoder#

class malaya_speech.model.synthesis.Vocoder[source]#
predict(inputs)[source]#

Change Mel to Waveform.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List

malaya_speech.model.synthesis.Tacotron#

class malaya_speech.model.synthesis.Tacotron[source]#
predict(string, **kwargs)[source]#

Change string to Mel.

Parameters

string (str) –

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output, alignment]

malaya_speech.model.synthesis.Fastspeech#

class malaya_speech.model.synthesis.Fastspeech[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this value increases the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this value increases the pitch frequency; a lower frequency produces a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this value increases the loudness.

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output]
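
A text-to-speech sketch, assuming `tts` is a loaded model returning this class and `vocoder` is a loaded malaya_speech vocoder (see malaya_speech.model.synthesis.Vocoder above); which output key to feed the vocoder depends on the vocoder used.

>>> r = tts.predict('sebarang teks melayu', speed_ratio=1.0)
>>> mel = r['universal-output']  # or r['mel-output'], depending on the vocoder
>>> y = vocoder.predict([mel])[0]  # waveform ready to save or play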

malaya_speech.model.synthesis.FastspeechSDP#

class malaya_speech.model.synthesis.FastspeechSDP[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, temperature_durator=0.6666, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this value increases the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this value increases the pitch frequency; a lower frequency produces a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this value increases the loudness.

  • temperature_durator (float, optional (default=0.6666)) – The duration predictor samples the alignment with noise scaled as random.normal() * temperature_durator.

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output]

malaya_speech.model.synthesis.E2E_FastSpeech#

class malaya_speech.model.synthesis.E2E_FastSpeech[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, temperature_durator=0.6666, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this value increases the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this value increases the pitch frequency; a lower frequency produces a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this value increases the loudness.

  • temperature_durator (float, optional (default=0.6666)) – The duration predictor samples the alignment with noise scaled as random.normal() * temperature_durator.

Returns

result

Return type

Dict[string, decoder-output, y]

malaya_speech.model.synthesis.FastVC#

class malaya_speech.model.synthesis.FastVC[source]#
predict(original_audio, target_audio)[source]#

Change original voice audio to follow targeted voice.

Parameters
  • original_audio (np.array or malaya_speech.model.frame.Frame) –

  • target_audio (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

Dict[decoder-output, mel-output]

malaya_speech.model.synthesis.Fastpitch#

class malaya_speech.model.synthesis.Fastpitch[source]#
predict(string, speed_ratio=1.0, pitch_ratio=1.0, pitch_addition=0.0, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this value increases the duration of the generated voice.

  • pitch_ratio (float, optional (default=1.0)) – pitch = pitch * pitch_ratio, amplify existing pitch contour.

  • pitch_addition (float, optional (default=0.0)) – pitch = pitch + pitch_addition, change pitch contour.

Returns

result

Return type

Dict[string, decoder-output, mel-output, pitch-output, universal-output]

malaya_speech.model.transducer.Transducer#

class malaya_speech.model.transducer.Transducer[source]#
predict_alignment(input, combined=True)[source]#

Transcribe input and return timestamps; only the greedy decoder is supported.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • combined (bool, optional (default=True)) – If True, subwords will be combined into words.

Returns

result

Return type

List[Dict[text, start, end]]

greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

beam_decoder(inputs, beam_width=5, temperature=0.0, score_norm=True)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=5)) – beam size for beam decoder.

  • temperature (float, optional (default=0.0)) – apply a temperature function to the logits, which can help in certain cases: logits += -np.log(-np.log(uniform_noise_shape_logits)) * temperature

  • score_norm (bool, optional (default=True)) – sort beams in descending order by score / length of decoded output.

Returns

result

Return type

List[str]

beam_decoder_lm(inputs, language_model, beam_width=5, token_min_logp=- 20.0, beam_prune_logp=- 5.0, temperature=0.0, score_norm=True)[source]#

Transcribe inputs using beam decoder + KenLM.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • language_model (pyctcdecode.language_model.LanguageModel) – pyctcdecode language model, load from LanguageModel(kenlm_model, alpha = alpha, beta = beta).

  • beam_width (int, optional (default=5)) – beam size for beam decoder.

  • token_min_logp (float, optional (default=-20.0)) – minimum log probability to select a token.

  • beam_prune_logp (float, optional (default=-5.0)) – filter candidates >= max score lm + beam_prune_logp.

  • temperature (float, optional (default=0.0)) – apply a temperature function to the logits, which can help in certain cases: logits += -np.log(-np.log(uniform_noise_shape_logits)) * temperature

  • score_norm (bool, optional (default=True)) – sort beams in descending order by score / length of decoded output.

Returns

result

Return type

List[str]

predict(inputs)[source]#

Transcribe inputs using the greedy decoder; returns a list of strings.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]
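
A decoding sketch, assuming `model` is a loaded transducer ASR model returning this class and speech.wav is a hypothetical 16 kHz recording.

>>> from malaya_speech.utils.read import load
>>> y, sr = load('speech.wav', sr=16000)
>>> model.greedy_decoder([y])              # fastest, no search
>>> model.beam_decoder([y], beam_width=5)  # usually slightly more accurate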

gradio(record_mode=True, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode, else, file upload mode.

  • **kwargs (keyword arguments for beam decoder and iface.launch.) –

malaya_speech.model.transducer.TransducerAligner#

class malaya_speech.model.transducer.TransducerAligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Force align the given transcription against the input audio.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[words_alignment, subwords_alignment, subwords, alignment]
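
A forced-alignment sketch, assuming `aligner` is a loaded model returning this class; the audio file and transcription below are hypothetical.

>>> from malaya_speech.utils.read import load
>>> y, sr = load('speech.wav', sr=16000)
>>> r = aligner.predict(y, 'teks transkripsi sebenar', sample_rate=sr)
>>> r['words_alignment'][:3]  # word-level timestamps and scores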

malaya_speech.model.unet.UNET#

class malaya_speech.model.unet.UNET[source]#
predict(inputs)[source]#

Enhance inputs and return mel spectrograms.

Parameters

inputs (List[np.array]) –

Returns

result

Return type

List

malaya_speech.model.unet.UNETSTFT#

class malaya_speech.model.unet.UNETSTFT[source]#
predict(input)[source]#

Enhance input and return a waveform.

Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame.

Returns

result

Return type

Dict

malaya_speech.model.unet.UNET1D#

class malaya_speech.model.unet.UNET1D[source]#
predict(input)[source]#

Enhance input and return a waveform.

Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame.

Returns

result

Return type

np.array

malaya_speech.model.wav2vec.Wav2Vec2_CTC#

class malaya_speech.model.wav2vec.Wav2Vec2_CTC[source]#
greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

beam_decoder(inputs, beam_width=100, **kwargs)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=100)) – beam size for beam decoder.

Returns

result

Return type

List[str]

predict(inputs)[source]#

Transcribe inputs using the greedy decoder; returns a list of strings.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

gradio(record_mode=True, lm_func=None, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode, else, file upload mode.

  • lm_func (Callable, optional (default=None)) – if not None, will pass a logits with shape [T, D].

  • **kwargs (keyword arguments for beam decoder and iface.launch.) –

malaya_speech.model.wav2vec.Wav2Vec2_Aligner#

class malaya_speech.model.wav2vec.Wav2Vec2_Aligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Force align the given transcription against the input audio.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.model.webrtc.WebRTC#

class malaya_speech.model.webrtc.WebRTC(vad, sample_rate=16000, minimum_amplitude=100)[source]#

malaya_speech.torch_model.huggingface.CTC#

class malaya_speech.torch_model.huggingface.CTC[source]#
greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict(inputs)[source]#

Transcribe inputs using the greedy decoder; returns a list of strings.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

gradio(record_mode=True, lm_func=None, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode, else, file upload mode.

  • lm_func (Callable, optional (default=None)) – if not None, will pass a logits with shape [T, D].

  • **kwargs (keyword arguments for iface.launch.) –

malaya_speech.torch_model.huggingface.Aligner#

class malaya_speech.torch_model.huggingface.Aligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Force align the given transcription against the input audio.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.torch_model.huggingface.Seq2Seq#

class malaya_speech.torch_model.huggingface.Seq2Seq[source]#
generate(inputs, skip_special_tokens=True, **kwargs)[source]#

Transcribe inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • skip_special_tokens (bool, optional (default=True)) –

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>, **kwargs)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

malaya_speech.torch_model.huggingface.Seq2SeqAligner#

class malaya_speech.torch_model.huggingface.Seq2SeqAligner[source]#
predict(input, transcription, lang='ms', median_filter_size=7)[source]#

Force align the given transcription against the input audio. Based on https://github.com/openai/whisper/blob/main/notebooks/Multilingual_ASR.ipynb

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • lang (str, optional (default='ms')) – if the input speech is Singlish, it is better to set lang to 'en'.

  • median_filter_size (int, optional (default=7)) – sliding median size.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.torch_model.huggingface.XVector#

class malaya_speech.torch_model.huggingface.XVector[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

forward(inputs)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

malaya_speech.torch_model.nemo.SpeakerVector#

class malaya_speech.torch_model.nemo.SpeakerVector[source]#
forward(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

np.array

malaya_speech.torch_model.nemo.Classification#

class malaya_speech.torch_model.nemo.Classification[source]#
forward(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

predict_proba(inputs)[source]#

Predict probabilities for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

predict(inputs)[source]#

Predict labels for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B].

Return type

List[str]

malaya_speech.torch_model.super_resolution.VoiceFixer#

class malaya_speech.torch_model.super_resolution.VoiceFixer[source]#
predict(input, remove_higher_frequency=True)[source]#
Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame, must be audio with a 44100 Hz sample rate.

  • remove_higher_frequency (bool, optional (default = True)) – Remove high frequency before neural upsampling.

Returns

result

Return type

np.array with 44100 sampling rate

forward(input, remove_higher_frequency=True)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

malaya_speech.torch_model.super_resolution.NVSR#

class malaya_speech.torch_model.super_resolution.NVSR[source]#
predict(input)[source]#
Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame, must be audio with a 44100 Hz sample rate.

Returns

result

Return type

np.array with 44100 sampling rate

malaya_speech.torch_model.synthesis.VITS#

class malaya_speech.torch_model.synthesis.VITS[source]#

malaya_speech.torch_model.torchaudio.Conformer#

class malaya_speech.torch_model.torchaudio.Conformer[source]#
forward(inputs, beam_width=20)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=20)) – beam size for beam decoder.

Returns

result

Return type

List[Tuple]

beam_decoder(inputs, beam_width=20)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=20)) – beam size for beam decoder.

Returns

result

Return type

List[str]

malaya_speech.torch_model.torchaudio.ForceAlignment#

class malaya_speech.torch_model.torchaudio.ForceAlignment[source]#
predict(input, transcription, temperature=1.0)[source]#

Force align the given transcription against the input audio.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio

  • temperature (float, optional (default=1.0)) – temperature for logits.

Returns

result

Return type

Dict[words_alignment, subwords_alignment, subwords, alignment]

malaya_speech.pipeline#

class malaya_speech.pipeline.Pipeline[source]#
visualize(filename='pipeline.png', **kwargs)[source]#

Render the computation of this object’s task graph using graphviz.

Requires graphviz to be installed.

Parameters
  • filename (str, optional) – The name of the file to write to disk.

  • kwargs – Graph attributes to pass to graphviz like rankdir="LR"

batching = <function batching>#
flatten = <function flatten>#
foreach_map = <function foreach_map>#
map = <function map>#
partition = <function partition>#
sliding_window = <function sliding_window>#
zip = <function zip>#

malaya_speech.pipeline.map#

class malaya_speech.pipeline.map[source]#

apply a function / method to the pipeline

Examples

>>> source = Pipeline()
>>> source.map(lambda x: x + 1).map(print)
>>> source.emit(1)
2

malaya_speech.pipeline.batching#

class malaya_speech.pipeline.batching[source]#

Batching stream into tuples

Examples

>>> source = Pipeline()
>>> source.batching(2).map(print)
>>> source.emit([1,2,3,4,5])
([1, 2], [3, 4], [5])

malaya_speech.pipeline.partition#

class malaya_speech.pipeline.partition[source]#

Partition stream into tuples of equal size

Examples

>>> source = Pipeline()
>>> source.partition(3).map(print)
>>> for i in range(10):
...     source.emit(i)
(0, 1, 2)
(3, 4, 5)
(6, 7, 8)

malaya_speech.pipeline.sliding_window#

class malaya_speech.pipeline.sliding_window[source]#

Produce overlapping tuples of size n

Parameters

return_partial (bool) – If True, yield tuples as soon as any events come in, each tuple being smaller or equal to the window size. If False, only start yielding tuples once a full window has accrued.

Examples

>>> source = Pipeline()
>>> source.sliding_window(3, return_partial=False).map(print)
>>> for i in range(8):
...     source.emit(i)
(0, 1, 2)
(1, 2, 3)
(2, 3, 4)
(3, 4, 5)
(4, 5, 6)
(5, 6, 7)

malaya_speech.pipeline.foreach_map#

class malaya_speech.pipeline.foreach_map[source]#

Apply a function to every element in a tuple in the stream.

Parameters
  • func (callable) –

  • method (str, optional (default='sync')) –

    method to process each elements.

    • 'sync' - loop one-by-one to process.

    • 'async' - async process all elements at the same time.

    • 'thread' - multithreading level to process all elements at the same time.

      Default is 1 worker. Override worker_size=n to increase.

    • 'process' - multiprocessing level to process all elements at the same time.

      Default is 1 worker. Override worker_size=n to increase.

  • *args – The arguments to pass to the function.

  • **kwargs – Keyword arguments to pass to func.

Examples

>>> source = Pipeline()
>>> source.foreach_map(lambda x: 2*x).map(print)
>>> for i in range(3):
...     source.emit((i, i))
(0, 0)
(2, 2)
(4, 4)

malaya_speech.pipeline.flatten#

class malaya_speech.pipeline.flatten[source]#

Flatten streams of lists or iterables into a stream of elements

Examples

>>> source = Pipeline()
>>> source.flatten().map(print)
>>> source.emit([[1, 2, 3], [4, 5], [6, 7, 7]])
[1, 2, 3, 4, 5, 6, 7, 7]

malaya_speech.pipeline.zip#

class malaya_speech.pipeline.zip[source]#

Combine 2 branches into 1 branch.

Examples

>>> source = Pipeline()
>>> left = source.map(lambda x: x + 1, name = 'left')
>>> right = source.map(lambda x: x + 10, name = 'right')
>>> left.zip(right).map(sum).map(print)
>>> source.emit(2)
15
pack_literals(tup)[source]#

Fill buffers for literals whenever we empty them

malaya_speech.streaming.pyaudio#

malaya_speech.streaming.pyaudio.stream(vad_model=None, asr_model=None, classification_model=None, sample_rate=16000, segment_length=2560, num_padding_frames=20, ratio=0.75, min_length=0.1, max_length=10.0, realtime_print=True, **kwargs)[source]#

Stream an audio using pyaudio library.

Parameters
  • vad_model (object, optional (default=None)) – vad model / pipeline.

  • asr_model (object, optional (default=None)) – ASR model / pipeline, will transcribe each subsamples realtime.

  • classification_model (object, optional (default=None)) – classification pipeline, will classify each subsamples realtime.

  • device (None, optional (default=None)) – device parameter for pyaudio, check available devices from sounddevice.query_devices().

  • sample_rate (int, optional (default = 16000)) – output sample rate.

  • segment_length (int, optional (default=2560)) – usually derived from asr_model.segment_length * asr_model.hop_length; size of audio chunks, actual size in terms of seconds is segment_length / sample_rate.

  • num_padding_frames (int, optional (default=20)) – size of acceptable padding frames for queue.

  • ratio (float, optional (default = 0.75)) – if 75% of the queue is positive, assumed it is a voice activity.

  • min_length (float, optional (default=0.1)) – minimum length (second) to accept a subsample.

  • max_length (float, optional (default=10.0)) – maximum length (second) to accept a subsample.

  • realtime_print (bool, optional (default=True)) – Will print results for ASR.

  • **kwargs (vector argument) – vector argument pass to malaya_speech.streaming.pyaudio.Audio interface.

Returns

result

Return type

List[dict]
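
A streaming sketch, assuming `vad` and `asr` are loaded malaya_speech VAD and ASR models / pipelines; stopping the stream (for example with Ctrl+C) returns the collected results.

>>> from malaya_speech.streaming import pyaudio
>>> results = pyaudio.stream(vad_model=vad, asr_model=asr, sample_rate=16000)
>>> results[0]  # one dict per accepted subsample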

malaya_speech.streaming.torchaudio#

https://pytorch.org/audio/stable/tutorials/online_asr_tutorial.html

class malaya_speech.streaming.torchaudio.ContextCacher(segment_length, context_length)[source]#

Cache the end of input data and prepend the next input data with it.

Parameters
  • segment_length (int) – The size of main segment. If the incoming segment is shorter, then the segment is padded.

  • context_length (int) – The size of the context, cached and appended.

malaya_speech.streaming.torchaudio.stream(src, vad_model=None, asr_model=None, classification_model=None, format=None, option=None, buffer_size=4096, sample_rate=16000, segment_length=2560, num_padding_frames=20, ratio=0.75, min_length=0.1, max_length=10.0, realtime_print=True, **kwargs)[source]#

Stream an audio using torchaudio library.

Parameters
  • vad_model (object, optional (default=None)) – vad model / pipeline.

  • asr_model (object, optional (default=None)) – ASR model / pipeline, will transcribe each subsamples realtime.

  • classification_model (object, optional (default=None)) – classification pipeline, will classify each subsamples realtime.

  • format (str, optional (default=None)) – Supported format for torchaudio.io.StreamReader, https://pytorch.org/audio/stable/generated/torchaudio.io.StreamReader.html#torchaudio.io.StreamReader

  • option (dict, optional (default=None)) – Supported option for torchaudio.io.StreamReader, https://pytorch.org/audio/stable/generated/torchaudio.io.StreamReader.html#torchaudio.io.StreamReader

  • buffer_size (int, optional (default=4096)) – Supported buffer_size for torchaudio.io.StreamReader, buffer size in byte. Used only when src is file-like object, https://pytorch.org/audio/stable/generated/torchaudio.io.StreamReader.html#torchaudio.io.StreamReader

  • sample_rate (int, optional (default = 16000)) – output sample rate.

  • segment_length (int, optional (default=2560)) – usually derived from asr_model.segment_length * asr_model.hop_length; size of audio chunks, actual size in terms of seconds is segment_length / sample_rate.

  • num_padding_frames (int, optional (default=20)) – size of acceptable padding frames for queue.

  • ratio (float, optional (default = 0.75)) – if 75% of the queue is positive, assumed it is a voice activity.

  • min_length (float, optional (default=0.1)) – minimum length (second) to accept a subsample.

  • max_length (float, optional (default=10.0)) – maximum length (second) to accept a subsample.

  • realtime_print (bool, optional (default=True)) – Will print results for ASR.

  • **kwargs (vector argument) – vector argument pass to malaya_speech.streaming.pyaudio.Audio interface.

Returns

result

Return type

List[dict]

malaya_speech.streaming.torchaudio.stream_rnnt(src, asr_model=None, classification_model=None, format=None, option=None, beam_width=10, buffer_size=4096, sample_rate=16000, segment_length=2560, context_length=640, realtime_print=True, **kwargs)[source]#

malaya_speech.utils.aligner#

class malaya_speech.utils.aligner.Point(token_index, time_index, score)[source]#
class malaya_speech.utils.aligner.Segment(label, start, end, score)[source]#
malaya_speech.utils.aligner.put_comma(alignment, min_threshold=0.5)[source]#

Put comma in alignment from force alignment model.

Parameters
  • alignment (List[Dict[text, start, end]]) –

  • min_threshold (float, optional (default=0.5)) – minimum threshold in term of seconds to assume a comma.

Returns

result

Return type

List[str]

malaya_speech.utils.aligner.plot_alignments(alignment, subs_alignment, words_alignment, waveform, separator=' ', sample_rate=16000, figsize=(16, 9), plot_score_char=False, plot_score_word=True)[source]#

plot alignment.

Parameters
  • alignment (np.array) – usually alignment output.

  • subs_alignment (list) – usually chars_alignment or subwords_alignment output.

  • words_alignment (list) – usually words_alignment output.

  • waveform (np.array) – input audio.

  • separator (str, optional (default=' ')) – separator between words, only useful if subs_alignment is character based.

  • sample_rate (int, optional (default=16000)) –

  • figsize (tuple, optional (default=(16, 9))) – figure size for matplotlib figsize.

  • plot_score_char (bool, optional (default=False)) – plot score on top of character plots.

  • plot_score_word (bool, optional (default=True)) – plot score on top of word plots.

malaya_speech.utils.astype#

malaya_speech.utils.astype.to_ndarray(array)[source]#

Change list / tuple / bytes into np.array

Parameters

array (list / tuple / bytes) –

Returns

result

Return type

np.array

malaya_speech.utils.astype.to_byte(array)[source]#

Change list / tuple / np.array into bytes

Parameters

array (list / tuple / np.array) –

Returns

result

Return type

bytes

malaya_speech.utils.astype.float_to_int(array, type=<class 'numpy.int16'>, divide_max_abs=True)[source]#

Change np.array float32 / float64 into np.int16

Parameters
  • array (np.array) –

  • type (np.int16) –

Returns

result

Return type

np.array

malaya_speech.utils.astype.int_to_float(array, type=<class 'numpy.float32'>)[source]#

Change np.array int16 into np.float32

Parameters
  • array (np.array) –

  • type (np.float32) –

Returns

result

Return type

np.array
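
A round-trip sketch between float32 and int16 audio buffers; the random signal is illustrative.

>>> import numpy as np
>>> from malaya_speech.utils import astype
>>> y = np.random.uniform(-1, 1, size=16000).astype(np.float32)
>>> y_int16 = astype.float_to_int(y)        # float32 in [-1, 1] -> int16
>>> y_float = astype.int_to_float(y_int16)  # int16 -> float32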

malaya_speech.utils.char#

malaya_speech.utils.char.strip_ids(ids, ids_to_strip)[source]#

Strip ids_to_strip from the end of ids.

malaya_speech.utils.char.generate_vocab(strings)[source]#

Generate character vocab sorted based on frequency.

Parameters

strings (List[str]) –

Returns

result

Return type

List[str]

malaya_speech.utils.char.encode(string, add_eos=True, add_blank=False, lookup=None)[source]#

Encode string to integer representation based on ascii table or lookup variable.

Parameters
  • string (str) –

  • add_eos (bool, optional (default=True)) – add EOS token at the end of the encoded sequence.

  • add_blank (bool, optional (default=False)) – add BLANK token at the start of the encoded sequence; this is for transducer / transformer based models.

  • lookup (List[str], optional (default=None)) – list of unique strings.

Returns

result

Return type

List[int]

malaya_speech.utils.char.decode(ids, lookup=None)[source]#

Decode integer representation to string based on ascii table or lookup variable.

Parameters
  • ids (List[int]) –

  • lookup (List[str], optional (default=None)) – list of unique strings.

Returns

result

Return type

str
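
An encode / decode round-trip sketch using the default ASCII table.

>>> from malaya_speech.utils import char
>>> ids = char.encode('hello', add_eos=True)
>>> char.decode(ids)  # recovers the original string (EOS handling depends on the lookup)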

malaya_speech.utils.combine#

malaya_speech.utils.combine.without_silent(frames, threshold_to_stop=0.1, silent_trail=500)[source]#

Group multiple frames based on label and threshold to stop.

Parameters
  • frames (List[Tuple[Frame, label]]) – Output from VAD.

  • threshold_to_stop (float, optional (default = 0.1)) – If threshold_to_stop is 0.1, consecutive samples with the same label must span at least 0.1 second.

  • silent_trail (int, optional (default = 500)) – if silence is detected, append the first N frames and the last N frames.

Returns

result

Return type

np.array

malaya_speech.utils.featurization#

malaya_speech.utils.featurization.normalize_signal(signal, gain=None)[source]#

Normalize float32 signal to [-1, 1] range

malaya_speech.utils.featurization.extract_pitch(y, hop_size=256, sr=22050, bad_f0=5.0, zero_value=- 10.0)[source]#

Originally from https://github.com/yl4579/PitchExtractor/blob/main/meldataset.py

malaya_speech.utils.generator#

malaya_speech.utils.generator.frames(audio, frame_duration_ms=30, sample_rate=16000, append_ending_trail=True)[source]#

Generates audio frames from audio. Takes the desired frame duration in milliseconds, the audio, and the sample rate.

Parameters
  • audio (np.array) –

  • frame_duration_ms (int, optional (default=30)) –

  • sample_rate (int, optional (default=16000)) –

  • append_ending_trail (bool, optional (default=True)) – if True, the last trailing frame will be appended even though it may be shorter than frame_duration_ms.

Returns

result

Return type

List[malaya_speech.model.frame.Frame]
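
A framing sketch, for example to feed a VAD model; the random audio array is illustrative.

>>> import numpy as np
>>> from malaya_speech.utils.generator import frames
>>> y = np.random.uniform(-1, 1, size=16000 * 3)  # hypothetical 3 seconds at 16 kHz
>>> chunks = frames(y, frame_duration_ms=30, sample_rate=16000)
>>> len(chunks)  # roughly 100 frames of 30 ms, plus any ending trail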

malaya_speech.utils.generator.mel_sampling(audio, frame_duration_ms=1200, overlap_ms=200, sample_rate=16000)[source]#

Generates audio frames from audio. This is for melspectrogram generative model. Takes the desired frame duration in milliseconds, the audio, and the sample rate.

Parameters
  • audio (np.array) –

  • frame_duration_ms (int, optional (default=1200)) –

  • overlap_ms (int, optional (default=200)) –

  • sample_rate (int, optional (default=16000)) –

Returns

result

Return type

List[np.array]

malaya_speech.utils.generator.combine_mel_sampling(samples, overlap_ms=200, sample_rate=16000, padded_ms=50)[source]#

To combine results from mel_sampling, output from melspectrogram generative model.

Parameters
  • samples (List[np.array]) –

  • overlap_ms (int, optional (default=200)) –

  • sample_rate (int, optional (default=16000)) –

Returns

result

Return type

List[np.array]

malaya_speech.utils.griffin_lim#

malaya_speech.utils.griffin_lim.from_mel(mel_, sr=16000, n_fft=2048, n_iter=32, win_length=1000, hop_length=100)[source]#

Change melspectrogram into waveform using Librosa.

Parameters

spectrogram (np.array) –

Returns

result

Return type

np.array

malaya_speech.utils.griffin_lim.from_mel_vocoder(mel, sr=22050, n_fft=1024, n_mels=256, fmin=80, fmax=7600, n_iter=32, win_length=None, hop_length=256)[source]#

Change melspectrogram into waveform using Librosa.

Parameters

spectrogram (np.array) –

Returns

result

Return type

np.array

malaya_speech.utils.group#

malaya_speech.utils.group.combine_frames(frames)[source]#

Combine multiple frames into one frame.

Parameters

frames (List[Frame]) –

Returns

result

Return type

Frame

malaya_speech.utils.group.group_frames(frames)[source]#

Group multiple frames based on label.

Parameters

frames (List[Tuple[Frame, label]]) –

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.utils.group.group_frames_threshold(frames, threshold_to_stop=0.3)[source]#

Group multiple frames based on label and threshold to stop.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • threshold_to_stop (float, optional (default = 0.3)) – If threshold_to_stop is 0.3, consecutive samples with the same label must span at least 0.3 second.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.utils.io#

malaya_speech.utils.io.write_srt(transcript, file)[source]#

Write list of transcription into SRT format.

Parameters
  • transcript (List[dict]) – list of {‘start’, ‘end’, ‘text’}

  • file (typing.TextIO) –

malaya_speech.utils.io.write_vtt(transcript, file)[source]#

Write list of transcription into VTT format.

Parameters
  • transcript (List[dict]) – list of {‘start’, ‘end’, ‘text’}

  • file (typing.TextIO) –

malaya_speech.utils.io.write_tsv(transcript, file)[source]#

Write list of transcription into TSV format.

Parameters
  • transcript (List[dict]) – list of {‘start’, ‘end’, ‘text’}

  • file (typing.TextIO) –

malaya_speech.utils.padding#

malaya_speech.utils.padding.sequence_1d(seq, maxlen=None, padding='post', pad_int=0, return_len=False)[source]#

Pad a sequence of 1D arrays into a 2D array.

Parameters
  • seq (List[List[int]]) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If pre, will add 0 on the starting side, else add 0 on the end side.

  • pad_int (int, optional (default=0)) – padding value.

Returns

result

Return type

np.array
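
A padding sketch for batching variable-length sequences.

>>> from malaya_speech.utils.padding import sequence_1d
>>> padded = sequence_1d([[1, 2, 3], [4, 5]], pad_int=0)
>>> padded.shape  # (2, 3); the shorter row is post-padded with 0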

malaya_speech.utils.padding.sequence_nd(seq, maxlen=None, padding='post', pad_val=0.0, dim=1, return_len=False)[source]#

Pad a sequence of N-dimensional arrays into an (N+1)-dimensional array.

Parameters
  • seq (list of nd array) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If pre, will add 0 on the starting side, else add 0 on the end side.

  • pad_val (float, optional (default=0.0)) – padding value.

  • dim (int, optional (default=1)) –

Returns

result

Return type

np.array

malaya_speech.utils.padding.tf_sequence_nd(seq, maxlen=None, padding='post', pad_val=0.0, dim=1, return_len=False)[source]#

Pad a sequence of N-dimensional arrays into an (N+1)-dimensional array.

Parameters
  • seq (list of nd array) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If pre, will add 0 on the starting side, else add 0 on the end side.

  • pad_val (float, optional (default=0.0)) – padding value.

  • dim (int, optional (default=1)) –

Returns

result

Return type

np.array

malaya_speech.utils.read#

malaya_speech.utils.read.resample(data, old_samplerate, new_samplerate)[source]#

Resample signal.

Parameters
  • data (np.array) –

  • old_samplerate (int) – old sample rate.

  • new_samplerate (int) – new sample rate.

Returns

result

Return type

data

malaya_speech.utils.read.load(file, sr=16000, scale=True)[source]#

Read sound file, any format supported by soundfile.read and torchaudio.load

Parameters
  • file (str) –

  • sr (int, (default=16000)) – new sample rate. If input sample rate is not same, will resample automatically.

  • scale (bool, (default=True)) – Scale to -1 and 1.

Returns

result

Return type

(y, sr)
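
A loading and resampling sketch; the file path is hypothetical.

>>> from malaya_speech.utils.read import load, resample
>>> y, sr = load('speech.wav', sr=16000)  # resamples to 16 kHz if necessary
>>> y_8k = resample(y, old_samplerate=sr, new_samplerate=8000)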

malaya_speech.utils.split#

malaya_speech.utils.split.split_vad(frames, n=3, negative_threshold=0.1)[source]#

Split a sample into multiple samples based on n negative VAD segments.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • n (int, optional (default=3)) – number of negative VAD segments to allow in one subsample.

  • negative_threshold (float, optional (default = 0.1)) – If negative_threshold is 0.1, negative samples must span at least 0.1 second to be counted.

Returns

result

Return type

List[Frame]

malaya_speech.utils.split.split_vad_duration(frames, max_duration=5.0, negative_threshold=0.1)[source]#

Split a sample into multiple samples based on the maximum duration of voice activities.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • max_duration (float, optional (default = 5.0)) – Maximum duration to assume one sample combined from voice activities.

  • negative_threshold (float, optional (default = 0.1)) – If negative_threshold is 0.1, negative samples must span at least 0.1 second to be counted.

Returns

result

Return type

List[Frame]

malaya_speech.utils.subword#

malaya_speech.utils.subword.generate_tokenizer(strings, target_vocab_size=1024, max_subword_length=4, max_corpus_chars=None, reserved_tokens=None)[source]#

Build a subword dictionary.

malaya_speech.utils.subword.save(tokenizer, path)[source]#

Save subword dictionary to a text file.

malaya_speech.utils.subword.load(path)[source]#

Load text file into subword dictionary.

malaya_speech.utils.subword.encode(tokenizer, string, add_blank=False)[source]#

Encode string to integer representation based on the subword tokenizer vocab.

Parameters
  • tokenizer (object) – tokenizer object

  • string (str) –

  • add_blank (bool, optional (default=False)) – add BLANK token at the start of the encoded sequence; this is for transducer / transformer based models.

Returns

result

Return type

List[int]

malaya_speech.utils.subword.decode(tokenizer, ids)[source]#

Decode integer representation to string based on tokenizer vocab.

Parameters
  • tokenizer (object) – tokenizer object

  • ids (List[int]) –

Returns

result

Return type

str

malaya_speech.utils.subword.decode_multilanguage(tokenizers, ids)[source]#

Decode integer representation to string using list of tokenizer objects.

Parameters
  • tokenizers (List[object]) – List of tokenizer objects.

  • ids (List[int]) –

Returns

result

Return type

str

malaya_speech.utils.subword.load_sentencepiece(model_file)[source]#
Parameters

model_file (str) – sentencepiece model file.

Returns

result

Return type

sentencepiece.SentencePieceProcessor

malaya_speech.utils.tf_featurization#

malaya_speech.utils.torch_featurization#

https://github.com/pytorch/audio/blob/main/examples/asr/librispeech_conformer_rnnt/transforms.py

malaya_speech.utils.torch_featurization.conformer_rnnt_model(*, input_dim, encoding_dim, time_reduction_stride, conformer_input_dim, conformer_ffn_dim, conformer_num_layers, conformer_num_heads, conformer_depthwise_conv_kernel_size, conformer_dropout, num_symbols, symbol_embedding_dim, num_lstm_layers, lstm_hidden_dim, lstm_layer_norm, lstm_layer_norm_epsilon, lstm_dropout, joiner_activation)[source]#

Builds Conformer-based recurrent neural network transducer (RNN-T) model.

Parameters
  • input_dim (int) – dimension of input sequence frames passed to transcription network.

  • encoding_dim (int) – dimension of transcription- and prediction-network-generated encodings passed to joint network.

  • time_reduction_stride (int) – factor by which to reduce length of input sequence.

  • conformer_input_dim (int) – dimension of Conformer input.

  • conformer_ffn_dim (int) – hidden layer dimension of each Conformer layer’s feedforward network.

  • conformer_num_layers (int) – number of Conformer layers to instantiate.

  • conformer_num_heads (int) – number of attention heads in each Conformer layer.

  • conformer_depthwise_conv_kernel_size (int) – kernel size of each Conformer layer’s depthwise convolution layer.

  • conformer_dropout (float) – Conformer dropout probability.

  • num_symbols (int) – cardinality of set of target tokens.

  • symbol_embedding_dim (int) – dimension of each target token embedding.

  • num_lstm_layers (int) – number of LSTM layers to instantiate.

  • lstm_hidden_dim (int) – output dimension of each LSTM layer.

  • lstm_layer_norm (bool) – if True, enables layer normalization for LSTM layers.

  • lstm_layer_norm_epsilon (float) – value of epsilon to use in LSTM layer normalization layers.

  • lstm_dropout (float) – LSTM dropout probability.

  • joiner_activation (str) – activation function to use in the joiner. Must be one of (“relu”, “tanh”). (Default: “relu”)

Returns

Conformer RNN-T model.

Return type

RNNT

class malaya_speech.utils.torch_featurization.FunctionalModule(functional)[source]#
forward(input)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

class malaya_speech.utils.torch_featurization.GlobalStatsNormalization(global_stats_path)[source]#
forward(input)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

class malaya_speech.utils.torch_featurization.FeatureExtractor(global_stats_path, pad=False)[source]#
forward(input)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

malaya_speech.utils.torch_featurization.separate_sources(model, mix, segment=10.0, overlap=0.1, device=None)[source]#

Apply model to a given mixture. Uses fades and adds segments together to process the mixture segment by segment.

Parameters
  • segment (int) – segment length in seconds

  • device (torch.device, str, or None) – if provided, device on which to execute the computation, otherwise mix.device is assumed. When device is different from mix.device, only local computations will be on device, while the entire tracks will be stored on mix.device.

malaya_speech.age_detection#

malaya_speech.age_detection.available_model()[source]#

List available age detection deep models.

malaya_speech.age_detection.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load age detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.age_detection.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. A quantized model is not necessarily faster; it depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.diarization#

malaya_speech.diarization.streaming(vector, streaming_model, add_speaker_prefix=True)[source]#

Streaming speaker diarization.

Parameters
  • vector (np.array) – np.array or malaya_speech.model.frame.Frame.

  • streaming_model (Callable) – must have streaming method.

  • add_speaker_prefix (bool, optional (default=True)) – if True, will add ‘speaker ‘ as prefix.

Returns

result

Return type

str

malaya_speech.diarization.speaker_similarity(vad_results, speaker_vector, similarity_threshold=0.8, agg_function=<function mean>, return_embedding=False)[source]#

Speaker diarization using L2-Norm similarity.

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_vector (callable) – speaker vector object.

  • similarity_threshold (float, optional (default=0.8)) – if the current voice-activity sample has a similarity of at least 0.8, it is assumed to come from the same speaker.

Returns

result

Return type

List[Tuple[Frame, label]]
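
A diarization sketch, assuming vad_results came from a VAD pipeline (for example malaya_speech.utils.generator.frames plus a VAD model) and speaker_vector is a loaded speaker-vector model; both loaders are outside this section.

>>> from malaya_speech import diarization
>>> result = diarization.speaker_similarity(vad_results, speaker_vector, similarity_threshold=0.8)
>>> result[:2]  # list of (Frame, speaker label) tuples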

malaya_speech.diarization.clustering(vad_results, speaker_vector, model, norm_function=<function l2_normalize>, log_distance_metric=None, return_embedding=False)[source]#

Speaker diarization using any clustering model.

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_vector (callable) – speaker vector object.

  • model (callable) – Any unsupervised clustering model. Required fit_predict or apply or predict method.

  • norm_function (Callable, optional(default=malaya_speech.utils.dist.l2_normalize)) – normalize function for speaker vectors.

  • log_distance_metric (str, optional (default=None)) – post-distance norm in log-scale metrics. This parameter is necessary for models that require a square array input. Common values are one of [‘cosine’, ‘angular’].

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.diarization.combine(list_results, speaker_vector, similarity_threshold=0.8, agg_function=<function mean>, sortby_pagerank=True)[source]#

Combine multiple diarization results into a single diarization result using PageRank. Requires the malaya and networkx libraries.

Parameters
  • list_results (List[List[Tuple[Frame, label]]]) – results from multiple diarization runs.

  • speaker_vector (callable) – speaker vector object.

  • similarity_threshold (float, optional (default=0.8)) – if the current voice-activity sample has a similarity of at least 0.8, it is assumed to come from one of the existing speakers.

  • agg_function (Callable, optional (default=np.mean)) – aggregate function to aggregate when we have multiple samples for the same speaker.

  • sortby_pagerank (bool, optional (default=True)) – sort speaker names using PageRank score. This requires malaya to be installed.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.emotion#

malaya_speech.emotion.available_model()[source]#

List available emotion detection deep models.

malaya_speech.emotion.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load emotion detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.emotion.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. A quantized model is not necessarily faster; it depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.force_alignment.ctc#

malaya_speech.force_alignment.ctc.available_transformer()[source]#

List available Encoder-CTC Aligner models.

malaya_speech.force_alignment.ctc.available_huggingface()[source]#

List available HuggingFace Malaya-Speech Aligner models.

malaya_speech.force_alignment.ctc.transformer(model='hubert-conformer', quantized=False, **kwargs)[source]#

Load Encoder-CTC ASR model.

Parameters
  • model (str, optional (default='hubert-conformer')) – Check available models at malaya_speech.force_alignment.ctc.available_transformer().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. A quantized model is not necessarily faster; it depends on the machine.

Returns

result

Return type

malaya_speech.model.wav2vec.Wav2Vec2_Aligner class

malaya_speech.force_alignment.ctc.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/wav2vec2-xls-r-300m-mixed')) – Check available models at malaya_speech.force_alignment.ctc.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.model.huggingface.Aligner class

malaya_speech.force_alignment.seq2seq#

malaya_speech.force_alignment.seq2seq.huggingface(model='mesolitica/finetune-whisper-base-ms-singlish-v2', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/finetune-whisper-base-ms-singlish-v2')) – Check available models at malaya_speech.force_alignment.seq2seq.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.model.huggingface.Seq2SeqAligner class

malaya_speech.force_alignment.transducer#

malaya_speech.force_alignment.transducer.available_transformer()[source]#

List available Encoder-Transducer Aligner models.

malaya_speech.force_alignment.transducer.transformer(model='conformer-transducer', quantized=False, **kwargs)[source]#

Load Encoder-Transducer Aligner model.

Parameters
  • model (str, optional (default='conformer-transducer')) – Check available models at malaya_speech.force_alignment.transducer.available_transformer().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.transducer.TransducerAligner class

malaya_speech.force_alignment.transducer.pt_transformer(model='mesolitica/conformer-base', **kwargs)[source]#

Load Encoder-Transducer Aligner model using Pytorch.

Parameters

model (str, optional (default='mesolitica/conformer-base')) – Check available models at malaya_speech.force_alignment.transducer.available_pt_transformer().

Returns

result

Return type

malaya_speech.torch_model.torchaudio.ForceAlignment class

malaya_speech.gender#

malaya_speech.gender.available_model()[source]#

List available gender detection deep models.

malaya_speech.gender.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load gender detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.gender.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.is_clean#

malaya_speech.is_clean.available_nemo()[source]#

List available Nvidia Nemo is-clean models.

malaya_speech.is_clean.nemo(model='huseinzol05/nemo-is-clean-speakernet', **kwargs)[source]#

Load Nvidia Nemo is-clean model. Trained on 100, 200, and 300 ms frames.

Parameters

model (str, optional (default='huseinzol05/nemo-is-clean-speakernet')) – Check available models at malaya_speech.is_clean.available_nemo().

Returns

result

Return type

malaya_speech.torch_model.nemo.Classification class
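
A usage sketch; the frame length follows the note above (100-300 ms), and predict is assumed to behave like the other classification wrappers in this API:

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    model = malaya_speech.is_clean.nemo(model='huseinzol05/nemo-is-clean-speakernet')
    # pass short frames as trained; a 300 ms frame at 16 kHz is 4800 samples
    frame = y[:4800]
    print(model.predict([frame]))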

malaya_speech.language_detection#

malaya_speech.language_detection.available_model()[source]#

List available language detection deep models.

malaya_speech.language_detection.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load language detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.language_detection.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.language_model#

malaya_speech.language_model.available_kenlm()[source]#

List available KenLM Language Model.

malaya_speech.language_model.available_gpt2()[source]#

List available GPT2 Language Model.

malaya_speech.language_model.available_mlm()[source]#

List available MLM Language Model.

malaya_speech.language_model.kenlm(model='dump-combined', **kwargs)[source]#

Load KenLM language model.

Parameters

model (str, optional (default='dump-combined')) – Check available models at malaya_speech.language_model.available_kenlm().

Returns

result

Return type

str
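
Since kenlm() returns a file path, one way to use it is to score text directly with the kenlm Python package (pip install kenlm); feeding the same file into a CTC beam-search decoder is the more common use in ASR:

    import malaya_speech
    import kenlm

    # returns a local path to the downloaded language model file
    lm_path = malaya_speech.language_model.kenlm(model='dump-combined')

    lm = kenlm.Model(lm_path)
    print(lm.score('saya suka makan ayam', bos=True, eos=True))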

malaya_speech.language_model.gpt2(model='mesolitica/gpt2-117m-bahasa-cased', force_check=True, **kwargs)[source]#

Load GPT2 language model.

Parameters
  • model (str, optional (default='mesolitica/gpt2-117m-bahasa-cased')) – Check available models at malaya_speech.language_model.available_gpt2().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya.torch_model.gpt2_lm.LM class

malaya_speech.language_model.mlm(model='mesolitica/bert-base-standard-bahasa-cased', force_check=True, **kwargs)[source]#

Load Masked language model.

Parameters
  • model (str, optional (default='mesolitica/bert-base-standard-bahasa-cased')) – Check available models at malaya_speech.language_model.available_mlm().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.torch_model.mask_lm.LM class

malaya_speech.multispeaker_separation#

malaya_speech.multispeaker_separation.available_deep_wav()[source]#

List available FastSep models trained on raw 8k wav.

malaya_speech.multispeaker_separation.deep_wav(model='fastsep-4', quantized=False, **kwargs)[source]#

Load FastSep model, trained on raw 8k wav using SISNR PIT loss.

Parameters
  • model (str, optional (default='fastsep-4')) – Check available models at malaya_speech.multispeaker_separation.available_deep_wav().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.tf.Split class
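
A usage sketch; the model expects raw 8 kHz audio as noted above. The sr argument to malaya_speech.load and the shape of the returned separated tracks are assumptions:

    import malaya_speech

    # hypothetical mixed-speaker recording, resampled to 8 kHz
    y, sr = malaya_speech.load('mixed.wav', sr=8000)

    model = malaya_speech.multispeaker_separation.deep_wav(model='fastsep-4')
    # assumed: returns one waveform per separated speaker
    separated = model.predict(y)
    print(len(separated))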

malaya_speech.noise_reduction#

malaya_speech.noise_reduction.available_model()[source]#

List available Noise Reduction deep learning models.

malaya_speech.noise_reduction.deep_model(model='resnet-unet', quantized=False, **kwargs)[source]#

Load Noise Reduction deep learning model.

Parameters
  • model (str, optional (default='resnet-unet')) – Check available models at malaya_speech.noise_reduction.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.tf.UNET_STFT class
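
A usage sketch; the output key names ('voice', 'noise') are assumptions, so inspect the dict returned by the UNET_STFT class to confirm:

    import malaya_speech

    y, sr = malaya_speech.load('noisy.wav')  # hypothetical input file

    model = malaya_speech.noise_reduction.deep_model(model='resnet-unet')
    output = model.predict(y)
    # assumed output keys
    voice, noise = output['voice'], output['noise']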

malaya_speech.speaker_change#

malaya_speech.speaker_change.available_model()[source]#

List available speaker change deep models.

malaya_speech.speaker_change.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load speaker change deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_change.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_change.split_activities(vad_results, speaker_change_results, speaker_change_threshold=0.5, sr=16000, ignore_not_activity=True)[source]#

Split VAD results based on the speaker change threshold; worst-case O(N^2).

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_change_results (List[Tuple[Frame, float]], optional (default=None)) – results from the speaker change module; must be float probabilities.

  • speaker_change_threshold (float, optional (default=0.5)) – a single voice activity sample can contain more than one speaker; split it using this threshold.

  • sr (int, optional (default=16000)) – sample rate; classification models in malaya-speech use 16k.

  • ignore_not_activity (bool, optional (default=True)) – If True, will ignore frames where the VAD result is False, else will try to split them as well.

Returns

result

Return type

List[Tuple[Frame, label]]
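
A sketch of how the two inputs are typically built by running a VAD model and a speaker change model over the same frames; the frame helper malaya_speech.generator.frames and the probability column index are assumptions:

    import malaya_speech

    y, sr = malaya_speech.load('conversation.wav')  # hypothetical recording

    # assumed helper: split the signal into 50 ms Frame objects
    frames = list(malaya_speech.generator.frames(y, 50, sr))

    vad = malaya_speech.vad.deep_model()
    speaker_change = malaya_speech.speaker_change.deep_model()

    vad_results = [(frame, vad.predict([frame])[0]) for frame in frames]
    # assumed: column 1 of predict_proba is the speaker-change probability
    change_results = [
        (frame, speaker_change.predict_proba([frame])[0, 1]) for frame in frames
    ]

    splitted = malaya_speech.speaker_change.split_activities(
        vad_results, change_results, speaker_change_threshold=0.5
    )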

malaya_speech.speaker_count#

malaya_speech.speaker_count.available_nemo()[source]#

List available Nvidia Nemo speaker count models.

malaya_speech.speaker_count.nemo(model='huseinzol05/nemo-speaker-count-speakernet', **kwargs)[source]#

Load Nvidia Nemo speaker count model. Trained on 300 ms frames.

Parameters

model (str, optional (default='huseinzol05/nemo-speaker-count-speakernet')) – Check available models at malaya_speech.speaker_count.available_nemo().

Returns

result

Return type

malaya_speech.torch_model.nemo.Classification class

malaya_speech.speaker_overlap#

malaya_speech.speaker_overlap.available_model()[source]#

List available speaker overlap deep models.

malaya_speech.speaker_overlap.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load speaker overlap deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_overlap.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_vector#

malaya_speech.speaker_vector.available_model()[source]#

List available speaker vector deep models using Tensorflow.

malaya_speech.speaker_vector.available_nemo()[source]#

List available Nvidia Nemo Speaker vector models.

malaya_speech.speaker_vector.available_huggingface()[source]#

List available HuggingFace Speaker vector models.

malaya_speech.speaker_vector.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load Speaker2Vec model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_vector.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_vector.nemo(model='huseinzol05/nemo-ecapa-tdnn', **kwargs)[source]#

Load Nemo Speaker verification model.

Parameters

model (str, optional (default='huseinzol05/nemo-ecapa-tdnn')) – Check available models at malaya_speech.speaker_vector.available_nemo().

Returns

result

Return type

malaya_speech.torch_model.nemo.SpeakerVector class

malaya_speech.speaker_vector.huggingface(model='microsoft/wavlm-base-plus-sv', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='microsoft/wavlm-base-plus-sv')) – Check available models at malaya_speech.speaker_vector.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.torch_model.huggingface.XVector class
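
A usage sketch comparing two speakers with cosine similarity over the [B, D] embeddings returned by vectorize; the input files are hypothetical:

    import malaya_speech
    import numpy as np

    y1, _ = malaya_speech.load('speaker_a.wav')  # hypothetical files
    y2, _ = malaya_speech.load('speaker_b.wav')

    model = malaya_speech.speaker_vector.huggingface(model='microsoft/wavlm-base-plus-sv')
    vectors = model.vectorize([y1, y2])  # [2, D]

    a, b = vectors[0], vectors[1]
    cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    print(cosine)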

malaya_speech.speech_enhancement#

malaya_speech.speech_enhancement.available_deep_masking()[source]#

List available Speech Enhancement STFT masking deep learning models.

malaya_speech.speech_enhancement.available_deep_enhance()[source]#

List available Speech Enhancement UNET Waveform sampling deep learning models.

malaya_speech.speech_enhancement.deep_masking(model='resnet-unet', quantized=False, **kwargs)[source]#

Load Speech Enhancement STFT UNET masking deep learning model.

Parameters
  • model (str, optional (default='resnet-unet')) – Check available models at malaya_speech.speech_enhancement.available_deep_masking().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.unet.UNETSTFT class

malaya_speech.speech_enhancement.deep_enhance(model='unet', quantized=False, **kwargs)[source]#

Load Speech Enhancement UNET Waveform sampling deep learning model.

Parameters
  • model (str, optional (default='unet')) – Check available models at malaya_speech.speech_enhancement.available_deep_enhance().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.unet.UNET1D class
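
A usage sketch; whether deep_enhance returns the waveform directly or wraps it in a dict is an assumption, so check the UNET1D class:

    import malaya_speech

    y, sr = malaya_speech.load('noisy.wav')  # hypothetical input file

    model = malaya_speech.speech_enhancement.deep_enhance(model='unet')
    # assumed: returns the enhanced waveform at the same length as the input
    enhanced = model.predict(y)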

malaya_speech.speechsplit_conversion#

malaya_speech.speechsplit_conversion.available_deep_conversion(f0_mode='pysptk')[source]#

List available Voice Conversion models.

Parameters

f0_mode (str, optional (default='pysptk')) –

Supported F0 conversion mode. Allowed values:

malaya_speech.speechsplit_conversion.deep_conversion(model='fastspeechsplit-v2-vggvox-v2', f0_mode='pysptk', quantized=False, **kwargs)[source]#

Load Voice Conversion model.

Parameters
  • model (str, optional (default='fastspeechsplit-v2-vggvox-v2')) – Check available models at malaya_speech.speechsplit_conversion.available_deep_conversion(f0_mode='{f0_mode}')

  • f0_mode (str, optional (default='pysptk')) –

    Supported F0 conversion mode. Allowed values:

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.splitter.FastSpeechSplit class

malaya_speech.stack#

malaya_speech.stack.classification_stack(models)[source]#

Stacking for classification models. All models should belong to the same classification domain.

Parameters

models (List[Callable]) – list of models.

Returns

result

Return type

malaya_speech.stack.Stack class
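
A usage sketch stacking two emotion models; the second model name is only an example and should be one of the names listed by malaya_speech.emotion.available_model():

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    # both models must come from the same classification domain
    model_a = malaya_speech.emotion.deep_model(model='vggvox-v2')
    model_b = malaya_speech.emotion.deep_model(model='deep-speaker')  # example name

    stacked = malaya_speech.stack.classification_stack([model_a, model_b])
    print(stacked.predict([y]))
    print(stacked.predict_proba([y]))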

malaya_speech.model.stack.Stack#

class malaya_speech.stack.Stack[source]#
predict_proba(inputs, aggregate=<function gmean>)[source]#

Stacking for predictive models, will return probability.

Parameters
  • inputs (List[np.array]) –

  • aggregate (Callable, optional (default=scipy.stats.mstats.gmean)) – Aggregate function.

Returns

result

Return type

np.array

predict(inputs, aggregate=<function gmean>)[source]#

Stacking for predictive models, will return labels.

Parameters
  • inputs (List[np.array]) –

  • aggregate (Callable, optional (default=scipy.stats.mstats.gmean)) – Aggregate function.

Returns

result

Return type

List[str]

malaya_speech.stt.ctc#

malaya_speech.stt.ctc.available_transformer()[source]#

List available Encoder-CTC ASR models.

malaya_speech.stt.ctc.available_huggingface()[source]#

List available HuggingFace CTC ASR models.

malaya_speech.stt.ctc.transformer(model='hubert-conformer', quantized=False, **kwargs)[source]#

Load Encoder-CTC ASR model.

Parameters
  • model (str, optional (default='hubert-conformer')) – Check available models at malaya_speech.stt.ctc.available_transformer().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.wav2vec.Wav2Vec2_CTC class

malaya_speech.stt.ctc.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/wav2vec2-xls-r-300m-mixed')) – Check available models at malaya_speech.stt.ctc.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.torch_model.huggingface.CTC class
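
A transcription sketch; the predict method name on the CTC wrapper is an assumption (it may also expose beam decoding with a language model):

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    model = malaya_speech.stt.ctc.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed')
    # assumed greedy decoding over a batch of audio
    print(model.predict([y]))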

malaya_speech.stt.seq2seq#

malaya_speech.stt.seq2seq.available_huggingface()[source]#

List available HuggingFace Seq2Seq ASR models.

malaya_speech.stt.seq2seq.available_whisper()[source]#

List available OpenAI Whisper ASR models.

malaya_speech.stt.seq2seq.huggingface(model='mesolitica/finetune-whisper-base-ms-singlish-v2', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/finetune-whisper-base-ms-singlish-v2')) – Check available models at malaya_speech.stt.seq2seq.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.model.huggingface.Seq2Seq class

malaya_speech.stt.seq2seq.whisper(model='mesolitica/finetune-whisper-base-ms-singlish-v2', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/finetune-whisper-base-ms-singlish-v2')) – Check available models at malaya_speech.stt.seq2seq.available_whisper().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

whisper.model.Whisper class

malaya_speech.stt.transducer#

malaya_speech.stt.transducer.available_transformer()[source]#

List available Encoder-Transducer ASR models using Tensorflow.

malaya_speech.stt.transducer.available_pt_transformer()[source]#

List available Encoder-Transducer ASR models using Pytorch.

malaya_speech.stt.transducer.transformer(model='conformer', quantized=False, **kwargs)[source]#

Load Encoder-Transducer ASR model using Tensorflow.

Parameters
  • model (str, optional (default='conformer')) – Check available models at malaya_speech.stt.transducer.available_transformer().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.transducer.Transducer class

malaya_speech.stt.transducer.pt_transformer(model='mesolitica/conformer-base', **kwargs)[source]#

Load Encoder-Transducer ASR model using Pytorch.

Parameters

model (str, optional (default='mesolitica/conformer-base')) – Check available models at malaya_speech.stt.transducer.available_pt_transformer().

Returns

result

Return type

malaya_speech.torch_model.torchaudio.Conformer class
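
A transcription sketch for the PyTorch Conformer; the beam_decoder method name is an assumption based on the other transducer wrappers in this API:

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    model = malaya_speech.stt.transducer.pt_transformer(model='mesolitica/conformer-base')
    # assumed decoding interface
    print(model.beam_decoder([y]))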

malaya_speech.super_resolution#

malaya_speech.super_resolution.available_unet()[source]#

List available Super Resolution 4x deep learning UNET models.

malaya_speech.super_resolution.available_vocoder()[source]#

List available Super Resolution deep learning vocoder models.

malaya_speech.super_resolution.available_diffusion()[source]#

List available Super Resolution deep learning diffusion models.

malaya_speech.super_resolution.unet(model='srgan-256', quantized=False, **kwargs)[source]#

Load Super Resolution 4x deep learning UNET model.

Parameters
  • model (str, optional (default='srgan-256')) – Check available models at malaya_speech.super_resolution.available_unet().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.tf.UNET1D class

malaya_speech.super_resolution.vocoder(model='hifigan-bwe', **kwargs)[source]#

Load vocoder-based super resolution model.

Parameters

model (str, optional (default='hifigan-bwe')) – Check available models at malaya_speech.super_resolution.available_vocoder().

Returns

result

Return type

malaya_speech.torch_model.super_resolution.*

malaya_speech.super_resolution.diffusion(model='nuwave2', **kwargs)[source]#

Load audio diffusion-based super resolution model.

Parameters

model (str, optional (default='nuwave2')) – Check available models at malaya_speech.super_resolution.available_diffusion().

Returns

result

Return type

malaya_speech.torch_model.super_resolution.NuWave2

malaya_speech.tts#

malaya_speech.tts.available_tacotron2()[source]#

List available Tacotron2, Text to Mel models.

malaya_speech.tts.available_fastspeech2()[source]#

List available FastSpeech2, Text to Mel models.

malaya_speech.tts.available_fastpitch()[source]#

List available FastPitch, Text to Mel models.

malaya_speech.tts.available_glowtts()[source]#

List available GlowTTS, Text to Mel models.

malaya_speech.tts.available_lightspeech()[source]#

List available LightSpeech, Text to Mel models.

malaya_speech.tts.available_e2e_fastspeech2()[source]#

List available FastSpeech2, End-to-End models.

malaya_speech.tts.available_vits()[source]#

List available VITS, End-to-End models.

malaya_speech.tts.available_vits_v2()[source]#

List available VITS V2, End-to-End models.

malaya_speech.tts.load_text_ids(pad_to=8, understand_punct=True, is_lower=True, **kwargs)[source]#

Load text normalizer module used by Malaya-Speech TTS.

malaya_speech.tts.tacotron2(model='yasmin', quantized=False, pad_to=8, **kwargs)[source]#

Load Tacotron2 Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='yasmin')) – Check available models at malaya_speech.tts.available_tacotron2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Tacotron class

malaya_speech.tts.fastspeech2(model='osman', quantized=False, pad_to=8, **kwargs)[source]#

Load Fastspeech2 Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='osman')) – Check available models at malaya_speech.tts.available_fastspeech2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastspeech class

malaya_speech.tts.fastpitch(model='male', quantized=False, pad_to=8, **kwargs)[source]#

Load FastPitch Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='male')) – Check available models at malaya_speech.tts.available_fastpitch().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastpitch class

malaya_speech.tts.glowtts(model='yasmin', quantized=False, pad_to=2, **kwargs)[source]#

Load GlowTTS Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='yasmin')) – Check available models at malaya_speech.tts.available_glowtts().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=2)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 2.

Returns

result

Return type

malaya_speech.model.synthesis.GlowTTS class

malaya_speech.tts.lightspeech(model='male', quantized=False, pad_to=8, **kwargs)[source]#

Load LightSpeech Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='male')) – Check available models at malaya_speech.tts.available_lightspeech().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastspeech class

malaya_speech.tts.e2e_fastspeech2(model='osman', quantized=False, pad_to=8, **kwargs)[source]#

Load FastSpeech2 End-to-End TTS model.

Parameters
  • model (str, optional (default='osman')) – Check available models at malaya_speech.tts.available_e2e_fastspeech2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.E2E_FastSpeech class

malaya_speech.tts.vits(model='mesolitica/VITS-osman', **kwargs)[source]#

Load VITS End-to-End TTS model.

Parameters

model (str, optional (default='mesolitica/VITS-osman')) – Check available models at malaya_speech.tts.available_vits().

Returns

result

Return type

malaya_speech.torch_model.synthesis.VITS class

malaya_speech.tts.vits_v2(model='mesolitica/VITS-V2-husein', **kwargs)[source]#

Load VITS V2 End-to-End TTS model.

Parameters

model (str, optional (default='mesolitica/VITS-V2-husein')) – Check available models at malaya_speech.tts.available_vits_v2().

Returns

result

Return type

malaya_speech.torch_model.synthesis.VITS class

malaya_speech.vad#

malaya_speech.vad.available_model()[source]#

List available VAD deep models.

malaya_speech.vad.available_nemo()[source]#

List available Nvidia Nemo VAD models.

malaya_speech.vad.webrtc(aggressiveness=3, sample_rate=16000, minimum_amplitude=100)[source]#

Load WebRTC VAD model. WebRTC prefers 30 ms frames, https://github.com/wiseman/py-webrtcvad#how-to-use-it

Parameters
  • aggressiveness (int, optional (default=3)) – an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.

  • sample_rate (int, optional (default=16000)) – sample rate for samples.

  • minimum_amplitude (int, optional (default=100)) – minimum absolute amplitude for a sample to be considered voice activity; otherwise the frame is automatically labelled False.

Returns

result

Return type

malaya_speech.model.webrtc.WebRTC class
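
A usage sketch; the frame helper malaya_speech.generator.frames and calling the WebRTC model directly on a frame are assumptions:

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    vad = malaya_speech.vad.webrtc(aggressiveness=3, sample_rate=sr)

    # WebRTC prefers 30 ms frames
    frames = list(malaya_speech.generator.frames(y, 30, sr))
    preds = [(frame, vad(frame)) for frame in frames]
    # preds can then be passed to malaya_speech.extra.visualization.visualize_vad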

malaya_speech.vad.deep_model(model='marblenet-factor1', quantized=False, **kwargs)[source]#

Load VAD model. Prefers 50 ms or larger frames.

Parameters
  • model (str, optional (default='marblenet-factor1')) – Check available models at malaya_speech.vad.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model not necessary faster, totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.vad.nemo(model='huseinzol05/nemo-vad-marblenet', **kwargs)[source]#

Load Nemo VAD model. Nemo VAD prefers 63 ms frames, https://github.com/NVIDIA/NeMo/blob/02cf155b020964992a974e030b9e318426761e33/nemo/collections/asr/data/feature_to_label_dataset.py#L43

Parameters

model (str, optional (default='huseinzol05/nemo-vad-marblenet')) – Check available models at malaya_speech.vad.available_nemo().

Returns

result

Return type

malaya_speech.torch_model.nemo.Classification class

malaya_speech.vocoder#

malaya_speech.vocoder.available_melgan()[source]#

List available MelGAN Mel-to-Speech models.

malaya_speech.vocoder.available_mbmelgan()[source]#

List available Multiband MelGAN Mel-to-Speech models.

malaya_speech.vocoder.available_hifigan()[source]#

List available HiFiGAN Mel-to-Speech models.

malaya_speech.vocoder.available_pt_hifigan()[source]#

List available PyTorch HiFiGAN Mel-to-Speech models.

malaya_speech.vocoder.melgan(model='universal-1024', quantized=False, **kwargs)[source]#

Load MelGAN Vocoder model.

Parameters
  • model (str, optional (default='universal-1024')) – Check available models at malaya_speech.vocoder.available_melgan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class

malaya_speech.vocoder.mbmelgan(model='female', quantized=False, **kwargs)[source]#

Load Multiband MelGAN Vocoder model.

Parameters
  • model (str, optional (default='female')) – Check available models at malaya_speech.vocoder.available_mbmelgan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class

malaya_speech.vocoder.hifigan(model='universal-768', quantized=False, **kwargs)[source]#

Load HiFiGAN Vocoder model.

Parameters
  • model (str, optional (default='universal-768')) – Check available models at malaya_speech.vocoder.available_hifigan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class
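
A Text-to-Mel plus vocoder sketch; the output key name 'postnet-output' and calling the vocoder object directly on the mel spectrogram are assumptions, so inspect the dict returned by the TTS model:

    import malaya_speech

    tts = malaya_speech.tts.fastspeech2(model='osman')
    vocoder = malaya_speech.vocoder.hifigan(model='universal-768')

    r = tts.predict('nama saya husein')
    # assumed key name; check r.keys() for the actual mel output
    y_ = vocoder(r['postnet-output'])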

malaya_speech.vocoder.pt_hifigan(model='huseinzol05/jik876-UNIVERSAL_V1', **kwargs)[source]#

Load PyTorch HiFiGAN Vocoder model, originally from https://github.com/jik876/hifi-gan.

Parameters

model (str, optional (default='huseinzol05/jik876-UNIVERSAL_V1')) –

Returns

result

Return type

malaya_speech.torch_model.synthesis.Vocoder class

malaya_speech.voice_conversion#

malaya_speech.voice_conversion.available_fastvc()[source]#

List available Voice Conversion models.

malaya_speech.voice_conversion.fastvc(model='fastvc-32-vggvox-v2', quantized=False, **kwargs)[source]#

Load Voice Conversion FastVC model.

Parameters
  • model (str, optional (default='fastvc-32-vggvox-v2')) – Check available models at malaya_speech.voice_conversion.available_fastvc().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.FastVC class
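
A usage sketch; the predict(original, target) call order and the structure of the returned result are assumptions, so check the FastVC class:

    import malaya_speech

    original, _ = malaya_speech.load('original_speaker.wav')  # hypothetical files
    target, _ = malaya_speech.load('target_speaker.wav')

    model = malaya_speech.voice_conversion.fastvc(model='fastvc-32-vggvox-v2')
    # assumed: convert `original` so it sounds like `target`
    r = model.predict(original, target)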