API#

malaya_speech#

malaya_speech.augmentation.spectrogram#

malaya_speech.augmentation.spectrogram.mask_frequency(features, n_freq_mask=2, width_freq_mask=8, random_band=True)[source]#

Mask frequency.

Parameters
  • features (np.array) –

  • n_freq_mask (int, optional (default=2)) – loop size for masking.

  • width_freq_mask (int, optional (default=8)) – masking size.

Returns

result

Return type

np.array

malaya_speech.augmentation.spectrogram.mask_time(features, n_time_mask=2, width_time_mask=8, random_band=True)[source]#

Mask time.

Parameters
  • features (np.array) –

  • n_time_mask (int, optional (default=2)) – loop size for masking.

  • width_time_mask (int, optional (default=8)) – masking size.

Returns

result

Return type

np.array
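
A minimal SpecAugment-style sketch combining both masks; the random [time, freq] array below is purely illustrative and stands in for a real mel spectrogram.

>>> import numpy as np
>>> from malaya_speech.augmentation.spectrogram import mask_frequency, mask_time
>>> features = np.random.rand(200, 80)  # hypothetical mel spectrogram
>>> augmented = mask_frequency(features, n_freq_mask=2, width_freq_mask=8)
>>> augmented = mask_time(augmented, n_time_mask=2, width_time_mask=8)
>>> augmented.shape  # masking zeroes out bands, the shape is unchanged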

malaya_speech.augmentation.spectrogram.tf_mask_frequency(features, n_freq_mask=2, F=27)[source]#

Mask frequency using Tensorflow.

Parameters
  • features (np.array) –

  • F (size of mask for frequency) –

malaya_speech.augmentation.spectrogram.tf_mask_time(features, n_time_mask=2, T=80)[source]#

Mask time using Tensorflow.

Parameters
  • features (np.array) –

  • T (size of mask for time) –

malaya_speech.extra.rttm#

malaya_speech.extra.rttm.load(file)[source]#

Load RTTM file.

Parameters

file (str) –

Returns

result

Return type

Dict[str, malaya_speech.model.annotation.Annotation]
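
A small usage sketch; the RTTM file path below is hypothetical.

>>> from malaya_speech.extra import rttm
>>> annotations = rttm.load('diarization.rttm')  # hypothetical RTTM file
>>> list(annotations.keys())  # one Annotation object per uri in the file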

malaya_speech.extra.visualization#

malaya_speech.extra.visualization.visualize_vad(signal, preds, sample_rate=16000, figsize=(15, 3), ax=None, **kwargs)[source]#

Visualize signal given VAD labels. Green indicates voice activity, red indicates no voice activity.

Parameters
  • signal (list / np.array) –

  • preds (List[Tuple[Frame, bool]]) –

  • sample_rate (int, optional (default=16000)) –

  • figsize (Tuple[int, int], optional (default=(15, 3))) – matplotlib figure size.

malaya_speech.extra.visualization.plot_classification(preds, description, ax=None, fontsize_text=14, x_text=0.05, y_text=0.2, ylim=(0.1, 0.9), figsize=(15, 3), **kwargs)[source]#

Visualize probability / boolean.

Parameters
  • preds (List[Tuple[Frame, label]]) –

  • description (str) –

  • ax (ax, optional (default = None)) –

  • fontsize_text (int, optional (default = 14)) –

  • x_text (float, optional (default = 0.05)) –

  • y_text (float, optional (default = 0.2)) –

malaya_speech.model.classification.Speakernet#

class malaya_speech.model.classification.Speakernet[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array
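
A vectorization sketch, assuming `model` is an already-loaded speaker-vector model that returns this class and speaker.wav is a hypothetical 16 kHz recording.

>>> from malaya_speech.utils.read import load
>>> y, sr = load('speaker.wav', sr=16000)  # hypothetical audio file
>>> embedding = model.vectorize([y])       # `model` assumed loaded beforehand
>>> embedding.shape                        # [B, D]; D depends on the model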

malaya_speech.model.classification.Speaker2Vec#

class malaya_speech.model.classification.Speaker2Vec[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

malaya_speech.model.classification.SpeakernetClassification#

class malaya_speech.model.classification.SpeakernetClassification[source]#
predict_proba(inputs)[source]#

Predict probabilities for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

predict(inputs)[source]#

Predict labels for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B].

Return type

List[str]

malaya_speech.model.classification.Classification#

class malaya_speech.model.classification.Classification[source]#
predict_proba(inputs)[source]#

Predict probabilities for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

predict(inputs)[source]#

Predict labels for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B].

Return type

List[str]

malaya_speech.model.clustering.AgglomerativeClustering#

class malaya_speech.model.clustering.AgglomerativeClustering(min_clusters, max_clusters, metric='cosine', threshold=0.25, method='centroid')[source]#
fit_predict(X)[source]#

Fit predict.

Parameters

X (np.array) – inputs with size of [batch_size, embedding size]

Returns

result

Return type

np.array
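
A minimal clustering sketch; the random embeddings stand in for real speaker vectors.

>>> import numpy as np
>>> from malaya_speech.model.clustering import AgglomerativeClustering
>>> X = np.random.rand(10, 512)  # hypothetical [batch_size, embedding size]
>>> clustering = AgglomerativeClustering(min_clusters=2, max_clusters=5)
>>> labels = clustering.fit_predict(X)  # one cluster label per row of X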

malaya_speech.model.clustering.HiddenMarkovModelClustering#

class malaya_speech.model.clustering.HiddenMarkovModelClustering(min_clusters, max_clusters, metric='cosine', covariance_type='diag', threshold=0.25, single_cluster_detection_quantile=0.05, single_cluster_detection_threshold=1.15)[source]#
fit_predict(X)[source]#

Fit predict.

Parameters

X (np.array) – inputs with size of [batch_size, embedding size]

Returns

result

Return type

np.array

malaya_speech.model.clustering.StreamingKMeansMaxCluster#

class malaya_speech.model.clustering.StreamingKMeansMaxCluster[source]#

malaya_speech.model.clustering.StreamingKMeans#

class malaya_speech.model.clustering.StreamingKMeans[source]#

malaya_speech.model.clustering.StreamingSpeakerSimilarity#

class malaya_speech.model.clustering.StreamingSpeakerSimilarity(similarity_threshold=0.8, agg_function=<function mean>)[source]#

malaya_speech.model.splitter.Split_Wav#

class malaya_speech.model.splitter.Split_Wav[source]#
predict(input)[source]#

Split an audio into 4 different speakers.

Parameters

input (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

np.array

malaya_speech.model.splitter.Split_Mel#

class malaya_speech.model.splitter.Split_Mel[source]#
predict(input)[source]#

Split an audio into 4 different speakers.

Parameters

input (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

np.array

malaya_speech.model.splitter.FastSpeechSplit#

class malaya_speech.model.splitter.FastSpeechSplit[source]#
predict(original_audio, target_audio, modes=['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'])[source]#

Change original voice audio to follow targeted voice.

Parameters
  • original_audio (np.array or malaya_speech.model.frame.Frame) –

  • target_audio (np.array or malaya_speech.model.frame.Frame) –

  • modes (List[str], optional (default = ['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'])) –

    R denotes rhythm, F denotes pitch target, U denotes speaker target (vector).

    • 'R' - maintain original_audio F and U on target_audio R.

    • 'F' - maintain original_audio R and U on target_audio F.

    • 'U' - maintain original_audio R and F on target_audio U.

    • 'RF' - maintain original_audio U on target_audio R and F.

    • 'RU' - maintain original_audio F on target_audio R and U.

    • 'FU' - maintain original_audio R on target_audio F and U.

    • 'RFU' - no conversion happened, just do encoder-decoder on target_audio

Returns

result

Return type

Dict[modes]

malaya_speech.model.synthesis.TTS#

class malaya_speech.model.synthesis.TTS[source]#
gradio(vocoder=None, **kwargs)[source]#

Text-to-Speech on Gradio interface.

Parameters
  • vocoder (Callable, optional (default=None)) – vocoder object that has predict method, prefer from malaya_speech itself. Not required if using End-to-End TTS model such as VITS.

  • **kwargs (keyword arguments for predict and iface.launch.) –

malaya_speech.model.synthesis.Vocoder#

class malaya_speech.model.synthesis.Vocoder[source]#
predict(inputs)[source]#

Change Mel to Waveform.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List

malaya_speech.model.synthesis.Tacotron#

class malaya_speech.model.synthesis.Tacotron[source]#
predict(string, **kwargs)[source]#

Change string to Mel.

Parameters

string (str) –

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output, alignment]

malaya_speech.model.synthesis.Fastspeech#

class malaya_speech.model.synthesis.Fastspeech[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this value increases the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this value increases the pitch frequency; a lower frequency produces a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this value increases the loudness.

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output]
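
A text-to-speech sketch, assuming `tts` is a loaded model returning this class and `vocoder` is a loaded malaya_speech vocoder (see malaya_speech.model.synthesis.Vocoder above); which output key to feed the vocoder depends on the vocoder used.

>>> r = tts.predict('sebarang teks melayu', speed_ratio=1.0)
>>> mel = r['universal-output']  # or r['mel-output'], depending on the vocoder
>>> y = vocoder.predict([mel])[0]  # waveform ready to save or play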

malaya_speech.model.synthesis.FastspeechSDP#

class malaya_speech.model.synthesis.FastspeechSDP[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, temperature_durator=0.6666, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this value increases the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this value increases the pitch frequency; a lower frequency produces a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this value increases the loudness.

  • temperature_durator (float, optional (default=0.6666)) – The duration predictor samples the alignment with noise scaled as random.normal() * temperature_durator.

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output]

malaya_speech.model.synthesis.E2E_FastSpeech#

class malaya_speech.model.synthesis.E2E_FastSpeech[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, temperature_durator=0.6666, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this value increases the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this value increases the pitch frequency; a lower frequency produces a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this value increases the loudness.

  • temperature_durator (float, optional (default=0.6666)) – The duration predictor samples the alignment with noise scaled as random.normal() * temperature_durator.

Returns

result

Return type

Dict[string, decoder-output, y]

malaya_speech.model.synthesis.FastVC#

class malaya_speech.model.synthesis.FastVC[source]#
predict(original_audio, target_audio)[source]#

Change original voice audio to follow targeted voice.

Parameters
  • original_audio (np.array or malaya_speech.model.frame.Frame) –

  • target_audio (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

Dict[decoder-output, mel-output]

malaya_speech.model.synthesis.Fastpitch#

class malaya_speech.model.synthesis.Fastpitch[source]#
predict(string, speed_ratio=1.0, pitch_ratio=1.0, pitch_addition=0.0, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this value increases the duration of the generated voice.

  • pitch_ratio (float, optional (default=1.0)) – pitch = pitch * pitch_ratio, amplify existing pitch contour.

  • pitch_addition (float, optional (default=0.0)) – pitch = pitch + pitch_addition, change pitch contour.

Returns

result

Return type

Dict[string, decoder-output, mel-output, pitch-output, universal-output]

malaya_speech.model.transducer.Transducer#

class malaya_speech.model.transducer.Transducer[source]#
predict_alignment(input, combined=True)[source]#

Transcribe input and return timestamps; only the greedy decoder is supported.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • combined (bool, optional (default=True)) – If True, subwords will be combined into words.

Returns

result

Return type

List[Dict[text, start, end]]

greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

beam_decoder(inputs, beam_width=5, temperature=0.0, score_norm=True)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=5)) – beam size for beam decoder.

  • temperature (float, optional (default=0.0)) – apply a temperature function to the logits, which can help in certain cases: logits += -np.log(-np.log(uniform_noise_shape_logits)) * temperature

  • score_norm (bool, optional (default=True)) – sort beams in descending order by score / length of decoded output.

Returns

result

Return type

List[str]

beam_decoder_lm(inputs, language_model, beam_width=5, token_min_logp=- 20.0, beam_prune_logp=- 5.0, temperature=0.0, score_norm=True)[source]#

Transcribe inputs using beam decoder + KenLM.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • language_model (pyctcdecode.language_model.LanguageModel) – pyctcdecode language model, load from LanguageModel(kenlm_model, alpha = alpha, beta = beta).

  • beam_width (int, optional (default=5)) – beam size for beam decoder.

  • token_min_logp (float, optional (default=-20.0)) – minimum log probability to select a token.

  • beam_prune_logp (float, optional (default=-5.0)) – filter candidates >= max score lm + beam_prune_logp.

  • temperature (float, optional (default=0.0)) – apply a temperature function to the logits, which can help in certain cases: logits += -np.log(-np.log(uniform_noise_shape_logits)) * temperature

  • score_norm (bool, optional (default=True)) – sort beams in descending order by score / length of decoded output.

Returns

result

Return type

List[str]

predict(inputs)[source]#

Transcribe inputs using the greedy decoder; returns a list of strings.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]
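
A decoding sketch, assuming `model` is a loaded transducer ASR model returning this class and speech.wav is a hypothetical 16 kHz recording.

>>> from malaya_speech.utils.read import load
>>> y, sr = load('speech.wav', sr=16000)
>>> model.greedy_decoder([y])              # fastest, no search
>>> model.beam_decoder([y], beam_width=5)  # usually slightly more accurate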

gradio(record_mode=True, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode, else, file upload mode.

  • **kwargs (keyword arguments for beam decoder and iface.launch.) –

malaya_speech.model.transducer.TransducerAligner#

class malaya_speech.model.transducer.TransducerAligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Force align the given transcription against the input audio.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[words_alignment, subwords_alignment, subwords, alignment]
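
A forced-alignment sketch, assuming `aligner` is a loaded model returning this class; the audio file and transcription below are hypothetical.

>>> from malaya_speech.utils.read import load
>>> y, sr = load('speech.wav', sr=16000)
>>> r = aligner.predict(y, 'teks transkripsi sebenar', sample_rate=sr)
>>> r['words_alignment'][:3]  # word-level timestamps and scores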

malaya_speech.model.unet.UNET#

class malaya_speech.model.unet.UNET[source]#
predict(inputs)[source]#

Enhance inputs and return mel spectrograms.

Parameters

inputs (List[np.array]) –

Returns

result

Return type

List

malaya_speech.model.unet.UNETSTFT#

class malaya_speech.model.unet.UNETSTFT[source]#
predict(input)[source]#

Enhance input and return a waveform.

Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame.

Returns

result

Return type

Dict

malaya_speech.model.unet.UNET1D#

class malaya_speech.model.unet.UNET1D[source]#
predict(input)[source]#

Enhance input and return a waveform.

Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame.

Returns

result

Return type

np.array

malaya_speech.model.wav2vec.Wav2Vec2_CTC#

class malaya_speech.model.wav2vec.Wav2Vec2_CTC[source]#
greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

beam_decoder(inputs, beam_width=100, **kwargs)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=100)) – beam size for beam decoder.

Returns

result

Return type

List[str]

predict(inputs)[source]#

Transcribe inputs using the greedy decoder; returns a list of strings.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

gradio(record_mode=True, lm_func=None, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode, else, file upload mode.

  • lm_func (Callable, optional (default=None)) – if not None, will pass a logits with shape [T, D].

  • **kwargs (keyword arguments for beam decoder and iface.launch.) –

malaya_speech.model.wav2vec.Wav2Vec2_Aligner#

class malaya_speech.model.wav2vec.Wav2Vec2_Aligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Force align the given transcription against the input audio.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.model.webrtc.WebRTC#

class malaya_speech.model.webrtc.WebRTC(vad, sample_rate=16000, minimum_amplitude=100)[source]#

malaya_speech.torch_model.huggingface.CTC#

class malaya_speech.torch_model.huggingface.CTC[source]#
greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict(inputs)[source]#

Transcribe inputs using the greedy decoder; returns a list of strings.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

gradio(record_mode=True, lm_func=None, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode, else, file upload mode.

  • lm_func (Callable, optional (default=None)) – if not None, will pass a logits with shape [T, D].

  • **kwargs (keyword arguments for iface.launch.) –

malaya_speech.torch_model.huggingface.Aligner#

class malaya_speech.torch_model.huggingface.Aligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Force align the given transcription against the input audio.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.torch_model.huggingface.Seq2Seq#

class malaya_speech.torch_model.huggingface.Seq2Seq[source]#
generate(inputs, skip_special_tokens=True, **kwargs)[source]#

Transcribe inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • skip_special_tokens (bool, optional (default=True)) –

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>, **kwargs)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

malaya_speech.torch_model.huggingface.Seq2SeqAligner#

class malaya_speech.torch_model.huggingface.Seq2SeqAligner[source]#
predict(input, transcription, lang='ms', median_filter_size=7)[source]#

Force align the given transcription against the input audio. Based on https://github.com/openai/whisper/blob/main/notebooks/Multilingual_ASR.ipynb

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • lang (str, optional (default='ms')) – if the input speech is Singlish, it is better to set lang to 'en'.

  • median_filter_size (int, optional (default=7)) – sliding median size.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.torch_model.huggingface.XVector#

class malaya_speech.torch_model.huggingface.XVector[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

forward(inputs)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

malaya_speech.torch_model.nemo.SpeakerVector#

class malaya_speech.torch_model.nemo.SpeakerVector[source]#
forward(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

np.array

malaya_speech.torch_model.nemo.Classification#

class malaya_speech.torch_model.nemo.Classification[source]#
forward(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

predict_proba(inputs)[source]#

Predict probabilities for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B, D].

Return type

np.array

predict(inputs)[source]#

Predict labels for inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – returned [B].

Return type

List[str]

malaya_speech.torch_model.super_resolution.VoiceFixer#

class malaya_speech.torch_model.super_resolution.VoiceFixer[source]#
predict(input, remove_higher_frequency=True)[source]#
Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame, must be audio with a 44100 Hz sample rate.

  • remove_higher_frequency (bool, optional (default = True)) – Remove high frequency before neural upsampling.

Returns

result

Return type

np.array with 44100 sampling rate

forward(input, remove_higher_frequency=True)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

malaya_speech.torch_model.super_resolution.NVSR#

class malaya_speech.torch_model.super_resolution.NVSR[source]#
predict(input)[source]#
Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame, must be audio with a 44100 Hz sample rate.

Returns

result

Return type

np.array with 44100 sampling rate

malaya_speech.torch_model.synthesis.VITS#

class malaya_speech.torch_model.synthesis.VITS[source]#

malaya_speech.torch_model.torchaudio.Conformer#

class malaya_speech.torch_model.torchaudio.Conformer[source]#
forward(inputs, beam_width=20)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=20)) – beam size for beam decoder.

Returns

result

Return type

List[Tuple]

beam_decoder(inputs, beam_width=20)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=20)) – beam size for beam decoder.

Returns

result

Return type

List[str]

malaya_speech.torch_model.torchaudio.ForceAlignment#

class malaya_speech.torch_model.torchaudio.ForceAlignment[source]#
predict(input, transcription, temperature=1.0)[source]#

Force align the given transcription against the input audio.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio

  • temperature (float, optional (default=1.0)) – temperature for logits.

Returns

result

Return type

Dict[words_alignment, subwords_alignment, subwords, alignment]

malaya_speech.pipeline#

class malaya_speech.pipeline.Pipeline[source]#
visualize(filename='pipeline.png', **kwargs)[source]#

Render the computation of this object’s task graph using graphviz.

Requires graphviz to be installed.

Parameters
  • filename (str, optional) – The name of the file to write to disk.

  • kwargs – Graph attributes to pass to graphviz like rankdir="LR"

batching = <function batching>#
flatten = <function flatten>#
foreach_map = <function foreach_map>#
map = <function map>#
partition = <function partition>#
sliding_window = <function sliding_window>#
zip = <function zip>#

malaya_speech.pipeline.map#

class malaya_speech.pipeline.map[source]#

apply a function / method to the pipeline

Examples

>>> source = Pipeline()
>>> source.map(lambda x: x + 1).map(print)
>>> source.emit(1)
2

malaya_speech.pipeline.batching#

class malaya_speech.pipeline.batching[source]#

Batching stream into tuples

Examples

>>> source = Pipeline()
>>> source.batching(2).map(print)
>>> source.emit([1,2,3,4,5])
([1, 2], [3, 4], [5])

malaya_speech.pipeline.partition#

class malaya_speech.pipeline.partition[source]#

Partition stream into tuples of equal size

Examples

>>> source = Pipeline()
>>> source.partition(3).map(print)
>>> for i in range(10):
...     source.emit(i)
(0, 1, 2)
(3, 4, 5)
(6, 7, 8)

malaya_speech.pipeline.sliding_window#

class malaya_speech.pipeline.sliding_window[source]#

Produce overlapping tuples of size n

Parameters

return_partial (bool) – If True, yield tuples as soon as any events come in, each tuple being smaller or equal to the window size. If False, only start yielding tuples once a full window has accrued.

Examples

>>> source = Pipeline()
>>> source.sliding_window(3, return_partial=False).map(print)
>>> for i in range(8):
...     source.emit(i)
(0, 1, 2)
(1, 2, 3)
(2, 3, 4)
(3, 4, 5)
(4, 5, 6)
(5, 6, 7)

malaya_speech.pipeline.foreach_map#

class malaya_speech.pipeline.foreach_map[source]#

Apply a function to every element in a tuple in the stream.

Parameters
  • func (callable) –

  • method (str, optional (default='sync')) –

    method to process each elements.

    • 'sync' - loop one-by-one to process.

    • 'async' - async process all elements at the same time.

    • 'thread' - multithreading level to process all elements at the same time.

      Default is 1 worker. Override worker_size=n to increase.

    • 'process' - multiprocessing level to process all elements at the same time.

      Default is 1 worker. Override worker_size=n to increase.

  • *args – The arguments to pass to the function.

  • **kwargs – Keyword arguments to pass to func.

Examples

>>> source = Pipeline()
>>> source.foreach_map(lambda x: 2*x).map(print)
>>> for i in range(3):
...     source.emit((i, i))
(0, 0)
(2, 2)
(4, 4)

malaya_speech.pipeline.flatten#

class malaya_speech.pipeline.flatten[source]#

Flatten streams of lists or iterables into a stream of elements

Examples

>>> source = Pipeline()
>>> source.flatten().map(print)
>>> source.emit([[1, 2, 3], [4, 5], [6, 7, 7]])
[1, 2, 3, 4, 5, 6, 7, 7]

malaya_speech.pipeline.zip#

class malaya_speech.pipeline.zip[source]#

Combine 2 branches into 1 branch.

Examples

>>> source = Pipeline()
>>> left = source.map(lambda x: x + 1, name = 'left')
>>> right = source.map(lambda x: x + 10, name = 'right')
>>> left.zip(right).map(sum).map(print)
>>> source.emit(2)
15
pack_literals(tup)[source]#

Fill buffers for literals whenever we empty them

malaya_speech.streaming.pyaudio#

malaya_speech.streaming.pyaudio.stream(vad_model=None, asr_model=None, classification_model=None, sample_rate=16000, segment_length=2560, num_padding_frames=20, ratio=0.75, min_length=0.1, max_length=10.0, realtime_print=True, **kwargs)[source]#

Stream an audio using pyaudio library.

Parameters
  • vad_model (object, optional (default=None)) – vad model / pipeline.

  • asr_model (object, optional (default=None)) – ASR model / pipeline, will transcribe each subsamples realtime.

  • classification_model (object, optional (default=None)) – classification pipeline, will classify each subsamples realtime.

  • device (None, optional (default=None)) – device parameter for pyaudio, check available devices from sounddevice.query_devices().

  • sample_rate (int, optional (default = 16000)) – output sample rate.

  • segment_length (int, optional (default=2560)) – usually derived from asr_model.segment_length * asr_model.hop_length; size of audio chunks, actual size in terms of seconds is segment_length / sample_rate.

  • num_padding_frames (int, optional (default=20)) – size of acceptable padding frames for queue.

  • ratio (float, optional (default = 0.75)) – if 75% of the queue is positive, assumed it is a voice activity.

  • min_length (float, optional (default=0.1)) – minimum length (second) to accept a subsample.

  • max_length (float, optional (default=10.0)) – maximum length (second) to accept a subsample.

  • realtime_print (bool, optional (default=True)) – Will print results for ASR.

  • **kwargs (vector argument) – vector argument pass to malaya_speech.streaming.pyaudio.Audio interface.

Returns

result

Return type

List[dict]
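
A streaming sketch, assuming `vad` and `asr` are loaded malaya_speech VAD and ASR models / pipelines; stopping the stream (for example with Ctrl+C) returns the collected results.

>>> from malaya_speech.streaming import pyaudio
>>> results = pyaudio.stream(vad_model=vad, asr_model=asr, sample_rate=16000)
>>> results[0]  # one dict per accepted subsample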

malaya_speech.streaming.torchaudio#

https://pytorch.org/audio/stable/tutorials/online_asr_tutorial.html

class malaya_speech.streaming.torchaudio.ContextCacher(segment_length, context_length)[source]#

Cache the end of input data and prepend the next input data with it.

Parameters
  • segment_length (int) – The size of main segment. If the incoming segment is shorter, then the segment is padded.

  • context_length (int) – The size of the context, cached and appended.

malaya_speech.streaming.torchaudio.stream(src, vad_model=None, asr_model=None, classification_model=None, format=None, option=None, buffer_size=4096, sample_rate=16000, segment_length=2560, num_padding_frames=20, ratio=0.75, min_length=0.1, max_length=10.0, realtime_print=True, **kwargs)[source]#

Stream an audio using torchaudio library.

Parameters
  • vad_model (object, optional (default=None)) – vad model / pipeline.

  • asr_model (object, optional (default=None)) – ASR model / pipeline, will transcribe each subsamples realtime.

  • classification_model (object, optional (default=None)) – classification pipeline, will classify each subsamples realtime.

  • format (str, optional (default=None)) – Supported format for torchaudio.io.StreamReader, https://pytorch.org/audio/stable/generated/torchaudio.io.StreamReader.html#torchaudio.io.StreamReader

  • option (dict, optional (default=None)) – Supported option for torchaudio.io.StreamReader, https://pytorch.org/audio/stable/generated/torchaudio.io.StreamReader.html#torchaudio.io.StreamReader

  • buffer_size (int, optional (default=4096)) – Supported buffer_size for torchaudio.io.StreamReader, buffer size in byte. Used only when src is file-like object, https://pytorch.org/audio/stable/generated/torchaudio.io.StreamReader.html#torchaudio.io.StreamReader

  • sample_rate (int, optional (default = 16000)) – output sample rate.

  • segment_length (int, optional (default=2560)) – usually derived from asr_model.segment_length * asr_model.hop_length; size of audio chunks, actual size in terms of seconds is segment_length / sample_rate.

  • num_padding_frames (int, optional (default=20)) – size of acceptable padding frames for queue.

  • ratio (float, optional (default = 0.75)) – if 75% of the queue is positive, assumed it is a voice activity.

  • min_length (float, optional (default=0.1)) – minimum length (second) to accept a subsample.

  • max_length (float, optional (default=10.0)) – maximum length (second) to accept a subsample.

  • realtime_print (bool, optional (default=True)) – Will print results for ASR.

  • **kwargs (vector argument) – vector argument pass to malaya_speech.streaming.pyaudio.Audio interface.

Returns

result

Return type

List[dict]

malaya_speech.streaming.torchaudio.stream_rnnt(src, asr_model=None, classification_model=None, format=None, option=None, beam_width=10, buffer_size=4096, sample_rate=16000, segment_length=2560, context_length=640, realtime_print=True, **kwargs)[source]#

malaya_speech.utils.aligner#

class malaya_speech.utils.aligner.Point(token_index, time_index, score)[source]#
class malaya_speech.utils.aligner.Segment(label, start, end, score)[source]#
malaya_speech.utils.aligner.put_comma(alignment, min_threshold=0.5)[source]#

Put comma in alignment from force alignment model.

Parameters
  • alignment (List[Dict[text, start, end]]) –

  • min_threshold (float, optional (default=0.5)) – minimum threshold in term of seconds to assume a comma.

Returns

result

Return type

List[str]

malaya_speech.utils.aligner.plot_alignments(alignment, subs_alignment, words_alignment, waveform, separator=' ', sample_rate=16000, figsize=(16, 9), plot_score_char=False, plot_score_word=True)[source]#

plot alignment.

Parameters
  • alignment (np.array) – usually alignment output.

  • subs_alignment (list) – usually chars_alignment or subwords_alignment output.

  • words_alignment (list) – usually words_alignment output.

  • waveform (np.array) – input audio.

  • separator (str, optional (default=' ')) – separator between words, only useful if subs_alignment is character based.

  • sample_rate (int, optional (default=16000)) –

  • figsize (tuple, optional (default=(16, 9))) – figure size for matplotlib figsize.

  • plot_score_char (bool, optional (default=False)) – plot score on top of character plots.

  • plot_score_word (bool, optional (default=True)) – plot score on top of word plots.

malaya_speech.utils.astype#

malaya_speech.utils.astype.to_ndarray(array)[source]#

Change list / tuple / bytes into np.array

Parameters

array (list / tuple / bytes) –

Returns

result

Return type

np.array

malaya_speech.utils.astype.to_byte(array)[source]#

Change list / tuple / np.array into bytes

Parameters

array (list / tuple / np.array) –

Returns

result

Return type

bytes

malaya_speech.utils.astype.float_to_int(array, type=<class 'numpy.int16'>, divide_max_abs=True)[source]#

Change np.array float32 / float64 into np.int16

Parameters
  • array (np.array) –

  • type (np.int16) –

Returns

result

Return type

np.array

malaya_speech.utils.astype.int_to_float(array, type=<class 'numpy.float32'>)[source]#

Change np.array int16 into np.float32

Parameters
  • array (np.array) –

  • type (np.float32) –

Returns

result

Return type

np.array
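
A round-trip sketch between float32 and int16 audio buffers; the random signal is illustrative.

>>> import numpy as np
>>> from malaya_speech.utils import astype
>>> y = np.random.uniform(-1, 1, size=16000).astype(np.float32)
>>> y_int16 = astype.float_to_int(y)        # float32 in [-1, 1] -> int16
>>> y_float = astype.int_to_float(y_int16)  # int16 -> float32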

malaya_speech.utils.char#

malaya_speech.utils.char.strip_ids(ids, ids_to_strip)[source]#

Strip ids_to_strip from the end of ids.

malaya_speech.utils.char.generate_vocab(strings)[source]#

Generate character vocab sorted based on frequency.

Parameters

strings (List[str]) –

Returns

result

Return type

List[str]

malaya_speech.utils.char.encode(string, add_eos=True, add_blank=False, lookup=None)[source]#

Encode string to integer representation based on ascii table or lookup variable.

Parameters
  • string (str) –

  • add_eos (bool, optional (default=True)) – add EOS token at the end of the encoded sequence.

  • add_blank (bool, optional (default=False)) – add BLANK token at the start of the encoded sequence; this is for transducer / transformer based models.

  • lookup (List[str], optional (default=None)) – list of unique strings.

Returns

result

Return type

List[int]

malaya_speech.utils.char.decode(ids, lookup=None)[source]#

Decode integer representation to string based on ascii table or lookup variable.

Parameters
  • ids (List[int]) –

  • lookup (List[str], optional (default=None)) – list of unique strings.

Returns

result

Return type

str
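
An encode / decode round-trip sketch using the default ASCII table.

>>> from malaya_speech.utils import char
>>> ids = char.encode('hello', add_eos=True)
>>> char.decode(ids)  # recovers the original string (EOS handling depends on the lookup)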

malaya_speech.utils.combine#

malaya_speech.utils.combine.without_silent(frames, threshold_to_stop=0.1, silent_trail=500)[source]#

Group multiple frames based on label and threshold to stop.

Parameters
  • frames (List[Tuple[Frame, label]]) – Output from VAD.

  • threshold_to_stop (float, optional (default = 0.1)) – If threshold_to_stop is 0.1, consecutive samples with the same label must span at least 0.1 second.

  • silent_trail (int, optional (default = 500)) – if silence is detected, append the first N frames and the last N frames.

Returns

result

Return type

np.array

malaya_speech.utils.featurization#

malaya_speech.utils.featurization.normalize_signal(signal, gain=None)[source]#

Normalize float32 signal to [-1, 1] range

malaya_speech.utils.featurization.extract_pitch(y, hop_size=256, sr=22050, bad_f0=5.0, zero_value=- 10.0)[source]#

Originally from https://github.com/yl4579/PitchExtractor/blob/main/meldataset.py

malaya_speech.utils.generator#

malaya_speech.utils.generator.frames(audio, frame_duration_ms=30, sample_rate=16000, append_ending_trail=True)[source]#

Generates audio frames from audio. Takes the desired frame duration in milliseconds, the audio, and the sample rate.

Parameters
  • audio (np.array) –

  • frame_duration_ms (int, optional (default=30)) –

  • sample_rate (int, optional (default=16000)) –

  • append_ending_trail (bool, optional (default=True)) – if True, the last trailing frame will be appended even though it may be shorter than frame_duration_ms.

Returns

result

Return type

List[malaya_speech.model.frame.Frame]
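
A framing sketch, for example to feed a VAD model; the random audio array is illustrative.

>>> import numpy as np
>>> from malaya_speech.utils.generator import frames
>>> y = np.random.uniform(-1, 1, size=16000 * 3)  # hypothetical 3 seconds at 16 kHz
>>> chunks = frames(y, frame_duration_ms=30, sample_rate=16000)
>>> len(chunks)  # roughly 100 frames of 30 ms, plus any ending trail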

malaya_speech.utils.generator.mel_sampling(audio, frame_duration_ms=1200, overlap_ms=200, sample_rate=16000)[source]#

Generates audio frames from audio. This is for melspectrogram generative model. Takes the desired frame duration in milliseconds, the audio, and the sample rate.

Parameters
  • audio (np.array) –

  • frame_duration_ms (int, optional (default=1200)) –

  • overlap_ms (int, optional (default=200)) –

  • sample_rate (int, optional (default=16000)) –

Returns

result

Return type

List[np.array]

malaya_speech.utils.generator.combine_mel_sampling(samples, overlap_ms=200, sample_rate=16000, padded_ms=50)[source]#

To combine results from mel_sampling, output from melspectrogram generative model.

Parameters
  • samples (List[np.array]) –

  • overlap_ms (int, optional (default=200)) –

  • sample_rate (int, optional (default=16000)) –

Returns

result

Return type

List[np.array]

malaya_speech.utils.griffin_lim#

malaya_speech.utils.griffin_lim.from_mel(mel_, sr=16000, n_fft=2048, n_iter=32, win_length=1000, hop_length=100)[source]#

Change melspectrogram into waveform using Librosa.

Parameters

spectrogram (np.array) –

Returns

result

Return type

np.array

malaya_speech.utils.griffin_lim.from_mel_vocoder(mel, sr=22050, n_fft=1024, n_mels=256, fmin=80, fmax=7600, n_iter=32, win_length=None, hop_length=256)[source]#

Change melspectrogram into waveform using Librosa.

Parameters

spectrogram (np.array) –

Returns

result

Return type

np.array

malaya_speech.utils.group#

malaya_speech.utils.group.combine_frames(frames)[source]#

Combine multiple frames into one frame.

Parameters

frames (List[Frame]) –

Returns

result

Return type

Frame

malaya_speech.utils.group.group_frames(frames)[source]#

Group multiple frames based on label.

Parameters

frames (List[Tuple[Frame, label]]) –

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.utils.group.group_frames_threshold(frames, threshold_to_stop=0.3)[source]#

Group multiple frames based on label and threshold to stop.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • threshold_to_stop (float, optional (default = 0.3)) – If threshold_to_stop is 0.3, consecutive samples with the same label must span at least 0.3 second.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.utils.io#

malaya_speech.utils.io.write_srt(transcript, file)[source]#

Write list of transcription into SRT format.

Parameters
  • transcript (List[dict]) – list of {‘start’, ‘end’, ‘text’}

  • file (typing.TextIO) –

malaya_speech.utils.io.write_vtt(transcript, file)[source]#

Write list of transcription into VTT format.

Parameters
  • transcript (List[dict]) – list of {‘start’, ‘end’, ‘text’}

  • file (typing.TextIO) –

malaya_speech.utils.io.write_tsv(transcript, file)[source]#

Write list of transcription into TSV format.

Parameters
  • transcript (List[dict]) – list of {‘start’, ‘end’, ‘text’}

  • file (typing.TextIO) –

malaya_speech.utils.padding#

malaya_speech.utils.padding.sequence_1d(seq, maxlen=None, padding='post', pad_int=0, return_len=False)[source]#

Pad a sequence of 1D arrays into a 2D array.

Parameters
  • seq (List[List[int]]) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If pre, will add 0 on the starting side, else add 0 on the end side.

  • pad_int (int, optional (default=0)) – padding value.

Returns

result

Return type

np.array
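
A padding sketch for batching variable-length sequences.

>>> from malaya_speech.utils.padding import sequence_1d
>>> padded = sequence_1d([[1, 2, 3], [4, 5]], pad_int=0)
>>> padded.shape  # (2, 3); the shorter row is post-padded with 0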

malaya_speech.utils.padding.sequence_nd(seq, maxlen=None, padding='post', pad_val=0.0, dim=1, return_len=False)[source]#

Pad a sequence of N-dimensional arrays into an (N+1)-dimensional array.

Parameters
  • seq (list of nd array) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If pre, will add 0 on the starting side, else add 0 on the end side.

  • pad_val (float, optional (default=0.0)) – padding value.

  • dim (int, optional (default=1)) –

Returns

result

Return type

np.array

malaya_speech.utils.padding.tf_sequence_nd(seq, maxlen=None, padding='post', pad_val=0.0, dim=1, return_len=False)[source]#

Pad a sequence of N-dimensional arrays into an (N+1)-dimensional array.

Parameters
  • seq (list of nd array) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If pre, will add 0 on the starting side, else add 0 on the end side.

  • pad_val (float, optional (default=0.0)) – padding value.

  • dim (int, optional (default=1)) –

Returns

result

Return type

np.array

malaya_speech.utils.read#

malaya_speech.utils.read.resample(data, old_samplerate, new_samplerate)[source]#

Resample signal.

Parameters
  • data (np.array) –

  • old_samplerate (int) – old sample rate.

  • new_samplerate (int) – new sample rate.

Returns

result

Return type

data

malaya_speech.utils.read.load(file, sr=16000, scale=True)[source]#

Read sound file, any format supported by soundfile.read and torchaudio.load

Parameters
  • file (str) –

  • sr (int, (default=16000)) – new sample rate. If input sample rate is not same, will resample automatically.

  • scale (bool, (default=True)) – Scale to -1 and 1.

Returns

result

Return type

(y, sr)
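
A loading and resampling sketch; the file path is hypothetical.

>>> from malaya_speech.utils.read import load, resample
>>> y, sr = load('speech.wav', sr=16000)  # resamples to 16 kHz if necessary
>>> y_8k = resample(y, old_samplerate=sr, new_samplerate=8000)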

malaya_speech.utils.split#

malaya_speech.utils.split.split_vad(frames, n=3, negative_threshold=0.1)[source]#

Split a sample into multiple samples based on n negative VAD segments.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • n (int, optional (default=3)) – number of negative VAD segments to allow in one subsample.

  • negative_threshold (float, optional (default = 0.1)) – If negative_threshold is 0.1, negative samples must span at least 0.1 second to be counted.

Returns

result

Return type

List[Frame]

malaya_speech.utils.split.split_vad_duration(frames, max_duration=5.0, negative_threshold=0.1)[source]#

Split a sample into multiple samples based on the maximum duration of voice activities.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • max_duration (float, optional (default = 5.0)) – Maximum duration to assume one sample combined from voice activities.

  • negative_threshold (float, optional (default = 0.1)) – If negative_threshold is 0.1, negative samples must span at least 0.1 second to be counted.

Returns

result

Return type

List[Frame]

malaya_speech.utils.subword#

malaya_speech.utils.subword.generate_tokenizer(strings, target_vocab_size=1024, max_subword_length=4, max_corpus_chars=None, reserved_tokens=None)[source]#

Build a subword dictionary.

malaya_speech.utils.subword.save(tokenizer, path)[source]#

Save subword dictionary to a text file.

malaya_speech.utils.subword.load(path)[source]#

Load text file into subword dictionary.

malaya_speech.utils.subword.encode(tokenizer, string, add_blank=False)[source]#

Encode string to integer representation based on the subword tokenizer vocab.

Parameters
  • tokenizer (object) – tokenizer object

  • string (str) –

  • add_blank (bool, optional (default=False)) – add BLANK token at the start of the encoded sequence; this is for transducer / transformer based models.

Returns

result

Return type

List[int]

malaya_speech.utils.subword.decode(tokenizer, ids)[source]#

Decode integer representation to string based on tokenizer vocab.

Parameters
  • tokenizer (object) – tokenizer object

  • ids (List[int]) –

Returns

result

Return type

str

malaya_speech.utils.subword.decode_multilanguage(tokenizers, ids)[source]#

Decode integer representation to string using list of tokenizer objects.

Parameters
  • tokenizers (List[object]) – List of tokenizer objects.

  • ids (List[int]) –

Returns

result

Return type

str

malaya_speech.utils.subword.load_sentencepiece(model_file)[source]#
Parameters

model_file (str) – sentencepiece model file.

Returns

result

Return type

sentencepiece.SentencePieceProcessor

malaya_speech.utils.tf_featurization#

malaya_speech.utils.torch_featurization#

https://github.com/pytorch/audio/blob/main/examples/asr/librispeech_conformer_rnnt/transforms.py

malaya_speech.utils.torch_featurization.conformer_rnnt_model(*, input_dim, encoding_dim, time_reduction_stride, conformer_input_dim, conformer_ffn_dim, conformer_num_layers, conformer_num_heads, conformer_depthwise_conv_kernel_size, conformer_dropout, num_symbols, symbol_embedding_dim, num_lstm_layers, lstm_hidden_dim, lstm_layer_norm, lstm_layer_norm_epsilon, lstm_dropout, joiner_activation)[source]#

Builds Conformer-based recurrent neural network transducer (RNN-T) model.

Parameters
  • input_dim (int) – dimension of input sequence frames passed to transcription network.

  • encoding_dim (int) – dimension of transcription- and prediction-network-generated encodings passed to joint network.

  • time_reduction_stride (int) – factor by which to reduce length of input sequence.

  • conformer_input_dim (int) – dimension of Conformer input.

  • conformer_ffn_dim (int) – hidden layer dimension of each Conformer layer’s feedforward network.

  • conformer_num_layers (int) – number of Conformer layers to instantiate.

  • conformer_num_heads (int) – number of attention heads in each Conformer layer.

  • conformer_depthwise_conv_kernel_size (int) – kernel size of each Conformer layer’s depthwise convolution layer.

  • conformer_dropout (float) – Conformer dropout probability.

  • num_symbols (int) – cardinality of set of target tokens.

  • symbol_embedding_dim (int) – dimension of each target token embedding.

  • num_lstm_layers (int) – number of LSTM layers to instantiate.

  • lstm_hidden_dim (int) – output dimension of each LSTM layer.

  • lstm_layer_norm (bool) – if True, enables layer normalization for LSTM layers.

  • lstm_layer_norm_epsilon (float) – value of epsilon to use in LSTM layer normalization layers.

  • lstm_dropout (float) – LSTM dropout probability.

  • joiner_activation (str) – activation function to use in the joiner. Must be one of (“relu”, “tanh”). (Default: “relu”)

Returns

Conformer RNN-T model.

Return type

RNNT

class malaya_speech.utils.torch_featurization.FunctionalModule(functional)[source]#
forward(input)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

class malaya_speech.utils.torch_featurization.GlobalStatsNormalization(global_stats_path)[source]#
forward(input)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

class malaya_speech.utils.torch_featurization.FeatureExtractor(global_stats_path, pad=False)[source]#
forward(input)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

malaya_speech.utils.torch_featurization.separate_sources(model, mix, segment=10.0, overlap=0.1, device=None)[source]#

Apply model to a given mixture. Uses fades and adds segments together to process the mixture segment by segment.

Parameters
  • segment (int) – segment length in seconds

  • device (torch.device, str, or None) – if provided, device on which to execute the computation, otherwise mix.device is assumed. When device is different from mix.device, only local computations will be on device, while the entire tracks will be stored on mix.device.

malaya_speech.age_detection#

malaya_speech.age_detection.available_model()[source]#

List available age detection deep models.

malaya_speech.age_detection.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load age detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.age_detection.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. A quantized model is not necessarily faster; it depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.diarization#

malaya_speech.diarization.streaming(vector, streaming_model, add_speaker_prefix=True)[source]#

Streaming speaker diarization.

Parameters
  • vector (np.array) – np.array or malaya_speech.model.frame.Frame.

  • streaming_model (Callable) – must have streaming method.

  • add_speaker_prefix (bool, optional (default=True)) – if True, will add ‘speaker ‘ as prefix.

Returns

result

Return type

str

malaya_speech.diarization.speaker_similarity(vad_results, speaker_vector, similarity_threshold=0.8, agg_function=<function mean>, return_embedding=False)[source]#

Speaker diarization using L2-Norm similarity.

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_vector (callable) – speaker vector object.

  • similarity_threshold (float, optional (default=0.8)) – if the current voice-activity sample has a similarity of at least 0.8, it is assumed to come from the same speaker.

Returns

result

Return type

List[Tuple[Frame, label]]
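
A diarization sketch, assuming vad_results came from a VAD pipeline (for example malaya_speech.utils.generator.frames plus a VAD model) and speaker_vector is a loaded speaker-vector model; both loaders are outside this section.

>>> from malaya_speech import diarization
>>> result = diarization.speaker_similarity(vad_results, speaker_vector, similarity_threshold=0.8)
>>> result[:2]  # list of (Frame, speaker label) tuples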

malaya_speech.diarization.clustering(vad_results, speaker_vector, model, norm_function=<function l2_normalize>, log_distance_metric=None, return_embedding=False)[source]#

Speaker diarization using any clustering model.

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_vector (callable) – speaker vector object.

  • model (callable) – Any unsupervised clustering model. Required fit_predict or apply or predict method.

  • norm_function (Callable, optional(default=malaya_speech.utils.dist.l2_normalize)) – normalize function for speaker vectors.

  • log_distance_metric (str, optional (default=None)) – post-distance norm in log-scale metrics. This parameter is necessary for models that require a square array input. Common values are one of [‘cosine’, ‘angular’].

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.diarization.combine(list_results, speaker_vector, similarity_threshold=0.8, agg_function=<function mean>, sortby_pagerank=True)[source]#

Combine multiple diarization results into a single diarization result using PageRank. Requires the malaya and networkx libraries.

Parameters
  • list_results (List[List[Tuple[Frame, label]]]) – results from multiple diarization runs.

  • speaker_vector (callable) – speaker vector object.

  • similarity_threshold (float, optional (default=0.8)) – if the current voice-activity sample has a similarity of at least 0.8, it is assumed to come from one of the existing speakers.

  • agg_function (Callable, optional (default=np.mean)) – aggregate function to aggregate when we have multiple samples for the same speaker.

  • sortby_pagerank (bool, optional (default=True)) – sort speaker names using PageRank score. This requires malaya to be installed.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.emotion#

malaya_speech.emotion.available_model()[source]#

List available emotion detection deep models.

malaya_speech.emotion.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load emotion detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.emotion.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. A quantized model is not necessarily faster; it depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.force_alignment.ctc#

malaya_speech.force_alignment.ctc.available_transformer()[source]#

List available Encoder-CTC Aligner models.

malaya_speech.force_alignment.ctc.available_huggingface()[source]#

List available HuggingFace Malaya-Speech Aligner models.

malaya_speech.force_alignment.ctc.transformer(model='hubert-conformer', quantized=False, **kwargs)[source]#

Load Encoder-CTC ASR model.

Parameters
  • model (str, optional (default='hubert-conformer')) – Check available models at malaya_speech.force_alignment.ctc.available_transformer().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. A quantized model is not necessarily faster; it depends on the machine.

Returns

result

Return type

malaya_speech.model.wav2vec.Wav2Vec2_Aligner class

malaya_speech.force_alignment.ctc.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/wav2vec2-xls-r-300m-mixed')) – Check available models at malaya_speech.force_alignment.ctc.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.model.huggingface.Aligner class

malaya_speech.force_alignment.seq2seq#

malaya_speech.force_alignment.seq2seq.huggingface(model='mesolitica/finetune-whisper-base-ms-singlish-v2', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/finetune-whisper-base-ms-singlish-v2')) – Check available models at malaya_speech.force_alignment.seq2seq.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.model.huggingface.Seq2SeqAligner class

malaya_speech.force_alignment.transducer#

malaya_speech.force_alignment.transducer.available_transformer()[source]#

List available Encoder-Transducer Aligner models.

malaya_speech.force_alignment.transducer.transformer(model='conformer-transducer', quantized=False, **kwargs)[source]#

Load Encoder-Transducer Aligner model.

Parameters
  • model (str, optional (default='conformer-transducer')) – Check available models at malaya_speech.force_alignment.transducer.available_transformer().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.transducer.TransducerAligner class

malaya_speech.force_alignment.transducer.pt_transformer(model='mesolitica/conformer-base', **kwargs)[source]#

Load Encoder-Transducer Aligner model using Pytorch.

Parameters

model (str, optional (default='mesolitica/conformer-base')) – Check available models at malaya_speech.force_alignment.transducer.available_pt_transformer().

Returns

result

Return type

malaya_speech.torch_model.torchaudio.ForceAlignment class

malaya_speech.gender#

malaya_speech.gender.available_model()[source]#

List available gender detection deep models.

malaya_speech.gender.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load gender detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.gender.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.is_clean#

malaya_speech.is_clean.available_nemo()[source]#

List available Nvidia Nemo is-clean models.

malaya_speech.is_clean.nemo(model='huseinzol05/nemo-is-clean-speakernet', **kwargs)[source]#

Load Nvidia Nemo is-clean model. Trained on 100, 200, and 300 ms frames.

Parameters

model (str, optional (default='huseinzol05/nemo-is-clean-speakernet')) – Check available models at malaya_speech.is_clean.available_nemo().

Returns

result

Return type

malaya_speech.torch_model.nemo.Classification class
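
A usage sketch; the frame length follows the note above (100-300 ms), and predict is assumed to behave like the other classification wrappers in this API:

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    model = malaya_speech.is_clean.nemo(model='huseinzol05/nemo-is-clean-speakernet')
    # pass short frames as trained; a 300 ms frame at 16 kHz is 4800 samples
    frame = y[:4800]
    print(model.predict([frame]))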

malaya_speech.language_detection#

malaya_speech.language_detection.available_model()[source]#

List available language detection deep models.

malaya_speech.language_detection.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load language detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.language_detection.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.language_model#

malaya_speech.language_model.available_kenlm()[source]#

List available KenLM Language Model.

malaya_speech.language_model.available_gpt2()[source]#

List available GPT2 Language Model.

malaya_speech.language_model.available_mlm()[source]#

List available MLM Language Model.

malaya_speech.language_model.kenlm(model='dump-combined', **kwargs)[source]#

Load KenLM language model.

Parameters

model (str, optional (default='dump-combined')) – Check available models at malaya_speech.language_model.available_kenlm().

Returns

result

Return type

str
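
Since kenlm() returns a file path, one way to use it is to score text directly with the kenlm Python package (pip install kenlm); feeding the same file into a CTC beam-search decoder is the more common use in ASR:

    import malaya_speech
    import kenlm

    # returns a local path to the downloaded language model file
    lm_path = malaya_speech.language_model.kenlm(model='dump-combined')

    lm = kenlm.Model(lm_path)
    print(lm.score('saya suka makan ayam', bos=True, eos=True))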

malaya_speech.language_model.gpt2(model='mesolitica/gpt2-117m-bahasa-cased', force_check=True, **kwargs)[source]#

Load GPT2 language model.

Parameters
  • model (str, optional (default='mesolitica/gpt2-117m-bahasa-cased')) – Check available models at malaya_speech.language_model.available_gpt2().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya.torch_model.gpt2_lm.LM class

malaya_speech.language_model.mlm(model='mesolitica/bert-base-standard-bahasa-cased', force_check=True, **kwargs)[source]#

Load Masked language model.

Parameters
  • model (str, optional (default='mesolitica/bert-base-standard-bahasa-cased')) – Check available models at malaya_speech.language_model.available_mlm().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.torch_model.mask_lm.LM class

malaya_speech.multispeaker_separation#

malaya_speech.multispeaker_separation.available_deep_wav()[source]#

List available FastSep models trained on raw 8k wav.

malaya_speech.multispeaker_separation.deep_wav(model='fastsep-4', quantized=False, **kwargs)[source]#

Load FastSep model, trained on raw 8k wav using SISNR PIT loss.

Parameters
  • model (str, optional (default='fastsep-4')) – Check available models at malaya_speech.multispeaker_separation.available_deep_wav().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.tf.Split class
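
A usage sketch; the model expects raw 8 kHz audio as noted above. The sr argument to malaya_speech.load and the shape of the returned separated tracks are assumptions:

    import malaya_speech

    # hypothetical mixed-speaker recording, resampled to 8 kHz
    y, sr = malaya_speech.load('mixed.wav', sr=8000)

    model = malaya_speech.multispeaker_separation.deep_wav(model='fastsep-4')
    # assumed: returns one waveform per separated speaker
    separated = model.predict(y)
    print(len(separated))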

malaya_speech.noise_reduction#

malaya_speech.noise_reduction.available_model()[source]#

List available Noise Reduction deep learning models.

malaya_speech.noise_reduction.deep_model(model='resnet-unet', quantized=False, **kwargs)[source]#

Load Noise Reduction deep learning model.

Parameters
  • model (str, optional (default='resnet-unet')) – Check available models at malaya_speech.noise_reduction.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.tf.UNET_STFT class
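
A usage sketch; the output key names ('voice', 'noise') are assumptions, so inspect the dict returned by the UNET_STFT class to confirm:

    import malaya_speech

    y, sr = malaya_speech.load('noisy.wav')  # hypothetical input file

    model = malaya_speech.noise_reduction.deep_model(model='resnet-unet')
    output = model.predict(y)
    # assumed output keys
    voice, noise = output['voice'], output['noise']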

malaya_speech.speaker_change#

malaya_speech.speaker_change.available_model()[source]#

List available speaker change deep models.

malaya_speech.speaker_change.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load speaker change deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_change.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_change.split_activities(vad_results, speaker_change_results, speaker_change_threshold=0.5, sr=16000, ignore_not_activity=True)[source]#

Split VAD results based on the speaker change threshold; worst-case O(N^2).

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_change_results (List[Tuple[Frame, float]], optional (default=None)) – results from the speaker change module; must be float probabilities.

  • speaker_change_threshold (float, optional (default=0.5)) – a single voice activity sample can contain more than one speaker; split it using this threshold.

  • sr (int, optional (default=16000)) – sample rate; classification models in malaya-speech use 16k.

  • ignore_not_activity (bool, optional (default=True)) – If True, will ignore frames where the VAD result is False, else will try to split them as well.

Returns

result

Return type

List[Tuple[Frame, label]]
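
A sketch of how the two inputs are typically built by running a VAD model and a speaker change model over the same frames; the frame helper malaya_speech.generator.frames and the probability column index are assumptions:

    import malaya_speech

    y, sr = malaya_speech.load('conversation.wav')  # hypothetical recording

    # assumed helper: split the signal into 50 ms Frame objects
    frames = list(malaya_speech.generator.frames(y, 50, sr))

    vad = malaya_speech.vad.deep_model()
    speaker_change = malaya_speech.speaker_change.deep_model()

    vad_results = [(frame, vad.predict([frame])[0]) for frame in frames]
    # assumed: column 1 of predict_proba is the speaker-change probability
    change_results = [
        (frame, speaker_change.predict_proba([frame])[0, 1]) for frame in frames
    ]

    splitted = malaya_speech.speaker_change.split_activities(
        vad_results, change_results, speaker_change_threshold=0.5
    )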

malaya_speech.speaker_count#

malaya_speech.speaker_count.available_nemo()[source]#

List available Nvidia Nemo speaker count models.

malaya_speech.speaker_count.nemo(model='huseinzol05/nemo-speaker-count-speakernet', **kwargs)[source]#

Load Nvidia Nemo speaker count model. Trained on 300 ms frames.

Parameters

model (str, optional (default='huseinzol05/nemo-speaker-count-speakernet')) – Check available models at malaya_speech.speaker_count.available_nemo().

Returns

result

Return type

malaya_speech.torch_model.nemo.Classification class

malaya_speech.speaker_overlap#

malaya_speech.speaker_overlap.available_model()[source]#

List available speaker overlap deep models.

malaya_speech.speaker_overlap.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load speaker overlap deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_overlap.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_vector#

malaya_speech.speaker_vector.available_model()[source]#

List available speaker vector deep models using Tensorflow.

malaya_speech.speaker_vector.available_nemo()[source]#

List available Nvidia Nemo Speaker vector models.

malaya_speech.speaker_vector.available_huggingface()[source]#

List available HuggingFace Speaker vector models.

malaya_speech.speaker_vector.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load Speaker2Vec model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_vector.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_vector.nemo(model='huseinzol05/nemo-ecapa-tdnn', **kwargs)[source]#

Load Nemo Speaker verification model.

Parameters

model (str, optional (default='huseinzol05/nemo-ecapa-tdnn')) – Check available models at malaya_speech.speaker_vector.available_nemo().

Returns

result

Return type

malaya_speech.torch_model.nemo.SpeakerVector class

malaya_speech.speaker_vector.huggingface(model='microsoft/wavlm-base-plus-sv', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='microsoft/wavlm-base-plus-sv')) – Check available models at malaya_speech.speaker_vector.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.torch_model.huggingface.XVector class
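
A usage sketch comparing two speakers with cosine similarity over the [B, D] embeddings returned by vectorize; the input files are hypothetical:

    import malaya_speech
    import numpy as np

    y1, _ = malaya_speech.load('speaker_a.wav')  # hypothetical files
    y2, _ = malaya_speech.load('speaker_b.wav')

    model = malaya_speech.speaker_vector.huggingface(model='microsoft/wavlm-base-plus-sv')
    vectors = model.vectorize([y1, y2])  # [2, D]

    a, b = vectors[0], vectors[1]
    cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    print(cosine)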

malaya_speech.speech_enhancement#

malaya_speech.speech_enhancement.available_deep_masking()[source]#

List available Speech Enhancement STFT masking deep learning models.

malaya_speech.speech_enhancement.available_deep_enhance()[source]#

List available Speech Enhancement UNET Waveform sampling deep learning models.

malaya_speech.speech_enhancement.deep_masking(model='resnet-unet', quantized=False, **kwargs)[source]#

Load Speech Enhancement STFT UNET masking deep learning model.

Parameters
  • model (str, optional (default='resnet-unet')) – Check available models at malaya_speech.speech_enhancement.available_deep_masking().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.unet.UNETSTFT class

malaya_speech.speech_enhancement.deep_enhance(model='unet', quantized=False, **kwargs)[source]#

Load Speech Enhancement UNET Waveform sampling deep learning model.

Parameters
  • model (str, optional (default='unet')) – Check available models at malaya_speech.speech_enhancement.available_deep_enhance().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.unet.UNET1D class
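
A usage sketch; whether deep_enhance returns the waveform directly or wraps it in a dict is an assumption, so check the UNET1D class:

    import malaya_speech

    y, sr = malaya_speech.load('noisy.wav')  # hypothetical input file

    model = malaya_speech.speech_enhancement.deep_enhance(model='unet')
    # assumed: returns the enhanced waveform at the same length as the input
    enhanced = model.predict(y)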

malaya_speech.speechsplit_conversion#

malaya_speech.speechsplit_conversion.available_deep_conversion(f0_mode='pysptk')[source]#

List available Voice Conversion models.

Parameters

f0_mode (str, optional (default='pysptk')) –

Supported F0 conversion mode. Allowed values:

malaya_speech.speechsplit_conversion.deep_conversion(model='fastspeechsplit-v2-vggvox-v2', f0_mode='pysptk', quantized=False, **kwargs)[source]#

Load Voice Conversion model.

Parameters
  • model (str, optional (default='fastspeechsplit-v2-vggvox-v2')) – Check available models at malaya_speech.speechsplit_conversion.available_deep_conversion(f0_mode='{f0_mode}')

  • f0_mode (str, optional (default='pysptk')) –

    Supported F0 conversion mode. Allowed values:

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.splitter.FastSpeechSplit class

malaya_speech.stack#

malaya_speech.stack.classification_stack(models)[source]#

Stacking for classification models. All models should belong to the same classification domain.

Parameters

models (List[Callable]) – list of models.

Returns

result

Return type

malaya_speech.stack.Stack class
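
A usage sketch stacking two emotion models; the second model name is only an example and should be one of the names listed by malaya_speech.emotion.available_model():

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    # both models must come from the same classification domain
    model_a = malaya_speech.emotion.deep_model(model='vggvox-v2')
    model_b = malaya_speech.emotion.deep_model(model='deep-speaker')  # example name

    stacked = malaya_speech.stack.classification_stack([model_a, model_b])
    print(stacked.predict([y]))
    print(stacked.predict_proba([y]))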

malaya_speech.model.stack.Stack#

class malaya_speech.stack.Stack[source]#
predict_proba(inputs, aggregate=<function gmean>)[source]#

Stacking for predictive models, will return probability.

Parameters
  • inputs (List[np.array]) –

  • aggregate (Callable, optional (default=scipy.stats.mstats.gmean)) – Aggregate function.

Returns

result

Return type

np.array

predict(inputs, aggregate=<function gmean>)[source]#

Stacking for predictive models, will return labels.

Parameters
  • inputs (List[np.array]) –

  • aggregate (Callable, optional (default=scipy.stats.mstats.gmean)) – Aggregate function.

Returns

result

Return type

List[str]

malaya_speech.stt.ctc#

malaya_speech.stt.ctc.available_transformer()[source]#

List available Encoder-CTC ASR models.

malaya_speech.stt.ctc.available_huggingface()[source]#

List available HuggingFace CTC ASR models.

malaya_speech.stt.ctc.transformer(model='hubert-conformer', quantized=False, **kwargs)[source]#

Load Encoder-CTC ASR model.

Parameters
  • model (str, optional (default='hubert-conformer')) – Check available models at malaya_speech.stt.ctc.available_transformer().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.wav2vec.Wav2Vec2_CTC class

malaya_speech.stt.ctc.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/wav2vec2-xls-r-300m-mixed')) – Check available models at malaya_speech.stt.ctc.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.torch_model.huggingface.CTC class
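
A transcription sketch; the predict method name on the CTC wrapper is an assumption (it may also expose beam decoding with a language model):

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    model = malaya_speech.stt.ctc.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed')
    # assumed greedy decoding over a batch of audio
    print(model.predict([y]))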

malaya_speech.stt.seq2seq#

malaya_speech.stt.seq2seq.available_huggingface()[source]#

List available HuggingFace Seq2Seq ASR models.

malaya_speech.stt.seq2seq.available_whisper()[source]#

List available OpenAI Whisper ASR models.

malaya_speech.stt.seq2seq.huggingface(model='mesolitica/finetune-whisper-base-ms-singlish-v2', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/finetune-whisper-base-ms-singlish-v2')) – Check available models at malaya_speech.stt.seq2seq.available_huggingface().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

malaya_speech.model.huggingface.Seq2Seq class

malaya_speech.stt.seq2seq.whisper(model='mesolitica/finetune-whisper-base-ms-singlish-v2', force_check=True, **kwargs)[source]#

Load Finetuned models from HuggingFace.

Parameters
  • model (str, optional (default='mesolitica/finetune-whisper-base-ms-singlish-v2')) – Check available models at malaya_speech.stt.seq2seq.available_whisper().

  • force_check (bool, optional (default=True)) – Force check that the model is one of the Malaya models. Set to False if you are using your own HuggingFace model.

Returns

result

Return type

whisper.model.Whisper class

malaya_speech.stt.transducer#

malaya_speech.stt.transducer.available_transformer()[source]#

List available Encoder-Transducer ASR models using Tensorflow.

malaya_speech.stt.transducer.available_pt_transformer()[source]#

List available Encoder-Transducer ASR models using Pytorch.

malaya_speech.stt.transducer.transformer(model='conformer', quantized=False, **kwargs)[source]#

Load Encoder-Transducer ASR model using Tensorflow.

Parameters
  • model (str, optional (default='conformer')) – Check available models at malaya_speech.stt.transducer.available_transformer().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.transducer.Transducer class

malaya_speech.stt.transducer.pt_transformer(model='mesolitica/conformer-base', **kwargs)[source]#

Load Encoder-Transducer ASR model using Pytorch.

Parameters

model (str, optional (default='mesolitica/conformer-base')) – Check available models at malaya_speech.stt.transducer.available_pt_transformer().

Returns

result

Return type

malaya_speech.torch_model.torchaudio.Conformer class
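
A transcription sketch for the PyTorch Conformer; the beam_decoder method name is an assumption based on the other transducer wrappers in this API:

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    model = malaya_speech.stt.transducer.pt_transformer(model='mesolitica/conformer-base')
    # assumed decoding interface
    print(model.beam_decoder([y]))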

malaya_speech.super_resolution#

malaya_speech.super_resolution.available_unet()[source]#

List available Super Resolution 4x deep learning UNET models.

malaya_speech.super_resolution.available_vocoder()[source]#

List available Super Resolution deep learning vocoder models.

malaya_speech.super_resolution.available_diffusion()[source]#

List available Super Resolution deep learning diffusion models.

malaya_speech.super_resolution.unet(model='srgan-256', quantized=False, **kwargs)[source]#

Load Super Resolution 4x deep learning UNET model.

Parameters
  • model (str, optional (default='srgan-256')) – Check available models at malaya_speech.super_resolution.available_unet().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.tf.UNET1D class

malaya_speech.super_resolution.vocoder(model='hifigan-bwe', **kwargs)[source]#

Load vocoder-based super resolution model.

Parameters

model (str, optional (default='hifigan-bwe')) – Check available models at malaya_speech.super_resolution.available_vocoder().

Returns

result

Return type

malaya_speech.torch_model.super_resolution.*

malaya_speech.super_resolution.diffusion(model='nuwave2', **kwargs)[source]#

Load audio diffusion-based super resolution model.

Parameters

model (str, optional (default='nuwave2')) – Check available models at malaya_speech.super_resolution.available_diffusion().

Returns

result

Return type

malaya_speech.torch_model.super_resolution.NuWave2

malaya_speech.tts#

malaya_speech.tts.available_tacotron2()[source]#

List available Tacotron2, Text to Mel models.

malaya_speech.tts.available_fastspeech2()[source]#

List available FastSpeech2, Text to Mel models.

malaya_speech.tts.available_fastpitch()[source]#

List available FastPitch, Text to Mel models.

malaya_speech.tts.available_glowtts()[source]#

List available GlowTTS, Text to Mel models.

malaya_speech.tts.available_lightspeech()[source]#

List available LightSpeech, Text to Mel models.

malaya_speech.tts.available_e2e_fastspeech2()[source]#

List available FastSpeech2, End-to-End models.

malaya_speech.tts.available_vits()[source]#

List available VITS, End-to-End models.

malaya_speech.tts.available_vits_v2()[source]#

List available VITS V2, End-to-End models.

malaya_speech.tts.load_text_ids(pad_to=8, understand_punct=True, is_lower=True, **kwargs)[source]#

Load text normalizer module used by Malaya-Speech TTS.

malaya_speech.tts.tacotron2(model='yasmin', quantized=False, pad_to=8, **kwargs)[source]#

Load Tacotron2 Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='yasmin')) – Check available models at malaya_speech.tts.available_tacotron2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Tacotron class

malaya_speech.tts.fastspeech2(model='osman', quantized=False, pad_to=8, **kwargs)[source]#

Load Fastspeech2 Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='osman')) – Check available models at malaya_speech.tts.available_fastspeech2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastspeech class

malaya_speech.tts.fastpitch(model='male', quantized=False, pad_to=8, **kwargs)[source]#

Load FastPitch Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='male')) – Check available models at malaya_speech.tts.available_fastpitch().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastpitch class

malaya_speech.tts.glowtts(model='yasmin', quantized=False, pad_to=2, **kwargs)[source]#

Load GlowTTS Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='yasmin')) – Check available models at malaya_speech.tts.available_glowtts().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=2)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 2.

Returns

result

Return type

malaya_speech.model.synthesis.GlowTTS class

malaya_speech.tts.lightspeech(model='male', quantized=False, pad_to=8, **kwargs)[source]#

Load LightSpeech Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='male')) – Check available models at malaya_speech.tts.available_lightspeech().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastspeech class

malaya_speech.tts.e2e_fastspeech2(model='osman', quantized=False, pad_to=8, **kwargs)[source]#

Load FastSpeech2 End-to-End TTS model.

Parameters
  • model (str, optional (default='osman')) – Check available models at malaya_speech.tts.available_e2e_fastspeech2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – size of padding with 0. Increasing it can stabilize predictions on short sentences; the model was trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.E2E_FastSpeech class

malaya_speech.tts.vits(model='mesolitica/VITS-osman', **kwargs)[source]#

Load VITS End-to-End TTS model.

Parameters

model (str, optional (default='mesolitica/VITS-osman')) – Check available models at malaya_speech.tts.available_vits().

Returns

result

Return type

malaya_speech.torch_model.synthesis.VITS class

malaya_speech.tts.vits_v2(model='mesolitica/VITS-V2-husein', **kwargs)[source]#

Load VITS V2 End-to-End TTS model.

Parameters

model (str, optional (default='mesolitica/VITS-V2-husein')) – Check available models at malaya_speech.tts.available_vits_v2().

Returns

result

Return type

malaya_speech.torch_model.synthesis.VITS class

malaya_speech.vad#

malaya_speech.vad.available_model()[source]#

List available VAD deep models.

malaya_speech.vad.available_nemo()[source]#

List available Nvidia Nemo VAD models.

malaya_speech.vad.webrtc(aggressiveness=3, sample_rate=16000, minimum_amplitude=100)[source]#

Load WebRTC VAD model. WebRTC prefers 30 ms frames, https://github.com/wiseman/py-webrtcvad#how-to-use-it

Parameters
  • aggressiveness (int, optional (default=3)) – an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.

  • sample_rate (int, optional (default=16000)) – sample rate for samples.

  • minimum_amplitude (int, optional (default=100)) – minimum absolute amplitude for a sample to be considered voice activity; otherwise the frame is automatically labelled False.

Returns

result

Return type

malaya_speech.model.webrtc.WebRTC class
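
A usage sketch; the frame helper malaya_speech.generator.frames and calling the WebRTC model directly on a frame are assumptions:

    import malaya_speech

    y, sr = malaya_speech.load('speech.wav')  # hypothetical input file

    vad = malaya_speech.vad.webrtc(aggressiveness=3, sample_rate=sr)

    # WebRTC prefers 30 ms frames
    frames = list(malaya_speech.generator.frames(y, 30, sr))
    preds = [(frame, vad(frame)) for frame in frames]
    # preds can then be passed to malaya_speech.extra.visualization.visualize_vad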

malaya_speech.vad.deep_model(model='marblenet-factor1', quantized=False, **kwargs)[source]#

Load VAD model. Prefers 50 ms or larger frames.

Parameters
  • model (str, optional (default='marblenet-factor1')) – Check available models at malaya_speech.vad.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model not necessary faster, totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.vad.nemo(model='huseinzol05/nemo-vad-marblenet', **kwargs)[source]#

Load Nemo VAD model. Nemo VAD prefers 63 ms frames, https://github.com/NVIDIA/NeMo/blob/02cf155b020964992a974e030b9e318426761e33/nemo/collections/asr/data/feature_to_label_dataset.py#L43

Parameters

model (str, optional (default='huseinzol05/nemo-vad-marblenet')) – Check available models at malaya_speech.vad.available_nemo().

Returns

result

Return type

malaya_speech.torch_model.nemo.Classification class

malaya_speech.vocoder#

malaya_speech.vocoder.available_melgan()[source]#

List available MelGAN Mel-to-Speech models.

malaya_speech.vocoder.available_mbmelgan()[source]#

List available Multiband MelGAN Mel-to-Speech models.

malaya_speech.vocoder.available_hifigan()[source]#

List available HiFiGAN Mel-to-Speech models.

malaya_speech.vocoder.available_pt_hifigan()[source]#

List available PyTorch HiFiGAN Mel-to-Speech models.

malaya_speech.vocoder.melgan(model='universal-1024', quantized=False, **kwargs)[source]#

Load MelGAN Vocoder model.

Parameters
  • model (str, optional (default='universal-1024')) – Check available models at malaya_speech.vocoder.available_melgan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class

malaya_speech.vocoder.mbmelgan(model='female', quantized=False, **kwargs)[source]#

Load Multiband MelGAN Vocoder model.

Parameters
  • model (str, optional (default='female')) – Check available models at malaya_speech.vocoder.available_mbmelgan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class

malaya_speech.vocoder.hifigan(model='universal-768', quantized=False, **kwargs)[source]#

Load HiFiGAN Vocoder model.

Parameters
  • model (str, optional (default='universal-768')) – Check available models at malaya_speech.vocoder.available_hifigan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class
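
A Text-to-Mel plus vocoder sketch; the output key name 'postnet-output' and calling the vocoder object directly on the mel spectrogram are assumptions, so inspect the dict returned by the TTS model:

    import malaya_speech

    tts = malaya_speech.tts.fastspeech2(model='osman')
    vocoder = malaya_speech.vocoder.hifigan(model='universal-768')

    r = tts.predict('nama saya husein')
    # assumed key name; check r.keys() for the actual mel output
    y_ = vocoder(r['postnet-output'])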

malaya_speech.vocoder.pt_hifigan(model='huseinzol05/jik876-UNIVERSAL_V1', **kwargs)[source]#

Load PyTorch HiFiGAN Vocoder model, originally from https://github.com/jik876/hifi-gan.

Parameters

model (str, optional (default='huseinzol05/jik876-UNIVERSAL_V1')) –

Returns

result

Return type

malaya_speech.torch_model.synthesis.Vocoder class

malaya_speech.voice_conversion#

malaya_speech.voice_conversion.available_fastvc()[source]#

List available Voice Conversion models.

malaya_speech.voice_conversion.fastvc(model='fastvc-32-vggvox-v2', quantized=False, **kwargs)[source]#

Load Voice Conversion FastVC model.

Parameters
  • model (str, optional (default='fastvc-32-vggvox-v2')) – Check available models at malaya_speech.voice_conversion.available_fastvc().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.FastVC class
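
A usage sketch; the predict(original, target) call order and the structure of the returned result are assumptions, so check the FastVC class:

    import malaya_speech

    original, _ = malaya_speech.load('original_speaker.wav')  # hypothetical files
    target, _ = malaya_speech.load('target_speaker.wav')

    model = malaya_speech.voice_conversion.fastvc(model='fastvc-32-vggvox-v2')
    # assumed: convert `original` so it sounds like `target`
    r = model.predict(original, target)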