API#

malaya_speech#

malaya_speech.augmentation.spectrogram#

malaya_speech.augmentation.spectrogram.mask_frequency(features, n_freq_mask=2, width_freq_mask=8, random_band=True)[source]#

Mask frequency.

Parameters
  • features (np.array) –

  • n_freq_mask (int, optional (default=2)) – number of frequency masks to apply.

  • width_freq_mask (int, optional (default=8)) – masking size.

Returns

result

Return type

np.array

malaya_speech.augmentation.spectrogram.mask_time(features, n_time_mask=2, width_time_mask=8, random_band=True)[source]#

Mask time.

Parameters
  • features (np.array) –

  • n_time_mask (int, optional (default=2)) – number of time masks to apply.

  • width_time_mask (int, optional (default=8)) – masking size.

Returns

result

Return type

np.array
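
Example (a minimal sketch of SpecAugment-style masking; the feature shape and axis ordering below are placeholder assumptions, not part of the documented API):

>>> import numpy as np
>>> from malaya_speech.augmentation.spectrogram import mask_frequency, mask_time
>>> features = np.random.rand(200, 80)  # placeholder melspectrogram
>>> masked = mask_frequency(features, n_freq_mask=2, width_freq_mask=8)
>>> masked = mask_time(masked, n_time_mask=2, width_time_mask=8)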

malaya_speech.augmentation.spectrogram.tf_mask_frequency(features, n_freq_mask=2, F=27)[source]#

Mask frequency using Tensorflow.

Parameters
  • features (np.array) –

  • F (size of mask for frequency) –

malaya_speech.augmentation.spectrogram.tf_mask_time(features, n_time_mask=2, T=80)[source]#

Mask time using Tensorflow.

Parameters
  • features (np.array) –

  • T (size of mask for time) –

malaya_speech.extra.rttm#

malaya_speech.extra.rttm.load(file)[source]#

Load RTTM file.

Parameters

file (str) –

Returns

result

Return type

Dict[str, malaya_speech.model.annotation.Annotation]
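
Example (a minimal sketch; 'diarization.rttm' is a hypothetical file path):

>>> from malaya_speech.extra.rttm import load
>>> annotations = load('diarization.rttm')
>>> list(annotations.keys())  # one Annotation per recording ID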

malaya_speech.extra.visualization#

malaya_speech.extra.visualization.visualize_vad(signal, preds, sample_rate=16000, figsize=(15, 3), ax=None, **kwargs)[source]#

Visualize signal given VAD labels. Green indicates voice activity, red indicates no voice activity.

Parameters
  • signal (list / np.array) –

  • preds (List[Tuple[Frame, bool]]) –

  • sample_rate (int, optional (default=16000)) –

  • figsize (Tuple[int, int], optional (default=(15, 3))) – matplotlib figure size.
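
Example (a minimal sketch; it assumes y is a loaded audio signal and preds is VAD output in the documented List[Tuple[Frame, bool]] format):

>>> import matplotlib.pyplot as plt
>>> from malaya_speech.extra.visualization import visualize_vad
>>> visualize_vad(y, preds, sample_rate=16000)
>>> plt.show()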

malaya_speech.extra.visualization.plot_classification(preds, description, ax=None, fontsize_text=14, x_text=0.05, y_text=0.2, ylim=(0.1, 0.9), figsize=(15, 3), **kwargs)[source]#

Visualize probability / boolean.

Parameters
  • preds (List[Tuple[Frame, label]]) –

  • description (str) –

  • ax (ax, optional (default = None)) –

  • fontsize_text (int, optional (default = 14)) –

  • x_text (float, optional (default = 0.05)) –

  • y_text (float, optional (default = 0.2)) –

malaya_speech.model.classification.Speakernet#

class malaya_speech.model.classification.Speakernet[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – array of shape [B, D].

Return type

np.array
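
Example (a minimal sketch; 'speaker.wav' is a hypothetical file and it assumes the speaker vector loader returns a model exposing this vectorize method):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.speaker_vector.deep_model(model='vggvox-v2')
>>> y, sr = load('speaker.wav', sr=16000)
>>> embedding = model.vectorize([y])  # np.array of shape [1, D]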

malaya_speech.model.classification.Speaker2Vec#

class malaya_speech.model.classification.Speaker2Vec[source]#
vectorize(inputs)[source]#

Vectorize inputs.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – array of shape [B, D].

Return type

np.array

malaya_speech.model.classification.SpeakernetClassification#

class malaya_speech.model.classification.SpeakernetClassification[source]#
predict_proba(inputs)[source]#

Predict inputs, will return probability.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – probability array of shape [B, D].

Return type

np.array

predict(inputs)[source]#

Predict inputs, will return labels.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – labels of length B.

Return type

List[str]

malaya_speech.model.classification.Classification#

class malaya_speech.model.classification.Classification[source]#
predict_proba(inputs)[source]#

Predict inputs, will return probability.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – probability array of shape [B, D].

Return type

np.array

predict(inputs)[source]#

Predict inputs, will return labels.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result – labels of length B.

Return type

List[str]
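
Example (a minimal sketch; 'speech.wav' is a hypothetical file and it assumes the emotion loader returns a classification model exposing these methods):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.emotion.deep_model(model='vggvox-v2')
>>> y, sr = load('speech.wav', sr=16000)
>>> model.predict_proba([y])  # np.array of shape [B, D]
>>> model.predict([y])        # List[str] of labels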

malaya_speech.model.clustering.AgglomerativeClustering#

class malaya_speech.model.clustering.AgglomerativeClustering(min_clusters, max_clusters, metric='cosine', threshold=0.25, method='centroid')[source]#
fit_predict(X)[source]#

Fit predict.

Parameters

X (np.array) – inputs with size of [batch_size, embedding size]

Returns

result

Return type

np.array
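
Example (a minimal sketch; the embeddings here are random placeholders standing in for speaker vectors):

>>> import numpy as np
>>> from malaya_speech.model.clustering import AgglomerativeClustering
>>> X = np.random.rand(10, 512)  # [batch_size, embedding size]
>>> clustering = AgglomerativeClustering(min_clusters=2, max_clusters=5)
>>> labels = clustering.fit_predict(X)  # one cluster label per row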

malaya_speech.model.clustering.HiddenMarkovModelClustering#

class malaya_speech.model.clustering.HiddenMarkovModelClustering(min_clusters, max_clusters, metric='cosine', covariance_type='diag', threshold=0.25, single_cluster_detection_quantile=0.05, single_cluster_detection_threshold=1.15)[source]#
fit_predict(X)[source]#

Fit predict.

Parameters

X (np.array) – inputs with size of [batch_size, embedding size]

Returns

result

Return type

np.array

malaya_speech.model.huggingface.HuggingFace_CTC#

class malaya_speech.model.huggingface.HuggingFace_CTC[source]#
greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict(inputs)[source]#

Predict logits from inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

gradio(record_mode=True, lm_func=None, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode; otherwise, file upload mode.

  • lm_func (Callable, optional (default=None)) – if not None, logits with shape [T, D] will be passed to this function.

  • **kwargs (keyword arguments for iface.launch.) –
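
Example (a minimal sketch; 'speech.wav' is a hypothetical file and it assumes the model loaded from malaya_speech.stt.huggingface exposes these methods):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.stt.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed')
>>> y, sr = load('speech.wav', sr=16000)
>>> model.greedy_decoder([y])  # List[str]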

malaya_speech.model.huggingface.HuggingFace_Aligner#

class malaya_speech.model.huggingface.HuggingFace_Aligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Transcribe input, will return a string.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.model.splitter.Split_Wav#

class malaya_speech.model.splitter.Split_Wav[source]#
predict(input)[source]#

Split an audio into 4 different speakers.

Parameters

input (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

np.array

malaya_speech.model.splitter.Split_Mel#

class malaya_speech.model.splitter.Split_Mel[source]#
predict(input)[source]#

Split an audio into 4 different speakers.

Parameters

input (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

np.array

malaya_speech.model.splitter.FastSpeechSplit#

class malaya_speech.model.splitter.FastSpeechSplit[source]#
predict(original_audio, target_audio, modes=['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'])[source]#

Change original voice audio to follow targeted voice.

Parameters
  • original_audio (np.array or malaya_speech.model.frame.Frame) –

  • target_audio (np.array or malaya_speech.model.frame.Frame) –

  • modes (List[str], optional (default = ['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU'])) –

    R denotes rhythm, F denotes pitch target, U denotes speaker target (vector).

    • 'R' - maintain original_audio F and U on target_audio R.

    • 'F' - maintain original_audio R and U on target_audio F.

    • 'U' - maintain original_audio R and F on target_audio U.

    • 'RF' - maintain original_audio U on target_audio R and F.

    • 'RU' - maintain original_audio F on target_audio R and U.

    • 'FU' - maintain original_audio R on target_audio F and U.

    • 'RFU' - no conversion happened, just do encoder-decoder on target_audio

Returns

result

Return type

Dict[modes]
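
Example (a minimal sketch; the wav file names are hypothetical and it assumes malaya_speech.speechsplit_conversion.deep_conversion returns this class, as documented later in this page):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.speechsplit_conversion.deep_conversion(model='fastspeechsplit-v2-vggvox-v2')
>>> original, _ = load('original.wav')
>>> target, _ = load('target.wav')
>>> out = model.predict(original, target, modes=['R', 'F', 'U'])
>>> sorted(out.keys())  # one output per requested mode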

malaya_speech.model.synthesis.TTS#

class malaya_speech.model.synthesis.TTS[source]#
gradio(vocoder=None, **kwargs)[source]#

Text-to-Speech on Gradio interface.

Parameters
  • vocoder (Callable, optional (default=None)) – vocoder object that has a predict method, preferably from malaya_speech itself. Not required when using an End-to-End TTS model such as VITS.

  • **kwargs (keyword arguments for predict and iface.launch.) –

malaya_speech.model.synthesis.Vocoder#

class malaya_speech.model.synthesis.Vocoder[source]#
predict(inputs)[source]#

Change Mel to Waveform.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List

malaya_speech.model.synthesis.Tacotron#

class malaya_speech.model.synthesis.Tacotron[source]#
predict(string, **kwargs)[source]#

Change string to Mel.

Parameters

string (str) –

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output, alignment]

malaya_speech.model.synthesis.Fastspeech#

class malaya_speech.model.synthesis.Fastspeech[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this variable will increase the frequency; a lower frequency generates a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this variable will increase loudness.

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output]
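
Example (a minimal sketch; the sentence is arbitrary and it assumes malaya_speech.tts.fastspeech2 returns this class, as documented later in this page):

>>> import malaya_speech
>>> model = malaya_speech.tts.fastspeech2(model='osman')
>>> r = model.predict('selamat pagi malaysia', speed_ratio=1.0)
>>> r['mel-output']  # melspectrogram to feed into a vocoder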

malaya_speech.model.synthesis.FastspeechSDP#

class malaya_speech.model.synthesis.FastspeechSDP[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, temperature_durator=0.6666, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this variable will increase the frequency; a lower frequency generates a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this variable will increase loudness.

  • temperature_durator (float, optional (default=0.6666)) – The durator predicts alignment with random.normal() * temperature_durator.

Returns

result

Return type

Dict[string, decoder-output, mel-output, universal-output]

malaya_speech.model.synthesis.E2E_FastSpeech#

class malaya_speech.model.synthesis.E2E_FastSpeech[source]#
predict(string, speed_ratio=1.0, f0_ratio=1.0, energy_ratio=1.0, temperature_durator=0.6666, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

  • f0_ratio (float, optional (default=1.0)) – Increasing this variable will increase the frequency; a lower frequency generates a deeper voice.

  • energy_ratio (float, optional (default=1.0)) – Increasing this variable will increase loudness.

  • temperature_durator (float, optional (default=0.6666)) – The durator predicts alignment with random.normal() * temperature_durator.

Returns

result

Return type

Dict[string, decoder-output, y]

malaya_speech.model.synthesis.FastVC#

class malaya_speech.model.synthesis.FastVC[source]#
predict(original_audio, target_audio)[source]#

Change original voice audio to follow targeted voice.

Parameters
  • original_audio (np.array or malaya_speech.model.frame.Frame) –

  • target_audio (np.array or malaya_speech.model.frame.Frame) –

Returns

result

Return type

Dict[decoder-output, mel-output]

malaya_speech.model.synthesis.Fastpitch#

class malaya_speech.model.synthesis.Fastpitch[source]#
predict(string, speed_ratio=1.0, pitch_ratio=1.0, pitch_addition=0.0, **kwargs)[source]#

Change string to Mel.

Parameters
  • string (str) –

  • speed_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

  • pitch_ratio (float, optional (default=1.0)) – pitch = pitch * pitch_ratio, amplify existing pitch contour.

  • pitch_addition (float, optional (default=0.0)) – pitch = pitch + pitch_addition, change pitch contour.

Returns

result

Return type

Dict[string, decoder-output, mel-output, pitch-output, universal-output]

malaya_speech.model.transducer.Transducer#

class malaya_speech.model.transducer.Transducer[source]#
predict_alignment(input, combined=True)[source]#

Transcribe input and get timestamps; only supports greedy decoder.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • combined (bool, optional (default=True)) – If True, will combine subwords into words.

Returns

result

Return type

List[Dict[text, start, end]]

greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

beam_decoder(inputs, beam_width=5, temperature=0.0, score_norm=True)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=5)) – beam size for beam decoder.

  • temperature (float, optional (default=0.0)) – apply a temperature function to the logits, which can help in certain cases, logits += -np.log(-np.log(uniform_noise_shape_logits)) * temperature

  • score_norm (bool, optional (default=True)) – sort beams in descending order by score / length of the decoded sequence.

Returns

result

Return type

List[str]

beam_decoder_lm(inputs, language_model, beam_width=5, token_min_logp=- 20.0, beam_prune_logp=- 5.0, temperature=0.0, score_norm=True)[source]#

Transcribe inputs using beam decoder + KenLM.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • language_model (pyctcdecode.language_model.LanguageModel) – pyctcdecode language model, load from LanguageModel(kenlm_model, alpha = alpha, beta = beta).

  • beam_width (int, optional (default=5)) – beam size for beam decoder.

  • token_min_logp (float, optional (default=-20.0)) – minimum log probability to select a token.

  • beam_prune_logp (float, optional (default=-5.0)) – filter candidates >= max score lm + beam_prune_logp.

  • temperature (float, optional (default=0.0)) – apply a temperature function to the logits, which can help in certain cases, logits += -np.log(-np.log(uniform_noise_shape_logits)) * temperature

  • score_norm (bool, optional (default=True)) – sort beams in descending order by score / length of the decoded sequence.

Returns

result

Return type

List[str]
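
Example (a minimal sketch; 'speech.wav', alpha and beta are placeholder values, and it assumes the kenlm and pyctcdecode packages are installed):

>>> import kenlm
>>> import malaya_speech
>>> from pyctcdecode.language_model import LanguageModel
>>> from malaya_speech.utils.read import load
>>> lm_path = malaya_speech.language_model.kenlm(model='dump-combined')
>>> language_model = LanguageModel(kenlm.Model(lm_path), alpha=0.5, beta=1.0)
>>> model = malaya_speech.stt.deep_transducer(model='conformer')
>>> y, sr = load('speech.wav', sr=16000)
>>> model.beam_decoder_lm([y], language_model, beam_width=5)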

predict(inputs)[source]#

Transcribe inputs using greedy decoder, will return list of strings.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

gradio(record_mode=True, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode; otherwise, file upload mode.

  • **kwargs (keyword arguments for beam decoder and iface.launch.) –

malaya_speech.model.transducer.TransducerAligner#

class malaya_speech.model.transducer.TransducerAligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Transcribe input, will return a string.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[words_alignment, subwords_alignment, subwords, alignment]
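
Example (a minimal sketch; the audio file and transcription are placeholders, and it assumes malaya_speech.force_alignment.deep_transducer returns this class, as documented later in this page):

>>> import malaya_speech
>>> from malaya_speech.utils.read import load
>>> model = malaya_speech.force_alignment.deep_transducer(model='conformer-transducer')
>>> y, sr = load('speech.wav', sr=16000)
>>> result = model.predict(y, 'contoh transkripsi audio')
>>> result['words_alignment']  # word-level timestamps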

malaya_speech.model.unet.UNET#

class malaya_speech.model.unet.UNET[source]#
predict(inputs)[source]#

Enhance inputs, will return melspectrogram.

Parameters

inputs (List[np.array]) –

Returns

result

Return type

List

malaya_speech.model.unet.UNETSTFT#

class malaya_speech.model.unet.UNETSTFT[source]#
predict(input)[source]#

Enhance inputs, will return waveform.

Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame.

Returns

result

Return type

Dict

malaya_speech.model.unet.UNET1D#

class malaya_speech.model.unet.UNET1D[source]#
predict(input)[source]#

Enhance inputs, will return waveform.

Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame.

Returns

result

Return type

np.array

malaya_speech.model.wav2vec.Wav2Vec2_CTC#

class malaya_speech.model.wav2vec.Wav2Vec2_CTC[source]#
greedy_decoder(inputs)[source]#

Transcribe inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

beam_decoder(inputs, beam_width=100, **kwargs)[source]#

Transcribe inputs using beam decoder.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • beam_width (int, optional (default=100)) – beam size for beam decoder.

Returns

result

Return type

List[str]

predict(inputs)[source]#

Predict logits from inputs using greedy decoder.

Parameters

inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

Returns

result

Return type

List[str]

predict_logits(inputs, norm_func=<function softmax>)[source]#

Predict logits from inputs.

Parameters
  • inputs (List[np.array]) – List[np.array] or List[malaya_speech.model.frame.Frame].

  • norm_func (Callable, optional (default=malaya.utils.activation.softmax)) –

Returns

result

Return type

List[np.array]

gradio(record_mode=True, lm_func=None, **kwargs)[source]#

Transcribe an input using beam decoder on Gradio interface.

Parameters
  • record_mode (bool, optional (default=True)) – if True, Gradio will use record mode; otherwise, file upload mode.

  • lm_func (Callable, optional (default=None)) – if not None, logits with shape [T, D] will be passed to this function.

  • **kwargs (keyword arguments for beam decoder and iface.launch.) –

malaya_speech.model.wav2vec.Wav2Vec2_Aligner#

class malaya_speech.model.wav2vec.Wav2Vec2_Aligner[source]#
predict(input, transcription, sample_rate=16000)[source]#

Transcribe input, will return a string.

Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame.

  • transcription (str) – transcription of input audio.

  • sample_rate (int, optional (default=16000)) – sample rate for input.

Returns

result

Return type

Dict[chars_alignment, words_alignment, alignment]

malaya_speech.model.webrtc.WebRTC#

class malaya_speech.model.webrtc.WebRTC(vad, sample_rate=16000, minimum_amplitude=100)[source]#

malaya_speech.torch_model.super_resolution.VoiceFixer#

class malaya_speech.torch_model.super_resolution.VoiceFixer[source]#
predict(input, remove_higher_frequency=True)[source]#
Parameters
  • input (np.array) – np.array or malaya_speech.model.frame.Frame, must be audio with a 44100 sampling rate.

  • remove_higher_frequency (bool, optional (default = True)) – Remove high frequency before neural upsampling.

Returns

result

Return type

np.array with 44100 sampling rate

forward(input, remove_higher_frequency=True)[source]#

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

malaya_speech.torch_model.super_resolution.NVSR#

class malaya_speech.torch_model.super_resolution.NVSR[source]#
predict(input)[source]#
Parameters

input (np.array) – np.array or malaya_speech.model.frame.Frame, must be audio with a 44100 sampling rate.

Returns

result

Return type

np.array with 44100 sampling rate

malaya_speech.torch_model.synthesis.VITS#

class malaya_speech.torch_model.synthesis.VITS[source]#
predict(string, temperature=0.6666, temperature_durator=0.6666, length_ratio=1.0, **kwargs)[source]#

Change string to waveform.

Parameters
  • string (str) –

  • temperature (float, optional (default=0.6666)) – The decoder decodes with encoder(text) + random.normal() * temperature.

  • temperature_durator (float, optional (default=0.6666)) – The durator predicts alignment with random.normal() * temperature_durator.

  • length_ratio (float, optional (default=1.0)) – Increasing this variable will increase the duration of the generated voice.

Returns

result

Return type

Dict[string, ids, alignment, y]
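
Example (a minimal sketch; it assumes model is an already-loaded VITS instance, since the loader itself is not shown in this excerpt):

>>> r = model.predict('selamat pagi malaysia', temperature=0.6666)
>>> r['y']  # generated waveform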

malaya_speech.pipeline#

class malaya_speech.pipeline.Pipeline[source]#
visualize(filename='pipeline.png', **kwargs)[source]#

Render the computation of this object’s task graph using graphviz.

Requires graphviz to be installed.

Parameters
  • filename (str, optional) – The name of the file to write to disk.

  • kwargs – Graph attributes to pass to graphviz like rankdir="LR"

batching = <function batching>#
flatten = <function flatten>#
foreach_map = <function foreach_map>#
map = <function map>#
partition = <function partition>#
sliding_window = <function sliding_window>#
zip = <function zip>#

malaya_speech.pipeline.map#

class malaya_speech.pipeline.map[source]#

apply a function / method to the pipeline

Examples

>>> source = Pipeline()
>>> source.map(lambda x: x + 1).map(print)
>>> source.emit(1)
2

malaya_speech.pipeline.batching#

class malaya_speech.pipeline.batching[source]#

Batching stream into tuples

Examples

>>> source = Pipeline()
>>> source.batching(2).map(print)
>>> source.emit([1,2,3,4,5])
([1, 2], [3, 4], [5])

malaya_speech.pipeline.partition#

class malaya_speech.pipeline.partition[source]#

Partition stream into tuples of equal size

Examples

>>> source = Pipeline()
>>> source.partition(3).map(print)
>>> for i in range(10):
...     source.emit(i)
(0, 1, 2)
(3, 4, 5)
(6, 7, 8)

malaya_speech.pipeline.sliding_window#

class malaya_speech.pipeline.sliding_window[source]#

Produce overlapping tuples of size n

Parameters

return_partial (bool) – If True, yield tuples as soon as any events come in, each tuple being smaller or equal to the window size. If False, only start yielding tuples once a full window has accrued.

Examples

>>> source = Pipeline()
>>> source.sliding_window(3, return_partial=False).map(print)
>>> for i in range(8):
...     source.emit(i)
(0, 1, 2)
(1, 2, 3)
(2, 3, 4)
(3, 4, 5)
(4, 5, 6)
(5, 6, 7)

malaya_speech.pipeline.foreach_map#

class malaya_speech.pipeline.foreach_map[source]#

Apply a function to every element in a tuple in the stream.

Parameters
  • func (callable) –

  • method (str, optional (default='sync')) –

    method to process each elements.

    • 'sync' - process elements one-by-one in a loop.

    • 'async' - process all elements asynchronously at the same time.

    • 'thread' - use multithreading to process all elements at the same time.

      Default is 1 worker. Override worker_size=n to increase.

    • 'process' - use multiprocessing to process all elements at the same time.

      Default is 1 worker. Override worker_size=n to increase.

  • *args – The arguments to pass to the function.

  • **kwargs – Keyword arguments to pass to func.

Examples

>>> source = Pipeline()
>>> source.foreach_map(lambda x: 2*x).map(print)
>>> for i in range(3):
...     source.emit((i, i))
(0, 0)
(2, 2)
(4, 4)

malaya_speech.pipeline.flatten#

class malaya_speech.pipeline.flatten[source]#

Flatten streams of lists or iterables into a stream of elements

Examples

>>> source = Pipeline()
>>> source.flatten().map(print)
>>> source.emit([[1, 2, 3], [4, 5], [6, 7, 7]])
[1, 2, 3, 4, 5, 6, 7, 7]

malaya_speech.pipeline.zip#

class malaya_speech.pipeline.zip[source]#

Combine 2 branches into 1 branch.

Examples

>>> source = Pipeline()
>>> left = source.map(lambda x: x + 1, name = 'left')
>>> right = source.map(lambda x: x + 10, name = 'right')
>>> left.zip(right).map(sum).map(print)
>>> source.emit(2)
15
pack_literals(tup)[source]#

Fill buffers for literals whenever we empty them

malaya_speech.streaming#

malaya_speech.streaming.record(vad, asr_model=None, classification_model=None, device=None, input_rate=16000, sample_rate=16000, blocks_per_second=50, padding_ms=300, ratio=0.75, min_length=0.1, filename=None, spinner=False)[source]#

Record audio using the pyaudio library. This recording interface requires a VAD model.

Parameters
  • vad (object) – vad model / pipeline.

  • asr_model (object) – ASR model / pipeline, will transcribe each subsamples realtime.

  • classification_model (object) – classification pipeline, will classify each subsamples realtime.

  • device (None) – device parameter for pyaudio, check available devices from sounddevice.query_devices().

  • input_rate (int, optional (default = 16000)) – sample rate from input device, this will auto resampling.

  • sample_rate (int, optional (default = 16000)) – output sample rate.

  • blocks_per_second (int, optional (default = 50)) – size of frame returned from pyaudio, frame size = sample rate / (blocks_per_second / 2). 50 is good for WebRTC, 30 or less is good for Malaya Speech VAD.

  • padding_ms (int, optional (default = 300)) – size of queue to store frames, size = padding_ms // (1000 * blocks_per_second // sample_rate)

  • ratio (float, optional (default = 0.75)) – if 75% of the queue is positive, it is assumed to be voice activity.

  • min_length (float, optional (default=0.1)) – minimum length (s) to accept a subsample.

  • filename (str, optional (default=None)) – if None, will auto generate name based on timestamp.

  • spinner (bool, optional (default=False)) – if True, will use spinner object from halo library.

Returns

result

Return type

[filename, samples]
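
Example (a minimal sketch; it assumes vad is an already-loaded VAD model or pipeline and that a microphone device is available):

>>> import malaya_speech
>>> filename, samples = malaya_speech.streaming.record(vad, min_length=0.1, spinner=True)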

malaya_speech.utils.aligner#

class malaya_speech.utils.aligner.Point(token_index, time_index, score)[source]#
class malaya_speech.utils.aligner.Segment(label, start, end, score)[source]#
malaya_speech.utils.aligner.put_comma(alignment, min_threshold=0.5)[source]#

Put comma in alignment from force alignment model.

Parameters
  • alignment (List[Dict[text, start, end]]) –

  • min_threshold (float, optional (default=0.5)) – minimum threshold in seconds to insert a comma.

Returns

result

Return type

List[str]

malaya_speech.utils.aligner.plot_alignments(alignment, subs_alignment, words_alignment, waveform, separator=' ', sample_rate=16000, figsize=(16, 9), plot_score_char=False, plot_score_word=True)[source]#

plot alignment.

Parameters
  • alignment (np.array) – usually alignment output.

  • subs_alignment (list) – usually chars_alignment or subwords_alignment output.

  • words_alignment (list) – usually words_alignment output.

  • waveform (np.array) – input audio.

  • separator (str, optional (default=' ')) – separator between words, only useful if subs_alignment is character based.

  • sample_rate (int, optional (default=16000)) –

  • figsize (tuple, optional (default=(16, 9))) – figure size for matplotlib figsize.

  • plot_score_char (bool, optional (default=False)) – plot score on top of character plots.

  • plot_score_word (bool, optional (default=True)) – plot score on top of word plots.

malaya_speech.utils.astype#

malaya_speech.utils.astype.to_ndarray(array)[source]#

Change list / tuple / bytes into np.array

Parameters

array (list / tuple / bytes) –

Returns

result

Return type

np.array

malaya_speech.utils.astype.to_byte(array)[source]#

Change list / tuple / np.array into bytes

Parameters

array (list / tuple / np.array) –

Returns

result

Return type

bytes

malaya_speech.utils.astype.float_to_int(array, type=<class 'numpy.int16'>)[source]#

Change np.array float32 / float64 into np.int16

Parameters
  • array (np.array) –

  • type (np.int16) –

Returns

result

Return type

np.array

malaya_speech.utils.astype.int_to_float(array, type=<class 'numpy.float32'>)[source]#

Change np.array int16 into np.float32

Parameters
  • array (np.array) –

  • type (np.float32) –

Returns

result

Return type

np.array

malaya_speech.utils.char#

malaya_speech.utils.char.strip_ids(ids, ids_to_strip)[source]#

Strip ids_to_strip from the end ids.

malaya_speech.utils.char.generate_vocab(strings)[source]#

Generate character vocab sorted based on frequency.

Parameters

strings (List[str]) –

Returns

result

Return type

List[str]

malaya_speech.utils.char.encode(string, add_eos=True, add_blank=False, lookup=None)[source]#

Encode string to integer representation based on ascii table or lookup variable.

Parameters
  • string (str) –

  • add_eos (bool, optional (default=True)) – add EOS token at the end of encoded.

  • add_blank (bool, optional (default=False)) – add a BLANK token at the start of the encoded sequence; this is for transducer / transformer based models.

  • lookup (List[str], optional (default=None)) – list of unique strings.

Returns

result

Return type

List[int]

malaya_speech.utils.char.decode(ids, lookup=None)[source]#

Decode integer representation to string based on ascii table or lookup variable.

Parameters
  • ids (List[int]) –

  • lookup (List[str], optional (default=None)) – list of unique strings.

Returns

result

Return type

str

malaya_speech.utils.combine#

malaya_speech.utils.combine.without_silent(frames, threshold_to_stop=0.1, silent_trail=500)[source]#

Group multiple frames based on label and threshold to stop.

Parameters
  • frames (List[Tuple[Frame, label]]) – Output from VAD.

  • threshold_to_stop (float, optional (default = 0.1)) – If threshold_to_stop is 0.1, samples with the same label must be at least 0.1 second long.

  • silent_trail (int, optional (default = 500)) – if silence is detected, will append the first N frames and the last N frames.

Returns

result

Return type

np.array

malaya_speech.utils.featurization#

malaya_speech.utils.featurization.normalize_signal(signal, gain=None)[source]#

Normalize float32 signal to [-1, 1] range

malaya_speech.utils.featurization.extract_pitch(y, hop_size=256, sr=22050, bad_f0=5.0, zero_value=- 10.0)[source]#

Originally from https://github.com/yl4579/PitchExtractor/blob/main/meldataset.py

malaya_speech.utils.generator#

malaya_speech.utils.generator.frames(audio, frame_duration_ms=30, sample_rate=16000, append_ending_trail=True)[source]#

Generates audio frames from audio. Takes the desired frame duration in milliseconds, the audio, and the sample rate.

Parameters
  • audio (np.array) –

  • frame_duration_ms (int, optional (default=30)) –

  • sample_rate (int, optional (default=16000)) –

  • append_ending_trail (bool, optional (default=True)) – if True, will append the last trailing frame, which might not be the same length as frame_duration_ms.

Returns

result

Return type

List[malaya_speech.model.frame.Frame]
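
Example (a minimal sketch; 'speech.wav' is a hypothetical file):

>>> from malaya_speech.utils.read import load
>>> from malaya_speech.utils.generator import frames
>>> y, sr = load('speech.wav', sr=16000)
>>> chunks = frames(y, frame_duration_ms=30, sample_rate=sr)
>>> len(chunks)  # List[malaya_speech.model.frame.Frame]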

malaya_speech.utils.generator.mel_sampling(audio, frame_duration_ms=1200, overlap_ms=200, sample_rate=16000)[source]#

Generates audio frames from audio. This is for melspectrogram generative model. Takes the desired frame duration in milliseconds, the audio, and the sample rate.

Parameters
  • audio (np.array) –

  • frame_duration_ms (int, optional (default=1200)) –

  • overlap_ms (int, optional (default=200)) –

  • sample_rate (int, optional (default=16000)) –

Returns

result

Return type

List[np.array]

malaya_speech.utils.generator.combine_mel_sampling(samples, overlap_ms=200, sample_rate=16000, padded_ms=50)[source]#

To combine results from mel_sampling, output from melspectrogram generative model.

Parameters
  • samples (List[np.array]) –

  • overlap_ms (int, optional (default=200)) –

  • sample_rate (int, optional (default=16000)) –

Returns

result

Return type

List[np.array]

malaya_speech.utils.griffin_lim#

malaya_speech.utils.griffin_lim.from_mel(mel_, sr=16000, n_fft=2048, n_iter=32, win_length=1000, hop_length=100)[source]#

Change melspectrogram into waveform using Librosa.

Parameters

mel_ (np.array) –

Returns

result

Return type

np.array

malaya_speech.utils.griffin_lim.from_mel_vocoder(mel, sr=22050, n_fft=1024, n_mels=256, fmin=80, fmax=7600, n_iter=32, win_length=None, hop_length=256)[source]#

Change melspectrogram into waveform using Librosa.

Parameters

mel (np.array) –

Returns

result

Return type

np.array

malaya_speech.utils.group#

malaya_speech.utils.group.combine_frames(frames)[source]#

Combine multiple frames into one frame.

Parameters

frames (List[Frame]) –

Returns

result

Return type

Frame

malaya_speech.utils.group.group_frames(frames)[source]#

Group multiple frames based on label.

Parameters

frames (List[Tuple[Frame, label]]) –

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.utils.group.group_frames_threshold(frames, threshold_to_stop=0.3)[source]#

Group multiple frames based on label and threshold to stop.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • threshold_to_stop (float, optional (default = 0.3)) – If threshold_to_stop is 0.3, samples with the same label must be at least 0.3 second long.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.utils.padding#

malaya_speech.utils.padding.sequence_1d(seq, maxlen=None, padding='post', pad_int=0, return_len=False)[source]#

Pad a sequence of 1D arrays into a 2D array.

Parameters
  • seq (List[List[int]]) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If 'pre', pad at the start; otherwise pad at the end.

  • pad_int (int, optional (default=0)) – padding value.

Returns

result

Return type

np.array
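
Example (a minimal sketch with toy integer sequences):

>>> from malaya_speech.utils.padding import sequence_1d
>>> padded = sequence_1d([[1, 2, 3], [4, 5]], padding='post', pad_int=0)
>>> padded.shape  # (2, 3), the shorter row padded with 0 at the end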

malaya_speech.utils.padding.sequence_nd(seq, maxlen=None, padding='post', pad_val=0.0, dim=1, return_len=False)[source]#

Pad a sequence of N-dimensional arrays into an (N+1)-dimensional array.

Parameters
  • seq (list of nd array) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If 'pre', pad at the start; otherwise pad at the end.

  • pad_val (float, optional (default=0.0)) – padding value.

  • dim (int, optional (default=1)) –

Returns

result

Return type

np.array

malaya_speech.utils.padding.tf_sequence_nd(seq, maxlen=None, padding='post', pad_val=0.0, dim=1, return_len=False)[source]#

Pad a sequence of N-dimensional arrays into an (N+1)-dimensional array.

Parameters
  • seq (list of nd array) –

  • maxlen (int, optional (default=None)) – If None, will calculate max length in the function.

  • padding (str, optional (default='post')) – If 'pre', pad at the start; otherwise pad at the end.

  • pad_val (float, optional (default=0.0)) – padding value.

  • dim (int, optional (default=1)) –

Returns

result

Return type

np.array

malaya_speech.utils.read#

malaya_speech.utils.read.resample(data, old_samplerate, new_samplerate)[source]#

Resample signal.

Parameters
  • data (np.array) –

  • old_samplerate (int) – old sample rate.

  • new_samplerate (int) – new sample rate.

Returns

result

Return type

data

malaya_speech.utils.read.load(file, sr=16000, scale=True)[source]#

Read sound file, any format supported by soundfile.read

Parameters
  • file (str) –

  • sr (int, (default=16000)) – new sample rate. If input sample rate is not same, will resample automatically.

  • scale (bool, (default=True)) – Scale to -1 and 1.

Returns

result

Return type

(y, sr)
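
Example (a minimal sketch; 'speech.wav' is a hypothetical file):

>>> from malaya_speech.utils.read import load, resample
>>> y, sr = load('speech.wav', sr=16000)
>>> y_8k = resample(y, old_samplerate=sr, new_samplerate=8000)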

malaya_speech.utils.split#

malaya_speech.utils.split.split_vad(frames, n=3, negative_threshold=0.1)[source]#

Split a sample into multiple samples based on n negative VAD frames.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • n (int, optional (default=3)) – number of negative VAD frames to allow in one subsample.

  • negative_threshold (float, optional (default = 0.1)) – If negative_threshold is 0.1, negative samples must be at least 0.1 second long.

Returns

result

Return type

List[Frame]

malaya_speech.utils.split.split_vad_duration(frames, max_duration=5.0, negative_threshold=0.1)[source]#

Split a sample into multiple samples based on the maximum duration of voice activities.

Parameters
  • frames (List[Tuple[Frame, label]]) –

  • max_duration (float, optional (default = 5.0)) – Maximum duration to assume one sample combined from voice activities.

  • negative_threshold (float, optional (default = 0.1)) – If negative_threshold is 0.1, negative samples must be at least 0.1 second long.

Returns

result

Return type

List[Frame]

malaya_speech.utils.subword#

malaya_speech.utils.subword.generate_tokenizer(strings, target_vocab_size=1024, max_subword_length=4, max_corpus_chars=None, reserved_tokens=None)[source]#

Build a subword dictionary.

malaya_speech.utils.subword.save(tokenizer, path)[source]#

Save subword dictionary to a text file.

malaya_speech.utils.subword.load(path)[source]#

Load text file into subword dictionary.

malaya_speech.utils.subword.encode(tokenizer, string, add_blank=False)[source]#

Encode string to integer representation using the tokenizer vocab.

Parameters
  • tokenizer (object) – tokenizer object

  • string (str) –

  • add_blank (bool, optional (default=False)) – add a BLANK token at the start of the encoded sequence; this is for transducer / transformer based models.

Returns

result

Return type

List[int]

malaya_speech.utils.subword.decode(tokenizer, ids)[source]#

Decode integer representation to string based on tokenizer vocab.

Parameters
  • tokenizer (object) – tokenizer object

  • ids (List[int]) –

Returns

result

Return type

str

malaya_speech.utils.subword.decode_multilanguage(tokenizers, ids)[source]#

Decode integer representation to string using list of tokenizer objects.

Parameters
  • tokenizers (List[object]) – List of tokenizer objects.

  • ids (List[int]) –

Returns

result

Return type

str

malaya_speech.utils.tf_featurization#

malaya_speech.age_detection#

malaya_speech.age_detection.available_model()[source]#

List available age detection deep models.

malaya_speech.age_detection.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load age detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.age_detection.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.diarization#

malaya_speech.diarization.speaker_similarity(vad_results, speaker_vector, similarity_threshold=0.8, norm_function=None, return_embedding=False)[source]#

Speaker diarization using L2-Norm similarity.

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_vector (callable) – speaker vector object.

  • similarity_threshold (float, optional (default=0.8)) – if the current voice activity sample is at least 80% similar, it is assumed to come from the same speaker.

  • norm_function (Callable, optional(default=None)) – normalize function for speaker vectors.

  • speaker_change_threshold (float, optional (default=0.5)) – one voice activity sample can contain more than one speaker; split it using this threshold.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.diarization.clustering(vad_results, speaker_vector, model, norm_function=<function l2_normalize>, log_distance_metric=None, return_embedding=False)[source]#

Speaker diarization using any clustering model.

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_vector (callable) – speaker vector object.

  • model (callable) – Any unsupervised clustering model. Required fit_predict or apply or predict method.

  • norm_function (Callable, optional(default=malaya_speech.utils.dist.l2_normalize)) – normalize function for speaker vectors.

  • log_distance_metric (str, optional (default=None)) – post distance norm in log scale metrics; this parameter is necessary for models that require a square array input. Common values are 'cosine' or 'angular'.

Returns

result

Return type

List[Tuple[Frame, label]]
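
Example (a minimal sketch; it assumes vad_results is VAD output in the documented List[Tuple[Frame, label]] format):

>>> import malaya_speech
>>> from malaya_speech.model.clustering import AgglomerativeClustering
>>> speaker_vector = malaya_speech.speaker_vector.deep_model(model='vggvox-v2')
>>> clustering = AgglomerativeClustering(min_clusters=2, max_clusters=10)
>>> result = malaya_speech.diarization.clustering(vad_results, speaker_vector, clustering)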

malaya_speech.emotion#

malaya_speech.emotion.available_model()[source]#

List available emotion detection deep models.

malaya_speech.emotion.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load emotion detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.emotion.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.force_alignment#

malaya_speech.force_alignment.available_transducer()[source]#

List available Encoder-Transducer Aligner models.

malaya_speech.force_alignment.available_ctc()[source]#

List available Encoder-CTC Aligner models.

malaya_speech.force_alignment.available_huggingface()[source]#

List available HuggingFace Malaya-Speech Aligner models.

malaya_speech.force_alignment.deep_transducer(model='conformer-transducer', quantized=False, **kwargs)[source]#

Load Encoder-Transducer Aligner model.

Parameters
  • model (str, optional (default='conformer-transducer')) – Check available models at malaya_speech.force_alignment.available_transducer().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.transducer.TransducerAligner class

malaya_speech.force_alignment.deep_ctc(model='hubert-conformer', quantized=False, **kwargs)[source]#

Load Encoder-CTC Aligner model.

Parameters
  • model (str, optional (default='hubert-conformer')) – Check available models at malaya_speech.force_alignment.available_ctc().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.wav2vec.Wav2Vec2_Aligner class

malaya_speech.force_alignment.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed')[source]#

Load Finetuned models from HuggingFace.

Parameters

model (str, optional (default='mesolitica/wav2vec2-xls-r-300m-mixed')) – Check available models at malaya_speech.force_alignment.available_huggingface().

Returns

result

Return type

malaya_speech.model.huggingface.CTC class

malaya_speech.gender#

malaya_speech.gender.available_model()[source]#

List available gender detection deep models.

malaya_speech.gender.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load gender detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.gender.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.language_detection#

malaya_speech.language_detection.available_model()[source]#

List available language detection deep models.

malaya_speech.language_detection.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load language detection deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.language_detection.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.language_model#

malaya_speech.language_model.available_kenlm()[source]#

List available KenLM Language Model.

malaya_speech.language_model.available_gpt2()[source]#

List available GPT2 Language Model.

malaya_speech.language_model.available_mlm()[source]#

List available MLM Language Model.

malaya_speech.language_model.kenlm(model='dump-combined', **kwargs)[source]#

Load KenLM language model.

Parameters

model (str, optional (default='dump-combined')) – Check available models at malaya_speech.language_model.available_kenlm().

Returns

result

Return type

str

malaya_speech.language_model.gpt2(model='mesolitica/gpt2-117m-bahasa-cased', force_check=True, **kwargs)[source]#

Load GPT2 language model.

Parameters
  • model (str, optional (default='mesolitica/gpt2-117m-bahasa-cased')) – Check available models at malaya_speech.language_model.available_gpt2().

  • force_check (bool, optional (default=True)) – Force check model one of malaya model. Set to False if you have your own huggingface model.

Returns

result

Return type

malaya.torch_model.gpt2_lm.LM class

malaya_speech.language_model.mlm(model='mesolitica/bert-base-standard-bahasa-cased', force_check=True, **kwargs)[source]#

Load Masked language model.

Parameters
  • model (str, optional (default='mesolitica/bert-base-standard-bahasa-cased')) – Check available models at malaya_speech.language_model.available_mlm().

  • force_check (bool, optional (default=True)) – Force check model one of malaya model. Set to False if you have your own huggingface model.

Returns

result

Return type

malaya_speech.torch_model.mask_lm.LM class

malaya_speech.multispeaker_separation#

malaya_speech.multispeaker_separation.available_deep_wav()[source]#

List available FastSep models trained on raw 8k wav.

malaya_speech.multispeaker_separation.deep_wav(model='fastsep-4', quantized=False, **kwargs)[source]#

Load FastSep model, trained on raw 8k wav using SISNR PIT loss.

Parameters
  • model (str, optional (default='fastsep-4')) – Check available models at malaya_speech.multispeaker_separation.available_deep_wav().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.tf.Split class

malaya_speech.noise_reduction#

malaya_speech.noise_reduction.available_model()[source]#

List available Noise Reduction deep learning models.

malaya_speech.noise_reduction.deep_model(model='resnet-unet', quantized=False, **kwargs)[source]#

Load Noise Reduction deep learning model.

Parameters
  • model (str, optional (default='resnet-unet')) – Check available models at malaya_speech.noise_reduction.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.tf.UNET_STFT class

malaya_speech.speaker_change#

malaya_speech.speaker_change.available_model()[source]#

List available speaker change deep models.

malaya_speech.speaker_change.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load speaker change deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_change.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_change.split_activities(vad_results, speaker_change_results, speaker_change_threshold=0.5, sr=16000, ignore_not_activity=True)[source]#

Split VAD results based on speaker change threshold; worst-case O(N^2).

Parameters
  • vad_results (List[Tuple[Frame, label]]) – results from VAD.

  • speaker_change_results (List[Tuple[Frame, float]], optional (default=None)) – results from speaker change module, must be float results.

  • speaker_change_threshold (float, optional (default=0.5)) – one voice activity sample can contain more than one speaker; split it using this threshold.

  • sr (int, optional (default=16000)) – sample rate; classification models in malaya-speech use 16k.

  • ignore_not_activity (bool, optional (default=True)) – If True, will ignore frames where the VAD result is False; otherwise will try to split them.

Returns

result

Return type

List[Tuple[Frame, label]]

malaya_speech.speaker_overlap#

malaya_speech.speaker_overlap.available_model()[source]#

List available speaker overlap deep models.

malaya_speech.speaker_overlap.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load speaker overlap deep model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_overlap.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speaker_vector#

malaya_speech.speaker_vector.available_model()[source]#

List available speaker vector deep models.

malaya_speech.speaker_vector.deep_model(model='vggvox-v2', quantized=False, **kwargs)[source]#

Load Speaker2Vec model.

Parameters
  • model (str, optional (default='vggvox-v2')) – Check available models at malaya_speech.speaker_vector.available_model().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function

malaya_speech.speech_enhancement#

malaya_speech.speech_enhancement.available_deep_masking()[source]#

List available Speech Enhancement STFT masking deep learning model.

malaya_speech.speech_enhancement.available_deep_enhance()[source]#

List available Speech Enhancement UNET Waveform sampling deep learning model.

malaya_speech.speech_enhancement.deep_masking(model='resnet-unet', quantized=False, **kwargs)[source]#

Load Speech Enhancement STFT UNET masking deep learning model.

Parameters
  • model (str, optional (default='resnet-unet')) – Check available models at malaya_speech.speech_enhancement.available_deep_masking().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.unet.UNETSTFT class

malaya_speech.speech_enhancement.deep_enhance(model='unet', quantized=False, **kwargs)[source]#

Load Speech Enhancement UNET Waveform sampling deep learning model.

Parameters
  • model (str, optional (default='unet')) – Check available models at malaya_speech.speech_enhancement.available_deep_enhance().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.unet.UNET1D class

malaya_speech.speechsplit_conversion#

malaya_speech.speechsplit_conversion.available_deep_conversion(f0_mode='pysptk')[source]#

List available Voice Conversion models.

Parameters

f0_mode (str, optional (default='pysptk')) –

F0 conversion supported. Allowed values:

malaya_speech.speechsplit_conversion.deep_conversion(model='fastspeechsplit-v2-vggvox-v2', f0_mode='pysptk', quantized=False, **kwargs)[source]#

Load Voice Conversion model.

Parameters
  • model (str, optional (default='fastspeechsplit-v2-vggvox-v2')) – Check available models at malaya_speech.speechsplit_conversion.available_deep_conversion(f0_mode = ‘{f0_mode}’)

  • f0_mode (str, optional (default='pysptk')) –

    F0 conversion supported. Allowed values:

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.splitter.FastSpeechSplit class

malaya_speech.stack#

malaya_speech.stack.classification_stack(models)[source]#

Stacking for classification models. All models should be from the same classification domain.

Parameters

models (List[Callable]) – list of models.

Returns

result

Return type

malaya_speech.stack.Stack class

malaya_speech.model.stack.Stack#

class malaya_speech.stack.Stack[source]#
predict_proba(inputs, aggregate=<function gmean>)[source]#

Stacking for predictive models, will return probability.

Parameters
  • inputs (List[np.array]) –

  • aggregate (Callable, optional (default=scipy.stats.mstats.gmean)) – aggregate function.

Returns

result

Return type

np.array

predict(inputs, aggregate=<function gmean>)[source]#

Stacking for predictive models, will return labels.

Parameters
  • inputs (List[np.array]) –

  • aggregate (Callable, optional (default=scipy.stats.mstats.gmean)) – aggregate function.

Returns

result

Return type

List[str]
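
Example (a minimal sketch; it assumes model_a and model_b are two loaded classification models from the same domain and y is a loaded audio signal):

>>> import malaya_speech
>>> stacked = malaya_speech.stack.classification_stack([model_a, model_b])
>>> stacked.predict_proba([y])  # aggregated probabilities
>>> stacked.predict([y])        # aggregated labels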

malaya_speech.stt#

malaya_speech.stt.available_ctc()[source]#

List available Encoder-CTC ASR models.

malaya_speech.stt.available_transducer()[source]#

List available Encoder-Transducer ASR models.

malaya_speech.stt.available_huggingface()[source]#

List available HuggingFace Malaya-Speech ASR models.

malaya_speech.stt.deep_ctc(model='hubert-conformer', quantized=False, **kwargs)[source]#

Load Encoder-CTC ASR model.

Parameters
  • model (str, optional (default='hubert-conformer')) – Check available models at malaya_speech.stt.available_ctc().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.wav2vec.Wav2Vec2_CTC class

malaya_speech.stt.deep_transducer(model='conformer', quantized=False, **kwargs)[source]#

Load Encoder-Transducer ASR model.

Parameters
  • model (str, optional (default='conformer')) – Check available models at malaya_speech.stt.available_transducer().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.transducer.Transducer class

malaya_speech.stt.huggingface(model='mesolitica/wav2vec2-xls-r-300m-mixed', **kwargs)[source]#

Load finetuned models from HuggingFace. Requires Tensorflow >= 2.0.

Parameters

model (str, optional (default='mesolitica/wav2vec2-xls-r-300m-mixed')) – Check available models at malaya_speech.stt.available_huggingface().

Returns

result

Return type

malaya_speech.model.huggingface.CTC class

malaya_speech.super_resolution#

malaya_speech.super_resolution.available_unet()[source]#

List available Super Resolution 4x deep learning UNET models.

malaya_speech.super_resolution.available_tfgan()[source]#

List available Super Resolution deep learning UNET + TFGAN Vocoder models.

malaya_speech.super_resolution.available_audio_diffusion()[source]#

List available Super Resolution audio diffusion models.

malaya_speech.super_resolution.unet(model='srgan-256', quantized=False, **kwargs)[source]#

Load Super Resolution 4x deep learning UNET model.

Parameters
  • model (str, optional (default='srgan-256')) – Check available models at malaya_speech.super_resolution.available_unet().

  • quantized (bool, optional (default=False)) – if True, will load 8-bit quantized model. Quantized model is not necessarily faster; it totally depends on the machine.

Returns

result

Return type

malaya_speech.model.tf.UNET1D class

malaya_speech.super_resolution.tfgan(model='voicefixer', **kwargs)[source]#

Load TFGAN based Speech Resolution.

Parameters

model (str, optional (default='voicefixer')) – Check available models at malaya_speech.super_resolution.available_tfgan().

Returns

result

Return type

malaya_speech.torch_model.super_resolution.VoiceFixer

malaya_speech.super_resolution.audio_diffusion(model='nuwave2', **kwargs)[source]#

Load audio diffusion based Speech Resolution.

Parameters

model (str, optional (default='nuwave2')) – Check available models at malaya_speech.super_resolution.available_audio_diffusion().

Returns

result

Return type

malaya_speech.torch_model.super_resolution.NuWave2
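
A sketch for the diffusion-based loader, under the same assumptions as the UNET example (placeholder path, assumed predict entry point).

```python
import malaya_speech

model = malaya_speech.super_resolution.audio_diffusion(model='nuwave2')
y, sr = malaya_speech.load('speech/low-rate.wav')  # placeholder path

# assumed inference entry point; diffusion sampling is iterative, so expect
# noticeably slower inference than the UNET models, especially on CPU.
upsampled = model.predict(y)
```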

malaya_speech.tts#

malaya_speech.tts.available_tacotron2()[source]#

List available Tacotron2, Text to Mel models.

malaya_speech.tts.available_fastspeech2()[source]#

List available FastSpeech2, Text to Mel models.

malaya_speech.tts.available_fastpitch()[source]#

List available FastPitch, Text to Mel models.

malaya_speech.tts.available_glowtts()[source]#

List available GlowTTS, Text to Mel models.

malaya_speech.tts.available_lightspeech()[source]#

List available LightSpeech, Text to Mel models.

malaya_speech.tts.available_vits()[source]#

List available VITS, End-to-End models.

malaya_speech.tts.available_e2e_fastspeech2()[source]#

List available FastSpeech2, End-to-End models.

malaya_speech.tts.load_text_ids(pad_to=8, understand_punct=True, is_lower=True, **kwargs)[source]#

Load the text normalizer module used by Malaya-Speech TTS.

malaya_speech.tts.tacotron2(model='yasmin', quantized=False, pad_to=8, **kwargs)[source]#

Load Tacotron2 Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='yasmin')) – Check available models at malaya_speech.tts.available_tacotron2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Tacotron class
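
A usage sketch for a Text-to-Mel loader. The predict call and the mel key name are assumptions about how the synthesis objects expose their outputs; inspect the returned dict on your installed version.

```python
import malaya_speech

model = malaya_speech.tts.tacotron2(model='yasmin')

# assumed entry point: Text-to-Mel models take a string and return a dict of
# outputs; the exact keys vary by model, so inspect r.keys() on your version.
r = model.predict('nama saya husein bin zolkepli')

# an assumed mel key; the mel spectrogram is then fed to a vocoder
# (see malaya_speech.vocoder below).
mel = r['mel-output']
```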

malaya_speech.tts.fastspeech2(model='osman', quantized=False, pad_to=8, **kwargs)[source]#

Load Fastspeech2 Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='osman')) – Check available models at malaya_speech.tts.available_fastspeech2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastspeech class

malaya_speech.tts.fastpitch(model='male', quantized=False, pad_to=8, **kwargs)[source]#

Load FastPitch Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='male')) – Check available models at malaya_speech.tts.available_fastpitch().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastpitch class

malaya_speech.tts.glowtts(model='yasmin', quantized=False, pad_to=2, **kwargs)[source]#

Load GlowTTS Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='yasmin')) – Check available models at malaya_speech.tts.available_glowtts().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=2)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 2.

Returns

result

Return type

malaya_speech.model.synthesis.GlowTTS class

malaya_speech.tts.lightspeech(model='male', quantized=False, pad_to=8, **kwargs)[source]#

Load LightSpeech Text-to-Mel TTS model.

Parameters
  • model (str, optional (default='male')) – Check available models at malaya_speech.tts.available_lightspeech().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.Fastspeech class

malaya_speech.tts.vits(model='mesolitica/VITS-osman', **kwargs)[source]#

Load VITS End-to-End TTS model.

Parameters

model (str, optional (default='mesolitica/VITS-osman')) – Check available models at malaya_speech.tts.available_vits().

Returns

result

Return type

malaya_speech.torch_model.synthesis.VITS class
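
A sketch for the end-to-end VITS loader. The predict call and the 'y' waveform key are assumptions; the point of the example is only that no separate vocoder step is expected for end-to-end models.

```python
import malaya_speech

model = malaya_speech.tts.vits(model='mesolitica/VITS-osman')

# assumed entry point: because VITS is end-to-end, the returned dict is
# expected to already contain a waveform; the key name 'y' is an assumption.
r = model.predict('selamat pagi, apa khabar?')
audio = r['y']
```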

malaya_speech.tts.e2e_fastspeech2(model='osman', quantized=False, pad_to=8, **kwargs)[source]#

Load FastSpeech2 End-to-End TTS model.

Parameters
  • model (str, optional (default='osman')) – Check available models at malaya_speech.tts.available_e2e_fastspeech2().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

  • pad_to (int, optional (default=8)) – pad the input character sequence with 0 up to a multiple of this size. Increasing it can stabilize predictions on short sentences; we trained with 8.

Returns

result

Return type

malaya_speech.model.synthesis.E2E_FastSpeech class

malaya_speech.vad#

malaya_speech.vad.available_model()[source]#

List available VAD deep models.

malaya_speech.vad.webrtc(aggressiveness=3, sample_rate=16000, minimum_amplitude=100)[source]#

Load WebRTC VAD model.

Parameters
  • aggressiveness (int, optional (default=3)) – an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.

  • sample_rate (int, optional (default=16000)) – sample rate of the input samples.

  • minimum_amplitude (int, optional (default=100)) – a sample must have abs(amplitude) above minimum_amplitude to be considered voice activity; otherwise it is automatically labelled False.

Returns

result

Return type

malaya_speech.model.webrtc.WebRTC class
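
A usage sketch for the WebRTC VAD. The audio path, the frame generator call, and the per-frame boolean call are assumptions about how the pieces fit together; the frame length and any dtype conversion should be checked against your version.

```python
import malaya_speech

vad = malaya_speech.vad.webrtc(aggressiveness=3, sample_rate=16000)
y, sr = malaya_speech.load('speech/example.wav')  # placeholder path

# assumed frame generator: split the waveform into 30 ms frames, then call the
# WebRTC object per frame to get a boolean. WebRTC VAD expects 16-bit PCM, so
# a dtype conversion may be required depending on how the audio was loaded.
frames = malaya_speech.generator.frames(y, 30, sr)
preds = [(frame, vad(frame)) for frame in frames]

# the (Frame, bool) pairs can be visualised with
# malaya_speech.extra.visualization.visualize_vad(y, preds, sample_rate=sr).
```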

malaya_speech.vad.deep_model(model='marblenet-factor1', quantized=False, **kwargs)[source]#

Load VAD model.

Parameters
  • model (str, optional (default='marblenet-factor1')) – Check available models at malaya_speech.vad.available_model().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.supervised.classification.load function
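
A sketch for the deep VAD loader, under the same assumptions as the WebRTC example above (placeholder path, assumed frame generator, and an assumed batch predict following the classification interface documented earlier).

```python
import malaya_speech

model = malaya_speech.vad.deep_model(model='marblenet-factor1')
y, sr = malaya_speech.load('speech/example.wav')  # placeholder path

# assumed: frames are generated the same way as in the WebRTC sketch, and the
# deep model predicts a label per frame over the whole batch.
frames = malaya_speech.generator.frames(y, 30, sr)
preds = list(zip(frames, model.predict(frames)))
```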

malaya_speech.vocoder#

malaya_speech.vocoder.available_melgan()[source]#

List available MelGAN Mel-to-Speech models.

malaya_speech.vocoder.available_mbmelgan()[source]#

List available Multiband MelGAN Mel-to-Speech models.

malaya_speech.vocoder.available_hifigan()[source]#

List available HiFiGAN Mel-to-Speech models.

malaya_speech.vocoder.melgan(model='universal-1024', quantized=False, **kwargs)[source]#

Load MelGAN Vocoder model.

Parameters
  • model (str, optional (default='universal-1024')) – Check available models at malaya_speech.vocoder.available_melgan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class

malaya_speech.vocoder.mbmelgan(model='female', quantized=False, **kwargs)[source]#

Load Multiband MelGAN Vocoder model.

Parameters
  • model (str, optional (default='female')) – Check available models at malaya_speech.vocoder.available_mbmelgan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class

malaya_speech.vocoder.hifigan(model='universal-768', quantized=False, **kwargs)[source]#

Load HiFiGAN Vocoder model.

Parameters
  • model (str, optional (default='universal-768')) – Check available models at malaya_speech.vocoder.available_hifigan().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.Vocoder class
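
A sketch of the usual Text-to-Mel plus vocoder pipeline. The TTS predict call, the mel key name, and the vocoder predict call are assumptions about how the two objects are wired together; verify the dict keys returned by your TTS model.

```python
import malaya_speech

tts = malaya_speech.tts.fastspeech2(model='osman')
vocoder = malaya_speech.vocoder.hifigan(model='universal-768')

# assumed pipeline: the TTS predict output carries a mel spectrogram (the key
# name below is an assumption), and the Vocoder object converts a batch of
# mels into waveforms via predict.
r = tts.predict('selamat pagi')
audio = vocoder.predict([r['universal-output']])[0]
```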

malaya_speech.voice_conversion#

malaya_speech.voice_conversion.available_deep_conversion()[source]#

List available Voice Conversion models.

malaya_speech.voice_conversion.deep_conversion(model='fastvc-32-vggvox-v2', quantized=False, **kwargs)[source]#

Load Voice Conversion model.

Parameters
  • model (str, optional (default='fastvc-32-vggvox-v2')) – Check available models at malaya_speech.voice_conversion.available_deep_conversion().

  • quantized (bool, optional (default=False)) – if True, will load an 8-bit quantized model. A quantized model is not necessarily faster; it depends entirely on the machine.

Returns

result

Return type

malaya_speech.model.synthesis.FastVC class
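
A usage sketch for voice conversion. The audio paths are placeholders and the predict(original, target) call is an assumption about the FastVC interface; the converted output is expected to be a mel that still needs a vocoder.

```python
import malaya_speech

model = malaya_speech.voice_conversion.deep_conversion(model='fastvc-32-vggvox-v2')

# placeholder paths: `original` carries the speech content, `target` carries
# the desired speaker identity.
original, sr = malaya_speech.load('speech/source-speaker.wav')
target, _ = malaya_speech.load('speech/target-speaker.wav')

# assumed entry point: FastVC returns a dict whose mel output would then be
# passed to a vocoder (see malaya_speech.vocoder above).
r = model.predict(original, target)
```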