Classification stacking#

This tutorial is available as an IPython notebook at malaya-speech/example/classification-stacking.

This module is language independent, so it is safe to use on different languages. The pretrained models were trained on multiple languages.

This is an application of the malaya-speech Pipeline; read more about the malaya-speech Pipeline at malaya-speech/example/pipeline.

Why Stacking?#

Sometimes a single model is not good enough, so you need to combine multiple models to get a better result. This is called stacking.

In this example, I am going to use the gender detection module.

import malaya_speech
import numpy as np
from malaya_speech import Pipeline
y, sr = malaya_speech.load('speech/video/The-Singaporean-White-Boy.wav')
len(y), sr
(1634237, 16000)
# just going to take 30 seconds
y = y[:sr * 30]
import IPython.display as ipd
ipd.Audio(y, rate = sr)

Supported genders#

['male', 'female', 'not a gender']

List available deep model#

INFO:root:last accuracy during training session before early stopping.
Size (MB) Quantized Size (MB) Accuracy
vggvox-v2 31.1 7.92 0.9756
deep-speaker 96.9 24.40 0.9455

Load deep model#

def deep_model(model: str = 'vggvox-v2', quantized: bool = False, **kwargs):
    Load gender detection deep model.

    model : str, optional (default='vggvox-v2')
        Model architecture supported. Allowed values:

        * ``'vggvox-v2'`` - finetuned VGGVox V2.
        * ``'deep-speaker'`` - finetuned Deep Speaker.
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        A quantized model is not necessarily faster; it totally depends on the machine.

    result : malaya_speech.supervised.classification.load function
vggvox_v2 = malaya_speech.gender.deep_model(model = 'vggvox-v2')
deep_speaker = malaya_speech.gender.deep_model(model = 'deep-speaker')

How to classify genders in an audio sample#

So we are going to use VAD to help us. Instead of classifying the audio as one whole sample, we chunk it into multiple small samples and classify each of them.

vad = malaya_speech.vad.deep_model(model = 'vggvox-v2')

frames = list(malaya_speech.utils.generator.frames(y, 30, sr))
CPU times: user 1.08 ms, sys: 66 µs, total: 1.14 ms
Wall time: 1.15 ms
p = Pipeline()
pipeline = (

result = p.emit(frames)
/Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/librosa/core/ UserWarning: n_fft=512 is too small for input signal of length=480
  n_fft, y.shape[-1]
CPU times: user 32.6 s, sys: 6.28 s, total: 38.9 s
Wall time: 9 s
dict_keys(['batching', 'predict', 'flatten'])
frames_vad = [(frame, result['flatten'][no]) for no, frame in enumerate(frames)]
grouped_vad =
grouped_vad =, threshold_to_stop = 0.3)
malaya_speech.extra.visualization.visualize_vad(y, grouped_vad, sr, figsize = (15, 3))
p_vggvox_v2 = Pipeline()
pipeline = (
p_deep_speaker = Pipeline()
pipeline = (

Stacking interface#

def classification_stack(models):
    Stacking for classification models. All models should be in the same domain classification.

    models: List[Callable]
        list of models.

    result: malaya_speech.stack.Stack class
def predict_proba(self, inputs, aggregate: Callable = gmean):
    Stacking for predictive models, will return probability.

    inputs: List[np.array]
    aggregate : Callable, optional (default=scipy.stats.mstats.gmean)
    Aggregate function.

    result: np.array
def predict(self, inputs, aggregate: Callable = gmean):
    Stacking for predictive models, will return labels.

    inputs: List[np.array]
    aggregate : Callable, optional (default=scipy.stats.mstats.gmean)
    Aggregate function.

    result: List[str]

By default, the aggregate function for stacking is scipy.stats.mstats.gmean.

gender_stack = malaya_speech.stack.classification_stack([vggvox_v2, vggvox_v2, deep_speaker])
p_stacking = Pipeline()
pipeline = (

samples_vad = [g[0] for g in grouped_vad]
result_vggvox_v2 = p_vggvox_v2.emit(samples_vad)
CPU times: user 5.38 s, sys: 1.3 s, total: 6.68 s
Wall time: 2.3 s
dict_keys(['gender', 'flatten'])

samples_vad = [g[0] for g in grouped_vad]
result_deep_speaker = p_deep_speaker.emit(samples_vad)
CPU times: user 4.14 s, sys: 514 ms, total: 4.65 s
Wall time: 851 ms
dict_keys(['gender', 'flatten'])

samples_vad = [g[0] for g in grouped_vad]
result_stacking = p_stacking.emit(samples_vad)
CPU times: user 14.9 s, sys: 2.82 s, total: 17.7 s
Wall time: 3.14 s
dict_keys(['gender', 'flatten'])
samples_vad_vggvox_v2 = [(frame, result_vggvox_v2['flatten'][no]) for no, frame in enumerate(samples_vad)]
[(<malaya_speech.model.frame.Frame at 0x14d50fed0>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d526110>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d5260d0>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526190>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d5261d0>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526250>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526290>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d5262d0>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526350>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526310>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526210>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526390>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d5263d0>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526410>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526450>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d5264d0>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526490>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526550>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526590>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526510>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d526610>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d5265d0>, 'female')]
samples_vad_deep_speaker = [(frame, result_deep_speaker['flatten'][no]) for no, frame in enumerate(samples_vad)]
[(<malaya_speech.model.frame.Frame at 0x14d50fed0>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d526110>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d5260d0>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526190>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d5261d0>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526250>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526290>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d5262d0>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526350>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526310>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526210>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526390>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d5263d0>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526410>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526450>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d5264d0>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526490>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526550>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526590>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526510>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d526610>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d5265d0>, 'female')]
samples_vad_stacking = [(frame, result_stacking['flatten'][no]) for no, frame in enumerate(samples_vad)]
[(<malaya_speech.model.frame.Frame at 0x14d50fed0>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d526110>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d5260d0>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526190>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d5261d0>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526250>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526290>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d5262d0>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526350>, 'male'),
 (<malaya_speech.model.frame.Frame at 0x14d526310>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526210>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526390>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d5263d0>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526410>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526450>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d5264d0>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526490>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526550>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526590>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d526510>, 'not a gender'),
 (<malaya_speech.model.frame.Frame at 0x14d526610>, 'female'),
 (<malaya_speech.model.frame.Frame at 0x14d5265d0>, 'female')]
import matplotlib.pyplot as plt
[ ]:
nrows = 4
fig, ax = plt.subplots(nrows = nrows, ncols = 1)
fig.set_figheight(nrows * 3)
malaya_speech.extra.visualization.visualize_vad(y, grouped_vad, sr, ax = ax[0])
                                                      'emotion detection vggvox v2', ax = ax[1])
                                                      'emotion detection deep speaker', ax = ax[2])
                                                      'emotion detection stacking', ax = ax[3])