Speaker count Detection#

This tutorial is available as an IPython notebook at malaya-speech/example/speaker-count.

This module is language independent, so it is safe to use with different languages.

This is an application of the malaya-speech Pipeline; read more about the malaya-speech Pipeline at malaya-speech/example/pipeline.


Trained on Musan Speech, VCTK, LibriSpeech, Mandarin speakers and the Malaya-Speech TTS dataset to detect the number of speakers.

import os

# Set CUDA_VISIBLE_DEVICES to an empty string before malaya_speech is imported.
# NOTE(review): presumably this keeps the example on CPU — confirm if GPU use is wanted.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
import malaya_speech
import numpy as np
from malaya_speech import Pipeline
# Load the first example clip; `y` holds the audio samples and `sr` the sample
# rate, both as returned by the loader.
y, sr = malaya_speech.load('speech/vctk/p300_298_mic1.flac')
# Clip duration in seconds (number of samples divided by the sample rate).
len(y) / sr
from datasets import Audio

# Load the second example clip; `y2` holds the audio samples and `sr` is
# rebound to the sample rate returned by the loader for this file.
y2, sr = malaya_speech.load('speech/podcast/toodia.mp3')

# Clip duration in seconds. Divide by the returned sample rate rather than a
# hard-coded 16000 so the value stays correct if the loader's rate changes,
# and so this matches how the first clip's duration is computed.
len(y2) / sr
import IPython.display as ipd
# Build notebook audio players for both clips, using the sample rate in `sr`.
# NOTE(review): in a single notebook cell only the last expression is rendered —
# confirm these were separate cells in the original notebook.
ipd.Audio(y, rate = sr)
ipd.Audio(y2, rate = sr)

List available Nemo models#

original from Size (MB)
huseinzol05/nemo-is-clean-speakernet https://catalog.ngc.nvidia.com/orgs/nvidia/tea... 16.2
huseinzol05/nemo-is-clean-titanet_large https://catalog.ngc.nvidia.com/orgs/nvidia/tea... 88.8

Load Nemo model#

def nemo(
    model: str = 'huseinzol05/nemo-speaker-count-speakernet',
    Load Nvidia Nemo speaker count model.
    Trained on 300 ms frames.

    model : str, optional (default='huseinzol05/nemo-speaker-count-speakernet')
        Check available models at `malaya_speech.speaker_count.available_nemo()`.

    result : malaya_speech.torch_model.nemo.Classification class
# Load the speaker count model, explicitly selecting the
# 'huseinzol05/nemo-speaker-count-titanet_large' checkpoint by name.
model = malaya_speech.speaker_count.nemo(model = 'huseinzol05/nemo-speaker-count-titanet_large')
# Call eval() on the loaded model; the return value is bound to `_` and discarded.
# NOTE(review): presumably this switches the underlying torch model to inference mode — confirm.
_ = model.eval()

How to use Speaker Count detection#

We fine-tuned the Nemo models on 300 ms frames.

# Split the first clip into frames of length 300 for the speaker count model.
# NOTE(review): the meaning of the fourth positional argument (False) is not
# visible in this file — confirm against the `frames` helper.
frames = list(malaya_speech.utils.generator.frames(y, 300, sr, False))

# Pair every frame with the first element of the model's prediction for a
# one-frame batch.
probs = [(frame, model.predict([frame])[0]) for frame in frames]
# Load a voice activity detection model by name.
vad = malaya_speech.vad.deep_model(model = 'vggvox-v2')
# Re-split the same clip into frames of length 30; this rebinds `frames`,
# while `probs` above keeps the earlier, longer frames.
frames = list(malaya_speech.utils.generator.frames(y, 30, sr))
p = Pipeline()
# NOTE(review): the pipeline definition is cut off in this view; the stages
# chained onto `p` are not visible here.
pipeline = (
# Run the pipeline on the frames and collect its outputs.
result = p.emit(frames)
# NOTE(review): the next line looks like captured notebook output (the keys of
# `result`) rather than a statement.
dict_keys(['batching', 'predict', 'flatten'])
# Pair each frame with the value at the same index in result['flatten'].
frames_vad = [(frame, result['flatten'][no]) for no, frame in enumerate(frames)]
# Group the per-frame results, apply a threshold-based regrouping, then group
# again.
# NOTE(review): the exact grouping semantics and the meaning of
# threshold_to_stop = 0.3 are not shown in this file — confirm against
# malaya_speech.utils.group.
grouped_vad = malaya_speech.utils.group.group_frames(frames_vad)
grouped_vad = malaya_speech.utils.group.group_frames_threshold(grouped_vad, threshold_to_stop = 0.3)
grouped_vad = malaya_speech.utils.group.group_frames(grouped_vad)
import matplotlib.pyplot as plt
import seaborn as sns
# Two stacked subplots; the figure height scales with the number of rows.
nrows = 2
fig, ax = plt.subplots(nrows = nrows, ncols = 1)
fig.set_figheight(nrows * 3)
# Top axis: the grouped VAD result drawn against the clip.
malaya_speech.extra.visualization.visualize_vad(y, grouped_vad, sr, ax = ax[0])
# Bottom axis: the per-frame predictions, labelled 'speaker count'.
malaya_speech.extra.visualization.plot_classification(probs, 'speaker count',
                                                      yaxis = True, ax = ax[1])
# Repeat the same steps for the second clip (`y2`): split it into frames of
# length 300 for the speaker count model.
# NOTE(review): the meaning of the fourth positional argument (False) is not
# visible in this file — confirm against the `frames` helper.
frames = list(malaya_speech.utils.generator.frames(y2, 300, sr, False))

# Pair every frame with the first element of the model's prediction for a
# one-frame batch.
probs = [(frame, model.predict([frame])[0]) for frame in frames]
# Load a voice activity detection model by name.
vad = malaya_speech.vad.deep_model(model = 'vggvox-v2')
# Re-split the clip into frames of length 30; this rebinds `frames`, while
# `probs` above keeps the earlier, longer frames.
frames = list(malaya_speech.utils.generator.frames(y2, 30, sr))
p = Pipeline()
# NOTE(review): the pipeline definition is cut off in this view; the stages
# chained onto `p` are not visible here.
pipeline = (
# Run the pipeline on the frames and collect its outputs.
result = p.emit(frames)
# NOTE(review): the next line looks like captured notebook output (the keys of
# `result`) rather than a statement.
dict_keys(['batching', 'predict', 'flatten'])
# Pair each frame with the value at the same index in result['flatten'].
frames_vad = [(frame, result['flatten'][no]) for no, frame in enumerate(frames)]
# Group the per-frame results, apply a threshold-based regrouping, then group
# again.
# NOTE(review): the exact grouping semantics and the meaning of
# threshold_to_stop = 0.3 are not shown in this file — confirm against
# malaya_speech.utils.group.
grouped_vad = malaya_speech.utils.group.group_frames(frames_vad)
grouped_vad = malaya_speech.utils.group.group_frames_threshold(grouped_vad, threshold_to_stop = 0.3)
grouped_vad = malaya_speech.utils.group.group_frames(grouped_vad)
# Two stacked subplots; the figure height scales with the number of rows.
nrows = 2
fig, ax = plt.subplots(nrows = nrows, ncols = 1)
fig.set_figheight(nrows * 3)
# Top axis: the grouped VAD result drawn against the clip.
malaya_speech.extra.visualization.visualize_vad(y2, grouped_vad, sr, ax = ax[0])
# Bottom axis: the per-frame predictions, labelled 'speaker count'.
malaya_speech.extra.visualization.plot_classification(probs, 'speaker count',
                                                      yaxis = True, ax = ax[1])
# Play back the samples from index 20 * sr up to 30 * sr, i.e. seconds 20 to 30
# of the clip at `sr` samples per second.
ipd.Audio(y2[20 * sr: 30 * sr], rate = sr)