WAV separation#

This tutorial is available as an IPython notebook at malaya-speech/example/multispeaker-separation-wav.

This module is language independent, so it is safe to use on different languages.

[1]:
import malaya_speech
import numpy as np
from malaya_speech import Pipeline
import matplotlib.pyplot as plt
import IPython.display as ipd

Multispeaker separation description#

  1. FastSep-WAV is only able to separate audio at an 8k sample rate (see the resampling sketch after this list).

  2. FastSep-WAV is trained to separate 4 unique speakers.

  3. Trained on mixed speakers from the VCTK, Nepali, Mandarin and Malay datasets.
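
Because the model only works at 8k, load or resample your audio to 8000 Hz before separation. Below is a minimal sketch using malaya_speech.load with the sr parameter; the file path is a placeholder:

import malaya_speech

# hypothetical path, replace with your own mixed recording
mixed_audio, sr = malaya_speech.load('my-mixed-recording.wav', sr = 8000)
# `mixed_audio` is now a float np.array resampled to 8000 Hz,
# ready to feed into FastSep-WAV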

List available FastSep-WAV#

[2]:
malaya_speech.multispeaker_separation.available_deep_wav()
INFO:root:Tested on 1k samples
[2]:
            Size (MB)   Quantized Size (MB)   SISNR PIT
fastsep-2   78.7        20.5                  14.156882
fastsep-4   155.0       40.2                  19.682500

Load model#

def deep_wav(model: str = 'fastsep-4', quantized: bool = False, **kwargs):
    """
    Load FastSep model, trained on raw 8k wav using SISNR PIT loss.

    Parameters
    ----------
    model : str, optional (default='fastsep-4')
        Model architecture supported. Allowed values:

        * ``'fastsep-4'`` - FastSep 4 layers trained on raw 8k wav.
        * ``'fastsep-2'`` - FastSep 2 layers trained on raw 8k wav.
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model is not necessarily faster; it totally depends on the machine.

    Returns
    -------
    result : malaya_speech.model.tf.Split class
    """
[3]:
model = malaya_speech.multispeaker_separation.deep_wav('fastsep-4')

Load quantized model#

[4]:
quantized_model = malaya_speech.multispeaker_separation.deep_wav('fastsep-4', quantized = True)
WARNING:root:Load quantized model will cause accuracy drop.

Generate random mixed audio#

[5]:
import random
import malaya_speech.augmentation.waveform as augmentation

sr = 8000
speakers_size = 4

def read_wav(f):
    return malaya_speech.load(f, sr = sr)


def random_sampling(s, length):
    return augmentation.random_sampling(s, sr = sr, length = length)

def combine_speakers(files, n = 5, limit = 4):
    # randomly pick `n` files and load them at the 8k sample rate
    w_samples = random.sample(files, n)
    w_samples = [read_wav(f)[0] for f in w_samples]
    # crop every speaker to the same random length (in ms),
    # bounded by the shortest sample
    w_lens = [len(w) / sr for w in w_samples]
    w_lens = int(min(min(w_lens) * 1000, random.randint(3000, 7000)))
    w_samples = [random_sampling(w, length = w_lens) for w in w_samples]
    y = [w_samples[0]]
    left = w_samples[0].copy()

    combined = None

    for i in range(1, n):
        # overlap each new speaker almost completely with the running mixture
        right = w_samples[i].copy()
        overlap = random.uniform(0.98, 1.0)
        len_overlap = int(overlap * len(right))
        minus = len(left) - len_overlap
        if minus < 0:
            minus = 0
        padded_right = np.pad(right, (minus, 0))
        left = np.pad(left, (0, len(padded_right) - len(left)))

        left = left + padded_right

        # speakers beyond `limit` are merged into a single extra source,
        # so at most `limit` sources are returned
        if i >= (limit - 1):
            if combined is None:
                combined = padded_right
            else:
                combined = np.pad(
                    combined, (0, len(padded_right) - len(combined))
                )
                combined += padded_right

        else:
            y.append(padded_right)

    if combined is not None:
        y.append(combined)

    # pad every source to the mixture length and rescale mixture and sources
    # so the largest tracked peak becomes 0.9
    maxs = [max(left)]
    for i in range(len(y)):
        if len(y[i]) != len(left):
            y[i] = np.pad(y[i], (0, len(left) - len(y[i])))
            maxs.append(max(y[i]))

    max_amp = max(maxs)
    mix_scaling = 1 / max_amp * 0.9
    left = left * mix_scaling

    for i in range(len(y)):
        y[i] = y[i] * mix_scaling

    return left, y
[6]:
from glob import glob

wavs = glob('speech/example-speaker/*.wav')
wavs.extend(glob('speech/vctk/*.flac'))
len(wavs)
[6]:
23
[18]:
left, y = combine_speakers(wavs, speakers_size)
len(left) / sr, len(y)
[18]:
(3.45525, 4)
[19]:
ipd.Audio(left, rate = sr)
[19]:
[20]:
plt.plot(left, label = 'mixed')
plt.plot(y[0], label = 'y0')
plt.plot(y[1], label = 'y1')
plt.plot(y[2], label = 'y2')
plt.plot(y[3], label = 'y3')
plt.legend()
plt.show()
_images/multispeaker-separation-wav_16_0.png
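
If you want to keep the generated mixture for later experiments, you can write it to disk. A minimal sketch, assuming the soundfile package is installed (it is not used elsewhere in this tutorial) and a hypothetical output path:

import soundfile as sf

# writes the 8 kHz mixture `left` to a wav file
sf.write('mixed-4-speakers.wav', left, sr)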

Predict#

def predict(self, input):
    """
    Split an audio into 4 different speakers.

    Parameters
    ----------
    input: np.array or malaya_speech.model.frame.Frame

    Returns
    -------
    result: np.array
    """
[21]:
%%time

y = model.predict(left)
y.shape
CPU times: user 20.5 s, sys: 1.68 s, total: 22.2 s
Wall time: 3.51 s
[21]:
(4, 27642)
[22]:
%%time

quantized_y = quantized_model.predict(left)
quantized_y.shape
CPU times: user 20.8 s, sys: 1.73 s, total: 22.5 s
Wall time: 3.45 s
[22]:
(4, 27642)
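
The table above benchmarks the models with SISNR PIT, so one way to sanity-check the separation is to score the predicted sources against the ground truth under the best speaker permutation. A minimal NumPy sketch; the si_snr and best_permutation_si_snr helpers are hypothetical, and it assumes you kept a copy of the ground-truth list from combine_speakers (called sources here) before it was overwritten by model.predict:

from itertools import permutations

def si_snr(est, ref, eps = 1e-8):
    # scale-invariant SNR (dB) between one estimated and one reference source
    est = est - est.mean()
    ref = ref - ref.mean()
    proj = np.dot(est, ref) / (np.dot(ref, ref) + eps) * ref
    noise = est - proj
    return 10 * np.log10((np.sum(proj ** 2) + eps) / (np.sum(noise ** 2) + eps))

def best_permutation_si_snr(estimates, references):
    # permutation-invariant mean SI-SNR: try every speaker assignment, keep the best
    scores = []
    for perm in permutations(range(len(references))):
        scores.append(
            np.mean([si_snr(estimates[i], references[p]) for i, p in enumerate(perm)])
        )
    return max(scores)

# assuming `sources` holds a copy of the ground-truth list, e.g. taken before prediction:
# sources = [s.copy() for s in y]
# best_permutation_si_snr(y, [np.asarray(s) for s in sources])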

Results#

[23]:
ipd.Audio(y[0], rate = sr)
[23]:
[24]:
ipd.Audio(y[1], rate = sr)
[24]:
[25]:
ipd.Audio(y[2], rate = sr)
[25]:
[26]:
ipd.Audio(y[3], rate = sr)
[26]:

Quantized results#

[27]:
ipd.Audio(quantized_y[0], rate = sr)
[27]:
[28]:
ipd.Audio(quantized_y[1], rate = sr)
[28]:
[29]:
ipd.Audio(quantized_y[2], rate = sr)
[29]:
[30]:
ipd.Audio(quantized_y[3], rate = sr)
[30]: