Remove silence using VAD

Removing silence is actually pretty hard. The traditional approach uses a fixed dB threshold: if the level inside a window of a certain size falls below the threshold, that window is assumed to be silence. But a threshold that works for one audio sample, say -20 dB, will not necessarily work for other samples.

This tutorial is available as an IPython notebook at malaya-speech/example/remove-silents-vad.

This module is language independent, so it is safe to use on different languages.

This is an application of malaya-speech Pipeline, read more about malaya-speech Pipeline at malaya-speech/example/pipeline.

[1]:
import malaya_speech
import numpy as np
import librosa
from malaya_speech import Pipeline
[2]:
def norm_mel(y, sr):
    """
    Compute a log10-scaled mel spectrogram of a waveform.

    Parameters
    ----------
    y : audio samples, forwarded to ``librosa.feature.melspectrogram``.
    sr : sampling rate of ``y``.

    Returns
    -------
    The transposed log10 mel spectrogram computed with 80 mel bands.
    """
    # Pass the audio by keyword: librosa >= 0.10 made the arguments of
    # melspectrogram keyword-only, and `y = y` also works on older versions.
    mel = librosa.feature.melspectrogram(y = y, sr = sr, n_mels = 80)
    # Floor at 1e-10 so all-zero bins do not turn into log10(0) = -inf.
    return np.log10(np.maximum(mel, 1e-10)).T

def plot(y, sr):
    """
    Display the waveform of ``y`` and, below it, its log-mel spectrogram.

    Parameters
    ----------
    y : audio samples to draw.
    sr : sampling rate of ``y``, forwarded to ``norm_mel``.
    """
    log_mel = norm_mel(y, sr)
    # Two stacked axes: waveform on top, spectrogram underneath.
    fig, (wave_ax, mel_ax) = plt.subplots(2, figsize=(10, 8))
    wave_ax.plot(y)
    image = mel_ax.imshow(
        np.rot90(log_mel),
        aspect='auto',
        interpolation='none',
    )
    fig.colorbar(
        mappable=image,
        shrink=0.65,
        orientation='horizontal',
        ax=mel_ax,
    )
    plt.show()

Load easy example

[3]:
# Load the sample audio; `len(y) / sr` (samples / sample rate) is its duration in seconds.
y, sr = malaya_speech.load('speech/podcast/nusantara.wav')
len(y) / sr
[3]:
12.27
[4]:
import matplotlib.pyplot as plt
import IPython.display as ipd
[5]:
ipd.Audio(y, rate = sr)
[5]:
[6]:
plot(y, sr)
_images/remove-silent-vad_11_0.png

If you look at the waveform or the mel spectrogram, you can see silent periods at the start, middle and end.

Use librosa.effects.trim

[7]:
# Keep only the first element returned by trim (the trimmed audio).
# Per the librosa docs, top_db is the threshold, in dB below the reference,
# under which audio is considered silence.
y_ = librosa.effects.trim(y, top_db = 20)[0]
[8]:
ipd.Audio(y_, rate = sr)
[8]:
[9]:
plot(y_, sr)
_images/remove-silent-vad_16_0.png

Looks good, but it missed the silence in the middle: `librosa.effects.trim` only trims leading and trailing silence.

Use pydub.silence.split_on_silence

[10]:
from pydub import AudioSegment
from pydub.silence import split_on_silence

Before converting the float np.array into an AudioSegment, we need to cast it to int.

[11]:
# Convert the float waveform to integers, then wrap the raw bytes in a
# mono pydub AudioSegment.
y_int = malaya_speech.astype.float_to_int(y)
audio = AudioSegment(
    y_int.tobytes(),
    frame_rate = sr,
    # bytes per sample, read from the integer dtype so it matches the data
    sample_width = y_int.dtype.itemsize,
    channels = 1
)
[12]:
# Split the audio wherever pydub detects silence; the result is a list of
# AudioSegment chunks (shown in the cell output).
audio_chunks = split_on_silence(
    audio,
    min_silence_len = 200,  # per pydub docs: minimum silence length, in ms
    silence_thresh = -30,  # per pydub docs: anything quieter than this (dBFS) is silence
    keep_silence = 100,  # per pydub docs: ms of silence kept at each chunk edge
)
audio_chunks
[12]:
[<pydub.audio_segment.AudioSegment at 0x14fb01810>,
 <pydub.audio_segment.AudioSegment at 0x14fb01950>,
 <pydub.audio_segment.AudioSegment at 0x14fb01990>,
 <pydub.audio_segment.AudioSegment at 0x14fb01dd0>,
 <pydub.audio_segment.AudioSegment at 0x14fb07490>]
[13]:
# Concatenate the chunks back into a single AudioSegment, then convert its
# integer samples back to a float array.
# NOTE(review): sum() of an empty list is 0, so this would fail if no chunk was found.
y_ = sum(audio_chunks)
y_ = np.array(y_.get_array_of_samples())
y_ = malaya_speech.astype.int_to_float(y_)
[14]:
ipd.Audio(y_, rate = sr)
[14]:
[15]:
plot(y_, sr)
_images/remove-silent-vad_25_0.png

Looks good, but again, the `silence_thresh` parameter is very important and has to be tuned for each recording.

Use WebRTC VAD

We can also split using VAD. The good thing about VAD is that we do not need to define a threshold; the result depends only on how good the VAD model is.

[16]:
vad = malaya_speech.vad.webrtc()
[17]:
# Resampled (16 kHz), integer copy of the audio; this is what the VAD reads.
y_= malaya_speech.resample(y, sr, 16000)
y_ = malaya_speech.astype.float_to_int(y_)
# 30 ms frames of the original float audio at its own sample rate ...
frames = malaya_speech.generator.frames(y, 30, sr)
# ... and 30 ms frames of the resampled integer audio.
frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
# Pair each original frame with the VAD result of the frame at the same index.
# NOTE(review): assumes `frames` supports indexing and has at least as many
# entries as `frames_` — confirm against malaya_speech.generator.frames.
frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]

Or we can use a pipeline:

[18]:
p = Pipeline()

# Left branch: 30 ms frames of the original (un-resampled) audio. Frame it at
# the audio's own sample rate `sr`, not a hard-coded 16000, so these frames
# stay aligned with the VAD frames when `sr` is not 16000.
pipeline_left = (
    p.map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = sr)
)

# Right branch: resample to 16 kHz, convert to integers, cut into 30 ms
# frames and run the WebRTC VAD on every frame.
pipeline_right = (
    p.map(malaya_speech.resample, old_samplerate = sr, new_samplerate = 16000)
    .map(malaya_speech.astype.float_to_int)
    .map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = 16000,
         append_ending_trail = False)
    .foreach_map(vad)
)

# Zip original frames with VAD results and pass the pairs to without_silent.
pipeline_left.foreach_zip(pipeline_right).map(malaya_speech.combine.without_silent)

p.visualize()
[18]:
_images/remove-silent-vad_31_0.png
[19]:
results = p(y)
results.keys()
[19]:
dict_keys(['frames', 'resample', 'float_to_int', 'vad', 'foreach_zip', 'without_silent'])
[20]:
y_ = malaya_speech.combine.without_silent(frames_webrtc)
y_
[20]:
array([-1.83578565e-03,  2.21343555e-03,  5.58967798e-03, ...,
        0.00000000e+00, -1.54295286e-05,  0.00000000e+00])
[21]:
ipd.Audio(y_, rate = sr)
[21]:
[22]:
plot(y_, sr)
_images/remove-silent-vad_35_0.png
[23]:
ipd.Audio(results['without_silent'], rate = sr)
[23]:
[24]:
# Plot the pipeline output that was just played, not the manual result `y_`
# (which an earlier cell already plotted).
plot(results['without_silent'], sr)
_images/remove-silent-vad_37_0.png

Load harder example

[25]:
# Load the harder sample and keep at most its first 15 seconds (sr * 15 samples).
y, sr = malaya_speech.load('speech/khutbah/wadi-annuar.wav')
y = y[: sr * 15]
len(y), sr
[25]:
(160000, 16000)
[26]:
ipd.Audio(y, rate = sr)
[26]:
[27]:
plot(y, sr)
_images/remove-silent-vad_41_0.png

Use pydub.silence.split_on_silence

[28]:
# Same pydub procedure as for the easy sample, with the same parameters:
# convert to integers, wrap in a mono AudioSegment, split on silence.
y_int = malaya_speech.astype.float_to_int(y)
audio = AudioSegment(
    y_int.tobytes(),
    frame_rate = sr,
    # bytes per sample, read from the integer dtype so it matches the data
    sample_width = y_int.dtype.itemsize,
    channels = 1
)
audio_chunks = split_on_silence(
    audio,
    min_silence_len = 200,  # per pydub docs: minimum silence length, in ms
    silence_thresh = -30,  # per pydub docs: anything quieter than this (dBFS) is silence
    keep_silence = 100,  # per pydub docs: ms of silence kept at each chunk edge
)
audio_chunks
[28]:
[<pydub.audio_segment.AudioSegment at 0x1528b3950>,
 <pydub.audio_segment.AudioSegment at 0x1528b3910>]
[29]:
# Concatenate the chunks, convert the integer samples back to floats and
# play the result.
y_ = sum(audio_chunks)
y_ = np.array(y_.get_array_of_samples())
y_ = malaya_speech.astype.int_to_float(y_)
ipd.Audio(y_, rate = sr)
[29]:
[30]:
plot(y_, sr)
_images/remove-silent-vad_45_0.png

Use WebRTC VAD

[31]:
results = p(y)
results.keys()
[31]:
dict_keys(['frames', 'resample', 'float_to_int', 'vad', 'foreach_zip', 'without_silent'])
[32]:
ipd.Audio(results['without_silent'], rate = sr)
[32]:
[33]:
# Plot the WebRTC VAD pipeline output; `y_` still holds the pydub result
# from the previous section.
plot(results['without_silent'], sr)
_images/remove-silent-vad_49_0.png

Use Deep learning VAD

[34]:
quantized_model = malaya_speech.vad.deep_model(model = 'vggvox-v2', quantized = True)
WARNING:root:Load quantized model will cause accuracy drop.
WARNING:tensorflow:From /Users/huseinzolkepli/Documents/malaya-speech/malaya_speech/utils/__init__.py:66: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/malaya-speech/malaya_speech/utils/__init__.py:66: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/malaya-speech/malaya_speech/utils/__init__.py:68: The name tf.GraphDef is deprecated. Please use tf.compat.v1.GraphDef instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/malaya-speech/malaya_speech/utils/__init__.py:68: The name tf.GraphDef is deprecated. Please use tf.compat.v1.GraphDef instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/malaya-speech/malaya_speech/utils/__init__.py:61: The name tf.InteractiveSession is deprecated. Please use tf.compat.v1.InteractiveSession instead.

WARNING:tensorflow:From /Users/huseinzolkepli/Documents/malaya-speech/malaya_speech/utils/__init__.py:61: The name tf.InteractiveSession is deprecated. Please use tf.compat.v1.InteractiveSession instead.

[35]:
p = Pipeline()

# Left branch: 30 ms frames of the input audio.
pipeline_left = (
    p.map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = 16000)
)

# Right branch: batch the frames (presumably 5 per batch), run the deep VAD
# model on each batch, then flatten the per-batch predictions.
pipeline_right = (
    pipeline_left.batching(5)
    .foreach_map(quantized_model.predict)
    .flatten()
)

# Zip frames with predictions and pass the pairs to without_silent.
# NOTE(review): the meaning of threshold_to_stop = 0.05 is not visible here —
# confirm against malaya_speech.combine.without_silent.
pipeline_left.foreach_zip(pipeline_right).map(malaya_speech.combine.without_silent,
                                             threshold_to_stop = 0.05)

p.visualize()
[35]:
_images/remove-silent-vad_52_0.png
[36]:
results = p(y)
results.keys()
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/librosa/core/spectrum.py:224: UserWarning: n_fft=512 is too small for input signal of length=480
  n_fft, y.shape[-1]
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/librosa/core/spectrum.py:224: UserWarning: n_fft=512 is too small for input signal of length=160
  n_fft, y.shape[-1]
[36]:
dict_keys(['frames', 'batching', 'predict', 'flatten', 'foreach_zip', 'without_silent'])
[37]:
ipd.Audio(results['without_silent'], rate = sr)
[37]:
[38]:
plot(results['without_silent'], sr)
_images/remove-silent-vad_55_0.png