Super Resolution audio diffusion#

This tutorial is available as an IPython notebook at malaya-speech/example/super-resolution-audio-diffusion.

This module is language independent, so it is safe to use on different languages. The pretrained models were trained on multiple languages.

This is an application of malaya-speech Pipeline, read more about malaya-speech Pipeline at malaya-speech/example/pipeline.

Requires PyTorch >= 1.10.

The purpose of this module is to increase the sample rate of an audio signal.

[1]:
import malaya_speech
import numpy as np
from malaya_speech import Pipeline
import IPython.display as ipd
[2]:
import logging

logging.basicConfig(level=logging.INFO)

List available audio diffusion deep models#

[3]:
malaya_speech.super_resolution.available_audio_diffusion()
INFO:malaya_speech.super_resolution:Only calculate SDR, ISR, SAR on voice sample. Higher is better.
[3]:
         Size (MB)
nuwave2       20.9

Load audio diffusion deep model#

def audio_diffusion(model: str = 'nuwave2', **kwargs):
    """
    Load audio diffusion based Speech Resolution.

    Parameters
    ----------
    model : str, optional (default='nuwave2')
        Model architecture supported. Allowed values:

        * ``'nuwave2'`` - originally from https://github.com/mindslab-ai/nuwave2.

    Returns
    -------
    result : malaya_speech.torch_model.super_resolution.NuWave2
    """
[4]:
model = malaya_speech.super_resolution.audio_diffusion(model = 'nuwave2')
INFO:malaya_boilerplate.huggingface:downloading frozen huseinzol05/nuwave2/model.pth

Important factor#

  1. Currently the audio diffusion model is only able to upsample up to a 48000 Hz sampling rate.
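Whatever the input rate, the result always comes back at 48000 Hz, so the number of output samples scales by 48000 / input_sr, assuming the model preserves duration (which super resolution is expected to do). A quick illustrative check:

input_sr = 8000
input_samples = 2 * input_sr  # 2 seconds of 8 kHz audio = 16000 samples
expected_output_samples = input_samples * 48000 // input_sr  # 96000 samples at 48 kHz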

Predict#

def predict(self, input, sr: int, steps: int = 8):
    """
    Parameters
    ----------
    input: np.array
        np.array or malaya_speech.model.frame.Frame,
        prefer 8000, 12000, 16000, 22050 or 44000 sampling rate.
    sr: int
        sampling rate, prefer 8000, 12000, 16000, 22050 or 44000 sampling rate.
    steps: int, optional (default=8)
        diffusion steps.

    Returns
    -------
    result: np.array with 48k sampling rate
    """
[5]:
import librosa
import matplotlib.pyplot as plt
[6]:
sampling_rate = 48000  # build the mel filterbank at the model's 48 kHz output rate
fft_size = 2048
hop_size = 256
win_length = None
window = 'hann'
num_mels = 128
fmin = 0
fmax = None

mel_basis = librosa.filters.mel(
    sr=sampling_rate,
    n_fft=fft_size,
    n_mels=num_mels,
    fmin=fmin,
    fmax=fmax,
)
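The spectrogram cells below all repeat the same STFT-to-log-mel steps. A small helper like this (not part of malaya-speech, just a convenience wrapping the exact code used in this tutorial) would avoid the duplication; the expanded cells are kept below for completeness:

def plot_mel(audio, title):
    # STFT magnitude -> mel filterbank -> log scale, then plot
    D = librosa.stft(
        audio,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window,
        pad_mode='reflect',
    )
    S, _ = librosa.magphase(D)
    mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T
    fig = plt.figure(figsize=(10, 8))
    ax1 = fig.add_subplot(311)
    ax1.set_title(title)
    im = ax1.imshow(np.rot90(mel), aspect='auto', interpolation='none')
    fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
    plt.show()

With it, each plot reduces to a single call, e.g. plot_mel(output, 'output Mel-Spectrogram').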
[7]:
y, sr_ = malaya_speech.load('speech/call-centre/1.wav', sr = None)  # sr = None keeps the native sampling rate
y = y[sr_ * 5: sr_ * 7]  # 2-second slice, from 5s to 7s
sr_
[7]:
8000
[8]:
ipd.Audio(y, rate = sr_)
[8]:
[9]:
%%time

output = model(y, sr = sr_)
ipd.Audio(output, rate = 48000)
/Users/huseinzolkepli/Documents/malaya-speech/malaya_speech/train/model/nuwave2_torch/model.py:108: UserWarning: istft will require a complex-valued input tensor in a future PyTorch release. Matching the output from stft with return_complex=True.  (Triggered internally at  /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/SpectralOps.cpp:979.)
  center=True, normalized=True, onesided=True)
CPU times: user 2min 21s, sys: 34.5 s, total: 2min 56s
Wall time: 46.1 s
[9]:
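If you want to keep the upsampled audio, write it out at 48 kHz. A minimal sketch using the soundfile package (an extra dependency not used elsewhere in this tutorial; the filename is arbitrary):

import soundfile as sf

sf.write('1-super-resolution-48k.wav', output, 48000)  # hypothetical output path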
[11]:
D = librosa.stft(
    output,
    n_fft=fft_size,
    hop_length=hop_size,
    win_length=win_length,
    window=window,
    pad_mode='reflect',
)
S, _ = librosa.magphase(D)
mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T
fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(311)
ax1.set_title('output Mel-Spectrogram')
im = ax1.imshow(np.rot90(mel), aspect='auto', interpolation='none')
fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
plt.show()
_images/load-super-resolution-audio-diffusion_19_0.png
[14]:
D = librosa.stft(
    malaya_speech.resample(y, sr_, 48000),
    n_fft=fft_size,
    hop_length=hop_size,
    win_length=win_length,
    window=window,
    pad_mode='reflect',
)
S, _ = librosa.magphase(D)
mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T
fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(311)
ax1.set_title('interpolate Mel-Spectrogram')
im = ax1.imshow(np.rot90(mel), aspect='auto', interpolation='none')
fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
plt.show()
_images/load-super-resolution-audio-diffusion_20_0.png

Use Pipeline#

In case your audio is too long and you do not want to burden your machine, you can use the malaya-speech Pipeline to split the audio into short frames (1.2 seconds in the example below), predict each frame one by one, and concatenate the results afterwards.
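For orientation, the pipeline below is roughly equivalent to this plain loop, reusing the y and sr_ loaded earlier (a sketch of the data flow, not of how Pipeline is implemented internally):

frames = malaya_speech.generator.frames(
    y, frame_duration_ms = 1200, sample_rate = sr_, append_ending_trail = False
)
chunks = [model.predict(frame, sr = sr_) for frame in frames]
combined = np.concatenate(chunks)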

[39]:
p = Pipeline()
pipeline = (
    p.map(malaya_speech.load, sr = sr_)
    .map(lambda x: x[0])  # malaya_speech.load returns (audio, sr); keep the audio
    .map(lambda x: x[sr_ * 5: sr_ * 10], name = 'cut')  # 5-second slice, from 5s to 10s
    .map(malaya_speech.generator.frames, frame_duration_ms = 1200, sample_rate = sr_, append_ending_trail = False)
    .foreach_map(model.predict, sr = sr_)  # upsample each ~1.2s frame independently
    .map(np.concatenate)  # stitch the upsampled frames back together
)
p.visualize()
p.visualize()
[39]:
_images/load-super-resolution-audio-diffusion_23_0.png
[40]:
%%time

results = p('speech/call-centre/1.wav')
CPU times: user 6min 16s, sys: 1min 34s, total: 7min 50s
Wall time: 2min 6s
[41]:
ipd.Audio(results['concatenate'], rate = 48000)
[41]:
[42]:
resampled = malaya_speech.resample(results['cut'], sr_, 48000)
[44]:
ipd.Audio(resampled, rate = 48000)
[44]:
[45]:
D = librosa.stft(
    results['concatenate'],
    n_fft=fft_size,
    hop_length=hop_size,
    win_length=win_length,
    window=window,
    pad_mode='reflect',
)
S, _ = librosa.magphase(D)
mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T
fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(311)
ax1.set_title('output Mel-Spectrogram')
im = ax1.imshow(np.rot90(mel), aspect='auto', interpolation='none')
fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
plt.show()
_images/load-super-resolution-audio-diffusion_28_0.png
[46]:
D = librosa.stft(
    resampled,
    n_fft=fft_size,
    hop_length=hop_size,
    win_length=win_length,
    window=window,
    pad_mode='reflect',
)
S, _ = librosa.magphase(D)
mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T
fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(311)
ax1.set_title('interpolate Mel-Spectrogram')
im = ax1.imshow(np.rot90(mel), aspect='auto', interpolation='none')
fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
plt.show()
_images/load-super-resolution-audio-diffusion_29_0.png