# Source code for malaya_speech.noise_reduction

from malaya_speech.supervised import unet
from malaya_speech.utils.astype import int_to_float
from herpetologist import check_type
from malaya_speech.utils import describe_availability
import librosa
import numpy as np
import logging

# Module-level logger; `available_model` uses it to emit the note about how
# the separation metrics were computed.
logger = logging.getLogger(__name__)

# https://github.com/sigsep/sigsep-mus-eval/blob/master/museval/__init__.py#L364
# Only calculate SDR, ISR, SAR on voice sample

# Per-model metadata table. The keys are the model names that `deep_model`
# accepts, and the whole mapping is handed to `describe_availability` by
# `available_model`. SDR / ISR / SAR are computed on the voice sample only
# (higher is better). The MAE entries are presumably mean absolute errors
# where lower is better -- TODO confirm against the training reports.
_availability = {
    'unet': {
        'Size (MB)': 78.9,
        'Quantized Size (MB)': 20,
        'SUM MAE': 0.862316,
        'MAE_SPEAKER': 0.460676,
        'MAE_NOISE': 0.401640,
        'SDR': 9.17312,
        'ISR': 13.92435,
        'SAR': 13.20592,
    },
    'resnet-unet': {
        'Size (MB)': 96.4,
        'Quantized Size (MB)': 24.6,
        'SUM MAE': 0.82535,
        'MAE_SPEAKER': 0.43885,
        'MAE_NOISE': 0.38649,
        'SDR': 9.45413,
        'ISR': 13.9639,
        'SAR': 13.60276,
    },
    'resnext-unet': {
        'Size (MB)': 75.4,
        'Quantized Size (MB)': 19,
        'SUM MAE': 0.81102,
        'MAE_SPEAKER': 0.44719,
        'MAE_NOISE': 0.363830,
        'SDR': 8.992832,
        'ISR': 13.49194,
        'SAR': 13.13210,
    },
}


def available_model():
    """
    List available Noise Reduction deep learning models.

    Returns
    -------
    result : the value produced by `describe_availability` for the
        module-level `_availability` table.
    """

    # The separation metrics in the table are voice-only; remind the caller
    # how to read them before handing the table back.
    logger.info('Only calculate SDR, ISR, SAR on voice sample. Higher is better.')

    return describe_availability(_availability)


@check_type
def deep_model(model: str = 'resnet-unet', quantized: bool = False, **kwargs):
    """
    Load Noise Reduction deep learning model.

    Parameters
    ----------
    model : str, optional (default='resnet-unet')
        Check available models at `malaya_speech.noise_reduction.available_model()`.
        The name is matched case-insensitively.
    quantized : bool, optional (default=False)
        If True, will load the 8-bit quantized model.
        A quantized model is not necessarily faster; it depends entirely on the machine.
    **kwargs
        Forwarded unchanged to `unet.load_stft`.

    Returns
    -------
    result : malaya_speech.model.tf.UNET_STFT class

    Raises
    ------
    ValueError
        If `model` is not one of the supported model names.
    """

    # Normalise the name so that e.g. 'ResNet-UNet' is accepted.
    model = model.lower()
    if model not in _availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya_speech.noise_reduction.available_model()`.'
        )

    return unet.load_stft(
        model=model,
        module='noise-reduction',
        instruments=['voice', 'noise'],
        quantized=quantized,
        **kwargs
    )


# https://github.com/dodiku/noise_reduction/blob/master/noise.py
def reduce_noise_power(y, sr=16000):
    """
    Attenuate the low and high ends of a signal with shelf filters whose
    corner frequencies are derived from the median spectral centroid.

    Parameters
    ----------
    y : audio samples; passed through `int_to_float` before processing.
    sr : int, optional (default=16000)
        Sample rate handed to `librosa.feature.spectral_centroid`.

    Returns
    -------
    result : the output of the pysndfx effects chain applied to `y`.

    Raises
    ------
    ModuleNotFoundError
        If `pysndfx` cannot be imported.
    """
    # Guard the optional dependency the same way the sibling reducers do, so
    # a missing package yields an actionable message.
    try:
        from pysndfx import AudioEffectsChain
    except ImportError as e:
        raise ModuleNotFoundError(
            'pysndfx not installed. Please install it by `pip install pysndfx` and try again.'
        ) from e

    y = int_to_float(y)
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)

    # Both thresholds scale the same rounded median, so compute it once.
    median_centroid = round(np.median(cent))
    threshold_h = median_centroid * 1.5
    threshold_l = median_centroid * 0.1

    less_noise = (
        AudioEffectsChain()
        .lowshelf(gain=-30.0, frequency=threshold_l, slope=0.8)
        .highshelf(gain=-12.0, frequency=threshold_h, slope=0.5)
    )

    return less_noise(y)


# https://github.com/dodiku/noise_reduction/blob/master/noise.py
def reduce_noise_centroid_s(y, sr=16000):
    """
    Apply a -12 dB low shelf at the minimum spectral centroid, a -12 dB high
    shelf at the maximum spectral centroid, and a 6 dB limiter.

    Parameters
    ----------
    y : audio samples; passed through `int_to_float` before processing.
    sr : int, optional (default=16000)
        Sample rate handed to `librosa.feature.spectral_centroid`.

    Returns
    -------
    result : the output of the pysndfx effects chain applied to `y`.

    Raises
    ------
    ModuleNotFoundError
        If importing `pysndfx` fails.
    """
    try:
        from pysndfx import AudioEffectsChain
    except Exception:
        raise ModuleNotFoundError(
            'pysndfx not installed. Please install it by `pip install pysndfx` and try again.'
        )

    signal = int_to_float(y)
    centroids = librosa.feature.spectral_centroid(y=signal, sr=sr)

    # The extremes of the centroid track become the shelf corner frequencies.
    lowest = np.min(centroids)
    highest = np.max(centroids)

    chain = AudioEffectsChain()
    chain = chain.lowshelf(gain=-12.0, frequency=lowest, slope=0.5)
    chain = chain.highshelf(gain=-12.0, frequency=highest, slope=0.5)
    chain = chain.limiter(gain=6.0)

    return chain(signal)


# https://github.com/dodiku/noise_reduction/blob/master/noise.py
def reduce_noise_centroid_mb(y, sr=16000):
    """
    Cut both ends of the spectrum by 30 dB around the spectral-centroid
    extremes, limit the result, then apply a 16 dB low-shelf boost.

    Parameters
    ----------
    y : audio samples; passed through `int_to_float` before processing.
    sr : int, optional (default=16000)
        Sample rate handed to `librosa.feature.spectral_centroid`.

    Returns
    -------
    result : the output of the second pysndfx effects chain (the bass boost)
        applied to the cleaned signal.

    Raises
    ------
    ModuleNotFoundError
        If importing `pysndfx` fails.
    """
    # `math` is not imported at module level in this file, so bring it into
    # scope here; without it the `math.floor` call below raises NameError.
    import math

    try:
        from pysndfx import AudioEffectsChain
    except Exception as e:
        raise ModuleNotFoundError(
            'pysndfx not installed. Please install it by `pip install pysndfx` and try again.'
        ) from e

    y = int_to_float(y)
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)

    threshold_h = np.max(cent)
    threshold_l = np.min(cent)

    less_noise = (
        AudioEffectsChain()
        .lowshelf(gain=-30.0, frequency=threshold_l, slope=0.5)
        .highshelf(gain=-30.0, frequency=threshold_h, slope=0.5)
        .limiter(gain=10.0)
    )
    y_cleaned = less_noise(y)

    cent_cleaned = librosa.feature.spectral_centroid(y=y_cleaned, sr=sr)
    # NOTE(review): the boost corner frequency is two thirds of the second
    # dimension of the centroid array, not a measured frequency. Kept as-is
    # to preserve the existing output.
    _, n_frames = cent_cleaned.shape
    boost_h = math.floor(n_frames / 3 * 2)

    boost_bass = AudioEffectsChain().lowshelf(
        gain=16.0, frequency=boost_h, slope=0.5
    )

    return boost_bass(y_cleaned)


# https://github.com/dodiku/noise_reduction/blob/master/noise.py
def reduce_noise_mfcc_down(y, sr=16000):
    """
    Apply a -12 dB high shelf and an 8 dB limiter, with the shelf frequency
    derived from the highest-energy frame of the liftered log filterbank
    features.

    Parameters
    ----------
    y : audio samples; passed through `int_to_float` before processing.
    sr : int, optional (default=16000)
        Accepted for signature parity with the other reducers. It is not
        used: `logfbank` is called with its own default sample rate.

    Returns
    -------
    result : the output of the pysndfx effects chain applied to `y`.

    Raises
    ------
    ModuleNotFoundError
        If importing `pysndfx` or `python_speech_features` fails.
    """
    try:
        from pysndfx import AudioEffectsChain
        import python_speech_features
    except Exception as e:
        raise ModuleNotFoundError(
            'pysndfx, python_speech_features not installed. Please install it by `pip install pysndfx python_speech_features` and try again.'
        ) from e

    y = int_to_float(y)

    features = python_speech_features.base.logfbank(y)
    features = python_speech_features.base.lifter(features)

    # Energy of each frame = sum of squared coefficients; argmax returns the
    # first frame holding the maximum, matching `list.index(max(...))`.
    frame_energy = np.sum(np.square(features), axis=1)
    strongest_frame = int(np.argmax(frame_energy))
    hz = python_speech_features.base.mel2hz(features[strongest_frame])

    min_hz = np.min(hz)

    speech_booster = (
        AudioEffectsChain()
        # The sign flip and 1.2 scale on `min_hz` are kept from the original.
        .highshelf(frequency=min_hz * (-1) * 1.2, gain=-12.0, slope=0.6)
        .limiter(gain=8.0)
    )

    return speech_booster(y)


# https://github.com/dodiku/noise_reduction/blob/master/noise.py
def reduce_noise_mfcc_up(y, sr=16000):
    """
    Apply a +12 dB low shelf, with the shelf frequency derived from the
    highest-energy frame of the liftered log filterbank features.

    Parameters
    ----------
    y : audio samples; passed through `int_to_float` before processing.
    sr : int, optional (default=16000)
        Accepted for signature parity with the other reducers. It is not
        used: `logfbank` is called with its own default sample rate.

    Returns
    -------
    result : the output of the pysndfx effects chain applied to `y`.

    Raises
    ------
    ModuleNotFoundError
        If importing `pysndfx` or `python_speech_features` fails.
    """
    try:
        from pysndfx import AudioEffectsChain
        import python_speech_features
    except Exception as e:
        raise ModuleNotFoundError(
            'pysndfx, python_speech_features not installed. Please install it by `pip install pysndfx python_speech_features` and try again.'
        ) from e

    y = int_to_float(y)

    features = python_speech_features.base.logfbank(y)
    features = python_speech_features.base.lifter(features)

    # Energy of each frame = sum of squared coefficients; argmax returns the
    # first frame holding the maximum, matching `list.index(max(...))`.
    frame_energy = np.sum(np.square(features), axis=1)
    strongest_frame = int(np.argmax(frame_energy))
    hz = python_speech_features.base.mel2hz(features[strongest_frame])

    min_hz = np.min(hz)

    # The sign flip on `min_hz` is kept from the original.
    speech_booster = AudioEffectsChain().lowshelf(
        frequency=min_hz * (-1), gain=12.0, slope=0.5
    )

    return speech_booster(y)


def trim_silence(
    y,
    top_db=20,
    frame_length=2,
    hop_length=500,
    return_trimmed_length=False,
    sr=22050,
):
    """
    Trim leading and trailing silence from a signal with `librosa.effects.trim`.

    Parameters
    ----------
    y : audio samples; passed through `int_to_float` before processing.
    top_db : int, optional (default=20)
        Forwarded to `librosa.effects.trim`.
    frame_length : int, optional (default=2)
        Forwarded to `librosa.effects.trim`.
    hop_length : int, optional (default=500)
        Forwarded to `librosa.effects.trim`.
    return_trimmed_length : bool, optional (default=False)
        If True, also return how much duration was removed.
    sr : int, optional (default=22050)
        Sample rate used only to convert the removed samples to a duration.
        The default is librosa's own default, which is what the previous
        implementation used implicitly. Pass the real sample rate of `y` to
        get the trimmed length in true seconds.

    Returns
    -------
    result : the trimmed signal, or a tuple `(trimmed signal, trimmed_length)`
        when `return_trimmed_length` is True.
    """
    y = int_to_float(y)
    # The (start, end) interval returned alongside the audio is not needed.
    y_trimmed, _ = librosa.effects.trim(
        y, top_db=top_db, frame_length=frame_length, hop_length=hop_length
    )

    if not return_trimmed_length:
        return y_trimmed

    # Pass `y` by keyword: recent librosa releases reject positional audio
    # arguments to `get_duration`.
    trimmed_length = librosa.get_duration(y=y, sr=sr) - librosa.get_duration(
        y=y_trimmed, sr=sr
    )
    return y_trimmed, trimmed_length