Source code for malaya_speech.vad

from malaya_speech.model.webrtc import WebRTC
from malaya_speech.supervised import classification
from malaya_speech.utils import describe_availability
from herpetologist import check_type

_availability = {
    'vggvox-v1': {
        'Size (MB)': 70.8,
        'Quantized Size (MB)': 17.7,
        'Accuracy': 0.80984375,
    },
    'vggvox-v2': {
        'Size (MB)': 31.1,
        'Quantized Size (MB)': 7.92,
        'Accuracy': 0.8196875,
    },
    'speakernet': {
        'Size (MB)': 20.3,
        'Quantized Size (MB)': 5.18,
        'Accuracy': 0.7340625,
    },
    'marblenet-factor1': {
        'Size (MB)': 0.526,
        'Quantized Size (MB)': 0.232,
        'Accuracy': 0.8491875,
    },
    'marblenet-factor3': {
        'Size (MB)': 3.21,
        'Quantized Size (MB)': 0.934,
        'Accuracy': 0.83855625,
    },
    'marblenet-factor5': {
        'Size (MB)': 8.38,
        'Quantized Size (MB)': 2.21,
        'Accuracy': 0.843540625,
    }
}


[docs]def available_model():
    """
    List available VAD deep models.
    """
    return describe_availability(_availability)
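
# Usage sketch (illustrative only, not part of the module source): listing the
# models defined in `_availability` above through the public API. It assumes the
# package root is importable as `malaya_speech`; `describe_availability` renders
# the dictionary as a tabular summary.
#
#     >>> import malaya_speech
#     >>> malaya_speech.vad.available_model()
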
[docs]@check_type
def webrtc(
    aggressiveness: int = 3,
    sample_rate: int = 16000,
    minimum_amplitude: int = 100,
):
    """
    Load WebRTC VAD model.

    Parameters
    ----------
    aggressiveness: int, optional (default=3)
        an integer between 0 and 3. 0 is the least aggressive about filtering out
        non-speech, 3 is the most aggressive.
    sample_rate: int, optional (default=16000)
        sample rate for samples.
    minimum_amplitude: int, optional (default=100)
        minimum absolute amplitude required to treat a sample as voice activity;
        samples below this threshold are automatically labelled False.

    Returns
    -------
    result : malaya_speech.model.webrtc.WebRTC class
    """
    try:
        import webrtcvad
    except BaseException:
        raise ModuleNotFoundError(
            'webrtcvad not installed. Please install it by `pip install webrtcvad` and try again.'
        )

    vad = webrtcvad.Vad(aggressiveness)
    return WebRTC(vad, sample_rate, minimum_amplitude)
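
# Usage sketch (illustrative only): loading the rule-based WebRTC VAD. Only
# `webrtc` and its parameters come from this module; note that the underlying
# `webrtcvad` package expects 16-bit mono PCM at 8000/16000/32000/48000 Hz in
# 10/20/30 ms frames. How the returned `malaya_speech.model.webrtc.WebRTC`
# object is applied to frames depends on that class, so treat the comment
# below as an assumption.
#
#     >>> import malaya_speech
#     >>> vad = malaya_speech.vad.webrtc(aggressiveness=3, sample_rate=16000)
#     >>> # the returned WebRTC instance can then be applied to audio frames
#     >>> # to get True/False voice-activity decisions per frame.
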
[docs]@check_type
def deep_model(model: str = 'marblenet-factor1', quantized: bool = False, **kwargs):
    """
    Load VAD deep model.

    Parameters
    ----------
    model : str, optional (default='marblenet-factor1')
        Check available models at `malaya_speech.vad.available_model()`.
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        A quantized model is not necessarily faster; it totally depends on the machine.

    Returns
    -------
    result : malaya_speech.supervised.classification.load function
    """
    model = model.lower()
    if model not in _availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya_speech.vad.available_model()`.'
        )

    settings = {
        'vggvox-v1': {'frame_len': 0.005, 'frame_step': 0.0005},
        'vggvox-v2': {'hop_length': 24, 'concat': False, 'mode': 'eval'},
        'speakernet': {'frame_ms': 20, 'stride_ms': 1.0},
        'marblenet-factor1': {'feature_type': 'mfcc'},
        'marblenet-factor3': {'feature_type': 'mfcc'},
        'marblenet-factor5': {'feature_type': 'mfcc'},
    }

    return classification.load(
        model=model,
        module='vad',
        extra=settings[model],
        label=[False, True],
        quantized=quantized,
        **kwargs
    )
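
# Usage sketch (illustrative only): loading a deep VAD model. Only `deep_model`
# and its parameters come from this module; the shape of the returned object is
# determined by `malaya_speech.supervised.classification.load`, so the
# prediction step mentioned below is an assumption, not a documented API.
#
#     >>> import malaya_speech
#     >>> model = malaya_speech.vad.deep_model(model='marblenet-factor1', quantized=False)
#     >>> # the loaded classification model labels frames as [False, True],
#     >>> # i.e. non-speech vs speech, via its prediction methods.
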