Source code for malaya_speech.speaker_vector

from malaya_speech.supervised import classification
from malaya_speech.utils import describe_availability
from herpetologist import check_type
import logging
import warnings

# Module-level logger, named after this module; `_describe` writes its
# evaluation notes through it.
logger = logging.getLogger(__name__)

# EER calculation, https://github.com/huseinzol05/malaya-speech/tree/master/pretrained-model/speaker-embedding/calculate-EER
# EER tested on VoxCeleb2 test set.

# Metadata table for the models accepted by `deep_model`: keys are the
# model names that `deep_model` validates against (after lowercasing), and
# the whole table is handed to `describe_availability` by `available_model`.
# EER is the figure described in the comment above (VoxCeleb2 test set).
_availability = {
    'deep-speaker': {
        'Size (MB)': 96.7,
        'Quantized Size (MB)': 24.4,
        'Embedding Size': 512,
        'EER': 0.2187,
    },
    'vggvox-v1': {
        'Size (MB)': 70.8,
        'Quantized Size (MB)': 17.7,
        'Embedding Size': 1024,
        'EER': 0.13944,
    },
    'vggvox-v2': {
        'Size (MB)': 43.2,
        'Quantized Size (MB)': 7.92,
        'Embedding Size': 512,
        'EER': 0.0446,
    },
    'conformer-base': {
        'Size (MB)': 99.4,
        'Quantized Size (MB)': 27.2,
        'Embedding Size': 512,
        'EER': 0.06938,
    },
    'conformer-tiny': {
        'Size (MB)': 20.3,
        'Quantized Size (MB)': 6.21,
        'Embedding Size': 512,
        'EER': 0.08687,
    },
}


# Reference figures for the TRILLsson models published on TF-Hub (see each
# entry's 'url'). Not referenced by any function visible in this module.
# NOTE(review): presumably kept only as a comparison baseline - confirm.
trillsson_accuracy = {
    'trillsson-1': {
        'url': 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trillsson1/1',
        'EER': 0.3804599,
    },
    'trillsson-2': {
        'url': 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trillsson2/1',
        'EER': 0.3898799,
    }
}

# Reference figures for the d-vector model credited under 'original from'.
# Not referenced by any function visible in this module.
# NOTE(review): presumably kept only as a comparison baseline - confirm.
dvector_accuracy = {
    'original from': 'https://github.com/yistLin/dvector',
    'Size (MB)': 5.45,
    'Embedding Size': 256,
    'EER': 0.1356490598298,
}

# Metadata table for the models accepted by `nemo`: keys are the exact
# (case-sensitive) model names that `nemo` validates against, and the whole
# table is handed to `describe_availability` by `available_nemo`.
# 'original from' links the upstream NVIDIA NGC catalog entry.
_nemo_availability = {
    'huseinzol05/nemo-ecapa-tdnn': {
        'Size (MB)': 96.8,
        'Embedding Size': 192,
        'EER': 0.0249200000000007,
        'original from': 'https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/ecapa_tdnn',
    },
    'huseinzol05/nemo-speakernet': {
        'Size (MB)': 23.6,
        'Embedding Size': 192,
        'EER': 0.0427898305,
        'original from': 'https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/speakerverification_speakernet',
    },
    'huseinzol05/nemo-titanet_large': {
        'Size (MB)': 101.6,
        'Embedding Size': 192,
        'EER': 0.02277999999996,
        'original from': 'https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large',
    }
}

# Metadata table for the models accepted by `huggingface`: keys are the
# exact model names checked by `huggingface` when `force_check` is true, and
# the whole table is handed to `describe_availability` by
# `available_huggingface`.
_huggingface_availability = {
    'microsoft/wavlm-base-sv': {
        'Size (MB)': 405,
        'Embedding Size': 512,
        'EER': 0.07827375115,
    },
    'microsoft/wavlm-base-plus-sv': {
        'Size (MB)': 405,
        'Embedding Size': 512,
        'EER': 0.06688427572,
    },
    'microsoft/unispeech-sat-large-sv': {
        'Size (MB)': 1290,
        'Embedding Size': 512,
        'EER': 0.2032767553,
    },
    'microsoft/unispeech-sat-base-sv': {
        'Size (MB)': 404,
        'Embedding Size': 512,
        'EER': 0.0782815656,
    },
    'microsoft/unispeech-sat-base-plus-sv': {
        'Size (MB)': 404,
        'Embedding Size': 512,
        'EER': 0.0761281698,
    },
}


def _describe():
    """
    Log the evaluation notes shared by the `available_*` listing functions.
    """
    notes = (
        'tested on VoxCeleb2 test set. Lower EER is better.',
        'download the test set at https://github.com/huseinzol05/malaya-speech/tree/master/data/voxceleb',
    )
    # Emit one INFO record per note, in the order listed above.
    for note in notes:
        logger.info(note)


def available_model():
    """
    List available speaker vector deep models using Tensorflow.

    Logs the evaluation notes first, then returns whatever
    `describe_availability` builds from the Tensorflow model table.
    """
    _describe()
    listing = describe_availability(_availability)
    return listing
def available_nemo():
    """
    List available Nvidia Nemo Speaker vector models.

    Logs the evaluation notes first, then returns whatever
    `describe_availability` builds from the Nemo model table.
    """
    _describe()
    listing = describe_availability(_nemo_availability)
    return listing
def available_huggingface():
    """
    List available HuggingFace Speaker vector models.

    Logs the evaluation notes first, then returns whatever
    `describe_availability` builds from the HuggingFace model table.
    """
    _describe()
    listing = describe_availability(_huggingface_availability)
    return listing
@check_type
def deep_model(model: str = 'vggvox-v2', quantized: bool = False, **kwargs):
    """
    Load Speaker2Vec model.

    Parameters
    ----------
    model : str, optional (default='vggvox-v2')
        Check available models at `malaya_speech.speaker_vector.available_model()`.
        The name is lowercased before it is validated.
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model is not necessarily faster, it totally depends on the machine.
    **kwargs
        Forwarded unchanged to `malaya_speech.supervised.classification.load`.

    Returns
    -------
    result : output of `malaya_speech.supervised.classification.load`.

    Raises
    ------
    ValueError
        If the lowercased `model` is not one of the supported model names.
    """
    # Model names in the table are all lowercase, so normalise the input
    # before the membership check.
    model = model.lower()
    if model not in _availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya_speech.speaker_vector.available_model()`.'
        )

    return classification.load(
        model=model,
        module='speaker-vector',
        extra={},
        label=None,
        quantized=quantized,
        **kwargs
    )
@check_type
def nemo(
    model: str = 'huseinzol05/nemo-ecapa-tdnn',
    **kwargs,
):
    """
    Load Nemo Speaker verification model.

    Parameters
    ----------
    model : str, optional (default='huseinzol05/nemo-ecapa-tdnn')
        Check available models at `malaya_speech.speaker_vector.available_nemo()`.
        The name is matched exactly as given.
    **kwargs
        Forwarded unchanged to `classification.nemo_speaker_vector`.

    Returns
    -------
    result : malaya_speech.torch_model.nemo.SpeakerVector class
    """
    is_supported = model in _nemo_availability
    if not is_supported:
        raise ValueError(
            'model not supported, please check supported models from `malaya_speech.speaker_vector.available_nemo()`.'
        )

    return classification.nemo_speaker_vector(model=model, **kwargs)
@check_type
def huggingface(
    model: str = 'microsoft/wavlm-base-plus-sv',
    force_check: bool = True,
    **kwargs,
):
    """
    Load Finetuned models from HuggingFace.

    Parameters
    ----------
    model : str, optional (default='microsoft/wavlm-base-plus-sv')
        Check available models at `malaya_speech.speaker_vector.available_huggingface()`.
    force_check: bool, optional (default=True)
        Force check model one of malaya model.
        Set to False if you have your own huggingface model.
    **kwargs
        Forwarded unchanged to `classification.huggingface_xvector`.

    Returns
    -------
    result : malaya_speech.torch_model.huggingface.XVector class
    """
    # The membership test runs first; an unknown name is only rejected
    # when the caller has not opted out through `force_check`.
    is_unknown = model not in _huggingface_availability
    if is_unknown and force_check:
        raise ValueError(
            'model not supported, please check supported models from `malaya_speech.speaker_vector.available_huggingface()`.'
        )

    return classification.huggingface_xvector(model=model, **kwargs)