Source code for malaya_speech.speaker_change

from malaya_speech.supervised import classification
from malaya_speech.model.frame import Frame
from malaya_speech.utils import describe_availability
from herpetologist import check_type
import logging

logger = logging.getLogger(__name__)

# Registry of the speaker change models this module can load, keyed by model
# name. `deep_model` rejects any name that is not a key here, and
# `available_model` hands this table to `describe_availability`.
# Sizes are in megabytes, as the key names state. 'Accuracy' is, per the
# message logged by `available_model`, the last accuracy of the training
# session before early stopping.
_availability = {
    'vggvox-v2': {
        'Size (MB)': 31.1,
        'Quantized Size (MB)': 7.92,
        'Accuracy': 0.63979,
    },
    'speakernet': {
        'Size (MB)': 20.3,
        'Quantized Size (MB)': 5.18,
        'Accuracy': 0.64524,
    },
}


def available_model():
    """
    List available speaker change deep models.

    Logs a note explaining how the reported accuracy was measured, then
    describes the module-level `_availability` table.

    Returns
    -------
    result : output of `malaya_speech.utils.describe_availability` for `_availability`.
    """
    # Tell the caller what the 'Accuracy' column means before showing it.
    logger.info('last accuracy during training session before early stopping.')
    summary = describe_availability(_availability)
    return summary
@check_type
def deep_model(model: str = 'vggvox-v2', quantized: bool = False, **kwargs):
    """
    Load speaker change deep model.

    Parameters
    ----------
    model : str, optional (default='vggvox-v2')
        Check available models at `malaya_speech.speaker_change.available_model()`.
        The name is matched case-insensitively.
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model not necessary faster, totally depends on the machine.
    **kwargs
        Forwarded unchanged to `malaya_speech.supervised.classification.load`.

    Returns
    -------
    result : malaya_speech.supervised.classification.load function

    Raises
    ------
    ValueError
        If `model` is not one of the supported model names.
    """
    name = model.lower()
    supported = name in _availability
    if not supported:
        raise ValueError(
            'model not supported, please check supported models from `malaya_speech.speaker_change.available_model()`.'
        )

    # Per-model extra settings, passed to the loader through `extra`.
    extra_by_model = {
        'vggvox-v2': {'hop_length': 50, 'concat': False, 'mode': 'eval'},
        'speakernet': {'frame_ms': 20, 'stride_ms': 2},
    }
    extra = extra_by_model[name]

    return classification.load(
        model=name,
        module='speaker-change',
        extra=extra,
        label=[False, True],
        quantized=quantized,
        **kwargs
    )
def split_activities(
    vad_results,
    speaker_change_results,
    speaker_change_threshold: float = 0.5,
    sr: int = 16000,
    ignore_not_activity=True,
):
    """
    Split VAD results at detected speaker changes, worst-case O(N^2).

    Each VAD result is compared against every speaker change result. A change
    whose midpoint (`timestamp + duration / 2`) falls inside the VAD frame and
    whose score reaches `speaker_change_threshold` becomes a cut point, and
    the VAD frame is sliced into consecutive frames at those cut points.

    Parameters
    ----------
    vad_results: List[Tuple[Frame, label]]
        results from VAD.
    speaker_change_results: List[Tuple[Frame, float]]
        results from speaker change module, must in float result.
        It is iterated once per VAD result, so it must be re-iterable
        (a list, not a one-shot generator).
    speaker_change_threshold: float, optional (default=0.5)
        in one voice activity sample can be more than one speaker, split it using this threshold.
    sr: int, optional (default=16000)
        sample rate, classification model in malaya-speech use 16k.
    ignore_not_activity: bool, optional (default=True)
        If True, will ignore if result VAD is False, else will try to split.

    Returns
    -------
    result : List[Tuple[Frame, label]]

    Raises
    ------
    ValueError
        If `speaker_change_threshold` is not in the range (0, 1].
    """
    if not 0 < speaker_change_threshold <= 1.0:
        raise ValueError(
            'speaker_change_threshold must, 0 < speaker_change_threshold <= 1.0'
        )

    results = []
    for result in vad_results:
        label = result[1]

        # Non-activity frames are passed through untouched unless the caller
        # explicitly asked to split them too.
        if not label and ignore_not_activity:
            results.append(result)
            continue

        frame = result[0]
        from_vad = frame.timestamp
        until_vad = frame.duration + from_vad

        # Cut points: midpoints of confident speaker changes inside this frame.
        group = []
        for change in speaker_change_results:
            until_change = (change[0].duration / 2) + change[0].timestamp
            if (
                from_vad <= until_change <= until_vad
                and change[1] >= speaker_change_threshold
            ):
                group.append(until_change)

        if not group:
            results.append(result)
            continue

        # `before` is the offset (seconds) of the current segment's start
        # relative to the start of `frame.array`; `before_timestamp` is the
        # same position expressed as an absolute timestamp.
        before = 0
        before_timestamp = from_vad
        for t in group:
            after = t - before_timestamp  # duration of this segment
            f = Frame(
                frame.array[int(before * sr): int((before + after) * sr)],
                before_timestamp,
                after,
            )
            results.append((f, label))
            # Accumulate the offset. Assigning `after` here would only be
            # correct for the first cut and would mis-slice later segments.
            before += after
            before_timestamp = t

        # Whatever remains after the last cut point is the final segment.
        if until_vad > before_timestamp:
            f = Frame(
                frame.array[int(before * sr):],
                before_timestamp,
                until_vad - before_timestamp,
            )
            results.append((f, label))

    return results