from malaya_speech.supervised import classification
from malaya_speech.model.frame import Frame
from malaya_speech.utils import describe_availability
from herpetologist import check_type
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Registry of the speaker change models this module supports, keyed by model
# name. `deep_model` rejects any model name that is not a key of this dict, and
# `available_model` hands the whole dict to `describe_availability`.
# Each entry records the model size and quantized model size in MB plus an
# accuracy figure; per the message logged by `available_model`, that figure is
# the last accuracy of the training session before early stopping.
_availability = {
'vggvox-v2': {
'Size (MB)': 31.1,
'Quantized Size (MB)': 7.92,
'Accuracy': 0.63979,
},
'speakernet': {
'Size (MB)': 20.3,
'Quantized Size (MB)': 5.18,
'Accuracy': 0.64524,
},
}
def available_model():
    """
    List available speaker change deep models.

    Logs, at INFO level, a note on how the reported accuracy was measured
    before building the listing.

    Returns
    -------
    result : whatever `describe_availability` returns for `_availability`.
    """
    logger.info('last accuracy during training session before early stopping.')
    return describe_availability(_availability)
@check_type
def deep_model(model: str = 'vggvox-v2', quantized: bool = False, **kwargs):
    """
    Load speaker change deep model.

    Parameters
    ----------
    model : str, optional (default='vggvox-v2')
        Check available models at `malaya_speech.speaker_change.available_model()`.
        The name is lower-cased before lookup, so matching is case-insensitive.
    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        Quantized model is not necessarily faster, it totally depends on the machine.
    **kwargs
        Forwarded unchanged to `classification.load`.

    Returns
    -------
    result : malaya_speech.supervised.classification.load function

    Raises
    ------
    ValueError
        If `model` (after lower-casing) is not a key of `_availability`.
    """
    model = model.lower()
    if model not in _availability:
        raise ValueError(
            'model not supported, please check supported models from `malaya_speech.speaker_change.available_model()`.'
        )

    # Per-model extra settings passed to the loader through `extra`.
    settings = {
        'vggvox-v2': {'hop_length': 50, 'concat': False, 'mode': 'eval'},
        'speakernet': {'frame_ms': 20, 'stride_ms': 2},
    }

    return classification.load(
        model=model,
        module='speaker-change',
        extra=settings[model],
        # Binary labels: index 0 -> no speaker change, index 1 -> speaker change.
        label=[False, True],
        quantized=quantized,
        **kwargs
    )
def split_activities(
    vad_results,
    speaker_change_results,
    speaker_change_threshold: float = 0.5,
    sr: int = 16000,
    ignore_not_activity=True,
):
    """
    Split VAD results on detected speaker changes, worst-case O(N * M) for
    N VAD results and M speaker change results.

    A speaker change result becomes a split point for a VAD frame when the
    midpoint of its frame (`timestamp + duration / 2`) lies inside the VAD
    frame (both ends inclusive) and its score is at least
    `speaker_change_threshold`. Split points are used in the order they
    appear in `speaker_change_results`.

    Parameters
    ----------
    vad_results: List[Tuple[Frame, label]]
        results from VAD.
    speaker_change_results: List[Tuple[Frame, float]]
        results from speaker change module, must in float result.
    speaker_change_threshold: float, optional (default=0.5)
        in one voice activity sample can be more than one speaker, split it using this threshold.
    sr: int, optional (default=16000)
        sample rate, classification model in malaya-speech use 16k.
        Used to convert split timestamps into indices of the frame array.
    ignore_not_activity: bool, optional (default=True)
        If True, will ignore if result VAD is False, else will try to split.

    Returns
    -------
    result : List[Tuple[Frame, label]]
        VAD results that were not split are returned as the original tuples;
        split ones are replaced by new Frames carrying the original label.

    Raises
    ------
    ValueError
        If `speaker_change_threshold` is not in the range (0, 1].
    """
    if not 0 < speaker_change_threshold <= 1.0:
        raise ValueError(
            'speaker_change_threshold must, 0 < speaker_change_threshold <= 1.0'
        )

    results = []
    for result in vad_results:
        frame, label = result[0], result[1]
        if not label and ignore_not_activity:
            results.append(result)
            continue

        # Bounds of this VAD frame; invariant across all change results.
        vad_start = frame.timestamp
        vad_end = vad_start + frame.duration

        split_points = []
        for change in speaker_change_results:
            midpoint = change[0].timestamp + change[0].duration / 2
            if (
                vad_start <= midpoint <= vad_end
                and change[1] >= speaker_change_threshold
            ):
                split_points.append(midpoint)

        if not split_points:
            results.append(result)
            continue

        segment_start = vad_start
        start_index = 0
        for point in split_points:
            # Array indices are always measured from the start of the VAD
            # frame, so every boundary is derived from `vad_start` rather than
            # from the previous segment's duration. Reusing `end_index` as the
            # next `start_index` keeps consecutive segments contiguous.
            end_index = int((point - vad_start) * sr)
            segment = Frame(
                frame.array[start_index:end_index],
                segment_start,
                point - segment_start,
            )
            results.append((segment, label))
            segment_start = point
            start_index = end_index

        # Whatever is left after the last split point becomes a final segment.
        if vad_end > segment_start:
            segment = Frame(
                frame.array[start_index:],
                segment_start,
                vad_end - segment_start,
            )
            results.append((segment, label))

    return results