from scipy.spatial.distance import cdist
from malaya_speech.model.clustering import ClusteringAP
from malaya_speech.utils.dist import l2_normalize, compute_log_dist_matrix
import numpy as np
from herpetologist import check_type
from typing import Callable
@check_type
def speaker_similarity(
    vad_results,
    speaker_vector,
    similarity_threshold: float = 0.8,
    norm_function: Callable = None,
    return_embedding: bool = False,
):
    """
    Speaker diarization using cosine similarity against known speakers.

    Voiced samples are processed in order. Each one is compared with the
    embeddings of every speaker found so far; it is assigned to the most
    similar speaker whose similarity reaches `similarity_threshold`,
    otherwise it starts a new speaker.

    Parameters
    ----------
    vad_results: List[Tuple[Frame, label]]
        results from VAD.
    speaker_vector: callable
        speaker vector object, called with a list holding one frame.
    similarity_threshold: float, optional (default=0.8)
        if current voice activity sample similar at least 80%,
        we assumed it is from the same speaker.
    norm_function: Callable, optional(default=None)
        normalize function applied to the stored speaker vectors before
        each comparison.
    return_embedding: bool, optional (default=False)
        if True, also return the list of speaker vectors, one per speaker.

    Returns
    -------
    result : List[Tuple[Frame, label]]
        label is 'speaker {index}' for voiced samples and 'not a speaker'
        otherwise. If `return_embedding` is True, returns
        (result, embedding) instead.

    Raises
    ------
    ValueError
        if `similarity_threshold` is not in (0, 1].
    """
    if not 0 < similarity_threshold <= 1.0:
        raise ValueError(
            'similarity_threshold must, 0 < similarity_threshold <= 1.0'
        )

    results, embedding = [], []
    # Single pass so one-shot iterables of VAD results are labelled too.
    for result in vad_results:
        if not result[1]:
            results.append((result[0], 'not a speaker'))
            continue

        vector = speaker_vector([result[0]])[0]
        speaker_index = None
        if embedding:
            known = np.array(embedding)
            if norm_function:
                # NOTE(review): only the stored vectors are normalized, the
                # query vector is compared as-is -- confirm this is intended.
                known = norm_function(known)
            similarity = 1 - cdist([vector], known, metric='cosine')[0]
            candidates = np.where(similarity >= similarity_threshold)[0]
            if len(candidates):
                # Best match among speakers that pass the threshold; NaN
                # similarities never pass, so they cannot be selected.
                best = np.argmax(similarity[candidates])
                speaker_index = int(candidates[best])

        if speaker_index is None:
            # No known speaker is close enough: register a new one.
            speaker_index = len(embedding)
            embedding.append(vector)
        results.append((result[0], f'speaker {speaker_index}'))

    if return_embedding:
        return results, embedding
    return results
@check_type
def n_clustering(
    vad_results,
    speaker_vector,
    model,
    norm_function: Callable = l2_normalize,
    return_embedding=False,
):
    """
    Speaker diarization using any clustering model.

    Speaker vectors are computed for every voiced sample, optionally
    normalized, and clustered in one batch; each voiced sample is labelled
    with its cluster.

    Parameters
    ----------
    vad_results: List[Tuple[Frame, label]]
        results from VAD.
    speaker_vector: callable
        speaker vector object, called with a list holding one frame.
    model: callable
        Prefer any sklearn unsupervised clustering model.
        Required `fit_predict` or `apply` method.
    norm_function: Callable, optional(default=malaya_speech.utils.dist.l2_normalize)
        normalize function for speaker vectors.
    return_embedding: bool, optional (default=False)
        if True, also return the (normalized) speaker vectors array.

    Returns
    -------
    result : List[Tuple[Frame, label]]
        label is 'speaker {cluster}' for voiced samples and 'not a speaker'
        otherwise. If `return_embedding` is True, returns
        (result, activities) instead.

    Raises
    ------
    ValueError
        if `model` has neither a `fit_predict` nor an `apply` method.
    """
    if not hasattr(model, 'fit_predict') and not hasattr(model, 'apply'):
        raise ValueError('model must have `fit_predict` or `apply` method.')

    # Single pass: remember every frame, and for voiced ones the position
    # in the output that its cluster label must be written to.
    frames, labels, voiced_positions, vectors = [], [], [], []
    for position, result in enumerate(vad_results):
        frames.append(result[0])
        labels.append('not a speaker')
        if result[1]:
            voiced_positions.append(position)
            vectors.append(speaker_vector([result[0]])[0])

    activities = np.array(vectors)
    # Without any voiced sample there is nothing to normalize or cluster.
    if vectors:
        if norm_function:
            activities = norm_function(activities)
        # When the model offers both methods, the `apply` result is the one
        # used (it is computed last), matching the established behaviour.
        if hasattr(model, 'fit_predict'):
            cluster_labels = model.fit_predict(activities)
        if hasattr(model, 'apply'):
            cluster_labels = model.apply(activities)
        for k, position in enumerate(voiced_positions):
            labels[position] = f'speaker {cluster_labels[k]}'

    results = list(zip(frames, labels))
    if return_embedding:
        return results, activities
    return results
@check_type
def affinity_propagation(
    vad_results,
    speaker_vector,
    norm_function: Callable = l2_normalize,
    log_distance_metric: str = 'cosine',
    damping: float = 0.8,
    preference: float = None,
    return_embedding=False,
):
    """
    Speaker diarization using Affinity Propagation.

    Builds a `ClusteringAP` model and delegates the diarization itself to
    `n_clustering`.

    Parameters
    ----------
    vad_results: List[Tuple[Frame, label]]
        results from VAD.
    speaker_vector: callable
        speaker vector object.
    norm_function: Callable, optional(default=malaya_speech.utils.dist.l2_normalize)
        normalize function for speaker vectors.
    log_distance_metric: str, optional (default='cosine')
        post distance norm in log scale metrics, passed to `ClusteringAP`
        as `metric`.
    damping: float, optional (default=0.8)
        passed to `ClusteringAP` as `damping`.
    preference: float, optional (default=None)
        passed to `ClusteringAP` as `preference`.
    return_embedding: bool, optional (default=False)
        forwarded to `n_clustering`; if True the speaker vectors are
        returned alongside the result.

    Returns
    -------
    result : List[Tuple[Frame, label]]
        or (result, activities) when `return_embedding` is True.
    """
    return n_clustering(
        vad_results=vad_results,
        speaker_vector=speaker_vector,
        model=ClusteringAP(
            metric=log_distance_metric,
            damping=damping,
            preference=preference,
        ),
        norm_function=norm_function,
        return_embedding=return_embedding,
    )
@check_type
def spectral_cluster(
    vad_results,
    speaker_vector,
    min_clusters: int = None,
    max_clusters: int = None,
    norm_function: Callable = l2_normalize,
    log_distance_metric: str = None,
    return_embedding=False,
    **kwargs,
):
    """
    Speaker diarization using SpectralCluster, https://github.com/wq2012/SpectralCluster

    Parameters
    ----------
    vad_results: List[Tuple[Frame, label]]
        results from VAD.
    speaker_vector: callable
        speaker vector object, called with a list holding one frame.
    min_clusters: int, optional (default=None)
        minimal number of clusters allowed (only effective if not None).
    max_clusters: int, optional (default=None)
        maximal number of clusters allowed (only effective if not None).
        can be used together with min_clusters to fix the number of clusters.
    norm_function: Callable, optional(default=malaya_speech.utils.dist.l2_normalize)
        normalize function for speaker vectors.
    log_distance_metric: str, optional (default=None)
        post distance norm in log scale metrics.
    return_embedding: bool, optional (default=False)
        if True, also return the array that was clustered. When
        `log_distance_metric` is set this is the log distance matrix,
        not the raw speaker vectors.
    **kwargs:
        extra keyword arguments passed to `SpectralClusterer`.

    Returns
    -------
    result : List[Tuple[Frame, label]]
        label is 'speaker {cluster}' for voiced samples and 'not a speaker'
        otherwise. If `return_embedding` is True, returns
        (result, activities) instead.

    Raises
    ------
    ModuleNotFoundError
        if the `spectralcluster` package cannot be imported.
    """
    try:
        from spectralcluster import SpectralClusterer
    except ImportError as e:
        # Only import failures are translated; interrupts and unrelated
        # errors propagate untouched, and the original cause is kept.
        raise ModuleNotFoundError(
            'spectralcluster not installed. Please install it by `pip install spectralcluster` and try again.'
        ) from e

    clusterer = SpectralClusterer(
        min_clusters=min_clusters,
        max_clusters=max_clusters,
        **kwargs,
    )

    # Single pass: remember every frame, and for voiced ones the position
    # in the output that its cluster label must be written to.
    frames, labels, voiced_positions, vectors = [], [], [], []
    for position, result in enumerate(vad_results):
        frames.append(result[0])
        labels.append('not a speaker')
        if result[1]:
            voiced_positions.append(position)
            vectors.append(speaker_vector([result[0]])[0])

    activities = np.array(vectors)
    # Without any voiced sample there is nothing to normalize or cluster.
    if vectors:
        if norm_function:
            activities = norm_function(activities)
        if log_distance_metric:
            activities = compute_log_dist_matrix(
                activities, log_distance_metric
            )
        cluster_labels = clusterer.predict(activities)
        for k, position in enumerate(voiced_positions):
            labels[position] = f'speaker {cluster_labels[k]}'

    results = list(zip(frames, labels))
    if return_embedding:
        return results, activities
    return results