Source code for malaya_speech.utils.combine

import numpy as np
from typing import List
from malaya_speech.utils.split import group_frames, group_frames_threshold


def without_silent(frames, threshold_to_stop: float = 0.1, silent_trail: int = 500):
    """
    Group multiple frames based on label and threshold to stop.

    Parameters
    ----------
    frames: List[Tuple[Frame, label]]
        Output from VAD.
    threshold_to_stop: float, optional (default = 0.1)
        If `threshold_to_stop` is 0.1, a run of frames with the same label
        must span at least 0.1 second to form its own group.
    silent_trail: int, optional (default = 500)
        For a segment detected as silent, keep only its first and last
        `silent_trail` samples.

    Returns
    -------
    result : np.array
    """
    # group consecutive frames that share the same VAD label,
    # then regroup using the duration threshold
    grouped = group_frames(frames)
    grouped = group_frames_threshold(grouped, threshold_to_stop)
    r = []
    for g in grouped:
        if g[1]:
            # voiced segment: keep all samples
            g = g[0].array
        else:
            # silent segment: keep only a short trail at both ends
            g = np.concatenate(
                [g[0].array[:silent_trail], g[0].array[-silent_trail:]]
            )
        r.append(g)
    audio = np.concatenate(r)
    return audio
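
A minimal usage sketch follows, assuming the usual malaya_speech VAD flow where audio is split into Frame objects and each frame is paired with a boolean voice-activity label. The helper names used here (malaya_speech.load, malaya_speech.utils.generator.frames, malaya_speech.vad.webrtc) are assumptions about the surrounding library API and may differ between versions; only without_silent itself comes from this module.

# hypothetical usage sketch; API names outside this module are assumptions
import malaya_speech
from malaya_speech.utils.combine import without_silent

# load audio and split it into fixed-duration frames (assumed helpers)
y, sr = malaya_speech.load('speech.wav')
frames = malaya_speech.utils.generator.frames(y, 30, sr)

# label each frame with a VAD model, producing List[Tuple[Frame, bool]]
vad = malaya_speech.vad.webrtc()  # assumed VAD factory
labelled = [(frame, vad(frame)) for frame in frames]

# drop long silences, keeping a 500-sample trail on each side of silent gaps
trimmed = without_silent(labelled, threshold_to_stop=0.1, silent_trail=500)

The trimmed result is a single np.array suitable for playback or for feeding into downstream models that should not spend time on long silences.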