Source code for malaya_speech.streaming.pyaudio

# https://github.com/mozilla/DeepSpeech-examples/blob/r0.8/mic_vad_streaming/mic_vad_streaming.py

import collections
import queue
import numpy as np
import logging
from scipy import signal
from typing import Callable
from malaya_speech.streaming import stream as base_stream

logger = logging.getLogger(__name__)

pyaudio_available = False
try:
    import pyaudio
    pyaudio_available = True
except Exception:
    logger.warning(f'`pyaudio` is not available, `{__name__}` cannot be used.')


class Audio:
    def __init__(
        self,
        vad_model=None,
        callback=None,
        device=None,
        input_rate: int = 16000,
        sample_rate: int = 16000,
        segment_length: int = 320,
        channels: int = 1,
        stream_callback: Callable = None,
        hard_utterence: bool = True,
        **kwargs,
    ):

        self.vad_model = vad_model
        self.input_rate = input_rate
        self.sample_rate = sample_rate
        self.channels = channels
        self.block_size_input = segment_length
        self.segment_length = segment_length
        self.hard_utterence = hard_utterence

        self.buffer_queue = queue.Queue()
        self.device = device
        self.format = pyaudio.paFloat32

        if callback is None:
            def callback(in_data):
                return self.buffer_queue.put(in_data)

        def proxy_callback(in_data, frame_count, time_info, status):
            # `self.chunk` stays None for live microphone input, so the
            # `self.wf` file-reading branch (a leftover from the DeepSpeech
            # example this module is based on) is never reached here.
            if self.chunk is not None:
                in_data = self.wf.readframes(self.chunk)
            callback(in_data)
            return (None, pyaudio.paContinue)

        self.pa = pyaudio.PyAudio()
        # Use a dedicated dict so the constructor's absorbed **kwargs are not
        # shadowed.
        stream_kwargs = {
            'format': self.format,
            'channels': self.channels,
            'rate': self.input_rate,
            'input': True,
            'frames_per_buffer': self.block_size_input,
            'stream_callback': proxy_callback if stream_callback is None else stream_callback,
        }

        self.chunk = None
        if self.device:
            stream_kwargs['input_device_index'] = self.device

        self.stream = self.pa.open(**stream_kwargs)
        self.stream.start_stream()

    def resample(self, data, input_rate):
        # The stream is opened with `pyaudio.paFloat32`, so parse float32 here
        # (int16 parsing was inherited from the int16-based DeepSpeech example)
        # and use `frombuffer` / `tobytes`; `np.fromstring` and `tostring` are
        # deprecated or removed in modern NumPy.
        data32 = np.frombuffer(buffer=data, dtype=np.float32)
        resample_size = int(len(data32) / input_rate * self.sample_rate)
        resampled = signal.resample(data32, resample_size)
        return np.asarray(resampled, dtype=np.float32).tobytes()

    def read_resampled(self):
        return self.resample(
            data=self.buffer_queue.get(), input_rate=self.input_rate
        )

    def read(self):
        """Return a block of audio data, blocking if necessary."""
        return self.buffer_queue.get()

    def frame_generator(self):
        if self.input_rate == self.sample_rate:
            while True:
                yield self.read()
        else:
            while True:
                yield self.read_resampled()

    def destroy(self):
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()

    def vad_collector(self, num_padding_frames=20, ratio=0.75):
        """
        Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None.
        Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                    |---utterence---|        |---utterence---|
        """
        frames = self.frame_generator()
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False

        for i, frame in enumerate(frames):
            # Stop on short, truncated chunks (e.g. at stream shutdown); the
            # 640-byte threshold is carried over from the original example.
            if len(frame) < 640:
                return

            frame = np.frombuffer(frame, np.float32)

            if self.vad_model:
                try:
                    is_speech = self.vad_model(frame)
                    if isinstance(is_speech, dict):
                        is_speech = is_speech['vad']
                except Exception as e:
                    logger.debug(e)
                    is_speech = False
            else:
                is_speech = True

            logger.debug(is_speech)
            # Pair the samples with their starting offset (in samples) within the stream.
            frame = (frame, i * self.segment_length)

            if not self.hard_utterence:
                yield frame

            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                if num_voiced > ratio * ring_buffer.maxlen:
                    triggered = True
                    if self.hard_utterence:
                        for f, s in ring_buffer:
                            yield f
                    ring_buffer.clear()

            else:
                if self.hard_utterence:
                    yield frame
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len(
                    [f for f, speech in ring_buffer if not speech]
                )
                if num_unvoiced > ratio * ring_buffer.maxlen:
                    triggered = False
                    yield None
                    ring_buffer.clear()
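
# A minimal usage sketch of the `Audio` class (illustrative, not part of the
# original module). It assumes a working default microphone; with no
# `vad_model` attached every frame counts as speech, so the boundary `None`
# is never emitted; pass a VAD model to get utterance segmentation:
#
#     audio = Audio(input_rate=16000, sample_rate=16000, segment_length=320)
#     try:
#         for frame in audio.vad_collector():
#             if frame is None:
#                 print('--- utterance boundary ---')
#             else:
#                 samples, offset = frame
#                 print(f'{len(samples)} samples starting at sample {offset}')
#     except KeyboardInterrupt:
#         pass
#     finally:
#         audio.destroy()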


def stream(
    vad_model=None,
    asr_model=None,
    classification_model=None,
    sample_rate: int = 16000,
    segment_length: int = 2560,
    num_padding_frames: int = 20,
    ratio: float = 0.75,
    min_length: float = 0.1,
    max_length: float = 10.0,
    realtime_print: bool = True,
    **kwargs,
):
    """
    Stream audio using the pyaudio library.

    Parameters
    ----------
    vad_model: object, optional (default=None)
        VAD model / pipeline.
    asr_model: object, optional (default=None)
        ASR model / pipeline, will transcribe each subsample in realtime.
    classification_model: object, optional (default=None)
        classification pipeline, will classify each subsample in realtime.
    device: None, optional (default=None)
        `device` parameter for pyaudio, check available devices from `sounddevice.query_devices()`.
    sample_rate: int, optional (default=16000)
        output sample rate.
    segment_length: int, optional (default=2560)
        usually derived from asr_model.segment_length * asr_model.hop_length,
        size of audio chunks; actual size in seconds is `segment_length` / `sample_rate`.
    num_padding_frames: int, optional (default=20)
        size of the ring buffer (in frames) used to pad and smooth voice activity detection.
    ratio: float, optional (default=0.75)
        if this proportion of the queue is positive, assume it is voice activity.
    min_length: float, optional (default=0.1)
        minimum length (seconds) to accept a subsample.
    max_length: float, optional (default=10.0)
        maximum length (seconds) to accept a subsample.
    realtime_print: bool, optional (default=True)
        will print results for ASR.
    **kwargs:
        additional keyword arguments passed to the `malaya_speech.streaming.pyaudio.Audio` interface.

    Returns
    -------
    result : List[dict]
    """
    if not pyaudio_available:
        raise ModuleNotFoundError(
            'pyaudio not installed. Please install it by `pip install pyaudio` and try again.'
        )

    return base_stream(
        audio_class=Audio,
        vad_model=vad_model,
        asr_model=asr_model,
        classification_model=classification_model,
        sample_rate=sample_rate,
        segment_length=segment_length,
        num_padding_frames=num_padding_frames,
        ratio=ratio,
        min_length=min_length,
        max_length=max_length,
        realtime_print=realtime_print,
        **kwargs,
    )
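
# Illustrative entry point (an addition, not part of the original module):
# running this file directly streams from the default microphone with no
# models attached until interrupted. It assumes `base_stream` returns the
# collected subsamples on KeyboardInterrupt, matching the documented
# `List[dict]` return type.
if __name__ == '__main__':
    results = stream(sample_rate=16000, segment_length=2560)
    print(f'collected {len(results)} subsamples')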