Source code for malaya_speech.streaming.pyaudio

# https://github.com/mozilla/DeepSpeech-examples/blob/r0.8/mic_vad_streaming/mic_vad_streaming.py

import collections
import queue
import numpy as np
import logging
from scipy import signal
from typing import Callable
from malaya_speech.streaming import stream as base_stream

logger = logging.getLogger(__name__)

pyaudio_available = False
try:
    import pyaudio
    pyaudio_available = True
except Exception:
    logger.warning(f'`pyaudio` is not available, `{__name__}` cannot be used.')


class Audio:
    def __init__(
        self,
        vad_model=None,
        callback=None,
        device=None,
        input_rate: int = 16000,
        sample_rate: int = 16000,
        segment_length: int = 320,
        channels: int = 1,
        stream_callback: Callable = None,
        hard_utterence: bool = True,
        **kwargs,
    ):

        self.vad_model = vad_model
        self.input_rate = input_rate
        self.sample_rate = sample_rate
        self.channels = channels
        self.block_size_input = segment_length
        self.segment_length = segment_length
        self.hard_utterence = hard_utterence

        self.buffer_queue = queue.Queue()
        self.device = device
        self.format = pyaudio.paFloat32

        if callback is None:
            def callback(in_data):
                return self.buffer_queue.put(in_data)

        def proxy_callback(in_data, frame_count, time_info, status):
            # `self.chunk` stays None for live microphone input, so the
            # `self.wf` file-reading branch (a leftover from the DeepSpeech
            # example this module is based on) is never reached here.
            if self.chunk is not None:
                in_data = self.wf.readframes(self.chunk)
            callback(in_data)
            return (None, pyaudio.paContinue)

        self.pa = pyaudio.PyAudio()
        # Use a dedicated dict so the constructor's absorbed **kwargs are not
        # shadowed.
        stream_kwargs = {
            'format': self.format,
            'channels': self.channels,
            'rate': self.input_rate,
            'input': True,
            'frames_per_buffer': self.block_size_input,
            'stream_callback': proxy_callback if stream_callback is None else stream_callback,
        }

        self.chunk = None
        if self.device:
            stream_kwargs['input_device_index'] = self.device

        self.stream = self.pa.open(**stream_kwargs)
        self.stream.start_stream()

    def resample(self, data, input_rate):
        # The stream is opened with `pyaudio.paFloat32`, so parse float32 here
        # (int16 parsing was inherited from the int16-based DeepSpeech example)
        # and use `frombuffer` / `tobytes`; `np.fromstring` and `tostring` are
        # deprecated or removed in modern NumPy.
        data32 = np.frombuffer(buffer=data, dtype=np.float32)
        resample_size = int(len(data32) / input_rate * self.sample_rate)
        resampled = signal.resample(data32, resample_size)
        return np.asarray(resampled, dtype=np.float32).tobytes()

    def read_resampled(self):
        return self.resample(
            data=self.buffer_queue.get(), input_rate=self.input_rate
        )

    def read(self):
        """Return a block of audio data, blocking if necessary."""
        return self.buffer_queue.get()

    def frame_generator(self):
        if self.input_rate == self.sample_rate:
            while True:
                yield self.read()
        else:
            while True:
                yield self.read_resampled()

    def destroy(self):
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()

    def vad_collector(self, num_padding_frames=20, ratio=0.75):
        """
        Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None.
        Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                    |---utterence---|        |---utterence---|
        """
        frames = self.frame_generator()
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False

        for i, frame in enumerate(frames):
            # Stop on short, truncated chunks (e.g. at stream shutdown); the
            # 640-byte threshold is carried over from the original example.
            if len(frame) < 640:
                return

            frame = np.frombuffer(frame, np.float32)

            if self.vad_model:
                try:
                    is_speech = self.vad_model(frame)
                    if isinstance(is_speech, dict):
                        is_speech = is_speech['vad']
                except Exception as e:
                    logger.debug(e)
                    is_speech = False
            else:
                is_speech = True

            logger.debug(is_speech)
            # Pair the samples with their starting offset (in samples) within the stream.
            frame = (frame, i * self.segment_length)

            if not self.hard_utterence:
                yield frame

            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                if num_voiced > ratio * ring_buffer.maxlen:
                    triggered = True
                    if self.hard_utterence:
                        for f, s in ring_buffer:
                            yield f
                    ring_buffer.clear()

            else:
                if self.hard_utterence:
                    yield frame
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len(
                    [f for f, speech in ring_buffer if not speech]
                )
                if num_unvoiced > ratio * ring_buffer.maxlen:
                    triggered = False
                    yield None
                    ring_buffer.clear()
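
# A minimal usage sketch of the `Audio` class (illustrative, not part of the
# original module). It assumes a working default microphone; with no
# `vad_model` attached every frame counts as speech, so the boundary `None`
# is never emitted; pass a VAD model to get utterance segmentation:
#
#     audio = Audio(input_rate=16000, sample_rate=16000, segment_length=320)
#     try:
#         for frame in audio.vad_collector():
#             if frame is None:
#                 print('--- utterance boundary ---')
#             else:
#                 samples, offset = frame
#                 print(f'{len(samples)} samples starting at sample {offset}')
#     except KeyboardInterrupt:
#         pass
#     finally:
#         audio.destroy()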


def stream(
    vad_model=None,
    asr_model=None,
    classification_model=None,
    sample_rate: int = 16000,
    segment_length: int = 2560,
    num_padding_frames: int = 20,
    ratio: float = 0.75,
    min_length: float = 0.1,
    max_length: float = 10.0,
    realtime_print: bool = True,
    **kwargs,
):
    """
    Stream audio using the pyaudio library.

    Parameters
    ----------
    vad_model: object, optional (default=None)
        VAD model / pipeline.
    asr_model: object, optional (default=None)
        ASR model / pipeline, will transcribe each subsample in realtime.
    classification_model: object, optional (default=None)
        classification pipeline, will classify each subsample in realtime.
    device: None, optional (default=None)
        `device` parameter for pyaudio, check available devices from `sounddevice.query_devices()`.
    sample_rate: int, optional (default=16000)
        output sample rate.
    segment_length: int, optional (default=2560)
        usually derived from asr_model.segment_length * asr_model.hop_length,
        size of audio chunks; actual size in seconds is `segment_length` / `sample_rate`.
    num_padding_frames: int, optional (default=20)
        size of the ring buffer (in frames) used to pad and smooth voice activity detection.
    ratio: float, optional (default=0.75)
        if this proportion of the queue is positive, assume it is voice activity.
    min_length: float, optional (default=0.1)
        minimum length (seconds) to accept a subsample.
    max_length: float, optional (default=10.0)
        maximum length (seconds) to accept a subsample.
    realtime_print: bool, optional (default=True)
        will print results for ASR.
    **kwargs:
        additional keyword arguments passed to the `malaya_speech.streaming.pyaudio.Audio` interface.

    Returns
    -------
    result : List[dict]
    """
    if not pyaudio_available:
        raise ModuleNotFoundError(
            'pyaudio not installed. Please install it by `pip install pyaudio` and try again.'
        )

    return base_stream(
        audio_class=Audio,
        vad_model=vad_model,
        asr_model=asr_model,
        classification_model=classification_model,
        sample_rate=sample_rate,
        segment_length=segment_length,
        num_padding_frames=num_padding_frames,
        ratio=ratio,
        min_length=min_length,
        max_length=max_length,
        realtime_print=realtime_print,
        **kwargs,
    )
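
# Illustrative entry point (an addition, not part of the original module):
# running this file directly streams from the default microphone with no
# models attached until interrupted. It assumes `base_stream` returns the
# collected subsamples on KeyboardInterrupt, matching the documented
# `List[dict]` return type.
if __name__ == '__main__':
    results = stream(sample_rate=16000, segment_length=2560)
    print(f'collected {len(results)} subsamples')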