{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Realtime ASR without VAD\n", "\n", "Let's say you want to transcribe realtime recording / input using PyAudio without VAD; malaya-speech is able to do that." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "| | Size (MB) | malay-malaya | malay-fleur102 | Language | singlish |\n",
"|---|---|---|---|---|---|\n",
"| mesolitica/conformer-tiny | 38.5 | {'WER': 0.17341180814, 'CER': 0.05957485024} | {'WER': 0.19524478979, 'CER': 0.0830808938} | [malay] | NaN |\n",
"| mesolitica/conformer-base | 121 | {'WER': 0.122076123261, 'CER': 0.03879606324} | {'WER': 0.1326737206665, 'CER': 0.05032914857} | [malay] | NaN |\n",
"| mesolitica/conformer-medium | 243 | {'WER': 0.1054817492564, 'CER': 0.0313518992842} | {'WER': 0.1172708897486, 'CER': 0.0431050488} | [malay] | NaN |\n",
"| mesolitica/emformer-base | 162 | {'WER': 0.175762423786, 'CER': 0.06233919000537} | {'WER': 0.18303839134, 'CER': 0.0773853362} | [malay] | NaN |\n",
"| mesolitica/conformer-base-singlish | 121 | NaN | NaN | [singlish] | {'WER': 0.06517537334361, 'CER': 0.03265430876} |\n",
"| mesolitica/conformer-medium-mixed | 243 | {'WER': 0.111166517935, 'CER': 0.03410958328} | {'WER': 0.108354748, 'CER': 0.037785722} | [malay, singlish] | {'WER': 0.091969755225, 'CER': 0.044627194623} |\n",
"| mesolitica/conformer-medium-mixed-augmented | 243 | {'WER': 0.1015719878, 'CER': 0.0326360923} | {'WER': 0.1103884742, 'CER': 0.0385676182} | [malay, singlish] | {'WER': 0.086342166, 'CER': 0.0413572066} |\n",
"| mesolitica/conformer-large-mixed-augmented | 413 | {'WER': 0.0919852874, 'CER': 0.026612152} | {'WER': 0.103593636, 'CER': 0.036611048} | [malay, singlish] | {'WER': 0.08727157, 'CER': 0.04318735972} |\n", "