{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Speech-to-Text RNNT web inference using Gradio" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run web inference with Gradio for an encoder model trained with RNNT loss." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "| Model | Size (MB) | Quantized Size (MB) | WER | CER | WER-LM | CER-LM | Language |\n",
"|---|---|---|---|---|---|---|---|\n",
"| tiny-conformer | 24.4 | 9.14 | 0.212811 | 0.081369 | 0.199683 | 0.077004 | [malay] |\n",
"| small-conformer | 49.2 | 18.1 | 0.198533 | 0.074495 | 0.185361 | 0.071143 | [malay] |\n",
"| conformer | 125 | 37.1 | 0.163602 | 0.058744 | 0.156182 | 0.05719 | [malay] |\n",
"| large-conformer | 404 | 107 | 0.156684 | 0.061971 | 0.148622 | 0.05901 | [malay] |\n",
"| conformer-stack-2mixed | 130 | 38.5 | 0.103608 | 0.050069 | 0.102911 | 0.050201 | [malay, singlish] |\n",
"| conformer-stack-3mixed | 130 | 38.5 | 0.234768 | 0.133944 | 0.229241 | 0.130702 | [malay, singlish, mandarin] |\n",
"| small-conformer-singlish | 49.2 | 18.1 | 0.087831 | 0.045686 | 0.087333 | 0.045317 | [singlish] |\n",
"| conformer-singlish | 125 | 37.1 | 0.077792 | 0.040362 | 0.077186 | 0.03987 | [singlish] |\n",
"| large-conformer-singlish | 404 | 107 | 0.070147 | 0.035872 | 0.069812 | 0.035723 | [singlish] |\n", "