{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Speech-to-Text CTC web inference using Gradio" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Encoder model + CTC loss web inference using Gradio" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n",
"|  | Size (MB) | Quantized Size (MB) | WER | CER | WER-LM | CER-LM | Language |\n",
"| --- | --- | --- | --- | --- | --- | --- | --- |\n",
"| hubert-conformer-tiny | 36.6 | 10.3 | 0.335968 | 0.0882573 | 0.199227 | 0.0635223 | [malay] |\n",
"| hubert-conformer | 115 | 31.1 | 0.238714 | 0.0608998 | 0.141479 | 0.0450751 | [malay] |\n",
"| hubert-conformer-large | 392 | 100 | 0.220314 | 0.054927 | 0.128006 | 0.0385329 | [malay] |\n",
"| hubert-conformer-large-3mixed | 392 | 100 | 0.241126 | 0.0787939 | 0.132761 | 0.057482 | [malay, singlish, mandarin] |\n",
"| best-rq-conformer-tiny | 36.6 | 10.3 | 0.319291 | 0.078988 | 0.179582 | 0.055521 | [malay] |\n",
"| best-rq-conformer | 115 | 31.1 | 0.253678 | 0.0658045 | 0.154206 | 0.0482278 | [malay] |\n",
"| best-rq-conformer-large | 392 | 100 | 0.234651 | 0.0601605 | 0.130082 | 0.044521 | [malay] |\n", "