{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Speech-to-Text CTC + pyctcdecode + GPT2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Encoder model + CTC loss + pyctcdecode with GPT2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "|  | Size (MB) | Quantized Size (MB) | WER | CER | WER-LM | CER-LM | Language |\n",
 "|---|---|---|---|---|---|---|---|\n",
 "| hubert-conformer-tiny | 36.6 | 10.3 | 0.335968 | 0.088257 | 0.199227 | 0.063522 | [malay] |\n",
 "| hubert-conformer | 115 | 31.1 | 0.238714 | 0.0609 | 0.141479 | 0.045075 | [malay] |\n",
 "| hubert-conformer-large | 392 | 100 | 0.220314 | 0.054927 | 0.128006 | 0.038533 | [malay] |\n",
 "| hubert-conformer-large-3mixed | 392 | 100 | 0.241126 | 0.078794 | 0.132761 | 0.057482 | [malay, singlish, mandarin] |\n",
 "| best-rq-conformer-tiny | 36.6 | 10.3 | 0.319291 | 0.078988 | 0.179582 | 0.055521 | [malay] |\n",
 "| best-rq-conformer | 115 | 31.1 | 0.253678 | 0.065805 | 0.154206 | 0.048228 | [malay] |\n",
 "| best-rq-conformer-large | 392 | 100 | 0.234651 | 0.06016 | 0.130082 | 0.044521 | [malay] |\n", "