{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Speech-to-Text RNNT + GPT2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Speech-to-text using an encoder model trained with RNNT (RNN-Transducer) loss, with GPT2 as the language model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", " | Size (MB) | \n", "Quantized Size (MB) | \n", "malay-malaya | \n", "malay-fleur102 | \n", "Language | \n", "singlish | \n", "
---|---|---|---|---|---|---|
tiny-conformer | \n", "24.4 | \n", "9.14 | \n", "{'WER': 0.2128108, 'CER': 0.08136871, 'WER-LM'... | \n", "{'WER': 0.2682816, 'CER': 0.13052725, 'WER-LM'... | \n", "[malay] | \n", "NaN | \n", "
small-conformer | \n", "49.2 | \n", "18.1 | \n", "{'WER': 0.19853302, 'CER': 0.07449528, 'WER-LM... | \n", "{'WER': 0.23412149, 'CER': 0.1138314813, 'WER-... | \n", "[malay] | \n", "NaN | \n", "
conformer | \n", "125 | \n", "37.1 | \n", "{'WER': 0.16340855635999124, 'CER': 0.05897205... | \n", "{'WER': 0.20090442596, 'CER': 0.09616901, 'WER... | \n", "[malay] | \n", "NaN | \n", "
large-conformer | \n", "404 | \n", "107 | \n", "{'WER': 0.1566839, 'CER': 0.0619715, 'WER-LM':... | \n", "{'WER': 0.1711028238, 'CER': 0.077953559, 'WER... | \n", "[malay] | \n", "NaN | \n", "
conformer-stack-2mixed | \n", "130 | \n", "38.5 | \n", "{'WER': 0.1889883954, 'CER': 0.0726845531, 'WE... | \n", "{'WER': 0.244836948, 'CER': 0.117409327, 'WER-... | \n", "[malay, singlish] | \n", "{'WER': 0.08535878149, 'CER': 0.0452357273822,... | \n", "
small-conformer-singlish | \n", "49.2 | \n", "18.1 | \n", "NaN | \n", "NaN | \n", "[singlish] | \n", "{'WER': 0.087831, 'CER': 0.0456859, 'WER-LM': ... | \n", "
conformer-singlish | \n", "125 | \n", "37.1 | \n", "NaN | \n", "NaN | \n", "[singlish] | \n", "{'WER': 0.07779246, 'CER': 0.0403616, 'WER-LM'... | \n", "
large-conformer-singlish | \n", "404 | \n", "107 | \n", "NaN | \n", "NaN | \n", "[singlish] | \n", "{'WER': 0.07014733, 'CER': 0.03587201, 'WER-LM... | \n", "