{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Long Audio Classification\n", "\n", "Let's say you want to classify long audio using TorchAudio; malaya-speech is able to do that." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", " | Size (MB) | \n", "malay-malaya | \n", "malay-fleur102 | \n", "Language | \n", "singlish | \n", "
---|---|---|---|---|---|
mesolitica/conformer-tiny | \n", "38.5 | \n", "{'WER': 0.17341180814, 'CER': 0.05957485024} | \n", "{'WER': 0.19524478979, 'CER': 0.0830808938} | \n", "[malay] | \n", "NaN | \n", "
mesolitica/conformer-base | \n", "121 | \n", "{'WER': 0.122076123261, 'CER': 0.03879606324} | \n", "{'WER': 0.1326737206665, 'CER': 0.05032914857} | \n", "[malay] | \n", "NaN | \n", "
mesolitica/conformer-medium | \n", "243 | \n", "{'WER': 0.11723275992, 'CER': 0.03398158434893} | \n", "{'WER': 0.12977366262, 'CER': 0.048497925111} | \n", "[malay] | \n", "NaN | \n", "
mesolitica/emformer-base | \n", "162 | \n", "{'WER': 0.175762423786, 'CER': 0.06233919000537} | \n", "{'WER': 0.18303839134, 'CER': 0.0773853362} | \n", "[malay] | \n", "NaN | \n", "
mesolitica/conformer-singlish | \n", "121 | \n", "NaN | \n", "NaN | \n", "[singlish] | \n", "{'WER': 0.08535878149, 'CER': 0.0452357273822,... | \n", "
mesolitica/conformer-medium-mixed | \n", "243 | \n", "{'WER': 0.122076123261, 'CER': 0.03879606324} | \n", "{'WER': 0.1326737206665, 'CER': 0.05032914857} | \n", "[malay, singlish] | \n", "{'WER': 0.08535878149, 'CER': 0.0452357273822,... | \n", "