{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Speech-to-Text RNNT PyTorch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Encoder model + RNNT loss using PyTorch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This tutorial is available as an IPython notebook at [malaya-speech/example/stt-transducer-model-pt](https://github.com/huseinzol05/malaya-speech/tree/master/example/stt-transducer-model-pt).\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This module is not language independent, so it is not safe to use on different languages. Pretrained models are trained on hyperlocal languages.\n", " \n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "os.environ['CUDA_VISIBLE_DEVICES'] = ''" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.\n" ] } ], "source": [ "import malaya_speech\n", "import numpy as np\n", "from malaya_speech import Pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import logging\n", "\n", "logging.basicConfig(level=logging.INFO)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### List available RNNT model" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:malaya_speech.stt:for `malay-fleur102` language, tested on FLEURS102 `ms_my` test set, https://github.com/huseinzol05/malaya-speech/tree/master/pretrained-model/prepare-stt\n", "INFO:malaya_speech.stt:for `malay-malaya` language, tested on malaya-speech test set, https://github.com/huseinzol05/malaya-speech/tree/master/pretrained-model/prepare-stt\n", "INFO:malaya_speech.stt:for `singlish` language, tested on IMDA malaya-speech test set, https://github.com/huseinzol05/malaya-speech/tree/master/pretrained-model/prepare-stt\n", "INFO:malaya_speech.stt:for `whisper-mixed` language, tested on semisupervised Whisper Large V2 test set, https://github.com/huseinzol05/malaya-speech/tree/master/pretrained-model/prepare-stt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Size (MB)malay-malayamalay-fleur102Languagesinglishwhisper-mixed
mesolitica/conformer-tiny38.5{'WER': 0.17341180814, 'CER': 0.05957485024}{'WER': 0.19524478979, 'CER': 0.0830808938}[malay]NaNNaN
mesolitica/conformer-base121{'WER': 0.122076123261, 'CER': 0.03879606324}{'WER': 0.1326737206665, 'CER': 0.05032914857}[malay]NaNNaN
mesolitica/conformer-medium243{'WER': 0.1054817492564, 'CER': 0.0313518992842}{'WER': 0.1172708897486, 'CER': 0.0431050488}[malay]NaNNaN
mesolitica/emformer-base162{'WER': 0.175762423786, 'CER': 0.06233919000537}{'WER': 0.18303839134, 'CER': 0.0773853362}[malay]NaNNaN
mesolitica/conformer-base-singlish121NaNNaN[singlish]{'WER': 0.06517537334361, 'CER': 0.03265430876}NaN
mesolitica/conformer-medium-mixed243{'WER': 0.111166517935, 'CER': 0.03410958328}{'WER': 0.108354748, 'CER': 0.037785722}[malay, singlish]{'WER': 0.091969755225, 'CER': 0.044627194623}NaN
mesolitica/conformer-medium-malay-whisper243{'WER': 0.092561502, 'CER': 0.0245421736}{'WER': 0.097128574, 'CER': 0.03392603}[malay, mixed]NaN{'WER': 0.1705298134, 'CER': 0.10580679153}
mesolitica/conformer-large-malay-whisper413{'WER': 0.10028492039, 'CER': 0.0310868406}{'WER': 0.09544850396, 'CER': 0.03258454692}[malay, mixed]NaN{'WER': 0.20429079189, 'CER': 0.12111372327}
\n", "
" ], "text/plain": [ " Size (MB) \\\n", "mesolitica/conformer-tiny 38.5 \n", "mesolitica/conformer-base 121 \n", "mesolitica/conformer-medium 243 \n", "mesolitica/emformer-base 162 \n", "mesolitica/conformer-base-singlish 121 \n", "mesolitica/conformer-medium-mixed 243 \n", "mesolitica/conformer-medium-malay-whisper 243 \n", "mesolitica/conformer-large-malay-whisper 413 \n", "\n", " malay-malaya \\\n", "mesolitica/conformer-tiny {'WER': 0.17341180814, 'CER': 0.05957485024} \n", "mesolitica/conformer-base {'WER': 0.122076123261, 'CER': 0.03879606324} \n", "mesolitica/conformer-medium {'WER': 0.1054817492564, 'CER': 0.0313518992842} \n", "mesolitica/emformer-base {'WER': 0.175762423786, 'CER': 0.06233919000537} \n", "mesolitica/conformer-base-singlish NaN \n", "mesolitica/conformer-medium-mixed {'WER': 0.111166517935, 'CER': 0.03410958328} \n", "mesolitica/conformer-medium-malay-whisper {'WER': 0.092561502, 'CER': 0.0245421736} \n", "mesolitica/conformer-large-malay-whisper {'WER': 0.10028492039, 'CER': 0.0310868406} \n", "\n", " malay-fleur102 \\\n", "mesolitica/conformer-tiny {'WER': 0.19524478979, 'CER': 0.0830808938} \n", "mesolitica/conformer-base {'WER': 0.1326737206665, 'CER': 0.05032914857} \n", "mesolitica/conformer-medium {'WER': 0.1172708897486, 'CER': 0.0431050488} \n", "mesolitica/emformer-base {'WER': 0.18303839134, 'CER': 0.0773853362} \n", "mesolitica/conformer-base-singlish NaN \n", "mesolitica/conformer-medium-mixed {'WER': 0.108354748, 'CER': 0.037785722} \n", "mesolitica/conformer-medium-malay-whisper {'WER': 0.097128574, 'CER': 0.03392603} \n", "mesolitica/conformer-large-malay-whisper {'WER': 0.09544850396, 'CER': 0.03258454692} \n", "\n", " Language \\\n", "mesolitica/conformer-tiny [malay] \n", "mesolitica/conformer-base [malay] \n", "mesolitica/conformer-medium [malay] \n", "mesolitica/emformer-base [malay] \n", "mesolitica/conformer-base-singlish [singlish] \n", "mesolitica/conformer-medium-mixed [malay, singlish] \n", 
"mesolitica/conformer-medium-malay-whisper [malay, mixed] \n", "mesolitica/conformer-large-malay-whisper [malay, mixed] \n", "\n", " singlish \\\n", "mesolitica/conformer-tiny NaN \n", "mesolitica/conformer-base NaN \n", "mesolitica/conformer-medium NaN \n", "mesolitica/emformer-base NaN \n", "mesolitica/conformer-base-singlish {'WER': 0.06517537334361, 'CER': 0.03265430876} \n", "mesolitica/conformer-medium-mixed {'WER': 0.091969755225, 'CER': 0.044627194623} \n", "mesolitica/conformer-medium-malay-whisper NaN \n", "mesolitica/conformer-large-malay-whisper NaN \n", "\n", " whisper-mixed \n", "mesolitica/conformer-tiny NaN \n", "mesolitica/conformer-base NaN \n", "mesolitica/conformer-medium NaN \n", "mesolitica/emformer-base NaN \n", "mesolitica/conformer-base-singlish NaN \n", "mesolitica/conformer-medium-mixed NaN \n", "mesolitica/conformer-medium-malay-whisper {'WER': 0.1705298134, 'CER': 0.10580679153} \n", "mesolitica/conformer-large-malay-whisper {'WER': 0.20429079189, 'CER': 0.12111372327} " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya_speech.stt.transducer.available_pt_transformer()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'malay-malaya': {'WER': 0.16477548774, 'CER': 0.05973209121},\n", " 'malay-fleur102': {'WER': 0.109588779, 'CER': 0.047891527},\n", " 'singlish': {'WER': 0.4941349, 'CER': 0.3026296}}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya_speech.stt.google_accuracy" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'tiny': {'Size (MB)': 72.1,\n", " 'malay-malaya': {'WER': 0.7897730947, 'CER': 0.341671582346},\n", " 'malay-fleur102': {'WER': 0.640224185, 'CER': 0.2869274323},\n", " 'singlish': {'WER': 0.4751720563, 'CER': 0.35132630877}},\n", " 'base': {'Size (MB)': 139,\n", " 'malay-malaya': 
{'WER': 0.5138481614, 'CER': 0.19487665487},\n", " 'malay-fleur102': {'WER': 0.4268323797, 'CER': 0.1545261803},\n", " 'singlish': {'WER': 0.5354453439, 'CER': 0.4287910359}},\n", " 'small': {'Size (MB)': 461,\n", " 'malay-malaya': {'WER': 0.2818371132, 'CER': 0.09588120693},\n", " 'malay-fleur102': {'WER': 0.2436472703, 'CER': 0.0913692568},\n", " 'singlish': {'WER': 0.5971608337, 'CER': 0.5003890601}},\n", " 'medium': {'Size (MB)': 1400,\n", " 'malay-malaya': {'WER': 0.18945585961, 'CER': 0.0658303076},\n", " 'malay-fleur102': {'WER': 0.1647166507, 'CER': 0.065537127},\n", " 'singlish': {'WER': 0.68563087121, 'CER': 0.601676254253}},\n", " 'large-v2': {'Size (MB)': 2900,\n", " 'malay-malaya': {'WER': 0.1585939185, 'CER': 0.054978161091},\n", " 'malay-fleur102': {'WER': 0.127483122485, 'CER': 0.05648688907},\n", " 'singlish': {'WER': 0.6174993839, 'CER': 0.54582068858}}}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya_speech.stt.whisper_accuracy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**You should be skeptical of the Google and Whisper accuracies: the test sets have been processed with malaya-speech postprocessing, which can cause higher WER and CER**." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load RNNT model\n", "\n", "```python\n", "def pt_transformer(\n", " model: str = 'mesolitica/conformer-base',\n", " **kwargs,\n", "):\n", " \"\"\"\n", " Load Encoder-Transducer ASR model using Pytorch.\n", "\n", " Parameters\n", " ----------\n", " model : str, optional (default='mesolitica/conformer-base')\n", " Check available models at `malaya_speech.stt.transducer.available_pt_transformer()`.\n", "\n", " Returns\n", " -------\n", " result : malaya_speech.torch_model.torchaudio.Conformer class\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-base/model.pt\n", "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-base/malay-stt.model\n", "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-base/malay-stats.json\n" ] } ], "source": [ "model = malaya_speech.stt.transducer.pt_transformer(model = 'mesolitica/conformer-base')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-medium-mixed/model.pt\n", "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-medium-mixed/malay-stt.model\n", "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-medium-mixed/malay-stats.json\n" ] } ], "source": [ "model_mixed = malaya_speech.stt.transducer.pt_transformer(model = 'mesolitica/conformer-medium-mixed')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load sample" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "ceramah, sr = malaya_speech.load('speech/khutbah/wadi-annuar.wav')\n", "record1, sr = 
malaya_speech.load('speech/record/savewav_2020-11-26_22-36-06_294832.wav')\n", "record2, sr = malaya_speech.load('speech/record/savewav_2020-11-26_22-40-56_929661.wav')\n", "shafiqah_idayu, sr = malaya_speech.load('speech/example-speaker/shafiqah-idayu.wav')\n", "mas_aisyah, sr = malaya_speech.load('speech/example-speaker/mas-aisyah.wav')\n", "khalil, sr = malaya_speech.load('speech/example-speaker/khalil-nooh.wav')\n", "singlish0, _ = malaya_speech.load('speech/singlish/singlish0.wav')\n", "singlish1, _ = malaya_speech.load('speech/singlish/singlish1.wav')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import IPython.display as ipd\n", "\n", "ipd.Audio(ceramah, rate = sr)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As we can hear, the speaker speaks in the Kedahan dialect with some Arabic words; let's see how good our model is." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipd.Audio(record1, rate = sr)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipd.Audio(record2, rate = sr)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipd.Audio(shafiqah_idayu, rate = sr)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipd.Audio(mas_aisyah, rate = sr)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipd.Audio(khalil, rate = sr)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipd.Audio(singlish0, rate = sr)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipd.Audio(singlish1, rate = sr)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Predict using beam 
decoder\n", "\n", "```python\n", "def beam_decoder(self, inputs, beam_width: int = 20):\n", " \"\"\"\n", " Transcribe inputs using beam decoder.\n", "\n", " Parameters\n", " ----------\n", " inputs: List[np.array]\n", " List[np.array] or List[malaya_speech.model.frame.Frame].\n", " beam_width: int, optional (default=20)\n", " beam size for beam decoder.\n", "\n", " Returns\n", " -------\n", " result: List[str]\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 42.9 s, sys: 317 ms, total: 43.3 s\n", "Wall time: 3.97 s\n" ] }, { "data": { "text/plain": [ "['jadi dalam perjalanan ini dunia yang susah ini ketika nabi mengajar muadz bin jabal tadi ni allah maha ini',\n", " 'helo nama saya pusing saya tak suka mandi ke tak saya masak',\n", " 'hello nama saya husin saya suka mandi saya mandi titik hari',\n", " 'nama saya syafiqa idayu',\n", " 'sebut perkataan angka',\n", " 'tolong sebut antikata',\n", " 'nanti how day broway handsome okey',\n", " 'minta toyol']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "model.beam_decoder([ceramah, record1, record2, shafiqah_idayu, mas_aisyah, khalil,\n", " singlish0, singlish1])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 42.9 s, sys: 616 ms, total: 43.5 s\n", "Wall time: 3.77 s\n" ] }, { "data": { "text/plain": [ "['jadi dalam perjalanan ini dunia yang susah ini ketika nabi mengajar muaz bin jabal tadi ni allah maaf ini',\n", " 'hello nama saya husin saya tak suka mandi ke tak saya',\n", " 'hello nama saya husin saya suka mandi saya mandi tiap hari',\n", " 'nama saya syafiqah idayu',\n", " 'sebut perkataan angka',\n", " 'tolong sebut ertikata',\n", " 'and then see how they roll it in film okay actually',\n", " 'then you tech to your 
eyes']" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "model_mixed.beam_decoder([ceramah, record1, record2, shafiqah_idayu, mas_aisyah, khalil,\n", " singlish0, singlish1])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }