{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Speech-to-Text RNNT PyTorch Multilanguage" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Encoder model + RNNT loss using PyTorch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This tutorial is available as an IPython notebook at [malaya-speech/example/stt-transducer-model-pt-multilanguage](https://github.com/huseinzol05/malaya-speech/tree/master/example/stt-transducer-model-pt-multilanguage).\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "This module is not language independent, so it not save to use on different languages. Pretrained models trained on hyperlocal languages.\n", " \n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "os.environ['CUDA_VISIBLE_DEVICES'] = ''" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.\n" ] } ], "source": [ "import malaya_speech\n", "import numpy as np\n", "from malaya_speech import Pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import logging\n", "\n", "logging.basicConfig(level=logging.INFO)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### List available RNNT model" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:malaya_speech.stt:for `malay-fleur102` language, tested on FLEURS102 `ms_my` test set, https://github.com/huseinzol05/malaya-speech/tree/master/pretrained-model/prepare-stt\n", "INFO:malaya_speech.stt:for `malay-malaya` language, tested on malaya-speech test set, https://github.com/huseinzol05/malaya-speech/tree/master/pretrained-model/prepare-stt\n", "INFO:malaya_speech.stt:for `singlish` language, tested on IMDA malaya-speech test set, https://github.com/huseinzol05/malaya-speech/tree/master/pretrained-model/prepare-stt\n", "INFO:malaya_speech.stt:for `whisper-mixed` language, tested on semisupervised Whisper Large V2 test set, https://github.com/huseinzol05/malaya-speech/tree/master/pretrained-model/prepare-stt\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Size (MB)malay-malayamalay-fleur102Languagesinglishwhisper-mixed
mesolitica/conformer-tiny38.5{'WER': 0.17341180814, 'CER': 0.05957485024}{'WER': 0.19524478979, 'CER': 0.0830808938}[malay]NaNNaN
mesolitica/conformer-base121{'WER': 0.122076123261, 'CER': 0.03879606324}{'WER': 0.1326737206665, 'CER': 0.05032914857}[malay]NaNNaN
mesolitica/conformer-medium243{'WER': 0.1054817492564, 'CER': 0.0313518992842}{'WER': 0.1172708897486, 'CER': 0.0431050488}[malay]NaNNaN
mesolitica/emformer-base162{'WER': 0.175762423786, 'CER': 0.06233919000537}{'WER': 0.18303839134, 'CER': 0.0773853362}[malay]NaNNaN
mesolitica/conformer-base-singlish121NaNNaN[singlish]{'WER': 0.06517537334361, 'CER': 0.03265430876}NaN
mesolitica/conformer-medium-mixed243{'WER': 0.111166517935, 'CER': 0.03410958328}{'WER': 0.108354748, 'CER': 0.037785722}[malay, singlish]{'WER': 0.091969755225, 'CER': 0.044627194623}NaN
mesolitica/conformer-medium-malay-whisper243{'WER': 0.092561502, 'CER': 0.0245421736}{'WER': 0.097128574, 'CER': 0.03392603}[malay, mixed]NaN{'WER': 0.1705298134, 'CER': 0.10580679153}
mesolitica/conformer-large-malay-whisper413{'WER': 0.10028492039, 'CER': 0.0310868406}{'WER': 0.09544850396, 'CER': 0.03258454692}[malay, mixed]NaN{'WER': 0.20429079189, 'CER': 0.12111372327}
\n", "
" ], "text/plain": [ " Size (MB) \\\n", "mesolitica/conformer-tiny 38.5 \n", "mesolitica/conformer-base 121 \n", "mesolitica/conformer-medium 243 \n", "mesolitica/emformer-base 162 \n", "mesolitica/conformer-base-singlish 121 \n", "mesolitica/conformer-medium-mixed 243 \n", "mesolitica/conformer-medium-malay-whisper 243 \n", "mesolitica/conformer-large-malay-whisper 413 \n", "\n", " malay-malaya \\\n", "mesolitica/conformer-tiny {'WER': 0.17341180814, 'CER': 0.05957485024} \n", "mesolitica/conformer-base {'WER': 0.122076123261, 'CER': 0.03879606324} \n", "mesolitica/conformer-medium {'WER': 0.1054817492564, 'CER': 0.0313518992842} \n", "mesolitica/emformer-base {'WER': 0.175762423786, 'CER': 0.06233919000537} \n", "mesolitica/conformer-base-singlish NaN \n", "mesolitica/conformer-medium-mixed {'WER': 0.111166517935, 'CER': 0.03410958328} \n", "mesolitica/conformer-medium-malay-whisper {'WER': 0.092561502, 'CER': 0.0245421736} \n", "mesolitica/conformer-large-malay-whisper {'WER': 0.10028492039, 'CER': 0.0310868406} \n", "\n", " malay-fleur102 \\\n", "mesolitica/conformer-tiny {'WER': 0.19524478979, 'CER': 0.0830808938} \n", "mesolitica/conformer-base {'WER': 0.1326737206665, 'CER': 0.05032914857} \n", "mesolitica/conformer-medium {'WER': 0.1172708897486, 'CER': 0.0431050488} \n", "mesolitica/emformer-base {'WER': 0.18303839134, 'CER': 0.0773853362} \n", "mesolitica/conformer-base-singlish NaN \n", "mesolitica/conformer-medium-mixed {'WER': 0.108354748, 'CER': 0.037785722} \n", "mesolitica/conformer-medium-malay-whisper {'WER': 0.097128574, 'CER': 0.03392603} \n", "mesolitica/conformer-large-malay-whisper {'WER': 0.09544850396, 'CER': 0.03258454692} \n", "\n", " Language \\\n", "mesolitica/conformer-tiny [malay] \n", "mesolitica/conformer-base [malay] \n", "mesolitica/conformer-medium [malay] \n", "mesolitica/emformer-base [malay] \n", "mesolitica/conformer-base-singlish [singlish] \n", "mesolitica/conformer-medium-mixed [malay, singlish] \n", 
"mesolitica/conformer-medium-malay-whisper [malay, mixed] \n", "mesolitica/conformer-large-malay-whisper [malay, mixed] \n", "\n", " singlish \\\n", "mesolitica/conformer-tiny NaN \n", "mesolitica/conformer-base NaN \n", "mesolitica/conformer-medium NaN \n", "mesolitica/emformer-base NaN \n", "mesolitica/conformer-base-singlish {'WER': 0.06517537334361, 'CER': 0.03265430876} \n", "mesolitica/conformer-medium-mixed {'WER': 0.091969755225, 'CER': 0.044627194623} \n", "mesolitica/conformer-medium-malay-whisper NaN \n", "mesolitica/conformer-large-malay-whisper NaN \n", "\n", " whisper-mixed \n", "mesolitica/conformer-tiny NaN \n", "mesolitica/conformer-base NaN \n", "mesolitica/conformer-medium NaN \n", "mesolitica/emformer-base NaN \n", "mesolitica/conformer-base-singlish NaN \n", "mesolitica/conformer-medium-mixed NaN \n", "mesolitica/conformer-medium-malay-whisper {'WER': 0.1705298134, 'CER': 0.10580679153} \n", "mesolitica/conformer-large-malay-whisper {'WER': 0.20429079189, 'CER': 0.12111372327} " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya_speech.stt.transducer.available_pt_transformer()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'malay-malaya': {'WER': 0.16477548774, 'CER': 0.05973209121},\n", " 'malay-fleur102': {'WER': 0.109588779, 'CER': 0.047891527},\n", " 'singlish': {'WER': 0.4941349, 'CER': 0.3026296}}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya_speech.stt.google_accuracy" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'tiny': {'Size (MB)': 72.1,\n", " 'malay-malaya': {'WER': 0.7897730947, 'CER': 0.341671582346},\n", " 'malay-fleur102': {'WER': 0.640224185, 'CER': 0.2869274323},\n", " 'singlish': {'WER': 0.4751720563, 'CER': 0.35132630877}},\n", " 'base': {'Size (MB)': 139,\n", " 'malay-malaya': 
{'WER': 0.5138481614, 'CER': 0.19487665487},\n", " 'malay-fleur102': {'WER': 0.4268323797, 'CER': 0.1545261803},\n", " 'singlish': {'WER': 0.5354453439, 'CER': 0.4287910359}},\n", " 'small': {'Size (MB)': 461,\n", " 'malay-malaya': {'WER': 0.2818371132, 'CER': 0.09588120693},\n", " 'malay-fleur102': {'WER': 0.2436472703, 'CER': 0.0913692568},\n", " 'singlish': {'WER': 0.5971608337, 'CER': 0.5003890601}},\n", " 'medium': {'Size (MB)': 1400,\n", " 'malay-malaya': {'WER': 0.18945585961, 'CER': 0.0658303076},\n", " 'malay-fleur102': {'WER': 0.1647166507, 'CER': 0.065537127},\n", " 'singlish': {'WER': 0.68563087121, 'CER': 0.601676254253}},\n", " 'large-v2': {'Size (MB)': 2900,\n", " 'malay-malaya': {'WER': 0.1585939185, 'CER': 0.054978161091},\n", " 'malay-fleur102': {'WER': 0.127483122485, 'CER': 0.05648688907},\n", " 'singlish': {'WER': 0.6174993839, 'CER': 0.54582068858}}}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "malaya_speech.stt.whisper_accuracy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**You should be skeptical of the Google and Whisper accuracies: the test set has had malaya-speech postprocessing applied, which can cause higher WER and CER**." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load RNNT model\n", "\n", "```python\n", "def pt_transformer(\n", " model: str = 'mesolitica/conformer-base',\n", " **kwargs,\n", "):\n", " \"\"\"\n", " Load Encoder-Transducer ASR model using Pytorch.\n", "\n", " Parameters\n", " ----------\n", " model : str, optional (default='mesolitica/conformer-base')\n", " Check available models at `malaya_speech.stt.transducer.available_pt_transformer()`.\n", "\n", " Returns\n", " -------\n", " result : malaya_speech.torch_model.torchaudio.Conformer class\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-medium-mixed/model.pt\n", "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-medium-mixed/malay-stt.model\n", "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-medium-mixed/malay-stats.json\n" ] } ], "source": [ "model_mixed = malaya_speech.stt.transducer.pt_transformer(model = 'mesolitica/conformer-medium-mixed')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-medium-malay-whisper/model.pt\n", "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-medium-malay-whisper/malay-stt.model\n", "INFO:malaya_boilerplate.huggingface:downloading frozen mesolitica/conformer-medium-malay-whisper/malay-stats.json\n" ] } ], "source": [ "medium_whisper = malaya_speech.stt.transducer.pt_transformer(model = 'mesolitica/conformer-medium-malay-whisper')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "_ = model_mixed.eval()\n", "_ = medium_whisper.eval()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 
Load sample" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from datasets import Audio\n", "\n", "sr = 16000\n", "audio = Audio(sampling_rate=sr)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "y, _ = malaya_speech.load('speech/example-speaker/husein-zolkepli.wav')\n", "y1 = audio.decode_example(audio.encode_example('speech/example-speaker/husein-zolkepli-mixed-1.mp3'))['array']\n", "y2 = audio.decode_example(audio.encode_example('speech/example-speaker/husein-zolkepli-mixed-2.mp3'))['array']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import IPython.display as ipd\n", "\n", "ipd.Audio(y, rate = sr)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipd.Audio(y1, rate = sr)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipd.Audio(y2, rate = sr)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Predict using beam decoder\n", "\n", "```python\n", "def beam_decoder(self, inputs, beam_width: int = 20):\n", " \"\"\"\n", " Transcribe inputs using beam decoder.\n", "\n", " Parameters\n", " ----------\n", " inputs: List[np.array]\n", " List[np.array] or List[malaya_speech.model.frame.Frame].\n", " beam_width: int, optional (default=20)\n", " beam size for beam decoder.\n", "\n", " Returns\n", " -------\n", " result: List[str]\n", " \"\"\"\n", "```" ] }, { "cell_type": "code", 
"execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 26.5 s, sys: 758 ms, total: 27.2 s\n", "Wall time: 2.38 s\n" ] }, { "data": { "text/plain": [ "['testing nama saya hussein bin zulkifli',\n", " 'hello nama saya mesin i hate fish but like three chicken thank you',\n", " 'oh hari ini saya nak cakap tentang harian saya sampai is good something is bad but most of the day is good markets avanition sister mainan di ruang']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "model_mixed.beam_decoder([y, y1, y2])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 48.1 s, sys: 550 ms, total: 48.7 s\n", "Wall time: 4.5 s\n" ] }, { "data": { "text/plain": [ "['testing nama saya hussein bin zulcaply',\n", " 'hello nama saya hussein i hate fish but lighty chicken thank you',\n", " 'hari ini saya nak cakap tentang harian saya something is good something is bad but most of the day is good markets affanny electoral dan saya suka main dengan di ruang']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "medium_whisper.beam_decoder([y, y1, y2])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Compare with Google STT" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import speech_recognition as sr\n", "\n", "r = sr.Recognizer()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "import soundfile as sf\n", "\n", "sf.write('test-mixed1.wav', y1, 16000)\n", "sf.write('test-mixed2.wav', y2, 16000)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'testing Nama saya Hussein bin Zulkifli'" ] }, "execution_count": 26, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "with sr.AudioFile('speech/example-speaker/husein-zolkepli.wav') as source:\n", " a = r.record(source)\n", "\n", "text = r.recognize_google(a, language = 'ms')\n", "text" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Helo nama saya Hussein Aidil Hafiz lagi pun Thank you'" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with sr.AudioFile('test-mixed1.wav') as source:\n", " a = r.record(source)\n", "\n", "text = r.recognize_google(a, language = 'ms')\n", "text" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'sains nak cakap dengan angah harian saya macam saya juga nak tengok cepat sebab musuh boleh diskaun semakin Zam pahala kita hujan Saya suka main dia orang'" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with sr.AudioFile('test-mixed2.wav') as source:\n", " a = r.record(source)\n", "\n", "text = r.recognize_google(a, language = 'ms')\n", "text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Straight bad." 
] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }