Force Alignment using Transducer PyTorch#

Forced alignment is a technique to take an orthographic transcription of an audio file and generate a time-aligned version.

This tutorial is available as an IPython notebook at malaya-speech/example/force-alignment-transducer-pt.

This module is not language independent, so it is not safe to use it on languages other than those it was trained on. The pretrained models are trained on hyperlocal languages.

[1]:
import malaya_speech
import numpy as np
from malaya_speech import Pipeline
import IPython.display as ipd
import matplotlib.pyplot as plt
from malaya_speech.utils.aligner import plot_alignments
`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.

List available Force Aligner models#

[2]:
malaya_speech.force_alignment.transducer.available_pt_transformer()
[2]:
Model | Size (MB) | malay-malaya | malay-fleur102 | Language | singlish
mesolitica/conformer-tiny | 38.5 | {'WER': 0.17341180814, 'CER': 0.05957485024} | {'WER': 0.19524478979, 'CER': 0.0830808938} | [malay] | NaN
mesolitica/conformer-base | 121 | {'WER': 0.122076123261, 'CER': 0.03879606324} | {'WER': 0.1326737206665, 'CER': 0.05032914857} | [malay] | NaN
mesolitica/conformer-medium | 243 | {'WER': 0.1054817492564, 'CER': 0.0313518992842} | {'WER': 0.1172708897486, 'CER': 0.0431050488} | [malay] | NaN
mesolitica/emformer-base | 162 | {'WER': 0.175762423786, 'CER': 0.06233919000537} | {'WER': 0.18303839134, 'CER': 0.0773853362} | [malay] | NaN
mesolitica/conformer-base-singlish | 121 | NaN | NaN | [singlish] | {'WER': 0.06517537334361, 'CER': 0.03265430876}
mesolitica/conformer-medium-mixed | 243 | {'WER': 0.111166517935, 'CER': 0.03410958328} | {'WER': 0.108354748, 'CER': 0.037785722} | [malay, singlish] | {'WER': 0.091969755225, 'CER': 0.044627194623}
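The table above is a pandas DataFrame, so it can also be filtered programmatically. A minimal sketch, assuming the model names are the index and the language column is literally named `Language` as printed above:

df = malaya_speech.force_alignment.transducer.available_pt_transformer()
# keep only models that also cover singlish (assumes a `Language` column as shown above)
singlish_capable = df[df['Language'].astype(str).str.contains('singlish')]
print(singlish_capable.index.tolist())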

Load Transducer Aligner model#

def pt_transformer(
    model: str = 'mesolitica/conformer-base',
    **kwargs,
):
    """
    Load Encoder-Transducer ASR model using PyTorch.

    Parameters
    ----------
    model : str, optional (default='mesolitica/conformer-base')
        Check available models at `malaya_speech.force_alignment.transducer.available_pt_transformer()`.

    Returns
    -------
    result : malaya_speech.torch_model.torchaudio.ForceAlignment class
    """
[3]:
model = malaya_speech.force_alignment.transducer.pt_transformer(model = 'mesolitica/conformer-medium-mixed')

Load sample#

Malay samples#

[4]:
malay1, sr = malaya_speech.load('speech/example-speaker/shafiqah-idayu.wav')
malay2, sr = malaya_speech.load('speech/example-speaker/haqkiem.wav')
[5]:
texts = ['nama saya shafiqah idayu',
        'sebagai pembangkang yang matang dan sejahtera pas akan menghadapi pilihan raya umum dan tidak menumbang kerajaan dari pintu belakang']
[6]:
ipd.Audio(malay2, rate = sr)
[6]:

Singlish samples#

[7]:
import json
import os
from glob import glob

with open('speech/imda/output.json') as fopen:
    data = json.load(fopen)

data
[7]:
{'221931702.WAV': 'wantan mee is a traditional local cuisine',
 '221931818.WAV': 'ahmad khan adelene wee chin suan and robert ibbetson',
 '221931727.WAV': 'saravanan gopinathan george yeo yong boon and tay kheng soon'}
[8]:
wavs = glob('speech/imda/*.WAV')
wavs
[8]:
['speech/imda/221931727.WAV',
 'speech/imda/221931818.WAV',
 'speech/imda/221931702.WAV']
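Each file name in `output.json` maps to its transcription, so the audio files and texts can be paired up front. A small sketch of the lookup that the prediction cell below performs inline:

# pair each wav with its transcription, keyed by file name
pairs = {w: data[os.path.split(w)[1]] for w in wavs}
pairs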
[9]:
y, sr = malaya_speech.load(wavs[0])
[10]:
ipd.Audio(y, rate = sr)
[10]:

Predict#

def predict(self, input, transcription: str, temperature: float = 1.0):
    """
    Force align `transcription` against the input audio, will return a dictionary of alignments.

    Parameters
    ----------
    input: np.array
        np.array or malaya_speech.model.frame.Frame.
    transcription: str
        transcription of input audio
    temperature: float, optional (default=1.0)
        temperature for logits.

    Returns
    -------
    result: Dict[words_alignment, subwords_alignment, subwords, alignment]
    """

Predict Malay#

Our original text is: ‘sebagai pembangkang yang matang dan sejahtera pas akan menghadapi pilihan raya umum dan tidak menumbang kerajaan dari pintu belakang’

[11]:
results = model.predict(malay2, texts[1])
[12]:
results.keys()
[12]:
dict_keys(['words_alignment', 'subwords_alignment', 'subwords', 'alignment'])
[13]:
results['words_alignment']
[13]:
[{'text': 'sebagai',
  'start': 0.07991686893203884,
  'end': 0.11987530339805826,
  'start_t': 0,
  'end_t': 0,
  'score': 0.8652574},
 {'text': 'pembangkang',
  'start': 0.5594180825242718,
  'end': 1.0389192961165048,
  'start_t': 2,
  'end_t': 4,
  'score': 2.2811141e-06},
 {'text': 'yang',
  'start': 1.158794599514563,
  'end': 1.1987530339805823,
  'start_t': 5,
  'end_t': 5,
  'score': 4.3430943e-09},
 {'text': 'matang',
  'start': 1.3985452063106798,
  'end': 1.7182126820388348,
  'start_t': 6,
  'end_t': 7,
  'score': 4.1879557e-06},
 {'text': 'dan',
  'start': 1.798129550970874,
  'end': 1.8380879854368932,
  'start_t': 7,
  'end_t': 8,
  'score': 1.9011324e-05},
 {'text': 'sejahtera',
  'start': 2.0378801577669905,
  'end': 2.477422936893204,
  'start_t': 8,
  'end_t': 10,
  'score': 2.5778693e-06},
 {'text': 'pas',
  'start': 2.7970904126213596,
  'end': 2.996882584951457,
  'start_t': 12,
  'end_t': 12,
  'score': 2.9329883e-10},
 {'text': 'akan',
  'start': 3.1167578883495146,
  'end': 3.156716322815534,
  'start_t': 13,
  'end_t': 13,
  'score': 1.2522179e-06},
 {'text': 'menghadapi',
  'start': 3.396466929611651,
  'end': 3.796051274271845,
  'start_t': 14,
  'end_t': 16,
  'score': 1.6652191e-05},
 {'text': 'pilihan',
  'start': 4.035801881067961,
  'end': 4.395427791262136,
  'start_t': 17,
  'end_t': 18,
  'score': 5.8163936e-09},
 {'text': 'raya',
  'start': 4.515303094660195,
  'end': 4.555261529126214,
  'start_t': 19,
  'end_t': 19,
  'score': 1.4073122e-09},
 {'text': 'umum',
  'start': 4.83497057038835,
  'end': 5.194596480582525,
  'start_t': 20,
  'end_t': 21,
  'score': 9.590183e-08},
 {'text': 'dan',
  'start': 5.314471783980583,
  'end': 5.354430218446602,
  'start_t': 22,
  'end_t': 22,
  'score': 0.032095414},
 {'text': 'tidak',
  'start': 5.514263956310679,
  'end': 5.554222390776698,
  'start_t': 23,
  'end_t': 23,
  'score': 9.107502e-08},
 {'text': 'menumbang',
  'start': 5.833931432038835,
  'end': 6.113640473300971,
  'start_t': 24,
  'end_t': 25,
  'score': 6.4145235e-07},
 {'text': 'kerajaan',
  'start': 6.313432645631068,
  'end': 6.353391080097087,
  'start_t': 26,
  'end_t': 26,
  'score': 2.017659e-07},
 {'text': 'dari',
  'start': 6.992726031553398,
  'end': 7.032684466019417,
  'start_t': 29,
  'end_t': 29,
  'score': 7.6962095e-07},
 {'text': 'pintu',
  'start': 7.312393507281554,
  'end': 7.352351941747573,
  'start_t': 30,
  'end_t': 30,
  'score': 1.0205139e-06},
 {'text': 'belakang',
  'start': 7.672019417475729,
  'end': 7.711977851941748,
  'start_t': 32,
  'end_t': 32,
  'score': 2.4569164e-07}]
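Because every word comes with `start` and `end` in seconds, the word-level alignment can be exported to subtitle formats directly. A minimal sketch, using only the standard library, that renders `words_alignment` as SRT entries (the `to_srt_timestamp` helper is ours, not part of Malaya-Speech):

def to_srt_timestamp(seconds):
    # SRT timestamps use HH:MM:SS,mmm
    ms = int(round(seconds * 1000))
    hours, ms = divmod(ms, 3_600_000)
    minutes, ms = divmod(ms, 60_000)
    secs, ms = divmod(ms, 1_000)
    return f'{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}'

srt = []
for no, word in enumerate(results['words_alignment'], start = 1):
    srt.append(f"{no}\n{to_srt_timestamp(word['start'])} --> {to_srt_timestamp(word['end'])}\n{word['text']}\n")
print('\n'.join(srt[:3]))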
[14]:
len(results['subwords'])
[14]:
34
[15]:
results['alignment'].shape
[15]:
(34, 34)
[16]:
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
ax.set_title('Alignment steps')
im = ax.imshow(
    results['alignment'],
    aspect='auto',
    origin='lower',
    interpolation='none')
ax.set_yticks(range(len(results['subwords'])))
labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(results['subwords'])
fig.colorbar(im, ax=ax)
xlabel = 'Encoder timestep'
plt.xlabel(xlabel)
plt.ylabel('Decoder timestep')
plt.tight_layout()
plt.show()
_images/force-alignment-transducer-pt_25_0.png
[17]:
plot_alignments(alignment = results['alignment'],
                subs_alignment = results['subwords_alignment'],
                words_alignment = results['words_alignment'],
                waveform = malay2,
                sample_rate = 16000,
                figsize = (16, 9))
_images/force-alignment-transducer-pt_26_0.png

Predict Singlish#

Our original text is: ‘saravanan gopinathan george yeo yong boon and tay kheng soon’

[20]:
results = model.predict(y, data[os.path.split(wavs[0])[1]])
[21]:
results.keys()
[21]:
dict_keys(['words_alignment', 'subwords_alignment', 'subwords', 'alignment'])
[22]:
results['words_alignment']
[22]:
[{'text': 'saravanan',
  'start': 0.8410552763819095,
  'end': 1.5219095477386935,
  'start_t': 3,
  'end_t': 6,
  'score': 9.532287e-07},
 {'text': 'gopinathan',
  'start': 1.8423115577889446,
  'end': 2.6032663316582916,
  'start_t': 7,
  'end_t': 10,
  'score': 1.789202e-06},
 {'text': 'george',
  'start': 3.364221105527638,
  'end': 3.6045226130653267,
  'start_t': 13,
  'end_t': 14,
  'score': 1.8525572e-05},
 {'text': 'yeo',
  'start': 3.844824120603015,
  'end': 4.045075376884422,
  'start_t': 15,
  'end_t': 16,
  'score': 1.0203461e-05},
 {'text': 'yong',
  'start': 4.1652261306532665,
  'end': 4.365477386934674,
  'start_t': 16,
  'end_t': 17,
  'score': 1.0203461e-05},
 {'text': 'boon',
  'start': 4.685879396984925,
  'end': 4.926180904522613,
  'start_t': 18,
  'end_t': 19,
  'score': 1.3545392e-07},
 {'text': 'and',
  'start': 5.727185929648241,
  'end': 5.767236180904522,
  'start_t': 22,
  'end_t': 22,
  'score': 3.4447204e-08},
 {'text': 'tay',
  'start': 6.007537688442211,
  'end': 6.207788944723618,
  'start_t': 23,
  'end_t': 24,
  'score': 5.423156e-07},
 {'text': 'kheng',
  'start': 6.28788944723618,
  'end': 6.488140703517588,
  'start_t': 24,
  'end_t': 25,
  'score': 1.647781e-06},
 {'text': 'soon',
  'start': 6.568241206030151,
  'end': 6.848592964824121,
  'start_t': 26,
  'end_t': 27,
  'score': 9.646859e-07}]
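The same dictionary structure comes back for the Singlish sample. A quick way to eyeball it is to print each word's duration and score; a small sketch over the `results` above:

# per-word duration in seconds, useful to spot suspiciously short alignments
for word in results['words_alignment']:
    print(f"{word['text']:<12} {word['end'] - word['start']:.3f}s  score={word['score']:.3g}")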
[23]:
results['subwords_alignment']
[23]:
[{'text': '▁sa',
  'start': 0.8410552763819095,
  'end': 0.8811055276381908,
  'start_t': 3,
  'end_t': 3,
  'score': 1.2220553e-07},
 {'text': 'ra',
  'start': 1.081356783919598,
  'end': 1.1214070351758796,
  'start_t': 4,
  'end_t': 4,
  'score': 9.243605e-08},
 {'text': 'v',
  'start': 1.201507537688442,
  'end': 1.2415577889447236,
  'start_t': 5,
  'end_t': 5,
  'score': 1.0656663e-08},
 {'text': 'an',
  'start': 1.3216582914572863,
  'end': 1.3617085427135678,
  'start_t': 5,
  'end_t': 5,
  'score': 5.4261122e-08},
 {'text': 'an',
  'start': 1.481859296482412,
  'end': 1.5219095477386935,
  'start_t': 6,
  'end_t': 6,
  'score': 8.020109e-06},
 {'text': '▁go',
  'start': 1.8423115577889446,
  'end': 1.882361809045226,
  'start_t': 7,
  'end_t': 7,
  'score': 3.209235e-06},
 {'text': 'p',
  'start': 1.962462311557789,
  'end': 2.0025125628140703,
  'start_t': 8,
  'end_t': 8,
  'score': 2.0064768e-08},
 {'text': 'in',
  'start': 2.1226633165829143,
  'end': 2.1627135678391958,
  'start_t': 8,
  'end_t': 8,
  'score': 6.117405e-08},
 {'text': 'at',
  'start': 2.2428140703517587,
  'end': 2.28286432160804,
  'start_t': 9,
  'end_t': 9,
  'score': 1.5955281e-06},
 {'text': 'han',
  'start': 2.56321608040201,
  'end': 2.6032663316582916,
  'start_t': 10,
  'end_t': 10,
  'score': 1.7622291e-08},
 {'text': '▁ge',
  'start': 3.364221105527638,
  'end': 3.4042713567839193,
  'start_t': 13,
  'end_t': 13,
  'score': 0.000544741},
 {'text': 'or',
  'start': 3.4843718592964823,
  'end': 3.5244221105527638,
  'start_t': 14,
  'end_t': 14,
  'score': 2.721273e-07},
 {'text': 'ge',
  'start': 3.5644723618090453,
  'end': 3.6045226130653267,
  'start_t': 14,
  'end_t': 14,
  'score': 4.555813e-06},
 {'text': '▁',
  'start': 3.844824120603015,
  'end': 3.8848743718592966,
  'start_t': 15,
  'end_t': 15,
  'score': 1.9964894e-07},
 {'text': 'y',
  'start': 3.884874371859296,
  'end': 3.9249246231155777,
  'start_t': 15,
  'end_t': 15,
  'score': 8.268911e-06},
 {'text': 'e',
  'start': 3.964974874371859,
  'end': 4.005025125628141,
  'start_t': 15,
  'end_t': 16,
  'score': 0.99506426},
 {'text': 'o',
  'start': 4.005025125628141,
  'end': 4.045075376884422,
  'start_t': 16,
  'end_t': 16,
  'score': 0.67549676},
 {'text': '▁',
  'start': 4.1652261306532665,
  'end': 4.205276381909548,
  'start_t': 16,
  'end_t': 16,
  'score': 4.2439947e-06},
 {'text': 'y',
  'start': 4.205276381909548,
  'end': 4.2453266331658295,
  'start_t': 16,
  'end_t': 17,
  'score': 0.0015577499},
 {'text': 'ong',
  'start': 4.325427135678392,
  'end': 4.365477386934674,
  'start_t': 17,
  'end_t': 17,
  'score': 6.180808e-08},
 {'text': '▁bo',
  'start': 4.685879396984925,
  'end': 4.725929648241206,
  'start_t': 18,
  'end_t': 18,
  'score': 4.90003e-07},
 {'text': 'on',
  'start': 4.886130653266331,
  'end': 4.926180904522613,
  'start_t': 19,
  'end_t': 19,
  'score': 4.4304317e-05},
 {'text': '▁and',
  'start': 5.727185929648241,
  'end': 5.767236180904522,
  'start_t': 22,
  'end_t': 22,
  'score': 0.95396453},
 {'text': '▁ta',
  'start': 6.007537688442211,
  'end': 6.047587939698492,
  'start_t': 23,
  'end_t': 24,
  'score': 0.99530405},
 {'text': 'y',
  'start': 6.167738693467337,
  'end': 6.207788944723618,
  'start_t': 24,
  'end_t': 24,
  'score': 0.999545},
 {'text': '▁k',
  'start': 6.28788944723618,
  'end': 6.327939698492462,
  'start_t': 24,
  'end_t': 25,
  'score': 0.38012785},
 {'text': 'h',
  'start': 6.367989949748743,
  'end': 6.408040201005025,
  'start_t': 25,
  'end_t': 25,
  'score': 0.000104434235},
 {'text': 'e',
  'start': 6.408040201005025,
  'end': 6.448090452261306,
  'start_t': 25,
  'end_t': 25,
  'score': 1.6290902e-06},
 {'text': 'ng',
  'start': 6.448090452261306,
  'end': 6.488140703517588,
  'start_t': 25,
  'end_t': 25,
  'score': 4.857545e-07},
 {'text': '▁so',
  'start': 6.568241206030151,
  'end': 6.608291457286432,
  'start_t': 26,
  'end_t': 26,
  'score': 1.017138e-07},
 {'text': 'on',
  'start': 6.80854271356784,
  'end': 6.848592964824121,
  'start_t': 26,
  'end_t': 27,
  'score': 7.557719e-06}]
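The subword tokens follow a SentencePiece-style convention where `▁` marks the start of a new word, so word boundaries can be recovered by grouping on that marker. A minimal sketch that reconstructs the words, simply mirroring what `words_alignment` already provides:

words, current = [], []
for sub in results['subwords_alignment']:
    # a token starting with `▁` begins a new word, so flush the previous group
    if sub['text'].startswith('▁') and current:
        words.append(''.join(s['text'] for s in current).replace('▁', ''))
        current = []
    current.append(sub)
if current:
    words.append(''.join(s['text'] for s in current).replace('▁', ''))
print(words)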
[24]:
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)
ax.set_title('Alignment steps')
im = ax.imshow(
    results['alignment'].T,
    aspect='auto',
    origin='lower',
    interpolation='none')
ax.set_yticks(range(len(results['subwords'])))
labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(results['subwords'])
fig.colorbar(im, ax=ax)
xlabel = 'Encoder timestep'
plt.xlabel(xlabel)
plt.ylabel('Decoder timestep')
plt.tight_layout()
plt.show()
_images/force-alignment-transducer-pt_32_0.png
[25]:
plot_alignments(alignment = results['alignment'],
                subs_alignment = results['subwords_alignment'],
                words_alignment = results['words_alignment'],
                waveform = y,
                sample_rate = 16000,
                figsize = (16, 9))
_images/force-alignment-transducer-pt_33_0.png

What if we give a wrong transcription?#

[27]:
results = model.predict(y, 'husein sangat comel')
results
[27]:
{'words_alignment': [{'text': 'husein',
   'start': 0.8410552763819095,
   'end': 1.281608040201005,
   'start_t': 1,
   'end_t': 1,
   'score': 9.416054e-08},
  {'text': 'sangat',
   'start': 2.2828643216080398,
   'end': 2.3229145728643212,
   'start_t': 2,
   'end_t': 2,
   'score': 6.818997e-07},
  {'text': 'comel',
   'start': 3.324170854271357,
   'end': 3.8848743718592966,
   'start_t': 3,
   'end_t': 3,
   'score': 1.3340428e-05}],
 'subwords_alignment': [{'text': '▁hu',
   'start': 0.8410552763819095,
   'end': 0.8811055276381908,
   'start_t': 1,
   'end_t': 1,
   'score': 9.416054e-08},
  {'text': 'se',
   'start': 1.1214070351758794,
   'end': 1.1614572864321608,
   'start_t': 1,
   'end_t': 1,
   'score': 4.8340604e-07},
  {'text': 'in',
   'start': 1.2415577889447236,
   'end': 1.281608040201005,
   'start_t': 1,
   'end_t': 1,
   'score': 4.0840082e-09},
  {'text': '▁sangat',
   'start': 2.2828643216080398,
   'end': 2.3229145728643212,
   'start_t': 2,
   'end_t': 2,
   'score': 2.5190607e-06},
  {'text': '▁co',
   'start': 3.324170854271357,
   'end': 3.3642211055276383,
   'start_t': 3,
   'end_t': 3,
   'score': 0.00043028969},
  {'text': 'me',
   'start': 3.5644723618090453,
   'end': 3.6045226130653267,
   'start_t': 3,
   'end_t': 3,
   'score': 0.00030472153},
  {'text': 'l',
   'start': 3.844824120603015,
   'end': 3.8848743718592966,
   'start_t': 3,
   'end_t': 3,
   'score': 4.8931256e-06}],
 'subwords': ['▁hu', 'se', 'in', '▁sangat', '▁co', 'me', 'l'],
 'alignment': array([[2.72694809e-08, 1.46691752e-06, 2.72644751e-09, 7.75767646e-08,
         2.31194292e-07, 1.09011422e-08, 1.23739765e-08],
        [9.41605407e-08, 4.83406041e-07, 4.08400824e-09, 1.76549584e-08,
         2.92625117e-07, 2.61610573e-08, 8.27328950e-06],
        [3.89696273e-04, 6.81899678e-07, 1.51673867e-05, 2.51906067e-06,
         4.52378590e-05, 6.25562825e-06, 7.36032205e-04],
        [2.27073040e-02, 1.52006196e-05, 1.33404283e-05, 2.23881052e-05,
         4.30289685e-04, 3.04721529e-04, 4.89312561e-06],
        [3.43373443e-07, 4.62461092e-09, 1.42082008e-08, 3.31597818e-07,
         1.25402551e-07, 2.50174423e-08, 7.87086810e-06],
        [1.73348526e-05, 6.10900361e-06, 1.20940858e-05, 3.23311951e-06,
         1.60034288e-05, 2.12415078e-04, 3.16820992e-03],
        [4.66601477e-07, 8.34146192e-08, 7.52743574e-08, 1.89134155e-07,
         1.75862976e-06, 3.68233337e-08, 1.27846804e-02]], dtype=float32)}
[28]:
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)
ax.set_title('Alignment steps')
im = ax.imshow(
    results['alignment'].T,
    aspect='auto',
    origin='lower',
    interpolation='none')
ax.set_yticks(range(len(results['subwords'])))
labels = [item.get_text() for item in ax.get_yticklabels()]
ax.set_yticklabels(results['subwords'])
fig.colorbar(im, ax=ax)
xlabel = 'Encoder timestep'
plt.xlabel(xlabel)
plt.ylabel('Decoder timestep')
plt.tight_layout()
plt.show()
_images/force-alignment-transducer-pt_36_0.png

When the transcription does not match the audio, the model is not able to produce a meaningful alignment.
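One simple way to flag this programmatically is to look at the per-word scores, which collapse towards zero when the transcription does not match the audio. A minimal sketch with an arbitrary, illustrative cut-off (the threshold is our assumption, not something Malaya-Speech prescribes):

import numpy as np

THRESHOLD = 1e-3  # arbitrary cut-off, purely illustrative

scores = np.array([w['score'] for w in results['words_alignment']])
print('max word score:', scores.max())
if (scores < THRESHOLD).all():
    print('alignment looks unreliable, the transcription probably does not match the audio')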