Source code for malaya_speech.utils.char

import six
import string
from typing import List

PAD = '<PAD>'
EOS = '<EOS>'
RESERVED_TOKENS = [PAD, EOS]
NUM_RESERVED_TOKENS = len(RESERVED_TOKENS)
PAD_ID = RESERVED_TOKENS.index(PAD)
EOS_ID = RESERVED_TOKENS.index(EOS)
VOCAB_SIZE = 256
BLANK = 0
CTC_VOCAB = [''] + list(string.ascii_lowercase + string.digits) + [' ']
CTC_VOCAB_IDX = {c: i for i, c in enumerate(CTC_VOCAB)}
HF_CTC_VOCAB = [''] + list(string.ascii_lowercase + string.digits) + [' ', '?']
HF_CTC_VOCAB_IDX = {c: i for i, c in enumerate(HF_CTC_VOCAB)}


[docs]def strip_ids(ids, ids_to_strip):
    """Strip ids_to_strip from the end ids."""
    ids = list(ids)
    while ids and ids[-1] in ids_to_strip:
        ids.pop()
    return ids


[docs]def generate_vocab(strings: List[str]):
    """
    Generate character vocab sorted based on frequency.

    Parameters
    -----------
    strings: List[str]

    Returns
    --------
    result: List[str]
    """
    joined = ' '.join(strings)
    unique_chars = set(joined)
    unique_chars = [(c, joined.count(c)) for c in unique_chars]
    unique_chars = sorted(
        unique_chars, key=lambda element: element[1], reverse=True
    )
    unique_chars, _ = zip(*unique_chars)
    unique_chars = list(unique_chars)
    return RESERVED_TOKENS + unique_chars


[docs]def encode(
    string: str,
    add_eos: bool = True,
    add_blank: bool = False,
    lookup: List[str] = None,
):
    """
    Encode string to integer representation based on ascii table or lookup variable.

    Parameters
    -----------
    string: str
    add_eos: bool, optional (default=True)
        add EOS token at the end of encoded.
    add_blank: bool, optional (default=False)
        add BLANK token at the starting of encoded, this is for transducer / transformer based.
    lookup: List[str], optional (default=None)
        list of unique strings.

    Returns
    --------
    result: List[int]
    """
    if lookup:
        if len(lookup) != len(set(lookup)):
            raise ValueError('lookup must be a list of unique strings')
        r = [lookup.index(c) for c in string]
    else:
        r = [c + NUM_RESERVED_TOKENS for c in string.encode('utf-8')]
    if add_eos:
        r = r + [1]
    if add_blank:
        r = [BLANK] + r
    return r


[docs]def decode(ids, lookup: List[str] = None):
    """
    Decode integer representation to string based on ascii table or lookup variable.

    Parameters
    -----------
    ids: List[int]
    lookup: List[str], optional (default=None)
        list of unique strings.

    Returns
    --------
    result: str
    """
    decoded_ids = []
    int2byte = six.int2byte
    for id_ in ids:
        if lookup:
            decoded_ids.append(lookup[id_])
        else:
            decoded_ids.append(
                int2byte(id_ - NUM_RESERVED_TOKENS).decode('utf-8')
            )

    return ''.join(decoded_ids)