Source code for malaya_speech.utils.bpe

from typing import List
import logging

logger = logging.getLogger(__name__)

# `sentencepiece` is an optional dependency: record whether it imported so
# that callers can fail with a clear message only when they actually need it.
sentencepiece_available = False
try:
    import sentencepiece as spm
except Exception:
    # Deliberately broad: any failure while importing the package is treated
    # as "not available" and downgraded to a warning instead of breaking the
    # import of this module.
    logger.warning('`sentencepiece` is not available, any models that use sentencepiece will not able to use.')
else:
    sentencepiece_available = True


def load_sentencepiece(model_file):
    """
    Build a SentencePiece processor from a model file.

    Parameters
    ----------
    model_file: str
        sentencepiece model file.

    Returns
    --------
    result: sentencepiece.SentencePieceProcessor

    Raises
    ------
    ModuleNotFoundError
        If the module-level `sentencepiece_available` flag is false, i.e.
        the `sentencepiece` import at the top of this module did not succeed.
    """
    if not sentencepiece_available:
        raise ModuleNotFoundError(
            'sentencepiece not installed. Please install it by `pip install sentencepiece` and try again.'
        )

    return spm.SentencePieceProcessor(model_file=model_file)
class SentencePieceTokenProcessor:
    """
    Turn a sequence of SentencePiece token ids into a plain string.

    Attributes set by ``__init__``:

    - ``sp_model``: processor returned by ``load_sentencepiece``.
    - ``post_process_remove_list``: set holding the model's unk, eos and pad
      ids; tokens with these ids are dropped before decoding.
    """

    def __init__(self, sp_model_path):
        """
        Parameters
        ----------
        sp_model_path: str
            path passed to ``load_sentencepiece`` as ``model_file``.
        """
        sp_model = load_sentencepiece(model_file=sp_model_path)
        self.sp_model = sp_model
        self.post_process_remove_list = {
            sp_model.unk_id(),
            sp_model.eos_id(),
            sp_model.pad_id(),
        }

    def __call__(self, tokens: List[int], lstrip: bool = True) -> str:
        """
        Decode token ids to text.

        Parameters
        ----------
        tokens: List[int]
            token ids. The first element is always skipped
            (NOTE(review): presumably a leading blank/BOS token; confirm
            against the caller).
        lstrip: bool, optional (default=True)
            if True, strip leading whitespace from the result.

        Returns
        -------
        result: str
        """
        removable = self.post_process_remove_list
        kept_ids = [token_id for token_id in tokens[1:] if token_id not in removable]
        pieces = self.sp_model.id_to_piece(kept_ids)
        # U+2581 is the character the pieces use in place of a space.
        text = ''.join(pieces).replace('\u2581', ' ')
        return text.lstrip() if lstrip else text