Source code for malaya_speech.torch_model.nemo

import torch
import yaml
import numpy as np
from malaya_speech.utils.padding import sequence_1d
from malaya_speech.model.frame import Frame
from malaya_speech.utils import nemo_featurization
from malaya_speech.utils.nemo_featurization import (
    AudioToMelSpectrogramPreprocessor,
)
from malaya_speech.nemo import conv_asr
from malaya_speech.nemo.conv_asr import SpeakerDecoder
from malaya_speech.utils.activation import softmax
from malaya_boilerplate.torch_utils import to_tensor_cuda, to_numpy


class SpeakerVector(torch.nn.Module):
    def __init__(self, config, pth, model, name):
        super().__init__()
        with open(config) as stream:
            try:
                d = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                raise ValueError('invalid yaml')

        # Build preprocessor / encoder / decoder from the NeMo config,
        # dropping the `_target_` keys NeMo uses for its own instantiation.
        preprocessor = d['preprocessor'].copy()
        preprocessor.pop('_target_')
        encoder = d['encoder'].copy()
        encoder_target = encoder.pop('_target_').split('.')[-1]
        decoder = d['decoder'].copy()
        decoder.pop('_target_')

        self.preprocessor = AudioToMelSpectrogramPreprocessor(**preprocessor)
        self.encoder = getattr(conv_asr, encoder_target)(**encoder)
        self.decoder = SpeakerDecoder(**decoder)
        self.load_state_dict(torch.load(pth, map_location='cpu'))

        self.__model__ = model
        self.__name__ = name

    def forward(self, inputs):
        """
        Vectorize inputs.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
        """
        inputs = [
            input.array if isinstance(input, Frame) else input
            for input in inputs
        ]
        cuda = next(self.parameters()).is_cuda
        inputs, lengths = sequence_1d(inputs, return_len=True)
        inputs = to_tensor_cuda(torch.Tensor(inputs.astype(np.float32)), cuda)
        lengths = to_tensor_cuda(torch.Tensor(lengths), cuda)
        o_processor = self.preprocessor(inputs, lengths)
        o_encoder = self.encoder(*o_processor)
        return self.decoder(*o_encoder)

    def vectorize(self, inputs):
        """
        Vectorize inputs.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].

        Returns
        -------
        result: np.array
        """
        r = self.forward(inputs=inputs)
        return to_numpy(r[1])

    def __call__(self, inputs):
        return self.vectorize(inputs)
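

# A minimal usage sketch (not part of the original module), assuming a
# TitaNet-style NeMo config and checkpoint; the file names and arguments below
# are hypothetical placeholders. In practice malaya_speech constructs this
# class through its model loaders rather than directly.
#
#     import numpy as np
#
#     model = SpeakerVector(
#         config='titanet_large.yaml',
#         pth='titanet_large.pth',
#         model='titanet-large',
#         name='speaker-vector',
#     )
#     y = np.random.uniform(-1, 1, size=16000).astype(np.float32)  # ~1s of 16 kHz audio
#     embedding = model.vectorize([y])  # decoder embedding, roughly [1, embedding_dim]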


class Classification(torch.nn.Module):
    def __init__(self, config, pth, label, model, name):
        super().__init__()
        with open(config) as stream:
            try:
                d = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                raise ValueError('invalid yaml')

        # Configs may declare the class either as `_target_` or as the older
        # `cls` + `params` layout, so accept both when resolving class names.
        preprocessor = d['preprocessor'].copy()
        preprocessor_target = (preprocessor.pop('_target_', None) or preprocessor.pop('cls', None)).split('.')[-1]
        if 'params' in preprocessor:
            preprocessor = preprocessor['params']

        encoder = d['encoder'].copy()
        encoder_target = (encoder.pop('_target_', None) or encoder.pop('cls', None)).split('.')[-1]
        if 'params' in encoder:
            encoder = encoder['params']

        decoder = d['decoder'].copy()
        decoder_target = (decoder.pop('_target_', None) or decoder.pop('cls', None)).split('.')[-1]
        if 'params' in decoder:
            decoder = decoder['params']

        self.preprocessor = getattr(nemo_featurization, preprocessor_target)(**preprocessor)
        self.encoder = getattr(conv_asr, encoder_target)(**encoder)
        self.decoder = getattr(conv_asr, decoder_target)(**decoder)
        self.load_state_dict(torch.load(pth, map_location='cpu'))

        self.labels = label
        self.__model__ = model
        self.__name__ = name

    def forward(self, inputs):
        """
        Forward inputs through the preprocessor, encoder and decoder.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
        """
        inputs = [
            input.array if isinstance(input, Frame) else input
            for input in inputs
        ]
        cuda = next(self.parameters()).is_cuda
        inputs, lengths = sequence_1d(inputs, return_len=True)
        inputs = to_tensor_cuda(torch.Tensor(inputs.astype(np.float32)), cuda)
        lengths = to_tensor_cuda(torch.Tensor(lengths), cuda)
        o_processor = self.preprocessor(inputs, lengths)
        o_encoder = self.encoder(*o_processor)
        try:
            r = self.decoder(*o_encoder)
        except BaseException:
            r = self.decoder(o_encoder[0])
        return r

    def predict_proba(self, inputs):
        """
        Predict inputs, will return probability.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].

        Returns
        -------
        result: np.array
            returned [B, D].
        """
        o = self.forward(inputs=inputs)
        if isinstance(o, tuple):
            o = o[0]
        r = to_numpy(o)
        return softmax(r, axis=-1)

    def predict(self, inputs):
        """
        Predict inputs, will return labels.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].

        Returns
        -------
        result: List[str]
            returned [B].
        """
        o = self.forward(inputs=inputs)
        if isinstance(o, tuple):
            o = o[0]
        r = to_numpy(o)
        probs = np.argmax(r, axis=1)
        return [self.labels[p] for p in probs]

    def __call__(self, input):
        """
        Predict input, will return label.

        Parameters
        ----------
        input: np.array
            np.array or malaya_speech.model.frame.Frame.

        Returns
        -------
        result: str
        """
        return self.predict([input])[0]
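

# A minimal usage sketch (not part of the original module), assuming a NeMo
# speaker/audio classification config and checkpoint; the file names and label
# list below are hypothetical placeholders.
#
#     import numpy as np
#
#     model = Classification(
#         config='classifier.yaml',
#         pth='classifier.pth',
#         label=['female', 'male'],
#         model='classifier',
#         name='gender-detection',
#     )
#     y = np.random.uniform(-1, 1, size=16000).astype(np.float32)
#     model.predict_proba([y])  # softmax probabilities, shape [1, len(label)]
#     model.predict([y])        # list of label strings, e.g. ['female']
#     model(y)                  # single input, returns one label string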