@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Speech Dataset from IIUM Confession texts},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/iium}}
}

IIUM-Clear #

Read random sentences from IIUM Confession, cleaner version.

voice by Husein Zolkepli.
Heavily speaking in Selangor dialect.
Recorded using mid-end tech microphone.
44100 sample rate, random 7 - 11 words window.
approximately 0.1 hours.

@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Speech Dataset from IIUM Confession texts},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/iium}}
}

IMDA #

Mirror link for the IMDA dataset, https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus; only the PART 3 and SST datasets were downloaded.

16000 sample rate.
supervised, approximately 2024 hours.

language #

Gather youtube urls for hyperlocal language detection from speech {malay, indonesian, manglish, english, mandarin}.

Check hyperlocal language detection models at https://malaya-speech.readthedocs.io/en/latest/load-language-detection.html

@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Hyperlocal languages for speech dataset},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/language}}
}

mixed-stt #

Malay, Singlish and Mandarin STT dataset in TFRecord format. Includes scripts showing how to load it using torch.dataset.

news #

Read random sentences from bahasa news.

voice by Husein Zolkepli.
Heavily speaking in Selangor dialect.
Recorded using mid-end tech microphone, suitable for text to speech.
44100 sample rate, random 7 - 11 words window.
approximately 3.01 hours.
Recording is still ongoing.

@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Speech Dataset from local news texts},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/news}}
}

noise #

Simple noises gathered from Youtube.

Sebut perkataan #

Read random words from a Malay dictionary, each prefixed with ‘tolong sebut’.

sebut-perkataan-man voice by Husein Zolkepli
tolong-sebut voice by Khalil Nooh
sebut-perkataan-woman voice by Mas Aisyah Ahmad
Recorded using low-end tech microphones.

@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Short Speech Dataset},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/sebut-perkataan}}
}

Semisupervised audiobook #

Semisupervised malay audiobooks from Nusantara Audiobook using Google Speech to Text.

44100 sample rate, super clean.
semisupervised, approximately 45.29 hours.
windowed using Malaya-Speech VAD, each at least 5 negative voice activities.

@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Semisupervised Speech Recognition from Audiobook},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/semisupervised-audiobook}}
}

Semisupervised malay #

Semisupervised Malay YouTube videos transcribed using Google Speech to Text, then corrected by humans.

16000 sample rate.
semisupervised, approximately 1804 hours.
random length between 2 - 20 seconds, windowed using google VAD.
supervised, 768 samples, approximately 1.3 hours.

@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Semisupervised Speech Recognition from Malay Youtube Videos},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/semisupervised-malay}}
}

Semisupervised manglish #

Semisupervised manglish youtube videos using Google Speech to Text.

16000 sample rate.
semisupervised, approximately 107 hours.
random length between 2 - 20 seconds, windowed using google VAD.

@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Semisupervised Speech Recognition from Manglish Youtube Videos},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/semisupervised-manglish}}
}

wattpad #

Read random sentences from bahasa wattpad.

voice by Husein Zolkepli.
Heavily speaking in Selangor dialect.
Recorded using mid-end tech microphone, suitable for text to speech.
44100 sample rate, random 7 - 11 words window.
approximately 0.15 hours.
Recording is still ongoing.

@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Speech Dataset from Wattpad texts},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/wattpad}}
}

Wikipedia #

Read random sentences from Bahasa Wikipedia.

voice by Husein Zolkepli.
Heavily speaking in Selangor dialect.
Recorded using low-end tech microphone.
44100 sample rate, 4 words window.
approximately 3.4 hours.
Recording is still ongoing.

@misc{Malay-Dataset,
  author = {Husein, Zolkepli},
  title = {Malay-Dataset},
  note = {We gather Bahasa Malaysia corpus!, Speech Dataset from Wikipedia texts},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huseinzol05/malaya-speech/tree/master/data/wikipedia}}
}

youtube #

Semisupervised transcription and Unsupervised Speaker Diarization on 5k malay speakers youtube videos.

Speech Dataset

Contents

Speech Dataset#

How we gather dataset?#

License#

Dataset#

Ambient #

Audiobook #

Azure-TTS #

GCP-TTS #

Emotion #

IIUM #