Source code for stable_datasets.timeseries.VoiceGenderDetection
import zipfile
from stable_datasets.schema import (
ClassLabel,
DatasetInfo,
DatasetSource,
DownloadInfo,
Features,
Sequence,
Value,
Version,
)
from stable_datasets.utils import BaseDatasetBuilder
from ._audio_utils import audiosegment_bytes_to_series
[docs]
class VoiceGenderDetection(BaseDatasetBuilder):
    """Voice gender detection dataset derived from VoxCeleb clips.

    Examples are read straight out of a single zip archive (the ``train``
    asset). Only ``.m4a`` members whose immediate parent folder is named
    ``males`` or ``females`` become examples; every other archive entry is
    ignored.
    """

    VERSION = Version("1.0.0")

    SOURCE = DatasetSource(
        homepage="https://dagshub.com/DagsHub/audio-datasets/src/main/voice_gender_detection",
        assets={
            "train": DownloadInfo(
                url="https://drive.google.com/u/0/uc?id=1HRbWocxwClGy9Fj1MQeugpR4vOaL9ebO&export=download",
                filename="VoxCeleb_gender.zip",
            ),
        },
        citation="See dataset homepage.",
    )

    def _info(self):
        """Return the dataset metadata: feature schema, supervised keys, provenance.

        The schema has four columns:

        * ``series``   - nested sequence of ``float32`` values,
        * ``label``    - class label with names ``male`` (0) and ``female`` (1),
        * ``gender``   - the gender as a plain string,
        * ``filename`` - base name of the archive member.
        """
        schema = Features(
            {
                "series": Sequence(Sequence(Value("float32"))),
                "label": ClassLabel(names=["male", "female"]),
                "gender": Value("string"),
                "filename": Value("string"),
            }
        )
        source = self.SOURCE
        return DatasetInfo(
            description="Voice gender detection dataset with male/female labels.",
            features=schema,
            supervised_keys=("series", "label"),
            homepage=source["homepage"],
            citation=source["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Yield ``(key, example)`` pairs for every usable clip in the archive.

        Args:
            data_path: Location of the zip archive, passed to ``zipfile.ZipFile``.
            split: Accepted for interface compatibility and discarded; the
                archive is not partitioned by split here.

        Yields:
            Tuples of the full archive member name and a dict with the keys
            ``series``, ``label``, ``gender`` and ``filename``.
        """
        del split

        # Folder name -> class index; the indices line up with the ClassLabel
        # names ["male", "female"] declared in _info().
        label_by_folder = {"males": 0, "females": 1}

        with zipfile.ZipFile(data_path) as archive:
            # namelist() hands back a fresh list, so sorting it in place is
            # safe; the sort keeps the example order deterministic.
            members = archive.namelist()
            members.sort()

            for member in members:
                # Directory entries end in "/" and therefore never pass the
                # extension test, so this single check skips them too.
                if not member.lower().endswith(".m4a"):
                    continue

                # Only the last two path components matter:
                # "<...>/<folder>/<filename>".
                pieces = member.rsplit("/", 2)
                if len(pieces) < 2:
                    # Member sits at the archive root; no folder to label it.
                    continue

                folder, filename = pieces[-2], pieces[-1]
                label = label_by_folder.get(folder)
                if label is None:
                    continue

                # NOTE(review): the layout of the decoded series is decided by
                # audiosegment_bytes_to_series in ._audio_utils, which is not
                # visible from this file.
                series = audiosegment_bytes_to_series(archive.read(member), format="m4a")

                example = {
                    "series": series,
                    "label": label,
                    # Drop the trailing "s": "males" -> "male", "females" -> "female".
                    "gender": folder[:-1],
                    "filename": filename,
                }
                yield member, example