Source code for stable_datasets.timeseries.VoiceGenderDetection

import zipfile

from stable_datasets.schema import (
    ClassLabel,
    DatasetInfo,
    DatasetSource,
    DownloadInfo,
    Features,
    Sequence,
    Value,
    Version,
)
from stable_datasets.utils import BaseDatasetBuilder

from ._audio_utils import audiosegment_bytes_to_series



[docs]
class VoiceGenderDetection(BaseDatasetBuilder):
    """Builder for a male/female voice classification dataset.

    The audio comes from a single zip archive (``VoxCeleb_gender.zip``) in
    which every ``.m4a`` clip sits directly inside a ``males`` or ``females``
    folder; that folder name is the only source of the label.
    """

    VERSION = Version("1.0.0")

    # Only one downloadable asset is declared, registered under the "train" key.
    SOURCE = DatasetSource(
        homepage="https://dagshub.com/DagsHub/audio-datasets/src/main/voice_gender_detection",
        assets={
            "train": DownloadInfo(
                url="https://drive.google.com/u/0/uc?id=1HRbWocxwClGy9Fj1MQeugpR4vOaL9ebO&export=download",
                filename="VoxCeleb_gender.zip",
            ),
        },
        citation="See dataset homepage.",
    )

    def _info(self):
        """Return the ``DatasetInfo`` describing the schema and provenance.

        Each example carries a nested float32 sequence (``series``), an
        integer class label (``label``), the gender as a plain string
        (``gender``) and the clip's base file name (``filename``).
        """
        schema = Features(
            {
                "series": Sequence(Sequence(Value("float32"))),
                "label": ClassLabel(names=["male", "female"]),
                "gender": Value("string"),
                "filename": Value("string"),
            }
        )
        return DatasetInfo(
            description="Voice gender detection dataset with male/female labels.",
            features=schema,
            supervised_keys=("series", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Yield ``(key, example)`` pairs for every labelled clip in the archive.

        Args:
            data_path: Location of the zip archive, passed to ``zipfile.ZipFile``.
            split: Accepted for interface compatibility; it is discarded and
                has no influence on which clips are produced.

        Yields:
            A tuple of the archive member name (used as the example key) and a
            dict with the keys ``series``, ``label``, ``gender`` and
            ``filename``. ``series`` is whatever
            ``audiosegment_bytes_to_series`` returns for the clip bytes.
        """
        del split  # the split name plays no part in reading the archive

        # Parent folder name -> (singular gender string, integer class id).
        folder_to_target = {
            "males": ("male", 0),
            "females": ("female", 1),
        }

        with zipfile.ZipFile(data_path) as archive:
            # Sorted member names give a deterministic example order.
            for entry in sorted(archive.namelist()):
                # Directory entries end with "/" and therefore can never end
                # with ".m4a", so this single check rejects them as well.
                if not entry.lower().endswith(".m4a"):
                    continue

                parent_path, slash, clip_name = entry.rpartition("/")
                if not slash:
                    # Member sits at the archive root: no folder to label it.
                    continue

                # Last component of the parent path is the immediate folder.
                folder = parent_path.rpartition("/")[2]
                target = folder_to_target.get(folder)
                if target is None:
                    continue
                gender_name, class_id = target

                waveform = audiosegment_bytes_to_series(archive.read(entry), format="m4a")
                example = {
                    "series": waveform,
                    "label": class_id,
                    "gender": gender_name,
                    "filename": clip_name,
                }
                yield entry, example