Source code for stable_datasets.timeseries.picidae

import zipfile

from stable_datasets.schema import (
    ClassLabel,
    DatasetInfo,
    DatasetSource,
    DownloadInfo,
    Features,
    Sequence,
    Value,
    Version,
)
from stable_datasets.utils import BaseDatasetBuilder

from ._audio_utils import wav_bytes_to_series


# Class names accepted by the Picidae builder. Each entry is matched against
# the name of the folder that directly contains a WAV file in the archive.
# Order is significant: the position of a name in this list is the integer
# class id (it is passed to ``ClassLabel(names=...)`` and looked up with
# ``PICIDAE_LABELS.index``), so append new labels rather than reordering.
PICIDAE_LABELS = [
    "BackgroundNoise",
    "DendrocoposLeucotos-call",
    "DendrocoposLeucotos-drumming",
    "DendrocoposMajor-call",
    "DendrocoposMajor-drumming",
    "DendrocoposMedius-call",
    "DendrocoposMedius-song",
    "DendrocoposMinor-call",
    "DendrocoposMinor-drumming",
    "DryocopusMartius-call",
    "DryocopusMartius-drumming",
    "JynxTorquilla-song",
    "PicusViridis-song",
]


class Picidae(BaseDatasetBuilder):
    """Builder for the Picidae birdsong classification dataset.

    Examples are read from a single zip archive (``PicidaeDataset.zip``).
    Every ``.wav`` member whose immediate parent folder is named after one of
    ``PICIDAE_LABELS`` becomes one example; all other members are skipped.
    """

    VERSION = Version("1.0.0")

    # Only a "train" asset is declared. The fallback entry points at the same
    # archive through the alternative ``/records/`` URL form.
    SOURCE = DatasetSource(
        homepage="https://zenodo.org/records/574438",
        assets={
            "train": DownloadInfo(
                url="https://zenodo.org/record/574438/files/PicidaeDataset.zip?download=1",
                fallbacks=["https://zenodo.org/records/574438/files/PicidaeDataset.zip"],
                filename="PicidaeDataset.zip",
            ),
        },
        citation="See dataset homepage.",
    )

    def _info(self):
        """Return the dataset metadata together with the per-example schema."""
        example_schema = Features(
            {
                "series": Sequence(Sequence(Value("float32"))),
                "label": ClassLabel(names=PICIDAE_LABELS),
                "label_name": Value("string"),
                "xc_identifier": Value("string"),
                "filename": Value("string"),
            }
        )
        source = self.SOURCE
        return DatasetInfo(
            description="Picidae birdsong classification dataset with species/song-type labels.",
            features=example_schema,
            supervised_keys=("series", "label"),
            homepage=source["homepage"],
            citation=source["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Yield one ``(key, example)`` pair per labelled WAV file in the archive.

        Args:
            data_path: Location of the zip archive; handed to ``zipfile.ZipFile``.
            split: Accepted for interface compatibility and ignored.

        Yields:
            A tuple of the archive member name (used as the example key) and a
            dict with the keys ``series``, ``label``, ``label_name``,
            ``xc_identifier`` and ``filename``.
        """
        del split  # the split name does not influence which members are read

        with zipfile.ZipFile(data_path) as archive:
            # Sorted so that examples come out in a stable, name-based order.
            for member in sorted(archive.namelist()):
                segments = member.split("/")
                filename = segments[-1]

                is_wav = member.lower().endswith(".wav")
                # "._"-prefixed path components are skipped — presumably macOS
                # AppleDouble metadata entries; confirm against the archive.
                is_dot_underscore = "/._" in member or filename.startswith("._")
                if not is_wav or is_dot_underscore:
                    continue

                # Need at least <something>/<label folder>/<file>.
                if len(segments) < 3:
                    continue

                label_name = segments[-2]
                if label_name not in PICIDAE_LABELS:
                    continue

                # Text before the first "-" in the file name (presumably the
                # xeno-canto recording id — verify against the dataset docs).
                xc_identifier = filename.split("-")[0]

                example = {
                    "series": wav_bytes_to_series(archive.read(member)),
                    "label": PICIDAE_LABELS.index(label_name),
                    "label_name": label_name,
                    "xc_identifier": xc_identifier,
                    "filename": filename,
                }
                yield member, example