# Source code for stable_datasets.timeseries.sonycust

import csv
import tarfile

import numpy as np

from stable_datasets.schema import DatasetInfo, DatasetSource, DownloadInfo, Features, Sequence, Value, Version
from stable_datasets.splits import Split, SplitGenerator
from stable_datasets.utils import BaseDatasetBuilder, bulk_download

from ._audio_utils import wav_bytes_to_series


# Coarse category names, emitted unchanged as ``coarse_label_names`` with every
# example. There are eight names, one for each block delimited by
# FINE_LABEL_BLOCKS (presumably in the same order -- confirm against the
# annotation header).
COARSE_LABELS = [
    "engine",
    "machinery-impact",
    "non-machinery-impact",
    "powered-saw",
    "alert-signal",
    "music",
    "human-voice",
    "dog",
]

# Boundaries into the 29-element fine-label vector (CSV columns 4..32).
# Each consecutive pair (start, stop) selects the fine labels of one coarse
# block; the builder takes the max over that slice as the coarse label.
FINE_LABEL_BLOCKS = [0, 4, 9, 10, 14, 19, 23, 28, 29]


class SONYCUST(BaseDatasetBuilder):
    """SONYC Urban Sound Tagging development dataset.

    Downloads the audio archive and the annotation CSV declared in ``SOURCE``
    and yields one example per annotation row, exposed as a single ``train``
    split.
    """

    VERSION = Version("1.0.0")

    # Remote assets: a gzip-compressed tar archive of audio files and a CSV
    # file with one annotation per row.
    SOURCE = DatasetSource(
        homepage="https://zenodo.org/records/3233082",
        assets={
            "audio": DownloadInfo(
                url="https://zenodo.org/record/3233082/files/audio-dev.tar.gz?download=1",
                fallbacks=["https://zenodo.org/records/3233082/files/audio-dev.tar.gz"],
                filename="audio-dev.tar.gz",
            ),
            "annotations": DownloadInfo(
                url="https://zenodo.org/record/3233082/files/annotations-dev.csv?download=1",
                fallbacks=["https://zenodo.org/records/3233082/files/annotations-dev.csv"],
                filename="annotations-dev.csv",
            ),
        },
        citation="See dataset homepage.",
    )

    def _info(self):
        """Return the dataset description and the schema of yielded examples."""
        return DatasetInfo(
            description="SONYC Urban Sound Tagging development dataset with fine and coarse multilabel annotations.",
            features=Features(
                {
                    "series": Sequence(Sequence(Value("float32"))),
                    "fine_labels": Sequence(Value("int32")),
                    "coarse_labels": Sequence(Value("int32")),
                    "fine_label_names": Sequence(Value("string")),
                    "coarse_label_names": Sequence(Value("string")),
                    "relative_path": Value("string"),
                    "filename": Value("string"),
                    "metadata": Sequence(Value("string")),
                }
            ),
            supervised_keys=None,
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _split_generators(self):
        """Download both assets and declare the single ``train`` split."""
        source = self._source()
        audio_path, annotations_path = bulk_download(
            [
                self._normalize_download_info(source["assets"]["audio"], asset_name="audio"),
                self._normalize_download_info(source["assets"]["annotations"], asset_name="annotations"),
            ],
            dest_folder=self._raw_download_dir,
        )
        return [
            SplitGenerator(
                name=Split.TRAIN,
                gen_kwargs={"audio_path": audio_path, "annotations_path": annotations_path, "split": "train"},
            )
        ]

    def _candidate_splits(self) -> list:
        """Return the splits this builder can produce (only ``train``)."""
        return [Split.TRAIN]

    def _generate_examples(self, audio_path, annotations_path, split):
        """Yield ``(row_index, example)`` pairs, one per usable annotation row.

        Args:
            audio_path: Path to the gzip-compressed tar archive of audio files.
            annotations_path: Path to the UTF-8 annotation CSV. The first row is
                the header; columns 4..32 hold the 29 fine labels.
            split: Unused; accepted because the split generator passes it.

        Yields:
            Tuples of the CSV row index (header excluded) and a dict matching
            the features declared in ``_info``.

        Raises:
            FileNotFoundError: If a row references audio missing from the archive.
        """
        del split

        with open(annotations_path, newline="", encoding="utf-8") as fh:
            reader = csv.reader(fh)
            # A bare next() on an empty file would raise StopIteration inside
            # this generator, which Python turns into RuntimeError (PEP 479).
            header = next(reader, None)
            if header is None:
                return
            rows = list(reader)

        fine_label_names = header[4:33]

        with tarfile.open(audio_path, "r:gz") as archive:
            # Index the archive once by the last two path components
            # ("<dir>/<file>") instead of rescanning every member per row.
            # setdefault keeps the first member in archive order on duplicates.
            members_by_tail = {}
            for member in archive.getmembers():
                tail = "/".join(member.name.split("/")[-2:])
                members_by_tail.setdefault(tail, member)

            # NOTE(review): rows are read in CSV order, not archive order;
            # presumably this forces backward seeks in the gzip stream --
            # consider iterating in archive order if loading is slow.
            for idx, row in enumerate(rows):
                if len(row) < 33:
                    # Row too short to contain all 29 fine-label columns.
                    continue

                relative_path = f"{row[0]}/{row[2]}"
                member = members_by_tail.get(relative_path)
                if member is None:
                    # Unusual path shape, or truly missing: defer to the
                    # linear scan, which raises FileNotFoundError.
                    member = _find_member(archive, relative_path)

                fine_labels = np.asarray(row[4:33], dtype="float32").astype("int32")
                # A coarse label is the max over the fine labels of its block.
                coarse_labels = [
                    int(fine_labels[start:stop].max())
                    for start, stop in zip(FINE_LABEL_BLOCKS[:-1], FINE_LABEL_BLOCKS[1:])
                ]

                extracted = archive.extractfile(member)
                if extracted is None:
                    # extractfile returns None for members that are not
                    # regular files or links.
                    continue
                with extracted:
                    wav_bytes = extracted.read()

                yield (
                    idx,
                    {
                        "series": wav_bytes_to_series(wav_bytes),
                        "fine_labels": fine_labels.tolist(),
                        "coarse_labels": coarse_labels,
                        # Copies, so a consumer mutating an example cannot
                        # corrupt other examples or the module constant.
                        "fine_label_names": list(fine_label_names),
                        "coarse_label_names": list(COARSE_LABELS),
                        "relative_path": relative_path,
                        "filename": row[2],
                        "metadata": row[:4],
                    },
                )


def _find_member(archive: tarfile.TarFile, suffix: str) -> tarfile.TarInfo:
    """Return the first archive member whose path ends with ``suffix``.

    Matching is anchored on a path-component boundary: a member matches when
    its name equals ``suffix`` or ends with ``"/" + suffix``. A plain
    ``str.endswith`` would let ``"train/a.wav"`` match ``"xtrain/a.wav"``.

    Args:
        archive: An open tar archive.
        suffix: Trailing portion of the member path; leading slashes are ignored.

    Returns:
        The first matching member, in archive order.

    Raises:
        FileNotFoundError: If no member matches.
    """
    suffix = suffix.lstrip("/")
    anchored = f"/{suffix}"
    for member in archive.getmembers():
        name = member.name
        if name == suffix or name.endswith(anchored):
            return member
    raise FileNotFoundError(f"Could not find {suffix!r} in {archive.name}")