Source code for stable_datasets.timeseries.JapaneseVowels

import tempfile
import zipfile
from pathlib import Path

import numpy as np

from stable_datasets.schema import (
    ClassLabel,
    DatasetInfo,
    DatasetSource,
    DownloadInfo,
    Features,
    Sequence,
    Value,
    Version,
)
from stable_datasets.utils import BaseDatasetBuilder, load_from_tsfile_to_dataframe


class JapaneseVowels(BaseDatasetBuilder):
    """JapaneseVowels multivariate timeseries classification dataset.

    Each example is a multivariate series with ``SEQUENCE_LENGTH`` time steps
    and a label drawn from nine ``speaker_<n>`` classes.
    """

    VERSION = Version("1.0.0")

    # Both splits are stored in the same zip archive; the split name selects
    # which ``.ts`` member is read in ``_generate_examples``.
    SOURCE = DatasetSource(
        homepage="http://www.timeseriesclassification.com/description.php?Dataset=JapaneseVowels",
        assets={
            "train": DownloadInfo(url="http://www.timeseriesclassification.com/Downloads/JapaneseVowels.zip"),
            "test": DownloadInfo(url="http://www.timeseriesclassification.com/Downloads/JapaneseVowels.zip"),
        },
        citation="See dataset homepage.",
    )

    # Number of time steps every series is re-indexed to before stacking.
    SEQUENCE_LENGTH = 29

    def _info(self):
        """Return the dataset description, feature schema and provenance."""
        return DatasetInfo(
            description="JapaneseVowels multivariate timeseries classification dataset.",
            features=Features(
                {
                    "series": Sequence(Sequence(Value("float32"))),
                    "label": ClassLabel(names=[f"speaker_{idx}" for idx in range(9)]),
                }
            ),
            supervised_keys=("series", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Yield ``(index, example)`` pairs for one split.

        Args:
            data_path: Path to the downloaded zip archive.
            split: Split name; it is upper-cased to select the archive member
                ``JapaneseVowels_<SPLIT>.ts``.

        Yields:
            Tuples of the running example index and a dict with a float32
            ``series`` array and an integer ``label``.

        Raises:
            FileNotFoundError: If the archive has no member for ``split``.
        """
        with zipfile.ZipFile(data_path) as archive:
            member = _find_member(archive, f"JapaneseVowels_{split.upper()}.ts")
            with tempfile.TemporaryDirectory() as tmp:
                # ``extract`` sanitises the member name and returns the path it
                # actually wrote; rebuilding it as ``tmp / member`` can point
                # at a different (or non-existent) location.
                ts_path = Path(archive.extract(member, tmp))
                # Must be parsed before the temporary directory is removed.
                x_df, labels = load_from_tsfile_to_dataframe(ts_path)

        # Re-index every cell to a common length (missing steps become NaN) so
        # the cells of a column can be stacked into one array.
        # NOTE(review): assumes each cell of ``x_df`` is a pandas Series.
        steps = range(self.SEQUENCE_LENGTH)
        dims = [
            np.stack([cell.reindex(steps) for cell in x_df[col]])
            for col in x_df.columns
        ]
        # Stack the per-column arrays along a new last axis, then replace the
        # NaN padding with zeros.
        series = np.nan_to_num(np.stack(dims, axis=-1).astype("float32"))

        # Class ids follow the sorted order of the label names seen in this split.
        label_to_id = {
            name: idx
            for idx, name in enumerate(sorted({_label_name(v) for v in labels}))
        }
        for idx, (x, y) in enumerate(zip(series, labels)):
            yield idx, {"series": x, "label": label_to_id[_label_name(y)]}


def _find_member(archive: zipfile.ZipFile, suffix: str) -> str:
    """Return the name of the archive member that matches ``suffix``.

    Matching is case-insensitive. A member whose base name (the part after
    the last ``/``) equals ``suffix`` is preferred, so that entries such as
    ``__MACOSX/._<file>`` are not picked over the real file. If no base name
    matches, the first member whose full name ends with ``suffix`` is used.

    Args:
        archive: Open zip archive to search.
        suffix: File name (or trailing part of a member name) to look for.

    Returns:
        The member name exactly as stored in the archive.

    Raises:
        FileNotFoundError: If no member matches.
    """
    suffix = suffix.lower()
    fallback = None
    for name in archive.namelist():
        lowered = name.lower()
        # Zip member names always use "/" as the separator.
        if lowered.rsplit("/", 1)[-1] == suffix:
            return name
        if fallback is None and lowered.endswith(suffix):
            fallback = name
    if fallback is not None:
        return fallback
    raise FileNotFoundError(f"Could not find {suffix!r} in {archive.filename}")


def _label_name(value) -> str:
    """Convert a raw label value to its string name.

    Objects exposing ``.item()`` (e.g. NumPy scalars) are unwrapped first,
    bytes are decoded as UTF-8, and anything else goes through ``str``.
    """
    # Unwrap before the bytes check so a wrapped bytes value is decoded
    # rather than rendered as "b'...'".
    if hasattr(value, "item"):
        value = value.item()
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return str(value)