Source code for stable_datasets.timeseries.JapaneseVowels
import tempfile
import zipfile
from pathlib import Path
import numpy as np
from stable_datasets.schema import (
ClassLabel,
DatasetInfo,
DatasetSource,
DownloadInfo,
Features,
Sequence,
Value,
Version,
)
from stable_datasets.utils import BaseDatasetBuilder, load_from_tsfile_to_dataframe
[docs]
class JapaneseVowels(BaseDatasetBuilder):
    """JapaneseVowels multivariate timeseries classification dataset.

    Both splits are declared against the same ``JapaneseVowels.zip`` archive;
    the split name selects the ``JapaneseVowels_<SPLIT>.ts`` member inside it.
    """

    VERSION = Version("1.0.0")

    SOURCE = DatasetSource(
        homepage="http://www.timeseriesclassification.com/description.php?Dataset=JapaneseVowels",
        assets={
            "train": DownloadInfo(url="http://www.timeseriesclassification.com/Downloads/JapaneseVowels.zip"),
            "test": DownloadInfo(url="http://www.timeseriesclassification.com/Downloads/JapaneseVowels.zip"),
        },
        citation="See dataset homepage.",
    )

    # Number of time steps every series is reindexed to in _generate_examples.
    SEQUENCE_LENGTH = 29

    def _info(self):
        """Return the dataset metadata: feature schema, supervised keys, homepage, citation."""
        return DatasetInfo(
            description="JapaneseVowels multivariate timeseries classification dataset.",
            features=Features(
                {
                    "series": Sequence(Sequence(Value("float32"))),
                    "label": ClassLabel(names=[f"speaker_{idx}" for idx in range(9)]),
                }
            ),
            supervised_keys=("series", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Yield ``(index, example)`` pairs for one split of the dataset.

        Args:
            data_path: Location of the downloaded zip archive.
            split: Split name; its upper-cased form selects the
                ``JapaneseVowels_<SPLIT>.ts`` member of the archive.

        Yields:
            ``(idx, {"series": array, "label": int})`` where ``series`` is a
            float32 array and ``label`` is the integer id of the example's
            class label.

        Raises:
            FileNotFoundError: If the archive has no matching ``.ts`` member.
        """
        with zipfile.ZipFile(data_path) as archive:
            member = _find_member(archive, f"JapaneseVowels_{split.upper()}.ts")
            with tempfile.TemporaryDirectory() as tmp:
                # extract() normalises the member name (leading slashes, "..",
                # drive letters, illegal characters) and returns the path it
                # actually wrote, so use that instead of re-joining tmp/member.
                ts_path = Path(archive.extract(member, tmp))
                x_df, labels = load_from_tsfile_to_dataframe(ts_path)

        # Reindex every cell onto the labels 0..SEQUENCE_LENGTH-1; labels that
        # are missing from a cell become NaN.
        # NOTE(review): assumes each cell is a 1-D pandas Series with a default
        # integer index, as implied by the .reindex call — confirm against
        # load_from_tsfile_to_dataframe.
        steps = range(self.SEQUENCE_LENGTH)
        dims = [np.stack([cell.reindex(steps) for cell in x_df[col]]) for col in x_df.columns]

        # Stack the per-column arrays along a new last axis, then let
        # nan_to_num replace the NaN padding with 0.0.
        series = np.nan_to_num(np.stack(dims, axis=-1).astype("float32"))

        # Ids follow the sorted order of the distinct label strings seen in
        # THIS split.
        # NOTE(review): if a split ever lacks a class, ids would shift relative
        # to the ClassLabel names declared in _info — confirm all splits carry
        # every class.
        label_to_id = {name: idx for idx, name in enumerate(sorted({_label_name(v) for v in labels}))}

        for idx, (x, y) in enumerate(zip(series, labels)):
            yield idx, {"series": x, "label": label_to_id[_label_name(y)]}
def _find_member(archive: zipfile.ZipFile, suffix: str) -> str:
suffix = suffix.lower()
for name in archive.namelist():
if name.lower().endswith(suffix):
return name
raise FileNotFoundError(f"Could not find {suffix!r} in {archive.filename}")
def _label_name(value) -> str:
if isinstance(value, bytes):
return value.decode("utf-8")
if hasattr(value, "item"):
value = value.item()
return str(value)