import csv
import io
import os
import tarfile
import datasets
from PIL import Image
from stable_datasets.utils import BaseDatasetBuilder
class HASYv2(BaseDatasetBuilder):
    """Dataset builder for HASYv2 (handwritten symbols).

    Abstract
        The HASYv2 dataset contains handwritten symbol images of 369 classes.
        It includes over 168,000 samples categorized into various classes like
        Latin characters, numerals, and symbols. Each image is 32x32 pixels in
        size. The dataset was created to benchmark the classification of
        mathematical symbols and handwritten characters.

    Context
        Recognizing handwritten mathematical symbols is a challenging task due
        to the similarity between classes (e.g., '1', 'l', '|') and the large
        number of unique symbols used in scientific notation. HASYv2 serves as
        a standard benchmark for testing classifiers on a large number of
        classes (369) with low resolution (32x32).

    Content
        - **Images:** 168,236 black-and-white images (32x32 pixels).
        - **Labels:** 369 distinct classes, identified by their symbol id.
        - **Splits:** The dataset includes 10 pre-defined folds. Each fold is
          available as its own builder config (``fold-1`` ... ``fold-10``);
          ``fold-1`` is the default train/test split.
    """

    VERSION = datasets.Version("1.0.0")

    # One config per pre-defined fold; the config name doubles as the fold
    # directory name looked up inside the archive by `_generate_examples`.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=f"fold-{i}",
            version=datasets.Version("1.0.0"),
            description=f"HASYv2 dataset using fold {i} as the test set.",
        )
        for i in range(1, 11)
    ]
    DEFAULT_CONFIG_NAME = "fold-1"

    _HASYV2_URL = "https://zenodo.org/record/259444/files/HASYv2.tar.bz2?download=1"

    # Both splits point at the same archive: the split is selected by the CSV
    # file read from inside it, not by a separate download.
    SOURCE = {
        "homepage": "https://github.com/MartinThoma/HASY",
        "citation": """@article{thoma2017hasyv2,
title={The hasyv2 dataset},
author={Thoma, Martin},
journal={arXiv preprint arXiv:1701.08380},
year={2017}}""",
        "assets": {"train": _HASYV2_URL, "test": _HASYV2_URL},
    }

    def _info(self):
        """Describe the features (image + class label) of the selected config."""
        return datasets.DatasetInfo(
            description=f"HASYv2 dataset (Config: {self.config.name})",
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "label": datasets.ClassLabel(names=self._labels()),
                }
            ),
            supervised_keys=("image", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    @staticmethod
    def _read_split_labels(tar, members, csv_suffix):
        """Map image basenames to symbol ids using a split CSV in the archive.

        Args:
            tar: Open ``tarfile.TarFile`` to read from.
            members: Member list of ``tar`` (as returned by ``getmembers()``).
            csv_suffix: Path suffix identifying the CSV member to read.

        Returns:
            Dict of ``{image basename: symbol id string}``. Empty when no
            member matches ``csv_suffix`` or the member cannot be read as a
            file.
        """
        csv_member = next((m for m in members if m.name.endswith(csv_suffix)), None)
        if csv_member is None:
            return {}

        handle = tar.extractfile(csv_member)
        if handle is None:  # extractfile() returns None for non-file members
            return {}
        with handle:
            lines = handle.read().decode("utf-8").splitlines()

        # Only the basename is kept: images are later matched against archive
        # members by file name, independent of the directory part of "path".
        return {
            os.path.basename(row["path"]): row["symbol_id"]
            for row in csv.DictReader(lines)
        }

    def _generate_examples(self, data_path, split):
        """Yield ``(key, example)`` pairs for one split of the selected fold.

        The split CSV ``classification-task/<config name>/<split>.csv`` inside
        the archive lists which images belong to the split and their symbol
        ids; every archive file whose basename appears in that CSV is decoded
        and yielded.

        Args:
            data_path: Path to the bzip2-compressed tar archive.
            split: Split name; selects ``<split>.csv`` in the fold directory.

        Yields:
            ``(member name, {"image": RGB PIL image, "label": symbol id})``.
            Nothing is yielded (and a ``RuntimeWarning`` is emitted) when the
            split CSV is missing or lists no images.
        """
        import warnings  # local import keeps this block self-contained

        csv_suffix = f"classification-task/{self.config.name}/{split}.csv"

        with tarfile.open(data_path, "r:bz2") as tar:
            members = tar.getmembers()
            labels_by_filename = self._read_split_labels(tar, members, csv_suffix)

            if not labels_by_filename:
                # Best-effort behaviour is preserved (empty split), but make
                # the cause visible instead of failing silently.
                warnings.warn(
                    f"No entries found for '{csv_suffix}' in '{data_path}'; "
                    f"the '{split}' split will be empty.",
                    RuntimeWarning,
                )
                return

            for member in members:
                if not member.isfile():
                    continue
                filename = os.path.basename(member.name)
                if filename not in labels_by_filename:
                    continue

                handle = tar.extractfile(member)
                if handle is None:
                    continue
                with handle:
                    image_bytes = handle.read()

                # Decode from an in-memory copy so the image does not depend
                # on the (now closed) archive file object.
                image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                yield (
                    member.name,
                    {
                        "image": image,
                        "label": labels_by_filename[filename],
                    },
                )

    @staticmethod
    def _labels():
        """Returns the list of 369 symbol IDs as strings."""
        # fmt: off
        return [
            "31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
            "41", "42", "43", "44", "45", "46", "47", "48", "49", "50",
            "51", "52", "53", "54", "55", "56", "59", "70", "71", "72",
            "73", "74", "75", "76", "77", "78", "79", "81", "82", "87",
            "88", "89", "90", "91", "92", "93", "94", "95", "96", "97",
            "98", "99", "100", "101", "102", "103", "104", "105", "106", "107",
            "108", "110", "111", "112", "113", "114", "115", "116", "117", "150",
            "151", "152", "153", "154", "155", "156", "157", "158", "159", "160",
            "161", "162", "163", "164", "165", "166", "167", "168", "169", "170",
            "171", "174", "175", "176", "177", "178", "179", "180", "181", "182",
            "183", "184", "185", "186", "187", "188", "189", "190", "191", "192",
            "193", "194", "195", "196", "197", "254", "257", "259", "260", "261",
            "262", "263", "264", "265", "266", "267", "268", "269", "508", "510",
            "511", "512", "513", "514", "517", "520", "521", "523", "524", "526",
            "527", "528", "529", "530", "531", "532", "533", "534", "535", "536",
            "537", "538", "539", "540", "541", "542", "544", "549", "550", "553",
            "555", "562", "564", "574", "577", "582", "583", "584", "591", "595",
            "600", "601", "603", "604", "605", "607", "608", "609", "610", "611",
            "612", "613", "614", "615", "616", "617", "618", "620", "621", "622",
            "630", "631", "634", "635", "636", "639", "640", "644", "647", "650",
            "661", "671", "678", "679", "683", "684", "698", "711", "712", "713",
            "716", "728", "739", "741", "743", "748", "751", "753", "756", "757",
            "758", "759", "761", "762", "763", "764", "765", "767", "768", "770",
            "771", "775", "777", "778", "783", "785", "786", "788", "791", "792",
            "801", "809", "812", "817", "822", "823", "827", "837", "838", "881",
            "882", "884", "885", "886", "887", "888", "889", "890", "891", "892",
            "894", "901", "912", "913", "914", "915", "916", "917", "918", "919",
            "920", "921", "922", "923", "924", "934", "936", "941", "943", "944",
            "945", "946", "947", "948", "949", "950", "951", "953", "956", "957",
            "958", "959", "960", "965", "968", "971", "972", "973", "974", "977",
            "992", "993", "994", "995", "996", "997", "998", "999", "1000", "1004",
            "1005", "1006", "1007", "1008", "1010", "1011", "1012", "1013", "1016", "1018",
            "1019", "1031", "1037", "1042", "1045", "1046", "1051", "1053", "1062", "1064",
            "1065", "1066", "1074", "1075", "1077", "1078", "1079", "1080", "1082", "1086",
            "1090", "1093", "1101", "1102", "1103", "1111", "1112", "1115", "1116", "1117",
            "1168", "1169", "1177", "1184", "1185", "1187", "1314", "1315", "1316", "1317",
            "1369", "1371", "1374", "1382", "1385", "1394", "1395", "1396", "1400",
        ]
        # fmt: on