Source code for stable_datasets.images.hasy_v2

import csv
import io
import os
import tarfile

import datasets
from PIL import Image

from stable_datasets.utils import BaseDatasetBuilder


class HASYv2(BaseDatasetBuilder):
    """HASYv2 Dataset

    Abstract
        The HASYv2 dataset contains handwritten symbol images of 369 classes.
        It includes over 168,000 samples categorized into various classes like
        Latin characters, numerals, and symbols. Each image is 32x32 pixels in
        size. The dataset was created to benchmark the classification of
        mathematical symbols and handwritten characters.

    Context
        Recognizing handwritten mathematical symbols is a challenging task due
        to the similarity between classes (e.g., '1', 'l', '|') and the large
        number of unique symbols used in scientific notation. HASYv2 serves as
        a standard benchmark for testing classifiers on a large number of
        classes (369) with low resolution (32x32).

    Content
        The dataset consists of:

        - **Images:** 168,236 black-and-white images (32x32 pixels).
        - **Labels:** 369 distinct classes.
        - **Splits:** The dataset includes 10 pre-defined folds. One builder
          config is exposed per fold (``fold-1`` ... ``fold-10``); ``fold-1``
          is the default config.
    """

    VERSION = datasets.Version("1.0.0")

    # One config per pre-defined fold; the config name doubles as the fold
    # directory name looked up inside the archive by ``_generate_examples``.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=f"fold-{i}",
            version=datasets.Version("1.0.0"),
            description=f"HASYv2 dataset using fold {i} as the test set.",
        )
        for i in range(1, 11)
    ]
    DEFAULT_CONFIG_NAME = "fold-1"

    _HASYV2_URL = "https://zenodo.org/record/259444/files/HASYv2.tar.bz2?download=1"

    # Both splits point at the same archive; the split only selects which CSV
    # inside the archive is read.
    SOURCE = {
        "homepage": "https://github.com/MartinThoma/HASY",
        "citation": """@article{thoma2017hasyv2, title={The hasyv2 dataset}, author={Thoma, Martin}, journal={arXiv preprint arXiv:1701.08380}, year={2017}}""",
        "assets": {"train": _HASYV2_URL, "test": _HASYV2_URL},
    }

    def _info(self):
        """Return the dataset metadata: an image feature and a class label.

        The label names are the symbol-ID strings returned by ``_labels()``.
        """
        return datasets.DatasetInfo(
            description=f"HASYv2 dataset (Config: {self.config.name})",
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "label": datasets.ClassLabel(names=self._labels()),
                }
            ),
            supervised_keys=("image", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Yield ``(key, example)`` pairs for one split of the configured fold.

        Args:
            data_path: Path of the bz2-compressed tar archive to read.
            split: Split name; selects ``<split>.csv`` in the fold directory.
                Presumably ``"train"`` or ``"test"`` (the keys of
                ``SOURCE["assets"]``) -- confirm against the base builder.

        Yields:
            Tuples ``(member_name, {"image": PIL.Image, "label": str})`` where
            ``member_name`` is the archive path of the image, the image has
            been converted to RGB, and the label is the symbol ID as a string.
            Nothing is yielded when the split CSV is missing or empty.
        """
        csv_suffix = f"classification-task/{self.config.name}/{split}.csv"

        with tarfile.open(data_path, "r:bz2") as tar:
            members = tar.getmembers()
            image_label_map = self._read_label_map(tar, members, csv_suffix)
            if not image_label_map:
                return

            for member in members:
                if not member.isfile():
                    continue
                # Images are matched by basename, as listed in the CSV.
                label = image_label_map.get(os.path.basename(member.name))
                if label is None:
                    continue
                extracted = tar.extractfile(member)
                if extracted is None:
                    continue
                # Read the bytes and close the handle right away so no
                # extracted file objects accumulate while iterating.
                with extracted:
                    image_bytes = extracted.read()
                image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                yield (
                    member.name,
                    {
                        "image": image,
                        "label": label,
                    },
                )

    @staticmethod
    def _read_label_map(tar, members, csv_suffix):
        """Map image basenames to symbol-ID strings using the split CSV.

        Args:
            tar: Open ``tarfile.TarFile`` to read the CSV from.
            members: The archive's member list (``tar.getmembers()``).
            csv_suffix: Path suffix identifying the CSV member.

        Returns:
            ``{basename(row["path"]): str(row["symbol_id"])}`` for every CSV
            row, or an empty dict when no member matches ``csv_suffix`` or the
            matching member cannot be extracted.
        """
        csv_member = next(
            (member for member in members if member.name.endswith(csv_suffix)),
            None,
        )
        if csv_member is None:
            return {}

        # extractfile() returns None for members that are not regular files
        # or links; treat that the same as a missing CSV instead of crashing.
        extracted = tar.extractfile(csv_member)
        if extracted is None:
            return {}
        with extracted:
            lines = extracted.read().decode("utf-8").splitlines()

        return {
            os.path.basename(row["path"]): str(row["symbol_id"])
            for row in csv.DictReader(lines)
        }

    @staticmethod
    def _labels():
        """Returns the list of 369 symbol IDs as strings."""
        return [
            "31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
            "41", "42", "43", "44", "45", "46", "47", "48", "49", "50",
            "51", "52", "53", "54", "55", "56", "59", "70", "71", "72",
            "73", "74", "75", "76", "77", "78", "79", "81", "82", "87",
            "88", "89", "90", "91", "92", "93", "94", "95", "96", "97",
            "98", "99", "100", "101", "102", "103", "104", "105", "106", "107",
            "108", "110", "111", "112", "113", "114", "115", "116", "117", "150",
            "151", "152", "153", "154", "155", "156", "157", "158", "159", "160",
            "161", "162", "163", "164", "165", "166", "167", "168", "169", "170",
            "171", "174", "175", "176", "177", "178", "179", "180", "181", "182",
            "183", "184", "185", "186", "187", "188", "189", "190", "191", "192",
            "193", "194", "195", "196", "197", "254", "257", "259", "260", "261",
            "262", "263", "264", "265", "266", "267", "268", "269", "508", "510",
            "511", "512", "513", "514", "517", "520", "521", "523", "524", "526",
            "527", "528", "529", "530", "531", "532", "533", "534", "535", "536",
            "537", "538", "539", "540", "541", "542", "544", "549", "550", "553",
            "555", "562", "564", "574", "577", "582", "583", "584", "591", "595",
            "600", "601", "603", "604", "605", "607", "608", "609", "610", "611",
            "612", "613", "614", "615", "616", "617", "618", "620", "621", "622",
            "630", "631", "634", "635", "636", "639", "640", "644", "647", "650",
            "661", "671", "678", "679", "683", "684", "698", "711", "712", "713",
            "716", "728", "739", "741", "743", "748", "751", "753", "756", "757",
            "758", "759", "761", "762", "763", "764", "765", "767", "768", "770",
            "771", "775", "777", "778", "783", "785", "786", "788", "791", "792",
            "801", "809", "812", "817", "822", "823", "827", "837", "838", "881",
            "882", "884", "885", "886", "887", "888", "889", "890", "891", "892",
            "894", "901", "912", "913", "914", "915", "916", "917", "918", "919",
            "920", "921", "922", "923", "924", "934", "936", "941", "943", "944",
            "945", "946", "947", "948", "949", "950", "951", "953", "956", "957",
            "958", "959", "960", "965", "968", "971", "972", "973", "974", "977",
            "992", "993", "994", "995", "996", "997", "998", "999", "1000", "1004",
            "1005", "1006", "1007", "1008", "1010", "1011", "1012", "1013", "1016", "1018",
            "1019", "1031", "1037", "1042", "1045", "1046", "1051", "1053", "1062", "1064",
            "1065", "1066", "1074", "1075", "1077", "1078", "1079", "1080", "1082", "1086",
            "1090", "1093", "1101", "1102", "1103", "1111", "1112", "1115", "1116", "1117",
            "1168", "1169", "1177", "1184", "1185", "1187", "1314", "1315", "1316", "1317",
            "1369", "1371", "1374", "1382", "1385", "1394", "1395", "1396", "1400",
        ]