Source code for stable_datasets.images.arabic_digits
import io
from zipfile import ZipFile
import datasets
import numpy as np
from PIL import Image
from tqdm import tqdm
from stable_datasets.utils import BaseDatasetBuilder
[docs]
class ArabicDigits(BaseDatasetBuilder):
    """Arabic Handwritten Digits Dataset.

    Builder that reads the train/test CSV files out of the dataset's zip
    archive and yields 28x28 grayscale images together with a digit label
    (one of the ten classes "0".."9").
    """

    VERSION = datasets.Version("1.0.0")

    # Single source-of-truth for dataset provenance + download locations.
    SOURCE = {
        "homepage": "https://github.com/mloey/Arabic-Handwritten-Digits-Dataset",
        "assets": {
            # Both splits come from the same CSV zip file
            "train": "https://raw.githubusercontent.com/mloey/Arabic-Handwritten-Digits-Dataset/master/Arabic%20Handwritten%20Digits%20Dataset%20CSV.zip",
            "test": "https://raw.githubusercontent.com/mloey/Arabic-Handwritten-Digits-Dataset/master/Arabic%20Handwritten%20Digits%20Dataset%20CSV.zip",
        },
        "citation": """@inproceedings{el2016cnn,
title={CNN for handwritten arabic digits recognition based on LeNet-5},
author={El-Sawy, Ahmed and Hazem, EL-Bakry and Loey, Mohamed},
booktitle={International conference on advanced intelligent systems and informatics},
pages={566--575},
year={2016},
organization={Springer}
}""",
    }

    def _info(self):
        """Return the dataset metadata (features, supervised keys, homepage, citation)."""
        return datasets.DatasetInfo(
            description="""Arabic Handwritten Digits Dataset containing 70,000 images of Arabic digits (0-9)
written by 700 participants. Images are 28x28 grayscale pixels.""",
            features=datasets.Features(
                {"image": datasets.Image(), "label": datasets.ClassLabel(names=[str(i) for i in range(10)])}
            ),
            supervised_keys=("image", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Generate examples from the CSV zip archive.

        Args:
            data_path: Location of the CSV zip archive, passed to ``ZipFile``.
            split: ``"train"`` selects the training CSV files; any other
                value selects the test CSV files.

        Yields:
            ``(idx, example)`` pairs where ``example`` is a dict with a PIL
            grayscale ``"image"`` and an integer ``"label"``.

        Raises:
            KeyError: If an expected CSV member is missing from the archive.
            ValueError: If the image and label files hold a different number
                of rows.
        """
        # File names inside the zip
        if split == "train":
            images_file = "csvTrainImages 60k x 784.csv"
            labels_file = "csvTrainLabel 60k x 1.csv"
        else:  # test
            # NOTE(review): member name aligned with the "<N>k x <cols>" pattern
            # used by the other three files; confirm against the archive listing.
            images_file = "csvTestImages 10k x 784.csv"
            labels_file = "csvTestLabel 10k x 1.csv"

        def load_csv(archive, member, **loadtxt_kwargs):
            # Stream the member through a text wrapper instead of reading and
            # decoding the whole file into an extra in-memory string first.
            with archive.open(member) as raw, io.TextIOWrapper(raw, encoding="utf-8") as text:
                return np.loadtxt(text, **loadtxt_kwargs)

        with ZipFile(data_path, "r") as archive:
            # Load images CSV (each row is 784 flattened pixels).
            # ndmin=2 keeps a single-row file two-dimensional.
            images = load_csv(archive, images_file, delimiter=",", dtype=np.uint8, ndmin=2)
            # Load labels CSV. ndmin=1 keeps a single-row file a sequence so
            # len() and iteration below still work.
            labels = load_csv(archive, labels_file, dtype=np.int32, ndmin=1)

        # Reshape from (N, 784) to (N, 28, 28) using Fortran order (MATLAB origin)
        images = images.reshape(-1, 28, 28, order="F")

        # zip() would silently drop the surplus rows and hide misaligned data.
        if len(images) != len(labels):
            raise ValueError(
                f"Mismatched {split} data: {len(images)} images in {images_file!r} "
                f"but {len(labels)} labels in {labels_file!r}"
            )

        # Generate examples
        for idx, (image, label) in enumerate(
            tqdm(zip(images, labels), total=len(labels), desc=f"Processing {split} set")
        ):
            # A 2-D uint8 array is interpreted by Pillow as 8-bit grayscale ("L"),
            # so the explicit ``mode`` argument (deprecated in Pillow 11.3) is omitted.
            pil_image = Image.fromarray(image)
            yield idx, {"image": pil_image, "label": int(label)}