# Source code for stable_datasets.images.arabic_digits

import io
from zipfile import ZipFile

import datasets
import numpy as np
from PIL import Image
from tqdm import tqdm

from stable_datasets.utils import BaseDatasetBuilder


class ArabicDigits(BaseDatasetBuilder):
    """Arabic Handwritten Digits Dataset.

    70,000 grayscale 28x28 images of Arabic digits (0-9) written by 700
    participants. Both splits are distributed as flattened-pixel CSV files
    inside a single zip archive.
    """

    VERSION = datasets.Version("1.0.0")

    # Single source-of-truth for dataset provenance + download locations.
    SOURCE = {
        "homepage": "https://github.com/mloey/Arabic-Handwritten-Digits-Dataset",
        "assets": {
            # Both splits come from the same CSV zip file
            "train": "https://raw.githubusercontent.com/mloey/Arabic-Handwritten-Digits-Dataset/master/Arabic%20Handwritten%20Digits%20Dataset%20CSV.zip",
            "test": "https://raw.githubusercontent.com/mloey/Arabic-Handwritten-Digits-Dataset/master/Arabic%20Handwritten%20Digits%20Dataset%20CSV.zip",
        },
        "citation": """@inproceedings{el2016cnn,
title={CNN for handwritten arabic digits recognition based on LeNet-5},
author={El-Sawy, Ahmed and Hazem, EL-Bakry and Loey, Mohamed},
booktitle={International conference on advanced intelligent systems and informatics},
pages={566--575},
year={2016},
organization={Springer}
}""",
    }

    def _info(self):
        """Return the dataset metadata (features, homepage, citation)."""
        return datasets.DatasetInfo(
            description="""Arabic Handwritten Digits Dataset containing 70,000 images of Arabic digits (0-9) written by 700 participants. 
Images are 28x28 grayscale pixels.""",
            features=datasets.Features(
                {"image": datasets.Image(), "label": datasets.ClassLabel(names=[str(i) for i in range(10)])}
            ),
            supervised_keys=("image", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Generate (index, {"image", "label"}) examples from the CSV zip archive.

        Args:
            data_path: Local path to the downloaded zip archive.
            split: Either "train" or "test"; selects which CSV pair to read.

        Yields:
            Tuples of ``(idx, {"image": PIL.Image, "label": int})``.
        """
        # File names inside the zip
        if split == "train":
            images_file = "csvTrainImages 60k x 784.csv"
            labels_file = "csvTrainLabel 60k x 1.csv"
        else:  # test
            # NOTE(review): name is asymmetric with the train/label files —
            # confirm the archive really contains "csvTestImages.csv" and not
            # "csvTestImages 10k x 784.csv".
            images_file = "csvTestImages.csv"
            labels_file = "csvTestLabel 10k x 1.csv"

        with ZipFile(data_path, "r") as archive:
            # Load images CSV (each row is 784 flattened pixels)
            with archive.open(images_file) as f:
                content = f.read().decode("utf-8")
            images = np.loadtxt(io.StringIO(content), delimiter=",", dtype=np.uint8)

            # Each row is one 28x28 image stored column-major (MATLAB origin):
            # reshape row-major, then swap the pixel axes of each image.
            # (A single reshape with order="F" over the whole (N, 784) array
            # would interleave pixels across different samples.)
            # ascontiguousarray so each per-image view is safe for PIL.
            images = np.ascontiguousarray(images.reshape(-1, 28, 28).transpose(0, 2, 1))

            # Load labels CSV (one label per line)
            with archive.open(labels_file) as f:
                content = f.read().decode("utf-8")
            labels = np.loadtxt(io.StringIO(content), dtype=np.int32)

        # Generate examples
        for idx, (image, label) in enumerate(
            tqdm(zip(images, labels), total=len(labels), desc=f"Processing {split} set")
        ):
            # Convert numpy array to PIL Image
            pil_image = Image.fromarray(image, mode="L")  # "L" for grayscale
            yield idx, {"image": pil_image, "label": int(label)}