Source code for stable_datasets.images.linnaeus5

import io

import datasets
import rarfile
from PIL import Image

from stable_datasets.utils import BaseDatasetBuilder



[docs]
class Linnaeus5(BaseDatasetBuilder):
    """Linnaeus 5 Dataset

    Abstract
    The Linnaeus 5 dataset contains 1,600 RGB images sized 256x256 pixels, categorized into 5 classes: berry, bird, dog, flower, and other (negative set). It was created to benchmark fine-grained classification and object recognition tasks.

    Context
    While many datasets focus on broad object categories (like CIFAR-10), Linnaeus 5 offers a focused challenge on specific natural objects plus a "negative" class ('other'). It serves as a good middle-ground benchmark between simple digit recognition (MNIST) and large-scale natural image classification (ImageNet).

    Content
    The dataset consists of:
    - **Images:** 8,000 color images (256x256 pixels).
    - **Classes:** 5 categories (berry, bird, dog, flower, other).
    - **Splits:** Pre-split into Training (1,200 images per class) and Test (400 images per class).
    """

    VERSION = datasets.Version("1.0.0")

    SOURCE = {
        "homepage": "http://chaladze.com/l5/",
        "citation": """@article{chaladze2017linnaeus,
                      title={Linnaeus 5 dataset for machine learning},
                      author={Chaladze, G and Kalatozishvili, L},
                      journal={chaladze.com},
                      year={2017}}""",
        "assets": {
            "data": "http://chaladze.com/l5/img/Linnaeus%205%20256X256.rar",
        },
    }

    def _info(self):
        return datasets.DatasetInfo(
            description="Linnaeus 5 dataset with 5 classes (berry, bird, dog, flower, other).",
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "label": datasets.ClassLabel(names=self._labels()),
                }
            ),
            supervised_keys=("image", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _split_generators(self, dl_manager):
        source = self._source()
        url = source["assets"]["data"]

        archive_path = dl_manager.download(url)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "archive_path": archive_path,
                    "split_name": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "archive_path": archive_path,
                    "split_name": "test",
                },
            ),
        ]

    def _generate_examples(self, archive_path, split_name):
        """Iterate over the RAR archive and yield images matching the split."""

        with rarfile.RarFile(archive_path) as rf:
            for member in rf.infolist():
                if member.isdir():
                    continue

                filename = member.filename
                if f"/{split_name}/" in filename.lower() and filename.lower().endswith((".jpg", ".jpeg")):
                    try:
                        parts = filename.replace("\\", "/").split("/")
                        label_name = parts[-2]
                    except IndexError:
                        continue

                    if label_name in self._labels():
                        with rf.open(member) as f:
                            image_bytes = f.read()

                        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

                        yield (
                            filename,
                            {
                                "image": image,
                                "label": label_name,
                            },
                        )

    @staticmethod
    def _labels():
        return ["berry", "bird", "dog", "flower", "other"]