Source code for stable_datasets.images.linnaeus5

import io

import datasets
import rarfile
from PIL import Image

from stable_datasets.utils import BaseDatasetBuilder


class Linnaeus5(BaseDatasetBuilder):
    """Linnaeus 5 Dataset

    Abstract
        The Linnaeus 5 dataset contains 8,000 RGB images sized 256x256 pixels,
        categorized into 5 classes: berry, bird, dog, flower, and other
        (negative set). It was created to benchmark fine-grained classification
        and object recognition tasks.

    Context
        While many datasets focus on broad object categories (like CIFAR-10),
        Linnaeus 5 offers a focused challenge on specific natural objects plus
        a "negative" class ('other'). It serves as a good middle-ground
        benchmark between simple digit recognition (MNIST) and large-scale
        natural image classification (ImageNet).

    Content
        The dataset consists of:

        - **Images:** 8,000 color images (256x256 pixels).
        - **Classes:** 5 categories (berry, bird, dog, flower, other).
        - **Splits:** Pre-split into Training (1,200 images per class) and
          Test (400 images per class).
    """

    VERSION = datasets.Version("1.0.0")

    # Provenance of the dataset: homepage, BibTeX citation and the single RAR
    # archive that holds both splits.
    SOURCE = {
        "homepage": "http://chaladze.com/l5/",
        "citation": """@article{chaladze2017linnaeus, title={Linnaeus 5 dataset for machine learning}, author={Chaladze, G and Kalatozishvili, L}, journal={chaladze.com}, year={2017}}""",
        "assets": {
            "data": "http://chaladze.com/l5/img/Linnaeus%205%20256X256.rar",
        },
    }

    def _info(self):
        """Return the dataset metadata: an image feature plus a 5-way class label."""
        return datasets.DatasetInfo(
            description="Linnaeus 5 dataset with 5 classes (berry, bird, dog, flower, other).",
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "label": datasets.ClassLabel(names=self._labels()),
                }
            ),
            supervised_keys=("image", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _split_generators(self, dl_manager):
        """Download the archive once and declare the train and test splits.

        Both splits live in the same RAR file, so each generator receives the
        same ``archive_path`` and only differs in ``split_name``.
        """
        # NOTE(review): ``_source`` is not defined in this class; presumably it
        # is provided by ``BaseDatasetBuilder`` and returns ``SOURCE`` — confirm.
        source = self._source()
        url = source["assets"]["data"]
        archive_path = dl_manager.download(url)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "archive_path": archive_path,
                    "split_name": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "archive_path": archive_path,
                    "split_name": "test",
                },
            ),
        ]

    def _generate_examples(self, archive_path, split_name):
        """Iterate over the RAR archive and yield images matching the split.

        Args:
            archive_path: Path to the downloaded RAR archive.
            split_name: Lower-case name of the split directory to read
                (``"train"`` or ``"test"``).

        Yields:
            ``(key, example)`` pairs where ``key`` is the member's path inside
            the archive and ``example`` holds the decoded RGB ``image`` and its
            ``label`` name.
        """
        # Build the lookup once instead of re-creating the list per member.
        valid_labels = frozenset(self._labels())

        with rarfile.RarFile(archive_path) as rf:
            for member in rf.infolist():
                if member.isdir():
                    continue

                filename = member.filename
                # Normalize separators and case *before* any matching so the
                # split test and the label extraction see the same path.
                parts = filename.replace("\\", "/").lower().split("/")

                if not parts[-1].endswith((".jpg", ".jpeg")):
                    continue
                # The split directory must sit above the label directory, i.e.
                # anywhere before the last two components (<label>/<file>).
                if split_name not in parts[:-2]:
                    continue

                label_name = parts[-2]
                if label_name not in valid_labels:
                    continue

                with rf.open(member) as f:
                    image_bytes = f.read()
                image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

                yield (
                    filename,
                    {
                        "image": image,
                        "label": label_name,
                    },
                )

    @staticmethod
    def _labels():
        """Return the class names in label-index order."""
        return ["berry", "bird", "dog", "flower", "other"]