import io
import tarfile
import datasets
import pandas as pd
from PIL import Image
from stable_datasets.utils import BaseDatasetBuilder
class CUB200(BaseDatasetBuilder):
    """Builder for the Caltech-UCSD Birds-200-2011 (CUB-200-2011) dataset.

    Both the ``train`` and ``test`` assets point at the same
    ``CUB_200_2011.tgz`` archive; the archive's own ``train_test_split.txt``
    is what separates the two splits when examples are generated.
    """

    VERSION = datasets.Version("1.0.0")

    SOURCE = {
        "homepage": "https://www.vision.caltech.edu/datasets/cub_200_2011/",
        "assets": {
            "train": "https://data.caltech.edu/records/65de6-vp158/files/CUB_200_2011.tgz?download=1",
            "test": "https://data.caltech.edu/records/65de6-vp158/files/CUB_200_2011.tgz?download=1",
        },
        "citation": """@techreport{WahCUB_200_2011,
Title = {The Caltech-UCSD Birds-200-2011 Dataset},
Author = {Wah, C. and Branson, S. and Welinder, P. and Perona, P. and Belongie, S.},
Year = {2011},
Institution = {California Institute of Technology},
Number = {CNS-TR-2011-001}}""",
    }

    def _info(self):
        """Return the dataset metadata: an image feature plus a class label."""
        schema = datasets.Features(
            {
                "image": datasets.Image(),
                "label": datasets.ClassLabel(names=self._labels()),
            }
        )
        return datasets.DatasetInfo(
            description="""The Caltech-UCSD Birds-200-2011 dataset consists of 11,788 images of 200 bird species.""",
            features=schema,
            supervised_keys=("image", "label"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Yield ``(image_id, example)`` pairs for one split, read directly from the tarball.

        Args:
            data_path: Location of the gzip-compressed archive, handed to
                ``tarfile.open``.
            split: ``"train"`` selects the rows whose ``is_training`` flag is 1;
                any other value selects the rows whose flag is 0.

        Yields:
            A tuple of the image id and a dict holding the image converted to
            RGB (``"image"``) and its zero-based class index (``"label"``).
        """
        archive_root = "CUB_200_2011"

        with tarfile.open(data_path, "r:gz") as tar:

            def load_table(file_name, value_column):
                # Each metadata file is parsed as two whitespace-separated
                # columns: the image id followed by one value.
                with tar.extractfile(f"{archive_root}/{file_name}") as handle:
                    raw = handle.read()
                return pd.read_csv(
                    io.BytesIO(raw),
                    sep=r"\s+",
                    header=None,
                    names=["image_id", value_column],
                )

            class_table = load_table("image_class_labels.txt", "label")
            path_table = load_table("images.txt", "file_path")
            flag_table = load_table("train_test_split.txt", "is_training")

            # Join everything on image_id, starting from the image list.
            merged = path_table.merge(class_table, on="image_id")
            merged = merged.merge(flag_table, on="image_id")

            # Shift the class ids down by one so labels start at 0.
            merged["label"] = merged["label"] - 1

            # Keep only the rows flagged for the requested split.
            wanted_flag = 1 if split == "train" else 0
            selected = merged.loc[merged["is_training"] == wanted_flag]

            for _, record in selected.iterrows():
                member_name = f"{archive_root}/images/{record['file_path']}"

                # Pull the encoded bytes out of the archive, then decode them.
                with tar.extractfile(member_name) as member:
                    encoded = member.read()
                picture = Image.open(io.BytesIO(encoded)).convert("RGB")

                example = {"image": picture, "label": record["label"]}
                yield record["image_id"], example

    @staticmethod
    def _labels():
        """Return the 200 class names used by the ``label`` feature.

        The position of a name in this list is the integer label emitted by
        ``_generate_examples`` (the archive's class id minus one), so the order
        must not be changed.
        """
        return [
            "Black_footed_Albatross",
            "Laysan_Albatross",
            "Sooty_Albatross",
            "Groove_billed_Ani",
            "Crested_Auklet",
            "Least_Auklet",
            "Parakeet_Auklet",
            "Rhinoceros_Auklet",
            "Brewer_Blackbird",
            "Red_winged_Blackbird",
            "Rusty_Blackbird",
            "Yellow_headed_Blackbird",
            "Bobolink",
            "Indigo_Bunting",
            "Lazuli_Bunting",
            "Painted_Bunting",
            "Cardinal",
            "Spotted_Catbird",
            "Gray_Catbird",
            "Yellow_breasted_Chat",
            "Eastern_Towhee",
            "Chuck_will_Widow",
            "Brandt_Cormorant",
            "Red_faced_Cormorant",
            "Pelagic_Cormorant",
            "Bronzed_Cowbird",
            "Shiny_Cowbird",
            "Brown_Creeper",
            "American_Crow",
            "Fish_Crow",
            "Black_billed_Cuckoo",
            "Mangrove_Cuckoo",
            "Yellow_billed_Cuckoo",
            "Gray_crowned_Rosy_Finch",
            "Purple_Finch",
            "Northern_Flicker",
            "Acadian_Flycatcher",
            "Great_Crested_Flycatcher",
            "Least_Flycatcher",
            "Olive_sided_Flycatcher",
            "Scissor_tailed_Flycatcher",
            "Vermilion_Flycatcher",
            "Yellow_bellied_Flycatcher",
            "Frigatebird",
            "Northern_Fulmar",
            "Gadwall",
            "American_Goldfinch",
            "European_Goldfinch",
            "Boat_tailed_Grackle",
            "Eared_Grebe",
            "Horned_Grebe",
            "Pied_billed_Grebe",
            "Western_Grebe",
            "Blue_Grosbeak",
            "Evening_Grosbeak",
            "Pine_Grosbeak",
            "Rose_breasted_Grosbeak",
            "Pigeon_Guillemot",
            "California_Gull",
            "Glaucous_winged_Gull",
            "Heermann_Gull",
            "Herring_Gull",
            "Ivory_Gull",
            "Ring_billed_Gull",
            "Slaty_backed_Gull",
            "Western_Gull",
            "Anna_Hummingbird",
            "Ruby_throated_Hummingbird",
            "Rufous_Hummingbird",
            "Green_Violetear",
            "Long_tailed_Jaeger",
            "Pomarine_Jaeger",
            "Blue_Jay",
            "Florida_Jay",
            "Green_Jay",
            "Dark_eyed_Junco",
            "Tropical_Kingbird",
            "Gray_Kingbird",
            "Belted_Kingfisher",
            "Green_Kingfisher",
            "Pied_Kingfisher",
            "Ringed_Kingfisher",
            "White_breasted_Kingfisher",
            "Red_legged_Kittiwake",
            "Horned_Lark",
            "Pacific_Loon",
            "Mallard",
            "Western_Meadowlark",
            "Hooded_Merganser",
            "Red_breasted_Merganser",
            "Mockingbird",
            "Nighthawk",
            "Clark_Nutcracker",
            "White_breasted_Nuthatch",
            "Baltimore_Oriole",
            "Hooded_Oriole",
            "Orchard_Oriole",
            "Scott_Oriole",
            "Ovenbird",
            "Brown_Pelican",
            "White_Pelican",
            "Western_Wood_Pewee",
            "Sayornis",
            "American_Pipit",
            "Whip_poor_Will",
            "Horned_Puffin",
            "Common_Raven",
            "White_necked_Raven",
            "American_Redstart",
            "Geococcyx",
            "Loggerhead_Shrike",
            "Great_Grey_Shrike",
            "Baird_Sparrow",
            "Black_throated_Sparrow",
            "Brewer_Sparrow",
            "Chipping_Sparrow",
            "Clay_colored_Sparrow",
            "House_Sparrow",
            "Field_Sparrow",
            "Fox_Sparrow",
            "Grasshopper_Sparrow",
            "Harris_Sparrow",
            "Henslow_Sparrow",
            "Le_Conte_Sparrow",
            "Lincoln_Sparrow",
            "Nelson_Sharp_tailed_Sparrow",
            "Savannah_Sparrow",
            "Seaside_Sparrow",
            "Song_Sparrow",
            "Tree_Sparrow",
            "Vesper_Sparrow",
            "White_crowned_Sparrow",
            "White_throated_Sparrow",
            "Cape_Glossy_Starling",
            "Bank_Swallow",
            "Barn_Swallow",
            "Cliff_Swallow",
            "Tree_Swallow",
            "Scarlet_Tanager",
            "Summer_Tanager",
            "Arctic_Tern",
            "Black_Tern",
            "Caspian_Tern",
            "Common_Tern",
            "Elegant_Tern",
            "Forster_Tern",
            "Least_Tern",
            "Green_tailed_Towhee",
            "Brown_Thrasher",
            "Sage_Thrasher",
            "Black_capped_Vireo",
            "Blue_headed_Vireo",
            "Philadelphia_Vireo",
            "Red_eyed_Vireo",
            "Warbling_Vireo",
            "White_eyed_Vireo",
            "Yellow_throated_Vireo",
            "Bay_breasted_Warbler",
            "Black_and_white_Warbler",
            "Black_throated_Blue_Warbler",
            "Blue_winged_Warbler",
            "Canada_Warbler",
            "Cape_May_Warbler",
            "Cerulean_Warbler",
            "Chestnut_sided_Warbler",
            "Golden_winged_Warbler",
            "Hooded_Warbler",
            "Kentucky_Warbler",
            "Magnolia_Warbler",
            "Mourning_Warbler",
            "Myrtle_Warbler",
            "Nashville_Warbler",
            "Orange_crowned_Warbler",
            "Palm_Warbler",
            "Pine_Warbler",
            "Prairie_Warbler",
            "Prothonotary_Warbler",
            "Swainson_Warbler",
            "Tennessee_Warbler",
            "Wilson_Warbler",
            "Worm_eating_Warbler",
            "Yellow_Warbler",
            "Northern_Waterthrush",
            "Louisiana_Waterthrush",
            "Bohemian_Waxwing",
            "Cedar_Waxwing",
            "American_Three_toed_Woodpecker",
            "Pileated_Woodpecker",
            "Red_bellied_Woodpecker",
            "Red_cockaded_Woodpecker",
            "Red_headed_Woodpecker",
            "Downy_Woodpecker",
            "Bewick_Wren",
            "Cactus_Wren",
            "Carolina_Wren",
            "House_Wren",
            "Marsh_Wren",
            "Rock_Wren",
            "Winter_Wren",
            "Common_Yellowthroat",
        ]