Source code for stable_datasets.images.clevrer

import json
import os
import zipfile
from pathlib import Path

import datasets

from stable_datasets.utils import BaseDatasetBuilder, _default_dest_folder, bulk_download


class CLEVRER(BaseDatasetBuilder):
    """CLEVRER: CoLlision Events for Video REpresentation and Reasoning.

    A diagnostic video dataset for systematic evaluation of computational models
    on a wide range of reasoning tasks. The dataset includes four types of
    questions: descriptive (e.g., "what color"), explanatory ("what's responsible
    for"), predictive ("what will happen next"), and counterfactual ("what if").

    The dataset contains 20,000 synthetic videos of moving and colliding objects.
    Each video is 5 seconds long and contains 128 frames with resolution 480 x 320.

    Splits:
        - train: 10,000 videos (index 0 - 9999)
        - validation: 5,000 videos (index 10000 - 14999)
        - test: 5,000 videos (index 15000 - 19999)
    """

    VERSION = datasets.Version("1.0.0")

    SOURCE = {
        "homepage": "http://clevrer.csail.mit.edu/",
        "assets": {
            "train_videos": "http://data.csail.mit.edu/clevrer/videos/train/video_train.zip",
            "train_annotations": "http://data.csail.mit.edu/clevrer/annotations/train/annotation_train.zip",
            "train_questions": "http://data.csail.mit.edu/clevrer/questions/train.json",
            "validation_videos": "http://data.csail.mit.edu/clevrer/videos/validation/video_validation.zip",
            "validation_annotations": "http://data.csail.mit.edu/clevrer/annotations/validation/annotation_validation.zip",
            "validation_questions": "http://data.csail.mit.edu/clevrer/questions/validation.json",
            "test_videos": "http://data.csail.mit.edu/clevrer/videos/test/video_test.zip",
            "test_questions": "http://data.csail.mit.edu/clevrer/questions/test.json",
        },
        "citation": """@inproceedings{yi2020clevrer,
            title={CLEVRER: CoLlision Events for Video REpresentation and Reasoning},
            author={Yi, Kexin and Gan, Chuang and Li, Yunzhu and Kohli, Pushmeet and Wu, Jiajun and Torralba, Antonio and Tenenbaum, Joshua B},
            booktitle={International Conference on Learning Representations},
            year={2020}
        }""",
    }

    def _info(self):
        return datasets.DatasetInfo(
            description=(
                "CLEVRER is a diagnostic video dataset for temporal and causal reasoning. "
                "It contains 20,000 synthetic videos of moving and colliding objects, with four "
                "types of questions: descriptive, explanatory, predictive, and counterfactual."
            ),
            features=datasets.Features(
                {
                    "video": datasets.Video(),
                    "scene_index": datasets.Value("int32"),
                    "video_filename": datasets.Value("string"),
                    # Store questions as a JSON string to avoid nested Sequence issues
                    "questions_json": datasets.Value("string"),
                    # Store annotations as a JSON string
                    "annotations_json": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage=self.SOURCE["homepage"],
            license="CC0",
            citation=self.SOURCE["citation"],
        )

    def _split_generators(self, dl_manager):
        source = self._source()
        assets = source["assets"]

        download_dir = getattr(self, "_raw_download_dir", None)
        if download_dir is None:
            download_dir = _default_dest_folder()
        download_dir = Path(download_dir)

        # Download all files concurrently using bulk_download
        urls = list(assets.values())
        downloaded_paths = bulk_download(urls, dest_folder=download_dir)
        url_to_path = dict(zip(urls, downloaded_paths))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "videos_path": url_to_path[assets["train_videos"]],
                    "annotations_path": url_to_path[assets["train_annotations"]],
                    "questions_path": url_to_path[assets["train_questions"]],
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "videos_path": url_to_path[assets["validation_videos"]],
                    "annotations_path": url_to_path[assets["validation_annotations"]],
                    "questions_path": url_to_path[assets["validation_questions"]],
                    "split": "validation",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "videos_path": url_to_path[assets["test_videos"]],
                    "annotations_path": None,  # Test split has no annotations
                    "questions_path": url_to_path[assets["test_questions"]],
                    "split": "test",
                },
            ),
        ]

    def _generate_examples(self, videos_path, annotations_path, questions_path, split):
        # Load questions
        with open(questions_path) as f:
            questions_data = json.load(f)

        # Create a mapping from scene_index to questions
        scene_to_questions = {item["scene_index"]: item for item in questions_data}

        # Load annotations if available (not available for the test split)
        scene_to_annotations = {}
        if annotations_path is not None:
            with zipfile.ZipFile(annotations_path, "r") as ann_zip:
                for filename in ann_zip.namelist():
                    if filename.endswith(".json"):
                        with ann_zip.open(filename) as f:
                            ann_data = json.load(f)
                            scene_to_annotations[ann_data["scene_index"]] = ann_data

        # Extract the videos archive
        extract_dir = Path(videos_path).parent / f"clevrer_{split}_videos"
        if not extract_dir.exists():
            with zipfile.ZipFile(videos_path, "r") as vid_zip:
                vid_zip.extractall(extract_dir)

        # Generate examples
        for scene_index, question_item in scene_to_questions.items():
            video_filename = question_item["video_filename"]

            # Find the video file
            video_path = self._find_video_file(extract_dir, video_filename)
            if video_path is None:
                continue

            # Get questions from the question file
            questions = question_item.get("questions", [])

            # Get annotations if available
            annotations = {}
            if scene_index in scene_to_annotations:
                annotations = scene_to_annotations[scene_index]

            yield (
                scene_index,
                {
                    "video": str(video_path),
                    "scene_index": scene_index,
                    "video_filename": video_filename,
                    "questions_json": json.dumps(questions),
                    "annotations_json": json.dumps(annotations),
                },
            )

    def _find_video_file(self, extract_dir, video_filename):
        """Find a video file in the extracted directory structure."""
        # Videos are arranged in subfolders of 1000 files each
        for root, dirs, files in os.walk(extract_dir):
            if video_filename in files:
                return Path(root) / video_filename
        return None
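
Because questions and annotations are serialized as JSON strings, downstream code has to decode them after loading. The following is a minimal usage sketch, assuming BaseDatasetBuilder exposes the standard datasets.DatasetBuilder interface (download_and_prepare() and as_dataset()); the variable names are illustrative, not part of this module.

# Minimal usage sketch. Assumption: BaseDatasetBuilder follows the standard
# datasets.DatasetBuilder API (download_and_prepare() / as_dataset()).
import json

from stable_datasets.images.clevrer import CLEVRER

builder = CLEVRER()
builder.download_and_prepare()  # downloads and extracts all assets
train = builder.as_dataset(split="train")

example = train[0]
questions = json.loads(example["questions_json"])      # list of question dicts
annotations = json.loads(example["annotations_json"])  # {} when absent (test split)
print(example["video_filename"], len(questions))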