Source code for lours.dataset.io.coco

import json
import shutil
from collections.abc import Iterable
from datetime import date
from pathlib import Path

import pandas as pd
from imageio.v3 import imread, imwrite

from lours.utils import BBOX_COLUMN_NAMES
from lours.utils.bbox_converter import (
    column_names_from_format_string,
    export_bbox,
    import_bbox,
)
from lours.utils.testing import assert_images_valid

from ..dataset import Dataset
from .common import parse_annotation_name



[docs]
def from_coco(
    coco_json: Path | str,
    images_root: Path | str | None = None,
    dataset_name: str | None = None,
    split: str | None = None,
    label_map: dict[int, str] | None = None,
    box_format: str = "XYWH",
    drop_columns: Iterable[str] = ("iscrowd", "segmentation"),
) -> Dataset:
    """Load a coco json file into a dictionary.
    Note that there is only one split per file, which needs to be given by caller.
    See `specifications`__ (only Object detection)

        .. __: https://cocodataset.org/#format-data

    Notes:
        - ``from_coco`` is compatible with bounding box annotations without
          ``category_id`` field, but then you will need to have a label map of only one
          entry, which will be assigned to every bounding box.
        - If split value is not given, it will try to deduce it from the file name.
          More specifically, it will search a ``<name>_<split>.json`` pattern and assign
          ``name`` to the dataset name and ``split`` to the split value.

    Args:
        coco_json: path of json file
        images_root: folder which file_name of images are relative to
        dataset_name: If specified, will be the dataset name, used when showing the
            dataset or exporting in other formats such as fiftyone. If not specified,
            the dataset name will be deduced from the name of the json file.
        split: split of given json file.
            If not set, will try to deduce from filename. Defaults to None.
        label_map: Optional dictionary to specify the name of each category id. If not
            set, will try to deduce it from the json itself, in the field `categories`
            at its root.
        box_format: what type of annotation the json file will have.
            It will be converted back to XYWH. Defaults to XYWH
        drop_columns: list of names of columns that need to be dropped from the parsed
            json dictionary.

    Returns:
        Loaded dataset object
    """
    # If given paths are string, convert them to Path
    coco_json = Path(coco_json)
    if images_root is not None:
        images_root = Path(images_root)
    else:
        images_root = coco_json.parent
    parsed_dataset_name, parsed_split = parse_annotation_name(
        annotations_file_path=coco_json
    )
    if dataset_name is None:
        dataset_name = parsed_dataset_name
    if split is None:
        split = parsed_split
    with open(coco_json) as f:
        coco_annotations = json.load(f)

    images = pd.json_normalize(coco_annotations["images"]).set_index("id")
    images["relative_path"] = images["file_name"].apply(Path)  # pyright: ignore
    images["type"] = images["relative_path"].apply(lambda x: x.suffix)
    if split is not None:
        images["split"] = split
    images = images.drop("file_name", axis=1)

    if label_map is None:
        try:
            label_map = {c["id"]: c["name"] for c in coco_annotations["categories"]}
        except KeyError:
            label_map = {}

    try:
        annotations = pd.json_normalize(coco_annotations["annotations"]).set_index("id")
    except KeyError:
        annotations = pd.DataFrame(
            [],
            columns=[
                *column_names_from_format_string("XYWH"),
                "image_id",
                "category_id",
            ],
        )
        annotations.index.name = "id"
        return Dataset(images_root, images, annotations, label_map)
    # Don't deal with iscrowd=1 for now
    if "iscrowd" in annotations.columns:
        annotations = annotations[annotations["iscrowd"] == 0]
    bboxes = pd.DataFrame(
        list(annotations["bbox"]),
        index=annotations.index,
        columns=column_names_from_format_string(box_format),
    )
    bboxes = import_bbox(
        bboxes, images, image_ids=annotations["image_id"], input_format=box_format
    )
    annotations = pd.concat([annotations, bboxes], axis=1)
    annotations = annotations.drop([*drop_columns, "bbox"], axis=1, errors="ignore")

    if "category_id" not in annotations.columns:
        assert len(label_map) == 1
        annotations["category_id"] = list(label_map)[0]

    if annotations["category_id"].hasnans:
        raise ValueError(
            "Some category ids in annotations are undefined. Make sure either every"
            " annotation in your coco file has a `category_id` field, or none of them"
            " have"
        )

    if "score" in annotations:
        annotations = annotations.rename(columns={"score": "confidence"})
    return Dataset(images_root, images, annotations, label_map, dataset_name)




[docs]
def from_coco_keypoints(
    coco_json: Path | str,
    images_root: Path | str | None = None,
    dataset_name: str | None = None,
    split: str | None = None,
    box_format: str = "XY",
    category_name: str | None = "head",
):
    """Special coco loading function for crowds, it will assume point box format
    (either XY or xy), only one category, with an id of 0, and a category name of person
    (unless specified otherwise in the coco format)

    Args:
        coco_json: path of json file
        images_root: folder which file_name of images are relative to
        dataset_name: If specified, will be the dataset name, used when showing the
            dataset or exporting in other formats such as fiftyone. If not specified,
            the dataset name will be deduced from the name of the json file.
        split: split of given json file.
            If not set, will try to deduce from filename. Defaults to None.
        box_format: what type of annotation the json file will have.
            It will be converted back to XYWH, with box width and height set to 0.
            Defaults to XY
        category_name: name of the only category of this coco json file.
            It will then call the ``from_coco`` original version with a label map option
            set to ``{0: category_name}``.
            If set to None, will deduce it from coco file. Defaults to "person".

    Returns:
        Loaded dataset object
    """
    return from_coco(
        coco_json=coco_json,
        images_root=images_root,
        dataset_name=dataset_name,
        split=split,
        box_format=box_format,
        label_map={0: category_name} if category_name is not None else None,
    )




[docs]
def dataset_to_coco(
    dataset: Dataset,
    output_path: Path | str,
    copy_images: bool = False,
    to_jpg: bool = True,
    overwrite_images: bool = False,
    overwrite_labels: bool = False,
    add_split_suffix: bool | None = None,
    box_format: str = "XYWH",
    version: str = "0",
    contributor: str = "XXII",
) -> None:
    """Save dataset to COCO format. Note that by default, no image or image path is
    manipulated

    Args:
        dataset: Dataset object to save
        output_path: output folder where to save the json file,
            and optionally the images. Can also be the name of the output json file when
            there is only one split value.
        copy_images: If True, will copy images linked by annotations in
            a "data" folder, similar to 51. Defaults to False.
        to_jpg: if True, along with previous option, will convert
            images to jpg if needed. Defaults to True.
        overwrite_images: if False with copy_images True,
            will skip images that are already copied. Defaults to True.
        overwrite_labels: if False,
            will skip annotation that are already created. Defaults to True.
        add_split_suffix: if True, will add the split name to the
            name of the json file. cannot be False if dataset has multiple splits.
            If not set, will add suffix only if dataset has multiple splits.
        box_format: what type of annotation the json file will have.
            It will be converted from XYWH. Defaults to XYWH
        version: Arbitrary version number for dataset metadata.
            Defaults to "0".
        contributor: Arbitrary contributor info for dataset metadata.
            Defaults to "XXII".
    """
    if copy_images:
        try:
            assert_images_valid(dataset, load_images=False)
        except AssertionError as e:
            raise ValueError(
                "Dataset images are missing, check that the images root folder is the"
                " right one"
            ) from e
    output_path = Path(output_path).resolve()
    dataset = dataset.debooleanize()
    if output_path.suffix == ".json":
        output_file_name = output_path.stem
        output_path = output_path.parent
    else:
        output_path.mkdir(exist_ok=True, parents=True)
        output_file_name = "annotations"
    now = date.today()

    if add_split_suffix is None:
        add_split_suffix = len(dataset.annotations.split.unique()) > 1

    if not add_split_suffix:
        assert (
            "split" not in dataset.annotations
            or len(dataset.annotations.split.unique()) == 1
        ), "Cannot remove split suffix because dataset has more than one split"

    if copy_images:
        output_img_paths = dataset.images["relative_path"].apply(
            lambda x: output_path / "data" / x
        )
        if to_jpg:
            output_img_paths = output_img_paths.apply(lambda x: x.with_suffix(".jpg"))
        for i, row in dataset.images.iterrows():
            input_img_path = (dataset.images_root / row["relative_path"]).resolve()
            output_img_path = output_img_paths.loc[i]  # pyright: ignore
            if overwrite_images or not output_img_path.is_file():
                output_img_path.parent.mkdir(parents=True, exist_ok=True)
                if not to_jpg or row["type"].lower() in [".jpg", ".jpeg"]:
                    shutil.copy(input_img_path, output_img_path)
                else:
                    image = imread(input_img_path)
                    imwrite(output_img_path, image[..., :3])

    if copy_images and to_jpg:
        image_paths_str = dataset.images["relative_path"].apply(
            lambda x: str(x.with_suffix(".jpg"))
        )
    else:
        image_paths_str = dataset.images["relative_path"].apply(str)
    converted_images = dataset.images.drop(["relative_path", "type"], axis=1)
    converted_images["file_name"] = image_paths_str
    converted_bbox = export_bbox(
        dataset.annotations, dataset.images, output_format=box_format
    )
    converted_bboxes = pd.Series(
        converted_bbox[column_names_from_format_string(box_format)].to_numpy().tolist(),
        index=dataset.annotations.index,
        name="bbox",
    )
    converted_annotations = dataset.annotations.drop(
        [*BBOX_COLUMN_NAMES, "category_str"], axis=1
    )
    if "confidence" in converted_annotations:
        converted_annotations = converted_annotations.rename(
            columns={"confidence": "score"}
        )
    converted_annotations = pd.concat([converted_annotations, converted_bboxes], axis=1)
    converted_annotations["iscrowd"] = 0

    if "split" not in converted_images:
        converted_images["split"] = None
    if "split" not in converted_annotations:
        converted_annotations["split"] = dataset.images.loc[
            converted_annotations["image_id"], "split"
        ]

    for split_name, split_imgs in converted_images.groupby("split", dropna=False):
        if add_split_suffix and not pd.isna(split_name):
            output_json_path = output_path / f"{output_file_name}_{split_name}.json"
        else:
            output_json_path = output_path / f"{output_file_name}.json"
        if not overwrite_labels and output_json_path.is_file():
            continue
        print(f"saving {split_name} to {output_json_path.name}...")
        info_dict = {
            "description": "Made with XXII dataset helper",
            "url": "https://xxii.fr",
            "version": version,
            "year": now.year,
            "contributor": contributor,
        }
        images_list = split_imgs.reset_index().to_dict("records")
        if pd.isna(split_name):
            split_annotations = converted_annotations[
                converted_annotations["split"].isna()
            ]
        else:
            split_annotations = converted_annotations[
                converted_annotations["split"] == split_name
            ]
        print(split_annotations)
        print(split_annotations.keys())
        annotations_list = split_annotations.reset_index().to_dict("records")

        categories_list = [
            {
                "supercategory": "",
                "id": cat_id,
                "name": cat_name,
            }  # Not implemented yet,
            for cat_id, cat_name in dataset.label_map.items()
        ]
        coco_dict = {
            "info": info_dict,
            "images": images_list,
            "annotations": annotations_list,
            "categories": categories_list,
        }
        with open(output_json_path, "w") as f:
            json.dump(coco_dict, f, indent=2)