# Source code for lours.dataset.merge

from os.path import commonpath
from pathlib import Path

import pandas as pd

from lours.dataset import Dataset
from lours.utils.column_booleanizer import broadcast_booleanization
from lours.utils.label_map_merger import merge_label_maps
from lours.utils.testing import assert_frame_intersections_equal



# [docs]
def merge_datasets(
    dataset1: Dataset,
    dataset2: Dataset,
    allow_overlapping_image_ids: bool = True,
    realign_label_map: bool = False,
    ignore_index: bool = False,
    mark_origin: bool = False,
    overwrite_origin: bool = False,
) -> Dataset:
    """Merge two datasets into a single dataset containing samples from both.

    The result's ``images_root`` is the common path of both datasets' roots, and
    the image relative paths are updated accordingly. The result's label map is
    the outer merge of both label maps.

    Notes:
        - If possible, booleanized columns for images and annotations will be broadcast
          together. See :func:`lours.utils.column_booleanizer.broadcast_booleanization`
        - If one of the datasets has an absolute path as ``images_root``, the other
          dataset's images root path will also be converted to absolute.
        - If both datasets have the same name, the output will have the same name
          as well.
        - If datasets have different names, the output name will be the
          concatenation of both names separated by a "+" sign. The merge output of
          "A" and "B" will thus be named "A+B".
        - If one dataset has no name (``dataset.dataset_name`` is ``None``), the
          output will take the name of the other.
        - If ``mark_origin`` is selected, it will be effective only if datasets have
          different actual names (not ``None``).
        - If annotation ids overlap, the annotation ids of ``dataset2`` are shifted
          to start right after the largest annotation id of ``dataset1``.
          Annotation ids of ``dataset1`` are kept as is.

    Args:
        dataset1: First dataset to merge.
        dataset2: Second dataset to merge with dataset1. This dataset must be
            compatible with the first one, i.e. label maps can be merged, image
            ids are mutually exclusive between datasets (unless
            ``allow_overlapping_image_ids`` or ``ignore_index`` is True), and
            booleanized columns are compatible with each other.
        allow_overlapping_image_ids: If set to True, will try to join images
            dataframes with overlapping ids. The whole rows (i.e. with values from
            columns present in both dataframes) must match, as well as
            the images_root. In that case, annotations with this image_id
            (from dataset1 or dataset2) will be assumed to come from the same
            image. Defaults to True.
        realign_label_map: If set to True, will try to remap classes of dataset2 to
            match the label map of dataset1, to avoid a potential error due to
            incompatible label maps. Defaults to False.
        ignore_index: If set to True, will ignore overlapping ids
            for images and annotations and reset them. Will update the ``image_id``
            column in the annotations accordingly. Note that this option makes
            ``allow_overlapping_image_ids`` useless. Defaults to False.
        mark_origin: If set to True, and if both datasets have a different name, will
            add two columns "origin" and "origin_id" to images and
            annotations dataframes, indicating respectively the name
            of the origin dataset, and the row id in the origin dataset.
            Defaults to False.
        overwrite_origin: If set to True, will overwrite already existing columns in
            input datasets dataframes. Otherwise, will only mark origin if it's not
            present. Defaults to False.

    Raises:
        ValueError: If image ids overlap while ``allow_overlapping_image_ids`` is
            False. The helpers used for label maps, booleanized columns and frame
            comparison may raise their own errors on incompatible inputs.

    Returns:
        Merged dataset.
    """
    needs_new_name = (
        dataset1.dataset_name is not None
        and dataset2.dataset_name is not None
        and dataset2.dataset_name != dataset1.dataset_name
    )

    def mark_origin_to_dataset(dataset: Dataset) -> Dataset:
        """Return ``dataset`` with "origin" / "origin_id" columns on both frames."""
        dataset_columns = set(dataset.images.columns) & set(dataset.annotations.columns)
        # Keep existing marks unless the caller explicitly asked to overwrite them.
        if not overwrite_origin and {"origin", "origin_id"} <= dataset_columns:
            return dataset
        return dataset.from_template(
            images=dataset.images.assign(
                origin=dataset.dataset_name,
                origin_id=dataset.images.index,
            ),
            annotations=dataset.annotations.assign(
                origin=dataset.dataset_name,
                origin_id=dataset.annotations.index,
            ),
        )

    if mark_origin and needs_new_name:
        dataset1 = mark_origin_to_dataset(dataset1)
        dataset2 = mark_origin_to_dataset(dataset2)

    if ignore_index:
        # Origin marking (if any) is already done above with the original ids, so
        # ``mark_origin`` is deliberately not forwarded: it would overwrite
        # "origin_id" with the reset ids. ``realign_label_map`` must be forwarded,
        # otherwise it would be silently dropped.
        return merge_datasets(
            dataset1.reset_index(),
            dataset2.reset_index(len(dataset1)),
            allow_overlapping_image_ids=True,
            realign_label_map=realign_label_map,
            ignore_index=False,
        )

    # images_root to grab images might not be the same
    # Get the common path and update images relative paths to
    # be relative to that new path
    if dataset1.images_root != dataset2.images_root:
        if dataset1.images_root.is_absolute() or dataset2.images_root.is_absolute():
            images_root = commonpath(
                [
                    dataset1.images_root.absolute(),
                    dataset2.images_root.absolute(),
                ]
            )
        else:
            images_root = commonpath([dataset1.images_root, dataset2.images_root])
        images_root = Path(images_root)
        dataset1 = dataset1.reset_images_root(images_root)
        dataset2 = dataset2.reset_images_root(images_root)
    else:
        images_root = dataset1.images_root

    if realign_label_map:
        dataset2 = dataset2.remap_from_other(dataset1)
    label_map = merge_label_maps(dataset1.label_map, dataset2.label_map, method="outer")

    dataset1_images, dataset2_images, booleanized_image_columns = (
        broadcast_booleanization(
            dataset1.images,
            dataset2.images,
            booleanized_columns1=dataset1.booleanized_columns["images"],
            booleanized_columns2=dataset2.booleanized_columns["images"],
        )
    )
    dataset1_annotations, dataset2_annotations, booleanized_annotations_columns = (
        broadcast_booleanization(
            dataset1.annotations,
            dataset2.annotations,
            booleanized_columns1=dataset1.booleanized_columns["annotations"],
            booleanized_columns2=dataset2.booleanized_columns["annotations"],
        )
    )

    # Ids and columns are computed on the broadcast frames, since those are the
    # ones actually joined and concatenated below.
    # Boolean mask over dataset2 images: True where the id also exists in dataset1.
    is_mutual_image = dataset2_images.index.isin(dataset1_images.index)

    if is_mutual_image.any() and not allow_overlapping_image_ids:
        raise ValueError(
            "Overlapping image ids not permitted. Consider using the"
            " allow_overlapping_image_ids or ignore_index options"
        )

    assert_frame_intersections_equal(
        dataset1_images.drop(["origin", "origin_id"], axis=1, errors="ignore"),
        dataset2_images.drop(["origin", "origin_id"], axis=1, errors="ignore"),
    )

    # Concat horizontally by extending images from dataset1 with columns from dataset2
    # and then vertically by extending images with dataset2 images which id is not
    # in dataset1 images index. Masks and ordered lists (rather than sets) keep the
    # row and column order of dataset2 deterministic.
    dataset2_only_columns = [
        column
        for column in dataset2_images.columns
        if column not in dataset1_images.columns
    ]
    dataset1_images = dataset1_images.join(
        dataset2_images.loc[is_mutual_image, dataset2_only_columns]
    )
    dataset2_images = dataset2_images.loc[~is_mutual_image]

    images = pd.concat([dataset1_images, dataset2_images])

    # Merge annotations.
    # Only shift the index of dataset2's annotations if there is overlap.
    # However, keep the index of first dataset as is
    if dataset2_annotations.index.isin(dataset1_annotations.index).any():
        index_offset = (
            dataset1_annotations.index.max() - dataset2_annotations.index.min() + 1
        )
        # Build a new frame instead of assigning to ``.index`` in place: the frame
        # may be the very object held by the caller's dataset (depends on
        # broadcast_booleanization, not visible here) and inputs must stay intact.
        dataset2_annotations = dataset2_annotations.set_axis(
            dataset2_annotations.index + index_offset, axis=0
        )
    annotations = pd.concat([dataset1_annotations, dataset2_annotations])
    annotations.index.name = "id"

    if needs_new_name:
        output_dataset_name = f"{dataset1.dataset_name}+{dataset2.dataset_name}"
    elif dataset2.dataset_name is None:
        output_dataset_name = dataset1.dataset_name
    else:
        output_dataset_name = dataset2.dataset_name
    output = Dataset(
        images_root=images_root,
        images=images,
        annotations=annotations,
        label_map=label_map,
        dataset_name=output_dataset_name,
    )
    output.booleanized_columns["images"] = booleanized_image_columns
    output.booleanized_columns["annotations"] = booleanized_annotations_columns

    return output