# Source code for lours.utils.difftools

"""Set of tools to differentiate datasets or evaluators"""

from collections.abc import Iterable
from warnings import warn

from lours.dataset import Dataset
from lours.utils.label_map_merger import IncompatibleLabelMapsError, merge_label_maps



# [docs]
def dataset_diff(
    left_dataset: Dataset,
    right_dataset: Dataset,
    exclude_image_columns: Iterable[str] = (),
    exclude_annotations_columns: Iterable[str] = (),
) -> tuple[Dataset, Dataset, Dataset]:
    """Differentiate two datasets and construct the difference datasets, only containing
    elements that are in one of the two datasets but not the other

    This function outputs the differences with 2 datasets that are constructed with
    images and annotations specific to each dataset and a third dataset with common
    images and annotations.

    As such, you should theoretically be able to reconstruct the left dataset with
    the first difference dataset and the common dataset, and reconstruct the right
    dataset with the second difference dataset and the common dataset.

    Note:
        if one dataset has a column in its dataframes the other dataset doesn't have,
        and that column is not included in ``exclude_image_columns`` or
        ``exclude_annotations_columns``, the dataframes and thus the datasets will be
        considered entirely different, and the common dataset will be empty

    Note:
        if ``exclude_image_columns`` or ``exclude_annotations_columns`` is not empty,
        it is not guaranteed to be able to reconstruct left or right dataset with common
        datasets and difference datasets, only the datasets minus the excluded columns.

    Note:
        if the label maps of both datasets cannot be merged, a ``RuntimeWarning`` is
        emitted and the datasets are considered entirely different. The same applies
        when the image or annotations dataframes do not share the same set of columns.

    Note:
        a cell that is missing (NaN / None) in both datasets is considered identical.
        A cell that is missing in only one of the two datasets is a difference.

    Args:
        left_dataset: left dataset to compare
        right_dataset: right dataset to compare
        exclude_image_columns: list of names of columns to ignore in image dataframes
            for the comparison.
        exclude_annotations_columns: list of names of columns to ignore in annotations
            dataframes for the comparison.

    Returns:
        tuple with 3 datasets
         - dataset with images and annotations that are specific to ``left_dataset``
         - dataset with images and annotations that are specific to ``right_dataset``
         - dataset with images and annotations that are common to both input datasets.
    """
    # The merge result is discarded: the call is only used as a compatibility check.
    try:
        merge_label_maps(left_dataset.label_map, right_dataset.label_map)
    except IncompatibleLabelMapsError:
        warn("Incompatible label maps, dataset cannot have mutual info", RuntimeWarning)
        return left_dataset, right_dataset, Dataset()

    # Materialize the iterables once: they are used both for the truthiness test
    # below and as arguments to ``drop``.
    exclude_annotations_columns = list(exclude_annotations_columns)
    exclude_image_columns = list(exclude_image_columns)
    left_images, left_annotations = left_dataset.images, left_dataset.annotations
    right_images, right_annotations = right_dataset.images, right_dataset.annotations
    # ``errors="ignore"`` so that excluding a column absent from one (or both)
    # dataset is not an error.
    left_images = left_images.drop(columns=exclude_image_columns, errors="ignore")
    right_images = right_images.drop(columns=exclude_image_columns, errors="ignore")
    left_annotations = left_annotations.drop(
        columns=exclude_annotations_columns, errors="ignore"
    )
    right_annotations = right_annotations.drop(
        columns=exclude_annotations_columns, errors="ignore"
    )

    if exclude_annotations_columns or exclude_image_columns:
        # Construct new datasets with the excluded columns removed and compare
        # those instead. The recursive call has no excluded columns, so it goes
        # straight to the comparison below.
        return dataset_diff(
            left_dataset.loc[:, left_images.columns].loc_annot[
                :, left_annotations.columns
            ],
            right_dataset.loc[:, right_images.columns].loc_annot[
                :, right_annotations.columns
            ],
        )

    # If datasets are equal, both difference datasets are empty and the common
    # dataset is the left dataset itself.
    if left_images.equals(right_images) and left_annotations.equals(right_annotations):
        return left_dataset.loc[[]], right_dataset.loc[[]], left_dataset

    if set(left_images.columns) != set(right_images.columns):
        warn(
            "Column mismatch between Image DataFrames, consider using"
            " 'exclude_image_columns' argument, or comparing after manually excluding"
            " not shared columns in your datasets images frames",
            RuntimeWarning,
        )
        return left_dataset, right_dataset, Dataset()

    if set(left_annotations.columns) != set(right_annotations.columns):
        warn(
            "Column mismatch between Annotations DataFrames, consider using"
            " 'exclude_annotations_columns' argument, or comparing after manually"
            " excluding not shared columns in your datasets annotations frames",
            RuntimeWarning,
        )
        return left_dataset, right_dataset, Dataset()

    left_image_ids = set(left_images.index)
    right_image_ids = set(right_images.index)

    common_image_ids = left_image_ids & right_image_ids
    # The same list is used to index both frames so that their rows are in the
    # same order for the element-wise comparison.
    common_image_ids_list = list(common_image_ids)
    only_left_image_ids = left_image_ids - right_image_ids
    only_right_image_ids = right_image_ids - left_image_ids

    # compare images that share the same id across both datasets
    common_left_images = left_images.loc[common_image_ids_list]
    common_right_images = right_images.loc[common_image_ids_list]

    changed_values = common_left_images != common_right_images
    # Missing values never compare equal with ``!=``, so cells that are missing on
    # BOTH sides must be explicitly marked as unchanged, the same way pandas does
    # with the `equals` method. A cell missing on one side only is a real change
    # and must stay flagged.
    changed_values[common_left_images.isna() & common_right_images.isna()] = False
    changed_images = changed_values.any(axis=1)
    changed_images_ids = set(common_left_images.loc[changed_images].index)

    left_annotations_ids = set(
        left_dataset.loc[common_image_ids_list].annotations.index
    )
    right_annotations_ids = set(
        right_dataset.loc[common_image_ids_list].annotations.index
    )
    changed_images_annotations_ids = set(
        left_dataset.loc[list(changed_images_ids)].annotations.index
    ) | set(right_dataset.loc[list(changed_images_ids)].annotations.index)

    # annotations that are the same but are linked to images that are different
    # are not considered the same
    common_annotations_ids = (
        left_annotations_ids & right_annotations_ids
    ) - changed_images_annotations_ids
    common_annotations_ids_list = list(common_annotations_ids)
    only_left_annotations_ids = left_annotations_ids - right_annotations_ids
    only_right_annotations_ids = right_annotations_ids - left_annotations_ids

    # Now compare annotations that share the same id and are linked to images that
    # are the same across datasets
    common_left_annot = left_annotations.loc[common_annotations_ids_list]
    common_right_annot = right_annotations.loc[common_annotations_ids_list]
    changed_annotations_values = common_left_annot != common_right_annot
    # Same as for images, None values in both frames should be considered the same
    changed_annotations_values[common_left_annot.isna() & common_right_annot.isna()] = (
        False
    )
    changed_annotations = changed_annotations_values.any(axis=1)
    changed_annotations_ids = set(common_left_annot.loc[changed_annotations].index)

    # resulting left dataset is composed of:
    # - images that are only in left dataset and all their annotations
    # - images that are different from right dataset and all
    #   their annotations
    # - annotation that are only in left dataset and their linked image
    # - annotations that are different from right dataset and their linked image
    only_left_dataset = (
        left_dataset.loc[list(only_left_image_ids | changed_images_ids)]
        + left_dataset.loc_annot[
            list(only_left_annotations_ids | changed_annotations_ids)
        ].remove_empty_images()
    )

    # Same applied for right dataset
    only_right_dataset = (
        right_dataset.loc[list(only_right_image_ids | changed_images_ids)]
        + right_dataset.loc_annot[
            list(only_right_annotations_ids | changed_annotations_ids)
        ].remove_empty_images()
    )

    # Resulting common dataset is composed of images that are in both datasets and
    # that are the same across datasets, and only their annotations if they are
    # in both datasets and are the same across datasets.
    common_dataset = (
        left_dataset.loc[list(common_image_ids - changed_images_ids)]
        .loc_annot[list(common_annotations_ids - changed_annotations_ids)]
        .remove_empty_images()
    )

    return only_left_dataset, only_right_dataset, common_dataset