Source code for lours.dataset.merge

from os.path import commonpath
from pathlib import Path

import pandas as pd

from lours.dataset import Dataset
from lours.utils.column_booleanizer import broadcast_booleanization
from lours.utils.label_map_merger import merge_label_maps
from lours.utils.testing import assert_frame_intersections_equal


[docs] def merge_datasets( dataset1: Dataset, dataset2: Dataset, allow_overlapping_image_ids: bool = True, realign_label_map: bool = False, ignore_index: bool = False, mark_origin: bool = False, overwrite_origin: bool = False, ) -> Dataset: """Merge two datasets and return a unique dataset object containing Samples from both. Result's images_root will be the common path of both datasets, and the image relative paths will be updated accordingly. Result's label map will be the superset of both label map, provided one is included in the other. Notes: - If possible, booleanized columns for images and annotations will be broadcast together. See :func:`lours.utils.column_booleanizer.broadcast_booleanization` - If one of the dataset has an absolute path as ``images_root``, the other dataset images root path will also be converted to absolute. - If both datasets have the same name, the output will have the same name as well. - If datasets have a different name, the output will have the concatenation of both names separate by a "+" sign. The merge output of "A" and "B" will be thus names "A+B". - If one dataset has no name (``dataset.name`` is ``None``), the output will take the name of the other. - If ``mark_origin`` is selected, it will be effective only if datasets have different actual names (not ``None``) Args: dataset1: First dataset to merge. dataset2: Second dataset to merge with dataset1. This dataset must be compatible with the first one, i.e. one label map is included with the other, image and annotation ids are mutually exclusives between datasets (unless `ignore_index` is False), and booleanized columns are compatible with each other. allow_overlapping_image_ids: if set to True, will try to join images dataframes with overlapping ids. The whole rows (i.e. with values from columns present in both dataframes) must match, as well as the images_root. In that case, annotations with this image_id (from self or other) will be assumed to come from the same image. Defaults to True realign_label_map: If set to True, will try to remap classes of dataset2 to match the label map fo dataset1, to avoid a potential error due to incompatible label maps. ignore_index: if set to True, will ignore overlapping ids for images and annotations and reset them. Will update the ``image_id`` column in the annotations accordingly. Note that this option makes the former option useless. Defaults to False. mark_origin: If set to True, and if both datasets have a different name, will add two columns "origina_dataset_name" and "origin" for images and annotations dataframes, indicating respectively the name of the origin dataset, and its id in the original dataset. Defaults to True. overwrite_origin: If set to True, will overwrite already existing columns in input datasets dataframes. Otherwise, will only mark origin if it's not present. Defaults to False. Raises: ValueError: Error if the two datasets are incompatible (see above) Returns: Merged dataset. """ needs_new_name = ( dataset1.dataset_name is not None and dataset2.dataset_name is not None and dataset2.dataset_name != dataset1.dataset_name ) def mark_origin_to_dataset(dataset: Dataset) -> Dataset: dataset_columns = set(dataset.images.columns) & set(dataset.annotations.columns) if ( "origin" in dataset_columns and "origin_id" in dataset_columns and not overwrite_origin ): return dataset return dataset.from_template( images=dataset.images.assign( origin=dataset.dataset_name, origin_id=dataset.images.index, ), annotations=dataset.annotations.assign( origin=dataset.dataset_name, origin_id=dataset.annotations.index, ), ) if mark_origin and needs_new_name: dataset1 = mark_origin_to_dataset(dataset1) dataset2 = mark_origin_to_dataset(dataset2) if ignore_index: return merge_datasets( dataset1.reset_index(), dataset2.reset_index(len(dataset1)), allow_overlapping_image_ids=True, ignore_index=False, ) # images_root to grab images might not be the same # Get the common path and update images relative paths to # be relative to that new path if dataset1.images_root != dataset2.images_root: if dataset1.images_root.is_absolute() or dataset2.images_root.is_absolute(): images_root = commonpath( [ dataset1.images_root.absolute(), dataset2.images_root.absolute(), ] ) else: images_root = commonpath([dataset1.images_root, dataset2.images_root]) images_root = Path(images_root) dataset1 = dataset1.reset_images_root(images_root) dataset2 = dataset2.reset_images_root(images_root) else: images_root = dataset1.images_root if realign_label_map: dataset2 = dataset2.remap_from_other(dataset1) label_map = merge_label_maps( dataset1.label_map, dataset2.label_map, method="outer" ) else: label_map = merge_label_maps( dataset1.label_map, dataset2.label_map, method="outer" ) dataset1_images, dataset2_images, booleanized_image_columns = ( broadcast_booleanization( dataset1.images, dataset2.images, booleanized_columns1=dataset1.booleanized_columns["images"], booleanized_columns2=dataset2.booleanized_columns["images"], ) ) dataset1_annotations, dataset2_annotations, booleanized_annotations_columns = ( broadcast_booleanization( dataset1.annotations, dataset2.annotations, booleanized_columns1=dataset1.booleanized_columns["annotations"], booleanized_columns2=dataset2.booleanized_columns["annotations"], ) ) dataset1_images_ids = set(dataset1.images.index) dataset2_images_ids = set(dataset2.images.index) mutual_images_ids = dataset1_images_ids & dataset2_images_ids dataset1_images_columns = set(dataset1.images.columns) dataset2_images_columns = set(dataset2.images.columns) mutual_images_columns = dataset1_images_columns & dataset2_images_columns if mutual_images_ids and not allow_overlapping_image_ids: raise ValueError( "Overlapping image ids not permitted. Consider using the" " allow_overlapping_image_ids or ignore_index options" ) assert_frame_intersections_equal( dataset1_images.drop(["origin", "origin_id"], axis=1, errors="ignore"), dataset2_images.drop(["origin", "origin_id"], axis=1, errors="ignore"), ) # Concat horizontally by extending images from dataset1 with columns from dataset2 # and then vertically by extending images with dataset2 images which id is not # in dataset1 images index. dataset1_images = dataset1_images.join( dataset2_images.loc[ list(mutual_images_ids), list(dataset2_images_columns - mutual_images_columns), ] ) dataset2_images = dataset2_images.loc[ list(dataset2_images_ids - mutual_images_ids), : ] images = pd.concat([dataset1_images, dataset2_images]) # Merge annotations. mutual_instance_ids = set(dataset1_annotations.index).intersection( set(dataset2_annotations.index) ) # Only reset index of dataset2's annotations if there is overlap. # However, keep the index of first dataset as is if mutual_instance_ids: dataset2_annotations.index += ( dataset1_annotations.index.max() - dataset2_annotations.index.min() + 1 ) annotations = pd.concat([dataset1_annotations, dataset2_annotations]) annotations.index.name = "id" if needs_new_name: output_dataset_name = f"{dataset1.dataset_name}+{dataset2.dataset_name}" elif dataset2.dataset_name is None: output_dataset_name = dataset1.dataset_name else: output_dataset_name = dataset2.dataset_name output = Dataset( images_root=images_root, images=images, annotations=annotations, label_map=label_map, dataset_name=output_dataset_name, ) output.booleanized_columns["images"] = booleanized_image_columns output.booleanized_columns["annotations"] = booleanized_annotations_columns return output