Source code for lours.utils.testing

"""Set of functions used to check assertions on datasets.

Mainly useful in unit tests.
"""

from collections.abc import Hashable, Iterable, Sequence

import numpy as np
import pandas as pd
from imageio.v3 import imread
from pandas.testing import assert_frame_equal
from tqdm.auto import tqdm

from ..dataset import Dataset
from ..utils import BBOX_COLUMN_NAMES

# Unpack the four bounding-box column names from BBOX_COLUMN_NAMES so that the
# checks in this module can reference each column through its own constant.
BOX_XMIN, BOX_YMIN, BOX_WIDTH, BOX_HEIGHT = BBOX_COLUMN_NAMES


[docs] def assert_column( input_df: pd.DataFrame, assertion: pd.Series | np.ndarray, message: str = "", n_first_occurrences: int | None = 1, ) -> None: """From a given input dataframe and a boolean series of the same length, construct an error message if the boolean has at least one False value, with the row in input dataframe corresponding to the row of the first occurrence of False value in the assertion series Args: input_df: Dataframe to show the row from, to better understand what went wrong assertion: Boolean Series of the same length as ``input_df``, expected to be full of True value message: Message to display when raising the error. Will be followed with information of faulty rows n_first_occurrences: Number of occurrences to show in case of a failure. Useful when showing duplicate values. If set to None, will show all occurrences. Raises: AssertionError: If there is at least one occurrence of False in ``assertion`` Series, raise an assertion and print the corresponding row of first occurrence in ``input_df`` """ assert len(input_df) == len(assertion) assert n_first_occurrences is None or n_first_occurrences > 0 assertion = assertion.astype(bool) if not assertion.all(): failure = input_df[~assertion].iloc[:n_first_occurrences] if n_first_occurrences is None: raise AssertionError(f"Assertion failed. {message}. {failure}") elif n_first_occurrences == 1: raise AssertionError( f"Assertion failed. {message}. First occurrence at row" f" {failure.index[0]} : {failure.iloc[0]}" ) else: raise AssertionError( f"Assertion failed. {message}. First occurrences at rows" f" {failure.index[:n_first_occurrences]} :\n{failure.iloc[:n_first_occurrences]}" )
def assert_columns_properly_normalized(
    input_df: pd.DataFrame, separator: str = "."
) -> None:
    """Check that the columns of a dataframe are well normalized.

    If column 'A' exists, column 'A.B' must not exist. This is useful when
    loading json files, to check that a key cannot be both a sub dictionary
    and a value.

    Column labels that are not strings (e.g. integers) cannot be a flattened
    sub-key, so they are never considered as the 'A.B' side of a conflict.

    Args:
        input_df: Input DataFrame to test.
        separator: Character used to separate names in a flattened key.
            Defaults to ".".

    Raises:
        AssertionError: If there is a column whose name starts with another
            column's name followed by ``separator``.
    """
    # Only string labels support ``startswith``; filtering them once avoids an
    # AttributeError on non-string labels and keeps the original scan order.
    string_columns = [name for name in input_df.columns if isinstance(name, str)]
    for parent in input_df.columns:
        prefix = f"{parent}{separator}"
        for child in string_columns:
            if child.startswith(prefix):
                raise AssertionError(
                    f"DataFrame is not properly normalized. Column '{parent}' cannot be"
                    f" both a value and a subdictionary, but column '{child}' exists"
                )
def assert_dataset_equal(
    dataset1: Dataset,
    dataset2: Dataset,
    ignore_index: bool = False,
    optional_columns: Iterable[str] = ("area", "confidence"),
    remove_na_columns: bool = False,
) -> None:
    """Raise an assertion error if two datasets are not equal.

    This function is mainly intended to be used in the context of unit tests.

    Rules:

    - Index order is not relevant. This is similar to the ``check_like`` option
      in :func:`pandas.testing.assert_frame_equal`
    - Indexes for rows and columns still must be the same when reordered
    - Optional columns are ignored when present in only one of the two
      datasets. When present in both, their values are compared.
    - Label maps must be the same. Again, order is ignored (as it normally is
      for dictionaries)
    - If ``ignore_index`` is set to ``True``, both datasets go through
      ``reset_index()`` before being compared, so that row indexes are not
      checked while the link between annotations' ``image_id`` and the images
      dataframe rows still is.

    Args:
        dataset1: First dataset to test.
        dataset2: Second dataset to test, must be the same according to the
            rules above or the function will raise an error.
        ignore_index: If set, ignore both annotations and images dataframe
            indexes. Defaults to False.
        optional_columns: Iterable of column names considered optional, i.e.
            only compared when present in both dataframes. Defaults to the
            column names "area" and "confidence".
        remove_na_columns: If set to True, columns whose values are all NA are
            removed from the dataframes before comparison. This more lenient
            comparison is useful for columns where absence and being entirely
            NA are treated the same, like the ``split`` column.

    Raises:
        AssertionError: Raised when the datasets are detected to be different.
    """
    optional_names = list(optional_columns)

    def compare_frames(
        left: pd.DataFrame, right: pd.DataFrame, frame_label: str
    ) -> None:
        """Compare two dataframes, ignoring optional columns present in only one."""
        if remove_na_columns:
            left = left.dropna(axis="columns", how="all")
            right = right.dropna(axis="columns", how="all")

        for name in optional_names:
            in_left = name in left.columns
            in_right = name in right.columns
            # A column present on one side only is dropped from that side;
            # a column present on both sides is kept for the comparison.
            if in_left and not in_right:
                left = left.drop(name, axis=1)
            elif in_right and not in_left:
                right = right.drop(name, axis=1)

        try:
            assert_frame_equal(left, right, check_like=True, check_dtype="equiv")
        except AssertionError as e:
            raise AssertionError(f"{frame_label} dataframes don't match") from e

    if ignore_index:
        dataset1 = dataset1.reset_index()
        dataset2 = dataset2.reset_index()

    compare_frames(dataset1.images, dataset2.images, "Images")
    compare_frames(dataset1.annotations, dataset2.annotations, "Annotations")

    assert (
        dataset1.label_map == dataset2.label_map
    ), f"label_maps don't match {dataset1.label_map} vs {dataset2.label_map}"
    assert dataset1.booleanized_columns == dataset2.booleanized_columns
def assert_frame_intersections_equal(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    """Check that two dataframes agree on their shared rows and columns.

    Sub-dataframes are built from the index values and the columns present in
    both inputs, and then compared. When the inputs share no index value or
    no column, there is nothing to compare and the check passes.

    Args:
        df1: First dataframe to test.
        df2: Second dataframe to test.

    Raises:
        AssertionError: If the sub-dataframes built from the intersections of
            indexes and columns are not the same.
    """
    shared_ids = list(set(df1.index) & set(df2.index))
    if len(shared_ids) == 0:
        return

    shared_columns = list(set(df1.columns) & set(df2.columns))
    if len(shared_columns) == 0:
        return

    # The same id and column lists are used for both selections, so both
    # sub-dataframes are laid out identically.
    left = df1.loc[shared_ids, shared_columns]
    right = df2.loc[shared_ids, shared_columns]
    try:
        assert_frame_equal(left, right)
    except AssertionError as e:
        raise AssertionError(
            "sub-Dataframes constructed from ids and columns in both DataFrames are not"
            " equal."
        ) from e
def assert_images_valid(
    dataset: Dataset,
    assert_is_symlink: bool = False,
    load_images: bool = True,
    check_exhaustive: bool = False,
) -> None:
    """Check that the image paths in the dataset are valid.

    Namely, check that all paths are indeed pointing to a file, and are of a
    valid file format that can be loaded with ``imageio``. The work is
    delegated to :func:`get_invalid_images` with ``raise_if_error=True``.

    Note:
        Todo: better error messages

    Args:
        dataset: Dataset to check.
        assert_is_symlink: If set, will check that paths are symlinks rather
            than files. Defaults to False.
        load_images: If set to True, will not only check that images are valid
            files, but also that images can be loaded (i.e. are not corrupted
            files) and that their sizes match the ones included in the
            ``dataset.images`` dataframe. Note that this makes the function
            significantly slower. Defaults to True.
        check_exhaustive: If set to True, will check that all images in the
            images_root folder are in the image dataframe, and that the dataset
            is indeed exhaustive.
    """
    get_invalid_images(
        dataset,
        check_symlink=assert_is_symlink,
        load_images=load_images,
        check_exhaustive=check_exhaustive,
        raise_if_error=True,
    )
class InvalidImage(AssertionError):
    """Assertion error signalling that an image of a dataset is not valid."""
class MissingImages(AssertionError):
    """Assertion error signalling that images are missing from a dataset."""
def get_invalid_images(
    dataset: Dataset,
    check_symlink: bool = False,
    load_images: bool = True,
    check_exhaustive: bool = False,
    raise_if_error: bool = True,
) -> pd.DataFrame:
    """Check the dataset's images and return an indexed error report.

    Namely, check that all paths are indeed pointing to a file, and are of a
    valid file format that can be loaded with ``imageio``. When a check fails,
    a row is added to the output dataframe with the same index as the faulty
    image, and info about the error in the corresponding columns.

    Args:
        dataset: Dataset to check.
        check_symlink: If set, will check that paths are symlinks rather than
            files. Defaults to False.
        load_images: If set to True, will not only check that images are valid
            files, but also that images can be loaded (i.e. are not corrupted
            files) and that their sizes match the ones included in the
            ``dataset.images`` dataframe. Note that this makes the function
            significantly slower. Defaults to True.
        check_exhaustive: If set to True, will check that all images in the
            images_root folder are in the image dataframe, and that the dataset
            is indeed exhaustive.
        raise_if_error: If set to True, will raise an error as soon as one
            image does not meet the requirements. Defaults to True.

    Raises:
        InvalidImage: Raised if ``raise_if_error`` is selected and one image is
            not valid. Can be because the path is not right, the image loading
            failed, or the metadata is not compliant with actual image data.
        MissingImages: Raised if ``raise_if_error`` is selected and some images
            were found in the ``images_root`` folder but not in the dataset's
            ``images`` dataframe. It is a subclass of ``AssertionError``.

    Returns:
        Error report in the form of a Dataframe with "reason" and
        "additional_info" columns. Index values are the same as the
        corresponding images in the original dataset, so that you can retrieve
        the faulty images' full data.
    """
    error_report: dict[Hashable, dict[str, str]] = {}

    def error(
        message: str,
        additional_info: str,
        row_id: Hashable,
        image_data: "pd.Series[str]",
    ) -> None:
        """Record a faulty image and, when requested, fail immediately."""
        error_report[row_id] = {"reason": message, "additional_info": additional_info}
        if raise_if_error:
            raise InvalidImage(
                f"{message}, {additional_info}\n row : {row_id}\n data:"
                f" {image_data.to_dict()}"
            )

    for row, img_data in tqdm(dataset.images.iterrows(), total=len(dataset)):
        if img_data["relative_path"].is_absolute():
            error("relative path is absolute", "", row, img_data)
            continue

        img_path = dataset.images_root / img_data["relative_path"]
        if check_symlink and not img_path.is_symlink():
            error("Not a symlink", "", row, img_data)
            continue

        valid_path = (
            img_path.is_symlink()
            and img_path.readlink().is_file()
            or img_path.is_file()
        )
        if not valid_path:
            error("Not a valid path", "", row, img_data)
            continue

        if not load_images:
            continue

        try:
            img = imread(img_path)
        except OSError:
            error(
                "corrupted file",
                "Image cannot be loaded with imageio",
                row,
                img_data,
            )
            continue

        if not isinstance(img_data["width"], int) or img_data["width"] <= 0:
            error(
                "Invalid image width",
                f"got {img_data['width']} pixels",
                row,
                img_data,
            )
            continue
        if not isinstance(img_data["height"], int) or img_data["height"] <= 0:
            error(
                "Invalid image height",
                f"got {img_data['height']} pixels",
                row,
                img_data,
            )
            continue

        if len(img.shape) not in [2, 3, 4]:
            error(
                "invalid image shape",
                f"Shape is with {len(img.shape)} dimensions instead of 2"
                " (grayscale), 3 (RGB/RGBA) or 4 (GIf anim)",
                row,
                img_data,
            )
            # The shape cannot be unpacked into (height, width) below, so the
            # size comparison is skipped for this image.
            continue

        if len(img.shape) == 4:
            _, height, width, _ = img.shape
        elif len(img.shape) == 3:
            height, width, _ = img.shape
        else:
            height, width = img.shape

        # NOTE: when both sizes differ, the height entry overwrites the width
        # entry in the report since both share the same row id.
        if img_data["width"] != width:
            error(
                "Image width in metadata is different from actual image width",
                f"{width} (actual) vs {img_data['width']} (metadata)",
                row,
                img_data,
            )
        if img_data["height"] != height:
            error(
                "Image height in metadata is different from actual image height",
                f"{height} (actual) vs {img_data['height']} (metadata)",
                row,
                img_data,
            )

    if check_exhaustive:
        from ..dataset import from_folder

        highest_id = dataset.images.index.max()
        # Ids of the images found on disk start above the dataset's highest id,
        # presumably so that their report rows never collide with existing ids.
        all_images = (
            from_folder(images_root=dataset.images_root)
            .reset_index(start_image_id=highest_id + 1)
            .images
        )
        missing_images = all_images.loc[
            ~all_images["relative_path"].isin(dataset.images["relative_path"]),
            "relative_path",
        ].apply(str)
        if len(missing_images) > 0 and raise_if_error:
            raise MissingImages(
                "Dataset is not exhaustive : the following images are present in"
                " images root but not in dataset image dataframe"
                f" :\n{', '.join(missing_images)}"
            )
        for row, relative_path in missing_images.items():
            # Same keys as the per-image errors so the report keeps exactly
            # the "reason" and "additional_info" columns.
            error_report[row] = {
                "reason": "missing image",
                "additional_info": relative_path,
            }

    return pd.DataFrame.from_dict(error_report, orient="index")
def assert_ids_well_formed(dataset: Dataset) -> None:
    """Assert that the ids of a dataset follow the right convention.

    - DataFrames indexes must be named "id"
    - indexes must have no duplicates
    - images ``relative_path`` column must have no duplicates
    - annotation ``image_id`` values must all be in images index
    - annotation ``category_id`` values must be in dataset's label map

    Note:
        Todo: Better error messages

    Args:
        dataset: Dataset object to test.

    Raises:
        AssertionError: If any of the rules above is broken.
    """
    # Images dataframe checks.
    images = dataset.images
    assert images.index.name == "id", (
        "dataset's image index must be named 'id', got"
        f" {images.index.name} instead"
    )
    unique_image_ids = ~images.index.duplicated(keep=False)
    assert_column(images, unique_image_ids, "Dataset image index has duplicate values")
    unique_image_paths = ~images["relative_path"].duplicated(keep=False)
    assert_column(
        images, unique_image_paths, "Dataset image relative path has duplicate values"
    )

    # Annotations dataframe checks.
    annotations = dataset.annotations
    assert annotations.index.name == "id", (
        "dataset's annotation index must be named 'id', got"
        f" {annotations.index.name} instead"
    )
    unique_annotation_ids = ~annotations.index.duplicated(keep=False)
    assert_column(
        annotations,
        unique_annotation_ids,
        "Dataset annotations index has duplicate values",
    )
    known_image = annotations["image_id"].isin(dataset.images.index)
    assert_column(
        annotations,
        known_image,
        "All image_id values in annotations must in dataset images index",
    )
    known_category = annotations["category_id"].isin(dataset.label_map.keys())
    assert_column(
        annotations,
        known_category,
        "All category ids must be in dataset label map",
    )
def assert_bounding_boxes_well_formed(
    dataset: Dataset, allow_keypoints: bool = False
) -> None:
    """Assert that bounding boxes are well-formed in the dataset's annotations.

    The work is delegated to :func:`get_malformed_bounding_boxes` with
    ``raise_if_error=True``.

    - Boxes x and y coordinates must be within their respective image size
    - Boxes width and height must be positive and so that xmax and ymax are
      within their respective image size
    - in the case of keypoints, boxes with size 0 will be tolerated

    Args:
        dataset: Dataset to test.
        allow_keypoints: If set to True, will not raise an error if a bounding
            box size (width or height) is 0. Defaults to False.
    """
    get_malformed_bounding_boxes(
        dataset, allow_keypoints=allow_keypoints, raise_if_error=True
    )
def get_malformed_bounding_boxes(
    dataset: Dataset, allow_keypoints: bool = False, raise_if_error: bool = False
) -> pd.DataFrame:
    """Report the malformed bounding boxes of the dataset's annotations.

    The report is a boolean dataframe whose index is the id of the bounding
    box in the dataset's annotations dataframe, and whose columns are the
    known reasons for a bounding box to be invalid.

    - Boxes x and y coordinates must not be negative
    - Boxes width and height must not be negative, and xmax and ymax must be
      within their respective image size
    - unless ``allow_keypoints`` is set, boxes with size 0 are invalid too

    Valid bounding boxes are NOT in the result dataframe. This means that if
    the dataset has no invalid bounding box, the result dataframe is empty,
    and that every row of the result has at least one ``True`` value.

    Args:
        dataset: Dataset to test.
        allow_keypoints: If set to True, a bounding box size (width or height)
            of 0 is not reported. Defaults to False.
        raise_if_error: If set to True, will raise an error as soon as one
            check detects an invalid bounding box. Defaults to False.

    Raises:
        AssertionError: When ``raise_if_error`` is set, raised as soon as one
            bounding box is invalid.

    Returns:
        Error report as a dataframe with boolean columns.

        - Each column is a reason why the bounding box can be faulty.
        - Each row is a faulty bounding box, with its corresponding index in
          the dataset's annotation dataframe.
        - Only the faulty bounding boxes are kept in the error report, so all
          rows have at least one value set to True.
    """
    annotations = dataset.annotations
    error_report = pd.DataFrame(index=annotations.index)

    def report_if_error(
        assertion: "pd.Series[bool]",
        error_name: str,
        message: str,
    ) -> None:
        """Store the failures of one check and optionally raise on the first one."""
        error_report[error_name] = ~assertion
        if raise_if_error:
            assert_column(annotations, assertion, message, n_first_occurrences=1)

    # (column, report column name, error message) for the ">= 0" checks.
    non_negative_checks = (
        (BOX_XMIN, "Negative X value", "Bounding boxes must have positive X values"),
        (BOX_YMIN, "Negative Y value", "Bounding boxes must have positive Y values"),
        (BOX_WIDTH, "Negative width", "Bounding boxes must have positive width"),
        (BOX_HEIGHT, "Negative height", "Bounding boxes must have positive height"),
    )
    for column, error_name, message in non_negative_checks:
        report_if_error(annotations[column] >= 0, error_name, message)

    if not allow_keypoints:
        # (column, report column name, error message) for the "> 0" checks.
        strictly_positive_checks = (
            (BOX_WIDTH, "0 width", "Bounding boxes must have strictly positive width"),
            (
                BOX_HEIGHT,
                "0 height",
                "Bounding boxes must have strictly positive height",
            ),
        )
        for column, error_name, message in strictly_positive_checks:
            report_if_error(annotations[column] > 0, error_name, message)

    right_edge = annotations[BOX_XMIN] + annotations[BOX_WIDTH]
    bottom_edge = annotations[BOX_YMIN] + annotations[BOX_HEIGHT]

    # Image sizes are looked up per annotation, then re-indexed like the
    # annotations so that the comparisons below are row-aligned.
    image_ids = annotations["image_id"]
    image_width = dataset.images.loc[image_ids, "width"]
    image_height = dataset.images.loc[image_ids, "height"]
    image_width.index = right_edge.index
    image_height.index = right_edge.index

    report_if_error(
        right_edge <= image_width,
        "right side outside of image",
        "Bounding boxes must have X values below image width",
    )
    report_if_error(
        bottom_edge <= image_height,
        "bottom side outside of image",
        "Bounding boxes must have Y values below image height",
    )

    is_faulty = error_report.any(axis=1)
    return error_report[is_faulty]
def assert_label_map_well_formed(dataset: Dataset) -> None:
    """Assert that the label map has no duplicate category name.

    Args:
        dataset: Dataset to test.

    Raises:
        AssertionError: If two entries of the label map share the same value.
    """
    category_names = pd.Series(dataset.label_map)
    has_duplicates = category_names.duplicated().any()
    assert (
        not has_duplicates
    ), f"Label map dictionary has duplicate values : {dataset.label_map}"
def assert_required_columns_present(
    input_df: pd.DataFrame, required_columns: Iterable[str], df_name: str
) -> None:
    """Check that required columns are present, raising a custom error if not.

    Args:
        input_df: Dataframe object to check.
        required_columns: Column names to find in the columns of ``input_df``.
            Any iterable is accepted (a set is no longer mandatory).
        df_name: Name of the dataframe, used to add context to the error
            message.

    Raises:
        ValueError: Raised when not all required columns are present in the
            columns of ``input_df``. Column names are listed in sorted order so
            that the message is deterministic.
    """
    # Materialize once: supports lists, tuples and one-shot iterators alike.
    required = set(required_columns)
    missing = required - set(input_df.columns)
    if missing:
        # Sets have no stable iteration order; sort (by text, so that mixed
        # label types cannot break the comparison) for a reproducible message.
        required_names = ", ".join(str(name) for name in sorted(required, key=str))
        missing_names = ", ".join(str(name) for name in sorted(missing, key=str))
        raise ValueError(
            f"DataFrame {df_name} must have all these columns"
            f" :\n{required_names}\nbut is missing"
            f" {missing_names}"
        )
def full_check_dataset_detection(
    dataset: Dataset,
    check_symlink: bool = False,
    allow_keypoints: bool = False,
    check_exhaustive: bool = False,
) -> None:
    """Perform a full check of the dataset.

    Runs, in order, the checks on ids, bounding boxes, label map and images,
    printing a progress line before each of them. Images must be reachable for
    the test to perform, since they are loaded during the last check.

    Args:
        dataset: Dataset to test.
        check_symlink: If set to True, will check that image paths are symlinks
            and not actual files. Defaults to False.
        allow_keypoints: If set to True, will not raise an error for bounding
            boxes with size 0 (width or height). Defaults to False.
        check_exhaustive: If set to True, will check that all images in the
            images_root folder are in the image dataframe, and that the dataset
            is indeed exhaustive.
    """
    # Each step is a progress banner and the check to run right after it.
    steps = (
        (
            "Checking Image and annotations Ids ...",
            lambda: assert_ids_well_formed(dataset),
        ),
        (
            "Checking Bounding boxes ..",
            lambda: assert_bounding_boxes_well_formed(
                dataset, allow_keypoints=allow_keypoints
            ),
        ),
        (
            "Checking label map ...",
            lambda: assert_label_map_well_formed(dataset),
        ),
        (
            "Checking images are valid ...",
            lambda: get_invalid_images(
                dataset,
                check_symlink=check_symlink,
                load_images=True,
                check_exhaustive=check_exhaustive,
                raise_if_error=True,
            ),
        ),
    )
    for banner, run_check in steps:
        print(banner)
        run_check()