"""Set of functions used to test some assertions on datasets.
Useful when used in unit tests
"""
from collections.abc import Hashable, Iterable, Sequence
import numpy as np
import pandas as pd
from imageio.v3 import imread
from pandas.testing import assert_frame_equal
from tqdm.auto import tqdm
from ..dataset import Dataset
from ..utils import BBOX_COLUMN_NAMES
# Unpack the four entries of ``BBOX_COLUMN_NAMES`` into one module-level constant
# each. NOTE(review): the order (xmin, ymin, width, height) is inferred from the
# constant names - confirm against ``utils.BBOX_COLUMN_NAMES``.
BOX_XMIN, BOX_YMIN, BOX_WIDTH, BOX_HEIGHT = BBOX_COLUMN_NAMES
[docs]
def assert_column(
input_df: pd.DataFrame,
assertion: pd.Series | np.ndarray,
message: str = "",
n_first_occurrences: int | None = 1,
) -> None:
"""From a given input dataframe and a boolean series of the same length, construct
an error message if the boolean has at least one False value, with the row in
input dataframe corresponding to the row of the first occurrence of False value in
the assertion series
Args:
input_df: Dataframe to show the row from, to better understand what went wrong
assertion: Boolean Series of the same length as ``input_df``, expected to be
full of True value
message: Message to display when raising the error. Will be followed with
information of faulty rows
n_first_occurrences: Number of occurrences to show in case of a failure. Useful
when showing duplicate values. If set to None, will show all occurrences.
Raises:
AssertionError: If there is at least one occurrence of False in ``assertion``
Series, raise an assertion and print the corresponding row of first
occurrence in ``input_df``
"""
assert len(input_df) == len(assertion)
assert n_first_occurrences is None or n_first_occurrences > 0
assertion = assertion.astype(bool)
if not assertion.all():
failure = input_df[~assertion].iloc[:n_first_occurrences]
if n_first_occurrences is None:
raise AssertionError(f"Assertion failed. {message}. {failure}")
elif n_first_occurrences == 1:
raise AssertionError(
f"Assertion failed. {message}. First occurrence at row"
f" {failure.index[0]} : {failure.iloc[0]}"
)
else:
raise AssertionError(
f"Assertion failed. {message}. First occurrences at rows"
f" {failure.index[:n_first_occurrences]} :\n{failure.iloc[:n_first_occurrences]}"
)
[docs]
def assert_columns_properly_normalized(
    input_df: pd.DataFrame, separator: str = "."
) -> None:
    """Check that no column is both a value and the parent of flattened sub-keys.

    A dataframe is considered properly normalized when, for every column ``'A'``,
    there is no other column named ``'A<separator>...'`` (e.g. ``'A.B'``). This is
    useful after flattening json-like data, to make sure a key is not used both as a
    plain value and as a sub-dictionary.

    Only string column labels are inspected: a non-string label (integer,
    MultiIndex tuple, ...) has no ``startswith`` method and is skipped instead of
    crashing the check.

    Args:
        input_df: Input DataFrame to test.
        separator: Character(s) separating the levels in a flattened key. Defaults
            to ".".

    Raises:
        AssertionError: If a column name exists together with another column whose
            name starts with that name followed by ``separator``.
    """
    string_columns = [name for name in input_df.columns if isinstance(name, str)]
    for parent in string_columns:
        prefix = f"{parent}{separator}"
        for candidate in string_columns:
            if candidate.startswith(prefix):
                raise AssertionError(
                    f"DataFrame is not properly normalized. Column '{parent}' cannot be"
                    f" both a value and a subdictionary, but column '{candidate}' exists"
                )
[docs]
def assert_dataset_equal(
    dataset1: Dataset,
    dataset2: Dataset,
    ignore_index: bool = False,
    optional_columns: Iterable[str] = ("area", "confidence"),
    remove_na_columns: bool = False,
) -> None:
    """Raise an assertion error when two datasets are not considered equal.

    Mainly meant for unit tests. The comparison follows these rules:

    - Row and column order is irrelevant (``check_like=True`` of
      :func:`pandas.testing.assert_frame_equal`), but once reordered, row and column
      labels must match.
    - A column listed in ``optional_columns`` is ignored when only one of the two
      dataframes has it; when both have it, its values are compared.
    - Label maps must be equal (plain ``==`` comparison).
    - ``booleanized_columns`` must be equal.
    - With ``ignore_index``, both datasets go through ``reset_index()`` before the
      comparison, so the original index values are not compared.

    Args:
        dataset1: First dataset to compare.
        dataset2: Second dataset to compare; must equal ``dataset1`` under the rules
            above.
        ignore_index: If set, call ``reset_index()`` on both datasets before
            comparing. Defaults to False.
        optional_columns: Names of columns only compared when present on both sides.
            Defaults to "area" and "confidence".
        remove_na_columns: If set, columns whose values are all missing are dropped
            from each dataframe before comparison, so an absent column and an
            all-``<NA>`` column are treated the same.

    Raises:
        AssertionError: If images, annotations, label maps or booleanized columns
            differ.
    """
    optional_names = list(optional_columns)

    def compare_frames(
        left: pd.DataFrame, right: pd.DataFrame, frame_label: str
    ) -> None:
        """Compare two frames after discarding one-sided optional columns."""
        if remove_na_columns:
            left = left.dropna(axis="columns", how="all")
            right = right.dropna(axis="columns", how="all")

        for name in optional_names:
            in_left = name in left.columns
            in_right = name in right.columns
            # Drop the column only from the side that has it alone.
            if in_left and not in_right:
                left = left.drop(name, axis=1)
            elif in_right and not in_left:
                right = right.drop(name, axis=1)

        try:
            assert_frame_equal(left, right, check_like=True, check_dtype="equiv")
        except AssertionError as mismatch:
            raise AssertionError(f"{frame_label} dataframes don't match") from mismatch

    if ignore_index:
        dataset1 = dataset1.reset_index()
        dataset2 = dataset2.reset_index()

    compare_frames(dataset1.images, dataset2.images, "Images")
    compare_frames(dataset1.annotations, dataset2.annotations, "Annotations")

    assert (
        dataset1.label_map == dataset2.label_map
    ), f"label_maps don't match {dataset1.label_map} vs {dataset2.label_map}"
    assert dataset1.booleanized_columns == dataset2.booleanized_columns
[docs]
def assert_frame_intersections_equal(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    """Check that two dataframes agree on the rows and columns they share.

    The sub-dataframes restricted to the index values and column names present in
    both inputs are extracted and compared with
    :func:`pandas.testing.assert_frame_equal`. When the inputs share no index value
    or no column, there is nothing to compare and the function returns silently.

    Args:
        df1: First dataframe to test.
        df2: Second dataframe to test.

    Raises:
        AssertionError: If the two sub-dataframes built from the shared index values
            and shared columns are not equal.
    """
    shared_rows = list(set(df1.index).intersection(df2.index))
    shared_columns = list(set(df1.columns).intersection(df2.columns))
    if not shared_rows or not shared_columns:
        return

    # Both selections use the same label lists, so rows and columns line up.
    left = df1.loc[shared_rows, shared_columns]
    right = df2.loc[shared_rows, shared_columns]
    try:
        assert_frame_equal(left, right)
    except AssertionError as mismatch:
        raise AssertionError(
            "sub-Dataframes constructed from ids and columns in both DataFrames are not"
            " equal."
        ) from mismatch
[docs]
def assert_images_valid(
    dataset: Dataset,
    assert_is_symlink: bool = False,
    load_images: bool = True,
    check_exhaustive: bool = False,
) -> None:
    """Check the images of a dataset and raise on the first problem found.

    This is a thin wrapper around ``get_invalid_images`` called with
    ``raise_if_error=True``; the error report it returns is discarded.

    Note:
        Todo: better error messages

    Args:
        dataset: Dataset to check.
        assert_is_symlink: If set, image paths must be symlinks rather than regular
            files. Forwarded as ``check_symlink``. Defaults to False.
        load_images: If set to True, images are also loaded to check that they are
            not corrupted and that their size matches the one stored in
            ``dataset.images``. This makes the function significantly slower.
            Defaults to True.
        check_exhaustive: If set to True, also check that every image found in the
            images root folder is listed in the images dataframe.
    """
    get_invalid_images(
        dataset,
        check_symlink=assert_is_symlink,
        load_images=load_images,
        check_exhaustive=check_exhaustive,
        raise_if_error=True,
    )
[docs]
class InvalidImage(AssertionError):
    """Assertion error raised when an image of a dataset fails a validity check."""
[docs]
class MissingImages(AssertionError):
    """Assertion error for images found on disk but absent from the dataset.

    Documented by ``get_invalid_images`` as the error signalling images present in
    the images root folder but not in the dataset's ``images`` dataframe.
    """
[docs]
def get_invalid_images(
    dataset: Dataset,
    check_symlink: bool = False,
    load_images: bool = True,
    check_exhaustive: bool = False,
    raise_if_error: bool = True,
) -> pd.DataFrame:
    """Check a dataset's images and return an indexed report of the faulty ones.

    For each row of ``dataset.images`` the following checks run in order, and the
    first failing one is recorded for that row:

    1. ``relative_path`` must not be absolute.
    2. With ``check_symlink``, the path under ``dataset.images_root`` must be a
       symlink.
    3. The path must point to an existing regular file (symlinks are followed).
    4. With ``load_images``: the file must load with ``imageio``, ``width`` and
       ``height`` must be strictly positive ``int`` values, the loaded array must
       have 2, 3 or 4 dimensions, and its size must match the metadata.

    Args:
        dataset: Dataset to check.
        check_symlink: If set, will check that paths are symlinks rather than
            files. Defaults to False.
        load_images: If set to True, will not only check that images are valid files,
            but also that images can be loaded (i.e. are not corrupted files) and
            that their sizes match the ones included in ``dataset.images``
            dataframe. Note that this makes the function significantly slower.
            Defaults to True.
        check_exhaustive: If set to True, will check that all images in the
            images_root folder are in the image dataframe, and that the dataset is
            indeed exhaustive.
        raise_if_error: If set to True, will raise an error as soon as one image
            does not meet the requirements, instead of collecting every problem in
            the report.

    Raises:
        InvalidImage: Raised if ``raise_if_error`` is selected and one image is not
            valid. Can be because the path is not right, the image loading failed,
            or the metadata is not compliant with actual image data.
        MissingImages: Raised if ``raise_if_error`` is selected and some images
            were found in the ``images_root`` folder but not in the dataset's
            ``images`` dataframe. It subclasses ``AssertionError``, so callers
            catching ``AssertionError`` keep working.

    Returns:
        Error report in the form of a Dataframe with "reason" and "additional_info"
        columns (present even when the report is empty). Index values are the same
        as the corresponding images in the original dataset, so that you can
        retrieve the faulty images full data. Images missing from the dataset are
        indexed with fresh ids starting after the highest existing image id.
    """
    error_report = {}

    def error(
        message: str,
        additional_info: str,
        row_id: Hashable,
        image_data: "pd.Series[str]",
    ) -> None:
        """Record a faulty row in the report, and raise if asked to fail fast."""
        error_report[row_id] = {"reason": message, "additional_info": additional_info}
        if raise_if_error:
            raise InvalidImage(
                f"{message}, {additional_info}\n row : {row_id}\n data:"
                f" {image_data.to_dict()}"
            )

    for row, img_data in tqdm(dataset.images.iterrows(), total=len(dataset)):
        if img_data["relative_path"].is_absolute():
            error("relative path is absolute", "", row, img_data)
            continue

        img_path = dataset.images_root / img_data["relative_path"]
        if check_symlink and not img_path.is_symlink():
            error("Not a symlink", "", row, img_data)
            continue

        # ``is_file`` follows symlinks, so it covers both plain files and links.
        # Checking ``readlink().is_file()`` separately would resolve a relative link
        # target against the current working directory instead of the link's folder.
        if not img_path.is_file():
            error("Not a valid path", "", row, img_data)
            continue

        if not load_images:
            continue

        try:
            img = imread(img_path)
        except OSError:
            error(
                "corrupted file",
                "Image cannot be loaded with imageio",
                row,
                img_data,
            )
            continue

        if not isinstance(img_data["width"], int) or img_data["width"] <= 0:
            error(
                "Invalid image width",
                f"got {img_data['width']} pixels",
                row,
                img_data,
            )
            continue
        if not isinstance(img_data["height"], int) or img_data["height"] <= 0:
            error(
                "Invalid image height",
                f"got {img_data['height']} pixels",
                row,
                img_data,
            )
            continue

        n_dims = len(img.shape)
        if n_dims not in [2, 3, 4]:
            error(
                "invalid image shape",
                f"Shape is with {n_dims} dimensions instead of 2"
                " (grayscale), 3 (RGB/RGBA) or 4 (GIf anim)",
                row,
                img_data,
            )
            # Without this, report mode would fall through to the 2D unpacking
            # below and crash with a ValueError.
            continue

        if n_dims == 4:
            _, height, width, _ = img.shape
        elif n_dims == 3:
            height, width, _ = img.shape
        else:
            height, width = img.shape

        # NOTE: in report mode, when both dimensions mismatch, the height entry
        # overwrites the width entry since the report holds one entry per row.
        if img_data["width"] != width:
            error(
                "Image width in metadata is different from actual image width",
                f"{width} (actual) vs {img_data['width']} (metadata)",
                row,
                img_data,
            )
        if img_data["height"] != height:
            error(
                "Image height in metadata is different from actual image height",
                f"{height} (actual) vs {img_data['height']} (metadata)",
                row,
                img_data,
            )

    if check_exhaustive:
        from ..dataset import from_folder

        # Number the images discovered on disk after the existing ids, so their
        # report rows cannot collide with rows of the dataset's own images.
        highest_id = dataset.images.index.max()
        all_images = (
            from_folder(images_root=dataset.images_root)
            .reset_index(start_image_id=highest_id + 1)
            .images
        )
        missing_images = all_images.loc[
            ~all_images["relative_path"].isin(dataset.images["relative_path"]),
            "relative_path",
        ].apply(str)
        if len(missing_images) > 0 and raise_if_error:
            raise MissingImages(
                "Dataset is not exhaustive : the following images are present in"
                " images root but not in dataset image dataframe"
                f" :\n{', '.join(missing_images)}"
            )
        for row, relative_path in missing_images.items():
            # Same keys as in ``error`` so the report has consistent columns.
            error_report[row] = {
                "reason": "missing image",
                "additional_info": relative_path,
            }

    return pd.DataFrame.from_dict(
        error_report, orient="index", columns=["reason", "additional_info"]
    )
[docs]
def assert_required_columns_present(
    input_df: pd.DataFrame, required_columns: Iterable[str], df_name: str
) -> None:
    """Check that required columns are present and raise a custom error otherwise.

    Args:
        input_df: dataframe object to check.
        required_columns: column names to find in the columns of ``input_df``. Any
            iterable is accepted (a set, as before, but also a list or tuple).
        df_name: name of the dataframe, used to add context to the error message.

    Raises:
        ValueError: Raised when not all required columns are present in the
            columns of ``input_df``. Column names are listed in sorted order so
            that the message is the same from one run to the next.
    """
    required = set(required_columns)
    missing_columns = required - set(input_df.columns)
    if missing_columns:
        # Sets have no stable order; sort (as strings, so that non-string labels
        # do not break ``join``) to keep the message deterministic.
        all_names = ", ".join(sorted(map(str, required)))
        missing_names = ", ".join(sorted(map(str, missing_columns)))
        raise ValueError(
            f"DataFrame {df_name} must have all these columns"
            f" :\n{all_names}\nbut is missing"
            f" {missing_names}"
        )
[docs]
def full_check_dataset_detection(
    dataset: Dataset,
    check_symlink: bool = False,
    allow_keypoints: bool = False,
    check_exhaustive: bool = False,
) -> None:
    """Run every dataset check in sequence, printing a progress line before each.

    The checks are, in order: ids, bounding boxes, label map, then images. Images
    are always loaded during the last step, so they must be reachable. The first
    failing check raises and the following ones are not run.

    Args:
        dataset: dataset to test.
        check_symlink: If set to True, image paths must be symlinks and not actual
            files. Forwarded to ``get_invalid_images``. Defaults to False.
        allow_keypoints: Forwarded to ``assert_bounding_boxes_well_formed``; per the
            original documentation, when True bounding boxes with a size of 0 (width
            or height) do not raise an error. Defaults to False.
        check_exhaustive: If set to True, will check that all images in the
            images_root folder are in the image dataframe, and that the dataset is
            indeed exhaustive.
    """
    # Each step pairs the line to print with the check to run; lambdas defer the
    # call (and the name lookup) until the step is actually reached.
    steps = (
        (
            "Checking Image and annotations Ids ...",
            lambda: assert_ids_well_formed(dataset),
        ),
        (
            "Checking Bounding boxes ..",
            lambda: assert_bounding_boxes_well_formed(
                dataset, allow_keypoints=allow_keypoints
            ),
        ),
        (
            "Checking label map ...",
            lambda: assert_label_map_well_formed(dataset),
        ),
        (
            "Checking images are valid ...",
            lambda: get_invalid_images(
                dataset, check_symlink, True, check_exhaustive, raise_if_error=True
            ),
        ),
    )
    for banner, run_check in steps:
        print(banner)
        run_check()