# Source code for lours.dataset.io.common

from collections.abc import Iterable, Sequence
from pathlib import Path
from typing import Any

import imagesize
import pandas as pd

from lours.dataset import Dataset

from ...utils.bbox_converter import column_names_from_format_string, import_bbox

# Image file extensions (written without the leading dot) that
# ``get_images_from_folder`` searches for when no explicit list is given.
IMG_FORMATS = "bmp", "dng", "jpeg", "jpg", "mpo", "png", "tif", "tiff", "webp", "pfm"


def construct_label_map(annotations: pd.DataFrame) -> dict[int, str]:
    """Build the ``category_id -> category_str`` mapping found in annotations.

    Every distinct category name is collected for each category id. A valid
    label map has exactly one name per id and never reuses a name for two
    different ids.

    Args:
        annotations: DataFrame holding the category information. It must
            contain at least the ``category_id`` and ``category_str`` columns.

    Raises:
        ValueError: a category id is associated with a number of names
            different from one, or the same name is shared by several ids.
            Details are printed to standard output before raising.

    Returns:
        dictionary with category ids as keys and category names as values.
    """
    category_columns = annotations[["category_id", "category_str"]]
    names_per_id = category_columns.groupby("category_id")["category_str"].unique()

    # Each id must map to exactly one name
    ambiguous_ids = names_per_id.apply(len) != 1
    if ambiguous_ids.any():
        print("Problem with label map, some category ids have multiple different names")
        print(names_per_id[ambiguous_ids])
        raise ValueError("Invalid label map")

    label_map = {}
    for category_id, names in names_per_id.items():
        label_map[category_id] = names[0]

    # Raise an error if two ids have the same category name
    distinct_names = set(label_map.values())
    if len(distinct_names) != len(label_map):
        print("Problem with label map, some category names are present in multiple ids")
        print(label_map)
        raise ValueError("Invalid label map")

    return label_map  # pyright: ignore
[docs] def convert_str(string: str) -> str | int | float: """String converter tool to read a file, parse and automatically convert the string to integer or float if possible. Will first try to convert to int, then float, then will return as is. Args: string: string containing information to be parsed Returns: converted string, in the most convenient format """ try: result = float(string) if result == int(result): return int(result) else: return result except ValueError: return string
[docs] def get_relative_image_path(dataset_path: Path, image_path: Path | str) -> Path: """Tool function to get relative path between dataset_path and image_path, which might be absolute. Used to populate the ``relative_path`` in the images dataframe of the dataset or evaluator object, which should check the fact that ``dataset.images_root / relative_path`` should always lead to a valid image file Args: dataset_path: root path of considered dataset image_path: image path of a particular image. May be absolute, and need to be converted to be relative the dataset_path Raises: ValueError: image_path is not included in dataset_path. Probably means the dataset path is too specific and should be higher in file hierarchy Returns: Converted image path to be relative to dataset path """ image_path = Path(image_path) if image_path.is_absolute(): try: return image_path.relative_to(dataset_path) except ValueError as e: raise ValueError( "Image paths are not contained in given dataset folder. " "If you want to use absolute path, " "try giving '/' to dataset_path argument" ) from e else: return image_path
def get_images_from_folder(
    folder_path: Path, img_formats: Iterable[str] = IMG_FORMATS
) -> list[Path]:
    """Recursively list the images of a folder, based on their file extension.

    The folder is globbed once per extension, in the order the extensions are
    given, so the returned paths are grouped by extension.

    Args:
        folder_path: where to search images
        img_formats: file extensions (without leading dot) to consider during
            the globbing

    Returns:
        list of all paths, relative to ``folder_path``, whose name ends with
        one of the desired extensions.
    """
    return [
        image_path.relative_to(folder_path)
        for extension in img_formats
        for image_path in folder_path.glob(f"**/*.{extension}")
    ]
[docs] def get_image_info( image_number: int, relative_path: Path, absolute_path: Path | None, image_info: pd.DataFrame | None = None, ) -> dict[str, Any]: """Get image information, either from image info dataframe or from image itself, getting the image dimension by reading its header Args: image_number: number of image in the file list. If image_info is not available, will be used for image id relative_path: path that will be used to find the image in the image_info dataframe, if given absolute_path: absolute to load the image data directly from the file. Can be None if image_info has an entry with the same ``relative_path`` value image_info: DataFrame including image size and image id to match the ids of another dataset for example. Must have at least ``relative_path``, ``width`` and ``height`` columns. Defaults to None. Returns: dictionary with width height and id keys """ current_image = {"relative_path": relative_path, "type": relative_path.suffix} if image_info is None or relative_path not in image_info["relative_path"].values: if absolute_path is None: raise ValueError( "You must at least provide valid image_info or absolute_path" ) width, height = imagesize.get(absolute_path) current_image["width"] = width current_image["height"] = height image_id = image_number current_image["id"] = image_id else: current_image_info = image_info[ image_info["relative_path"] == relative_path ].iloc[0] current_image["width"] = current_image_info["width"] current_image["height"] = current_image_info["height"] image_id = current_image_info.name current_image["id"] = image_id return current_image
def to_dataset_object(
    images_root: Path,
    label_map: dict[int, str] | None,
    images: Sequence[dict],
    annotations: Sequence[dict],
    box_format: str = "cxwcyh",
    ids_map: dict[int, dict[str, Any]] | None = None,
) -> Dataset:
    """Create the dataset object from aggregated lists of dictionaries

    Args:
        images_root: path where the images are located and from where relative
            paths are given
        label_map: dictionary of category id vs category name. If None, it is
            constructed from the annotations, and is empty when there is no
            annotation.
        images: list of image dictionaries. Each dictionary is one image and
            must have an ``id`` key
        annotations: list of annotations dictionaries. Their ``id`` key, when
            present, is used as index
        box_format: expected type of box format.
            See :mod:`lours.utils.bbox_converter` Defaults to "cxwcyh"
        ids_map: dictionary to remap classes back to their original id values.
            Each value must have ``id`` and ``name`` keys. This is a special
            case of darknet where the ids are almost always changed because
            they need to be sequential

    Raises:
        AssertionError: ``images`` is empty while ``annotations`` is not.

    Returns:
        created dataset objects with the right category ids
    """
    if images:
        images_df = pd.DataFrame(images).set_index("id")
    else:
        assert (
            len(annotations) == 0
        ), "Got empty image sequence, but a non empty annotation sequence"
        images_df = None

    if annotations:
        annotations_df = pd.DataFrame(annotations)
        if "id" in annotations_df.columns:
            annotations_df = annotations_df.set_index("id")
        else:
            annotations_df.index.name = "id"
    else:
        annotations_df = None

    if images_df is not None and annotations_df is not None:
        # Replace the columns of the input box format with the imported ones
        bboxes = import_bbox(annotations_df, images_df, input_format=box_format)
        columns_to_drop = column_names_from_format_string(box_format)
        annotations_df = annotations_df.drop(
            columns_to_drop,
            axis=1,
        )
        annotations_df = pd.concat([annotations_df, bboxes], axis=1)

    if label_map is None:
        if annotations_df is None:
            # No annotation means no category to infer: calling
            # construct_label_map on None would fail with a TypeError.
            label_map = {}
        else:
            label_map = construct_label_map(annotations_df)

    dataset = Dataset(
        images_root,
        images_df,
        annotations_df,
        label_map,
    )

    if ids_map is not None:
        id_remapping = {int(i): line["id"] for i, line in ids_map.items()}
        new_label_map = {line["id"]: line["name"] for line in ids_map.values()}
        dataset = dataset.remap_classes(id_remapping, new_label_map)
    return dataset
[docs] def parse_annotation_name( annotations_file_path: str | Path, split_name_mapping: dict[str, list[str]] | None = None, ) -> tuple[str | None, str | None]: """Deduce name of dataset and name of split by assuming it is in the form '<dataset_name>_<split_name>.<extension>' For example, 'coco_train.json' will be parsed to return 'coco' and 'train' Args: annotations_file_path: name of the annotation file without extension or path to the annotation file which name will be parsed. split_name_mapping: Dictionary with split names you want to appear in the lours dataset as keys and a list of possible words you want this name to replace as values. For example, remap split names abbreviations to their full name so that "val" becomes "validation". If set to None, will simply map variations of 'train', 'valid', 'eval' to them, i.e. 'training' gets replaced by 'train', 'val' and 'validation' get replaced by 'valid' and 'evaluation' and 'test' get replaced by 'eval'. Defaults to None. Returns: tuple containing two names : dataset name and split name. They can be none in the case parsing was not successful. Example: >>> parse_annotation_name("my_dataset_test") ('my_dataset', 'eval') >>> parse_annotation_name("my_dataset_hello", {"hey": ["hello", "hi"]}) ('my_dataset', 'hey') >>> parse_annotation_name("my_dataset") ('my', 'dataset') >>> parse_annotation_name("mydataset") ('mydataset', None) """ if split_name_mapping is None: split_name_mapping = { "train": ["train", "training"], "valid": ["val", "valid", "validation"], "eval": ["eval", "evaluation", "test"], } if isinstance(annotations_file_path, Path): name = annotations_file_path.stem else: name = annotations_file_path if "_" in name: parsed_name, parsed_split = name.rsplit("_", maxsplit=1) for possible_split, possible_names in split_name_mapping.items(): if parsed_split in possible_names: parsed_split = possible_split return parsed_name, parsed_split return name, None