# Source code for lours.utils.doc_utils

from collections.abc import Sequence
from pathlib import Path
from typing import Literal, TypeGuard

import numpy as np
import pandas as pd
from faker import Faker

from ..dataset import Dataset
from .bbox_converter import import_bbox


[docs] def construct_attribute_column( numpy_generator: np.random.Generator, n_rows: int, labels: Sequence[str], probs: Sequence[float] | None = None, is_list_column: bool = True, ) -> pd.Categorical | list[list[str]]: """Generate a column with lists of elements taken in a finite pool the generated sequence of lists will be in the form of a numpy array, which will become a column in a DataFrame. Args: numpy_generator: numpy random Generator object used to generate random integers n_rows: number of rows of generated numpy array labels: label strings to use for the attributes probs: sequence of probabilities to construct each row. If set to None, will use the probabilities by default: for attribute lists, each probability will be 0.5, and for simple attribute, probabilities will be evenly distributed. Defaults to None. is_list_column: if set to True, will construct a column with list of attributes, that constitute a subset of the set of labels. Otherwise, will simply construct a simple attribute column, where each row is a single label taken from ``labels`` according to the probability distribution given by ``probs``. Defaults to True. Returns: list of lists that will be incorporated in a dataframe. """ if is_list_column: if probs is None: booleanized_column = numpy_generator.integers( 0, 2, (n_rows, len(labels)) ).astype(bool) else: booleanized_column = np.stack( list( numpy_generator.choice([True, False], size=n_rows, p=[p_, 1 - p_]) for p_ in probs ), axis=-1, ) labels_np = np.array(list(labels)) return [list(labels_np[indices]) for indices in booleanized_column] else: return pd.Categorical( numpy_generator.choice(labels, size=n_rows, p=probs), categories=labels )
random_attribute_column_type = ( int | Sequence[int] | Sequence[str] | Sequence[Sequence[float]] | dict[str, int] | dict[str, Sequence[float]] | dict[str, Sequence[str]] | dict[str, dict[str, float]] ) """The random attribute columns type is a way to design a column with random attributes. It will create :math:`N` columns, each :math:`i` th column with :math:`M_i` labels, the labels being distributed according to the probabilities in :math:`(p_i)_j` (:math:`(p_i)_j` being of length :math:`M_i`, with values :math:`p_{i,j}` between 0 and 1). In the case the column is an non-list attribute column, each vector :math:`(p_i)_j` must addup to 1. Otherwise, each probability :math:`p_{i,j}` is the probability that the :math:`j` th label of :math:`i` th column is in the attribute list for each cell. Depending on the type, the values :math:`N`, :math:`M`, :math:`(p_i)_j` and the names will be constructed differently. If not specified, column header and labels are randomly generated with ``Faker.unique.word()`` If not specified, the probabilities pi will be either uniform probabilities for non-list attribute columns, or all set to 0.5 for attribute list columns. The input can be either - An integer: :math:`N` is the given integer, :math:`M_i` are random integers between 2 and 10 - A sequence of integers: :math:`N` is the length of the sequence, :math:`M_i` are the integers of that sequence. - A sequence of str: :math:`N` is the length of the sequence, the column headers are the sequence elements, and :math:`M_i` are random integers between 2 and 10. - A sequence of sequences of float: :math:`N` is the length of the sequence, :math:`M_i` is the length of each :math:`i` th sequence, and :math:`(p_i)_j` is the :math:`i` th sequence of floats. - A dictionary of integers: :math:`N` is the length of the dictionary. The column headers are the dictionary keys, and :math:`M_i` are the integer values. - A dictionary of float sequences: :math:`N` is the length of the dictionary. 
The column headers are the dictionary keys, :math:`M_i` is the length of the :math:`i` th float sequence, and :math:`(p_i)_j` is the :math:`i` th float sequence - A dictionary of string sequences: :math:`N` is the length of the dictionary. The column headers are the dictionary keys, :math:`M_i` is the length of the :math:`i` th string sequence, and the :math:`j` th label of the :math:`i` th column is the :math:`j` th element of the :math:`i` th sequence. - A dictionary of float dictionaries. :math:`N` is the length of the root dictionary. The column headers are the dictionary keys, :math:`M_i` is the length of the :math:`i` th sub-dictionary, the :math:`j` th label of the :math:`i` th column is the :math:`j` th key of the :math:`i` th sub-dictionary and the probability :math:`p_{i,j}` is the corresponding sub-dictionary value """
def set_attribute_columns_labels(
    input_dataframe: pd.DataFrame,
    columns_specs: random_attribute_column_type,
    numpy_generator: np.random.Generator,
    fake_generator: Faker,
    is_list: bool = False,
    min_labels: int = 2,
    max_labels: int = 10,
) -> list[str]:
    """Add random attribute columns to a dataframe and return their names.

    The columns are described by a specification following the
    :obj:`random_attribute_column_type` type. Depending on ``is_list``, each added
    column will be either an attribute column, where each row has a single value
    taken from a fixed set of possible string labels, or an attribute list column
    where each row has a subset of values from a fixed superset of possible string
    labels.

    The columns are assigned to ``input_dataframe`` in place.

    Args:
        input_dataframe: DataFrame which will be assigned new columns
        columns_specs: specification of columns, according to the aforementioned
            syntax
        numpy_generator: random generator for numpy arrays
        fake_generator: random generator for random unique words
        is_list: if set to True, will construct list attribute columns. Otherwise,
            will construct simple attribute columns. Defaults to False
        min_labels: When the number of labels is not specified, minimum random
            number of labels to generate for the current column (included).
            Defaults to 2.
        max_labels: When the number of labels is not specified, upper bound of the
            random number of labels to generate for the current column. The bound
            is excluded, i.e. at most ``max_labels - 1`` labels are generated.
            Defaults to 10.

    Raises:
        ValueError: if a sequence given as a column specification is empty or
            holds something else than only floats or only strings.

    Returns:
        The header of added columns. Useful to keep track of list attribute columns
        to booleanize them.
    """

    def random_labels(n_labels: int) -> list[str]:
        return [fake_generator.unique.word() for _ in range(n_labels)]

    def is_float_sequence(
        sequence: Sequence[str] | Sequence[float],
    ) -> TypeGuard[Sequence[float]]:
        # isinstance (rather than exact type comparison) also accepts subclasses
        # such as numpy.float64.
        if len(sequence) > 0:
            if all(isinstance(item, float) for item in sequence):
                return True
            if all(isinstance(item, str) for item in sequence):
                return False
        types = set(map(type, sequence))
        raise ValueError(
            "The input specification accepts sequence of only float or only string"
            f" labels for dictionary values, got {types} instead"
        )

    # This second typeguard is needed because TypeGuard does not narrow the type in
    # the negative branch. See PEP 742 https://peps.python.org/pep-0742/
    def is_str_sequence(
        sequence: Sequence[str] | Sequence[float],
    ) -> TypeGuard[Sequence[str]]:
        return isinstance(sequence[0], str)

    def construct_detailed_column_spec(
        input_spec: int | Sequence[str] | Sequence[float] | dict[str, float],
    ) -> tuple[Sequence[str], Sequence[float] | None]:
        if isinstance(input_spec, int):
            labels = random_labels(n_labels=input_spec)
            probs = None
        elif isinstance(input_spec, dict):
            labels = list(input_spec.keys())
            probs = list(input_spec.values())
        elif is_float_sequence(input_spec):
            labels = random_labels(len(input_spec))
            probs = input_spec
        else:
            # Only here for type narrowing, is_float_sequence already validated
            # that the sequence only contains strings.
            assert is_str_sequence(input_spec)
            labels = input_spec
            probs = None
        return labels, probs

    # specification dictionary: key is the name of the column header, and values
    # is two vectors: names and probabilities.
    # probabilities vector can be None
    column_labels: dict[str, tuple[Sequence[str], Sequence[float] | None]] = {}
    if isinstance(columns_specs, int):
        for _ in range(columns_specs):
            n_labels = int(numpy_generator.integers(min_labels, max_labels))
            header_name = fake_generator.unique.word()
            labels = random_labels(n_labels)
            column_labels[header_name] = (labels, None)
    elif isinstance(columns_specs, dict):
        for header_name, specific_column_spec in columns_specs.items():
            column_labels[header_name] = construct_detailed_column_spec(
                specific_column_spec
            )
    else:
        # Simple sequence
        for specific_column_spec in columns_specs:
            if isinstance(specific_column_spec, str):
                header_name = specific_column_spec
                # Generator.integers returns a numpy integer, which is not an
                # instance of the builtin int: convert it so that it is handled as
                # a number of labels by construct_detailed_column_spec.
                specific_column_spec = int(
                    numpy_generator.integers(min_labels, max_labels)
                )
            else:
                header_name = fake_generator.unique.word()
            column_labels[header_name] = construct_detailed_column_spec(
                specific_column_spec
            )

    for header_name, (labels, probs) in column_labels.items():
        input_dataframe[header_name] = construct_attribute_column(
            numpy_generator, len(input_dataframe), labels, probs, is_list
        )
    return list(column_labels.keys())
def dummy_dataset(
    n_imgs: int = 2,
    n_annot: int = 2,
    n_labels: int = 3,
    split_names: None | str | Sequence[str] = ("train", "val", "eval"),
    split_shares: Sequence[float] = (0.8, 0.1, 0.1),
    n_list_columns_images: random_attribute_column_type = 0,
    n_list_columns_annotations: random_attribute_column_type = 0,
    n_attribute_columns_images: random_attribute_column_type = 0,
    n_attributes_columns_annotations: random_attribute_column_type = 0,
    booleanize: Literal["all", "random", "none"] = "none",
    keypoints_share: float = 0,
    add_confidence: bool = False,
    generate_real_images: bool = False,
    seed: int = 0,
    **existing_elements,
) -> Dataset:
    """Generate a Dummy dataset for demonstration purpose

    Might also be used for tests

    Args:
        n_imgs: number of frame in the fake dataset
        n_annot: number of annotations
        n_labels: number of label ids drawn for the label map. The ids are drawn
            with replacement, so the label map can end up with fewer than
            ``n_labels`` entries.
        split_names: sequence containing names of the splits to apply to the
            dataset as a column of images dataframe. If set to None, no "split"
            column will be added to the images dataframe. If empty, will assume all
            splits are ``None``. If not empty, and with 2 elements or more, must be
            the same size as ``split_shares``.
            Defaults to ``("train", "val", "eval")``.
        split_shares: sequence containing share of each split whose name was given
            in ``split_names``. The ith element in ``split_shares`` represents the
            share (written as a float number between 0 and 1) of the dataset that
            will be assigned to this split. If ``split_names`` is empty or has a
            length of 1, it will be ignored. Otherwise, its size must match length
            of ``split_names``, and the value must all add up to 1 (up to floating
            point rounding errors). Defaults to ``(0.8, 0.1, 0.1)``.
        n_list_columns_images: Definition of the attribute lists columns for
            images. A list column cell contains a subset of a larger set of
            possible attributes, fixed for the whole columns, in the form of a list
            or a set. These columns are designed to be booleanized and are created
            with the function :func:`~construct_attribute_column`.
            See :obj:`random_attribute_column_type` for an in depth explanation of
            the syntax. Defaults to 0
        n_list_columns_annotations: number of list columns to add to the
            annotations dataframe. A list column cell contains a subset of a larger
            set of possible attributes, fixed for the whole columns, in the form of
            a list or a set. These columns are designed to be booleanized and are
            created with the function :func:`~construct_attribute_column`.
            See :obj:`random_attribute_column_type` for an in depth explanation of
            the syntax. Defaults to 0
        n_attribute_columns_images: number of attributes columns to add to the
            images dataframe. An attribute column cell contains one element for a
            set fixed for the whole column. These columns are created with the
            function :func:`~construct_attribute_column`.
            See :obj:`random_attribute_column_type` for an in depth explanation of
            the syntax. Defaults to 0
        n_attributes_columns_annotations: number of attributes columns to add to
            the annotations dataframe. An attribute column cell contains one
            element for a set fixed for the whole column. These columns are created
            with the function :func:`~construct_attribute_column`.
            See :obj:`random_attribute_column_type` for an in depth explanation of
            the syntax. Defaults to 0
        booleanize: how to booleanize the list columns. Can be "all", "random" and
            "none". Defaults to "none".

            - "all" means all the list columns will converted to multiple boolean
              columns
            - "none" means the list columns will be unchanged
            - "random" means a random number of list columns will be booleanized.
              The number of booleanized columns is chosen randomly (between 0 and
              the number of list columns minus one), and the choice of these n
              booleanized columns is also done randomly. Nothing is done if there
              is no list column.

        keypoints_share: Share of bounding box which are keypoints, i.e. with a
            height and width of 0. Set it to 1 to only have keypoints, and to 0 to
            have no keypoint. Defaults to 0.
        add_confidence: If set to True, will add a "confidence" column to
            annotations with random values between 0 and 1. Use this option to
            generate random predictions, to be used in e.g. an evaluator.
            Defaults to False.
        generate_real_images: if set to True, will generate random images and save
            them in the ``/tmp/`` folder under a random file name. Otherwise, will
            just generate random file path to images without creating any.
            Defaults to False.
        seed: seed number for the generation. This will ensure that for a given
            seed number, the same dataset will be created.
        **existing_elements: optional existing dataset elements that you want not
            to be random. The keys read by this function are ``images_root``,
            ``dataset_name``, ``label_map`` and ``images``. When given, ``images``
            must be a DataFrame; its index is used as image ids, ``n_imgs`` is
            ignored, and the generated columns are assigned to it in place.

    Raises:
        ValueError: if ``split_names`` and ``split_shares`` have a different size,
            if the values of ``split_shares`` do not add up to 1, or if
            ``booleanize`` is not one of the accepted values.

    Returns:
        Dummy generated dataset

    Example:
        >>> dummy_dataset()
        Dataset object containing 2 images and 2 objects
        Name :
            inside_else_memory
        Images root :
            such/serious
        Images :
            width  height      relative_path   type  split
        id
        0     342     136       help/me.jpeg  .jpeg  train
        1     377     167  whatever/wait.png   .png  train
        Annotations :
            image_id category_str  category_id  ...  box_y_min   box_width  box_height
        id                                      ...
        0          0         step           15  ...  73.932999   71.552480   42.673983
        1          0          why           19  ...   4.567638  248.551257  122.602211
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}

        Change the seed option to another random dataset following the same rules

        >>> dummy_dataset(seed=1)
        Dataset object containing 2 images and 2 objects
        Name :
            shake_effort_many
        Images root :
            care/suggest
        Images :
            width  height        relative_path  type  split
        id
        0     955     229  determine/story.jpg  .jpg  train
        1     131     840       air/method.bmp  .bmp  train
        Annotations :
            image_id category_str  category_id  ...   box_y_min   box_width  box_height
        id                                      ...
        0          1       listen           14  ...  276.974642    9.718823  184.684056
        1          0        reach           22  ...    6.311037  123.141689  174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}

        Use the ``split_shares`` and ``split_names`` to set splits values.
        Use the ``keypoints_share`` option to set a share of bounding box with size
        of 0

        >>> dataset = dummy_dataset(
        ...     10,
        ...     100,
        ...     split_shares=(0.5, 0.5),
        ...     split_names=("foo", "bar"),
        ...     keypoints_share=0.3,
        ...     add_confidence=True,
        ... )
        >>> dataset
        Dataset object containing 10 images and 100 objects
        Name :
            inside_else_memory
        Images root :
            such/serious
        Images :
            width  height           relative_path   type split
        id
        0     342     645            help/me.jpeg  .jpeg   foo
        1     377     973       whatever/wait.png   .png   foo
        2     136     756        chair/mother.gif   .gif   bar
        3     167     669  someone/challenge.jpeg  .jpeg   foo
        4     114     589  successful/present.bmp   .bmp   bar
        5     257     603           no/where.jpeg  .jpeg   foo
        6     831     941          play/take.tiff  .tiff   foo
        7     684     349           bit/force.gif   .gif   bar
        8     921     834           way/back.tiff  .tiff   bar
        9     553     703      marriage/give.tiff  .tiff   foo
        Annotations :
            image_id category_str  category_id  ...   box_width  box_height  confidence
        id                                      ...
        0          0    interview           25  ...   11.569934  591.860047    0.136767
        1          3         step           15  ...   70.680613  101.235900    0.663684
        2          8    interview           25  ...    0.000000    0.000000    0.749956
        3          5          why           19  ...   99.047865  266.499060    0.163943
        4          0          why           19  ...   69.419403   61.451991    0.689302
        ..       ...          ...          ...  ...         ...         ...         ...
        95         7         step           15  ...  518.765436   55.277118    0.942361
        96         0         step           15  ...    0.000000    0.000000    0.802246
        97         5    interview           25  ...    0.000000    0.000000    0.122368
        98         4          why           19  ...   89.054816  254.947600    0.124429
        99         9          why           19  ...  181.630916   86.810354    0.616242
        <BLANKLINE>
        [100 rows x 9 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> (dataset.annotations["box_width"] > 0).value_counts() / dataset.len_annot()
        box_width
        True     0.69
        False    0.31
        Name: count, dtype: float64

        Add list columns, that can be booleanized later

        >>> dummy_dataset(n_list_columns_images=1, n_list_columns_annotations=1)
        Dataset object containing 2 images and 2 objects
        Name :
            inside_else_memory
        Images root :
            such/serious
        Images :
            width  height  ...  split                            discover
        id                 ...
        0     342     136  ...  train                  [chair, challenge]
        1     377     167  ...  train  [someone, beyond, present, enough]
        <BLANKLINE>
        [2 rows x 6 columns]
        Annotations :
            image_id category_str  ...  box_height                                 where
        id                         ...
        0          0         step  ...   42.673983        [take, play, week, force, bit]
        1          0          why  ...  122.602211  [no, season, take, play, choice, bit]
        <BLANKLINE>
        [2 rows x 9 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}

        Or booleanize them right away

        >>> dummy_dataset(
        ...     n_list_columns_images=1, n_list_columns_annotations=1, booleanize="all"
        ... )
        Dataset object containing 2 images and 2 objects
        Name :
            inside_else_memory
        Images root :
            such/serious
        Images :
            width  height  ...  discover.present  discover.someone
        id                 ...
        0     342     136  ...             False             False
        1     377     167  ...              True              True
        <BLANKLINE>
        [2 rows x 11 columns]
        Annotations :
            image_id category_str  category_id  ...  where.season  where.take  where.week
        id                                      ...
        0          0         step           15  ...         False        True        True
        1          0          why           19  ...          True        True       False
        <BLANKLINE>
        [2 rows x 16 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}

        Add attribute columns which then are transformed into categorical columns.

        >>> example = dummy_dataset(
        ...     n_attribute_columns_images={"a": 2, "b": 3},
        ...     n_list_columns_annotations=2,
        ... )
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
            inside_else_memory
        Images root :
            such/serious
        Images :
            width  height      relative_path   type  split     a      b
        id
        0     342     136       help/me.jpeg  .jpeg  train  play  force
        1     377     167  whatever/wait.png   .png  train  take  force
        Annotations :
            image_id  ...         where
        id            ...
        0          0  ...            []
        1          0  ...  [no, season]
        <BLANKLINE>
        [2 rows x 10 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> example.images["b"]
        id
        0    force
        1    force
        Name: b, dtype: category
        Categories (3, object): ['week', 'choice', 'force']

        Instead of integers, use lists of probabilities to steer the distribution
        of attributes.

        >>> example = dummy_dataset(
        ...     200, n_attribute_columns_images=[[0.1, 0.1, 0.8]], seed=1
        ... )
        >>> example
        Dataset object containing 200 images and 2 objects
        Name :
            shake_effort_many
        Images root :
            care/suggest
        Images :
             width  height         relative_path   type  split could
        id
        0      955     488   determine/story.jpg   .jpg  train  note
        1      131     895        air/method.bmp   .bmp  train  firm
        2      229     880    political/lead.jpg   .jpg  train  firm
        3      840     384         like/safe.bmp   .bmp  train  note
        4      953     668       suffer/set.jpeg  .jpeg  train  note
        ..     ...     ...                   ...    ...    ...   ...
        195    122     437     state/almost.tiff  .tiff  train  firm
        196    752     300      weight/tend.jpeg  .jpeg  train  note
        197    554     228   remember/summer.png   .png  train  note
        198    688     605        yet/though.png   .png   eval  note
        199    243     227    describe/road.tiff  .tiff  train  note
        <BLANKLINE>
        [200 rows x 6 columns]
        Annotations :
            image_id category_str  category_id  ...   box_y_min   box_width  box_height
        id                                      ...
        0         77        reach           22  ...   45.427512   40.116677  318.073851
        1        137     marriage           15  ...  202.481384  435.389400  475.375279
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.images["could"].value_counts() / len(example)
        could
        note    0.82
        firm    0.09
        lead    0.09
        Name: count, dtype: float64

        Finally, you can generate fake images as well if you want to test the io
        functions that need images to be valid.

        >>> dataset = dummy_dataset(generate_real_images=True)
        >>> dataset
        Dataset object containing 2 images and 2 objects
        Name :
            inside_else_memory
        Images root :
            /tmp/such/serious
        Images :
            width  height      relative_path   type  split
        id
        0     342     136       help/me.jpeg  .jpeg  train
        1     377     167  whatever/wait.png   .png  train
        Annotations :
            image_id category_str  category_id  ...  box_y_min   box_width  box_height
        id                                      ...
        0          0         step           15  ...  73.932999   71.552480   42.673983
        1          0          why           19  ...   4.567638  248.551257  122.602211
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> dataset.check()
        Checking Image and annotations Ids ...
        Checking Bounding boxes ..
        Checking label map ...
        Checking images are valid ...
    """
    gen = np.random.default_rng(seed)
    Faker.seed(seed=seed)
    fake_generator = Faker()

    # NOTE: the default values given to ``dict.get`` are evaluated eagerly, so the
    # fake words are drawn even when the element is overridden. Keep it that way:
    # drawing them lazily would shift the seeded random stream of everything
    # generated below.
    images_root = existing_elements.get(
        "images_root", Path("/".join(fake_generator.words(2)))
    )
    if generate_real_images:
        # A user-provided root may be a plain string
        images_root = Path(images_root)
        if not images_root.is_absolute():
            images_root = "/tmp" / images_root
    dataset_name = existing_elements.get(
        "dataset_name", "_".join(fake_generator.words(3))
    )

    if "label_map" in existing_elements:
        label_map = existing_elements["label_map"]
    else:
        # Ids are drawn with replacement: duplicated ids collapse into a single
        # entry of the label map.
        label_ids = gen.integers(0, 10 * n_labels, size=n_labels)
        label_map = {
            int(label_id): fake_generator.unique.word() for label_id in label_ids
        }

    if "images" in existing_elements:
        # NOTE: this dataframe is not copied, the columns added below ("split" and
        # attribute columns) are assigned to the caller's object.
        images = existing_elements["images"]
        assert isinstance(images, pd.DataFrame), "Images can only be a dataframe"
        image_ids = images.index
        n_imgs = len(image_ids)
    else:
        image_ids = np.arange(n_imgs)
        image_paths = [
            Path(fake_generator.file_path(depth=1, category="image", absolute=False))
            for _ in range(n_imgs)
        ]
        width = gen.integers(100, 1000, size=n_imgs)
        height = gen.integers(100, 1000, size=n_imgs)
        images = pd.DataFrame(
            data={
                "width": width,
                "height": height,
                "relative_path": image_paths,
            },
            index=image_ids,
        )

    annot_ids = np.arange(n_annot)
    annot_image_ids = gen.choice(image_ids, size=n_annot)
    category_ids = gen.choice(a=np.array(list(label_map.keys())), size=n_annot)

    # Two random points in the unit square per annotation: the box origin is their
    # coordinate-wise minimum and the box size is their absolute difference.
    bbox = gen.random((2, 2, n_annot))
    box_x_min, box_y_min = bbox.min(axis=0)
    box_width, box_height = np.abs(bbox[0] - bbox[1])
    if keypoints_share > 0:
        to_zero = gen.choice(
            a=[True, False], size=n_annot, p=[keypoints_share, 1 - keypoints_share]
        )
        box_width[to_zero] = 0
        box_height[to_zero] = 0
    bbox = np.stack([box_x_min, box_y_min, box_width, box_height], axis=1)

    if split_names is not None:
        if isinstance(split_names, str):
            split_names = [split_names]
        n_splits = len(split_names)
        if n_splits == 0:
            images["split"] = None
        elif n_splits == 1:
            images["split"] = split_names[0]
        else:
            if len(split_shares) != n_splits:
                raise ValueError(
                    "Size mismatch between 'split_names' and 'split_shares'"
                    f" ({len(split_names)} vs {len(split_shares)})"
                )
            # Compare with a tolerance: valid shares such as (0.7, 0.2, 0.1) do not
            # sum exactly to 1 in floating point arithmetic.
            if not np.isclose(sum(split_shares), 1.0, rtol=0.0, atol=1e-8):
                raise ValueError(
                    "Split share values must addup to 1. Got"
                    f" {sum(split_shares)} instead"
                )
            split = gen.choice(list(split_names), size=n_imgs, p=list(split_shares))
            images["split"] = split

    bbox = import_bbox(
        bbox, images_df=images, image_ids=annot_image_ids, input_format="xywh"
    )
    bbox.index = pd.Index(annot_ids)
    annotations = pd.DataFrame(
        data={
            "image_id": annot_image_ids,
            "category_id": category_ids,
        },
        index=annot_ids,
    )
    if add_confidence:
        annotations["confidence"] = gen.random(n_annot)
    annotations = pd.concat([annotations, bbox], axis="columns")

    # Only list columns can be booleanized, keep track of their names.
    columns_to_booleanize = []
    if n_list_columns_images:
        columns_to_booleanize.extend(
            set_attribute_columns_labels(
                input_dataframe=images,
                columns_specs=n_list_columns_images,
                numpy_generator=gen,
                fake_generator=fake_generator,
                is_list=True,
            )
        )
    if n_list_columns_annotations:
        columns_to_booleanize.extend(
            set_attribute_columns_labels(
                input_dataframe=annotations,
                columns_specs=n_list_columns_annotations,
                numpy_generator=gen,
                fake_generator=fake_generator,
                is_list=True,
            )
        )
    if n_attribute_columns_images:
        set_attribute_columns_labels(
            input_dataframe=images,
            columns_specs=n_attribute_columns_images,
            numpy_generator=gen,
            fake_generator=fake_generator,
            is_list=False,
        )
    if n_attributes_columns_annotations:
        set_attribute_columns_labels(
            input_dataframe=annotations,
            columns_specs=n_attributes_columns_annotations,
            numpy_generator=gen,
            fake_generator=fake_generator,
        )

    dataset = Dataset(
        label_map=label_map,
        images=images,
        annotations=annotations,
        images_root=images_root,
        dataset_name=dataset_name,
    )

    if booleanize == "all":
        dataset = dataset.booleanize(columns_to_booleanize)
    elif booleanize == "random":
        # Generator.integers refuses an empty range: without list columns there is
        # nothing to pick from.
        if columns_to_booleanize:
            subset_to_booleanize = gen.choice(
                columns_to_booleanize,
                size=gen.integers(len(columns_to_booleanize)),
                replace=False,
            )
            dataset = dataset.booleanize(subset_to_booleanize)
    elif booleanize != "none":
        raise ValueError(
            "Invalid booleanize option. Possible values are 'all', 'random' or 'none',"
            f" got {booleanize}"
        )

    if generate_real_images:
        # NOTE(review): the "type" column is not created in this function,
        # presumably Dataset derives it from the file suffix (it holds values such
        # as ".jpeg" in the examples above) - confirm against Dataset.
        for _, image_data in dataset.images.iterrows():
            image_path = dataset.images_root / image_data["relative_path"]
            image_format = image_data["type"][1:]
            if image_format.lower() == "jpg":
                image_format = "jpeg"
            image_path.parent.mkdir(parents=True, exist_ok=True)
            with open(image_path, "wb") as f:
                image_data_stream = fake_generator.image(
                    image_format=image_format,
                    size=(image_data["width"], image_data["height"]),
                )
                f.write(image_data_stream)
    return dataset