from collections.abc import Sequence
from pathlib import Path
from typing import Literal, TypeGuard
import numpy as np
import pandas as pd
from faker import Faker
from ..dataset import Dataset
from .bbox_converter import import_bbox
[docs]
def construct_attribute_column(
numpy_generator: np.random.Generator,
n_rows: int,
labels: Sequence[str],
probs: Sequence[float] | None = None,
is_list_column: bool = True,
) -> pd.Categorical | list[list[str]]:
"""Generate a column with lists of elements taken in a finite pool
the generated sequence of lists will be in the form of a numpy array, which will
become a column in a DataFrame.
Args:
numpy_generator: numpy random Generator object used to generate random integers
n_rows: number of rows of generated numpy array
labels: label strings to use for the attributes
probs: sequence of probabilities to construct each row. If set to None, will use
the probabilities by default: for attribute lists, each probability will
be 0.5, and for simple attribute, probabilities will be evenly distributed.
Defaults to None.
is_list_column: if set to True, will construct a column with list of attributes,
that constitute a subset of the set of labels. Otherwise, will simply
construct a simple attribute column, where each row is a single label taken
from ``labels`` according to the probability distribution given by
``probs``. Defaults to True.
Returns:
list of lists that will be incorporated in a dataframe.
"""
if is_list_column:
if probs is None:
booleanized_column = numpy_generator.integers(
0, 2, (n_rows, len(labels))
).astype(bool)
else:
booleanized_column = np.stack(
list(
numpy_generator.choice([True, False], size=n_rows, p=[p_, 1 - p_])
for p_ in probs
),
axis=-1,
)
labels_np = np.array(list(labels))
return [list(labels_np[indices]) for indices in booleanized_column]
else:
return pd.Categorical(
numpy_generator.choice(labels, size=n_rows, p=probs), categories=labels
)
# Union of every shape accepted as an attribute-column specification: a plain
# count, a sequence (of counts, header names or probability vectors), or a
# dictionary keyed by column header. The string literal that follows this
# assignment details how each shape is interpreted.
random_attribute_column_type = (
int
| Sequence[int]
| Sequence[str]
| Sequence[Sequence[float]]
| dict[str, int]
| dict[str, Sequence[float]]
| dict[str, Sequence[str]]
| dict[str, dict[str, float]]
)
"""The random attribute columns type is a way to design a column with random
attributes.
It will create :math:`N` columns, each :math:`i` th column with :math:`M_i` labels,
the labels being distributed according to the probabilities in :math:`(p_i)_j`
(:math:`(p_i)_j` being of length :math:`M_i`, with values :math:`p_{i,j}` between
0 and 1).
In the case the column is a non-list attribute column, each vector :math:`(p_i)_j` must
add up to 1. Otherwise, each probability :math:`p_{i,j}` is the probability that the
:math:`j` th label of :math:`i` th column is in the attribute list for each cell.
Depending on the type, the values :math:`N`, :math:`M`, :math:`(p_i)_j` and the names
will be constructed differently.
If not specified, column header and labels are randomly generated with
``Faker.unique.word()``
If not specified, the probabilities :math:`p_{i,j}` will be either uniform probabilities
for non-list attribute columns, or all set to 0.5 for attribute list columns.
The input can be either
- An integer: :math:`N` is the given integer, :math:`M_i` are random integers between
2 and 10
- A sequence of integers: :math:`N` is the length of the sequence, :math:`M_i` are the
integers of that sequence.
- A sequence of str: :math:`N` is the length of the sequence, the column headers are the
sequence elements, and :math:`M_i` are random integers between 2 and 10.
- A sequence of sequences of float: :math:`N` is the length of the sequence,
:math:`M_i` is the length of each :math:`i` th sequence, and :math:`(p_i)_j` is the
:math:`i` th sequence of floats.
- A dictionary of integers: :math:`N` is the length of the dictionary. The column
headers are the dictionary keys, and :math:`M_i` are the integer values.
- A dictionary of float sequences: :math:`N` is the length of the dictionary. The column
headers are the dictionary keys, :math:`M_i` is the length of the :math:`i` th float
sequence, and :math:`(p_i)_j` is the :math:`i` th float sequence
- A dictionary of string sequences: :math:`N` is the length of the dictionary. The
column headers are the dictionary keys, :math:`M_i` is the length of the :math:`i` th
string sequence, and the :math:`j` th label of the :math:`i` th column is the
:math:`j` th element of the :math:`i` th sequence.
- A dictionary of float dictionaries. :math:`N` is the length of the root dictionary.
The column headers are the dictionary keys, :math:`M_i` is the length of the
:math:`i` th sub-dictionary, the :math:`j` th label of the :math:`i` th column is the
:math:`j` th key of the :math:`i` th sub-dictionary and the probability
:math:`p_{i,j}` is the corresponding sub-dictionary value
"""
[docs]
def set_attribute_columns_labels(
    input_dataframe: pd.DataFrame,
    columns_specs: random_attribute_column_type,
    numpy_generator: np.random.Generator,
    fake_generator: Faker,
    is_list: bool = False,
    min_labels: int = 2,
    max_labels: int = 10,
) -> list[str]:
    """From a specification given according to the :obj:`random_attribute_column_type`
    type, add attribute columns to the given dataframe and return the name of added
    columns.

    Depending on ``is_list``, it will be either an attribute column, where each row
    has a single value, taken from a fixed set of possible string labels or an attribute
    list column where each row has a subset of values from a fixed superset of possible
    string labels.

    The dataframe is modified in place. A column whose header already exists in the
    dataframe is overwritten.

    Args:
        input_dataframe: DataFrame which will be assigned new columns
        columns_specs: specification of columns, according to the aforementioned
            syntax
        numpy_generator: random generator for numpy arrays
        fake_generator: random generator for random unique words
        is_list: if set to True, will construct list attribute columns. Otherwise, will
            construct simple attribute columns. Defaults to False
        min_labels: When the number of labels is not specified, minimum random number
            of labels to generate for the current column. Defaults to 2.
        max_labels: When the number of labels is not specified, upper bound of the
            random number of labels to generate for the current column. The bound is
            exclusive, because the count is drawn with
            ``numpy_generator.integers(min_labels, max_labels)``. Defaults to 10.

    Returns:
        The header of added columns. Useful to keep track of list attribute columns
        to booleanize them.

    Raises:
        ValueError: if a sequence given as the specification of a single column is
            neither made only of floats nor only of strings.
    """

    def random_labels(n_labels: int) -> list[str]:
        return [fake_generator.unique.word() for _ in range(n_labels)]

    def random_label_count() -> int:
        # ``Generator.integers`` returns a numpy integer scalar, which is not an
        # instance of ``int``. Cast it so the count is recognised as an integer
        # specification downstream.
        return int(numpy_generator.integers(min_labels, max_labels))

    def is_float_sequence(
        sequence: Sequence[str] | Sequence[float],
    ) -> TypeGuard[Sequence[float]]:
        types = set(map(type, sequence))
        if types not in [{float}, {str}]:
            raise ValueError(
                "The input specification accepts sequence of only float or only string"
                f" labels for dictionary values, got {types} instead"
            )
        return isinstance(sequence[0], float)

    # This typeguard is needed because typeguard is mostly lacking the type-narrowing
    # feature. See PEP 742 https://peps.python.org/pep-0742/
    def is_str_sequence(
        sequence: Sequence[str] | Sequence[float],
    ) -> TypeGuard[Sequence[str]]:
        return isinstance(sequence[0], str)

    def construct_detailed_column_spec(
        input_spec: int | Sequence[str] | Sequence[float] | dict[str, float],
    ) -> tuple[Sequence[str], Sequence[float] | None]:
        # numpy integer scalars are accepted as label counts too: they are not
        # instances of ``int`` and would otherwise be iterated like a sequence.
        if isinstance(input_spec, (int, np.integer)):
            return random_labels(n_labels=input_spec), None
        if isinstance(input_spec, dict):
            return list(input_spec.keys()), list(input_spec.values())
        if is_float_sequence(input_spec):
            return random_labels(len(input_spec)), input_spec
        # is_float_sequence already rejected anything that is not all-str
        assert is_str_sequence(input_spec)
        return input_spec, None

    # specification dictionary: key is the name of the column header, and values
    # is two vectors: names and probabilities. probabilities vector can be None
    column_labels: dict[str, tuple[Sequence[str], Sequence[float] | None]] = {}
    if isinstance(columns_specs, int):
        for _ in range(columns_specs):
            # Draw order (count, header, labels) is kept so that seeded outputs
            # are unchanged.
            n_labels = random_label_count()
            header_name = fake_generator.unique.word()
            column_labels[header_name] = (random_labels(n_labels), None)
    elif isinstance(columns_specs, dict):
        for header_name, specific_column_spec in columns_specs.items():
            column_labels[header_name] = construct_detailed_column_spec(
                specific_column_spec
            )
    else:  # Simple sequence
        for specific_column_spec in columns_specs:
            if isinstance(specific_column_spec, str):
                # Only the header is given: the number of labels is random
                header_name = specific_column_spec
                detailed_spec = construct_detailed_column_spec(random_label_count())
            else:
                header_name = fake_generator.unique.word()
                detailed_spec = construct_detailed_column_spec(specific_column_spec)
            column_labels[header_name] = detailed_spec

    for header_name, (labels, probs) in column_labels.items():
        input_dataframe[header_name] = construct_attribute_column(
            numpy_generator, len(input_dataframe), labels, probs, is_list
        )
    return list(column_labels)
[docs]
def dummy_dataset(
    n_imgs: int = 2,
    n_annot: int = 2,
    n_labels: int = 3,
    split_names: None | str | Sequence[str] = ("train", "val", "eval"),
    split_shares: Sequence[float] = (0.8, 0.1, 0.1),
    n_list_columns_images: random_attribute_column_type = 0,
    n_list_columns_annotations: random_attribute_column_type = 0,
    n_attribute_columns_images: random_attribute_column_type = 0,
    n_attributes_columns_annotations: random_attribute_column_type = 0,
    booleanize: Literal["all", "random", "none"] = "none",
    keypoints_share: float = 0,
    add_confidence: bool = False,
    generate_real_images: bool = False,
    seed: int = 0,
    **existing_elements,
) -> Dataset:
    """Generate a Dummy dataset for demonstration purpose

    Might also be used for tests

    Args:
        n_imgs: number of frame in the fake dataset
        n_annot: number of annotations
        n_labels: length of the label map
        split_names: sequence containing names of the splits to apply to the dataset as
            a column of images dataframe. If set to None, no "split" column will be
            added to the images dataframe. If empty, will assume all splits are
            ``None``. If not empty, and with 2 elements or more, must be the same size
            as ``split_shares``. Defaults to ``("train", "val", "eval")``.
        split_shares: sequence containing share of each split whose name was given in
            ``split_names``. The ith element in ``split_shares`` represents the share
            (written as a float number between 0 and 1) of the dataset that will be
            assigned to this split. If ``split_names`` is empty or has a length of 1, it
            will be ignored. Otherwise, its size must match length of ``split_names``,
            and the value must all add up to 1 (up to floating point rounding).
            Defaults to ``(0.8, 0.1, 0.1)``.
        n_list_columns_images: Definition of the attribute lists columns for images.
            A list column cell contains a subset of a larger set of possible
            attributes, fixed for the whole columns, in the form of a list or a set.
            These columns are designed to be booleanized and are created with
            the function :func:`construct_attribute_column`.
            See :obj:`random_attribute_column_type` for an in depth explanation of the
            syntax. Defaults to 0
        n_list_columns_annotations: number of list columns to add to the annotations
            dataframe. A list column cell contains a subset of a larger set of possible
            attributes, fixed for the whole columns, in the form of a list or a set.
            These columns are designed to be booleanized and are created with
            the function :func:`construct_attribute_column`.
            See :obj:`random_attribute_column_type` for an in depth explanation of the
            syntax. Defaults to 0
        n_attribute_columns_images: number of attributes columns to add to the images
            dataframe. An attribute column cell contains one element for a set fixed for
            the whole column. These columns are created with the function
            :func:`construct_attribute_column`. See :obj:`random_attribute_column_type`
            for an in depth explanation of the syntax. Defaults to 0
        n_attributes_columns_annotations: number of attributes columns to add to the
            annotations dataframe. An attribute column cell contains one element for a
            set fixed for the whole column. These columns are created with the function
            :func:`construct_attribute_column`. See :obj:`random_attribute_column_type`
            for an in depth explanation of the syntax. Defaults to 0
        booleanize: how to booleanize the list columns. Can be "all", "random" and
            "none". Defaults to "none".

            - "all" means all the list columns will converted to multiple boolean
              columns
            - "none" means the list columns will be unchanged
            - "random" means a random number of list columns will be booleanized. The
              number of booleanized columns is chosen randomly (strictly less than
              the number of list columns), and the choice of these n booleanized
              columns is also done randomly. When there is no list column, nothing
              is booleanized.
        keypoints_share: Share of bounding box which are keypoints, i.e. with a height
            and width of 0. Set it to 1 to only have keypoints, and to 0 to have no
            keypoint. Defaults to 0.
        add_confidence: If set to True, will add a "confidence" column to annotations
            with random values between 0 and 1. Use this option to generate random
            predictions, to be used in e.g. an evaluator. Defaults to False.
        generate_real_images:
            if set to True, will generate random images and save them in the ``/tmp/``
            folder under a random file name (an absolute images root is used as is).
            Otherwise, will just generate random file path to images without
            creating any. Defaults to False.
        seed: seed number for the generation. This will ensure that for a given seed
            number, the same dataset will be created.
        **existing_elements: optional existing dataset elements that you want not to be
            random. The keys that are read are ``images_root``, ``dataset_name``,
            ``label_map`` and ``images``.

    Returns:
        Dummy generated dataset

    Raises:
        ValueError: if ``split_names`` and ``split_shares`` have different lengths, if
            the split shares do not add up to 1, or if ``booleanize`` is not one of the
            accepted values.

    Example:
        >>> dummy_dataset()
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg train
        1 377 167 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 step 15 ... 73.932999 71.552480 42.673983
        1 0 why 19 ... 4.567638 248.551257 122.602211
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}

        Change the seed option to another random dataset following the same rules

        >>> dummy_dataset(seed=1)
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}

        Use the ``split_shares`` and ``split_names`` to set splits values.
        Use the ``keypoints_share`` option to set a share of bounding box with size of 0

        >>> dataset = dummy_dataset(
        ... 10,
        ... 100,
        ... split_shares=(0.5, 0.5),
        ... split_names=("foo", "bar"),
        ... keypoints_share=0.3,
        ... add_confidence=True,
        ... )
        >>> dataset
        Dataset object containing 10 images and 100 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 645 help/me.jpeg .jpeg foo
        1 377 973 whatever/wait.png .png foo
        2 136 756 chair/mother.gif .gif bar
        3 167 669 someone/challenge.jpeg .jpeg foo
        4 114 589 successful/present.bmp .bmp bar
        5 257 603 no/where.jpeg .jpeg foo
        6 831 941 play/take.tiff .tiff foo
        7 684 349 bit/force.gif .gif bar
        8 921 834 way/back.tiff .tiff bar
        9 553 703 marriage/give.tiff .tiff foo
        Annotations :
        image_id category_str category_id ... box_width box_height confidence
        id ...
        0 0 interview 25 ... 11.569934 591.860047 0.136767
        1 3 step 15 ... 70.680613 101.235900 0.663684
        2 8 interview 25 ... 0.000000 0.000000 0.749956
        3 5 why 19 ... 99.047865 266.499060 0.163943
        4 0 why 19 ... 69.419403 61.451991 0.689302
        .. ... ... ... ... ... ... ...
        95 7 step 15 ... 518.765436 55.277118 0.942361
        96 0 step 15 ... 0.000000 0.000000 0.802246
        97 5 interview 25 ... 0.000000 0.000000 0.122368
        98 4 why 19 ... 89.054816 254.947600 0.124429
        99 9 why 19 ... 181.630916 86.810354 0.616242
        <BLANKLINE>
        [100 rows x 9 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> (dataset.annotations["box_width"] > 0).value_counts() / dataset.len_annot()
        box_width
        True 0.69
        False 0.31
        Name: count, dtype: float64

        Add list columns, that can be booleanized later

        >>> dummy_dataset(n_list_columns_images=1, n_list_columns_annotations=1)
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height ... split discover
        id ...
        0 342 136 ... train [chair, challenge]
        1 377 167 ... train [someone, beyond, present, enough]
        <BLANKLINE>
        [2 rows x 6 columns]
        Annotations :
        image_id category_str ... box_height where
        id ...
        0 0 step ... 42.673983 [take, play, week, force, bit]
        1 0 why ... 122.602211 [no, season, take, play, choice, bit]
        <BLANKLINE>
        [2 rows x 9 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}

        Or booleanize them right away

        >>> dummy_dataset(
        ... n_list_columns_images=1, n_list_columns_annotations=1, booleanize="all"
        ... )
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height ... discover.present discover.someone
        id ...
        0 342 136 ... False False
        1 377 167 ... True True
        <BLANKLINE>
        [2 rows x 11 columns]
        Annotations :
        image_id category_str category_id ... where.season where.take where.week
        id ...
        0 0 step 15 ... False True True
        1 0 why 19 ... True True False
        <BLANKLINE>
        [2 rows x 16 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}

        Add attribute columns which then are transformed into categorical columns.

        >>> example = dummy_dataset(
        ... n_attribute_columns_images={"a": 2, "b": 3},
        ... n_list_columns_annotations=2,
        ... )
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split a b
        id
        0 342 136 help/me.jpeg .jpeg train play force
        1 377 167 whatever/wait.png .png train take force
        Annotations :
        image_id ... where
        id ...
        0 0 ... []
        1 0 ... [no, season]
        <BLANKLINE>
        [2 rows x 10 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> example.images["b"]
        id
        0 force
        1 force
        Name: b, dtype: category
        Categories (3, object): ['week', 'choice', 'force']

        Instead of integers, use lists of probabilities to steer the distribution of
        attributes.

        >>> example = dummy_dataset(
        ... 200, n_attribute_columns_images=[[0.1, 0.1, 0.8]], seed=1
        ... )
        >>> example
        Dataset object containing 200 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split could
        id
        0 955 488 determine/story.jpg .jpg train note
        1 131 895 air/method.bmp .bmp train firm
        2 229 880 political/lead.jpg .jpg train firm
        3 840 384 like/safe.bmp .bmp train note
        4 953 668 suffer/set.jpeg .jpeg train note
        .. ... ... ... ... ... ...
        195 122 437 state/almost.tiff .tiff train firm
        196 752 300 weight/tend.jpeg .jpeg train note
        197 554 228 remember/summer.png .png train note
        198 688 605 yet/though.png .png eval note
        199 243 227 describe/road.tiff .tiff train note
        <BLANKLINE>
        [200 rows x 6 columns]
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 77 reach 22 ... 45.427512 40.116677 318.073851
        1 137 marriage 15 ... 202.481384 435.389400 475.375279
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.images["could"].value_counts() / len(example)
        could
        note 0.82
        firm 0.09
        lead 0.09
        Name: count, dtype: float64

        Finally, you can generate fake images as well if you want to test the io
        functions that need images to be valid.

        >>> dataset = dummy_dataset(generate_real_images=True)
        >>> dataset
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        /tmp/such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg train
        1 377 167 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 step 15 ... 73.932999 71.552480 42.673983
        1 0 why 19 ... 4.567638 248.551257 122.602211
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> dataset.check()
        Checking Image and annotations Ids ...
        Checking Bounding boxes ..
        Checking label map ...
        Checking images are valid ...
    """
    gen = np.random.default_rng(seed)
    Faker.seed(seed=seed)
    fake_generator = Faker()

    # NOTE: the defaults passed to ``.get`` are evaluated eagerly, so the random
    # words are drawn from the Faker stream whether or not the caller provides the
    # element. Keep it that way: it is part of what makes seeded outputs stable.
    images_root = existing_elements.get(
        "images_root", Path("/".join(fake_generator.words(2)))
    )
    if generate_real_images:
        # A caller-provided root may be a plain string; Path() is a no-op on paths
        images_root = Path(images_root)
        if not images_root.is_absolute():
            images_root = "/tmp" / images_root
    dataset_name = existing_elements.get(
        "dataset_name", "_".join(fake_generator.words(3))
    )

    if "label_map" in existing_elements:
        label_map = existing_elements["label_map"]
    else:
        label_ids = gen.integers(0, 10 * n_labels, size=n_labels)
        label_map = {
            int(label_id): fake_generator.unique.word() for label_id in label_ids
        }

    if "images" in existing_elements:
        images = existing_elements["images"]
        assert isinstance(images, pd.DataFrame), "Images can only be a dataframe"
        image_ids = images.index
        n_imgs = len(image_ids)
    else:
        image_ids = np.arange(n_imgs)
        image_paths = [
            Path(fake_generator.file_path(depth=1, category="image", absolute=False))
            for _ in range(n_imgs)
        ]
        width = gen.integers(100, 1000, size=n_imgs)
        height = gen.integers(100, 1000, size=n_imgs)
        images = pd.DataFrame(
            data={
                "width": width,
                "height": height,
                "relative_path": image_paths,
            },
            index=image_ids,
        )

    annot_ids = np.arange(n_annot)
    annot_image_ids = gen.choice(image_ids, size=n_annot)
    category_ids = gen.choice(a=np.array(list(label_map.keys())), size=n_annot)

    # Two random corner points per annotation: the box origin is the element-wise
    # minimum of the two, and its size the absolute difference between them.
    bbox = gen.random((2, 2, n_annot))
    box_x_min, box_y_min = bbox.min(axis=0)
    box_width, box_height = np.abs(bbox[0] - bbox[1])
    if keypoints_share > 0:
        to_zero = gen.choice(
            a=[True, False], size=n_annot, p=[keypoints_share, 1 - keypoints_share]
        )
        box_width[to_zero] = 0
        box_height[to_zero] = 0
    bbox = np.stack([box_x_min, box_y_min, box_width, box_height], axis=1)

    if split_names is not None:
        if isinstance(split_names, str):
            split_names = [split_names]
        n_splits = len(split_names)
        if n_splits == 0:
            images["split"] = None
        elif n_splits == 1:
            images["split"] = split_names[0]
        else:
            if len(split_shares) != n_splits:
                raise ValueError(
                    "Size mismatch between 'split_names' and 'split_shares'"
                    f" ({len(split_names)} vs {len(split_shares)})"
                )
            total_share = sum(split_shares)
            # Exact float equality would reject valid shares such as
            # (0.7, 0.2, 0.1), whose sum is 0.9999999999999999. The tolerance is
            # kept tight so that accepted shares should also be accepted by
            # ``gen.choice`` below -- TODO confirm against numpy's own tolerance.
            if not np.isclose(total_share, 1.0, rtol=0.0, atol=1e-8):
                raise ValueError(
                    "Split share values must addup to 1. Got"
                    f" {total_share} instead"
                )
            split = gen.choice(list(split_names), size=n_imgs, p=list(split_shares))
            images["split"] = split

    bbox = import_bbox(
        bbox, images_df=images, image_ids=annot_image_ids, input_format="xywh"
    )
    bbox.index = pd.Index(annot_ids)
    annotations = pd.DataFrame(
        data={
            "image_id": annot_image_ids,
            "category_id": category_ids,
        },
        index=annot_ids,
    )
    if add_confidence:
        annotations["confidence"] = gen.random(n_annot)
    annotations = pd.concat([annotations, bbox], axis="columns")

    # Only list columns can be booleanized, so only their headers are collected
    columns_to_booleanize = []
    if n_list_columns_images:
        columns_to_booleanize.extend(
            set_attribute_columns_labels(
                input_dataframe=images,
                columns_specs=n_list_columns_images,
                numpy_generator=gen,
                fake_generator=fake_generator,
                is_list=True,
            )
        )
    if n_list_columns_annotations:
        columns_to_booleanize.extend(
            set_attribute_columns_labels(
                input_dataframe=annotations,
                columns_specs=n_list_columns_annotations,
                numpy_generator=gen,
                fake_generator=fake_generator,
                is_list=True,
            )
        )
    if n_attribute_columns_images:
        set_attribute_columns_labels(
            input_dataframe=images,
            columns_specs=n_attribute_columns_images,
            numpy_generator=gen,
            fake_generator=fake_generator,
            is_list=False,
        )
    if n_attributes_columns_annotations:
        set_attribute_columns_labels(
            input_dataframe=annotations,
            columns_specs=n_attributes_columns_annotations,
            numpy_generator=gen,
            fake_generator=fake_generator,
            is_list=False,
        )

    dataset = Dataset(
        label_map=label_map,
        images=images,
        annotations=annotations,
        images_root=images_root,
        dataset_name=dataset_name,
    )

    if booleanize == "all":
        dataset = dataset.booleanize(columns_to_booleanize)
    elif booleanize == "random":
        # ``gen.integers(0)`` raises, so only draw when there is at least one list
        # column to pick from. Without any, there is nothing to booleanize.
        if columns_to_booleanize:
            subset_to_booleanize = gen.choice(
                columns_to_booleanize,
                size=gen.integers(len(columns_to_booleanize)),
                replace=False,
            )
            dataset = dataset.booleanize(subset_to_booleanize)
    elif booleanize != "none":
        raise ValueError(
            "Invalid booleanize option. Possible values are 'all', 'random' or 'none',"
            f" got {booleanize}"
        )

    if generate_real_images:
        for _, image_data in dataset.images.iterrows():
            image_path = dataset.images_root / image_data["relative_path"]
            # The "type" column holds the file suffix, leading dot included
            image_format = image_data["type"][1:]
            if image_format.lower() == "jpg":
                image_format = "jpeg"
            image_path.parent.mkdir(parents=True, exist_ok=True)
            with open(image_path, "wb") as f:
                image_data_stream = fake_generator.image(
                    image_format=image_format,
                    size=(image_data["width"], image_data["height"]),
                )
                f.write(image_data_stream)
    return dataset