from collections.abc import Iterable, Iterator, Sequence
from copy import deepcopy
from os.path import normpath, relpath
from pathlib import Path
from pprint import pformat
from typing import TYPE_CHECKING, Any, Literal
try:
from typing import Self
except ImportError:
# Fallback mechanism for python 3.10
from typing_extensions import Self
from warnings import warn
import numpy as np
import pandas as pd
from numpy import ndarray
from pandas._typing import Dtype
from ..utils import BBOX_COLUMN_NAMES
from ..utils.column_booleanizer import booleanize, debooleanize, get_bool_columns
from ..utils.grouper import get_group_names, group_list, groups_to_list
from ..utils.label_map_merger import IncompatibleLabelMapsError
from ..utils.parquet_saver import dict_to_parquet
from .split.dataset_splitter import split_dataframe
if TYPE_CHECKING:
import fiftyone as fo
from ..utils.annotations_appender import AnnotationAppender
from .indexing import DatasetAnnotLocator, DatasetImLocator
[docs]
class Dataset:
"""Dataset base class for manipulation
The behaviour of the Dataset is inspired from numpy arrays or pandas dataframes.
See Also:
- `related doc <UPDATE-ME>`_
for a complete explanation of main principles.
- :ref:`Dataset demo notebook </notebooks/1_demo_dataset.ipynb>`
""" # noqa: E501
# Optional name of the dataset, set by ``__init__`` from its ``dataset_name`` argument.
dataset_name: str | None
# Root path that the ``relative_path`` values of ``images`` are relative to.
images_root: Path
# Image dataframe, validated and normalized by ``init_images``.
images: pd.DataFrame
# Annotation dataframe, validated and normalized by ``init_annotations``.
annotations: pd.DataFrame
# Mapping from ``category_id`` to ``category_str``.
label_map: dict[int, str]
# Columns that ``init_images`` requires to be present in ``images``.
_image_required_columns: set[str] = {"width", "height", "relative_path"}
# Dtypes enforced on ``images`` by ``init_images``. The key order matters: the list
# of keys is also handed to ``reorder_columns`` when the dataframe is normalized.
_default_image_columns_with_types: dict[str, Dtype] = {
    "width": int,
    "height": int,
    "relative_path": object,
    "type": str,
    "split": str,
}
# Columns that ``init_annotations`` requires to be present in ``annotations``.
_annotations_required_columns: set[str] = {
    "image_id",
    "category_id",
    *BBOX_COLUMN_NAMES,
}
# Dtypes enforced on ``annotations`` by ``init_annotations``. As for images, the key
# order is also handed to ``reorder_columns``.
_default_annotation_columns_with_types: dict[str, Dtype] = {
    "image_id": int,
    "category_str": str,
    "category_id": int,
    "split": str,
    **{n: float for n in BBOX_COLUMN_NAMES},
}
# Per-dataframe sets of booleanized column prefixes. This class-level value is only
# a default: ``__init__`` rebinds a fresh dictionary on every instance.
booleanized_columns: dict[str, set[str]] = {"images": set(), "annotations": set()}
def __init__(
    self,
    images_root: Path | None = None,
    images: pd.DataFrame | None = None,
    annotations: pd.DataFrame | None = None,
    label_map: dict[int, str] | None = None,
    dataset_name: str | None = None,
):
    """Main Constructor

    Args:
        images_root: root path from where the ``relative_path`` values are relative
            to, in images
        images: DataFrame comprising image data. This dataframe should be referred
            to by annotations with the ``image_id`` column
        annotations: DataFrame comprising annotation data. Must have at least
            ``image_id`` column
        label_map: Mapping from ``category_id`` to ``category_str``, in the case the
            annotations have a ``category_id`` id. Useful for detections and
            classification. The mapping is shallow-copied: when
            :meth:`init_annotations` completes it with missing category ids, the
            dictionary given by the caller is left untouched.
        dataset_name: Optional name for dataset. Will be used in function that need
            a name when the name cannot be easily deduced from images_root

    See Also:
        :meth:`from_template`

    Example:
        >>> Dataset()
        Dataset object containing 0 image and 0 object
        Name :
        None
        Images root :
        .
        Images :
        Empty DataFrame
        Columns: [width, height, relative_path, type]
        Index: []
        Annotations :
        Empty DataFrame
        Columns: [image_id, category_str, category_id, box_x_min, box_y_min, box_width, box_height]
        Index: []
        Label map :
        {}
        >>> images = pd.DataFrame(
        ...     data={
        ...         "width": [1920, 1280],
        ...         "height": [1080, 720],
        ...         "relative_path": [Path("0.jpg"), Path("1.jpg")],
        ...         "split": ["train", "valid"],
        ...     },
        ...     index=[0, 1],
        ... )
        >>> annotations = pd.DataFrame(
        ...     data={
        ...         "image_id": [0, 1],
        ...         "category_id": [1, 0],
        ...         "box_x_min": [10, 20],
        ...         "box_y_min": [30, 40],
        ...         "box_width": [100, 200],
        ...         "box_height": [200, 300],
        ...     },
        ...     index=[2, 3],
        ... )
        >>> label_map = {0: "this", 1: "that"}
        >>> Dataset(
        ...     images=images,
        ...     annotations=annotations,
        ...     label_map=label_map,
        ...     dataset_name="my_dataset",
        ... )
        Dataset object containing 2 images and 2 objects
        Name :
        my_dataset
        Images root :
        .
        Images :
        width height relative_path type split
        id
        0 1920 1080 0.jpg .jpg train
        1 1280 720 1.jpg .jpg valid
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        2 0 that 1 ... 30.0 100.0 200.0
        3 1 this 0 ... 40.0 200.0 300.0
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {0: 'this', 1: 'that'}
    """
    if images_root is None:
        self.images_root = Path("")
    else:
        self.images_root = images_root

    if images is None:
        self.images = pd.DataFrame([], columns=list(self._image_required_columns))
    else:
        self.images = images
    self.init_images()

    # Note, although probably unnecessary, we do a full copy of annotation because
    # otherwise we get a warning from pandas. To be investigated, should potential
    # data fill the entire RAM one day
    if annotations is None:
        self.annotations = pd.DataFrame(
            [], columns=list(self._annotations_required_columns)
        )
    else:
        self.annotations = annotations.copy()

    if label_map is None:
        self.label_map = {}
    else:
        # init_annotations adds the category ids missing from the label map. Work
        # on a copy so that neither the caller's dictionary nor the label map of a
        # template dataset (see from_template) is modified behind their back.
        self.label_map = dict(label_map)

    self.booleanized_columns = {"images": set(), "annotations": set()}
    self.dataset_name = dataset_name
    # Must come last: it needs images (id check, split) and the label map.
    self.init_annotations()
[docs]
def from_template(
    self,
    reset_booleanized: bool = False,
    **kwargs,
) -> Self:
    """Build a new Dataset, using this dataset as the source of default values.

    Every constructor argument (``images_root``, ``images``, ``annotations``,
    ``label_map``, ``dataset_name``) that is not given as a keyword argument is
    taken from the current dataset. The keyword arguments are then forwarded to
    the ``__init__`` function of the dataset's own class.

    Note:
        - Although the Dataset object is a new one, dataframes are NOT cloned
        - booleanized columns are kept from other dataset to the new one.

    Args:
        reset_booleanized: If set to True, will reset booleanized columns for
            changed dataframes (and only for changed dataframes).
            Otherwise, the self.booleanized_columns dictionary of sets will only be
            updated so that columns that are not present anymore will be removed.
            Defaults to False
        **kwargs: keywords to overwrite other dataset's data with other values in
            the called constructor

    Returns:
        Resulting dataset, constructed from other dataset's data and optional
        additional data.

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=0)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg train
        1 377 167 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 step 15 ... 73.932999 71.552480 42.673983
        1 0 why 19 ... 4.567638 248.551257 122.602211
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> annotations = pd.DataFrame(
        ...     data={
        ...         "image_id": [0, 1],
        ...         "category_id": [12, 21],
        ...         "box_x_min": [10, 20],
        ...         "box_y_min": [30, 40],
        ...         "box_width": [100, 200],
        ...         "box_height": [200, 300],
        ...     },
        ...     index=[2, 3],
        ... )
        >>> Dataset.from_template(example, annotations=annotations)
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg train
        1 377 167 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        2 0 12 12 ... 30.0 100.0 200.0
        3 1 21 21 ... 40.0 200.0 300.0
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {12: '12', 15: 'step', 19: 'why', 21: '21', 25: 'interview'}
    """
    inherited_prefixes = deepcopy(self.booleanized_columns)

    # Dataframes: inherit when not overridden; when overridden, optionally forget
    # the booleanized prefixes that were tracked for the replaced dataframe.
    for frame_key in ("images", "annotations"):
        if frame_key not in kwargs:
            kwargs[frame_key] = getattr(self, frame_key)
        elif reset_booleanized:
            inherited_prefixes[frame_key] = set()

    # Remaining constructor arguments are simply inherited when not overridden.
    for attribute in ("images_root", "label_map", "dataset_name"):
        if attribute not in kwargs:
            kwargs[attribute] = getattr(self, attribute)

    # Instantiate the actual class of self so that subclasses are preserved.
    output_dataset = type(self)(**kwargs)

    # Only keep the prefixes that still match columns in the new dataframes.
    kept_prefixes = {"images": set(), "annotations": set()}
    frames_by_name = (
        ("images", output_dataset.images),
        ("annotations", output_dataset.annotations),
    )
    for name, frame in frames_by_name:
        for prefix in inherited_prefixes[name]:
            try:
                columns = get_bool_columns(frame, prefix)
                if columns:
                    kept_prefixes[name].add(prefix)
            except ValueError as e:
                warn(
                    f"Prefix {prefix} will be ignored from generated dataset's"
                    f" {name} booleanized columns because of the following error:"
                    f" \n{e}",
                    RuntimeWarning,
                )
    output_dataset.booleanized_columns = kept_prefixes
    return output_dataset
[docs]
def rename(self, dataset_name: str) -> Self:
    """Return a copy of the dataset carrying a new name.

    The dataset name is used when printing it, showing it in jupyter or exporting
    it in other formats such as fiftyone.

    Equivalent to ``my_dataset.dataset_name = "new_name"``, but creates a new
    dataset instance (without copying the dataframes). It can be useful when using
    method chaining.

    Args:
        dataset_name: Name to give to the dataset.

    Returns:
        Renamed dataset

    Example:
        >>> Dataset().rename("my dataset")
        Dataset object containing 0 image and 0 object
        Name :
        my dataset
        Images root :
        .
        Images :
        Empty DataFrame
        Columns: [width, height, relative_path, type]
        Index: []
        Annotations :
        Empty DataFrame
        Columns: [image_id, category_str, category_id, box_x_min, box_y_min, box_width, box_height]
        Index: []
        Label map :
        {}
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.loc[example.images["type"] == ".jpg"].rename("only_jpeg")
        Dataset object containing 1 image and 1 object
        Name :
        only_jpeg
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    # Everything but the name is inherited from the current dataset.
    renamed_dataset = self.from_template(dataset_name=dataset_name)
    return renamed_dataset
@property
def loc(self) -> "DatasetImLocator[Self]":
    """Image-id based indexer, giving back a filtered dataset.

    Similar to :attr:`pandas.DataFrame.loc` for images, but will create a new
    Dataset object and filter annotations accordingly.

    Note:
        You cannot set item with this method the same way you can in pandas

    Returns:
        Locator with a ``[]`` functionality relative to image id

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Image-based-sampling>`
        - :class:`.indexing.DatasetImLocator`
        - :attr:`iloc`
        - :meth:`filter_images`
        - :attr:`loc_annot`
        - :attr:`iloc_annot`
        - :meth:`filter_annotations`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.loc[example.images["type"] == ".jpg"]
        Dataset object containing 1 image and 1 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    # Imported at call time rather than at module level.
    from .indexing import DatasetImLocator

    image_locator = DatasetImLocator(self, mode="loc")
    return image_locator
@property
def iloc(self) -> "DatasetImLocator[Self]":
    """Image-row based indexer, giving back a filtered dataset.

    Similar to :attr:`pandas.DataFrame.iloc` for images, but will create a new
    Dataset object and filter annotations accordingly.

    Note:
        You cannot set item with this method the same way you can in pandas

    Returns:
        Locator with a ``[]`` functionality relative to image row

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Image-based-sampling>`
        - :class:`.indexing.DatasetImLocator`
        - :attr:`loc`
        - :meth:`filter_images`
        - :attr:`loc_annot`
        - :attr:`iloc_annot`
        - :meth:`filter_annotations`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.iloc[0]
        Dataset object containing 1 image and 1 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    # Imported at call time rather than at module level.
    from .indexing import DatasetImLocator

    image_locator = DatasetImLocator(self, mode="iloc")
    return image_locator
@property
def loc_annot(self) -> "DatasetAnnotLocator[Self]":
    """Annotation-id based indexer, giving back a filtered dataset.

    Similar to :attr:`pandas.DataFrame.loc` for annotations, but will create a new
    Dataset object

    Note:
        - You cannot set item with this method the same way you can in pandas
        - Images emptied of annotation are NOT removed. If you want to remove
          emptied images, :meth:`.Dataset.filter_annotations` is better suited.

    Returns:
        Locator with a ``[]`` functionality relative to annotations id

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Annotation-based-sampling>`
        - :class:`.indexing.DatasetAnnotLocator`
        - :attr:`loc`
        - :attr:`iloc`
        - :meth:`filter_images`
        - :attr:`iloc_annot`
        - :meth:`filter_annotations`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.loc_annot[example.annotations["box_height"] > 180]
        Dataset object containing 2 images and 1 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    # Imported at call time rather than at module level.
    from .indexing import DatasetAnnotLocator

    annotation_locator = DatasetAnnotLocator(self, mode="loc")
    return annotation_locator
@property
def iloc_annot(self) -> "DatasetAnnotLocator[Self]":
    """Annotation-row based indexer, giving back a filtered dataset.

    Similar to :attr:`pandas.DataFrame.iloc` for annotations, but will create a new
    Dataset object

    Note:
        - You cannot set item with this method the same way you can in pandas
        - Images emptied of annotation are NOT removed. If you want to remove
          emptied images, :meth:`.Dataset.filter_annotations` is better suited.

    Returns:
        Locator with a ``[]`` functionality relative to annotations row

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Annotation-based-sampling>`
        - :class:`.indexing.DatasetAnnotLocator`
        - :attr:`loc`
        - :attr:`iloc`
        - :meth:`filter_images`
        - :attr:`loc_annot`
        - :meth:`filter_annotations`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.iloc_annot[0]
        Dataset object containing 2 images and 1 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    # Imported at call time rather than at module level.
    from .indexing import DatasetAnnotLocator

    annotation_locator = DatasetAnnotLocator(self, mode="iloc")
    return annotation_locator
[docs]
def filter_images(self, index: Any, mode: Literal["loc", "iloc"] = "loc") -> Self:
    """Filter images through a regular method call instead of ``[]`` indexing.

    Method equivalent of :attr:`.Dataset.loc` and :attr:`.Dataset.iloc`

    Args:
        index: Index object used in ``self.images.loc[]`` or ``self.images.iloc[]``
        mode: whether to be equivalent to :attr:`.Dataset.loc` or
            :attr:`.Dataset.iloc`. Defaults to "loc"

    Returns:
        Filtered dataset

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Image-based-sampling>`
        - :class:`.indexing.DatasetImLocator`
        - :attr:`loc`
        - :attr:`iloc`
        - :attr:`loc_annot`
        - :attr:`iloc_annot`
        - :meth:`filter_annotations`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.filter_images(example.images["type"] == ".jpg", mode="loc")
        Dataset object containing 1 image and 1 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.filter_images(0, mode="iloc")
        Dataset object containing 1 image and 1 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    # Any mode other than "loc" goes through the row-based locator.
    locator = self.loc if mode == "loc" else self.iloc
    return locator[index]
[docs]
def filter_annotations(
    self,
    index: Any,
    mode: Literal["loc", "iloc"] = "loc",
    remove_emptied_images: bool = False,
) -> Self:
    """Filter annotations through a regular method call instead of ``[]`` indexing.

    Method equivalent of :attr:`loc_annot` and :attr:`iloc_annot`, except you can
    choose to remove emptied images as well.

    Args:
        index: Index object used in ``self.annotations.loc[]`` or
            ``self.annotations.iloc[]``
        mode: whether to be equivalent to :attr:`.Dataset.loc_annot` or
            :attr:`.Dataset.iloc_annot`. Default to "loc"
        remove_emptied_images: if set to True, will remove images that were
            initially with annotations, but are now empty. In that case, it will
            keep the images that were already empty before calling this method.
            Default to False.

    Returns:
        Filtered dataset

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Annotation-based-sampling>`
        - :class:`.indexing.DatasetAnnotLocator`
        - :attr:`loc`
        - :attr:`iloc`
        - :meth:`filter_images`
        - :attr:`loc_annot`
        - :attr:`iloc_annot`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.filter_annotations(example.annotations["box_height"] > 180)
        Dataset object containing 2 images and 1 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.filter_annotations(0, mode="iloc")
        Dataset object containing 2 images and 1 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.filter_annotations(0, mode="iloc", remove_emptied_images=True)
        Dataset object containing 1 image and 1 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    # Imported at call time rather than at module level.
    from .indexing import DatasetAnnotLocator

    # Unlike the loc_annot / iloc_annot properties, the locator is built here with
    # the caller's choice regarding emptied images.
    return DatasetAnnotLocator(
        self, mode=mode, remove_emptied_images=remove_emptied_images
    )[index]
[docs]
def empty_annotations(self) -> Self:
    """Return a dataset with the same images but no annotation row.

    The annotation dataframe of the returned dataset is empty, but keeps the same
    columns. The images dataframe is the same as in the original dataset.

    Useful when trying to construct a prediction dataset from another dataset

    Returns:
        New dataset instance with the same images as the original dataset, but an
        empty annotation dataframe

    See Also:
        - :meth:`filter_annotations`
        - :attr:`loc_annot`
        - :attr:`iloc_annot`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.empty_annotations()
        Dataset object containing 2 images and 0 object
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        Empty DataFrame
        Columns: [image_id, category_str, category_id, split, box_x_min, box_y_min, box_width, box_height]
        Index: []
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    # Selecting an empty list of rows keeps the columns and drops every annotation.
    no_rows = []
    return self.iloc_annot[no_rows]
[docs]
def init_images(self):
    """Validate and normalize the ``images`` dataframe.

    - check that the required columns are present
    - refuse duplicated image ids
    - name the index ``id`` and convert ``relative_path`` values to ``Path``
    - add the ``type`` column (file suffix) when it is missing
    - apply the default dtypes and reorder the columns, the default columns being
      given to ``reorder_columns`` in their declaration order

    Raises:
        ValueError: if the index of the images dataframe has duplicates.
    """
    from ..utils.dataframe_formatter import reorder_columns, set_dataframe_dtypes
    from ..utils.testing import assert_required_columns_present

    raw_images = self.images
    assert_required_columns_present(
        raw_images, self._image_required_columns, df_name="images"
    )
    if raw_images.index.has_duplicates:
        raise ValueError(
            "Dataset images ids are not exclusive, it will create ambiguity for"
            " annotation"
        )
    raw_images.index.name = "id"

    # assign gives back a new dataframe, the column insertion below therefore does
    # not write into the dataframe given by the caller.
    as_paths = raw_images["relative_path"].apply(Path)  # pyright: ignore
    self.images = raw_images.assign(relative_path=as_paths)
    if "type" not in self.images.columns:
        suffixes = self.images["relative_path"].apply(lambda path: path.suffix)
        self.images["type"] = suffixes

    default_types = self._default_image_columns_with_types
    self.images = set_dataframe_dtypes(
        self.images,
        default_types,
        nullable_types=["split"],
    )
    self.images = reorder_columns(self.images, list(default_types), ".")
[docs]
def init_annotations(self):
    """Initialize annotations by adding info and checking index

    - check that required columns are present and that every ``image_id`` refers
      to an existing image
    - complete the label map with the string version of category ids it misses
    - add ``category_str`` column (for informative purpose only, label map prevails)
    - add ``split`` column (for informative purpose only, images split prevails)
    - reset index if it has duplicates.
    - apply the right dtypes
    - reorder the columns so that required columns are first and attributes last

    Raises:
        ValueError: if some annotations refer to an image id which is absent from
            the index of the images dataframe.
    """
    from ..utils.dataframe_formatter import reorder_columns, set_dataframe_dtypes
    from ..utils.testing import assert_required_columns_present

    assert_required_columns_present(
        self.annotations,
        required_columns=self._annotations_required_columns,
        df_name="annotations",
    )
    valid_image_ids = self.annotations["image_id"].isin(self.images.index)
    if not valid_image_ids.all():
        wrong_ids = (
            self.annotations.loc[~valid_image_ids, "image_id"].unique().tolist()
        )
        # Image ids are typically integers while str.join only accepts strings:
        # convert them first, otherwise a TypeError would hide this ValueError.
        raise ValueError(
            "The following image ids are not present in the dataset's images"
            f" dataframe: {', '.join(map(str, wrong_ids))}"
        )
    self.annotations.index.name = "id"

    all_cat_ids = set(self.annotations["category_id"].unique())
    if not all_cat_ids.issubset(self.label_map):
        missing_ids = all_cat_ids - self.label_map.keys()
        warn(
            "Incomplete Label map, setting following label of the following id to"
            f" their string equivalent : {missing_ids}"
        )
        for i in missing_ids:
            self.label_map[i] = str(i)
    # Cast to object first so that integer ids can be replaced with strings
    self.annotations["category_str"] = (
        self.annotations["category_id"].astype(object).replace(self.label_map)
    )
    if "split" in self.images.columns:
        # .values drops the image index, so that the assignment is positional
        self.annotations["split"] = self.images.loc[
            self.annotations["image_id"], "split"
        ].values
    self.annotations = set_dataframe_dtypes(
        self.annotations,
        self._default_annotation_columns_with_types,
        nullable_types=["split"],
    )
    if self.annotations.index.has_duplicates:
        warn(
            "Dataset annotations have duplicates ids, resetting them ...",
            RuntimeWarning,
        )
        self.annotations.index = pd.RangeIndex(len(self.annotations), name="id")
    self.annotations = reorder_columns(
        self.annotations,
        list(self._default_annotation_columns_with_types.keys()),
        ".",
    )
    # TODO maybe have a few checks on annotations integrity ?
    # Bboxes size, position, etc
[docs]
def reset_images_root(self, new_path: Path | str) -> Self:
"""Replace the images_root with a new path. Relative path to images are updated
accordingly so that ``new_path/new_relative_path`` still point to the right
path.
Args:
new_path: New path to replace current images_root with
Returns:
New dataset object with updated images (relative_path column) and
images_root.
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(2, 2, seed=1, generate_real_images=True)
>>> example
Dataset object containing 2 images and 2 objects
Name :
shake_effort_many
Images root :
/tmp/care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 listen 14 ... 276.974642 9.718823 184.684056
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{14: 'listen', 15: 'marriage', 22: 'reach'}
>>> example.check()
Checking Image and annotations Ids ...
Checking Bounding boxes ..
Checking label map ...
Checking images are valid ...
>>> example = example.reset_images_root("/tmp/")
>>> example
Dataset object containing 2 images and 2 objects
Name :
shake_effort_many
Images root :
/tmp
Images :
width height relative_path type split
id
0 955 229 care/suggest/determine/story.jpg .jpg train
1 131 840 care/suggest/air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 listen 14 ... 276.974642 9.718823 184.684056
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{14: 'listen', 15: 'marriage', 22: 'reach'}
>>> example.check()
Checking Image and annotations Ids ...
Checking Bounding boxes ..
Checking label map ...
Checking images are valid ...
"""
new_path = Path(new_path)
new_path_absolute = new_path.absolute()
relative_path = Path(normpath(relpath(self.images_root, new_path)))
new_image_paths = self.images["relative_path"].apply(
lambda x: Path( # pyright: ignore
normpath(
relpath(
new_path_absolute / relative_path / x,
new_path_absolute,
)
)
)
)
new_images = self.images.assign(relative_path=new_image_paths)
return self.from_template(
images=new_images, images_root=new_path, reset_booleanized=False
)
[docs]
def check(
    self,
    check_symlink: bool = False,
    allow_keypoints: bool = False,
    check_exhaustive: bool = False,
):
    """Run the full detection dataset check on this dataset.

    Make a full check of dataset, Ids, Bounding boxes, label maps and images

    See Also:
        :func:`.full_check_dataset_detection`

    Args:
        check_symlink: Whether the dataset should be using symlinks.
            Defaults to False.
        allow_keypoints: Whether a bounding box with a width and height of 0
            is acceptable and assumed to be a keypoint
        check_exhaustive: If set to True, will check that all images in the
            images_root folder are in the image dataframe, and that the dataset is
            indeed exhaustive
    """
    from lours.utils.testing import full_check_dataset_detection

    # Every option is forwarded as is to the checking function.
    check_options = {
        "check_symlink": check_symlink,
        "allow_keypoints": allow_keypoints,
        "check_exhaustive": check_exhaustive,
    }
    full_check_dataset_detection(self, **check_options)
[docs]
def remove_invalid_images(self, load_images: bool = True) -> Self:
    """Drop the images detected as invalid, along with their annotations.

    The number of removed images and annotations is printed before returning.

    See Also:
        - :func:`.get_invalid_images`
        - :meth:`.remove_invalid_annotations`

    Args:
        load_images: If set to True, will not only check that images are valid
            files, but also that image can be loaded (i.e. are not corrupted files)
            and that their sizes match the ones included in ``images``
            dataframe. Note that this makes the function significantly slower.
            Defaults to True.

    Returns:
        The same dataset, without the invalid images and their related annotations.

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1, generate_real_images=True)
        >>> example.images.loc[0, "relative_path"] = Path("bad_path.jpg")
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        /tmp/care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 bad_path.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.remove_invalid_images()
        Removed 1 image, with 1 annotation
        Dataset object containing 1 image and 1 object
        Name :
        shake_effort_many
        Images root :
        /tmp/care/suggest
        Images :
        width height relative_path type split
        id
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    from lours.utils.testing import get_invalid_images

    invalid_images = get_invalid_images(
        self,
        check_symlink=False,
        load_images=load_images,
        raise_if_error=False,
    )
    invalid_ids = invalid_images.index

    # Count what is about to be dropped, for the summary message
    n_invalid_images = len(invalid_images)
    n_invalid_annotations = self.loc[invalid_ids].len_annot()
    images_suffix = "s" if n_invalid_images > 1 else ""
    annotations_suffix = "s" if n_invalid_annotations > 1 else ""
    print(
        f"Removed {n_invalid_images} image{images_suffix}, with"
        f" {n_invalid_annotations} annotation{annotations_suffix}"
    )

    is_valid_image = ~self.images.index.isin(invalid_ids)
    return self.loc[is_valid_image]
[docs]
def remove_invalid_annotations(
    self,
    allow_keypoints: bool = False,
    remove_related_images: bool = False,
    remove_emptied_images: bool = False,
) -> Self:
    """Drop the annotations with malformed bounding boxes from the dataset.

    Optionally, remove images that have at least one invalid annotation, or remove
    images that have only invalid annotations. A summary of what was removed is
    printed before returning.

    See Also:
        - :func:`.get_malformed_bounding_boxes`
        - :meth:`.filter_annotations`
        - :meth:`.remove_invalid_images`

    Args:
        allow_keypoints: If set to True, will keep keypoints, i.e. bounding box
            with height and width of 0. Otherwise, will remove them.
            Defaults to False.
        remove_related_images: If set to True, will remove any image that has an
            invalid annotation. Defaults to False.
        remove_emptied_images: If set to True, will remove images that are empty
            after removing the invalid annotations. In other word, remove images
            where all annotations are invalid. Note that already empty images
            are not removed. Defaults to False.

    Returns:
        The same dataset, without the invalid annotations and optionally without
        their related and/or emptied images.

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 4, seed=1)
        >>> example.annotations.loc[0, "box_width"] = -1
        >>> example
        Dataset object containing 2 images and 4 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg eval
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 marriage 15 ... 276.974642 -1.000000 353.331683
        1 0 listen 14 ... 64.213606 358.653949 116.336568
        2 0 reach 22 ... 69.431616 525.305264 41.677117
        3 1 listen 14 ... 380.938227 36.133726 442.881021
        <BLANKLINE>
        [4 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.remove_invalid_annotations()
        Removed 1 annotation, in 1 image
        Dataset object containing 2 images and 3 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg eval
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 listen 14 ... 64.213606 358.653949 116.336568
        2 0 reach 22 ... 69.431616 525.305264 41.677117
        3 1 listen 14 ... 380.938227 36.133726 442.881021
        <BLANKLINE>
        [3 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.remove_invalid_annotations(remove_related_images=True)
        Removed 1 image with invalid annotations
        Dataset object containing 1 image and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg eval
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 listen 14 ... 64.213606 358.653949 116.336568
        2 0 reach 22 ... 69.431616 525.305264 41.677117
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 4, seed=1)
        >>> example.annotations.loc[[0, 3], "box_width"] = -1
        >>> example
        Dataset object containing 2 images and 4 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg eval
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 marriage 15 ... 276.974642 -1.000000 353.331683
        1 0 listen 14 ... 64.213606 358.653949 116.336568
        2 0 reach 22 ... 69.431616 525.305264 41.677117
        3 1 listen 14 ... 380.938227 -1.000000 442.881021
        <BLANKLINE>
        [4 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.remove_invalid_annotations(remove_emptied_images=True)
        Removed 2 annotations, in 1 image
        Dataset object containing 1 image and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg eval
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 listen 14 ... 64.213606 358.653949 116.336568
        2 0 reach 22 ... 69.431616 525.305264 41.677117
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
    """
    from lours.utils.testing import get_malformed_bounding_boxes

    invalid_annots = get_malformed_bounding_boxes(
        self, allow_keypoints=allow_keypoints, raise_if_error=False
    )
    # Ids of the images holding at least one invalid annotation
    invalid_images = self.annotations.loc[invalid_annots.index, "image_id"].unique()
    n_images = len(invalid_images)
    images_word = f"image{'s' if n_images > 1 else ''}"

    if remove_related_images:
        # Whole images are dropped: their annotations go away with them
        print(f"Removed {n_images} {images_word} with invalid annotations")
        keep_image = ~self.images.index.isin(invalid_images)
        return self.loc[keep_image]

    n_annots = len(invalid_annots)
    annots_word = f"annotation{'s' if n_annots > 1 else ''}"
    print(f"Removed {n_annots} {annots_word}, in {n_images} {images_word}")
    keep_annotation = ~self.annotations.index.isin(invalid_annots.index)
    return self.filter_annotations(
        keep_annotation,
        remove_emptied_images=remove_emptied_images,
    )
[docs]
def get_one_frame(self, n: int) -> tuple[pd.Series, pd.DataFrame]:
    """Retrieve the n-th image of the dataset with its annotations.

    Image data is returned as a pandas Series, and corresponding annotations
    are returned as a DataFrame.

    This is equivalent to ``dataset.iloc[n]`` except the returned object is the bare
    image info and annotation dataframe. This can be useful when using lours as e.g.
    a pytorch dataset.

    Note:
        The id of the image is the name of the image Series

    Args:
        n: row number of wanted image. Note that this does NOT use the index of
            self.images.

    Returns:
        tuple containing image data as Series and annotations as a (possibly empty)
        DataFrame.

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Iterating-through-the-dataset>`
        - :meth:`iter_images`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> frame, annotations = example.get_one_frame(0)
        >>> frame
        width 955
        height 229
        relative_path determine/story.jpg
        type .jpg
        split train
        Name: 0, dtype: object
        >>> annotations
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [1 rows x 8 columns]
    """
    # Positional lookup of the image row, then of its id in the index
    image_data = self.images.iloc[n]
    image_id = self.images.index[n]
    belongs_to_image = self.annotations["image_id"] == image_id
    return image_data, self.annotations[belongs_to_image]
[docs]
def iter_images(self) -> Iterator[tuple[pd.Series, pd.DataFrame]]:
    """Iterate through images, in row order, yielding the result of
    :meth:`get_one_frame` for each of them.

    Yields:
        tuple containing:

        - image Series with image data, and named as the image id
        - annotations DataFrame

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Iterating-through-the-dataset>`
        - :meth:`get_one_frame`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> for i, (frame, annot) in enumerate(example.iter_images()):
        ...     print(f"Frame {i}")
        ...     print(frame)
        ...     print(annot)
        ...
        Frame 0
        width 955
        height 229
        relative_path determine/story.jpg
        type .jpg
        split train
        Name: 0, dtype: object
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [1 rows x 8 columns]
        Frame 1
        width 131
        height 840
        relative_path air/method.bmp
        type .bmp
        split train
        Name: 1, dtype: object
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        <BLANKLINE>
        [1 rows x 8 columns]
    """
    # ``map`` is lazy, so frames are still built one at a time, on demand
    row_numbers = range(len(self))
    yield from map(self.get_one_frame, row_numbers)
[docs]
def get_image_attributes(self) -> list[str]:
    """List the columns of ``self.images`` that hold image attributes, i.e. the
    columns that are NOT part of the default image columns.

    The actual attribute values can then be
    ``self.images[self.get_image_attributes()]``

    Returns:
        list of column names in ``self.images`` that represent tags, in the
        order of the dataframe columns and converted to ``str``

    See Also:
        :meth:`get_annotations_attributes`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example.images["something"] = True
        >>> example.images["else"] = 10
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split something else
        id
        0 955 229 determine/story.jpg .jpg train True 10
        1 131 840 air/method.bmp .bmp train True 10
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.get_image_attributes()
        ['something', 'else']
    """
    default_columns = self._default_image_columns_with_types
    return [
        str(column) for column in self.images.columns if column not in default_columns
    ]
[docs]
def get_annotations_attributes(self) -> list[str]:
    """List the columns of ``self.annotations`` that hold annotation attributes,
    i.e. the columns that are NOT part of the default annotation columns.

    The actual attribute values can then be
    ``self.annotations[self.get_annotations_attributes()]``

    Returns:
        list of column names in ``self.annotations`` that represent attributes,
        in the order of the dataframe columns and converted to ``str``

    See Also:
        :meth:`get_image_attributes`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 2, seed=1)
        >>> example.annotations["else"] = 10
        >>> example.annotations["something"] = True
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_height else something
        id ...
        0 1 listen 14 ... 184.684056 10 True
        1 0 reach 22 ... 174.239136 10 True
        <BLANKLINE>
        [2 rows x 10 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.get_annotations_attributes()
        ['else', 'something']
    """
    default_columns = self._default_annotation_columns_with_types
    return [
        str(column)
        for column in self.annotations.columns
        if column not in default_columns
    ]
[docs]
def __getitem__(self, args: Any) -> Self:
"""``__getitem__`` implementation for the Dataset object. The iteration is made
image wise. Constructs a sub dataset so that the image index of that new
dataset is the result of ``self.images[args]``. ``args`` could be anything like
slices, ellipsis and so on.
Note:
This is equivalent to calling the dataset indexer :attr:`.Dataset.iloc`
Args:
args: usual parameters for indexing or slicing a numpy array or a pandas
array with ``iloc``. This will be used to index image indices for the
returned dataset object.
Returns:
Sub-dataset including image data indices and corresponding annotations
"""
return self.iloc[args]
[docs]
def __len__(self) -> int:
    """Give the number of images in the dataset.

    To get the number of annotations instead,
    use the method :meth:`.Dataset.len_annot`

    Returns:
        Length of ``self.images`` dataframe
    """
    # The length of a dataframe is the length of its index
    return len(self.images.index)
[docs]
def len_annot(self) -> int:
    """Give the total number of annotations in the dataset.

    Returns:
        Length of ``self.annotations`` dataframe
    """
    # The length of a dataframe is the length of its index
    return len(self.annotations.index)
def __bool__(self) -> bool:
    """Tell whether the dataset has at least one image.

    Returns:
        True if ``self.images`` has one row or more, False otherwise
    """
    n_images = len(self.images)
    return bool(n_images)
def _description(self) -> str:
    """Build the textual header of the dataset: number of images and objects,
    dataset name and images root.

    Returns:
        Multi-line description, without the dataframes nor the label map
    """
    n_images = len(self)
    n_annotations = self.len_annot()
    # Plural is only used from 2 elements onward
    images_word = "image" if n_images <= 1 else "images"
    annotations_word = "object" if n_annotations <= 1 else "objects"
    return (
        f"Dataset object containing {n_images:,} {images_word} "
        f"and {n_annotations:,} {annotations_word}\n"
        f"Name :\n\t{self.dataset_name}\n"
        f"Images root :\n\t{self.images_root}"
    )
def __repr__(self) -> str:
    """Give the full textual representation of the dataset: the description
    header, followed by the images and annotations dataframes and the label map.
    """
    sections = (
        self._description(),
        f"Images :\n{self.images}",
        f"Annotations :\n{self.annotations}",
        f"Label map :\n{pformat(self.label_map)}",
    )
    return "\n".join(sections)
def _ipython_display_(self):
    """Display the Dataset as an HTML widget when using notebooks.

    Outside of a notebook (i.e. when the current IPython shell is not a
    ``ZMQInteractiveShell``), the dataset is simply printed.

    In a notebook, the description is shown as a bold title, above a tab widget
    with three tabs: the images dataframe, the annotations dataframe and the label
    map.

    Note:
        The widget related packages are only imported once we know we are in a
        notebook, so that the plain text fallback does not depend on them.
    """
    from IPython.core.getipython import get_ipython

    try:
        from ipykernel.zmqshell import ZMQInteractiveShell
    except ImportError:
        # Without ipykernel, the current shell cannot be a notebook kernel
        is_notebook = False
    else:
        is_notebook = isinstance(get_ipython(), ZMQInteractiveShell)
    if not is_notebook:
        print(self)
        return

    # Only needed for the widget display, hence imported after the fallback
    import ipywidgets as widgets
    from IPython.display import display

    from ..utils.notebook_utils import display_booleanized_dataframe

    tab = widgets.Tab()
    descr_str = (
        "<p><span style='white-space: pre-wrap; font-weight:"
        f" bold'>{self._description()}</span></p>"
    )
    title = widgets.HTML(descr_str)
    # The label map is shown as a one-column dataframe, sorted by category id
    label_map_df = (
        pd.Series(self.label_map, name="category string").to_frame().sort_index()
    )
    label_map_df.index.name = "category_id"
    # create output widgets
    widget_images = widgets.Output()
    widget_annotations = widgets.Output()
    widget_label_map = widgets.Output()
    # render in output widgets
    with widget_images:
        display_booleanized_dataframe(
            self.images, self.booleanized_columns["images"]
        )
    with widget_annotations:
        display_booleanized_dataframe(
            self.annotations, self.booleanized_columns["annotations"]
        )
    with widget_label_map:
        display(label_map_df)
    tab.children = [widget_images, widget_annotations, widget_label_map]
    tab.titles = ["Images", "Annotations", "Label Map"]
    display(widgets.VBox([title, tab]))
[docs]
def get_split(self, split: str | None) -> Self:
"""Get a particular split from the dataset
Args:
split: Split name, usually "train", "val", "eval". If set to None, will
retrieve all image with a null split value (None, pd.NA or np.nan)
Returns:
filtered dataset, with only samples within the wanted split
See Also:
:meth:`iter_splits`
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(
... 2,
... 2,
... split_names=["train", "eval"],
... split_shares=[0.5, 0.5],
... seed=14,
... )
>>> example
Dataset object containing 2 images and 2 objects
Name :
present_wait_even
Images root :
blood/reflect
Images :
width height relative_path type split
id
0 424 732 listen/reason.bmp .bmp train
1 179 413 return/man.jpeg .jpeg eval
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 film 24 ... 192.940695 2.862400 74.219110
1 0 especially 4 ... 419.039943 276.766197 119.753886
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{4: 'especially', 19: 'similar', 24: 'film'}
>>> example.get_split("eval")
Dataset object containing 1 image and 1 object
Name :
present_wait_even
Images root :
blood/reflect
Images :
width height relative_path type split
id
1 179 413 return/man.jpeg .jpeg eval
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 film 24 ... 192.940695 2.8624 74.21911
<BLANKLINE>
[1 rows x 8 columns]
Label map :
{4: 'especially', 19: 'similar', 24: 'film'}
"""
if "split" not in self.images.columns:
warn("Dataset has no split value")
return self.loc[[]]
if split is not None:
split_image_ids = self.images["split"] == split
else:
split_image_ids = self.images["split"].isnull()
return self.loc[split_image_ids]
[docs]
def iter_splits(self) -> Iterator[tuple[str | None, Self]]:
"""Iterate though split values of the dataset, by yielding for each split
the split name and the corresponding sub-dataset.
If no split is available, the split value is assumed to be ``None`` for the
whole dataset.
Yields:
tuple containing:
- the name of the split
- the corresponding subset of the original dataset
See Also:
:meth`get_split`
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(2, 2, seed=2)
>>> example
Dataset object containing 2 images and 2 objects
Name :
argue_be_structure
Images root :
what/way
Images :
width height relative_path type split
id
0 368 832 police/enter.jpeg .jpeg train
1 472 506 also/policy.gif .gif train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 0 table 7 ... 228.774514 137.766169 131.174304
1 0 relationship 3 ... 546.984268 34.928954 9.871084
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{3: 'relationship', 7: 'table', 25: 'simply'}
>>> for split_name, split in example.iter_splits():
... print(f"Split: {split_name}")
... print(split)
...
Split: train
Dataset object containing 2 images and 2 objects
Name :
argue_be_structure
Images root :
what/way
Images :
width height relative_path type split
id
0 368 832 police/enter.jpeg .jpeg train
1 472 506 also/policy.gif .gif train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 0 table 7 ... 228.774514 137.766169 131.174304
1 0 relationship 3 ... 546.984268 34.928954 9.871084
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{3: 'relationship', 7: 'table', 25: 'simply'}
"""
if "split" not in self.images.columns:
yield None, self
for split_value in self.images["split"].unique():
yield split_value, self.get_split(split_value)
[docs]
def reset_index(
    self,
    start_image_id: int = 0,
    start_annotations_id: int = 0,
    sort_images_by: None | str | Sequence[str] = "relative_path",
    sort_annotations_by: None | str | Sequence[str] = (
        "image_id",
        "category_id",
        *BBOX_COLUMN_NAMES,
    ),
) -> Self:
    """Reset index of ``self.images`` dataframe, and reset index of self.annotations

    However, keep the 'image_id' column in ``self.annotations`` pointing to the
    right rows in the ``self.images`` dataframe.

    Note:
        Both images and annotations dataframes will be reordered according to
        specific columns. You can change them with the ``sort_images_by`` and
        ``sort_annotations_by`` parameters, but the default behaviour is:

        - images dataframe will be reordered according to ``relative_path``
        - annotations dataframe will be reordered according to ``image_id``,
          ``category_id`` and the bounding box coordinates, i.e. ``box_x_min``,
          ``box_y_min``, ``box_width`` and ``box_height``

        Whether they are sorted or not, both dataframes get a range index
        starting at the given start ids.

    Args:
        start_image_id: Number at which the image index starts. This is used to
            construct two datasets without overlapping ids.
        start_annotations_id: Similar to start_image_id, number at which the
            annotations index starts.
        sort_images_by: columns to sort the images dataframe by. It is advised to
            chose a collection of columns that makes the sorting unique. If set to
            None or an empty sequence, will not sort the images dataframe before
            applying a range index to it. Defaults to ``relative_path``
        sort_annotations_by: columns to sort the annotations dataframe by. It is
            advised to chose a collection of columns that makes the sorting unique.
            If set to None or an empty sequence, will not sort the annotations
            dataframe before applying a range index to it. Defaults to
            ``("image_id", "category_id", "box_x_min",
            "box_y_min", "box_width", "box_height")``.

    Returns:
        Dataset with ``self.images`` and ``self.annotations`` with updated indexes

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Resetting-index>`
        - :meth:`reset_index_from_mapping`
        - :meth:`match_index`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(10, 10, seed=2)
        >>> example.iloc[1::2]
        Dataset object containing 5 images and 4 objects
        Name :
        argue_be_structure
        Images root :
        what/way
        Images :
        width height relative_path type split
        id
        1 472 892 also/policy.gif .gif train
        3 506 602 increase/pull.jpg .jpg val
        5 401 281 would/off.jpeg .jpeg train
        7 831 375 ahead/truth.bmp .bmp train
        9 993 334 husband/whatever.jpeg .jpeg eval
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 relationship 3 ... 606.391824 29.194750 194.387036
        1 7 relationship 3 ... 313.193702 230.609055 5.269920
        5 9 simply 25 ... 198.210135 474.192703 57.594892
        9 9 table 7 ... 60.522880 425.022919 144.458578
        <BLANKLINE>
        [4 rows x 8 columns]
        Label map :
        {3: 'relationship', 7: 'table', 25: 'simply'}
        >>> example.iloc[1::2].reset_index(10, 5)
        Dataset object containing 5 images and 4 objects
        Name :
        argue_be_structure
        Images root :
        what/way
        Images :
        width height relative_path type split
        id
        10 831 375 ahead/truth.bmp .bmp train
        11 472 892 also/policy.gif .gif train
        12 993 334 husband/whatever.jpeg .jpeg eval
        13 506 602 increase/pull.jpg .jpg val
        14 401 281 would/off.jpeg .jpeg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        5 10 relationship 3 ... 313.193702 230.609055 5.269920
        6 11 relationship 3 ... 606.391824 29.194750 194.387036
        7 12 table 7 ... 60.522880 425.022919 144.458578
        8 12 simply 25 ... 198.210135 474.192703 57.594892
        <BLANKLINE>
        [4 rows x 8 columns]
        Label map :
        {3: 'relationship', 7: 'table', 25: 'simply'}
    """

    def as_column_list(columns: None | str | Sequence[str]) -> list[str]:
        # Normalize the "sort by" arguments to a (possibly empty) list of columns
        if columns is None:
            return []
        if isinstance(columns, str):
            return [columns]
        return list(columns)

    image_sort_columns = as_column_list(sort_images_by)
    annotation_sort_columns = as_column_list(sort_annotations_by)

    new_images = self.images
    if image_sort_columns:
        new_images = new_images.sort_values(image_sort_columns)
    # Temporary column holding the new id of each image, in the sorted order
    new_images = new_images.assign(
        new_id=np.arange(start_image_id, start_image_id + len(self))
    )
    # Translate the former image ids of the annotations into the new ones
    new_annotations = self.annotations.assign(
        image_id=new_images.loc[self.annotations["image_id"], "new_id"].to_numpy()
    )
    if annotation_sort_columns:
        new_annotations = new_annotations.sort_values(annotation_sort_columns)
    # The former index is dropped in every case: without sorting columns, the
    # annotations must still get a range index, and not only an offset index
    new_annotations = new_annotations.reset_index(drop=True)
    new_annotations.index.name = "id"
    new_annotations.index += start_annotations_id
    new_images = new_images.set_index("new_id")
    new_images.index.name = "id"
    return self.from_template(images=new_images, annotations=new_annotations)
[docs]
def reset_index_from_mapping(
self,
images_index_map: dict[int, int] | pd.DataFrame | pd.Series | None = None,
annotations_index_map: dict[int, int] | pd.DataFrame | pd.Series | None = None,
remove_unmapped: bool = False,
) -> Self:
"""Reset index of images and annotations dataframe with index maps
(index -> new_index) where the value is new index to apply.
The mapping can be either a dictionary, a pandas Series or a DataFrame with
only one column. If the dataframe has more than 1 column, this function will
raise an error
Args:
images_index_map: Mapping from original image index to new image index.
If it is a DataFrame, it must have only one column.
If set to None, will apply the identity mapping. Defaults to None.
annotations_index_map: Mapping. Same as ``images_index_map``, but this
mapping applies for annotations. If set to None, will apply the identity
mapping. Default to None.
remove_unmapped: If set to True, will remove the entries in the original
dataframes which index is not present in the given mappings. Otherwise,
will apply a default mapping so that it is bijective. A range index
starting at the highest mapped index+1 will be applied to the missing
values in the mapping index. Defaults to False.
Returns:
Dataset: new dataset instance with images and annotations dataframes which
index have been remapped. The annotations will be filtered out according to
removed images, and its "image_id" column will be modified to match the new
image index.
See Also:
- :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Reindex-with-mapping>`
- :meth:`reset_index`
- :meth:`match_index`
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(3, 3, seed=2)
>>> example
Dataset object containing 3 images and 3 objects
Name :
argue_be_structure
Images root :
what/way
Images :
width height relative_path type split
id
0 368 506 police/enter.jpeg .jpeg train
1 472 182 also/policy.gif .gif train
2 832 401 cold/responsibility.png .png val
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 relationship 3 ... 27.311332 69.768824 97.006466
1 2 simply 25 ... 157.041558 20.174848 16.443389
2 2 relationship 3 ... 75.088280 337.101681 193.299936
<BLANKLINE>
[3 rows x 8 columns]
Label map :
{3: 'relationship', 7: 'table', 25: 'simply'}
Note that unmapped index gets remapped to a range index starting after the
highest value of mapped index, hence the annotation id "2" that gets mapped
to "3" even if index "1" was available.
>>> example.reset_index_from_mapping(
... images_index_map={0: 1, 2: 0}, annotations_index_map={1: 2, 2: 0}
... )
Dataset object containing 3 images and 3 objects
Name :
argue_be_structure
Images root :
what/way
Images :
width height relative_path type split
id
1 368 506 police/enter.jpeg .jpeg train
0 832 401 cold/responsibility.png .png val
2 472 182 also/policy.gif .gif train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
2 0 simply 25 ... 157.041558 20.174848 16.443389
0 0 relationship 3 ... 75.088280 337.101681 193.299936
3 2 relationship 3 ... 27.311332 69.768824 97.006466
<BLANKLINE>
[3 rows x 8 columns]
Label map :
{3: 'relationship', 7: 'table', 25: 'simply'}
>>> example.reset_index_from_mapping(
... images_index_map={0: 1, 2: 0},
... annotations_index_map={1: 2, 2: 0},
... remove_unmapped=True,
... )
Dataset object containing 2 images and 2 objects
Name :
argue_be_structure
Images root :
what/way
Images :
width height relative_path type split
id
1 368 506 police/enter.jpeg .jpeg train
0 832 401 cold/responsibility.png .png val
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
2 0 simply 25 ... 157.041558 20.174848 16.443389
0 0 relationship 3 ... 75.088280 337.101681 193.299936
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{3: 'relationship', 7: 'table', 25: 'simply'}
"""
def convert_mapping_to_series(
input_mapping: dict[int, int] | pd.DataFrame | pd.Series | None,
mapping_name: str,
) -> pd.Series | None:
if isinstance(input_mapping, pd.DataFrame):
if len(input_mapping.columns) > 1:
raise ValueError(
"Index mapping can only be a Series or a DataFrame with 1"
f" column. The mapping {mapping_name} got"
f" {len(input_mapping.columns)} columns instead"
)
return input_mapping.iloc[:, 0]
if isinstance(input_mapping, dict):
input_mapping = pd.Series(input_mapping)
return input_mapping
images_index_map = convert_mapping_to_series(
images_index_map, "images_index_map"
)
annotations_index_map = convert_mapping_to_series(
annotations_index_map, "annotations_index_map"
)
def reindex_dataframe(
input_df: pd.DataFrame,
index_mapping: pd.Series | None,
remove_unmapped: bool,
) -> tuple[pd.DataFrame, pd.Series | None]:
if index_mapping is None:
return input_df, None
mapped_index = input_df.index.intersection(index_mapping.index)
index_mapping = index_mapping.loc[mapped_index]
if len(mapped_index) != len(input_df) and not remove_unmapped:
unmapped = input_df.index.difference(
mapped_index,
sort=False,
)
residual_mapping = pd.Series(
np.arange(len(unmapped)) + index_mapping.max() + 1,
index=unmapped,
)
index_mapping = pd.concat([index_mapping, residual_mapping])
return (
input_df.loc[index_mapping.index].set_index(index_mapping),
index_mapping,
)
new_annotations, _ = reindex_dataframe(
self.annotations, annotations_index_map, remove_unmapped
)
new_images, images_index_map = reindex_dataframe(
self.images, images_index_map, remove_unmapped
)
if images_index_map is not None:
if remove_unmapped and len(images_index_map) < len(self):
new_annotations = new_annotations[
new_annotations["image_id"].isin(images_index_map.index)
]
new_annotations = new_annotations.assign(
image_id=images_index_map.loc[new_annotations["image_id"]].values
)
return self.from_template(images=new_images, annotations=new_annotations)
[docs]
def match_index(
    self,
    other_images: "pd.DataFrame | Dataset",
    on: str = "relative_path",
    remove_unmatched: bool = False,
) -> Self:
    """Reindex a dataset from another images DataFrame.

    The given ``on`` column is used as an anchor: every image of this dataset
    whose ``on`` value is found in the reference images dataframe takes the index
    value of the corresponding reference row.

    Note:
        Images whose value in the ``on`` column does not match any row in
        ``other_images`` are kept (unless ``remove_unmatched`` is set). They are
        given new index values following the matched ones, in their existing
        order, without any sorting.

    Args:
        other_images: images DataFrame taken from another dataset, or the other
            dataset itself (its ``images`` dataframe is then used). Must have the
            column specified in ``on``
        on: name of the column to use to retrieve indexes. Must be present in
            the columns of both ``self.images`` and ``other_images``, without
            duplicate values. Defaults to "relative_path".
        remove_unmatched: if set to True, will remove images from dataset that don't
            match any row in the ``other_images`` dataframe. The corresponding
            annotations will also be removed.

    Raises:
        ValueError: if the ``on`` column has duplicate values in either images
            dataframe.

    Returns:
        Dataset with updated image indexes, along with values in ``image_id`` column
        of annotations.

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Reindex-images-index-from-other-dataframe>`
        - :meth:`reset_index`
        - :meth:`reset_index_from_mapping`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(5, 5, seed=2)
        >>> example
        Dataset object containing 5 images and 5 objects
        Name :
        argue_be_structure
        Images root :
        what/way
        Images :
        width height relative_path type split
        id
        0 368 401 police/enter.jpeg .jpeg train
        1 472 640 also/policy.gif .gif val
        2 832 831 cold/responsibility.png .png train
        3 506 755 increase/pull.jpg .jpg train
        4 182 993 Mr/trade.tiff .tiff train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 simply 25 ... 273.908994 168.756932 4.288302
        1 4 table 7 ... 106.456857 19.340529 282.426602
        2 0 simply 25 ... 41.921967 38.506811 33.166314
        3 2 table 7 ... 167.785089 242.139038 119.708224
        4 1 simply 25 ... 327.082223 234.360304 238.965568
        <BLANKLINE>
        [5 rows x 8 columns]
        Label map :
        {3: 'relationship', 7: 'table', 25: 'simply'}
        >>> images_modified = example.images.iloc[::2].reset_index(drop=True)
        >>> images_modified
        width height relative_path type split
        0 368 401 police/enter.jpeg .jpeg train
        1 832 831 cold/responsibility.png .png train
        2 182 993 Mr/trade.tiff .tiff train
        >>> example.match_index(images_modified)
        Dataset object containing 5 images and 5 objects
        Name :
        argue_be_structure
        Images root :
        what/way
        Images :
        width height relative_path type split
        id
        0 368 401 police/enter.jpeg .jpeg train
        1 832 831 cold/responsibility.png .png train
        2 182 993 Mr/trade.tiff .tiff train
        3 472 640 also/policy.gif .gif val
        4 506 755 increase/pull.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 simply 25 ... 273.908994 168.756932 4.288302
        1 2 table 7 ... 106.456857 19.340529 282.426602
        2 0 simply 25 ... 41.921967 38.506811 33.166314
        3 1 table 7 ... 167.785089 242.139038 119.708224
        4 3 simply 25 ... 327.082223 234.360304 238.965568
        <BLANKLINE>
        [5 rows x 8 columns]
        Label map :
        {3: 'relationship', 7: 'table', 25: 'simply'}
    """
    # A whole dataset may be given as reference: only its images are needed
    reference_images = (
        other_images.images if isinstance(other_images, Dataset) else other_images
    )

    reference_anchor = pd.Index(reference_images[on])
    own_anchor = pd.Index(self.images[on])

    # The anchor column must identify rows unambiguously on both sides
    if reference_anchor.has_duplicates:
        raise ValueError(
            f"The column {on} of the input image dataframe has duplicate values"
        )
    if own_anchor.has_duplicates:
        raise ValueError(
            f"The column {on} of the dataset's image dataframe has duplicate values"
        )

    # Build two Series indexed by the "on" anchor, holding the image ids of each
    # side, so that aligning them tells which id in self.images corresponds to
    # which id in the reference images.
    reference_ids = pd.Series(
        reference_images.index, index=reference_anchor, name="other_id"
    )
    own_ids = pd.Series(self.images.index, index=own_anchor, name="self_id")

    # The "inner" join drops the anchor values without correspondence.
    aligned_ids = pd.concat([reference_ids, own_ids], join="inner", axis=1)
    # Indexing by the current id and keeping the reference id gives a Series
    # modelling the "original id -> new id" mapping.
    matched_ids_map = aligned_ids.set_index("self_id")["other_id"]

    return self.reset_index_from_mapping(
        images_index_map=matched_ids_map, remove_unmapped=remove_unmatched
    )
[docs]
def merge(
    self,
    other: "Dataset",
    allow_overlapping_image_ids: bool = True,
    realign_label_map: bool = False,
    ignore_index: bool = False,
    mark_origin: bool = False,
    overwrite_origin: bool = False,
) -> "Dataset":
    """Merge two datasets and return a unique dataset object containing
    samples from both. Result's images_root will be the common path of both
    datasets, and the image relative paths will be updated accordingly.
    Result's label map will be the superset of both label maps,
    provided one is included in the other.

    Notes:
        - This function is also usable with the `+` operator
        - If possible, booleanized columns for images and annotations will be
          broadcast together.
          See :func:`lours.utils.column_booleanizer.broadcast_booleanization`
        - If one of the datasets has an absolute path as ``images_root``, the other
          dataset images root path will also be converted to absolute.
        - If both datasets have the same name, the output will have the same name
          as well.
        - If datasets have a different name, the output will have the concatenation
          of both names separated by a "+" sign. The merge output of "A" and "B"
          will thus be named "A+B".
        - If one dataset has no name (``dataset_name`` is ``None``), the output will
          take the name of the other.
        - If ``mark_origin`` is selected, it will be effective only if datasets have
          different actual names (not ``None``)

    Args:
        other: Other dataset to merge with. This dataset must be
            compatible with the first one, i.e. one label map is included in the
            other, and image and annotation ids are mutually exclusive between
            datasets (unless ``ignore_index`` is True, see also
            ``allow_overlapping_image_ids`` for image ids)
        allow_overlapping_image_ids: if set to True, will try to join images
            dataframes with overlapping ids. The whole rows (i.e. with values from
            columns present in both dataframes) must match, as well as
            the images_root. In that case, annotations with this image_id
            (from self or other) will be assumed to come from the same image.
            Defaults to True
        realign_label_map: If set to True, will try to remap classes of other
            dataset to match this dataset's label map, to avoid a potential error
            due to incompatible label maps. Defaults to False.
        ignore_index: if set to True, will ignore overlapping ids
            for images and annotations and reset them. Will update the ``image_id``
            column in the annotations accordingly. Note that this option makes the
            former option useless. Defaults to False.
        mark_origin: If set to True, and if both datasets have a different name,
            will add two columns "origin" and "origin_id" for images and
            annotations dataframes, indicating respectively the name of the origin
            dataset, and its id in the original dataset. Defaults to False.
        overwrite_origin: If set to True, will overwrite already existing columns in
            input datasets dataframes. Otherwise, will only mark origin if it's not
            present. Defaults to False.

    Raises:
        ValueError: Error if the two datasets are incompatible (see above). As the
            examples below show, incompatible label maps are reported with
            ``IncompatibleLabelMapsError`` and mismatching overlapping image rows
            with ``AssertionError``.

    Returns:
        Merged dataset.

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Dataset-merge>`
        - :func:`merge.merge_datasets`
        - :meth:`Dataset.__add__`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example1 = dummy_dataset(2, 2, seed=0)
        >>> example2 = dummy_dataset(2, 2, seed=1)
        >>> example1
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg train
        1 377 167 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 step 15 ... 73.932999 71.552480 42.673983
        1 0 why 19 ... 4.567638 248.551257 122.602211
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> example2
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 955 229 determine/story.jpg .jpg train
        1 131 840 air/method.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 1 listen 14 ... 276.974642 9.718823 184.684056
        1 0 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}

        Notice how the two label maps have overlapping index (the id 15)

        >>> example1 + example2
        Using the following class remapping dictionary :
        {14: 14, 15: 16, 22: 22}
        Dataset object containing 4 images and 4 objects
        Name :
        inside_else_memory+shake_effort_many
        Images root :
        .
        Images :
        width height relative_path type split
        id
        0 342 136 such/serious/help/me.jpeg .jpeg train
        1 377 167 such/serious/whatever/wait.png .png train
        2 131 840 care/suggest/air/method.bmp .bmp train
        3 955 229 care/suggest/determine/story.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 step 15 ... 73.932999 71.552480 42.673983
        1 0 why 19 ... 4.567638 248.551257 122.602211
        2 2 listen 14 ... 276.974642 9.718823 184.684056
        3 3 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [4 rows x 8 columns]
        Label map :
        {14: 'listen',
        15: 'step',
        16: 'marriage',
        19: 'why',
        22: 'reach',
        25: 'interview'}
        >>> example1.merge(example2, realign_label_map=False)
        Traceback (most recent call last):
        ...
        lours.utils.label_map_merger.IncompatibleLabelMapsError: Label maps are incompatible
        >>> example1.merge(
        ... example2, realign_label_map=True, allow_overlapping_image_ids=False
        ... )
        Traceback (most recent call last):
        ...
        ValueError: Overlapping image ids not permitted. Consider using the allow_overlapping_image_ids or ignore_index options

        This will raise an error because overlapping image ids is possible only if
        the rows are compatible : fields that are present in both rows have the
        same value

        >>> example1.merge(
        ... example2, realign_label_map=True, allow_overlapping_image_ids=True
        ... )
        Traceback (most recent call last):
        ...
        AssertionError: sub-Dataframes constructed from ids and columns in both DataFrames are not equal.

        The only way to merge these datasets is to remap the label map and then
        reset the indexes with the option ``ignore_index`` set to ``True``, similar
        to :func:`pandas.concat`.

        >>> example1.merge(
        ... example2.remap_classes({15: 1}, remove_not_mapped=False),
        ... ignore_index=True,
        ... )
        Dataset object containing 4 images and 4 objects
        Name :
        inside_else_memory+shake_effort_many
        Images root :
        .
        Images :
        width height relative_path type split
        id
        0 342 136 such/serious/help/me.jpeg .jpeg train
        1 377 167 such/serious/whatever/wait.png .png train
        2 131 840 care/suggest/air/method.bmp .bmp train
        3 955 229 care/suggest/determine/story.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 step 15 ... 73.932999 71.552480 42.673983
        1 0 why 19 ... 4.567638 248.551257 122.602211
        2 2 listen 14 ... 276.974642 9.718823 184.684056
        3 3 reach 22 ... 6.311037 123.141689 174.239136
        <BLANKLINE>
        [4 rows x 8 columns]
        Label map :
        {1: 'marriage',
        14: 'listen',
        15: 'step',
        19: 'why',
        22: 'reach',
        25: 'interview'}

        Let's construct two datasets sharing image info and label maps

        >>> example = dummy_dataset(5, 5, seed=0)
        >>> example1 = example.iloc_annot[::2].iloc[1:]
        >>> example2 = example.iloc_annot[1::2].iloc[:-1]
        >>> example1
        Dataset object containing 4 images and 3 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        1 377 831 whatever/wait.png .png train
        2 136 684 chair/mother.gif .gif train
        3 167 921 someone/challenge.jpeg .jpeg train
        4 114 553 successful/present.bmp .bmp train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 3 why 19 ... 498.685784 31.192237 404.663563
        2 3 interview 25 ... 389.294931 19.083146 209.778063
        4 2 step 15 ... 85.009761 18.228218 181.012493
        <BLANKLINE>
        [3 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> example2
        Dataset object containing 4 images and 1 object
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 257 help/me.jpeg .jpeg train
        1 377 831 whatever/wait.png .png train
        2 136 684 chair/mother.gif .gif train
        3 167 921 someone/challenge.jpeg .jpeg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        3 3 step 15 ... 26.082417 34.739663 607.977022
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> example1.merge(example2)
        Dataset object containing 5 images and 4 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        1 377 831 whatever/wait.png .png train
        2 136 684 chair/mother.gif .gif train
        3 167 921 someone/challenge.jpeg .jpeg train
        4 114 553 successful/present.bmp .bmp train
        0 342 257 help/me.jpeg .jpeg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 3 why 19 ... 498.685784 31.192237 404.663563
        2 3 interview 25 ... 389.294931 19.083146 209.778063
        4 2 step 15 ... 85.009761 18.228218 181.012493
        3 3 step 15 ... 26.082417 34.739663 607.977022
        <BLANKLINE>
        [4 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}

        See that if we use the ``ignore_index`` option, the images are duplicated
        because it is assumed the two images dataframes don't have any overlap.

        >>> example1.merge(example2, ignore_index=True)
        Dataset object containing 8 images and 4 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 136 684 chair/mother.gif .gif train
        1 167 921 someone/challenge.jpeg .jpeg train
        2 114 553 successful/present.bmp .bmp train
        3 377 831 whatever/wait.png .png train
        4 136 684 chair/mother.gif .gif train
        5 342 257 help/me.jpeg .jpeg train
        6 167 921 someone/challenge.jpeg .jpeg train
        7 377 831 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 step 15 ... 85.009761 18.228218 181.012493
        1 1 why 19 ... 498.685784 31.192237 404.663563
        2 1 interview 25 ... 389.294931 19.083146 209.778063
        3 6 step 15 ... 26.082417 34.739663 607.977022
        <BLANKLINE>
        [4 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}

        Finally, you can mark the origin of your datasets in dedicated columns in
        the resulting dataset's dataframes.

        >>> example1 = dummy_dataset(
        ... 2, 2, seed=0, label_map={0: "car"}, dataset_name="A"
        ... )
        >>> example2 = dummy_dataset(
        ... 2, 2, seed=1, label_map={0: "car"}, dataset_name="B"
        ... )
        >>> example1
        Dataset object containing 2 images and 2 objects
        Name :
        A
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 865 560 step/why.jpg .jpg train
        1 673 342 help/me.jpeg .jpeg val
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 car 0 ... 511.143123 616.718121 12.497434
        1 0 car 0 ... 339.716034 233.243139 117.161956
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {0: 'car'}
        >>> example2
        Dataset object containing 2 images and 2 objects
        Name :
        B
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 525 779 reach/marriage.jpg .jpg train
        1 560 955 determine/story.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 car 0 ... 21.468549 283.211413 308.302755
        1 0 car 0 ... 586.986712 124.825174 57.793609
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {0: 'car'}
        >>> merged_examples = example1.merge(
        ... example2, mark_origin=True, ignore_index=True
        ... )
        >>> merged_examples
        Dataset object containing 4 images and 4 objects
        Name :
        A+B
        Images root :
        .
        Images :
        width height relative_path ... split origin origin_id
        id ...
        0 673 342 such/serious/help/me.jpeg ... val A 1
        1 865 560 such/serious/step/why.jpg ... train A 0
        2 560 955 care/suggest/determine/story.jpg ... train B 1
        3 525 779 care/suggest/reach/marriage.jpg ... train B 0
        <BLANKLINE>
        [4 rows x 7 columns]
        Annotations :
        image_id category_str category_id ... box_height origin origin_id
        id ...
        0 1 car 0 ... 12.497434 A 0
        1 1 car 0 ... 117.161956 A 1
        2 3 car 0 ... 57.793609 B 1
        3 3 car 0 ... 308.302755 B 0
        <BLANKLINE>
        [4 rows x 10 columns]
        Label map :
        {0: 'car'}

        By default, a dataset which already features an origin for its samples will
        retain it for further merges. Optionally, you can decide to overwrite the
        origin to the actual dataset that is being merged and forget the old origin.

        >>> example3 = dummy_dataset(
        ... 2, 2, seed=2, label_map={0: "car"}, dataset_name="C"
        ... )
        >>> merged_examples.merge(example3, mark_origin=True, ignore_index=True)
        Dataset object containing 6 images and 6 objects
        Name :
        A+B+C
        Images root :
        .
        Images :
        width height relative_path ... split origin origin_id
        id ...
        0 560 955 care/suggest/determine/story.jpg ... train B 1
        1 525 779 care/suggest/reach/marriage.jpg ... train B 0
        2 673 342 such/serious/help/me.jpeg ... val A 1
        3 865 560 such/serious/step/why.jpg ... train A 0
        4 335 368 what/way/police/enter.jpeg ... train C 1
        5 853 198 what/way/relationship/table.tiff ... train C 0
        <BLANKLINE>
        [6 rows x 7 columns]
        Annotations :
        image_id category_str category_id ... box_height origin origin_id
        id ...
        0 1 car 0 ... 57.793609 B 1
        1 1 car 0 ... 308.302755 B 0
        2 3 car 0 ... 12.497434 A 0
        3 3 car 0 ... 117.161956 A 1
        4 4 car 0 ... 137.766169 C 1
        5 5 car 0 ... 14.083247 C 0
        <BLANKLINE>
        [6 rows x 10 columns]
        Label map :
        {0: 'car'}
        >>> merged_examples.merge(
        ... example3, mark_origin=True, ignore_index=True, overwrite_origin=True
        ... )
        Dataset object containing 6 images and 6 objects
        Name :
        A+B+C
        Images root :
        .
        Images :
        width height relative_path ... split origin origin_id
        id ...
        0 560 955 care/suggest/determine/story.jpg ... train A+B 2
        1 525 779 care/suggest/reach/marriage.jpg ... train A+B 3
        2 673 342 such/serious/help/me.jpeg ... val A+B 0
        3 865 560 such/serious/step/why.jpg ... train A+B 1
        4 335 368 what/way/police/enter.jpeg ... train C 1
        5 853 198 what/way/relationship/table.tiff ... train C 0
        <BLANKLINE>
        [6 rows x 7 columns]
        Annotations :
        image_id category_str category_id ... box_height origin origin_id
        id ...
        0 1 car 0 ... 57.793609 A+B 2
        1 1 car 0 ... 308.302755 A+B 3
        2 3 car 0 ... 12.497434 A+B 0
        3 3 car 0 ... 117.161956 A+B 1
        4 4 car 0 ... 137.766169 C 1
        5 5 car 0 ... 14.083247 C 0
        <BLANKLINE>
        [6 rows x 10 columns]
        Label map :
        {0: 'car'}
    """
    # Imported at call time rather than at module level
    from .merge import merge_datasets

    # Every option of this method is forwarded untouched to the merge routine
    merge_options = {
        "allow_overlapping_image_ids": allow_overlapping_image_ids,
        "realign_label_map": realign_label_map,
        "ignore_index": ignore_index,
        "mark_origin": mark_origin,
        "overwrite_origin": overwrite_origin,
    }
    return merge_datasets(self, other, **merge_options)
def __radd__(self, other: "int | Dataset") -> "Dataset":
    """Reflected addition, making the built-in ``sum`` usable on datasets.

    Args:
        other: left operand of the addition. Any integer (such as the ``0`` that
            ``sum`` starts from) is ignored; anything else is handed over to
            :meth:`Dataset.__add__`.

    Returns:
        This dataset itself when ``other`` is an integer, the result of
        ``self.__add__(other)`` otherwise.
    """
    if not isinstance(other, int):
        return self.__add__(other)
    # Stub case so that we can use the sum function for datasets
    return self
[docs]
def __add__(self, other: "Dataset") -> "Dataset":
    """Overloading of the "+" operator for Datasets.

    It will call the :meth:`Dataset.merge` method multiple times if needed:

    - Once with default parameters
    - If it fails because of an incompatible label map, it will try to remap the
      other dataset's label map to match this dataset, and retry the addition.
    - If it fails because of another error (``ValueError`` or
      ``AssertionError``), it will try to use the ``ignore_index`` option set to
      ``True``.

    Each fallback emits a ``RuntimeWarning``.

    Args:
        other: Other dataset to merge the first dataset with.

    Returns:
        Merged Dataset.

    See Also:
        :meth:`Dataset.merge`
    """
    try:
        return self.merge(other)
    except IncompatibleLabelMapsError:
        # stacklevel=2 attributes the warning to the code performing the
        # addition instead of this method, so that each call site is reported.
        warn(
            "Addition failed because of incompatible label maps, trying to"
            " remap classes of right value and retry the merge",
            RuntimeWarning,
            stacklevel=2,
        )
        # Go through the whole "+" logic again, so that the index fallback below
        # is still available for the remapped dataset.
        return self + other.remap_from_other(self)
    except (ValueError, AssertionError):
        warn(
            "Addition failed, retrying merge with ignore_index set to True",
            RuntimeWarning,
            stacklevel=2,
        )
        return self.merge(other, ignore_index=True)
def __sub__(self, other: "Dataset") -> tuple["Dataset", "Dataset", "Dataset"]:
    """Overloading of the "-" operator, computing the difference of two datasets.

    Args:
        other: dataset to compare this dataset with. It is given as the right
            dataset to ``dataset_diff``, this dataset being the left one.

    Raises:
        TypeError: if ``other`` is not a :class:`Dataset`.

    Returns:
        The three datasets returned by
        :func:`lours.utils.difftools.dataset_diff`; see that function for the
        meaning of each of them.
    """
    # Explicit check rather than an ``assert``, which is skipped when Python runs
    # with the -O option.
    if not isinstance(other, Dataset):
        raise TypeError("subtracted object must be a dataset")

    # Imported at call time rather than at module level
    from lours.utils.difftools import dataset_diff

    return dataset_diff(left_dataset=self, right_dataset=other)
[docs]
def remove_empty_images(self) -> Self:
    """Remove images without annotations from dataset.

    Note:
        This does NOT remove empty images from the disk, but simply from the
        dataset object, and thus they will not be copied when saving the dataset
        elsewhere.

    Returns:
        Dataset object with the rows of empty images removed from the
        ``self.images`` dataframe.

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(2, 1, seed=0)
        >>> example
        Dataset object containing 2 images and 1 object
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg eval
        1 377 167 whatever/wait.png .png val
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 interview 25 ... 73.932999 62.674584 8.569467
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> example.remove_empty_images()
        Dataset object containing 1 image and 1 object
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg eval
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 interview 25 ... 73.932999 62.674584 8.569467
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
    """
    # Ids of the images referenced by at least one annotation
    annotated_image_ids = self.annotations["image_id"]
    # Boolean mask over self.images, True for images that have annotations
    keep_mask = self.images.index.isin(annotated_image_ids)
    return self.loc[keep_mask]
[docs]
def cap_bounding_box_coordinates(self) -> Self:
    """Method to ensure the bounding box coordinates are inside the picture frame.

    Indeed, some datasets (like crowdhuman) do use bounding boxes that go outside
    of the picture.

    Each box is cropped to the frame of its image: the origin is clipped to
    ``[0, image size]``, the part of the box lying before the origin of the image
    is subtracted from its size, and the size is then clipped so that the box
    ends inside the image. Boxes already inside their image are left untouched.

    Returns:
        New Dataset with bounding box capped so that X and Y coordinates are inside
        corresponding picture dimensions

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> from lours.utils.testing import assert_bounding_boxes_well_formed
        >>> example = dummy_dataset(1, 1)
        >>> example.annotations.loc[0, "box_y_min"] = -0.5
        >>> example.annotations.loc[0, "box_height"] = (
        ... example.images["height"][0] + 1
        ... )
        >>> example
        Dataset object containing 1 image and 1 object
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 377 help/me.jpeg .jpeg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 interview 25 ... -0.5 306.509956 378.0
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> assert_bounding_boxes_well_formed(example)
        Traceback (most recent call last):
        ...
        AssertionError: Assertion failed. Bounding boxes must have positive Y values. First occurrence at row 0 : image_id 0
        category_str interview
        category_id 25
        split train
        box_x_min 5.652451
        box_y_min -0.5
        box_width 306.509956
        box_height 378.0
        Name: 0, dtype: object
        >>> example.cap_bounding_box_coordinates()
        Dataset object containing 1 image and 1 object
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 377 help/me.jpeg .jpeg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 interview 25 ... 0.0 306.509956 377.0
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
    """
    xmin, ymin, box_width, box_height = BBOX_COLUMN_NAMES

    # One (width, height) row per annotation, re-indexed like the annotations so
    # that the Series below align element-wise.
    im_dimensions = self.images.loc[
        self.annotations["image_id"], ["width", "height"]
    ]
    im_dimensions.index = self.annotations.index

    capped_columns = {}
    for origin_name, size_name, limit_name in (
        (xmin, box_width, "width"),
        (ymin, box_height, "height"),
    ):
        limit = im_dimensions[limit_name]
        origin = self.annotations[origin_name]
        capped_origin = origin.clip(0, limit)
        # Length of the box lying before the image origin. It must be removed
        # from the size, otherwise clipping the origin would shift the box
        # instead of cropping it. It is exactly 0 for boxes starting inside the
        # image, whose size is thus not altered by the subtraction.
        cut_before_origin = (capped_origin - origin).clip(lower=0)
        capped_size = (self.annotations[size_name] - cut_before_origin).clip(
            0, limit - capped_origin
        )
        capped_columns[origin_name] = capped_origin
        capped_columns[size_name] = capped_size

    capped_annotations = self.annotations.assign(**capped_columns)
    return self.from_template(annotations=capped_annotations)
[docs]
def booleanize(
self,
column_names: str | Iterable[str] | None = None,
missing_ok: bool = False,
**possible_values: set,
) -> Self:
"""Convert given column in ``self.images`` or ``self.annotations`` from lists to
columns of booleans.
See :func:`.util.column_booleanize.booleanize`
Note:
in the case column name is present in both images and annotations, the
column in ``self.images`` takes precedence
Args:
column_names: columns to convert. After conversion, it will be dropped
from corresponding DataFrames
missing_ok: If set to True, will not raise a KeyError if the column name is
neither in ``self.images`` nor ``self.annotations``
**possible_values: keyword arguments dictionary for possible values. If a
column name in ``column_names`` is not present in this dictionary, will
deduce from occurrence in the dataset
Raises:
KeyError: if ``missing_ok`` is set to False, the given ``column_name`` must
be either in ``self.images`` columns or in ``self.annotations`` columns.
TypeError: When for a particular column possible values need to be deduced,
the column must have value that are all iterable except strings.
Returns:
New dataset with multiple boolean columns in the form
``{column_name}.{value}``.
See Also:
:ref:`related tutorial </notebooks/7_demo_booleanize.ipynb>`
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(
... n_imgs=3,
... n_annot=3,
... n_list_columns_images=[2, 3],
... n_list_columns_annotations=1,
... )
>>> example
Dataset object containing 3 images and 3 objects
Name :
inside_else_memory
Images root :
such/serious
Images :
width height ... beyond father
id ...
0 342 167 ... [enough] [challenge, someone]
1 377 114 ... [present, successful] [challenge]
2 136 257 ... [present, successful, enough] [challenge, someone]
<BLANKLINE>
[3 rows x 7 columns]
Annotations :
image_id category_str ... box_height where
id ...
0 2 why ... 138.451739 [no, season, play, choice, force, bit]
1 1 why ... 63.576932 [no, choice, force]
2 2 step ... 99.999123 [no, season, play, week, bit]
<BLANKLINE>
[3 rows x 9 columns]
Label map :
{15: 'step', 19: 'why', 25: 'interview'}
>>> modified = example.booleanize(column_names=["beyond", "where"])
>>> modified
Dataset object containing 3 images and 3 objects
Name :
inside_else_memory
Images root :
such/serious
Images :
width height ... beyond.present beyond.successful
id ...
0 342 167 ... False False
1 377 114 ... True True
2 136 257 ... True True
<BLANKLINE>
[3 rows x 9 columns]
Annotations :
image_id category_str category_id ... where.play where.season where.week
id ...
0 2 why 19 ... True True False
1 1 why 19 ... False False False
2 2 step 15 ... True True True
<BLANKLINE>
[3 rows x 15 columns]
Label map :
{15: 'step', 19: 'why', 25: 'interview'}
>>> modified.annotations.dtypes
image_id int64
category_str object
category_id int64
split object
box_x_min float64
box_y_min float64
box_width float64
box_height float64
where.bit bool
where.choice bool
where.force bool
where.no bool
where.play bool
where.season bool
where.week bool
dtype: object
>>> modified.booleanized_columns
{'images': {'beyond'}, 'annotations': {'where'}}
>>> example.booleanize(beyond={"enough", "successful"})
Dataset object containing 3 images and 3 objects
Name :
inside_else_memory
Images root :
such/serious
Images :
width height ... beyond.enough beyond.successful
id ...
0 342 167 ... True False
1 377 114 ... False True
2 136 257 ... True True
<BLANKLINE>
[3 rows x 8 columns]
Annotations :
image_id category_str ... box_height where
id ...
0 2 why ... 138.451739 [no, season, play, choice, force, bit]
1 1 why ... 63.576932 [no, choice, force]
2 2 step ... 99.999123 [no, season, play, week, bit]
<BLANKLINE>
[3 rows x 9 columns]
Label map :
{15: 'step', 19: 'why', 25: 'interview'}
"""
if column_names is None:
column_names = set()
if isinstance(column_names, str):
column_names = {column_names}
else:
column_names = {*column_names}
if possible_values:
column_names = set(column_names).union(possible_values.keys())
elif not column_names:
# Nothing to booleanize, return immediately
warn("Nothing to booleanize, dataset returned as is", RuntimeWarning)
return self
images_booleanize = set()
annotations_booleanize = set()
while column_names:
name = column_names.pop()
if name in self.images.columns:
images_booleanize.add(name)
elif name in self.annotations.columns:
annotations_booleanize.add(name)
elif not missing_ok:
raise KeyError(
f"Column name {name} is neither in self.images nor self.annotations"
)
new_images = booleanize(
self.images,
separator=".",
**{name: possible_values.get(name, None) for name in images_booleanize},
)
new_annotations = booleanize(
self.annotations,
separator=".",
**{
name: possible_values.get(name, None) for name in annotations_booleanize
},
)
output_dataset = self.from_template(
images=new_images,
annotations=new_annotations,
reset_booleanized=False,
)
output_dataset.booleanized_columns["images"] |= set(images_booleanize)
output_dataset.booleanized_columns["annotations"] |= set(annotations_booleanize)
return output_dataset
[docs]
def debooleanize(
    self,
    dataframe: Literal["both", "images", "annotations"] | None = "both",
) -> Self:
    """Convert booleanized columns back to list form, for exporting purpose.

    Note:
        This will only debooleanize columns that have been explicitly booleanized,
        and not just boolean columns. It will look for values in
        ``self.booleanized_columns`` and retrieve all the column with the name
        in the form ``column_name.entry`` to reconstruct the ``column_name``
        column.

    Args:
        dataframe: Which dataframe you want to debooleanize.
            Can be either "images", "annotations" or "both".
            ``None`` is accepted as an alias of "both". Defaults to "both".

    Returns:
        New dataset object with converted columns, booleanized columns are dropped.

    See Also:
        :ref:`related tutorial </notebooks/7_demo_booleanize.ipynb>`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(
        ... n_imgs=3,
        ... n_annot=3,
        ... n_list_columns_images=[2, 3],
        ... n_list_columns_annotations=1,
        ... )
        >>> example
        Dataset object containing 3 images and 3 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height ... beyond father
        id ...
        0 342 167 ... [enough] [challenge, someone]
        1 377 114 ... [present, successful] [challenge]
        2 136 257 ... [present, successful, enough] [challenge, someone]
        <BLANKLINE>
        [3 rows x 7 columns]
        Annotations :
        image_id category_str ... box_height where
        id ...
        0 2 why ... 138.451739 [no, season, play, choice, force, bit]
        1 1 why ... 63.576932 [no, choice, force]
        2 2 step ... 99.999123 [no, season, play, week, bit]
        <BLANKLINE>
        [3 rows x 9 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> modified = example.booleanize(column_names=["beyond", "where"])
        >>> modified
        Dataset object containing 3 images and 3 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height ... beyond.present beyond.successful
        id ...
        0 342 167 ... False False
        1 377 114 ... True True
        2 136 257 ... True True
        <BLANKLINE>
        [3 rows x 9 columns]
        Annotations :
        image_id category_str category_id ... where.play where.season where.week
        id ...
        0 2 why 19 ... True True False
        1 1 why 19 ... False False False
        2 2 step 15 ... True True True
        <BLANKLINE>
        [3 rows x 15 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> modified.debooleanize()
        Dataset object containing 3 images and 3 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height ... beyond father
        id ...
        0 342 167 ... [enough] [challenge, someone]
        1 377 114 ... [present, successful] [challenge]
        2 136 257 ... [enough, present, successful] [challenge, someone]
        <BLANKLINE>
        [3 rows x 7 columns]
        Annotations :
        image_id category_str ... box_height where
        id ...
        0 2 why ... 138.451739 [bit, choice, force, no, play, season]
        1 1 why ... 63.576932 [choice, force, no]
        2 2 step ... 99.999123 [bit, no, play, season, week]
        <BLANKLINE>
        [3 rows x 9 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> modified.debooleanize(dataframe="images")
        Dataset object containing 3 images and 3 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height ... beyond father
        id ...
        0 342 167 ... [enough] [challenge, someone]
        1 377 114 ... [present, successful] [challenge]
        2 136 257 ... [enough, present, successful] [challenge, someone]
        <BLANKLINE>
        [3 rows x 7 columns]
        Annotations :
        image_id category_str category_id ... where.play where.season where.week
        id ...
        0 2 why 19 ... True True False
        1 1 why 19 ... False False False
        2 2 step 15 ... True True True
        <BLANKLINE>
        [3 rows x 15 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
    """
    # None used to be documented as meaning "both dataframes": honour it instead
    # of silently converting nothing.
    target = "both" if dataframe is None else dataframe

    images = self.images
    annotations = self.annotations
    if target in ("both", "images"):
        images = debooleanize(images, self.booleanized_columns["images"], ".")
    if target in ("both", "annotations"):
        annotations = debooleanize(
            annotations, self.booleanized_columns["annotations"], "."
        )

    # NOTE(review): ``reset_booleanized=True`` is passed even when only one of the
    # dataframes was converted; confirm that ``from_template`` keeps track of the
    # columns still booleanized in the other dataframe.
    return self.from_template(
        images=images, annotations=annotations, reset_booleanized=True
    )
[docs]
def remap_classes(
self,
class_mapping: dict[int, int],
new_names: dict[int, str] | None = None,
remove_not_mapped: bool = True,
remove_emptied_images: bool = False,
) -> Self:
"""Remap classes ids and names according to a dictionary
Note:
In case of class fusion, the class name of the last category_id with respect
to ``class_mapping`` order will be deduced.
Note:
if ``remove_not_mapped`` is True, Classes that are not present in the
dictionary are removed from the dataset altogether.
Otherwise, they are kept as if the identity mapping was in the bottom of
``class_mapping`` for this particular class.
For potential class fusion, the name of the unmapped class will be used.
See Also:
- :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Remap-classes>`
- :meth:`.remap_from_preset`
- :meth:`.remap_from_dataframe`
- :meth:`.remap_from_csv`
- :meth:`.remap_from_other`
- :meth:`.remove_classes`
- :meth:`.keep_classes`
Args:
class_mapping: ``old_id`` -> ``new_id`` mapping
new_names: Optimal ``new_id`` -> ``new_name`` mapping, essentially the new
label_map.
If category_id is missing from keys, will deduce it from the former one.
Defaults to None.
remove_not_mapped: If set to True, will remove classes that are not in
class mapping. Otherwise, keep them as is (with potential class fusion).
Defaults to True.
remove_emptied_images: If set to True, will remove from self.images the
images that are now empty of annotation.
Note that it will keep the images that were empty before the remapping.
Defaults to False.
Returns:
New dataset object with updated label maps, category ids and
category_names
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(2, 2, seed=1)
>>> example
Dataset object containing 2 images and 2 objects
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 listen 14 ... 276.974642 9.718823 184.684056
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{14: 'listen', 15: 'marriage', 22: 'reach'}
>>> example.remap_classes({14: 1})
Dataset object containing 2 images and 1 object
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 listen 1 ... 276.974642 9.718823 184.684056
<BLANKLINE>
[1 rows x 8 columns]
Label map :
{1: 'listen'}
>>> example.remap_classes({14: 1}, remove_not_mapped=False)
Dataset object containing 2 images and 2 objects
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 listen 1 ... 276.974642 9.718823 184.684056
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{1: 'listen', 15: 'marriage', 22: 'reach'}
>>> example.remap_classes(
... {14: 1},
... remove_not_mapped=False,
... new_names={1: "new_listen", 15: "new_marriage"},
... )
Dataset object containing 2 images and 2 objects
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 new_listen 1 ... 276.974642 9.718823 184.684056
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{1: 'new_listen', 15: 'new_marriage', 22: 'reach'}
>>> example.remap_classes(
... {14: 1},
... remove_emptied_images=True,
... )
Dataset object containing 1 image and 1 object
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 listen 1 ... 276.974642 9.718823 184.684056
<BLANKLINE>
[1 rows x 8 columns]
Label map :
{1: 'listen'}
Note that only empited images are removed. Images that were already empty
before are kept
>>> example = dummy_dataset(2, 2)
>>> example
Dataset object containing 2 images and 2 objects
Name :
inside_else_memory
Images root :
such/serious
Images :
width height relative_path type split
id
0 342 136 help/me.jpeg .jpeg train
1 377 167 whatever/wait.png .png train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 0 step 15 ... 73.932999 71.552480 42.673983
1 0 why 19 ... 4.567638 248.551257 122.602211
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{15: 'step', 19: 'why', 25: 'interview'}
>>> example.remap_classes({25: 1}, remove_emptied_images=True)
Dataset object containing 1 image and 0 object
Name :
inside_else_memory
Images root :
such/serious
Images :
width height relative_path type split
id
1 377 167 whatever/wait.png .png train
Annotations :
Empty DataFrame
Columns: [image_id, category_str, category_id, split, box_x_min, box_y_min, box_width, box_height]
Index: []
Label map :
{1: 'interview'}
""" # noqa: E501
if not remove_not_mapped:
not_mapped = {
category_id: category_id
for category_id in self.label_map.keys()
if category_id not in class_mapping.keys()
}
class_mapping = {**class_mapping, **not_mapped}
new_label_map = {
v: self.label_map[k]
for k, v in class_mapping.items()
if k in self.label_map
}
if new_names is not None:
new_label_map = {**new_label_map, **new_names}
# Only keep classes referenced in the class_mapping
new_annotations = self.annotations[
self.annotations["category_id"].isin(class_mapping)
]
# Replace both id and class in annotation dataframe
new_annotations = new_annotations.replace({"category_id": class_mapping})
new_annotations["category_str"] = new_annotations["category_id"].map(
new_label_map
)
if remove_emptied_images:
already_empty_images = ~self.images.index.isin(self.annotations["image_id"])
already_empty_images_ids = self.images.index[already_empty_images].tolist()
new_images = self.images.loc[
already_empty_images_ids + new_annotations["image_id"].unique().tolist()
]
else:
new_images = self.images
return self.from_template(
images=new_images,
annotations=new_annotations,
label_map=new_label_map,
)
[docs]
def remap_from_preset(
    self,
    input_dataset_map: str,
    output_dataset_map: str,
    remove_not_mapped: bool = True,
    remove_emptied_images: bool = False,
) -> Self:
    """Same as class remap, but instead of taking a dictionary, you give the name
    of a preset. Registered presets are stored in remap_presets folders, with csv
    files in the form ``{input_dataset_map}_to_{output_dataset_map}``

    Args:
        input_dataset_map: Name of label map to convert from.
        output_dataset_map: Name of label to convert to.
        remove_not_mapped: If set to True, will remove classes that are not in
            class mapping. Otherwise, keep them as is (with potential class fusion).
            Defaults to True.
        remove_emptied_images: If set to True, will remove from ``self.images`` the
            images that are now empty of annotation.
            Note that it will keep the images that were empty before the remapping.
            Defaults to False.

    Returns:
        New dataset object with remapped classes according to the preset

    Raises:
        ValueError: raised when the input/output pair does not exist in presets.
            The message lists the available presets.

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Remap-classes>`
        - :meth:`.remap_classes`
        - :meth:`.remap_from_dataframe`
        - :meth:`.remap_from_csv`
        - :meth:`.remap_from_other`
        - :meth:`.remove_classes`
        - :meth:`.keep_classes`
    """
    from . import remap_presets

    preset_key = (input_dataset_map, output_dataset_map)
    try:
        id_mapping, new_names = remap_presets.presets[preset_key]
    except KeyError as e:
        # Turn the bare lookup failure into a message listing what is available
        raise ValueError(
            "Preset not available. Available presets are : \n"
            f"{remap_presets.list_available_presets()}"
        ) from e
    return self.remap_classes(
        id_mapping, new_names, remove_not_mapped, remove_emptied_images
    )
[docs]
def remap_from_dataframe(
self,
df: pd.DataFrame,
remove_not_mapped: bool = True,
remove_emptied_images: bool = False,
) -> Self:
"""Same as class remap, but instead of taking a dictionary, you give a
dataframe.
Dataframe must have at least these two columns:
- ``input_category_id``
- ``output_category_id``
Optional columns for category names:
- ``input_category_name``
- ``output_category_name``
Args:
df: dataframe with aforementioned columns
remove_not_mapped: If set to True, will remove classes that are not in
class mapping. Otherwise, keep them as is (with potential class fusion).
Defaults to True.
remove_emptied_images: If set to True, will remove from self.images the
images that are now empty of annotation.
Note that it will keep the images that were empty before the remapping.
Defaults to False.
Returns:
new dataset object with remapped classes according to
the given table in the dataframe
See Also:
- :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Remap-classes>`
- :meth:`.remap_classes`
- :meth:`.remap_from_preset`
- :meth:`.remap_from_csv`
- :meth:`.remap_from_other`
- :meth:`.remove_classes`
- :meth:`.keep_classes`
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(2, 2, seed=1)
>>> example
Dataset object containing 2 images and 2 objects
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 listen 14 ... 276.974642 9.718823 184.684056
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{14: 'listen', 15: 'marriage', 22: 'reach'}
>>> remap_df = pd.DataFrame(
... data={
... "input_category_id": [14, 22],
... "output_category_id": [0, 1],
... "output_category_name": ["new_listen", "new_reach"],
... }
... )
>>> remap_df
input_category_id output_category_id output_category_name
0 14 0 new_listen
1 22 1 new_reach
>>> example.remap_from_dataframe(remap_df)
Dataset object containing 2 images and 2 objects
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 new_listen 0 ... 276.974642 9.718823 184.684056
1 0 new_reach 1 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{0: 'new_listen', 1: 'new_reach'}
""" # noqa: E501
if df.index.name == "input_category_id":
mapping_df = df
else:
mapping_df = df.set_index("input_category_id")
mapping_dict = mapping_df["output_category_id"].to_dict()
mapping_names = (
mapping_df.groupby("output_category_id")["output_category_name"]
.first()
.to_dict()
)
return self.remap_classes(
mapping_dict, # pyright: ignore
mapping_names, # pyright: ignore
remove_not_mapped,
remove_emptied_images,
)
[docs]
def remap_from_csv(
self,
csv: Path,
remove_not_mapped: bool = True,
remove_emptied_images: bool = False,
) -> Self:
"""Same as class remap, but instead of taking a dictionary, you give the path
to a csv file.
csv file must have at least these two columns :
- ``input_category_id``
- ``output_category_id``
Optional columns for category names :
- ``input_category_name``
- ``output_category_name``
Args:
csv: path to csv file, to be read by pandas
remove_not_mapped: If set to True, will remove classes that are not in
class mapping. Otherwise, keep them as is (with potential class fusion).
Defaults to True.
remove_emptied_images: If set to True, will remove from self.images the
images that are now empty of annotation.
Note that it will keep the images that were empty before the remapping.
Defaults to False.
Returns:
New dataset object with remapped classes according to
the given table in the csv file
See Also:
- :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Remap-classes>`
- :meth:`.remap_classes`
- :meth:`.remap_from_preset`
- :meth:`.remap_from_dataframe`
- :meth:`.remap_from_other`
- :meth:`.remove_classes`
- :meth:`.keep_classes`
""" # noqa: E501
mapping_df = pd.read_csv(csv).set_index("input_category_id")
return self.remap_from_dataframe(
mapping_df, remove_not_mapped, remove_emptied_images
)
[docs]
def remap_from_other(
    self,
    other: "Dataset",
    remove_not_mapped: bool = False,
    remove_emptied_images: bool = False,
) -> Self:
    """Try to remap classes of dataset to match the ones in another dataset by
    retrieving categories with the same name.

    This is useful when trying to merge together two datasets with incompatible
    label maps.

    The mapping is constructed so that no category id represents different category
    labels between other dataset and remapped dataset.

    This function works by first applying the mapping on objects with the same
    category strings as some other objects in other dataset, and reassigning the
    other categories so that the ids don't overlap. Categories whose name is only
    present in the current dataset and have the same id as some other category in
    the other dataset will be iteratively set to the lowest unoccupied category id
    of all label maps.

    Note:
        The name of a category is ambiguous. Another method of class remapping
        should be preferred if possible.
        See :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Remap-classes>`

    Args:
        other: Other dataset to align the output's label map with.
        remove_not_mapped: If set to True, will remove classes that are in self,
            but not in other dataset's class mapping.
            Otherwise, keep them as is. Defaults to False.
        remove_emptied_images: If set to True, will remove from self.images the
            images that are now empty of annotation.
            Note that it will keep the images that were empty before the remapping.
            Defaults to False.

    Raises:
        AssertionError: Error raised if label map of one of the two datasets
            doesn't have unique category names.

    Returns:
        Dataset: New dataset with remapped classes to match the ones in `other`

    See Also:
        - :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Remap-classes>`
        - :meth:`.remap_classes`
        - :meth:`.remap_from_preset`
        - :meth:`.remap_from_dataframe`
        - :meth:`.remap_from_csv`
        - :meth:`.remove_classes`
        - :meth:`.keep_classes`

    Example:
        current dataset has label map ``{1: car, 2: person, 3:truck}`` and other
        dataset has label map ``{1: train, 2: car, 3: person}``. This method will
        construct this mapping dictionary: ``{1: 2, 2: 3, 3: 4}`` so that the
        remapped dataset has the following label map: ``{2:car, 3:person, 4:truck}``
        which is now compatible with other dataset's label map (no overlap)
        In the case you merge the two datasets, the resulting merged label map will
        be: ``{1: train, 2: car, 3: person, 4: truck}``

        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example1 = dummy_dataset(
        ... n_imgs=2,
        ... n_annot=2,
        ... label_map={1: "car", 2: "person", 3: "truck"},
        ... seed=3,
        ... )
        >>> example1
        Dataset object containing 2 images and 2 objects
        Name :
        have_page_personal
        Images root :
        draw/name
        Images :
        width height relative_path type split
        id
        0 830 261 add/police.bmp .bmp train
        1 177 313 ok/event.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 truck 3 ... 102.110558 531.572263 22.921831
        1 1 person 2 ... 49.998280 56.543521 111.741397
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {1: 'car', 2: 'person', 3: 'truck'}
        >>> example2 = dummy_dataset(
        ... n_imgs=2,
        ... n_annot=2,
        ... label_map={1: "train", 2: "car", 3: "person"},
        ... seed=1,
        ... )
        >>> example2
        Dataset object containing 2 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split
        id
        0 525 779 reach/marriage.jpg .jpg train
        1 560 955 determine/story.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 person 3 ... 586.986712 124.825174 57.793609
        1 0 person 3 ... 318.766127 207.777851 100.447514
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {1: 'train', 2: 'car', 3: 'person'}
        >>> example1.remap_from_other(example2)
        Using the following class remapping dictionary :
        {1: 2, 2: 3, 3: 4}
        Dataset object containing 2 images and 2 objects
        Name :
        have_page_personal
        Images root :
        draw/name
        Images :
        width height relative_path type split
        id
        0 830 261 add/police.bmp .bmp train
        1 177 313 ok/event.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 truck 4 ... 102.110558 531.572263 22.921831
        1 1 person 3 ... 49.998280 56.543521 111.741397
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {2: 'car', 3: 'person', 4: 'truck'}
        >>> example1.remap_from_other(example2, remove_not_mapped=True)
        Using the following class remapping dictionary :
        {1: 2, 2: 3}
        Dataset object containing 2 images and 1 object
        Name :
        have_page_personal
        Images root :
        draw/name
        Images :
        width height relative_path type split
        id
        0 830 261 add/police.bmp .bmp train
        1 177 313 ok/event.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 1 person 3 ... 49.99828 56.543521 111.741397
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {2: 'car', 3: 'person'}
        >>> example1.remap_from_other(
        ... example2, remove_not_mapped=True, remove_emptied_images=True
        ... )
        Using the following class remapping dictionary :
        {1: 2, 2: 3}
        Dataset object containing 1 image and 1 object
        Name :
        have_page_personal
        Images root :
        draw/name
        Images :
        width height relative_path type split
        id
        1 177 313 ok/event.jpg .jpg train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        1 1 person 3 ... 49.99828 56.543521 111.741397
        <BLANKLINE>
        [1 rows x 8 columns]
        Label map :
        {2: 'car', 3: 'person'}
    """
    from ..utils.testing import assert_label_map_well_formed

    def lowest_missing_value(input_list: Iterable[int]) -> int:
        # First hole between the smallest and largest given values, or the
        # value right after the largest one when there is no hole.
        candidates = sorted(set(input_list))
        for current, following in zip(candidates, candidates[1:]):
            if following - current > 1:
                return current + 1
        return max(candidates) + 1

    assert_label_map_well_formed(self)
    assert_label_map_well_formed(other)

    # name -> id lookup in the reference dataset
    reference_ids_by_name = {
        name: category_id for category_id, name in other.label_map.items()
    }
    class_mapping = {}
    for category_id, name in self.label_map.items():
        matched_id = reference_ids_by_name.get(name)
        if matched_id is not None:
            # Same name exists in other: adopt its id
            class_mapping[category_id] = matched_id
            continue
        if remove_not_mapped:
            # Leaving the id out of the mapping makes remap_classes drop it
            continue
        if category_id not in other.label_map:
            # This is not needed, but the printed remapping dictionary will be
            # more comprehensive that way
            class_mapping[category_id] = category_id
            continue
        # The id is taken by a different class in other: move to a free id
        occupied_ids = [
            *self.label_map,
            *other.label_map,
            *class_mapping.values(),
        ]
        class_mapping[category_id] = lowest_missing_value(occupied_ids)

    print(
        "Using the following class remapping dictionary"
        f" :\n{pformat(class_mapping)}"
    )
    return self.remap_classes(
        class_mapping=class_mapping,
        remove_not_mapped=remove_not_mapped,
        remove_emptied_images=remove_emptied_images,
    )
[docs]
def remove_classes(
self, to_remove: int | Iterable[int], remove_emptied_images: bool = False
) -> Self:
"""Perform a simple remapping, where given classes are removed
Notes:
- This function is equivalent to calling :meth:`.remap_classes` where the
remapping dictionary is the identity except removed classes do not appear.
- This function is the complementary to :meth:`.keep_classes`.
Args:
to_remove: list of class ids to remove.
remove_emptied_images: If set to True, will remove from ``self.images`` the
images that are now empty of annotation.
Note that it will keep the images that were empty before the remapping.
Defaults to False.
Returns:
New dataset object where given classes have been removed
See Also:
- :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Remap-classes>`
- :meth:`.keep_classes`
- :meth:`.remap_classes`
- :meth:`.remap_from_preset`
- :meth:`.remap_from_dataframe`
- :meth:`.remap_from_csv`
- :meth:`.remap_from_other`
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(2, 2, seed=1)
>>> example
Dataset object containing 2 images and 2 objects
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 listen 14 ... 276.974642 9.718823 184.684056
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{14: 'listen', 15: 'marriage', 22: 'reach'}
>>> example.remove_classes(14)
Dataset object containing 2 images and 1 object
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[1 rows x 8 columns]
Label map :
{15: 'marriage', 22: 'reach'}
>>> example.remove_classes([14, 15])
Dataset object containing 2 images and 1 object
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[1 rows x 8 columns]
Label map :
{22: 'reach'}
>>> example.remove_classes(14, remove_emptied_images=True)
Dataset object containing 1 image and 1 object
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[1 rows x 8 columns]
Label map :
{15: 'marriage', 22: 'reach'}
"""
if isinstance(to_remove, int):
to_remove = [to_remove]
class_mapping = {i: i for i in self.label_map if i not in to_remove}
return self.remap_classes(
class_mapping,
remove_not_mapped=True,
remove_emptied_images=remove_emptied_images,
)
[docs]
def keep_classes(
self, to_keep: int | Iterable[int], remove_emptied_images: bool = False
) -> Self:
"""Perform a simple remapping, where given classes kept, and other are removed
Notes:
- This function is equivalent to calling :meth:`.remap_classes` where the
remapping dictionary is the identity except only kept classes appear.
- This function is the complementary to :meth:`.remove_classes`.
Args:
to_keep: list of class ids to keep.
remove_emptied_images: If set to True, will remove from ``self.images`` the
images that are now empty of annotation.
Note that it will keep the images that were empty before the remapping.
Defaults to False.
Returns:
New dataset object where given classes have been kept, and the rest removed.
See Also:
- :ref:`related tutorial </notebooks/1_demo_dataset.ipynb#Remap-classes>`
- :meth:`.remap_classes`
- :meth:`.remap_from_preset`
- :meth:`.remap_from_dataframe`
- :meth:`.remap_from_csv`
- :meth:`.remap_from_other`
- :meth:`.remove_classes`
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(2, 2, seed=1)
>>> example
Dataset object containing 2 images and 2 objects
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 1 listen 14 ... 276.974642 9.718823 184.684056
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[2 rows x 8 columns]
Label map :
{14: 'listen', 15: 'marriage', 22: 'reach'}
>>> example.keep_classes([15, 22])
Dataset object containing 2 images and 1 object
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[1 rows x 8 columns]
Label map :
{15: 'marriage', 22: 'reach'}
>>> example.keep_classes(22)
Dataset object containing 2 images and 1 object
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
1 131 840 air/method.bmp .bmp train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[1 rows x 8 columns]
Label map :
{22: 'reach'}
>>> example.keep_classes([15, 22], remove_emptied_images=True)
Dataset object containing 1 image and 1 object
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 229 determine/story.jpg .jpg train
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
1 0 reach 22 ... 6.311037 123.141689 174.239136
<BLANKLINE>
[1 rows x 8 columns]
Label map :
{15: 'marriage', 22: 'reach'}
"""
if isinstance(to_keep, int):
to_keep = [to_keep]
class_mapping = {i: i for i in self.label_map if i in to_keep}
return self.remap_classes(
class_mapping,
remove_not_mapped=True,
remove_emptied_images=remove_emptied_images,
)
[docs]
def simple_split(
self,
input_seed: int = 0,
split_names: Sequence[str] = ("train", "valid"),
target_split_shares: Sequence[float] = (0.8, 0.2),
inplace: bool = False,
) -> Self:
"""Simple version of splitting method, splitting images randomly.
Args:
input_seed: Random seed for splitting images. Defaults to 0.
split_names: Names of splits. Must be more than 1 element long and the same
size as ``target_split_shares``. Defaults to ``("train", "valid")``.
target_split_shares: Share values of each split. Must be the same size as
``split_names``. Must add up to 1. Defaults to ``(0.8, 0.2)``.
inplace: If set to True, will perform the splitting inplace without creating
a new dataset. Defaults to False.
Returns:
Dataset with new splits applied to its images DataFrame.
See Also:
- More in-depth explanation in this :ref:`tutorial </notebooks/2_demo_split.ipynb>`
- :meth:`split`
Example:
>>> from lours.utils.doc_utils import dummy_dataset
>>> example = dummy_dataset(200, 200, seed=1, split_names=None)
>>> example
Dataset object containing 200 images and 200 objects
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type
id
0 955 488 determine/story.jpg .jpg
1 131 895 air/method.bmp .bmp
2 229 880 political/lead.jpg .jpg
3 840 384 like/safe.bmp .bmp
4 953 668 suffer/set.jpeg .jpeg
.. ... ... ... ...
195 122 437 state/almost.tiff .tiff
196 752 300 weight/tend.jpeg .jpeg
197 554 228 remember/summer.png .png
198 688 605 yet/though.png .png
199 243 227 describe/road.tiff .tiff
<BLANKLINE>
[200 rows x 4 columns]
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 77 marriage 15 ... 425.688592 29.159255 39.517594
1 137 marriage 15 ... 383.838546 551.353799 285.211136
2 158 marriage 15 ... 174.889594 144.774339 183.531195
3 111 reach 22 ... 151.265769 97.611967 282.485307
4 121 marriage 15 ... 38.236459 522.170458 36.783181
.. ... ... ... ... ... ... ...
195 129 reach 22 ... 190.935508 104.385252 3.669239
196 33 listen 14 ... 322.704987 469.556266 193.375897
197 181 listen 14 ... 403.794364 349.250089 66.745395
198 55 reach 22 ... 2.534284 119.223978 110.346924
199 89 reach 22 ... 172.664334 658.570932 282.920285
<BLANKLINE>
[200 rows x 7 columns]
Label map :
{14: 'listen', 15: 'marriage', 22: 'reach'}
>>> splitted = example.simple_split()
>>> splitted
Dataset object containing 200 images and 200 objects
Name :
shake_effort_many
Images root :
care/suggest
Images :
width height relative_path type split
id
0 955 488 determine/story.jpg .jpg train
1 131 895 air/method.bmp .bmp train
2 229 880 political/lead.jpg .jpg train
3 840 384 like/safe.bmp .bmp train
4 953 668 suffer/set.jpeg .jpeg valid
.. ... ... ... ... ...
195 122 437 state/almost.tiff .tiff train
196 752 300 weight/tend.jpeg .jpeg train
197 554 228 remember/summer.png .png train
198 688 605 yet/though.png .png valid
199 243 227 describe/road.tiff .tiff train
<BLANKLINE>
[200 rows x 5 columns]
Annotations :
image_id category_str category_id ... box_y_min box_width box_height
id ...
0 77 marriage 15 ... 425.688592 29.159255 39.517594
1 137 marriage 15 ... 383.838546 551.353799 285.211136
2 158 marriage 15 ... 174.889594 144.774339 183.531195
3 111 reach 22 ... 151.265769 97.611967 282.485307
4 121 marriage 15 ... 38.236459 522.170458 36.783181
.. ... ... ... ... ... ... ...
195 129 reach 22 ... 190.935508 104.385252 3.669239
196 33 listen 14 ... 322.704987 469.556266 193.375897
197 181 listen 14 ... 403.794364 349.250089 66.745395
198 55 reach 22 ... 2.534284 119.223978 110.346924
199 89 reach 22 ... 172.664334 658.570932 282.920285
<BLANKLINE>
[200 rows x 8 columns]
Label map :
{14: 'listen', 15: 'marriage', 22: 'reach'}
>>> splitted.images["split"].value_counts() / len(splitted)
split
train 0.725
valid 0.275
Name: count, dtype: float64
"""
if len(split_names) <= 1:
raise ValueError(
f"Must provide at least 2 split names. Got {split_names} of size"
f" {len(split_names)} instead."
)
if len(target_split_shares) != len(split_names):
raise ValueError(
"Size mismatch between 'split_names' and 'split_shares'"
f" ({len(split_names)} vs {len(target_split_shares)})"
)
if sum(target_split_shares) != 1:
raise ValueError(
"Split share values must addup to 1. Got"
f" {sum(target_split_shares)} instead"
)
gen = np.random.default_rng(input_seed)
split = gen.choice(
list(split_names), size=len(self), p=list(target_split_shares)
)
if inplace:
self.images["split"] = split
return self
else:
return self.from_template(images=self.images.assign(split=split))
[docs]
def split(
    self,
    input_seed: int = 0,
    split_names: Sequence[str] = ("train", "valid"),
    target_split_shares: Sequence[float] = (0.8, 0.2),
    keep_separate_groups: group_list = ("image_id",),
    keep_balanced_groups: group_list = ("category_id",),
    keep_balanced_groups_weights: Sequence[float] | None = None,
    inplace: bool = False,
    hist_cost_weight: float = 1,
    share_cost_weight: float = 1,
    earth_mover_regularization: float = 0,
) -> Self:
    """Perform the split operation on annotations and images.

    This algorithm works in 2 steps:

    1. Divide the dataframe into atomic sub frames. Given the image and annotation
       attributes that need to be kept separate, we can construct sub frames of
       elements that cannot be in different splits.
    2. Construct the split dataframes iteratively by trying to keep given column
       values with a balanced repartition between splits, along with keeping split
       sizes as close to target share as possible. Each atomic sub-dataframe is
       routed to the split that minimizes a cost function which tries to optimize
       repartition targets.

    If no group has to be kept separate (other than ``image_id``) and no group has
    to be kept balanced, this falls back to :meth:`simple_split`.

    Warning:
        If ``self.images`` and ``self.annotations`` each have a column with the
        same name, the column in ``self.images`` will be ignored. Make sure column
        names are mutually exclusive to avoid problems.

    See :func:`lours.dataset.split.dataset_splitter.split_dataframe`

    Args:
        input_seed: Seed used for shuffling sub dataframes before beginning step 2
            of splitting algorithm. Defaults to 0.
        split_names: Names of splits. Must contain at least 2 names and be the same
            length as ``target_split_shares``. Defaults to ("train", "valid").
        target_split_shares: List of target relative size of each split. Must be the
            same length as ``split_names``, and its values must add up to 1 (up to
            floating point rounding). Defaults to (0.8, 0.2).
        keep_separate_groups: columns or groups
            (see :obj:`.group`) in annotations or images DataFrame to keep separate.
            That is for a particular column or group, two rows with the same value
            cannot be in different splits. Note that ``image_id`` will be added to
            that list, because split happen at the image level.
            Defaults to ("image_id",).
        keep_balanced_groups: columns or groups
            (see :obj:`.group`) in annotations or images DataFrame to keep balanced.
            That is for a particular group, the distribution of values is the same
            between original DataFrame and its split, as much as possible.
            Defaults to ("category_id",).
        keep_balanced_groups_weights: Importance of each group to keep balanced when
            computing histogram cost. If not None, must be a sequence of the same
            size as ``keep_balanced_groups``. If None, every group gets a weight
            of 1. Defaults to None.
        inplace: If set, will modify dataframes inplace. This can silently modify
            some objects (like Datasets) that use them but has a lower memory
            footprint. Defaults to False.
        hist_cost_weight: importance of histogram cost for balanced groups.
            The higher, the more important the histogram cost will be for the
            decision of where to put each split. Defaults to 1.
        share_cost_weight: importance of share cost for balanced groups.
            The higher, the more important the share cost will be for the decision
            of where to put each split. Defaults to 1.
        earth_mover_regularization: Regularization parameter applied to sinkhorn's
            algorithm during earth mover distance computation. See
            :func:`lours.dataset.split.balanced_group.earth_mover_distance`.
            Defaults to 0.

    Raises:
        ValueError: if fewer than 2 split names are given, if ``split_names`` and
            ``target_split_shares`` do not have the same length, or if the shares
            do not add up to 1.

    Returns:
        new Dataset with the split column populated with the corresponding
        split names, or ``self`` if ``inplace`` is set.

    See Also:
        - More in-depth explanation in this :ref:`tutorial </notebooks/2_demo_split.ipynb>`
        - :meth:`simple_split`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset(
        ...     200,
        ...     n_attribute_columns_images={"balanced": 10, "separate": 10},
        ...     split_names=None,
        ...     seed=1,
        ... )
        >>> example
        Dataset object containing 200 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type balanced separate
        id
        0 955 488 determine/story.jpg .jpg send system
        1 131 895 air/method.bmp .bmp note system
        2 229 880 political/lead.jpg .jpg anything law
        3 840 384 like/safe.bmp .bmp anything likely
        4 953 668 suffer/set.jpeg .jpeg training attack
        .. ... ... ... ... ... ...
        195 122 437 state/almost.tiff .tiff anything star
        196 752 300 weight/tend.jpeg .jpeg could rest
        197 554 228 remember/summer.png .png anything system
        198 688 605 yet/though.png .png note number
        199 243 227 describe/road.tiff .tiff end number
        <BLANKLINE>
        [200 rows x 6 columns]
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 77 reach 22 ... 45.427512 40.116677 318.073851
        1 137 marriage 15 ... 202.481384 435.389400 475.375279
        <BLANKLINE>
        [2 rows x 7 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> example.images["separate"].value_counts()
        separate
        star 27
        likely 27
        number 27
        attack 22
        rest 20
        law 18
        entire 17
        enough 16
        system 15
        often 11
        Name: count, dtype: int64
        >>> splitted = example.split(
        ...     keep_balanced_groups=["balanced"], keep_separate_groups=["separate"]
        ... )
        Splitting annotations ...
        Separating input data into atomic chunks
        1 chunks to distribute across 2 splits
        Splitting images ...
        Separating input data into atomic chunks
        10 chunks to distribute across 2 splits
        >>> splitted
        Dataset object containing 200 images and 2 objects
        Name :
        shake_effort_many
        Images root :
        care/suggest
        Images :
        width height relative_path type split balanced separate
        id
        0 955 488 determine/story.jpg .jpg train send system
        1 131 895 air/method.bmp .bmp train note system
        2 229 880 political/lead.jpg .jpg valid anything law
        3 840 384 like/safe.bmp .bmp train anything likely
        4 953 668 suffer/set.jpeg .jpeg train training attack
        .. ... ... ... ... ... ... ...
        195 122 437 state/almost.tiff .tiff valid anything star
        196 752 300 weight/tend.jpeg .jpeg train could rest
        197 554 228 remember/summer.png .png train anything system
        198 688 605 yet/though.png .png train note number
        199 243 227 describe/road.tiff .tiff train end number
        <BLANKLINE>
        [200 rows x 7 columns]
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 77 reach 22 ... 45.427512 40.116677 318.073851
        1 137 marriage 15 ... 202.481384 435.389400 475.375279
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {14: 'listen', 15: 'marriage', 22: 'reach'}
        >>> splitted.images.groupby("split")["separate"].value_counts()
        split separate
        train likely 27
        number 27
        attack 22
        rest 20
        entire 17
        enough 16
        system 15
        often 11
        star 0
        law 0
        valid star 27
        law 18
        entire 0
        attack 0
        rest 0
        likely 0
        system 0
        often 0
        enough 0
        number 0
        Name: count, dtype: int64
        >>> splitted.images.groupby("split")["balanced"].value_counts()
        split balanced
        train could 21
        coach 21
        send 19
        firm 18
        end 17
        anything 14
        training 14
        region 11
        lead 10
        note 10
        valid could 8
        end 7
        note 6
        anything 5
        send 5
        firm 4
        training 3
        region 3
        lead 2
        coach 2
        Name: count, dtype: int64
    """
    from math import isclose

    if len(split_names) <= 1:
        raise ValueError(
            f"Must provide at least 2 split names. Got {split_names} of size"
            f" {len(split_names)} instead."
        )
    if len(target_split_shares) != len(split_names):
        raise ValueError(
            "Size mismatch between 'split_names' and 'split_shares'"
            f" ({len(split_names)} vs {len(target_split_shares)})"
        )
    # Compare with a tolerance rather than ``!= 1``: perfectly valid shares such as
    # (0.7, 0.2, 0.1) add up to 0.9999999999999999 in floating point arithmetic
    # and would otherwise be rejected.
    if not isclose(sum(target_split_shares), 1):
        raise ValueError(
            "Split share values must addup to 1. Got"
            f" {sum(target_split_shares)} instead"
        )

    # Nothing to keep separate besides the image itself and nothing to balance:
    # the costly optimisation is pointless, a plain random draw is enough.
    # NOTE: simple_split runs its own validation of the shares.
    if (
        (not keep_separate_groups)
        or keep_separate_groups == "image_id"
        or keep_separate_groups == ("image_id",)
        or keep_separate_groups == ["image_id"]
    ) and (not keep_balanced_groups):
        print("Using simple random split")
        return self.simple_split(
            input_seed, split_names, target_split_shares, inplace
        )

    keep_separate_groups = groups_to_list(keep_separate_groups)
    keep_balanced_groups = groups_to_list(keep_balanced_groups)

    if keep_balanced_groups_weights is None:
        keep_balanced_groups_weights = [1] * len(keep_balanced_groups)
    else:
        keep_balanced_groups_weights = [*keep_balanced_groups_weights]

    # Groups that only exist as image columns are handled in a second pass, done
    # on the images dataframe alone. A column present in both dataframes is
    # treated as an annotation column (see the warning in the docstring).
    keep_balanced_group_names = get_group_names(keep_balanced_groups)
    keep_balanced_image_groups_indices = [
        i
        for i, name in enumerate(keep_balanced_group_names)
        if (name in self.images.columns and name not in self.annotations.columns)
    ]
    keep_balanced_image_groups = [
        keep_balanced_groups[i] for i in keep_balanced_image_groups_indices
    ]
    keep_balanced_image_groups_weights = [
        keep_balanced_groups_weights[i] for i in keep_balanced_image_groups_indices
    ]

    keep_separate_group_names = get_group_names(keep_separate_groups)
    keep_separate_image_groups = [
        name
        for name in keep_separate_group_names
        if (name in self.images.columns and name not in self.annotations.columns)
    ]

    # First pass: split the annotations, with the images acting as root data.
    print("Splitting annotations ...")
    splitted_annotations, splitted_images = split_dataframe(
        root_data=self.images,
        input_data=self.annotations,
        input_seed=input_seed,
        split_names=split_names,
        target_split_shares=target_split_shares,
        keep_separate_groups=keep_separate_groups,
        keep_balanced_groups=keep_balanced_groups,
        keep_balanced_groups_weights=keep_balanced_groups_weights,
        inplace=inplace,
        split_at_root_level=True,
        hist_cost_weight=hist_cost_weight,
        share_cost_weight=share_cost_weight,
        earth_mover_regularization=earth_mover_regularization,
    )

    # Second pass: run the splitter on the images returned by the first pass,
    # using only the image-level groups.
    print("Splitting images ...")
    splitted_images = split_dataframe(
        input_data=splitted_images,
        root_data=None,
        input_seed=input_seed,
        split_names=split_names,
        target_split_shares=target_split_shares,
        keep_separate_groups=keep_separate_image_groups,
        keep_balanced_groups=keep_balanced_image_groups,
        keep_balanced_groups_weights=keep_balanced_image_groups_weights,
        inplace=inplace,
        split_at_root_level=False,
        hist_cost_weight=hist_cost_weight,
        share_cost_weight=share_cost_weight,
        earth_mover_regularization=earth_mover_regularization,
    )

    if inplace:
        self.images = splitted_images
        self.annotations = splitted_annotations
        return self
    return self.from_template(
        images=splitted_images, annotations=splitted_annotations
    )
[docs]
def to_parquet(self, output_dir: Path | str, overwrite: bool = False) -> None:
    """Dump the dataset into a folder, with the help of
    :func:`lours.utils.parquet_saver.dict_to_parquet`.

    Every instance attribute whose name does not start with an underscore is
    exported, along with the name of the class under the ``__name__`` key.
    According to the saver's documentation, dataframes end up in parquet files and
    other attributes in a metadata.yaml file.

    Note:
        The dataframe dtypes must be serializable as parquet. This includes int,
        float, strings, lists; but not custom objects like e.g.
        :class:`pathlib.Path`

    Args:
        output_dir: folder path where to save the object's attributes.
            If ``overwrite`` is set to False, it must not already exist.
        overwrite: If set to True, will remove the ``output_dir`` directory if it
            already exists. Defaults to False

    See Also:
        :mod:`lours.utils.parquet_saver`
    """
    # Private attributes (leading underscore) are left out of the export
    exported_attributes = {
        name: value
        for name, value in vars(self).items()
        if not name.startswith("_")
    }
    # Record which class was saved alongside its attributes
    exported_attributes["__name__"] = self.__class__.__name__
    destination = Path(output_dir)
    dict_to_parquet(exported_attributes, destination, overwrite=overwrite)
[docs]
def to_darknet(
    self,
    output_path: Path | str,
    copy_images: bool = False,
    overwrite_images: bool = True,
    overwrite_labels: bool = True,
    create_split_folder: bool = False,
) -> None:
    """Export the dataset to the darknet format, readable by
    `darknet <https://github.com/AlexeyAB/darknet>`__ .

    Images, annotations and data files are saved in the same folder. This is a
    thin wrapper around :func:`lours.dataset.io.darknet.dataset_to_darknet`,
    called with ``yolo_version=1``.

    Args:
        output_path: folder where images and annotations will be stored
        copy_images: If set to False,
            will create a symbolic link instead of copying. Much faster,
            but needs to keep original images in the same relative path.
            Defaults to False.
        overwrite_images: if set to False, will skip images that are already copied.
            Defaults to True.
        overwrite_labels: if set to False, will skip annotation that are already
            created. Defaults to True.
        create_split_folder: if set to True, will create a dedicated folder for each
            split and will save images in it. Image paths in {split}.txt will be
            changed accordingly. Note that this changes the dataset structure.
            Defaults to False

    See Also:
        - :mod:`lours.dataset.io.darknet`
        - :meth:`to_yolov5`
        - :meth:`to_yolov7`
    """
    from .io.darknet import dataset_to_darknet as export_darknet

    # Options specific to the plain darknet flavour of the exporter
    flavour_options = {
        "yolo_version": 1,
        "create_split_folder": create_split_folder,
    }
    return export_darknet(
        self,
        output_path,
        copy_images,
        overwrite_images,
        overwrite_labels,
        **flavour_options,
    )
[docs]
def to_yolov5(
    self,
    output_path: Path | str,
    copy_images: bool = False,
    overwrite_images: bool = True,
    overwrite_labels: bool = True,
    split_name_mapping: dict[str, str] | None = None,
    create_split_folder: bool = False,
) -> None:
    """Export the dataset to the format readable by
    `Yolov5 <https://github.com/ultralytics/yolov5>`__ .

    Each split is saved in its dedicated split file containing paths to the
    corresponding images, images and annotations are separated in the folders
    ``images`` and ``labels``, and the corresponding info is saved in data.yaml,
    at the root of the output path.

    Optionally, split values are remapped so that they fit the training script.
    Normally, yolov5 expects ``train``, ``val`` and ``test`` sets. The default
    mapping replaces ``valid`` and ``validation`` with ``val``, and ``eval`` with
    ``test``.

    This is a thin wrapper around
    :func:`lours.dataset.io.darknet.dataset_to_darknet`, called with
    ``yolo_version=5``.

    Args:
        output_path: folder where images and annotations will be stored
        copy_images: If set to False,
            will create a symbolic link instead of copying. Much faster,
            but needs to keep original images in the same relative path.
            Defaults to False.
        overwrite_images: if set to False, will skip images that are already copied.
            Defaults to True.
        overwrite_labels: if set to False, will skip annotation that are already
            created. Defaults to True.
        split_name_mapping: mapping dict to replace split names to other ones. split
            names not present in mapping will not be modified. If set to None,
            will apply yolov5 conventional mapping, i.e.
            ``{'valid': 'val', 'validation': 'val', 'eval': 'test'}``.
            Defaults to None
        create_split_folder: if set to True, will create a dedicated folder for each
            split and will save images in it. Image paths in {split}.txt will be
            changed accordingly. Note that this changes the dataset structure.
            Defaults to False

    See Also:
        - :mod:`lours.dataset.io.darknet`
        - :meth:`to_darknet`
        - :meth:`to_yolov7`
    """
    from .io.darknet import dataset_to_darknet as export_darknet

    # Options specific to the yolov5 flavour of the exporter
    flavour_options = {
        "yolo_version": 5,
        "split_name_mapping": split_name_mapping,
        "create_split_folder": create_split_folder,
    }
    return export_darknet(
        self,
        output_path,
        copy_images,
        overwrite_images,
        overwrite_labels,
        **flavour_options,
    )
[docs]
def to_yolov7(
    self,
    output_path: Path | str,
    copy_images: bool = False,
    overwrite_images: bool = True,
    overwrite_labels: bool = True,
    split_name_mapping: dict[str, str] | None = None,
    create_split_folder: bool = False,
) -> None:
    """Export the dataset to the format readable by
    `Yolov7 <https://github.com/WongKinYiu/yolov7>`__ .

    Each split is saved in its dedicated split file containing paths to the
    corresponding images, images and annotations are separated in the folders
    ``images`` and ``labels``, and the corresponding info is saved in data.yaml,
    at the root of the output path.

    Optionally, split values are remapped so that they fit the training script,
    which normally expects ``train``, ``val`` and ``test`` sets. The default
    mapping is the same as for yolov5: it replaces ``valid`` and ``validation``
    with ``val``, and ``eval`` with ``test``.

    This is a thin wrapper around
    :func:`lours.dataset.io.darknet.dataset_to_darknet`, called with
    ``yolo_version=7``.

    Note:
        The only difference with :func:`.to_yolov5` is the fact that path to split
        list files are absolute and not relative to the yaml file parent folder.

    Args:
        output_path: folder where images and annotations will be stored
        copy_images: If set to False,
            will create a symbolic link instead of copying. Much faster,
            but needs to keep original images in the same relative path.
            Defaults to False.
        overwrite_images: if set to False, will skip images that are already copied.
            Defaults to True.
        overwrite_labels: if set to False, will skip annotation that are already
            created. Defaults to True.
        split_name_mapping: mapping dict to replace split names to other ones. split
            names not present in mapping will not be modified. If set to None,
            will apply yolov5 conventional mapping, i.e.
            ``{'valid': 'val', 'validation': 'val', 'eval': 'test'}``.
            Defaults to None
        create_split_folder: if set to True, will create a dedicated folder for each
            split and will save images in it. Image paths in {split}.txt will be
            changed accordingly. Note that this changes the dataset structure.
            Defaults to False

    See Also:
        - :mod:`lours.dataset.io.darknet`
        - :meth:`to_darknet`
        - :meth:`to_yolov5`
    """
    from .io.darknet import dataset_to_darknet as export_darknet

    # Options specific to the yolov7 flavour of the exporter
    flavour_options = {
        "yolo_version": 7,
        "split_name_mapping": split_name_mapping,
        "create_split_folder": create_split_folder,
    }
    return export_darknet(
        self,
        output_path,
        copy_images,
        overwrite_images,
        overwrite_labels,
        **flavour_options,
    )
[docs]
def to_coco(
    self,
    output_path: Path | str,
    copy_images: bool = False,
    to_jpg: bool = True,
    overwrite_images: bool = True,
    overwrite_labels: bool = True,
    add_split_suffix: bool | None = None,
    box_format: str = "XYWH",
) -> None:
    """Export the dataset to the coco format. The output directory will contain
    one JSON file per split present in the dataset.

    This is a thin wrapper around :func:`lours.dataset.io.coco.dataset_to_coco`.

    Args:
        output_path: Output folder where to save the JSON files
        copy_images: If set to False,
            will create a symbolic link instead of copying. Much faster,
            but needs to keep original images in the same relative path.
            Defaults to False.
        to_jpg: if True, along with previous option, will convert images to jpg if
            needed. Defaults to True.
        overwrite_images: if set to False, will skip images that are already copied.
            Defaults to True.
        overwrite_labels: if set to False, will skip JSON files that are already
            created. Defaults to True.
        add_split_suffix: if set to True, will append the name of the split to the
            json output files. Cannot be False if dataset has multiple splits.
            If not set, will add suffix only if dataset has multiple splits.
            Defaults to None.
        box_format: what type of annotation the json file will have.
            It will be converted from XYWH. Defaults to XYWH

    See Also:
        - :mod:`lours.dataset.io.coco`
    """
    from .io.coco import dataset_to_coco as export_coco

    # Forwarded positionally, in the order of this method's own parameters
    forwarded = (
        output_path,
        copy_images,
        to_jpg,
        overwrite_images,
        overwrite_labels,
        add_split_suffix,
    )
    return export_coco(self, *forwarded, box_format=box_format)
[docs]
def to_caipy(
    self,
    output_path: Path | str,
    use_schema: bool = False,
    json_schema: Path | str | None = "default",
    copy_images: bool = True,
    to_jpg: bool = True,
    overwrite_images: bool = True,
    overwrite_labels: bool = True,
    flatten_paths: bool = True,
) -> None:
    """Export the dataset to the cAIpy format.

    This is a thin wrapper around
    :func:`lours.dataset.io.caipy.dataset_to_caipy`.

    Note:
        - Unless specified otherwise, relative paths of images are flattened during
          the export, which modifies the dataset if the images and annotations
          were stored in subfolders, but will put all images and annotations of a
          particular split in their respective root folder.
        - If schema is not given, the nested dictionary will be deduced from column
          names with the separator "."

    Args:
        output_path: folder where cAIpy folder will be recreated
        use_schema: If set to True, and ``json_schema`` is not None, will use schema
            for validation and formatting (see option ``json_schema``).
            Defaults to False.
        json_schema: Path to a schema that output json dicts will be tested against
            for compliance. They will also be used to remove columns for fields not
            included in the schema. Can be either a url or a path object.
            If set to None, or ``use_schema`` is set to False,
            will not perform any test. Defaults to default schema.
        copy_images: If set to False,
            will create a symbolic link instead of copying. Much faster,
            but needs to keep original images in the same relative path.
            Defaults to True.
        to_jpg: if True, along with previous option, will convert
            images to jpg if needed. Defaults to True.
        overwrite_images: if set to False,
            will skip images that are already copied. Defaults to True.
        overwrite_labels: if set to False,
            will skip annotation that are already created. Defaults to True.
        flatten_paths: if set to True, will put all files in the root Annotations
            and Images folders by replacing folder separation ("/") with "_" in
            relative path. Defaults to True

    See Also:
        - :mod:`lours.dataset.io.caipy`
        - :meth:`to_caipy_generic`
    """  # noqa: E501
    from .io.caipy import dataset_to_caipy as export_caipy

    # Forwarded positionally, in the order of this method's own parameters
    forwarded = (
        output_path,
        use_schema,
        json_schema,
        copy_images,
        to_jpg,
        overwrite_images,
        overwrite_labels,
        flatten_paths,
    )
    return export_caipy(self, *forwarded)
[docs]
def to_caipy_generic(
    self,
    output_images_folder: Path | str | None,
    output_annotations_folder: Path | str,
    use_schema: bool = False,
    json_schema: Path | str | None = "default",
    copy_images: bool = True,
    to_jpg: bool = True,
    overwrite_images: bool = True,
    overwrite_labels: bool = True,
    flatten_paths: bool = True,
) -> None:
    """Export the dataset to the cAIpy format, with explicit images and annotations
    folders rather than a root folder with Images and Annotations sub-folders.

    It is especially useful when creating predictions or saving variations of
    annotations. This is a thin wrapper around
    :func:`lours.dataset.io.caipy.dataset_to_caipy_generic`.

    Note:
        - Unless specified otherwise, relative paths of images are flattened during
          the export, which modifies the dataset if the images and annotations
          were stored in subfolders, but will put all images and annotations of a
          particular split in their respective root folder.
        - If schema is not given, the nested dictionary will be deduced from column
          names with the separator "."

    Args:
        output_images_folder: root folder where the images will be saved. If None,
            will not save images. Useful when only saving predictions or
            variations of annotations.
        output_annotations_folder: root folder where the json file will be saved.
        use_schema: If set to True, and ``json_schema`` is not None, will use schema
            for validation and formatting (see option ``json_schema``).
            Defaults to False.
        json_schema: Path to a schema that output json dicts will be tested against
            for compliance. They will also be used to remove columns for fields not
            included in the schema. Can be either a url or a path object.
            If set to None, or ``use_schema`` is set to False,
            will not perform any test. Defaults to default schema.
        copy_images: If set to False, will create a symbolic link instead of
            copying. Much faster, but needs to keep original images in the same
            relative path. Defaults to True.
        to_jpg: if True, will convert images to jpg if needed. Defaults to True.
        overwrite_images: if set to False, will skip images that are already copied.
            Defaults to True.
        overwrite_labels: if set to False, will skip annotation that are already
            created. Defaults to True.
        flatten_paths: if set to True, will put all files in the root Annotations
            and Images folders by replacing folder separation ("/") with "_" in
            relative path. Defaults to True

    See Also:
        - :mod:`lours.dataset.io.caipy`
        - :meth:`to_caipy`
    """  # noqa: E501
    from .io.caipy import dataset_to_caipy_generic as export_caipy_generic

    # Forwarded positionally, in the order of this method's own parameters
    forwarded = (
        output_images_folder,
        output_annotations_folder,
        use_schema,
        json_schema,
        copy_images,
        to_jpg,
        overwrite_images,
        overwrite_labels,
        flatten_paths,
    )
    return export_caipy_generic(self, *forwarded)
[docs]
def to_fiftyone(
    self,
    dataset_name: str | None = None,
    annotations_name: str = "groundtruth",
    allow_keypoints: bool = False,
    record_fo_ids: bool = False,
    existing: Literal["update", "erase", "error"] = "error",
) -> "fo.Dataset":
    """Convert the dataset into a
    :class:`fiftyone dataset <fiftyone.core.dataset.Dataset>`, that can then be
    inspected with Fiftyone's webapp. The resulting dataset will have the sample
    field with the name specified in the argument ``annotations_name``.

    If the dataset has booleanized columns (images or annotations), the conversion
    is performed on the result of :meth:`debooleanize` instead.

    Args:
        dataset_name: Name of the fiftyone dataset to add the samples to.
            If the dataset does not exist, it will be created.
            If set to None, will be ``self.dataset_name``, or the folder name of
            ``self.images_root`` if ``self.dataset_name`` is None as well.
            Defaults to None.
        annotations_name: Name of the sample field for the annotations. If the
            dataset already exists, the sample field will be created if it does
            not exist, and it will be merged if it already exists.
            Defaults to "groundtruth".
        allow_keypoints: if set to True, will convert bounding boxes of size 0 to be
            keypoints instead of detection objects. Defaults to False.
        record_fo_ids: whether to record the fiftyone ids of samples and
            annotations. If set to True, will create ``fo_id`` column in self.images
            and ``fo_id`` and ``is_keypoint`` column in self.annotations to be able
            to reindex them in the created fiftyone dataset. Defaults to False.
        existing: What to do in case there is already a fiftyone dataset with the
            same name.

            - "error": will raise an error.
            - "erase": will erase the existing dataset before uploading
              this one
            - "update": will try to update the dataset by fusing together samples
              with the same "relative_path"

            Defaults to "error".

    Returns:
        :class:`fiftyone.core.dataset.Dataset`: Fiftyone dataset that can then
        be used to launch the webapp with
        ``fiftyone.launch_app(dataset.to_fiftyone("dataset"))``

    See Also:
        - :ref:`Related tutorial </notebooks/5_demo_fiftyone.ipynb>`
        - :mod:`lours.utils.fiftyone_convert`
    """
    from ..utils.fiftyone_convert import create_fo_dataset

    if any(self.booleanized_columns[frame] for frame in ("images", "annotations")):
        # NOTE(review): the ids requested with ``record_fo_ids`` are written on
        # the object returned by ``debooleanize()``; if that is a copy, they
        # never reach ``self`` - TODO confirm.
        debooleanized = self.debooleanize()
        return debooleanized.to_fiftyone(
            dataset_name, annotations_name, allow_keypoints, record_fo_ids, existing
        )

    if dataset_name is None:
        # Fall back on the dataset's own name, then on the images folder name
        dataset_name = (
            self.images_root.name
            if self.dataset_name is None
            else self.dataset_name
        )

    fo_dataset, sample_ids, annotation_ids = create_fo_dataset(
        name=dataset_name,
        images_root=self.images_root,
        images=self.images,
        annotations={annotations_name: self.annotations},
        label_map=self.label_map,
        image_tag_columns=self.get_image_attributes(),
        annotations_attributes_columns=self.get_annotations_attributes(),
        allow_keypoints=allow_keypoints,
        existing=existing,
    )

    if not record_fo_ids:
        return fo_dataset

    self.images["fo_id"] = sample_ids
    # a simple concat with axis=1 could work, but copying column by column is
    # clearer in terms of what columns are expected and acts as a check
    for column in ("fo_id", "is_keypoint"):
        self.annotations[column] = annotation_ids[annotations_name][column]
    return fo_dataset
[docs]
def add_detection_annotation(
    self,
    image_id: int | Sequence[int] | ndarray,
    bbox_coordinates: Sequence[float] | Sequence[Sequence[float]] | ndarray,
    category_id: ndarray | int | Sequence[int],
    format_string: str = "XYWH",
    inplace: bool = False,
    label_map: dict[int, str] | None = None,
    category_ids_mapping: dict[int, int] | None = None,
    confidence: float | ndarray | Sequence[float] | None = None,
    **other_columns: float | str | ndarray | Sequence[float] | Sequence[str],
) -> Self:
    """Append one or several detection annotations to the current dataset.

    For a single annotation, every option can be a single value. For multiple
    annotations, every option needs to be an array of such values, and every array
    needs to be the same length.

    This is a thin wrapper around
    :func:`lours.utils.annotations_appender.add_detection_annotation`.

    Note:
        In addition to the following options, you can add other fields as well,
        with keyword arguments.

    Args:
        image_id: image identifier to link each detection to the corresponding image
        bbox_coordinates: list of coordinates for the bounding box. Can follow any
            compatible format, as long as it is described by ``format_string``
        category_id: category of each detection. Label will be deduced from
            dataset's label map
        format_string: format of coordinates, whether coordinates are relatives,
            using corner points of the box, box dimensions, etc.
            See :func:`.import_bbox` for more info. Defaults to "XYWH".
        inplace: if set to True, will modify the dataset inplace and return self.
            Else, will return a modified Dataset. Defaults to False.
        label_map: In the case the current dataset's label map is incomplete, merge
            it with this new label map. current label map and new label map must be
            compatible, see :func:`.merge_label_maps`. Defaults to None.
        category_ids_mapping: Optional dictionary to map annotated category ids into
            the right ids. This is useful for example when a neural network can only
            use a contiguous label map. Defaults to None
        confidence: Optional field for confidence, in the case annotations are
            actually predictions. Defaults to None.
        **other_columns: kwargs of additional optional fields

    Raises:
        ValueError: raised when given numpy arrays do not have the same number of
            elements, or if the bounding box coordinates is not of the shape either
            [4], or [N, 4]

    Returns:
        A new Dataset with appended annotations to ``self.annotations`` if ``inplace``
        is False, or itself otherwise.

    See Also:
        :mod:`lours.utils.annotations_appender`
        :meth:`annotation_append`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset()
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg train
        1 377 167 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 step 15 ... 73.932999 71.552480 42.673983
        1 0 why 19 ... 4.567638 248.551257 122.602211
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> example.add_detection_annotation(
        ...     image_id=0,
        ...     bbox_coordinates=[0, 0, 0.5, 0.5],
        ...     format_string="xyxy",
        ...     category_id=14,
        ...     confidence=0.5,
        ... )
        Dataset object containing 2 images and 3 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg train
        1 377 167 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_width box_height confidence
        id ...
        0 0 step 15 ... 71.552480 42.673983 NaN
        1 0 why 19 ... 248.551257 122.602211 NaN
        2 0 14 14 ... 171.000000 68.000000 0.5
        <BLANKLINE>
        [3 rows x 9 columns]
        Label map :
        {14: '14', 15: 'step', 19: 'why', 25: 'interview'}
    """
    from ..utils.annotations_appender import (
        add_detection_annotation as append_detections,
    )

    # Extra keyword arguments are forwarded untouched as additional columns
    updated_dataset = append_detections(
        input_dataset=self,
        image_id=image_id,
        bbox_coordinates=bbox_coordinates,
        format_string=format_string,
        category_id=category_id,
        inplace=inplace,
        label_map=label_map,
        category_ids_mapping=category_ids_mapping,
        confidence=confidence,
        **other_columns,
    )
    return updated_dataset
[docs]
def annotation_append(
    self,
    format_string: str = "XYWH",
    category_ids_mapping: dict[int, int] | None = None,
    label_map: dict[int, str] | None = None,
) -> "AnnotationAppender":
    """Create a context manager to add detection tensors to the current dataset with
    the :meth:`.AnnotationAppender.append` method, as if the Dataset was a list.

    After the appending is finished, the appender constructs big numpy arrays to
    concatenate to the dataset's annotations dataframe.

    Note:
        The dataset object from which this context manager is created is
        modified inplace, similar to a list append.

    Args:
        format_string: format string for incoming bounding boxes. Depends on your
            detector conventions. Defaults to "XYWH".
        category_ids_mapping: Optional dictionary to map annotated category ids into
            the right ids. This is useful for example when a neural network can only
            use a contiguous label map. Defaults to None
        label_map: Optional label map for objects outside the current label map.
            Must be compatible with the current label map (i.e. no category id
            clash). Defaults to None.

    Returns:
        :class:`.AnnotationAppender`: Context
        manager to easily add detection tensors

    See Also:
        :mod:`lours.utils.annotations_appender`
        :meth:`add_detection_annotation`

    Example:
        >>> from lours.utils.doc_utils import dummy_dataset
        >>> example = dummy_dataset()
        >>> example
        Dataset object containing 2 images and 2 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg train
        1 377 167 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_y_min box_width box_height
        id ...
        0 0 step 15 ... 73.932999 71.552480 42.673983
        1 0 why 19 ... 4.567638 248.551257 122.602211
        <BLANKLINE>
        [2 rows x 8 columns]
        Label map :
        {15: 'step', 19: 'why', 25: 'interview'}
        >>> with example.annotation_append(
        ...     format_string="xyxy", label_map={1: "new_class"}
        ... ) as appender:
        ...     appender.append(
        ...         image_id=0,
        ...         bbox_coordinates=np.array([[0, 0, 0.5, 0.5]]),
        ...         category_id=15,
        ...         confidence=0.5,
        ...         other_attribute=0,
        ...     )
        ...     appender.append(
        ...         image_id=[1, 0],
        ...         bbox_coordinates=np.array(
        ...             [[0.1, 0.1, 0.9, 0.9], [0.2, 0.3, 0.5, 0.5]]
        ...         ),
        ...         category_id=np.array([1, 15]),
        ...         confidence=np.array([0.2, 0.3]),
        ...         other_attribute=np.array([3, 4]),
        ...     )
        >>> example
        Dataset object containing 2 images and 5 objects
        Name :
        inside_else_memory
        Images root :
        such/serious
        Images :
        width height relative_path type split
        id
        0 342 136 help/me.jpeg .jpeg train
        1 377 167 whatever/wait.png .png train
        Annotations :
        image_id category_str category_id ... box_height confidence other_attribute
        id ...
        0 0 step 15 ... 42.673983 NaN NaN
        1 0 why 19 ... 122.602211 NaN NaN
        2 0 step 15 ... 68.000000 0.5 0.0
        3 1 new_class 1 ... 133.600000 0.2 3.0
        4 0 step 15 ... 27.200000 0.3 4.0
        <BLANKLINE>
        [5 rows x 10 columns]
        Label map :
        {1: 'new_class', 15: 'step', 19: 'why', 25: 'interview'}
    """
    from ..utils.annotations_appender import AnnotationAppender as Appender

    # The appender receives the dataset itself as its first argument
    appender = Appender(
        self,
        format_string=format_string,
        category_ids_mapping=category_ids_mapping,
        label_map=label_map,
    )
    return appender