"""Set of functions to construct groups in a dataset, and compute analytics pre group
during e.g. evaluation
"""
from collections.abc import Iterable, Sequence
from dataclasses import dataclass
from typing import Literal, overload
import numpy as np
import pandas as pd
[docs]
@dataclass
class ContinuousGroup:
"""Data Class to encapsulate information to give to the cutting function of pandas
as parameters, typically used to group continuous data by a limited number of
groups, similarly to an histogram.
Depending on the attributes, il will use either :func:`pandas.cut` or
:func:`pandas.qcut` to give a particular label for each row of you dataframe
"""
name: str
"""Name of the column to use the cutting function on"""
bins: float | list[float] = 10
"""value given to the ``bin`` parameter of pandas' function. Can be either a float
(for the number of bins), or a list of values that will be used as actual bins. Note
that in the case of :func:`pandas.qcut`, only this attribute being a float makes
sense."""
qcut: bool = False
"""Whether to use :func:`pandas.qcut` or :func:`pandas.cut`. Qcut will design the
bins so that each interval will contain the same number of samples, while cut will
design the bins so that first and last bins are minimum and maximum value of
considered column, and all the bins are equally spaced
(similar to :func:`numpy.linspace`)
"""
log: bool = False
"""When using cut (and not qcut), whether to separate bins equally in the linear
space or the log space.
As such, bins for lower values would be closer to each other"""
label_type: str = "intervals"
"""What type of label to give to each group given by the cutting function.
Can be either:
- "intervals" (default): :class:`pandas.Interval` object usually given as Series
values by :func:`pandas.cut` and :func:`pandas.qcut`
- "mid": mid point between the two bins of each interval
- "mean": mean value of data points comprised in a given interval
- "median": median value of data points comprised in a given interval
"""
[docs]
def to_dict(self) -> dict[str, str | float | list[float] | bool]:
"""Serialize the ContinuousGroup object into a dictionary that can then be used
as kwargs for :func:`.cut_group`
Returns:
Dictionary containing parameters to be read by :func:`.cut_group`
"""
return {
"group_name": self.name,
"bins": self.bins,
"log": self.log,
"qcut": self.qcut,
"label_type": self.label_type,
}
group = str | ContinuousGroup
"""
Type alias to define a group
Group is either
- the name of a column (for discret groups, such as ``category_id``)
- a :class:`.ContinuousGroup` object to divide continuous data into a given number
of groups, similar to histograms.
these parameters will be used for the function :func:`lours.util.grouper.cut_group`
Examples:
Discret group::
"size"
Continuous group::
continuousGroup(name="size", bins=10, log=False, qcut=True)
Continuous group with bins::
continuousGroup(name="size", bins=[0, 10, 20, 30], log=False, qcut=False)
"""
group_list = group | Sequence[group]
"""
Group list is either a group or an iterable of groups
"""
[docs]
def cut_group(
data: pd.Series | pd.DataFrame,
group_name: str | None = None,
bins: int | Iterable[float] = 10,
label_type: str = "intervals",
log: bool = False,
qcut: bool = False,
) -> pd.Series:
"""Cut a dataframe according to one of its column values and criteria
See :func:`pandas.cut`, :func:`pandas.qcut`
Args:
data: Dataframe to extract the column name from
group_name: name of the column to extract
bins: parameter used by both :func:`pandas.cut`, :func:`pandas.qcut`. Namely,
it can be an int to describe the number of bins, or a list of floats, to
either describe the actual bin edges for :func:`pandas.cut` or the quantile
edges for :func:`pandas.qcut`
label_type: what type of label to give to each group given by the cutting
function.
Can be either:
- "intervals" (default): :class:`pandas.Interval` object usually given as
Series values by :func:`pandas.cut` and :func:`pandas.qcut`
- "mid": mid-point between the two bins of each interval
- "mean": mean value of data points comprised in a given interval
- "median": median value of data points comprised in a given interval
log: Whether to use logarithmic scale or not, when bins is an integer.
Useful when the values are not uniformly distributed. Defaults to False.
qcut: Whether to use :func:`pandas.qcut` instead of :func:`pandas.cut`.
See corresponding documentation for the differences.
TL;DR, :func:`pandas.qcut` is based on quantiles (same number of occurrences
in each bin) while :func:`pandas.cut` is based on values (same interval
length for each bin). Defaults to False.
Raises:
ValueError: Raises an error when log option is selected but the extracted column
has negative values
Returns:
Series with the same length as data, describing a mapping from id to bin. Bin
labels are Interval Indices describing the upper and lower bound.
See :class:`pandas.IntervalIndex`
"""
if isinstance(data, pd.DataFrame):
assert group_name is not None
to_cut = data[group_name]
else:
to_cut = data
if group_name is not None:
to_cut.name = group_name
cut_function = pd.qcut if qcut else pd.cut
if (not log) or (not isinstance(bins, int)):
if isinstance(bins, Iterable):
bins = [*bins]
result = cut_function(to_cut, bins)
else:
if to_cut.min() < 0:
raise ValueError("Cannot use log on negative values")
log_to_cut = np.log(to_cut)
log_cut = cut_function(log_to_cut, bins)
assert isinstance(log_cut, pd.Series)
# Change labels to match actual values and not log ones
def exp_labels(x: pd.Interval) -> pd.Interval:
return pd.Interval(np.exp(x.left), np.exp(x.right))
normal_labels = log_cut.cat.categories.map(exp_labels)
result = log_cut.cat.rename_categories(normal_labels)
if label_type == "intervals":
return result
elif label_type == "mid":
return result.apply(lambda x: x.mid)
elif label_type == "mean":
means = to_cut.groupby(result, observed=False).mean()
return result.apply(lambda x: means.loc[x])
elif label_type == "median":
means = to_cut.groupby(result, observed=False).median()
return result.apply(lambda x: means.loc[x])
raise ValueError("invalid label_type")
@overload
def make_pandas_compatible(
data: pd.DataFrame, g: str
) -> tuple[str, str, Literal[True]]:
pass
@overload
def make_pandas_compatible(
data: pd.DataFrame,
g: ContinuousGroup,
root_data: pd.DataFrame | None = None,
key_to_root: str = "image_id",
) -> tuple[str, pd.Series, Literal[False]]:
pass
@overload
def make_pandas_compatible(
data: pd.DataFrame,
g: group,
root_data: pd.DataFrame | None = None,
key_to_root: str = "image_id",
) -> tuple[str, str | pd.Series, bool]:
pass
[docs]
def make_pandas_compatible(
data: pd.DataFrame,
g: group,
root_data: pd.DataFrame | None = None,
key_to_root: str = "image_id",
) -> tuple[str, str | pd.Series, bool]:
"""Construct group from :obj:`group` that will be used for pandas'
`groupby <https://pandas.pydata.org/docs/reference/groupby.html>`__ method.
- In the case it's only a name, keep it like that
- Otherwise, we need to construct an index of data cut according to the given
bins. This will create a :class:`pandas.Series` with categorical data
Args:
data: input DataFrame, must contain the column considered in group ``g``
g: group depicting a column from ``data`` with potential bins.
See :obj:`group`
root_data: Potential root data where some ids in ``data`` refer to a particular.
columns in ``root_data``. Defaults to None.
key_to_root: column containing ``root_data`` row ids. Defaults to "image_id".
Returns:
Tuple with the 3 following values:
1. group name
2. group that can be understood by pandas'
`groupby <https://pandas.pydata.org/docs/reference/groupby.html>`__ method.
Can be a simple string referring to a column, or a :class:`pandas.Series`
with categorical data
3. boolean indicating whether the group is categorical
(on which different values are independent of each other)
or continuous (on which different values represent ranges of a continuous
value, constructing a discretized histogram)
"""
def construct_column_group_from_root(
root_dataframe: pd.DataFrame, input_group_name: str
) -> pd.Series:
column = root_dataframe.loc[data[key_to_root], input_group_name]
column.index = data.index
return column
is_category = True
if isinstance(g, str):
group_name = g
if group_name in data.columns:
group = g
else:
assert root_data is not None
assert group_name in root_data.columns
group = construct_column_group_from_root(root_data, g).astype("category")
else:
group_name = g.name
if group_name in data.columns:
group = cut_group(data, **g.to_dict()) # pyright: ignore
else:
assert root_data is not None
assert group_name in root_data.columns
group = cut_group(
construct_column_group_from_root(root_data, group_name),
**g.to_dict(), # pyright: ignore
)
is_category = False
return group_name, group, is_category
[docs]
def get_group_names(groups: group_list) -> list[str]:
"""From a list of groups, get the list of associated names.
Args:
groups: single group lor Sequence of groups to extract the names from.
Returns:
Names of given groups.
"""
return [g if isinstance(g, str) else g.name for g in groups_to_list(groups)]
[docs]
def groups_to_list(groups: group_list) -> list[group]:
"""Convert a single group or Sequence of groups to a list of groups
(possibly with one element)
Args:
groups: Sequence of groups or single groups to convert
Returns:
Actual list of groups, more easily handled by other functions.
"""
if isinstance(groups, str | ContinuousGroup):
return [groups]
else:
return list(groups)
[docs]
def group_relational_data(
input_data: pd.DataFrame,
groups: group_list,
root_data: pd.DataFrame | None = None,
key_to_root: str = "image_id",
) -> tuple[dict[str, str | pd.Series], list[str], list[str]]:
"""Create groups that will be applied on ``input_data`` with the
:meth:`pandas.DataFrame.groupby` method. can be used with a ``root_data`` relational
DataFrame containing values that we might want to group, provided ``input_data``
contains a column with reference to a row in ``root_data``.
Args:
input_data: DataFrame to group
groups: groups to apply to ``input_data`` or `root_data``.
Can be a simple string in the case of categorical data, or a dictionary.
See :obj:`.group`.
root_data: DataFrame containing information ``input_data`` may refer to.
Defaults to None.
key_to_root: column name in ``input_data`` for the key to ``root_data``.
Defaults to "image_id".
Returns:
3 different objects are returned:
1. A dictionary with the created groups and their name as a key. The groups can
be directly used in a `input_data.groupby` call
2. A list of all category groups, where different values are independent
from each other
3. A list of all continuous groups, on which different values represent ranges
of a continuous value, constructing a discretized histogram
Note that the two list together should be as long as the group dictionary, and
their elements must refer to all the actual keys of the dictionary.
"""
groups = groups_to_list(groups)
groups_dict: dict[str, str | pd.Series] = {}
category_groups: list[str] = []
continuous_groups: list[str] = []
for g in groups:
name, group, is_category = make_pandas_compatible(
input_data, g, root_data, key_to_root
)
if is_category:
category_groups.append(name)
else:
assert isinstance(group, pd.Series)
continuous_groups.append(name)
groups_dict[name] = group
return groups_dict, category_groups, continuous_groups