from collections.abc import Iterable
from functools import partial
import pandas as pd
[docs]
def booleanize(
input_df: pd.DataFrame,
column_names: str | Iterable[str] | None = None,
separator: str = ".",
**possible_values: set | None,
) -> pd.DataFrame:
"""Convert given column in input DataFrame from lists to boolean
This is mainly used when a particular attribute can have multiple possible
values at once.
Every possible value given will be tested to see if it's inside every row's list
which will give a boolean column.
In the end, the column will be dropped and N new boolean columns will be created
with the name in form ``{column_name}{separator}{value}``
Args:
input_df: DataFrame on which performing the booleanization. The operation is not
inplace.
column_names: columns to convert. After conversion, it will be dropped
from input DataFrame. Can be either a single string or a list of strings.
separator: character used to separate original column and value. Defaults to '.'
**possible_values: kwargs for sets of possible values. Each key in this
dictionary must match a column name. If the corresponding value is None,
will deduce it from all occurrences in lists of column given by key.
Defaults to None.
Raises:
KeyError: The given ``column_name`` must be in the columns of ``input_df``
TypeError: When for a particular column possible values need to be deduced,
the column must have value that are all iterable except strings.
Returns:
New dataset with multiple boolean columns in the form
``{column_name}{separator}{value}``.
"""
def is_true(iterable, value) -> bool:
try:
return value in iterable
except TypeError:
return value == iterable
if column_names is None:
column_names = set()
if isinstance(column_names, str):
column_names = {column_names}
else:
column_names = {*column_names}
if possible_values:
column_names = set(column_names).union(possible_values.keys())
elif not column_names:
# Nothing to booleanize, return immediately
return input_df
for column_name in column_names:
if input_df[column_name].dropna().apply(lambda x: isinstance(x, str)).any():
raise TypeError(
f"Column {column_names} cannot contain a single string, use lists"
" instead"
)
enum = possible_values.get(column_name, None)
if enum is None:
enum = set().union(*input_df[column_name].dropna().to_list())
column_name_index = input_df.columns.get_loc(column_name)
before_columns = input_df.columns[:column_name_index]
after_columns = input_df.columns[column_name_index + 1 :]
new_columns = []
for v in enum:
booleanized_column_name = f"{column_name}{separator}{v}"
input_df = input_df.assign(
**{
booleanized_column_name: input_df[column_name].apply(
partial(is_true, value=v)
)
}
)
new_columns.append(booleanized_column_name)
input_df = input_df[
[
*before_columns,
*new_columns,
*after_columns,
]
]
return input_df
[docs]
def broadcast_booleanization(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    booleanized_columns1: Iterable[str] = (),
    booleanized_columns2: Iterable[str] = (),
    ignore_index: bool = False,
    separator: str = ".",
) -> tuple[pd.DataFrame, pd.DataFrame, set[str]]:
    """Broadcast two dataframes so that they have the same booleanized columns.

    Booleanized columns from ``df1`` that are not present in ``df2`` will be
    created and set to False and vice versa.

    Note: if ``ignore_index`` is set to False, the overlapping ids will be set
    to the value in the other dataframe instead of just False

    Args:
        df1: first dataframe to broadcast
        df2: second dataframe to broadcast
        booleanized_columns1: Columns in ``df1`` that are booleanized. A single
            string is treated as one column name. Defaults to ().
        booleanized_columns2: Columns in ``df2`` that are booleanized. A single
            string is treated as one column name. Defaults to ().
        ignore_index: if set to True, will create boolean columns full of False
            regardless of index overlap between the two dataframes.
            If set to False, tries to retrieve boolean value in one dataframe
            from the other when creating the column. Defaults to False.
        separator: Character used to separate column prefix and value.
            Defaults to ".".

    Raises:
        ValueError: Propagated from :func:`get_bool_columns` when a column
            carrying one of the prefixes is not of boolean dtype.

    Returns:
        tuple containing updated dataframes ``df1`` and ``df2`` with the same
        booleanized columns, and the set of all booleanized column prefixes.
    """
    if isinstance(booleanized_columns1, str):
        booleanized_columns1 = [booleanized_columns1]
    if isinstance(booleanized_columns2, str):
        booleanized_columns2 = [booleanized_columns2]
    # Materialise both arguments once: a one-shot iterable (e.g. a generator)
    # would otherwise be exhausted before the membership tests below.
    prefixes1 = set(booleanized_columns1)
    prefixes2 = set(booleanized_columns2)
    booleanized_columns = prefixes1 | prefixes2

    for column in booleanized_columns:
        # A column booleanized on one side only may still exist in its raw
        # (list) form on the other side: booleanize it there first.
        if column not in prefixes2 and column in df2.columns:
            df2 = booleanize(df2, column, separator=separator)
        if column not in prefixes1 and column in df1.columns:
            df1 = booleanize(df1, column, separator=separator)

        bool_columns1 = get_bool_columns(df1, column, separator)
        bool_columns2 = get_bool_columns(df2, column, separator)

        # Follow the other frame's column order so the added columns come out
        # in a reproducible order.
        missing_in_df1 = [name for name in bool_columns2 if name not in df1.columns]
        missing_in_df2 = [name for name in bool_columns1 if name not in df2.columns]

        if missing_in_df1:
            df1 = df1.assign(
                **{
                    name: (
                        False
                        if ignore_index
                        else df2[name].reindex(df1.index, fill_value=False)
                    )
                    for name in missing_in_df1
                }
            )
        if missing_in_df2:
            df2 = df2.assign(
                **{
                    name: (
                        False
                        if ignore_index
                        else df1[name].reindex(df2.index, fill_value=False)
                    )
                    for name in missing_in_df2
                }
            )
    return df1, df2, booleanized_columns
[docs]
def get_bool_columns(
    input_df: pd.DataFrame, column_prefix: str, separator: str = "."
) -> list[str]:
    """Return the names of the columns starting with ``{column_prefix}{separator}``.

    Only string column labels are considered, and the order of ``input_df``'s
    columns is kept. This is used in e.g. :func:`.debooleanize`

    Args:
        input_df: DataFrame to get the columns from
        column_prefix: Name of column prefix to retrieve boolean columns.
        separator: Character used to separate column prefix and value.
            Defaults to ".".

    Raises:
        ValueError: Raised when a column following the pattern is neither of
            ``bool`` dtype nor of pandas' nullable boolean dtype.

    Returns:
        List of columns that follow the pattern.
    """
    prefix_with_separator = f"{column_prefix}{separator}"

    matching = []
    for label in input_df.columns:
        # Non-string labels cannot carry the textual prefix.
        if isinstance(label, str) and label.startswith(prefix_with_separator):
            matching.append(label)

    accepted_dtypes = [bool, pd.BooleanDtype()]
    column_dtypes = input_df[matching].dtypes
    all_boolean = all(dtype in accepted_dtypes for dtype in column_dtypes)
    if not all_boolean:
        raise ValueError(
            f"Expected bool type for columns starting with {column_prefix}, but got"
            f" the following dtypes : {column_dtypes}."
        )
    return matching
[docs]
def debooleanize(
    input_df: pd.DataFrame,
    column_prefixes: str | Iterable[str],
    separator: str = ".",
) -> pd.DataFrame:
    """Inverse operation of :func:`.booleanize`.

    Take all columns that start with ``{column_prefix}{separator}`` and convert
    them into a single column of list values: each row's list holds the
    suffixes of the columns that are True for that row.

    Note:
        The column order will be preserved, the debooleanized column will be
        inserted at the same spot the multiple booleanized columns were.
        Prefixes for which no column is found are skipped.

    Args:
        input_df: Input DataFrame we will take the columns from.
        column_prefixes: Name of column prefix (or prefixes) to retrieve boolean
            columns. Also, the name of resulting column (or columns)
        separator: Character used to separate column prefix and value.
            Defaults to ".".

    Raises:
        ValueError: Raised by :func:`get_bool_columns` when a column with a
            given prefix is not of boolean dtype.

    Returns:
        pd.DataFrame: Resulting DataFrame, with all boolean columns whose name
        corresponds to a prefix dropped and a single column of lists added per
        prefix.
    """
    if isinstance(column_prefixes, str):
        column_prefixes = [column_prefixes]

    for column_prefix in column_prefixes:
        full_prefix = f"{column_prefix}{separator}"
        columns = get_bool_columns(input_df, column_prefix, separator)
        if not columns:
            continue

        first_column_pos = input_df.columns.get_loc(columns[0])
        # Strip only the leading prefix: ``str.replace`` would also remove
        # later occurrences of it inside the value part of the name.
        values_df = input_df[columns].rename(
            columns={name: name.removeprefix(full_prefix) for name in columns}
        )
        # Boolean-mask each row by itself to keep the labels that are True.
        single_column = values_df.apply(lambda row: row[row].index.tolist(), axis=1)

        input_df = input_df.drop(columns=columns).assign(
            **{column_prefix: single_column}
        )
        # Now reorder columns so that the newly created column (currently
        # last) is where the booleanized ones were
        input_df = input_df[
            [
                *input_df.columns[:first_column_pos],
                column_prefix,
                *input_df.columns[first_column_pos:-1],
            ]
        ]
    return input_df