Source code for lours.dataset.remap_presets

"""Registry for known useful preset.

to add a preset, add a new entry in remap_presets
value should be a tuple of two dictionaries,
with a ``old_id -> new_id`` mapping and a ``new_id -> new_name`` mapping
"""

from importlib.resources import files

import pandas as pd

__all__ = ["presets", "list_available_presets"]

presets = {}
presets_folder = files("lours") / "dataset" / "remap_presets"


for p in presets_folder.iterdir():
    if p.is_file() and p.name.endswith(".csv"):
        try:
            input_dataset, output_dataset = p.name.removesuffix(".csv").split("_to_")
        except ValueError as e:
            raise NameError(
                "Badly named csv preset file. Should be in the form "
                f"'<dataset1>_to_<dataset2>.csv', but got {p.name} instead."
            ) from e
        preset_df = pd.read_csv(p.open())
        indexed_preset = preset_df.set_index("input_category_id")
        preset_dict = indexed_preset["output_category_id"].to_dict()
        preset_names = (
            indexed_preset.groupby("output_category_id")["output_category_name"]
            .first()
            .to_dict()
        )
        presets[(input_dataset, output_dataset)] = (preset_dict, preset_names)
        is_invertible = preset_df["output_category_id"].is_unique
        if is_invertible and (output_dataset, input_dataset) not in presets:
            inverted_preset = preset_df.set_index("output_category_id")
            inverted_preset_dict = inverted_preset["input_category_id"].to_dict()
            inverted_preset_names = (
                inverted_preset.groupby("input_category_id")["input_category_name"]
                .first()
                .to_dict()
            )
            presets[(output_dataset, input_dataset)] = (
                inverted_preset_dict,
                inverted_preset_names,
            )


[docs] def list_available_presets(): mapping_names = [f"{in_map}\t->\t{out_map}" for in_map, out_map in presets] return "\n".join(mapping_names)