"""Consumes raw data and transforms it into a CSV that `LyProX`_ understands.
To do so, it needs a dictionary that defines a mapping from raw columns to the LyProX
style data format. See the documentation of the :py:func:`.transform_to_lyprox` function
for more information.
.. _LyProX: https://lyprox.org
"""
import importlib.util
import warnings
from pathlib import Path
from typing import Annotated, Any
import lydata # noqa: F401
import pandas as pd
from loguru import logger
from lydata import C
from pydantic import AfterValidator, Field, FilePath
from lyscripts.cli import assemble_main
from lyscripts.configs import BaseCLI
from lyscripts.data.utils import save_table_to_csv
from lyscripts.utils import delete_private_keys, flatten, load_patient_data
# Ignore every ``FutureWarning`` from here on. The filter is installed globally
# as a side effect of importing this module, not only for code in this file.
warnings.simplefilter(action="ignore", category=FutureWarning)
[docs]
def ensure_python_file(file: Path) -> Path:
    """Validate that ``file`` carries the ``.py`` extension.

    Only the suffix of the path is inspected (case-sensitively); the file is
    neither opened nor executed. The path is returned unchanged so the function
    can be used as a pydantic ``AfterValidator``.

    Raises:
        ValueError: If the suffix of ``file`` is anything other than ``.py``.
    """
    if file.suffix == ".py":
        return file

    raise ValueError("Mapping file must be a Python file.")
[docs]
def ensure_column_map(file: Path) -> Path:
    """Ensure the Python file contains a ``COLUMN_MAP`` dictionary.

    The file is imported as a throw-away module named ``map_module``, which
    means **its top-level code is executed**. Only point this at mapping files
    you trust. Only the presence of a ``COLUMN_MAP`` attribute is checked, not
    its type. The path is returned unchanged so the function can be used as a
    pydantic ``AfterValidator``.

    Raises:
        ValueError: If ``file`` cannot be loaded as a Python module or if the
            loaded module does not define ``COLUMN_MAP``.
    """
    spec = importlib.util.spec_from_file_location("map_module", file)

    # `spec_from_file_location` returns `None` when no loader recognises the
    # file's suffix. Report that as a validation error rather than crashing
    # with an `AttributeError` further down.
    if spec is None or spec.loader is None:
        raise ValueError(f"Could not load mapping file '{file}' as a Python module.")

    mapping = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mapping)

    if not hasattr(mapping, "COLUMN_MAP"):
        raise ValueError("Mapping file must contain a `COLUMN_MAP` dictionary.")

    return file
[docs]
class LyproxifyCLI(BaseCLI):
    """Map any CSV file to the LyProX format with the help of a Python mapping dict."""

    # Raw table to convert; `FilePath` requires the file to exist.
    input_file: FilePath = Field(description="Location of raw CSV data.")
    # The first `num_header_rows` rows are passed on as header rows when loading.
    num_header_rows: int = Field(
        default=1,
        description="Number of rows comprising the header of the raw CSV file.",
    )
    # Must end in `.py` and define `COLUMN_MAP`. Note that `ensure_column_map`
    # executes the file to check this.
    mapping_file: Annotated[
        FilePath,
        AfterValidator(ensure_python_file),
        AfterValidator(ensure_column_map),
    ] = Field(
        description=(
            "Location of Python file containing a `COLUMN_MAP` dictionary. It may also "
            "contain an `EXCLUDE` list of tuples `(column, check)` to exclude patients."
        ),
    )
    # Row labels handed to `DataFrame.drop(index=...)` after loading.
    drop_rows: list[int] = Field(
        default=[],
        description=(
            "Delete rows of specified indices. Counting of rows start at 0 _after_ "
            "the `header-rows`."
        ),
    )
    # Positional column indices, resolved against the cleaned header.
    drop_cols: list[int] = Field(
        default=[],
        description="Delete columns of specified indices.",
    )
    output_file: Path = Field(description="Location to store the lyproxified CSV file.")

    def cli_cmd(self) -> None:
        """Start the ``lyproxify`` subcommand.

        After reading in the specified file, it will first ``drop_cols`` and
        ``drop_rows``, as specified in the command line arguments, and discard
        rows that are entirely empty. Then, it will call
        :py:func:`.exclude_patients` which will further remove patients based
        on the ``EXCLUDE`` object in the ``mapping_file``. ``EXCLUDE`` is
        optional; when the mapping file does not define it, no patients are
        excluded. Finally, it will call :py:func:`.transform_to_lyprox` to
        transform the data into the LyProX format given the ``COLUMN_MAP``
        object in the ``mapping_file``. If the result has a ``side`` column,
        left/right reporting is converted to ipsi-/contralateral before the
        table is written to ``output_file``.
        """
        logger.debug(self.model_dump_json(indent=2))

        raw = load_patient_data(
            file_path=self.input_file,
            header=list(range(self.num_header_rows)),
        )
        raw = clean_header(
            table=raw,
            num_cols=raw.shape[1],
            num_header_rows=self.num_header_rows,
        )

        # Translate positional column indices into labels before dropping.
        cols_to_drop = raw.columns[self.drop_cols]
        trimmed = raw.drop(cols_to_drop, axis="columns")
        trimmed = trimmed.drop(index=self.drop_rows)
        trimmed = trimmed.dropna(axis="index", how="all")
        logger.info(f"Dropped rows {self.drop_rows} and columns {cols_to_drop}.")

        # Execute the mapping file to get at its module-level objects.
        spec = importlib.util.spec_from_file_location("map_module", self.mapping_file)
        mapping = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mapping)
        logger.info(f"Imported mapping instructions from {self.mapping_file}")

        # `EXCLUDE` is documented as optional, so fall back to "exclude nothing"
        # instead of failing with an `AttributeError` when it is missing.
        exclude = getattr(mapping, "EXCLUDE", [])
        reduced = exclude_patients(trimmed, exclude)
        processed = transform_to_lyprox(reduced, mapping.COLUMN_MAP)

        if "side" in processed.ly:
            processed = leftright_to_ipsicontra(processed)

        save_table_to_csv(file_path=self.output_file, table=processed)
[docs]
class ParsingError(Exception):
    """Signal that the CSV file could not be parsed.

    Plain :py:class:`Exception` subclass without extra attributes or behavior.
    """
[docs]
def get_instruction_depth(nested_column_map: dict[tuple, dict[str, Any]]) -> int:
    """Return how many levels deep the column mapping instructions are nested.

    An instruction is a dictionary containing a ``'func'`` or a ``'default'``
    key. Only the *first* entry of every level is followed on the way down, so
    the map is expected to be nested uniformly.

    >>> nested_column_map = {"patient": {"age": {"func": int}}}
    >>> get_instruction_depth(nested_column_map)
    2
    >>> flat_column_map = flatten(nested_column_map, max_depth=2)
    >>> get_instruction_depth(flat_column_map)
    1
    >>> nested_column_map = {"patient": {"__doc__": "some patient info", "age": 61}}
    >>> get_instruction_depth(nested_column_map)
    Traceback (most recent call last):
        ...
    ValueError: Leaf of column map must be a dictionary with 'func' or 'default' key.

    Raises:
        ValueError: If a level is empty, or if the first entry of a level is
            not a dictionary.
    """
    depth = 1
    level = nested_column_map

    while True:
        if not level:
            raise ValueError("Empty column map.")

        # Only the first entry decides how the descent continues.
        first_entry = next(iter(level.values()))

        if not isinstance(first_entry, dict):
            raise ValueError(
                "Leaf of column map must be a dictionary with 'func' or 'default' key.",
            )

        if "func" in first_entry or "default" in first_entry:
            return depth

        depth += 1
        level = first_entry
[docs]
def generate_markdown_docs(
    nested_column_map: dict[tuple, dict[str, Any]],
    depth: int = 0,
    indent_len: int = 4,
) -> str:
    r"""Render the documented entries of a column map as a nested, ordered markdown list.

    A key of the dictionary counts as documented when its value is a dictionary
    that contains a ``"__doc__"`` key. Every documented key becomes one numbered
    list item. Values that are dictionaries are descended into with one more
    level of indentation (``indent_len`` spaces per level); all other values
    are skipped.

    >>> nested_column_map = {
    ...     "patient": {
    ...         "__doc__": "some patient info",
    ...         "age": {
    ...             "__doc__": "age of the patient",
    ...             "func": int,
    ...             "columns": ["age"],
    ...         },
    ...     },
    ... }
    >>> generate_markdown_docs(nested_column_map)
    '1. **`patient:`** some patient info\n    1. **`age:`** age of the patient\n'
    """
    prefix = " " * indent_len * depth
    pieces = []
    item_number = 0

    for name, entry in nested_column_map.items():
        if not isinstance(entry, dict):
            continue

        if "__doc__" in entry:
            item_number += 1
            pieces.append(f"{prefix}{item_number}. **`{name}:`** {entry['__doc__']}\n")

        # Children are rendered one level deeper, directly below their parent.
        pieces.append(generate_markdown_docs(entry, depth + 1, indent_len))

    return "".join(pieces)
[docs]
def leftright_to_ipsicontra(data: pd.DataFrame):
    """Convert side-absolute LNL involvement into tumor-relative involvement.

    Involvement reported for the absolute sides ``left`` and ``right`` is
    relabelled as ``ipsi`` and ``contra`` on the second column level. Except for
    this side reporting, ``data`` should already be in the format LyProX
    requires.

    Patients whose ``side`` is ``"right"`` get ``right -> ipsi`` and
    ``left -> contra``; all other patients are handled as left-sided. The
    returned table lists the non-right patients first, then the right-sided
    ones, with a fresh index.

    Raises:
        RuntimeError: If the number of rows after recombining differs from the
            number of rows in ``data``.
    """
    expected_count = len(data)

    # One rename mapping per group; both labels of a group are swapped at once.
    relabelled = [
        data.ly.query(C("side") != "right").rename(
            columns={"left": "ipsi", "right": "contra"},
            level=1,
        ),
        data.ly.query(C("side") == "right").rename(
            columns={"left": "contra", "right": "ipsi"},
            level=1,
        ),
    ]
    combined = pd.concat(relabelled, ignore_index=True)

    if len(combined) != expected_count:
        raise RuntimeError("Number of patients changed")

    logger.info("Transformed side reporting to ipsi- and contralateral.")
    return combined
[docs]
def exclude_patients(raw: pd.DataFrame, exclude: list[tuple[str, Any]]):
    """Drop patients from the ``raw`` data according to the ``exclude`` rules.

    Every rule in ``exclude`` is a tuple ``(column, check)``. The rules are
    applied one after another: ``check`` receives the given column of the
    patients that are still left and every patient for which it evaluates to
    ``True`` is removed. ``raw`` itself is not modified. The number of removed
    patients is logged.

    >>> exclude = [("age", lambda s: s > 50)]
    >>> table = pd.DataFrame({
    ...     "age":        [43, 82, 18, 67],
    ...     "T-category": [ 3,  4,  2,  1],
    ... })
    >>> exclude_patients(table, exclude)
       age  T-category
    0   43           3
    2   18           2
    """
    remaining = raw.copy()

    for column, check in exclude:
        to_drop = check(remaining[column])
        remaining = remaining.loc[~to_drop]

    logger.info(f"Excluded {len(raw) - len(remaining)} patients.")
    return remaining
if __name__ == "__main__":
    # Build the ``lyproxify`` entry point around the settings class and run it.
    run_cli = assemble_main(settings_cls=LyproxifyCLI, prog_name="lyproxify")
    run_cli()