# Source code for lyscripts.data.join

"""Join multiple lymphatic progression datasets into a single dataset."""

from pathlib import Path

import pandas as pd
from lydata.validator import cast_dtypes
from pydantic import Field

from lyscripts.cli import assemble_main
from lyscripts.configs import BaseCLI, DataConfig
from lyscripts.data.utils import save_table_to_csv



# [docs]
class JoinCLI(BaseCLI):
    """Join multiple lymphatic progression datasets into a single dataset."""

    inputs: list[DataConfig] = Field(description="The datasets to join.")
    output_file: Path = Field(description="The path to the output dataset.")

    def cli_cmd(self) -> None:
        r"""Start the ``join`` subcommand.

        This will load all datasets specified in the ``inputs`` attribute and
        concatenate them into a single dataset.

        Unfortunately, the use of `pydantic`_ does make this particular command a
        little bit more complicated (but also more powerful): If one simply wants to
        concatenate multiple datasets on disk, the ``inputs`` should be provided like
        this:

        .. code-block:: bash

            lyscripts data join \
            --inputs '{"source": "file1.csv"}' \
            --inputs '{"source": "file2.csv"}' \
            --output-file "joined.csv"

        But it also allows for concatenating datasets fetched directly from the
        `lydata Github repo`_. Due to the rather complex command signature, we
        recommend defining what to concatenate using a YAML file:

        .. code-block:: yaml

            inputs:
              - data.year: 2021
                data.institution: "usz"
                data.subsite: "oropharynx"
              - data.year: 2021
                data.institution: "clb"
                data.subsite: "oropharynx"

        Then, the command will look like this:

        .. code-block:: bash

            lyscripts data join --configs datasets.ly.yaml --output-file joined.csv

        At least one entry in ``inputs`` is required. If the list is empty, there is
        nothing to join and a ``ValueError`` is raised before anything is saved.

        .. _pydantic: https://docs.pydantic.dev/latest/
        .. _lydata Github repo: https://github.com/lycosystem/lydata
        """
        # `cast_dtypes()` ensures that e.g. boolean values are not suddenly
        # converted to strings when a dataset with missing values is concatenated.
        tables = [cast_dtypes(data_config.load()) for data_config in self.inputs]

        if not tables:
            # Fail with a clear message instead of handing `None` to the save helper.
            raise ValueError("No input datasets were provided; nothing to join.")

        if len(tables) == 1:
            # A lone dataset is passed through as loaded, i.e. without re-indexing.
            joined = tables[0]
        else:
            # One `concat` over all tables avoids re-copying the growing result
            # for every additional dataset.
            joined = pd.concat(tables, axis="index", ignore_index=True)

        save_table_to_csv(file_path=self.output_file, table=joined)




if __name__ == "__main__":
    # Running this module as a script: obtain the callable entry point for the
    # ``join`` program from `assemble_main()` and invoke it right away.
    main = assemble_main(prog_name="join", settings_cls=JoinCLI)
    main()