# Source code for lyscripts.data.join

"""Join multiple lymphatic progression datasets into a single dataset."""

from pathlib import Path

import pandas as pd
from lydata.validator import cast_dtypes
from pydantic import Field

from lyscripts.cli import assemble_main
from lyscripts.configs import BaseCLI, DataConfig
from lyscripts.data.utils import save_table_to_csv


class JoinCLI(BaseCLI):
    """Join multiple lymphatic progression datasets into a single dataset."""

    inputs: list[DataConfig] = Field(description="The datasets to join.")
    output_file: Path = Field(description="The path to the output dataset.")

    def cli_cmd(self) -> None:
        r"""Start the ``join`` subcommand.

        This will load all datasets specified in the ``inputs`` attribute and
        concatenate them into a single dataset, which is then written to
        ``output_file``.

        Unfortunately, the use of `pydantic`_ does make this particular command a
        little bit more complicated (but also more powerful): If one simply wants
        to concatenate multiple datasets on disk, the ``inputs`` should be
        provided like this:

        .. code-block:: bash

            lyscripts data join \
                --inputs '{"source": "file1.csv"}' \
                --inputs '{"source": "file2.csv"}' \
                --output-file "joined.csv"

        But it also allows for concatenating datasets fetched directly from the
        `lydata Github repo`_. Due to the rather complex command signature, we
        recommend defining what to concatenate using a YAML file:

        .. code-block:: yaml

            inputs:
              - data.year: 2021
                data.institution: "usz"
                data.subsite: "oropharynx"
              - data.year: 2021
                data.institution: "clb"
                data.subsite: "oropharynx"

        Then, the command will look like this:

        .. code-block:: bash

            lyscripts data join --configs datasets.ly.yaml --output-file joined.csv

        Raises:
            ValueError: If ``inputs`` is empty, because there is nothing to join
                and nothing meaningful to write to ``output_file``.

        .. _pydantic: https://docs.pydantic.dev/latest/
        .. _lydata Github repo: https://github.com/lycosystem/lydata
        """
        # Fail early with a clear message instead of handing `None` to the
        # CSV writer when no dataset was configured.
        if not self.inputs:
            raise ValueError("No input datasets were provided; nothing to join.")

        # `cast_dtypes()` ensures that e.g. boolean values are not suddenly
        # converted to strings when a dataset with missing values is concatenated.
        tables = [cast_dtypes(data_config.load()) for data_config in self.inputs]

        if len(tables) == 1:
            # A single dataset is passed through untouched (no index reset).
            joined = tables[0]
        else:
            # Concatenate once at the end rather than once per input, which
            # would re-copy the accumulated table in every iteration.
            joined = pd.concat(tables, axis="index", ignore_index=True)

        save_table_to_csv(file_path=self.output_file, table=joined)
if __name__ == "__main__":
    # Script entry point: build the callable for the ``join`` program from
    # ``JoinCLI`` and invoke it immediately.
    main = assemble_main(prog_name="join", settings_cls=JoinCLI)
    main()