Split Data#

Split a dataset into cross-validation folds as specified in a YAML configuration file (`config.yaml` by default).

pydantic settings lyscripts.data.split.SplitCLI[source]#

Bases: BaseCLI

Split a dataset into cross-validation folds.

Show JSON schema
{
   "title": "SplitCLI",
   "description": "Split a dataset into cross-validation folds.",
   "type": "object",
   "properties": {
      "configs": {
         "default": [
            "config.yaml"
         ],
         "description": "Path to the YAML file(s) that contain the configuration(s). Configs from YAML files may be overwritten by command line arguments. When multiple files are specified, the configs are merged in the order they are given. Note that every config file must have a `version: 1` key in it.",
         "items": {
            "format": "path",
            "type": "string"
         },
         "title": "Configs",
         "type": "array"
      },
      "input": {
         "$ref": "#/$defs/DataConfig"
      },
      "cross_validation": {
         "$ref": "#/$defs/CrossValidationConfig",
         "default": {
            "seed": 42,
            "folds": 5
         }
      },
      "output_dir": {
         "description": "The folder to store the split CSV files in.",
         "format": "path",
         "title": "Output Dir",
         "type": "string"
      }
   },
   "$defs": {
      "CrossValidationConfig": {
         "description": "Configs for splitting a dataset into cross-validation folds.",
         "properties": {
            "seed": {
               "default": 42,
               "description": "Seed for the random number generator.",
               "title": "Seed",
               "type": "integer"
            },
            "folds": {
               "default": 5,
               "description": "Number of folds to split the dataset into.",
               "title": "Folds",
               "type": "integer"
            }
         },
         "title": "CrossValidationConfig",
         "type": "object"
      },
      "DataConfig": {
         "description": "Where to load lymphatic progression data from and how to feed it into a model.",
         "properties": {
            "source": {
               "anyOf": [
                  {
                     "format": "file-path",
                     "type": "string"
                  },
                  {
                     "$ref": "#/$defs/LyDataset"
                  }
               ],
               "description": "Either a path to a CSV file or a config that specifies how and where to fetch the data from.",
               "title": "Source"
            },
            "side": {
               "anyOf": [
                  {
                     "enum": [
                        "ipsi",
                        "contra"
                     ],
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "description": "Side of the neck to load data for. Only for Unilateral models.",
               "title": "Side"
            },
            "mapping": {
               "additionalProperties": {
                  "anyOf": [
                     {
                        "type": "integer"
                     },
                     {
                        "type": "string"
                     }
                  ]
               },
               "description": "Optional mapping of numeric T-stages to model T-stages.",
               "title": "Mapping",
               "type": "object"
            }
         },
         "required": [
            "source"
         ],
         "title": "DataConfig",
         "type": "object"
      },
      "LyDataset": {
         "description": "Specification of a dataset.",
         "properties": {
            "year": {
               "description": "Release year of dataset.",
               "exclusiveMinimum": 0,
               "maximum": 2026,
               "title": "Year",
               "type": "integer"
            },
            "institution": {
               "description": "Institution's short code. E.g., University Hospital Zurich: `usz`.",
               "minLength": 1,
               "title": "Institution",
               "type": "string"
            },
            "subsite": {
               "description": "Tumor subsite(s) patients in this dataset were diagnosed with.",
               "minLength": 1,
               "title": "Subsite",
               "type": "string"
            },
            "repo_name": {
               "anyOf": [
                  {
                     "minLength": 1,
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": "lycosystem/lydata",
               "description": "GitHub `owner/repository`.",
               "title": "Repo Name"
            },
            "ref": {
               "anyOf": [
                  {
                     "minLength": 1,
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": "main",
               "description": "Branch/tag/commit of the repo.",
               "title": "Ref"
            },
            "local_dataset_dir": {
               "anyOf": [
                  {
                     "format": "directory-path",
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "description": "Path to directory containing all the dataset subdirectories. So, e.g. if `local_dataset_dir` is `~/datasets` and the dataset is `2023-clb-multisite`, then the CSV file is expected to be at `~/datasets/2023-clb-multisite/data.csv`.",
               "title": "Local Dataset Dir"
            }
         },
         "required": [
            "year",
            "institution",
            "subsite"
         ],
         "title": "LyDataset",
         "type": "object"
      }
   },
   "required": [
      "input",
      "output_dir"
   ]
}

field input: DataConfig [Required]#
field cross_validation: CrossValidationConfig = CrossValidationConfig(seed=42, folds=5)#
field output_dir: Path [Required]#

The folder to store the split CSV files in.

cli_cmd() → None[source]#

Run the split subcommand.

This will load the dataset specified in the input argument and split it into the number of folds specified in the cross_validation argument. The resulting splits will be stored in the folder specified in the output_dir argument.

Command Help#

Usage: lyscripts data split [-h] [--configs list[Path]] [--input [JSON]]
                            [--input.source [{Path,JSON}]]
                            [--input.source.year int]
                            [--input.source.institution str]
                            [--input.source.subsite str]
                            [--input.source.repo-name {str,null}]
                            [--input.source.ref {str,null}]
                            [--input.source.local-dataset-dir {Path,null}]
                            [--input.side {{ipsi,contra},null}]
                            [--input.mapping dict[{{0,1,2,3,4},str},{int,str}]]
                            [--cross-validation [JSON]]
                            [--cross-validation.seed int]
                            [--cross-validation.folds int] [--output-dir Path]

Split a dataset into cross-validation folds.

Options:
  -h, --help            show this help message and exit
  --configs list[Path]  Path to the YAML file(s) that contain the
                        configuration(s). Configs from YAML files may be
                        overwritten by command line arguments. When multiple
                        files are specified, the configs are merged in the
                        order they are given. Note that every config file must
                        have a `version: 1` key in it. (default:
                        ['config.yaml'])
  --output-dir Path     The folder to store the split CSV files in. (required)

Input Options:
  Where to load lymphatic progression data from and how to feed it into a
  model.

  --input [JSON]        set input from JSON string (default: {})
  --input.side {{ipsi,contra},null}
                        Side of the neck to load data for. Only for Unilateral
                        models. (default: null)
  --input.mapping dict[{{0,1,2,3,4},str},{int,str}]
                        Optional mapping of numeric T-stages to model
                        T-stages. (default factory: DataConfig.<lambda>)

Input.Source Options:
  Specification of a dataset.

  --input.source [{Path,JSON}]
                        set input.source from JSON string (default: {})
  --input.source.year int
                        Release year of dataset. (required)
  --input.source.institution str
                        Institution's short code. E.g., University Hospital
                        Zurich: `usz`. (required)
  --input.source.subsite str
                        Tumor subsite(s) patients in this dataset were
                        diagnosed with. (required)
  --input.source.repo-name {str,null}
                        GitHub `owner/repository`. (default:
                        lycosystem/lydata)
  --input.source.ref {str,null}
                        Branch/tag/commit of the repo. (default: main)
  --input.source.local-dataset-dir {Path,null}
                        Path to directory containing all the dataset
                        subdirectories. So, e.g. if `local_dataset_dir` is
                        `~/datasets` and the dataset is `2023-clb-multisite`,
                        then the CSV file is expected to be at
                        `~/datasets/2023-clb-multisite/data.csv`. (default:
                        null)

Cross-Validation Options:
  Configs for splitting a dataset into cross-validation folds.

  --cross-validation [JSON]
                        set cross-validation from JSON string (default: {})
  --cross-validation.seed int
                        Seed for the random number generator. (default: 42)
  --cross-validation.folds int
                        Number of folds to split the dataset into. (default:
                        5)