Generating Synthetic Data#

Script to generate a synthetic dataset.

The data is generated by the draw_patients() method of the lymph package, which is why a model must be specified via the ModelConfig class.

pydantic settings lyscripts.data.generate.GenerateCLI[source]#

Bases: BaseCLI

Settings for the command-line interface.

Show JSON schema
{
   "title": "GenerateCLI",
   "description": "Settings for the command-line interface.",
   "type": "object",
   "properties": {
      "configs": {
         "default": [
            "config.yaml"
         ],
         "description": "Path to the YAML file(s) that contain the configuration(s). Configs from YAML files may be overwritten by command line arguments. When multiple files are specified, the configs are merged in the order they are given. Note that every config file must have a `version: 1` key in it.",
         "items": {
            "format": "path",
            "type": "string"
         },
         "title": "Configs",
         "type": "array"
      },
      "graph": {
         "$ref": "#/$defs/GraphConfig"
      },
      "model": {
         "$ref": "#/$defs/ModelConfig",
         "default": {
            "external_file": null,
            "class_name": "Unilateral",
            "constructor": "binary",
            "max_time": 10,
            "named_params": null,
            "kwargs": {}
         }
      },
      "distributions": {
         "additionalProperties": {
            "$ref": "#/$defs/DistributionConfig"
         },
         "default": {},
         "description": "Mapping of model T-categories to predefined distributions over diagnose times.",
         "title": "Distributions",
         "type": "object"
      },
      "t_stages_dist": {
         "additionalProperties": {
            "type": "number"
         },
         "description": "Specify what fraction of generated patients should come from the respective T-Stage.",
         "title": "T Stages Dist",
         "type": "object"
      },
      "modalities": {
         "additionalProperties": {
            "$ref": "#/$defs/ModalityConfig"
         },
         "title": "Modalities",
         "type": "object"
      },
      "params": {
         "additionalProperties": {
            "type": "number"
         },
         "title": "Params",
         "type": "object"
      },
      "num_patients": {
         "default": 200,
         "title": "Num Patients",
         "type": "integer"
      },
      "output_file": {
         "title": "Output File",
         "type": "string"
      },
      "seed": {
         "default": 42,
         "title": "Seed",
         "type": "integer"
      }
   },
   "$defs": {
      "DistributionConfig": {
         "description": "Configuration defining a distribution over diagnose times.",
         "properties": {
            "kind": {
               "default": "frozen",
               "description": "Parametric distributions may be updated.",
               "enum": [
                  "frozen",
                  "parametric"
               ],
               "title": "Kind",
               "type": "string"
            },
            "func": {
               "const": "binomial",
               "default": "binomial",
               "description": "Name of predefined function to use as distribution.",
               "title": "Func",
               "type": "string"
            },
            "params": {
               "additionalProperties": {
                  "anyOf": [
                     {
                        "type": "integer"
                     },
                     {
                        "type": "number"
                     }
                  ]
               },
               "default": {},
               "description": "Parameters to pass to the predefined function.",
               "title": "Params",
               "type": "object"
            }
         },
         "title": "DistributionConfig",
         "type": "object"
      },
      "GraphConfig": {
         "description": "Specifies how the tumor(s) and LNLs are connected in a DAG.",
         "properties": {
            "tumor": {
               "additionalProperties": {
                  "items": {
                     "type": "string"
                  },
                  "type": "array"
               },
               "description": "Define the name of the tumor(s) and which LNLs it/they drain to.",
               "title": "Tumor",
               "type": "object"
            },
            "lnl": {
               "additionalProperties": {
                  "items": {
                     "type": "string"
                  },
                  "type": "array"
               },
               "description": "Define the name of the LNL(s) and which LNLs it/they drain to.",
               "title": "Lnl",
               "type": "object"
            }
         },
         "required": [
            "tumor",
            "lnl"
         ],
         "title": "GraphConfig",
         "type": "object"
      },
      "ModalityConfig": {
         "description": "Define a diagnostic or pathological modality.",
         "properties": {
            "spec": {
               "description": "Specificity of the modality.",
               "maximum": 1.0,
               "minimum": 0.5,
               "title": "Spec",
               "type": "number"
            },
            "sens": {
               "description": "Sensitivity of the modality.",
               "maximum": 1.0,
               "minimum": 0.5,
               "title": "Sens",
               "type": "number"
            },
            "kind": {
               "default": "clinical",
               "description": "Clinical modalities cannot detect microscopic disease.",
               "enum": [
                  "clinical",
                  "pathological"
               ],
               "title": "Kind",
               "type": "string"
            }
         },
         "required": [
            "spec",
            "sens"
         ],
         "title": "ModalityConfig",
         "type": "object"
      },
      "ModelConfig": {
         "description": "Define which of the ``lymph`` models to use and how to set them up.",
         "properties": {
            "external_file": {
               "anyOf": [
                  {
                     "format": "file-path",
                     "type": "string"
                  },
                  {
                     "type": "null"
                  }
               ],
               "default": null,
               "description": "Path to a Python file that defines a model.",
               "title": "External File"
            },
            "class_name": {
               "default": "Unilateral",
               "description": "Name of the model class to use.",
               "enum": [
                  "Unilateral",
                  "Bilateral",
                  "Midline"
               ],
               "title": "Class Name",
               "type": "string"
            },
            "constructor": {
               "default": "binary",
               "description": "Trinary models differentiate btw. micro- and macroscopic disease.",
               "enum": [
                  "binary",
                  "trinary"
               ],
               "title": "Constructor",
               "type": "string"
            },
            "max_time": {
               "default": 10,
               "description": "Max. number of time-steps to evolve the model over.",
               "title": "Max Time",
               "type": "integer"
            },
            "named_params": {
               "default": null,
               "description": "Subset of valid model parameters a sampler may provide in the form of a dictionary to the model instead of as an array. Or, after sampling, with this list, one may safely recover which parameter corresponds to which index in the sample.",
               "items": {
                  "type": "string"
               },
               "title": "Named Params",
               "type": "array"
            },
            "kwargs": {
               "additionalProperties": true,
               "default": {},
               "description": "Additional keyword arguments to pass to the model constructor.",
               "title": "Kwargs",
               "type": "object"
            }
         },
         "title": "ModelConfig",
         "type": "object"
      }
   },
   "required": [
      "graph",
      "t_stages_dist",
      "modalities",
      "params",
      "output_file"
   ]
}

field graph: GraphConfig [Required]#
field model: ModelConfig = ModelConfig(external_file=None, class_name='Unilateral', constructor='binary', max_time=10, named_params=None, kwargs={})#
field distributions: dict[str, DistributionConfig] = {}#

Mapping of model T-categories to predefined distributions over diagnose times.

field t_stages_dist: dict[str, float] [Required]#

Specify what fraction of generated patients should come from the respective T-Stage.

field modalities: dict[str, ModalityConfig] [Required]#
field params: dict[str, float] [Required]#
field num_patients: int = 200#
field output_file: str [Required]#
field seed: int = 42#
model_post_init(_GenerateCLI__context) → None[source]#

Make sure the distribution over T-stages is normalized.

cli_cmd() → None[source]#

Run the generate command.

Here, the command constructs a model from the settings provided via the arguments. It then generates a synthetic dataset using the draw_patients() method from the lymph package.

Command Help#

Usage: lyscripts data generate [-h] [--configs list[Path]] [--graph [JSON]]
                               [--graph.tumor dict[str,list[str]]]
                               [--graph.lnl dict[str,list[str]]]
                               [--model [JSON]]
                               [--model.external-file {Path,null}]
                               [--model.class-name {Unilateral,Bilateral,Midline}]
                               [--model.constructor {binary,trinary}]
                               [--model.max-time int]
                               [--model.named-params Sequence[str]]
                               [--model.kwargs dict[str,Any]]
                               [--distributions dict[str,JSON]]
                               [--t-stages-dist dict[str,float]]
                               [--modalities dict[str,JSON]]
                               [--params dict[str,float]] [--num-patients int]
                               [--output-file str] [--seed int]

Settings for the command-line interface.

Options:
  -h, --help            show this help message and exit
  --configs list[Path]  Path to the YAML file(s) that contain the
                        configuration(s). Configs from YAML files may be
                        overwritten by command line arguments. When multiple
                        files are specified, the configs are merged in the
                        order they are given. Note that every config file must
                        have a `version: 1` key in it. (default:
                        ['config.yaml'])
  --distributions dict[str,JSON]
                        Mapping of model T-categories to predefined
                        distributions over diagnose times. (default: {})
  --t-stages-dist dict[str,float]
                        Specify what fraction of generated patients should
                        come from the respective T-Stage. (required)
  --modalities dict[str,JSON]
                        (required)
  --params dict[str,float]
                        (required)
  --num-patients int    (default: 200)
  --output-file str     (required)
  --seed int            (default: 42)

Graph Options:
  Specifies how the tumor(s) and LNLs are connected in a DAG.

  --graph [JSON]        set graph from JSON string (default: {})
  --graph.tumor dict[str,list[str]]
                        Define the name of the tumor(s) and which LNLs it/they
                        drain to. (required)
  --graph.lnl dict[str,list[str]]
                        Define the name of the LNL(s) and which LNLs it/they
                        drain to. (required)

Model Options:
  Define which of the ``lymph`` models to use and how to set them up.

  --model [JSON]        set model from JSON string (default: {})
  --model.external-file {Path,null}
                        Path to a Python file that defines a model. (default:
                        None)
  --model.class-name {Unilateral,Bilateral,Midline}
                        Name of the model class to use. (default: Unilateral)
  --model.constructor {binary,trinary}
                        Trinary models differentiate btw. micro- and
                        macroscopic disease. (default: binary)
  --model.max-time int  Max. number of time-steps to evolve the model over.
                        (default: 10)
  --model.named-params Sequence[str]
                        Subset of valid model parameters a sampler may provide
                        in the form of a dictionary to the model instead of as
                        an array. Or, after sampling, with this list, one may
                        safely recover which parameter corresponds to which
                        index in the sample. (default: None)
  --model.kwargs dict[str,Any]
                        Additional keyword arguments to pass to the model
                        constructor. (default: {})