From bca9a9147ca2d1b35dc702f9a11f0c7c7966ac2a Mon Sep 17 00:00:00 2001
From: Romain Cledat
Date: Fri, 15 Nov 2024 10:55:12 -0800
Subject: [PATCH 01/30] Make CLI lazy load

This will allow parameters to be added when processing the `start` step
(for the upcoming config change).
---
 metaflow/cli.py                       | 706 +-------------------------
 metaflow/cli_components/__init__.py   |   0
 metaflow/cli_components/dump_cmd.py   |  90 ++++
 metaflow/cli_components/init_cmd.py   |  51 ++
 metaflow/cli_components/run_cmds.py   | 357 +++++++++++++
 metaflow/cli_components/step_cmd.py   | 182 +++++++
 metaflow/cli_components/utils.py      |  82 +++
 metaflow/extension_support/plugins.py |  68 ++-
 metaflow/plugins/__init__.py          |   4 +
 9 files changed, 824 insertions(+), 716 deletions(-)
 create mode 100644 metaflow/cli_components/__init__.py
 create mode 100644 metaflow/cli_components/dump_cmd.py
 create mode 100644 metaflow/cli_components/init_cmd.py
 create mode 100644 metaflow/cli_components/run_cmds.py
 create mode 100644 metaflow/cli_components/step_cmd.py
 create mode 100644 metaflow/cli_components/utils.py

diff --git a/metaflow/cli.py b/metaflow/cli.py
index 498ea1b74b2..83f340967c6 100644
--- a/metaflow/cli.py
+++ b/metaflow/cli.py
@@ -1,17 +1,15 @@
 import inspect
-import json
 import sys
 import traceback
 from datetime import datetime
-from functools import wraps

 import metaflow.tracing as tracing
 from metaflow._vendor import click
-from metaflow.client.core import get_metadata

-from . import decorators, lint, metaflow_version, namespace, parameters, plugins
+from . import decorators, lint, metaflow_version, parameters, plugins
 from .cli_args import cli_args
-from .datastore import FlowDataStore, TaskDataStore, TaskDataStoreSet
+from .cli_components.utils import LazyGroup, LazyPluginCommandCollection
+from .datastore import FlowDataStore
 from .exception import CommandException, MetaflowException
 from .graph import FlowGraph
 from .metaflow_config import (
@@ -26,8 +24,6 @@
 from .metaflow_current import current
 from metaflow.system import _system_monitor, _system_logger
 from .metaflow_environment import MetaflowEnvironment
-from .mflog import LOG_SOURCES, mflog
-from .package import MetaflowPackage
 from .plugins import (
     DATASTORES,
     ENVIRONMENTS,
@@ -37,16 +33,7 @@
 )
 from .pylint_wrapper import PyLint
 from .R import metaflow_r_version, use_r
-from .runtime import NativeRuntime
-from .tagging_util import validate_tags
-from .task import MetaflowTask
-from .unbounded_foreach import UBF_CONTROL, UBF_TASK
-from .util import (
-    decompress_list,
-    get_latest_run_id,
-    resolve_identity,
-    write_latest_run_id,
-)
+from .util import resolve_identity

 ERASE_TO_EOL = "\033[K"
 HIGHLIGHT = "red"
@@ -56,13 +43,6 @@
 LOGGER_COLOR = "green"
 LOGGER_BAD_COLOR = "red"

-try:
-    # Python 2
-    import cPickle as pickle
-except ImportError:
-    # Python 3
-    import pickle
-

 def echo_dev_null(*args, **kwargs):
     pass
@@ -141,7 +121,16 @@ def config_merge_cb(ctx, param, value):
     return tuple(list(value) + DECOSPECS.split())


-@click.group()
+@click.group(
+    cls=LazyGroup,
+    lazy_subcommands={
+        "init": "metaflow.cli_components.init_cmd.init",
+        "dump": "metaflow.cli_components.dump_cmd.dump",
+        "step": "metaflow.cli_components.step_cmd.step",
+        "run": "metaflow.cli_components.run_cmds.run",
+        "resume": "metaflow.cli_components.run_cmds.resume",
+    },
+)
 def cli(ctx):
     pass
@@ -221,653 +210,6 @@ def output_dot(obj):
     echo_always(obj.graph.output_dot(), err=False)


@cli.command(
    help="Get data artifacts of a task or all tasks in a step. 
" - "The format for input-path is either / or " - "//." -) -@click.argument("input-path") -@click.option( - "--private/--no-private", - default=False, - show_default=True, - help="Show also private attributes.", -) -@click.option( - "--max-value-size", - default=1000, - show_default=True, - type=int, - help="Show only values that are smaller than this number. " - "Set to 0 to see only keys.", -) -@click.option( - "--include", - type=str, - default="", - help="Include only artifacts in the given comma-separated list.", -) -@click.option( - "--file", type=str, default=None, help="Serialize artifacts in the given file." -) -@click.pass_obj -def dump(obj, input_path, private=None, max_value_size=None, include=None, file=None): - output = {} - kwargs = { - "show_private": private, - "max_value_size": max_value_size, - "include": {t for t in include.split(",") if t}, - } - - # Pathspec can either be run_id/step_name or run_id/step_name/task_id. - parts = input_path.split("/") - if len(parts) == 2: - run_id, step_name = parts - task_id = None - elif len(parts) == 3: - run_id, step_name, task_id = parts - else: - raise CommandException( - "input_path should either be run_id/step_name or run_id/step_name/task_id" - ) - - datastore_set = TaskDataStoreSet( - obj.flow_datastore, - run_id, - steps=[step_name], - prefetch_data_artifacts=kwargs.get("include"), - ) - if task_id: - ds_list = [datastore_set.get_with_pathspec(input_path)] - else: - ds_list = list(datastore_set) # get all tasks - - for ds in ds_list: - echo( - "Dumping output of run_id=*{run_id}* " - "step=*{step}* task_id=*{task_id}*".format( - run_id=ds.run_id, step=ds.step_name, task_id=ds.task_id - ), - fg="magenta", - ) - - if file is None: - echo_always( - ds.format(**kwargs), highlight="green", highlight_bold=False, err=False - ) - else: - output[ds.pathspec] = ds.to_dict(**kwargs) - - if file is not None: - with open(file, "wb") as f: - pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL) - echo("Artifacts written to *%s*" % file) - - -# TODO - move step and init under a separate 'internal' subcommand - - -@cli.command(help="Internal command to execute a single task.", hidden=True) -@click.argument("step-name") -@click.option( - "--run-id", - default=None, - required=True, - help="ID for one execution of all steps in the flow.", -) -@click.option( - "--task-id", - default=None, - required=True, - show_default=True, - help="ID for this instance of the step.", -) -@click.option( - "--input-paths", - help="A comma-separated list of pathspecs specifying inputs for this step.", -) -@click.option( - "--input-paths-filename", - type=click.Path(exists=True, readable=True, dir_okay=False, resolve_path=True), - help="A filename containing the argument typically passed to `input-paths`", - hidden=True, -) -@click.option( - "--split-index", - type=int, - default=None, - show_default=True, - help="Index of this foreach split.", -) -@click.option( - "--tag", - "opt_tag", - multiple=True, - default=None, - help="Annotate this run with the given tag. 
You can specify " - "this option multiple times to attach multiple tags in " - "the task.", -) -@click.option( - "--namespace", - "opt_namespace", - default=None, - help="Change namespace from the default (your username) to the specified tag.", -) -@click.option( - "--retry-count", - default=0, - help="How many times we have attempted to run this task.", -) -@click.option( - "--max-user-code-retries", - default=0, - help="How many times we should attempt running the user code.", -) -@click.option( - "--clone-only", - default=None, - help="Pathspec of the origin task for this task to clone. Do " - "not execute anything.", -) -@click.option( - "--clone-run-id", - default=None, - help="Run id of the origin flow, if this task is part of a flow being resumed.", -) -@click.option( - "--with", - "decospecs", - multiple=True, - help="Add a decorator to this task. You can specify this " - "option multiple times to attach multiple decorators " - "to this task.", -) -@click.option( - "--ubf-context", - default="none", - type=click.Choice(["none", UBF_CONTROL, UBF_TASK]), - help="Provides additional context if this task is of type unbounded foreach.", -) -@click.option( - "--num-parallel", - default=0, - type=int, - help="Number of parallel instances of a step. Ignored in local mode (see parallel decorator code).", -) -@click.pass_context -def step( - ctx, - step_name, - opt_tag=None, - run_id=None, - task_id=None, - input_paths=None, - input_paths_filename=None, - split_index=None, - opt_namespace=None, - retry_count=None, - max_user_code_retries=None, - clone_only=None, - clone_run_id=None, - decospecs=None, - ubf_context="none", - num_parallel=None, -): - if ubf_context == "none": - ubf_context = None - if opt_namespace is not None: - namespace(opt_namespace or None) - - func = None - try: - func = getattr(ctx.obj.flow, step_name) - except: - raise CommandException("Step *%s* doesn't exist." % step_name) - if not func.is_step: - raise CommandException("Function *%s* is not a step." % step_name) - echo("Executing a step, *%s*" % step_name, fg="magenta", bold=False) - - if decospecs: - decorators._attach_decorators_to_step(func, decospecs) - - step_kwargs = ctx.params - # Remove argument `step_name` from `step_kwargs`. - step_kwargs.pop("step_name", None) - # Remove `opt_*` prefix from (some) option keys. 
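    # For example, a params dict of {"opt_tag": ("foo",), "opt_namespace": None,
    # "run_id": "2"} becomes {"tag": ("foo",), "namespace": None, "run_id": "2"}
    # (example values are illustrative only).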
- step_kwargs = dict( - [(k[4:], v) if k.startswith("opt_") else (k, v) for k, v in step_kwargs.items()] - ) - cli_args._set_step_kwargs(step_kwargs) - - ctx.obj.metadata.add_sticky_tags(tags=opt_tag) - if not input_paths and input_paths_filename: - with open(input_paths_filename, mode="r", encoding="utf-8") as f: - input_paths = f.read().strip(" \n\"'") - - paths = decompress_list(input_paths) if input_paths else [] - - task = MetaflowTask( - ctx.obj.flow, - ctx.obj.flow_datastore, - ctx.obj.metadata, - ctx.obj.environment, - ctx.obj.echo, - ctx.obj.event_logger, - ctx.obj.monitor, - ubf_context, - ) - if clone_only: - task.clone_only( - step_name, - run_id, - task_id, - clone_only, - retry_count, - ) - else: - task.run_step( - step_name, - run_id, - task_id, - clone_run_id, - paths, - split_index, - retry_count, - max_user_code_retries, - ) - - echo("Success", fg="green", bold=True, indent=True) - - -@parameters.add_custom_parameters(deploy_mode=False) -@cli.command(help="Internal command to initialize a run.", hidden=True) -@click.option( - "--run-id", - default=None, - required=True, - help="ID for one execution of all steps in the flow.", -) -@click.option( - "--task-id", default=None, required=True, help="ID for this instance of the step." -) -@click.option( - "--tag", - "tags", - multiple=True, - default=None, - help="Tags for this instance of the step.", -) -@click.pass_obj -def init(obj, run_id=None, task_id=None, tags=None, **kwargs): - # init is a separate command instead of an option in 'step' - # since we need to capture user-specified parameters with - # @add_custom_parameters. Adding custom parameters to 'step' - # is not desirable due to the possibility of name clashes between - # user-specified parameters and our internal options. Note that - # user-specified parameters are often defined as environment - # variables. - - obj.metadata.add_sticky_tags(tags=tags) - - runtime = NativeRuntime( - obj.flow, - obj.graph, - obj.flow_datastore, - obj.metadata, - obj.environment, - obj.package, - obj.logger, - obj.entrypoint, - obj.event_logger, - obj.monitor, - run_id=run_id, - ) - obj.flow._set_constants(obj.graph, kwargs) - runtime.persist_constants(task_id=task_id) - - -def common_run_options(func): - @click.option( - "--tag", - "tags", - multiple=True, - default=None, - help="Annotate this run with the given tag. You can specify " - "this option multiple times to attach multiple tags in " - "the run.", - ) - @click.option( - "--max-workers", - default=16, - show_default=True, - help="Maximum number of parallel processes.", - ) - @click.option( - "--max-num-splits", - default=100, - show_default=True, - help="Maximum number of splits allowed in a foreach. This " - "is a safety check preventing bugs from triggering " - "thousands of steps inadvertently.", - ) - @click.option( - "--max-log-size", - default=10, - show_default=True, - help="Maximum size of stdout and stderr captured in " - "megabytes. If a step outputs more than this to " - "stdout/stderr, its output will be truncated.", - ) - @click.option( - "--with", - "decospecs", - multiple=True, - help="Add a decorator to all steps. 
You can specify this " - "option multiple times to attach multiple decorators " - "in steps.", - ) - @click.option( - "--run-id-file", - default=None, - show_default=True, - type=str, - help="Write the ID of this run to the file specified.", - ) - @click.option( - "--runner-attribute-file", - default=None, - show_default=True, - type=str, - help="Write the metadata and pathspec of this run to the file specified. Used internally for Metaflow's Runner API.", - ) - @wraps(func) - def wrapper(*args, **kwargs): - return func(*args, **kwargs) - - return wrapper - - -@click.option( - "--origin-run-id", - default=None, - help="ID of the run that should be resumed. By default, the " - "last run executed locally.", -) -@click.option( - "--run-id", - default=None, - help="Run ID for the new run. By default, a new run-id will be generated", - hidden=True, -) -@click.option( - "--clone-only/--no-clone-only", - default=False, - show_default=True, - help="Only clone tasks without continuing execution", - hidden=True, -) -@click.option( - "--reentrant/--no-reentrant", - default=False, - show_default=True, - hidden=True, - help="If specified, allows this call to be called in parallel", -) -@click.option( - "--resume-identifier", - default=None, - show_default=True, - hidden=True, - help="If specified, it identifies the task that started this resume call. It is in the form of {step_name}-{task_id}", -) -@click.argument("step-to-rerun", required=False) -@cli.command(help="Resume execution of a previous run of this flow.") -@common_run_options -@click.pass_obj -def resume( - obj, - tags=None, - step_to_rerun=None, - origin_run_id=None, - run_id=None, - clone_only=False, - reentrant=False, - max_workers=None, - max_num_splits=None, - max_log_size=None, - decospecs=None, - run_id_file=None, - resume_identifier=None, - runner_attribute_file=None, -): - before_run(obj, tags, decospecs) - - if origin_run_id is None: - origin_run_id = get_latest_run_id(obj.echo, obj.flow.name) - if origin_run_id is None: - raise CommandException( - "A previous run id was not found. Specify --origin-run-id." - ) - - if step_to_rerun is None: - steps_to_rerun = set() - else: - # validate step name - if step_to_rerun not in obj.graph.nodes: - raise CommandException( - "invalid step name {0} specified, must be step present in " - "current form of execution graph. Valid step names include: {1}".format( - step_to_rerun, ",".join(list(obj.graph.nodes.keys())) - ) - ) - steps_to_rerun = {step_to_rerun} - - if run_id: - # Run-ids that are provided by the metadata service are always integers. - # External providers or run-ids (like external schedulers) always need to - # be non-integers to avoid any clashes. This condition ensures this. 
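        # For example, an externally-provided run-id such as "resume_run_1" is
        # accepted here, whereas "1234" is rejected because it could clash with
        # an id handed out by the metadata service (example values are
        # illustrative only).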
- try: - int(run_id) - except: - pass - else: - raise CommandException("run-id %s cannot be an integer" % run_id) - - runtime = NativeRuntime( - obj.flow, - obj.graph, - obj.flow_datastore, - obj.metadata, - obj.environment, - obj.package, - obj.logger, - obj.entrypoint, - obj.event_logger, - obj.monitor, - run_id=run_id, - clone_run_id=origin_run_id, - clone_only=clone_only, - reentrant=reentrant, - steps_to_rerun=steps_to_rerun, - max_workers=max_workers, - max_num_splits=max_num_splits, - max_log_size=max_log_size * 1024 * 1024, - resume_identifier=resume_identifier, - ) - write_file(run_id_file, runtime.run_id) - runtime.print_workflow_info() - - runtime.persist_constants() - - if runner_attribute_file: - with open(runner_attribute_file, "w", encoding="utf-8") as f: - json.dump( - { - "run_id": runtime.run_id, - "flow_name": obj.flow.name, - "metadata": obj.metadata.metadata_str(), - }, - f, - ) - - # We may skip clone-only resume if this is not a resume leader, - # and clone is already complete. - if runtime.should_skip_clone_only_execution(): - return - - current._update_env( - { - "run_id": runtime.run_id, - } - ) - _system_logger.log_event( - level="info", - module="metaflow.resume", - name="start", - payload={ - "msg": "Resuming run", - }, - ) - - with runtime.run_heartbeat(): - if clone_only: - runtime.clone_original_run() - else: - runtime.clone_original_run(generate_task_obj=True, verbose=False) - runtime.execute() - - -@tracing.cli_entrypoint("cli/run") -@parameters.add_custom_parameters(deploy_mode=True) -@cli.command(help="Run the workflow locally.") -@common_run_options -@click.option( - "--namespace", - "user_namespace", - default=None, - help="Change namespace from the default (your username) to " - "the specified tag. Note that this option does not alter " - "tags assigned to the objects produced by this run, just " - "what existing objects are visible in the client API. You " - "can enable the global namespace with an empty string." 
- "--namespace=", -) -@click.pass_obj -def run( - obj, - tags=None, - max_workers=None, - max_num_splits=None, - max_log_size=None, - decospecs=None, - run_id_file=None, - runner_attribute_file=None, - user_namespace=None, - **kwargs -): - if user_namespace is not None: - namespace(user_namespace or None) - before_run(obj, tags, decospecs) - - runtime = NativeRuntime( - obj.flow, - obj.graph, - obj.flow_datastore, - obj.metadata, - obj.environment, - obj.package, - obj.logger, - obj.entrypoint, - obj.event_logger, - obj.monitor, - max_workers=max_workers, - max_num_splits=max_num_splits, - max_log_size=max_log_size * 1024 * 1024, - ) - write_latest_run_id(obj, runtime.run_id) - write_file(run_id_file, runtime.run_id) - - obj.flow._set_constants(obj.graph, kwargs) - current._update_env( - { - "run_id": runtime.run_id, - } - ) - _system_logger.log_event( - level="info", - module="metaflow.run", - name="start", - payload={ - "msg": "Starting run", - }, - ) - with runtime.run_heartbeat(): - runtime.print_workflow_info() - runtime.persist_constants() - - if runner_attribute_file: - with open(runner_attribute_file, "w", encoding="utf-8") as f: - json.dump( - { - "run_id": runtime.run_id, - "flow_name": obj.flow.name, - "metadata": obj.metadata.metadata_str(), - }, - f, - ) - runtime.execute() - - -def write_file(file_path, content): - if file_path is not None: - with open(file_path, "w") as f: - f.write(str(content)) - - -def before_run(obj, tags, decospecs): - validate_tags(tags) - - # There's a --with option both at the top-level and for the run - # subcommand. Why? - # - # "run --with shoes" looks so much better than "--with shoes run". - # This is a very common use case of --with. - # - # A downside is that we need to have the following decorators handling - # in two places in this module and make sure _init_step_decorators - # doesn't get called twice. - - # We want the order to be the following: - # - run level decospecs - # - top level decospecs - # - environment decospecs - all_decospecs = ( - list(decospecs or []) - + obj.tl_decospecs - + list(obj.environment.decospecs() or []) - ) - if all_decospecs: - decorators._attach_decorators(obj.flow, all_decospecs) - obj.graph = FlowGraph(obj.flow.__class__) - - obj.check(obj.graph, obj.flow, obj.environment, pylint=obj.pylint) - # obj.environment.init_environment(obj.logger) - - decorators._init_step_decorators( - obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger - ) - - obj.metadata.add_sticky_tags(tags=tags) - - # Package working directory only once per run. - # We explicitly avoid doing this in `start` since it is invoked for every - # step in the run. 
- obj.package = MetaflowPackage( - obj.flow, obj.environment, obj.echo, obj.package_suffixes - ) - - @cli.command(help="Print the Metaflow version") @click.pass_obj def version(obj): @@ -877,8 +219,9 @@ def version(obj): @tracing.cli_entrypoint("cli/start") @decorators.add_decorator_options @click.command( - cls=click.CommandCollection, - sources=[cli] + plugins.get_plugin_cli(), + cls=LazyPluginCommandCollection, + sources=[cli], + lazy_sources=plugins.get_plugin_cli_path(), invoke_without_command=True, ) @click.option( @@ -983,7 +326,6 @@ def start( ctx.obj.pylint = pylint ctx.obj.top_cli = cli ctx.obj.package_suffixes = package_suffixes.split(",") - ctx.obj.reconstruct_cli = _reconstruct_cli ctx.obj.environment = [ e for e in ENVIRONMENTS + [MetaflowEnvironment] if e.TYPE == environment @@ -1079,20 +421,6 @@ def start( ctx.invoke(check) -def _reconstruct_cli(params): - for k, v in params.items(): - if v: - if k == "decospecs": - k = "with" - k = k.replace("_", "-") - if not isinstance(v, tuple): - v = [v] - for value in v: - yield "--%s" % k - if not isinstance(value, bool): - yield str(value) - - def _check(graph, flow, environment, pylint=True, warnings=False, **kwargs): echo("Validating your flow...", fg="magenta", bold=False) linter = lint.linter diff --git a/metaflow/cli_components/__init__.py b/metaflow/cli_components/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/metaflow/cli_components/dump_cmd.py b/metaflow/cli_components/dump_cmd.py new file mode 100644 index 00000000000..6845f01fa73 --- /dev/null +++ b/metaflow/cli_components/dump_cmd.py @@ -0,0 +1,90 @@ +import pickle + +from metaflow._vendor import click + +from ..cli import echo_always, echo +from ..datastore import TaskDataStoreSet +from ..exception import CommandException + + +@click.command( + help="Get data artifacts of a task or all tasks in a step. " + "The format for input-path is either / or " + "//." +) +@click.argument("input-path") +@click.option( + "--private/--no-private", + default=False, + show_default=True, + help="Show also private attributes.", +) +@click.option( + "--max-value-size", + default=1000, + show_default=True, + type=int, + help="Show only values that are smaller than this number. " + "Set to 0 to see only keys.", +) +@click.option( + "--include", + type=str, + default="", + help="Include only artifacts in the given comma-separated list.", +) +@click.option( + "--file", type=str, default=None, help="Serialize artifacts in the given file." +) +@click.pass_obj +def dump(obj, input_path, private=None, max_value_size=None, include=None, file=None): + output = {} + kwargs = { + "show_private": private, + "max_value_size": max_value_size, + "include": {t for t in include.split(",") if t}, + } + + # Pathspec can either be run_id/step_name or run_id/step_name/task_id. 
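    # For example, an input-path of "176/start" dumps every task of step
    # `start` in run 176, while "176/start/1" narrows it to task 1 (example
    # pathspec values are illustrative only).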
+ parts = input_path.split("/") + if len(parts) == 2: + run_id, step_name = parts + task_id = None + elif len(parts) == 3: + run_id, step_name, task_id = parts + else: + raise CommandException( + "input_path should either be run_id/step_name or run_id/step_name/task_id" + ) + + datastore_set = TaskDataStoreSet( + obj.flow_datastore, + run_id, + steps=[step_name], + prefetch_data_artifacts=kwargs.get("include"), + ) + if task_id: + ds_list = [datastore_set.get_with_pathspec(input_path)] + else: + ds_list = list(datastore_set) # get all tasks + + for ds in ds_list: + echo( + "Dumping output of run_id=*{run_id}* " + "step=*{step}* task_id=*{task_id}*".format( + run_id=ds.run_id, step=ds.step_name, task_id=ds.task_id + ), + fg="magenta", + ) + + if file is None: + echo_always( + ds.format(**kwargs), highlight="green", highlight_bold=False, err=False + ) + else: + output[ds.pathspec] = ds.to_dict(**kwargs) + + if file is not None: + with open(file, "wb") as f: + pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL) + echo("Artifacts written to *%s*" % file) diff --git a/metaflow/cli_components/init_cmd.py b/metaflow/cli_components/init_cmd.py new file mode 100644 index 00000000000..404a3a6911e --- /dev/null +++ b/metaflow/cli_components/init_cmd.py @@ -0,0 +1,51 @@ +from metaflow._vendor import click + +from .. import parameters +from ..runtime import NativeRuntime + + +@parameters.add_custom_parameters(deploy_mode=False) +@click.command(help="Internal command to initialize a run.", hidden=True) +@click.option( + "--run-id", + default=None, + required=True, + help="ID for one execution of all steps in the flow.", +) +@click.option( + "--task-id", default=None, required=True, help="ID for this instance of the step." +) +@click.option( + "--tag", + "tags", + multiple=True, + default=None, + help="Tags for this instance of the step.", +) +@click.pass_obj +def init(obj, run_id=None, task_id=None, tags=None, **kwargs): + # init is a separate command instead of an option in 'step' + # since we need to capture user-specified parameters with + # @add_custom_parameters. Adding custom parameters to 'step' + # is not desirable due to the possibility of name clashes between + # user-specified parameters and our internal options. Note that + # user-specified parameters are often defined as environment + # variables. + + obj.metadata.add_sticky_tags(tags=tags) + + runtime = NativeRuntime( + obj.flow, + obj.graph, + obj.flow_datastore, + obj.metadata, + obj.environment, + obj.package, + obj.logger, + obj.entrypoint, + obj.event_logger, + obj.monitor, + run_id=run_id, + ) + obj.flow._set_constants(obj.graph, kwargs) + runtime.persist_constants(task_id=task_id) diff --git a/metaflow/cli_components/run_cmds.py b/metaflow/cli_components/run_cmds.py new file mode 100644 index 00000000000..ec34c3bb4c3 --- /dev/null +++ b/metaflow/cli_components/run_cmds.py @@ -0,0 +1,357 @@ +import json + +from functools import wraps + +from metaflow._vendor import click + +from .. import decorators, namespace, parameters, tracing +from ..exception import CommandException +from ..graph import FlowGraph +from ..metaflow_current import current +from ..package import MetaflowPackage +from ..runtime import NativeRuntime +from ..system import _system_logger + +from ..tagging_util import validate_tags +from ..util import get_latest_run_id, write_latest_run_id + + +def before_run(obj, tags, decospecs): + validate_tags(tags) + + # There's a --with option both at the top-level and for the run + # subcommand. Why? 
+ # + # "run --with shoes" looks so much better than "--with shoes run". + # This is a very common use case of --with. + # + # A downside is that we need to have the following decorators handling + # in two places in this module and make sure _init_step_decorators + # doesn't get called twice. + + # We want the order to be the following: + # - run level decospecs + # - top level decospecs + # - environment decospecs + all_decospecs = ( + list(decospecs or []) + + obj.tl_decospecs + + list(obj.environment.decospecs() or []) + ) + if all_decospecs: + decorators._attach_decorators(obj.flow, all_decospecs) + obj.graph = FlowGraph(obj.flow.__class__) + + obj.check(obj.graph, obj.flow, obj.environment, pylint=obj.pylint) + # obj.environment.init_environment(obj.logger) + + decorators._init_step_decorators( + obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger + ) + + obj.metadata.add_sticky_tags(tags=tags) + + # Package working directory only once per run. + # We explicitly avoid doing this in `start` since it is invoked for every + # step in the run. + obj.package = MetaflowPackage( + obj.flow, obj.environment, obj.echo, obj.package_suffixes + ) + + +def write_file(file_path, content): + if file_path is not None: + with open(file_path, "w", encoding="utf-8") as f: + f.write(str(content)) + + +def common_run_options(func): + @click.option( + "--tag", + "tags", + multiple=True, + default=None, + help="Annotate this run with the given tag. You can specify " + "this option multiple times to attach multiple tags in " + "the run.", + ) + @click.option( + "--max-workers", + default=16, + show_default=True, + help="Maximum number of parallel processes.", + ) + @click.option( + "--max-num-splits", + default=100, + show_default=True, + help="Maximum number of splits allowed in a foreach. This " + "is a safety check preventing bugs from triggering " + "thousands of steps inadvertently.", + ) + @click.option( + "--max-log-size", + default=10, + show_default=True, + help="Maximum size of stdout and stderr captured in " + "megabytes. If a step outputs more than this to " + "stdout/stderr, its output will be truncated.", + ) + @click.option( + "--with", + "decospecs", + multiple=True, + help="Add a decorator to all steps. You can specify this " + "option multiple times to attach multiple decorators " + "in steps.", + ) + @click.option( + "--run-id-file", + default=None, + show_default=True, + type=str, + help="Write the ID of this run to the file specified.", + ) + @click.option( + "--runner-attribute-file", + default=None, + show_default=True, + type=str, + help="Write the metadata and pathspec of this run to the file specified. Used internally for Metaflow's Runner API.", + ) + @wraps(func) + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper + + +@click.option( + "--origin-run-id", + default=None, + help="ID of the run that should be resumed. By default, the " + "last run executed locally.", +) +@click.option( + "--run-id", + default=None, + help="Run ID for the new run. 
By default, a new run-id will be generated", + hidden=True, +) +@click.option( + "--clone-only/--no-clone-only", + default=False, + show_default=True, + help="Only clone tasks without continuing execution", + hidden=True, +) +@click.option( + "--reentrant/--no-reentrant", + default=False, + show_default=True, + hidden=True, + help="If specified, allows this call to be called in parallel", +) +@click.option( + "--resume-identifier", + default=None, + show_default=True, + hidden=True, + help="If specified, it identifies the task that started this resume call. It is in the form of {step_name}-{task_id}", +) +@click.argument("step-to-rerun", required=False) +@click.command(help="Resume execution of a previous run of this flow.") +@common_run_options +@click.pass_obj +def resume( + obj, + tags=None, + step_to_rerun=None, + origin_run_id=None, + run_id=None, + clone_only=False, + reentrant=False, + max_workers=None, + max_num_splits=None, + max_log_size=None, + decospecs=None, + run_id_file=None, + resume_identifier=None, + runner_attribute_file=None, +): + before_run(obj, tags, decospecs) + + if origin_run_id is None: + origin_run_id = get_latest_run_id(obj.echo, obj.flow.name) + if origin_run_id is None: + raise CommandException( + "A previous run id was not found. Specify --origin-run-id." + ) + + if step_to_rerun is None: + steps_to_rerun = set() + else: + # validate step name + if step_to_rerun not in obj.graph.nodes: + raise CommandException( + "invalid step name {0} specified, must be step present in " + "current form of execution graph. Valid step names include: {1}".format( + step_to_rerun, ",".join(list(obj.graph.nodes.keys())) + ) + ) + steps_to_rerun = {step_to_rerun} + + if run_id: + # Run-ids that are provided by the metadata service are always integers. + # External providers or run-ids (like external schedulers) always need to + # be non-integers to avoid any clashes. This condition ensures this. + try: + int(run_id) + except: + pass + else: + raise CommandException("run-id %s cannot be an integer" % run_id) + + runtime = NativeRuntime( + obj.flow, + obj.graph, + obj.flow_datastore, + obj.metadata, + obj.environment, + obj.package, + obj.logger, + obj.entrypoint, + obj.event_logger, + obj.monitor, + run_id=run_id, + clone_run_id=origin_run_id, + clone_only=clone_only, + reentrant=reentrant, + steps_to_rerun=steps_to_rerun, + max_workers=max_workers, + max_num_splits=max_num_splits, + max_log_size=max_log_size * 1024 * 1024, + resume_identifier=resume_identifier, + ) + write_file(run_id_file, runtime.run_id) + runtime.print_workflow_info() + + runtime.persist_constants() + + if runner_attribute_file: + with open(runner_attribute_file, "w", encoding="utf-8") as f: + json.dump( + { + "run_id": runtime.run_id, + "flow_name": obj.flow.name, + "metadata": obj.metadata.metadata_str(), + }, + f, + ) + + # We may skip clone-only resume if this is not a resume leader, + # and clone is already complete. 
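    # (This matters for --reentrant resumes, where several resume processes may
    # share one run-id; a non-leader can return here once cloning is done.)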
+ if runtime.should_skip_clone_only_execution(): + return + + current._update_env( + { + "run_id": runtime.run_id, + } + ) + _system_logger.log_event( + level="info", + module="metaflow.resume", + name="start", + payload={ + "msg": "Resuming run", + }, + ) + + with runtime.run_heartbeat(): + if clone_only: + runtime.clone_original_run() + else: + runtime.clone_original_run(generate_task_obj=True, verbose=False) + runtime.execute() + + +@parameters.add_custom_parameters(deploy_mode=True) +@click.command(help="Run the workflow locally.") +@tracing.cli_entrypoint("cli/run") +@common_run_options +@click.option( + "--namespace", + "user_namespace", + default=None, + help="Change namespace from the default (your username) to " + "the specified tag. Note that this option does not alter " + "tags assigned to the objects produced by this run, just " + "what existing objects are visible in the client API. You " + "can enable the global namespace with an empty string." + "--namespace=", +) +@click.pass_obj +def run( + obj, + tags=None, + max_workers=None, + max_num_splits=None, + max_log_size=None, + decospecs=None, + run_id_file=None, + runner_attribute_file=None, + user_namespace=None, + **kwargs +): + if user_namespace is not None: + namespace(user_namespace or None) + before_run(obj, tags, decospecs) + + runtime = NativeRuntime( + obj.flow, + obj.graph, + obj.flow_datastore, + obj.metadata, + obj.environment, + obj.package, + obj.logger, + obj.entrypoint, + obj.event_logger, + obj.monitor, + max_workers=max_workers, + max_num_splits=max_num_splits, + max_log_size=max_log_size * 1024 * 1024, + ) + write_latest_run_id(obj, runtime.run_id) + write_file(run_id_file, runtime.run_id) + + obj.flow._set_constants(obj.graph, kwargs) + current._update_env( + { + "run_id": runtime.run_id, + } + ) + _system_logger.log_event( + level="info", + module="metaflow.run", + name="start", + payload={ + "msg": "Starting run", + }, + ) + with runtime.run_heartbeat(): + runtime.print_workflow_info() + runtime.persist_constants() + + if runner_attribute_file: + with open(runner_attribute_file, "w", encoding="utf-8") as f: + json.dump( + { + "run_id": runtime.run_id, + "flow_name": obj.flow.name, + "metadata": obj.metadata.metadata_str(), + }, + f, + ) + runtime.execute() diff --git a/metaflow/cli_components/step_cmd.py b/metaflow/cli_components/step_cmd.py new file mode 100644 index 00000000000..6c7015e60bc --- /dev/null +++ b/metaflow/cli_components/step_cmd.py @@ -0,0 +1,182 @@ +from metaflow._vendor import click + +from .. 
import decorators, namespace +from ..cli import echo +from ..cli_args import cli_args +from ..exception import CommandException +from ..task import MetaflowTask +from ..unbounded_foreach import UBF_CONTROL, UBF_TASK +from ..util import decompress_list + + +@click.command(help="Internal command to execute a single task.", hidden=True) +@click.argument("step-name") +@click.option( + "--run-id", + default=None, + required=True, + help="ID for one execution of all steps in the flow.", +) +@click.option( + "--task-id", + default=None, + required=True, + show_default=True, + help="ID for this instance of the step.", +) +@click.option( + "--input-paths", + help="A comma-separated list of pathspecs specifying inputs for this step.", +) +@click.option( + "--input-paths-filename", + type=click.Path(exists=True, readable=True, dir_okay=False, resolve_path=True), + help="A filename containing the argument typically passed to `input-paths`", + hidden=True, +) +@click.option( + "--split-index", + type=int, + default=None, + show_default=True, + help="Index of this foreach split.", +) +@click.option( + "--tag", + "opt_tag", + multiple=True, + default=None, + help="Annotate this run with the given tag. You can specify " + "this option multiple times to attach multiple tags in " + "the task.", +) +@click.option( + "--namespace", + "opt_namespace", + default=None, + help="Change namespace from the default (your username) to the specified tag.", +) +@click.option( + "--retry-count", + default=0, + help="How many times we have attempted to run this task.", +) +@click.option( + "--max-user-code-retries", + default=0, + help="How many times we should attempt running the user code.", +) +@click.option( + "--clone-only", + default=None, + help="Pathspec of the origin task for this task to clone. Do " + "not execute anything.", +) +@click.option( + "--clone-run-id", + default=None, + help="Run id of the origin flow, if this task is part of a flow being resumed.", +) +@click.option( + "--with", + "decospecs", + multiple=True, + help="Add a decorator to this task. You can specify this " + "option multiple times to attach multiple decorators " + "to this task.", +) +@click.option( + "--ubf-context", + default="none", + type=click.Choice(["none", UBF_CONTROL, UBF_TASK]), + help="Provides additional context if this task is of type unbounded foreach.", +) +@click.option( + "--num-parallel", + default=0, + type=int, + help="Number of parallel instances of a step. Ignored in local mode (see parallel decorator code).", +) +@click.pass_context +def step( + ctx, + step_name, + opt_tag=None, + run_id=None, + task_id=None, + input_paths=None, + input_paths_filename=None, + split_index=None, + opt_namespace=None, + retry_count=None, + max_user_code_retries=None, + clone_only=None, + clone_run_id=None, + decospecs=None, + ubf_context="none", + num_parallel=None, +): + if ubf_context == "none": + ubf_context = None + if opt_namespace is not None: + namespace(opt_namespace or None) + + func = None + try: + func = getattr(ctx.obj.flow, step_name) + except: + raise CommandException("Step *%s* doesn't exist." % step_name) + if not func.is_step: + raise CommandException("Function *%s* is not a step." % step_name) + echo("Executing a step, *%s*" % step_name, fg="magenta", bold=False) + + if decospecs: + decorators._attach_decorators_to_step(func, decospecs) + + step_kwargs = ctx.params + # Remove argument `step_name` from `step_kwargs`. + step_kwargs.pop("step_name", None) + # Remove `opt_*` prefix from (some) option keys. 
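    # The `opt_` prefix only exists to avoid name clashes inside this module
    # (for example, `opt_namespace` would otherwise shadow the imported
    # `namespace` function); the prefix is stripped before the values are
    # handed to cli_args below.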
+ step_kwargs = dict( + [(k[4:], v) if k.startswith("opt_") else (k, v) for k, v in step_kwargs.items()] + ) + cli_args._set_step_kwargs(step_kwargs) + + ctx.obj.metadata.add_sticky_tags(tags=opt_tag) + if not input_paths and input_paths_filename: + with open(input_paths_filename, mode="r", encoding="utf-8") as f: + input_paths = f.read().strip(" \n\"'") + + paths = decompress_list(input_paths) if input_paths else [] + + task = MetaflowTask( + ctx.obj.flow, + ctx.obj.flow_datastore, + ctx.obj.metadata, + ctx.obj.environment, + ctx.obj.echo, + ctx.obj.event_logger, + ctx.obj.monitor, + ubf_context, + ) + if clone_only: + task.clone_only( + step_name, + run_id, + task_id, + clone_only, + retry_count, + ) + else: + task.run_step( + step_name, + run_id, + task_id, + clone_run_id, + paths, + split_index, + retry_count, + max_user_code_retries, + ) + + echo("Success", fg="green", bold=True, indent=True) diff --git a/metaflow/cli_components/utils.py b/metaflow/cli_components/utils.py new file mode 100644 index 00000000000..5de8d25f084 --- /dev/null +++ b/metaflow/cli_components/utils.py @@ -0,0 +1,82 @@ +import importlib +from metaflow._vendor import click +from metaflow.extension_support.plugins import get_plugin + + +class LazyPluginCommandCollection(click.CommandCollection): + # lazy_source should only point to things that are resolved as CLI plugins. + def __init__(self, *args, lazy_sources=None, **kwargs): + super().__init__(*args, **kwargs) + # lazy_sources is a list of strings in the form + # "{plugin_name}" -> "{module-name}.{command-object-name}" + self.lazy_sources = lazy_sources or {} + self._lazy_loaded = {} + + def list_commands(self, ctx): + base = super().list_commands(ctx) + for source_name, source in self.lazy_sources.items(): + subgroup = self._lazy_load(source_name, source) + base.extend(subgroup.list_commands(ctx)) + return base + + def get_command(self, ctx, cmd_name): + base_cmd = super().get_command(ctx, cmd_name) + if base_cmd is not None: + return base_cmd + for source_name, source in self.lazy_sources.items(): + subgroup = self._lazy_load(source_name, source) + cmd = subgroup.get_command(ctx, cmd_name) + if cmd is not None: + return cmd + return None + + def _lazy_load(self, source_name, source_path): + if source_name in self._lazy_loaded: + return self._lazy_loaded[source_name] + cmd_object = get_plugin("cli", source_path, source_name) + if not isinstance(cmd_object, click.Group): + raise ValueError( + f"Lazy loading of {source_name} failed by returning " + "a non-group object" + ) + self._lazy_loaded[source_name] = cmd_object + return cmd_object + + +class LazyGroup(click.Group): + def __init__(self, *args, lazy_subcommands=None, **kwargs): + super().__init__(*args, **kwargs) + # lazy_subcommands is a list of strings in the form + # "{command} -> "{module-name}.{command-object-name}" + self.lazy_subcommands = lazy_subcommands or {} + self._lazy_loaded = {} + + def list_commands(self, ctx): + base = super().list_commands(ctx) + lazy = sorted(self.lazy_subcommands.keys()) + return base + lazy + + def get_command(self, ctx, cmd_name): + if cmd_name in self.lazy_subcommands: + return self._lazy_load(cmd_name) + return super().get_command(ctx, cmd_name) + + def _lazy_load(self, cmd_name): + if cmd_name in self._lazy_loaded: + return self._lazy_loaded[cmd_name] + + import_path = self.lazy_subcommands[cmd_name] + modname, cmd = import_path.rsplit(".", 1) + # do the import + mod = importlib.import_module(modname) + # get the Command object from that module + cmd_object = 
getattr(mod, cmd) + # check the result to make debugging easier. note that wrapped BaseCommand + # can be functions + if not isinstance(cmd_object, click.BaseCommand): + raise ValueError( + f"Lazy loading of {import_path} failed by returning " + f"a non-command object {type(cmd_object)}" + ) + self._lazy_loaded[cmd_name] = cmd_object + return cmd_object diff --git a/metaflow/extension_support/plugins.py b/metaflow/extension_support/plugins.py index 9472202c510..8007917793b 100644 --- a/metaflow/extension_support/plugins.py +++ b/metaflow/extension_support/plugins.py @@ -93,7 +93,32 @@ def merge_lists(base, overrides, attr): base[:] = l[:] -def resolve_plugins(category): +def get_plugin(category, class_path, name): + path, cls_name = class_path.rsplit(".", 1) + try: + plugin_module = importlib.import_module(path) + except ImportError as e: + raise ValueError( + "Cannot locate %s plugin '%s' at '%s'" % (category, name, path) + ) from e + cls = getattr(plugin_module, cls_name, None) + if cls is None: + raise ValueError( + "Cannot locate '%s' class for %s plugin at '%s'" + % (cls_name, category, path) + ) + extracted_name = get_plugin_name(category, cls) + if extracted_name and extracted_name != name: + raise ValueError( + "Class '%s' at '%s' for %s plugin expected to be named '%s' but got '%s'" + % (cls_name, path, category, name, extracted_name) + ) + globals()[cls_name] = cls + _ext_debug(" Added %s plugin '%s' from '%s'" % (category, name, class_path)) + return cls + + +def resolve_plugins(category, path_only=False): # Called to return a list of classes that are the available plugins for 'category' # The ENABLED_ variable is set in process_plugins @@ -114,7 +139,7 @@ def resolve_plugins(category): available_plugins = globals()[_dict_for_category(category)] name_extractor = _plugin_categories[category] - if not name_extractor: + if path_only or not name_extractor: # If we have no name function, it means we just use the name in the dictionary # and we return a dictionary. 
This is for sidecars mostly as they do not have # a field that indicates their name @@ -132,32 +157,14 @@ def resolve_plugins(category): "Configuration requested %s plugin '%s' but no such plugin is available" % (category, name) ) - path, cls_name = class_path.rsplit(".", 1) - try: - plugin_module = importlib.import_module(path) - except ImportError: - raise ValueError( - "Cannot locate %s plugin '%s' at '%s'" % (category, name, path) - ) - cls = getattr(plugin_module, cls_name, None) - if cls is None: - raise ValueError( - "Cannot locate '%s' class for %s plugin at '%s'" - % (cls_name, category, path) - ) - if name_extractor and name_extractor(cls) != name: - raise ValueError( - "Class '%s' at '%s' for %s plugin expected to be named '%s' but got '%s'" - % (cls_name, path, category, name, name_extractor(cls)) - ) - globals()[cls_name] = cls - if name_extractor is not None: - to_return.append(cls) + if path_only: + to_return[name] = class_path else: - to_return[name] = cls - _ext_debug( - " Added %s plugin '%s' from '%s'" % (category, name, class_path) - ) + if name_extractor is not None: + to_return.append(get_plugin(category, class_path, name)) + else: + to_return[name] = get_plugin(category, class_path, name) + return to_return @@ -193,6 +200,13 @@ def resolve_plugins(category): } +def get_plugin_name(category, plugin): + extractor = _plugin_categories[category] + if extractor: + return extractor(plugin) + return None + + def _list_for_category(category): # Convenience function to name the variable containing List[Tuple[str, str]] where # each tuple contains: diff --git a/metaflow/plugins/__init__.py b/metaflow/plugins/__init__.py index f2a4d7cdb43..0e6ff820cba 100644 --- a/metaflow/plugins/__init__.py +++ b/metaflow/plugins/__init__.py @@ -164,6 +164,10 @@ def get_plugin_cli(): return resolve_plugins("cli") +def get_plugin_cli_path(): + return resolve_plugins("cli", path_only=True) + + STEP_DECORATORS = resolve_plugins("step_decorator") FLOW_DECORATORS = resolve_plugins("flow_decorator") ENVIRONMENTS = resolve_plugins("environment") From d5543dfbc3a121f8aac9edce10834de8cd18857d Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 19 Nov 2024 00:34:54 -0800 Subject: [PATCH 02/30] Try to fix tests --- metaflow/cli.py | 3 +++ metaflow/cli_components/dump_cmd.py | 8 +++++++- metaflow/cli_components/step_cmd.py | 8 +++++++- metaflow/cmd/develop/stub_generator.py | 11 ++++++++++- test/core/run_tests.py | 3 ++- 5 files changed, 29 insertions(+), 4 deletions(-) diff --git a/metaflow/cli.py b/metaflow/cli.py index 83f340967c6..f19fa605584 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -88,6 +88,9 @@ def echo_always(line, **kwargs): click.secho(ERASE_TO_EOL, **kwargs) +echo = None + + def logger(body="", system_msg=False, head="", bad=False, timestamp=True, nl=True): if timestamp: if timestamp is True: diff --git a/metaflow/cli_components/dump_cmd.py b/metaflow/cli_components/dump_cmd.py index 6845f01fa73..2038d21d771 100644 --- a/metaflow/cli_components/dump_cmd.py +++ b/metaflow/cli_components/dump_cmd.py @@ -2,7 +2,7 @@ from metaflow._vendor import click -from ..cli import echo_always, echo +from ..cli import echo_always, echo_dev_null from ..datastore import TaskDataStoreSet from ..exception import CommandException @@ -38,6 +38,12 @@ ) @click.pass_obj def dump(obj, input_path, private=None, max_value_size=None, include=None, file=None): + + if obj.is_quiet: + echo = echo_dev_null + else: + echo = echo_always + output = {} kwargs = { "show_private": private, diff --git 
a/metaflow/cli_components/step_cmd.py b/metaflow/cli_components/step_cmd.py index 6c7015e60bc..88f300bd679 100644 --- a/metaflow/cli_components/step_cmd.py +++ b/metaflow/cli_components/step_cmd.py @@ -1,7 +1,7 @@ from metaflow._vendor import click from .. import decorators, namespace -from ..cli import echo +from ..cli import echo_always, echo_dev_null from ..cli_args import cli_args from ..exception import CommandException from ..task import MetaflowTask @@ -116,6 +116,12 @@ def step( ubf_context="none", num_parallel=None, ): + + if ctx.obj.is_quiet: + echo = echo_dev_null + else: + echo = echo_always + if ubf_context == "none": ubf_context = None if opt_namespace is not None: diff --git a/metaflow/cmd/develop/stub_generator.py b/metaflow/cmd/develop/stub_generator.py index a43d3e72d87..346b5053d2f 100644 --- a/metaflow/cmd/develop/stub_generator.py +++ b/metaflow/cmd/develop/stub_generator.py @@ -1238,19 +1238,28 @@ def exploit_default(default_value: Any) -> Optional[str]: buff.write(indentation + deco + "\n") buff.write(indentation + "def " + name + "(") kw_only_param = False + has_var_args = False for i, (par_name, parameter) in enumerate(my_sign.parameters.items()): annotation = self._exploit_annotation(parameter.annotation) default = exploit_default(parameter.default) - if kw_only_param and parameter.kind != inspect.Parameter.KEYWORD_ONLY: + if ( + kw_only_param + and not has_var_args + and parameter.kind != inspect.Parameter.KEYWORD_ONLY + ): raise RuntimeError( "In function '%s': cannot have a positional parameter after a " "keyword only parameter" % name ) + + has_var_args |= parameter.kind == inspect.Parameter.VAR_KEYWORD + if ( parameter.kind == inspect.Parameter.KEYWORD_ONLY and not kw_only_param + and not has_var_args ): kw_only_param = True buff.write("*, ") diff --git a/test/core/run_tests.py b/test/core/run_tests.py index 01783f68323..0cece96de17 100644 --- a/test/core/run_tests.py +++ b/test/core/run_tests.py @@ -11,7 +11,8 @@ from multiprocessing import Pool from metaflow._vendor import click -from metaflow.cli import run, start +from metaflow.cli import start +from metaflow.cli_components.run_cmds import run skip_api_executor = False From 1a251e1e65bb0b42fee9071ddb18044de10c7867 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 19 Nov 2024 12:22:20 -0800 Subject: [PATCH 03/30] Support lazy CLI in runner --- metaflow/cli.py | 19 +++-- metaflow/plugins/datatools/s3/s3op.py | 6 +- metaflow/plugins/kubernetes/kubernetes_cli.py | 2 +- metaflow/runner/click_api.py | 76 ++++++++++++------- metaflow/sidecar/sidecar_worker.py | 2 +- 5 files changed, 66 insertions(+), 39 deletions(-) diff --git a/metaflow/cli.py b/metaflow/cli.py index f19fa605584..192a26f4855 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -1,3 +1,4 @@ +import functools import inspect import sys import traceback @@ -88,9 +89,6 @@ def echo_always(line, **kwargs): click.secho(ERASE_TO_EOL, **kwargs) -echo = None - - def logger(body="", system_msg=False, head="", bad=False, timestamp=True, nl=True): if timestamp: if timestamp is True: @@ -147,7 +145,13 @@ def cli(ctx): ) @click.pass_obj def check(obj, warnings=False): - _check(obj.graph, obj.flow, obj.environment, pylint=obj.pylint, warnings=warnings) + if obj.is_quiet: + echo = echo_dev_null + else: + echo = echo_always + _check( + echo, obj.graph, obj.flow, obj.environment, pylint=obj.pylint, warnings=warnings + ) fname = inspect.getfile(obj.flow.__class__) echo( "\n*'{cmd} show'* shows a description of this flow.\n" @@ -219,7 +223,6 @@ def 
version(obj): echo_always(obj.version) -@tracing.cli_entrypoint("cli/start") @decorators.add_decorator_options @click.command( cls=LazyPluginCommandCollection, @@ -227,6 +230,7 @@ def version(obj): lazy_sources=plugins.get_plugin_cli_path(), invoke_without_command=True, ) +@tracing.cli_entrypoint("cli/start") @click.option( "--quiet/--not-quiet", show_default=True, @@ -304,7 +308,6 @@ def start( monitor=None, **deco_options ): - global echo if quiet: echo = echo_dev_null else: @@ -325,8 +328,8 @@ def start( ctx.obj.is_quiet = quiet ctx.obj.graph = FlowGraph(ctx.obj.flow.__class__) ctx.obj.logger = logger - ctx.obj.check = _check ctx.obj.pylint = pylint + ctx.obj.check = functools.partial(_check, echo) ctx.obj.top_cli = cli ctx.obj.package_suffixes = package_suffixes.split(",") @@ -424,7 +427,7 @@ def start( ctx.invoke(check) -def _check(graph, flow, environment, pylint=True, warnings=False, **kwargs): +def _check(echo, graph, flow, environment, pylint=True, warnings=False, **kwargs): echo("Validating your flow...", fg="magenta", bold=False) linter = lint.linter # TODO set linter settings diff --git a/metaflow/plugins/datatools/s3/s3op.py b/metaflow/plugins/datatools/s3/s3op.py index a0479ddd18a..85a4d0e355b 100644 --- a/metaflow/plugins/datatools/s3/s3op.py +++ b/metaflow/plugins/datatools/s3/s3op.py @@ -722,8 +722,8 @@ def cli(): pass -@tracing.cli_entrypoint("s3op/list") @cli.command("list", help="List S3 objects") +@tracing.cli_entrypoint("s3op/list") @click.option( "--recursive/--no-recursive", default=False, @@ -782,8 +782,8 @@ def lst( print(format_result_line(idx, url.prefix, url.url, str(size))) -@tracing.cli_entrypoint("s3op/put") @cli.command(help="Upload files to S3") +@tracing.cli_entrypoint("s3op/put") @click.option( "--file", "files", @@ -977,8 +977,8 @@ def _populate_prefixes(prefixes, inputs): return prefixes, is_transient_retry -@tracing.cli_entrypoint("s3op/get") @cli.command(help="Download files from S3") +@tracing.cli_entrypoint("s3op/get") @click.option( "--recursive/--no-recursive", default=False, diff --git a/metaflow/plugins/kubernetes/kubernetes_cli.py b/metaflow/plugins/kubernetes/kubernetes_cli.py index ef8f1677dae..def709cadea 100644 --- a/metaflow/plugins/kubernetes/kubernetes_cli.py +++ b/metaflow/plugins/kubernetes/kubernetes_cli.py @@ -33,12 +33,12 @@ def kubernetes(): pass -@tracing.cli_entrypoint("kubernetes/step") @kubernetes.command( help="Execute a single task on Kubernetes. This command calls the top-level step " "command inside a Kubernetes pod with the given options. Typically you do not call " "this command directly; it is used internally by Metaflow." ) +@tracing.cli_entrypoint("kubernetes/step") @click.argument("step-name") @click.argument("code-package-sha") @click.argument("code-package-url") diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 6bc1fc9b691..47692113804 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -9,6 +9,7 @@ ) import datetime +import functools import importlib import inspect import itertools @@ -124,6 +125,29 @@ def _method_sanity_check( return method_params +def _lazy_load_command( + cli_collection: click.Group, flow_parameters: List[Parameter], _self, name: str +): + + # Context is not used in get_command so we can pass None. Since we pin click, + # this won't change from under us. 
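    # (In the pinned click, Group.get_command simply looks the name up in
    # self.commands and CommandCollection only walks its sources, so passing
    # None for the context is safe here.)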
+ cmd_obj = cli_collection.get_command(None, name) + if cmd_obj: + if isinstance(cmd_obj, click.Group): + # TODO: possibly check for fake groups with cmd_obj.name in ["cli", "main"] + result = extract_group(cmd_obj, flow_parameters) + elif isinstance(cmd_obj, click.Command): + result = functools.partial(extract_command(cmd_obj, flow_parameters), _self) + else: + raise RuntimeError( + "Cannot handle %s of type %s" % (cmd_obj.name, type(cmd_obj)) + ) + setattr(_self, name, result) + return result + else: + raise AttributeError() + + def get_annotation(param: Union[click.Argument, click.Option]): py_type = click_to_python_types[type(param.type)] if not param.required: @@ -204,19 +228,18 @@ def from_cli(cls, flow_file: str, cli_collection: Callable) -> Callable: with flow_context(flow_cls) as _: add_decorator_options(cli_collection) - class_dict = {"__module__": "metaflow", "_API_NAME": flow_file} - command_groups = cli_collection.sources - for each_group in command_groups: - for _, cmd_obj in each_group.commands.items(): - if isinstance(cmd_obj, click.Group): - # TODO: possibly check for fake groups with cmd_obj.name in ["cli", "main"] - class_dict[cmd_obj.name] = extract_group(cmd_obj, flow_parameters) - elif isinstance(cmd_obj, click.Command): - class_dict[cmd_obj.name] = extract_command(cmd_obj, flow_parameters) - else: - raise RuntimeError( - "Cannot handle %s of type %s" % (cmd_obj.name, type(cmd_obj)) - ) + def getattr_wrapper(_self, name): + # Functools.partial do not automatically bind self (no __get__) + return _self._internal_getattr(_self, name) + + class_dict = { + "__module__": "metaflow", + "_API_NAME": flow_file, + "_internal_getattr": functools.partial( + _lazy_load_command, cli_collection, flow_parameters + ), + "__getattr__": getattr_wrapper, + } to_return = type(flow_file, (MetaflowAPI,), class_dict) to_return.__name__ = flow_file @@ -240,8 +263,8 @@ def _method(_self, **kwargs): return to_return(parent=None, **method_params) m = _method - m.__name__ = cmd_obj.name - m.__doc__ = getattr(cmd_obj, "help", None) + m.__name__ = cli_collection.name + m.__doc__ = getattr(cli_collection, "help", None) m.__signature__ = inspect.signature(_method).replace( parameters=params_sigs.values() ) @@ -324,17 +347,18 @@ def extract_all_params(cmd_obj: Union[click.Command, click.Group]): def extract_group(cmd_obj: click.Group, flow_parameters: List[Parameter]) -> Callable: - class_dict = {"__module__": "metaflow", "_API_NAME": cmd_obj.name} - for _, sub_cmd_obj in cmd_obj.commands.items(): - if isinstance(sub_cmd_obj, click.Group): - # recursion - class_dict[sub_cmd_obj.name] = extract_group(sub_cmd_obj, flow_parameters) - elif isinstance(sub_cmd_obj, click.Command): - class_dict[sub_cmd_obj.name] = extract_command(sub_cmd_obj, flow_parameters) - else: - raise RuntimeError( - "Cannot handle %s of type %s" % (sub_cmd_obj.name, type(sub_cmd_obj)) - ) + def getattr_wrapper(_self, name): + # Functools.partial do not automatically bind self (no __get__) + return _self._internal_getattr(_self, name) + + class_dict = { + "__module__": "metaflow", + "_API_NAME": cmd_obj.name, + "_internal_getattr": functools.partial( + _lazy_load_command, cmd_obj, flow_parameters + ), + "__getattr__": getattr_wrapper, + } resulting_class = type(cmd_obj.name, (MetaflowAPI,), class_dict) resulting_class.__name__ = cmd_obj.name diff --git a/metaflow/sidecar/sidecar_worker.py b/metaflow/sidecar/sidecar_worker.py index fab675a92f9..ebec63ed602 100644 --- a/metaflow/sidecar/sidecar_worker.py +++ 
b/metaflow/sidecar/sidecar_worker.py @@ -48,8 +48,8 @@ def process_messages(worker_type, worker): pass -@tracing.cli_entrypoint("sidecar") @click.command(help="Initialize workers") +@tracing.cli_entrypoint("sidecar") @click.argument("worker-type") def main(worker_type): sidecar_type = SIDECARS.get(worker_type) From 4a6346b0dbe8eaa2313bfa96ac630a07631c4798 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 19 Nov 2024 12:27:48 -0800 Subject: [PATCH 04/30] Stub fixes --- metaflow/cmd/develop/stub_generator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metaflow/cmd/develop/stub_generator.py b/metaflow/cmd/develop/stub_generator.py index 346b5053d2f..939466bef88 100644 --- a/metaflow/cmd/develop/stub_generator.py +++ b/metaflow/cmd/develop/stub_generator.py @@ -1241,7 +1241,6 @@ def exploit_default(default_value: Any) -> Optional[str]: has_var_args = False for i, (par_name, parameter) in enumerate(my_sign.parameters.items()): annotation = self._exploit_annotation(parameter.annotation) - default = exploit_default(parameter.default) if ( @@ -1254,8 +1253,6 @@ def exploit_default(default_value: Any) -> Optional[str]: "keyword only parameter" % name ) - has_var_args |= parameter.kind == inspect.Parameter.VAR_KEYWORD - if ( parameter.kind == inspect.Parameter.KEYWORD_ONLY and not kw_only_param @@ -1266,6 +1263,7 @@ def exploit_default(default_value: Any) -> Optional[str]: if parameter.kind == inspect.Parameter.VAR_KEYWORD: par_name = "**%s" % par_name elif parameter.kind == inspect.Parameter.VAR_POSITIONAL: + has_var_args = True par_name = "*%s" % par_name if default: From 12dfeefb160642a081051789e432882e85fba68c Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 21 Aug 2024 23:13:21 -0700 Subject: [PATCH 05/30] Change top-level flow decorator options to be prefixed by METAFLOW_FLOW_ Previously, options like `branch` and `name` (injected by the project decorator for example) could be set using `METAFLOW_BRANCH`. They now need to be set using `METAFLOW_FLOW_BRANCH`. This change is made to prevent clashes between regular metaflow configuration settings and decorator level options. No other changes are made so `METAFLOW_RUN_MAX_WORKERS` still works as expected and `METAFLOW_PYLINT` as well. --- metaflow/decorators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metaflow/decorators.py b/metaflow/decorators.py index 262102551e4..efd27e2ffeb 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -218,6 +218,7 @@ def add_decorator_options(cmd): ) raise MetaflowInternalError(msg) else: + kwargs["envvar"] = "METAFLOW_FLOW_%s" % option.upper() seen[option] = deco.name cmd.params.insert(0, click.Option(("--" + option,), **kwargs)) return cmd From b1a8ccc03ce445a555b0aed8c7b1e1b8fe1689ab Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 15 Aug 2024 09:59:09 -0700 Subject: [PATCH 06/30] Initial Config object --- metaflow/metaflow_environment.py | 11 +- metaflow/parameters.py | 2 + metaflow/user_configs.py | 266 +++++++++++++++++++++++++++++++ 3 files changed, 276 insertions(+), 3 deletions(-) create mode 100644 metaflow/user_configs.py diff --git a/metaflow/metaflow_environment.py b/metaflow/metaflow_environment.py index 0ac1ca3266c..cea6e18697b 100644 --- a/metaflow/metaflow_environment.py +++ b/metaflow/metaflow_environment.py @@ -6,6 +6,7 @@ from . 
import metaflow_version from metaflow.exception import MetaflowException from metaflow.extension_support import dump_module_info +from metaflow.user_configs import dump_config_values from metaflow.mflog import BASH_MFLOG from . import R @@ -18,7 +19,7 @@ class MetaflowEnvironment(object): TYPE = "local" def __init__(self, flow): - pass + self._flow = flow def init_environment(self, echo): """ @@ -177,7 +178,7 @@ def get_package_commands(self, code_package_url, datastore_type): ] return cmds - def get_environment_info(self, include_ext_info=False): + def get_environment_info(self, full_info=False): # note that this dict goes into the code package # so variables here should be relatively stable (no # timestamps) so the hash won't change all the time @@ -198,10 +199,14 @@ def get_environment_info(self, include_ext_info=False): env["metaflow_r_version"] = R.metaflow_r_version() env["r_version"] = R.r_version() env["r_version_code"] = R.r_version_code() - if include_ext_info: + if full_info: # Information about extension modules (to load them in the proper order) ext_key, ext_val = dump_module_info() env[ext_key] = ext_val + # Information about configurations (to be able to reload them) + user_configs = dump_config_values(self._flow) + if user_configs: + env[user_configs[0]] = user_configs[1] return env def executable(self, step_name, default=None): diff --git a/metaflow/parameters.py b/metaflow/parameters.py index e5778e6cd1e..dcdb7bd2b14 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -210,6 +210,8 @@ def __repr__(self): def deploy_time_eval(value): if isinstance(value, DeployTimeField): return value(deploy_time=True) + elif isinstance(value, DelayedEvaluationParameter): + return value(return_str=True) else: return value diff --git a/metaflow/user_configs.py b/metaflow/user_configs.py new file mode 100644 index 00000000000..1570af25b55 --- /dev/null +++ b/metaflow/user_configs.py @@ -0,0 +1,266 @@ +import json +import os + +from typing import Any, Dict, Optional, Union, TYPE_CHECKING + +from metaflow import INFO_FILE +from metaflow._vendor import click + +from .exception import MetaflowException +from .parameters import ( + DelayedEvaluationParameter, + Parameter, + current_flow, +) +import functools + +if TYPE_CHECKING: + from metaflow import FlowSpec + +# _tracefunc_depth = 0 + + +# def tracefunc(func): +# """Decorates a function to show its trace.""" + +# @functools.wraps(func) +# def tracefunc_closure(*args, **kwargs): +# global _tracefunc_depth +# """The closure.""" +# print(f"{_tracefunc_depth}: {func.__name__}(args={args}, kwargs={kwargs})") +# _tracefunc_depth += 1 +# result = func(*args, **kwargs) +# _tracefunc_depth -= 1 +# print(f"{_tracefunc_depth} => {result}") +# return result + +# return tracefunc_closure + + +def dump_config_values(flow: FlowSpec): + if hasattr(flow, "_user_configs"): + return "user_configs", flow._user_configs + return None, None + + +def load_config_values() -> Optional[Dict[str, Any]]: + try: + with open(INFO_FILE, encoding="utf-8") as contents: + return json.load(contents).get("user_configs", {}) + except IOError: + return None + + +class ConfigValue(object): + # Thin wrapper to allow configuration values to be accessed using a "." notation + # as well as a [] notation. 
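+    #
+    # For example (hypothetical data):
+    #
+    #   cv = ConfigValue({"a": {"b": 1}})
+    #   cv.a.b        # -> 1 (attribute access)
+    #   cv["a"]["b"]  # -> 1 (item access)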
+ + def __init__(self, data: Dict[str, Any]): + self._data = data + + for key, value in data.items(): + if isinstance(value, dict): + value = ConfigValue(value) + elif isinstance(value, list): + value = [ConfigValue(v) for v in value] + setattr(self, key, value) + + def __getitem__(self, key): + value = self._data[key] + if isinstance(value, dict): + value = ConfigValue(value) + elif isinstance(value, list): + value = [ConfigValue(v) for v in value] + return value + + def __repr__(self): + return repr(self._data) + + +class ConfigInput(click.ParamType): + name = "ConfigInput" + + # Contains the values loaded from the INFO file. We make this a class method + # so that if there are multiple configs, we just need to read the file once. + # It is OK to be globally unique because this is only evoked in scenario A.2 (see + # convert method) which means we are already just executing a single task and so + # there is no concern about it "leaking" to things running with Runner for example + # (ie: even if Runner is evoked in that task, we won't "share" this global value's + # usage). + loaded_configs = None # type: Optional[Dict[str, Dict[str, Any]]] + + def __init__(self): + self._flow_cls = getattr(current_flow, "flow_cls", None) + if self._flow_cls is None: + raise MetaflowException("ConfigInput can only be used inside a flow") + if not hasattr(self._flow_cls, "_user_configs"): + self._flow_cls._user_configs = {} + + @staticmethod + def _make_key_name(name: str) -> str: + return "kv." + name.lower() + + @classmethod + def get_config(cls, config_name: str) -> Optional[Dict[str, Any]]: + if cls.loaded_configs is None: + all_configs = load_config_values() + if all_configs is None: + raise MetaflowException( + "Could not load expected configuration values " + "the INFO file. This is a Metaflow bug. Please contact support." + ) + cls.loaded_configs = all_configs + return cls.loaded_configs.get(config_name, None) + + def convert(self, value, param, ctx): + # Click can call convert multiple times, so we need to make sure to only + # convert once. + if isinstance(value, (ConfigValue, DelayedEvaluationParameter)): + return value + + # There are two paths we need to worry about: + # - Scenario A: deploying to a scheduler + # A.1 In this case, when deploying (using `step-functions create` for example), + # the value passed to click (or the default value) will be converted and we + # will: + # - store the configuration in the flow object under _user_configs (so that it + # can later be dumped to the INFO file when packaging) + # - return a DelayedEvaluationParameter object so that when the scheduler + # evaluates it (with return_str set to True), it gets back the *string* + # kv. which indicates that this + # configuration should be fetched from INFO + # A.2 When the scheduler runs the flow, the value returned in A.1 (the kv. + # string) will be passed to convert again. This time, we directly return a + # ConfigValue after having fetched/loaded the configuration from INFO. + # + # - Scenario B: running with the native Runtime + # The value passed in will be similarly stored under _user_configs. We also + # return a DelayedEvaluationParameter object but when the _set_constants in + # the runtime calls it, it calls it with return_str set to False and it will + # return a ConfigValue directly which can then be persisted in the artifact + # store. 
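+        # Concretely (hypothetical names and values): deploying with
+        #   python flow.py --myconfig '{"cpu": 1}' step-functions create
+        # stores {"kv.myconfig": {"cpu": 1}} under _user_configs and hands the
+        # scheduler the "kv.myconfig" reference string; when the scheduled task
+        # calls convert() again with that string, the "kv." prefix triggers a
+        # load of the stored dictionary from the packaged INFO file and a
+        # ConfigValue is returned.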
+ + # The value we get in to convert can be: + # - a dictionary + # - a path to a YAML or JSON file + # - the string representation of a YAML or JSON file + # In all cases, we also store the configuration in the flow object under _user_configs. + # It will *not* be stored as an artifact but is a good place to store it so we + # can access it when packaging to store it in the INFO file. The config itself + # will be stored as regular artifacts (the ConfigValue object basically) + + def _delay_eval(name: str, value: ConfigValue, return_str=False): + if return_str: + # Scenario A.1 when deploy_time_eval is called by the scheduler + # (or, in some cases, some schedulers directly identify the + # DelayedEvaluationParameter value and call it directory with + # return_str=True) + return name + # Scenario B + return value + + if isinstance(value, dict): + # Scenario A.1 or B. + self._flow_cls._user_configs[self._make_key_name(param.name)] = value + return DelayedEvaluationParameter( + param.name, "value", functools.partial(_delay_eval, param.name, value) + ) + elif not isinstance(value, str): + raise MetaflowException( + "Configuration value for '%s' must be a string or a dictionary" + % param.name + ) + + # Here we are sure we have a string + if value.startswith("kv."): + # This is scenario A.2 + value = self.get_config(value) + if value is None: + raise MetaflowException( + "Could not find configuration '%s' in INFO file" % value + ) + return ConfigValue(value) + + elif os.path.isfile(value): + try: + with open(value, "r") as f: + content = f.read() + except OSError as e: + raise MetaflowException( + "Could not read configuration file '%s'" % value + ) from e + try: + value = json.loads(content) + except json.JSONDecodeError as e: + raise MetaflowException( + "Configuration file '%s' is not valid JSON" % value + ) from e + # TODO: Support YAML + self._flow_cls._user_configs[self._make_key_name(param.name)] = value + else: + try: + value = json.loads(value) + except json.JSONDecodeError as e: + raise MetaflowException( + "Configuration value for '%s' is not valid JSON" % param.name + ) from e + # TODO: Support YAML + self._flow_cls._user_configs[self._make_key_name(param.name)] = value + return DelayedEvaluationParameter( + param.name, "value", functools.partial(_delay_eval, param.name, value) + ) + + def __str__(self): + return repr(self) + + def __repr__(self): + return "ConfigInput" + + +ConfigArgType = Union[str, Dict[str, Any]] + + +class Config(Parameter): + """ + Includes a configuration for this flow. + + `Config` is a special type of `Parameter` but differs in a few key areas: + - it is immutable and determined at deploy time (or prior to running if not deploying + to a scheduler) + - as such, it can be used anywhere in your code including in Metaflow decorators + + + Parameters + ---------- + name : str + User-visible parameter name. + default : Union[ConfigArgType, Callable[[ParameterContext], ConfigArgType]] + Default configuration either as a path to a file, the string representation of + a YAML or JSON file or a dictionary. If specified as a function, the function + will be evaluated to get the value to use. + required : bool, default False + Require that the user specified a value for the parameter. + `required=True` implies that the `default` value is ignored. + help : str, optional + Help text to show in `run --help`. + show_default : bool, default True + If True, show the default value in the help text. 
+ """ + + def __init__( + self, + name: str, + required: bool = False, + help: Optional[str] = None, + **kwargs: Dict[str, str] + ): + super(Config, self).__init__( + name, + required=required, + help=help, + type=ConfigInput(), + **kwargs, + ) + + def load_parameter(self, v): + return v From 5b71681279b7a776c42de6bd83df5d64227756b7 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 20 Aug 2024 00:56:13 -0700 Subject: [PATCH 07/30] Move from -- to --config --- metaflow/__init__.py | 2 + metaflow/cli.py | 27 +- metaflow/cli_components/init_cmd.py | 2 +- metaflow/cli_components/run_cmds.py | 3 +- metaflow/cli_components/step_cmd.py | 1 + metaflow/decorators.py | 45 +- metaflow/flowspec.py | 140 +++++- metaflow/package.py | 2 +- metaflow/parameters.py | 16 +- metaflow/plugins/aws/batch/batch_decorator.py | 4 +- .../kubernetes/kubernetes_decorator.py | 4 +- metaflow/plugins/pypi/conda_decorator.py | 13 +- metaflow/plugins/timeout_decorator.py | 4 +- metaflow/runner/click_api.py | 4 +- metaflow/runtime.py | 180 ++++--- metaflow/user_configs.py | 438 +++++++++++++----- metaflow/util.py | 11 + 17 files changed, 668 insertions(+), 228 deletions(-) diff --git a/metaflow/__init__.py b/metaflow/__init__.py index 409922a49d6..951b9acde0c 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -103,6 +103,8 @@ class and related decorators. from .parameters import Parameter, JSONTypeClass, JSONType +from .user_configs import Config, FlowConfig, config_expr, eval_config + # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as # metaflow.datatools. S3 is also a tool that has historically been available at the diff --git a/metaflow/cli.py b/metaflow/cli.py index 192a26f4855..962b80fa2f7 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -35,6 +35,7 @@ from .pylint_wrapper import PyLint from .R import metaflow_r_version, use_r from .util import resolve_identity +from .user_configs import LocalFileInput, config_options ERASE_TO_EOL = "\033[K" HIGHLIGHT = "red" @@ -223,7 +224,10 @@ def version(obj): echo_always(obj.version) +# NOTE: add_decorator_options should be TL because it checks to make sure +# that no option conflict with the ones below @decorators.add_decorator_options +@config_options @click.command( cls=LazyPluginCommandCollection, sources=[cli], @@ -293,6 +297,15 @@ def version(obj): type=click.Choice(MONITOR_SIDECARS), help="Monitoring backend type", ) +@click.option( + "--local-info-file", + type=LocalFileInput(exists=True, readable=True, dir_okay=False, resolve_path=True), + required=False, + default=None, + help="A filename containing a subset of the INFO file. Internal use only.", + hidden=True, + is_eager=True, +) @click.pass_context def start( ctx, @@ -306,6 +319,8 @@ def start( pylint=None, event_logger=None, monitor=None, + local_info_file=None, + config_options=None, **deco_options ): if quiet: @@ -322,11 +337,17 @@ def start( echo(" executing *%s*" % ctx.obj.flow.name, fg="magenta", nl=False) echo(" for *%s*" % resolve_identity(), fg="magenta") + # At this point, we are able to resolve the user-configuration options so we can + # process all those decorators that the user added that will modify the flow based + # on those configurations. 
It is important to do this as early as possible since it + # actually modifies the flow itself + ctx.obj.flow = ctx.obj.flow._process_config_funcs(config_options) + cli_args._set_top_kwargs(ctx.params) ctx.obj.echo = echo ctx.obj.echo_always = echo_always ctx.obj.is_quiet = quiet - ctx.obj.graph = FlowGraph(ctx.obj.flow.__class__) + ctx.obj.graph = ctx.obj.flow._graph ctx.obj.logger = logger ctx.obj.pylint = pylint ctx.obj.check = functools.partial(_check, echo) @@ -377,6 +398,10 @@ def start( ctx.obj.monitor, ) + ctx.obj.config_options = config_options + + decorators._resolve_configs(ctx.obj.flow) + # It is important to initialize flow decorators early as some of the # things they provide may be used by some of the objects initialized after. decorators._init_flow_decorators( diff --git a/metaflow/cli_components/init_cmd.py b/metaflow/cli_components/init_cmd.py index 404a3a6911e..fdd64bdcc54 100644 --- a/metaflow/cli_components/init_cmd.py +++ b/metaflow/cli_components/init_cmd.py @@ -47,5 +47,5 @@ def init(obj, run_id=None, task_id=None, tags=None, **kwargs): obj.monitor, run_id=run_id, ) - obj.flow._set_constants(obj.graph, kwargs) + obj.flow._set_constants(obj.graph, kwargs, obj.config_options) runtime.persist_constants(task_id=task_id) diff --git a/metaflow/cli_components/run_cmds.py b/metaflow/cli_components/run_cmds.py index ec34c3bb4c3..ec31e2d3aab 100644 --- a/metaflow/cli_components/run_cmds.py +++ b/metaflow/cli_components/run_cmds.py @@ -40,6 +40,7 @@ def before_run(obj, tags, decospecs): ) if all_decospecs: decorators._attach_decorators(obj.flow, all_decospecs) + decorators._init(obj.flow, only_non_static=True) obj.graph = FlowGraph(obj.flow.__class__) obj.check(obj.graph, obj.flow, obj.environment, pylint=obj.pylint) @@ -326,7 +327,7 @@ def run( write_latest_run_id(obj, runtime.run_id) write_file(run_id_file, runtime.run_id) - obj.flow._set_constants(obj.graph, kwargs) + obj.flow._set_constants(obj.graph, kwargs, obj.config_options) current._update_env( { "run_id": runtime.run_id, diff --git a/metaflow/cli_components/step_cmd.py b/metaflow/cli_components/step_cmd.py index 88f300bd679..9302870fec6 100644 --- a/metaflow/cli_components/step_cmd.py +++ b/metaflow/cli_components/step_cmd.py @@ -138,6 +138,7 @@ def step( if decospecs: decorators._attach_decorators_to_step(func, decospecs) + decorators._init(ctx.obj.flow, only_non_static=True) step_kwargs = ctx.params # Remove argument `step_name` from `step_kwargs`. 
diff --git a/metaflow/decorators.py b/metaflow/decorators.py index efd27e2ffeb..8df702ebf10 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -12,6 +12,7 @@ ) from .parameters import current_flow +from .user_configs import DelayEvaluator from metaflow._vendor import click @@ -123,6 +124,30 @@ def __init__(self, attributes=None, statically_defined=False): else: raise InvalidDecoratorAttribute(self.name, k, self.defaults) + def resolve_configs(self): + """ + Resolve any configuration options that may be set in the decorator's attributes + """ + + def _resolve_delayed_evaluator(v): + if isinstance(v, DelayEvaluator): + return v() + if isinstance(v, dict): + return { + _resolve_delayed_evaluator(k): _resolve_delayed_evaluator(v) + for k, v in v.items() + } + if isinstance(v, list): + return [_resolve_delayed_evaluator(x) for x in v] + if isinstance(v, tuple): + return tuple(_resolve_delayed_evaluator(x) for x in v) + if isinstance(v, set): + return {_resolve_delayed_evaluator(x) for x in v} + return v + + for k, v in self.attributes.items(): + self.attributes[k] = _resolve_delayed_evaluator(v) + @classmethod def _parse_decorator_spec(cls, deco_spec): if len(deco_spec) == 0: @@ -203,10 +228,13 @@ def get_top_level_options(self): # compare this to parameters.add_custom_parameters def add_decorator_options(cmd): - seen = {} flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: return cmd + + seen = {} + existing_params = set(p.name.lower() for p in cmd.params) + # Add decorator options for deco in flow_decorators(flow_cls): for option, kwargs in deco.options.items(): if option in seen: @@ -217,6 +245,11 @@ def add_decorator_options(cmd): % (deco.name, option, seen[option]) ) raise MetaflowInternalError(msg) + elif deco.name.lower() in existing_params: + raise MetaflowInternalError( + "Flow decorator '%s' uses an option '%s' which is a reserved " + "keyword. Please use a different option name." % (deco.name, option) + ) else: kwargs["envvar"] = "METAFLOW_FLOW_%s" % option.upper() seen[option] = deco.name @@ -511,6 +544,16 @@ def _attach_decorators_to_step(step, decospecs): step.decorators.append(deco) +def _resolve_configs(flow): + # We get the datastore for the _parameters step which can contain + for decorators in flow._flow_decorators.values(): + for deco in decorators: + deco.resolve_configs() + for step in flow: + for deco in step.decorators: + deco.resolve_configs() + + def _init_flow_decorators( flow, graph, environment, flow_datastore, metadata, logger, echo, deco_options ): diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 7a97b9714fa..44e812dd342 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -6,7 +6,7 @@ from itertools import islice from types import FunctionType, MethodType -from typing import Any, Callable, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Tuple from . import cmd_with_io, parameters from .parameters import DelayedEvaluationParameter, Parameter @@ -20,6 +20,7 @@ from .graph import FlowGraph from .unbounded_foreach import UnboundedForeachInput +from .user_configs import ConfigInput, ConfigValue from .util import to_pod from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS @@ -71,10 +72,66 @@ def __new__(cls, name, bases, dct): # This makes sure to give _flow_decorators to each # child class (and not share it with the FlowSpec base # class). This is important to not make a "global" - # _flow_decorators + # _flow_decorators. 
Same deal with user configurations f._flow_decorators = {} + f._user_configs = {} + + # We also cache parameter names to avoid having to recompute what is a parameter + # in the dir of a flow + f._cached_parameters = None + + # Finally attach all functions that need to be evaluated once user configurations + # are available + f._config_funcs = [] + return f + @property + def configs(cls) -> Generator[Tuple[str, "ConfigValue"], None, None]: + """ + Iterate over all user configurations in this flow + + Use this to parameterize your flow based on configuration. As an example: + ``` + def parametrize(flow): + val = next(flow.configs)[1].steps.start.cpu + flow.start = environment(vars={'mycpu': val})(flow.start) + return flow + + @parametrize + class TestFlow(FlowSpec): + config = Config('myconfig.json') + + @step + def start(self): + pass + ``` + can be used to add an environment decorator to the `start` step. + + Yields + ------ + Tuple[str, ConfigValue] + Iterates over the configurations of the flow + """ + # When configs are parsed, they are loaded in _user_configs + for name, value in cls._user_configs.items(): + yield name, ConfigValue(value) + + @property + def steps(cls) -> Generator[Tuple[str, Any], None, None]: + """ + Iterate over all the steps in this flow + + Yields + ------ + Tuple[str, Any] + A tuple with the step name and the step itself + """ + for var in dir(cls): + potential_step = getattr(cls, var) + if callable(potential_step) and hasattr(potential_step, "is_step"): + yield var, potential_step + class FlowSpec(metaclass=FlowSpecMeta): """ @@ -96,6 +153,9 @@ class FlowSpec(metaclass=FlowSpecMeta): "_cached_input", "_graph", "_flow_decorators", + "_user_configs", + "_cached_parameters", + "_config_funcs", "_steps", "index", "input", @@ -148,14 +208,7 @@ def script_name(self) -> str: fname = fname[:-1] return os.path.basename(fname) - def _set_constants(self, graph, kwargs): - from metaflow.decorators import ( - flow_decorators, - ) # To prevent circular dependency - - # Persist values for parameters and other constants (class level variables) - # only once. This method is called before persist_constants is called to - # persist all values set using setattr + def _check_parameters(self): seen = set() for var, param in self._get_parameters(): norm = param.name.lower() @@ -166,13 +219,69 @@ def _set_constants(self, graph, kwargs): "case-insensitive." % param.name ) seen.add(norm) - seen.clear() + + def _process_config_funcs(self, config_options): + current_cls = self.__class__ + + # Fast path for no user configurations + if not self._config_funcs: + return self + + # We need to convert all the user configurations from DelayedEvaluationParameters + # to actual values so they can be used as is in the config functions. + + # We then reset them to be proper parameters so they can be re-evaluated in + # _set_constants + to_reset_params = [] + self._check_parameters() + for var, param in self._get_parameters(): + if not param.IS_FLOW_PARAMETER: + continue + to_reset_params.append((var, param)) + val = config_options[param.name.replace("-", "_").lower()] + if isinstance(val, DelayedEvaluationParameter): + val = val() + setattr(current_cls, var, val) + + # Run all the functions. They will now be able to access the configuration + # values directly from the class + for func in self._config_funcs: + current_cls = func(current_cls) + + # Reset all configs that were already present in the class. + # TODO: This means that users can't override configs directly. 
Not sure if this + # is a pattern we want to support + for var, param in to_reset_params: + setattr(current_cls, var, param) + + # We reset cached_parameters on the very off chance that the user added + # more configurations based on the configuration + current_cls._cached_parameters = None + + # Set the current flow class we are in (the one we just created) + parameters.replace_flow_context(current_cls) + return current_cls(use_cli=False) + + def _set_constants(self, graph, kwargs, config_options): + from metaflow.decorators import ( + flow_decorators, + ) # To prevent circular dependency + + # Persist values for parameters and other constants (class level variables) + # only once. This method is called before persist_constants is called to + # persist all values set using setattr + self._check_parameters() + + seen = set() self._success = True parameters_info = [] for var, param in self._get_parameters(): seen.add(var) - val = kwargs[param.name.replace("-", "_").lower()] + if param.IS_FLOW_PARAMETER: + val = config_options[param.name.replace("-", "_").lower()] + else: + val = kwargs[param.name.replace("-", "_").lower()] # Support for delayed evaluation of parameters. if isinstance(val, DelayedEvaluationParameter): val = val() @@ -218,6 +327,11 @@ def _set_constants(self, graph, kwargs): @classmethod def _get_parameters(cls): + if cls._cached_parameters is not None: + for var in cls._cached_parameters: + yield var, getattr(cls, var) + return + build_list = [] for var in dir(cls): if var[0] == "_" or var in cls._NON_PARAMETERS: continue @@ -226,7 +340,9 @@ def _get_parameters(cls): except: continue if isinstance(val, Parameter): + build_list.append(var) yield var, val + cls._cached_parameters = build_list def _set_datastore(self, datastore): self._datastore = datastore diff --git a/metaflow/package.py b/metaflow/package.py index 30435dce47f..55968a03d65 100644 --- a/metaflow/package.py +++ b/metaflow/package.py @@ -153,7 +153,7 @@ def path_tuples(self): def _add_info(self, tar): info = tarfile.TarInfo(os.path.basename(INFO_FILE)) - env = self.environment.get_environment_info(include_ext_info=True) + env = self.environment.get_environment_info(full_info=True) buf = BytesIO() buf.write(json.dumps(env).encode("utf-8")) buf.seek(0) diff --git a/metaflow/parameters.py b/metaflow/parameters.py index dcdb7bd2b14..ae6ff4168b4 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -72,6 +72,16 @@ def flow_context(flow_cls): context_proto = None +def replace_flow_context(flow_cls): + """ + Replace the current flow context with a new flow class. This is used + when we change the current flow class after having run user configuration functions + """ + current_flow.flow_cls_stack = current_flow.flow_cls_stack[1:] + current_flow.flow_cls_stack.insert(0, flow_cls) + current_flow.flow_cls = current_flow.flow_cls_stack[0] + + class JSONTypeClass(click.ParamType): name = "JSON" @@ -299,6 +309,8 @@ class MyFlow(FlowSpec): If True, show the default value in the help text. 
""" + IS_FLOW_PARAMETER = False + def __init__( self, name: str, @@ -439,7 +451,9 @@ def wrapper(cmd): flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: return cmd - parameters = [p for _, p in flow_cls._get_parameters()] + parameters = [ + p for _, p in flow_cls._get_parameters() if not p.IS_FLOW_PARAMETER + ] for arg in parameters[::-1]: kwargs = arg.option_kwargs(deploy_mode) cmd.params.insert(0, click.Option(("--" + arg.name,), **kwargs)) diff --git a/metaflow/plugins/aws/batch/batch_decorator.py b/metaflow/plugins/aws/batch/batch_decorator.py index 52291d49cd5..16ca5c6f768 100644 --- a/metaflow/plugins/aws/batch/batch_decorator.py +++ b/metaflow/plugins/aws/batch/batch_decorator.py @@ -138,8 +138,8 @@ class BatchDecorator(StepDecorator): supports_conda_environment = True target_platform = "linux-64" - def __init__(self, attributes=None, statically_defined=False): - super(BatchDecorator, self).__init__(attributes, statically_defined) + def resolve_configs(self): + super(BatchDecorator, self).resolve_configs() # If no docker image is explicitly specified, impute a default image. if not self.attributes["image"]: diff --git a/metaflow/plugins/kubernetes/kubernetes_decorator.py b/metaflow/plugins/kubernetes/kubernetes_decorator.py index d051101b91b..8f1d7be42d7 100644 --- a/metaflow/plugins/kubernetes/kubernetes_decorator.py +++ b/metaflow/plugins/kubernetes/kubernetes_decorator.py @@ -151,8 +151,8 @@ class KubernetesDecorator(StepDecorator): supports_conda_environment = True target_platform = "linux-64" - def __init__(self, attributes=None, statically_defined=False): - super(KubernetesDecorator, self).__init__(attributes, statically_defined) + def resolve_configs(self): + super(KubernetesDecorator, self).resolve_configs() if not self.attributes["namespace"]: self.attributes["namespace"] = KUBERNETES_NAMESPACE diff --git a/metaflow/plugins/pypi/conda_decorator.py b/metaflow/plugins/pypi/conda_decorator.py index 74418ae9f54..b6ac1b91d88 100644 --- a/metaflow/plugins/pypi/conda_decorator.py +++ b/metaflow/plugins/pypi/conda_decorator.py @@ -49,7 +49,9 @@ class CondaStepDecorator(StepDecorator): # CONDA_CHANNELS in their environment. For pinning specific packages to specific # conda channels, users can specify channel::package as the package name. - def __init__(self, attributes=None, statically_defined=False): + def resolve_configs(self): + super(CondaStepDecorator, self).resolve_configs() + self._user_defined_attributes = ( attributes.copy() if attributes is not None else {} ) @@ -172,9 +174,7 @@ def runtime_init(self, flow, graph, package, run_id): encoding="utf-8", ) as f: f.write( - json.dumps( - self.environment.get_environment_info(include_ext_info=True) - ) + json.dumps(self.environment.get_environment_info(full_info=True)) ) # Support metaflow extensions. @@ -332,11 +332,16 @@ class CondaFlowDecorator(FlowDecorator): "disabled": None, } +<<<<<<< HEAD def __init__(self, attributes=None, statically_defined=False): self._user_defined_attributes = ( attributes.copy() if attributes is not None else {} ) super(CondaFlowDecorator, self).__init__(attributes, statically_defined) +======= + def resolve_configs(self): + super(CondaFlowDecorator, self).resolve_configs() +>>>>>>> 99d06ae8 (More WIP) # Support legacy 'libraries=' attribute for the decorator. 
self.attributes["packages"] = { diff --git a/metaflow/plugins/timeout_decorator.py b/metaflow/plugins/timeout_decorator.py index e2c04dbcb31..648e318d36a 100644 --- a/metaflow/plugins/timeout_decorator.py +++ b/metaflow/plugins/timeout_decorator.py @@ -37,8 +37,8 @@ class TimeoutDecorator(StepDecorator): name = "timeout" defaults = {"seconds": 0, "minutes": 0, "hours": 0} - def __init__(self, *args, **kwargs): - super(TimeoutDecorator, self).__init__(*args, **kwargs) + def resolve_configs(self): + super().resolve_configs() # Initialize secs in __init__ so other decorators could safely use this # value without worrying about decorator order. # Convert values in attributes to type:int since they can be type:str diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 47692113804..83d7d3f9081 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -224,7 +224,9 @@ def name(self): @classmethod def from_cli(cls, flow_file: str, cli_collection: Callable) -> Callable: flow_cls = extract_flow_class_from_file(flow_file) - flow_parameters = [p for _, p in flow_cls._get_parameters()] + flow_parameters = [ + p for _, p in flow_cls._get_parameters() if not p.IS_FLOW_PARAMETER + ] with flow_context(flow_cls) as _: add_decorator_options(cli_collection) diff --git a/metaflow/runtime.py b/metaflow/runtime.py index 9de81daaa54..d16e96728a5 100644 --- a/metaflow/runtime.py +++ b/metaflow/runtime.py @@ -6,10 +6,12 @@ """ from __future__ import print_function +import json import os import sys import fcntl import re +import tempfile import time import subprocess from datetime import datetime @@ -40,6 +42,9 @@ UBF_CONTROL, UBF_TASK, ) + +from .user_configs import ConfigInput, dump_config_values + import metaflow.tracing as tracing MAX_WORKERS = 16 @@ -471,82 +476,95 @@ def execute(self): else: self._queue_push("start", {}) progress_tstamp = time.time() - try: - # main scheduling loop - exception = None - while self._run_queue or self._active_tasks[0] > 0 or self._cloned_tasks: - # 1. are any of the current workers finished? - if self._cloned_tasks: - finished_tasks = self._cloned_tasks - # reset the list of cloned tasks and let poll_workers handle - # the remaining transition - self._cloned_tasks = [] - else: - finished_tasks = list(self._poll_workers()) - # 2. push new tasks triggered by the finished tasks to the queue - self._queue_tasks(finished_tasks) - # 3. if there are available worker slots, pop and start tasks - # from the queue. - self._launch_workers() - - if time.time() - progress_tstamp > PROGRESS_INTERVAL: - progress_tstamp = time.time() - tasks_print = ", ".join( - [ - "%s (%d running; %d done)" % (k, v[0], v[1]) - for k, v in self._active_tasks.items() - if k != 0 and v[0] > 0 - ] - ) - if self._active_tasks[0] == 0: - msg = "No tasks are running." + with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file: + # Configurations are passed through a file to avoid overloading the + # command-line. We only need to create this file once and it can be reused + # for any task launch + config_key, config_value = dump_config_values(self._flow) + if config_value: + json.dump({config_key: config_value}, config_file) + config_file.flush() + self._config_file_name = config_file.name + else: + self._config_file_name = None + try: + # main scheduling loop + exception = None + while ( + self._run_queue or self._active_tasks[0] > 0 or self._cloned_tasks + ): + # 1. are any of the current workers finished? 
+ if self._cloned_tasks: + finished_tasks = self._cloned_tasks + # reset the list of cloned tasks and let poll_workers handle + # the remaining transition + self._cloned_tasks = [] else: - if self._active_tasks[0] == 1: - msg = "1 task is running: " + finished_tasks = list(self._poll_workers()) + # 2. push new tasks triggered by the finished tasks to the queue + self._queue_tasks(finished_tasks) + # 3. if there are available worker slots, pop and start tasks + # from the queue. + self._launch_workers() + + if time.time() - progress_tstamp > PROGRESS_INTERVAL: + progress_tstamp = time.time() + tasks_print = ", ".join( + [ + "%s (%d running; %d done)" % (k, v[0], v[1]) + for k, v in self._active_tasks.items() + if k != 0 and v[0] > 0 + ] + ) + if self._active_tasks[0] == 0: + msg = "No tasks are running." else: - msg = "%d tasks are running: " % self._active_tasks[0] - msg += "%s." % tasks_print + if self._active_tasks[0] == 1: + msg = "1 task is running: " + else: + msg = "%d tasks are running: " % self._active_tasks[0] + msg += "%s." % tasks_print - self._logger(msg, system_msg=True) + self._logger(msg, system_msg=True) - if len(self._run_queue) == 0: - msg = "No tasks are waiting in the queue." - else: - if len(self._run_queue) == 1: - msg = "1 task is waiting in the queue: " + if len(self._run_queue) == 0: + msg = "No tasks are waiting in the queue." else: - msg = "%d tasks are waiting in the queue." % len( - self._run_queue - ) + if len(self._run_queue) == 1: + msg = "1 task is waiting in the queue: " + else: + msg = "%d tasks are waiting in the queue." % len( + self._run_queue + ) - self._logger(msg, system_msg=True) - if len(self._unprocessed_steps) > 0: - if len(self._unprocessed_steps) == 1: - msg = "%s step has not started" % ( - next(iter(self._unprocessed_steps)), - ) - else: - msg = "%d steps have not started: " % len( - self._unprocessed_steps - ) - msg += "%s." % ", ".join(self._unprocessed_steps) self._logger(msg, system_msg=True) - - except KeyboardInterrupt as ex: - self._logger("Workflow interrupted.", system_msg=True, bad=True) - self._killall() - exception = ex - raise - except Exception as ex: - self._logger("Workflow failed.", system_msg=True, bad=True) - self._killall() - exception = ex - raise - finally: - # on finish clean tasks - for step in self._flow: - for deco in step.decorators: - deco.runtime_finished(exception) + if len(self._unprocessed_steps) > 0: + if len(self._unprocessed_steps) == 1: + msg = "%s step has not started" % ( + next(iter(self._unprocessed_steps)), + ) + else: + msg = "%d steps have not started: " % len( + self._unprocessed_steps + ) + msg += "%s." % ", ".join(self._unprocessed_steps) + self._logger(msg, system_msg=True) + + except KeyboardInterrupt as ex: + self._logger("Workflow interrupted.", system_msg=True, bad=True) + self._killall() + exception = ex + raise + except Exception as ex: + self._logger("Workflow failed.", system_msg=True, bad=True) + self._killall() + exception = ex + raise + finally: + # on finish clean tasks + for step in self._flow: + for deco in step.decorators: + deco.runtime_finished(exception) # assert that end was executed and it was successful if ("end", ()) in self._finished: @@ -957,7 +975,7 @@ def _launch_worker(self, task): ) return - worker = Worker(task, self._max_log_size) + worker = Worker(task, self._max_log_size, self._config_file_name) for fd in worker.fds(): self._workers[fd] = worker self._poll.add(fd) @@ -1237,7 +1255,6 @@ def __init__( # Open the output datastore only if the task is not being cloned. 
if not self._is_cloned: self.new_attempt() - for deco in decos: deco.runtime_task_created( self._ds, @@ -1504,6 +1521,14 @@ def __init__(self, task): for deco in flow_decorators(self.task.flow): self.top_level_options.update(deco.get_top_level_options()) + # We also pass configuration options using the kv. syntax which will cause + # the configuration options to be loaded from the INFO file (or local-info-file + # in the case of the local runtime) + if self.task.flow._user_configs: + self.top_level_options["config"] = [ + (k, ConfigInput.make_key_name(k)) for k in self.task.flow._user_configs + ] + self.commands = ["step"] self.command_args = [self.task.step] self.command_options = { @@ -1537,7 +1562,9 @@ def _options(mapping): for value in v: yield "--%s" % k if not isinstance(value, bool): - yield to_unicode(value) + value = value if isinstance(value, tuple) else (value,) + for vv in value: + yield to_unicode(vv) args = list(self.entrypoint) args.extend(_options(self.top_level_options)) @@ -1554,8 +1581,9 @@ def __str__(self): class Worker(object): - def __init__(self, task, max_logs_size): + def __init__(self, task, max_logs_size, config_file_name): self.task = task + self._config_file_name = config_file_name self._proc = self._launch() if task.retries > task.user_code_retries: @@ -1607,6 +1635,12 @@ def _launch(self): self.task.user_code_retries, self.task.ubf_context, ) + + # Add user configurations using a file to avoid using up too much space on the + # command line + if self._config_file_name: + args.top_level_options["local-info-file"] = self._config_file_name + # Pass configuration options env.update(args.get_env()) env["PYTHONUNBUFFERED"] = "x" tracing.inject_tracing_vars(env) diff --git a/metaflow/user_configs.py b/metaflow/user_configs.py index 1570af25b55..ed255e18996 100644 --- a/metaflow/user_configs.py +++ b/metaflow/user_configs.py @@ -1,15 +1,16 @@ import json import os -from typing import Any, Dict, Optional, Union, TYPE_CHECKING +from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING from metaflow import INFO_FILE from metaflow._vendor import click -from .exception import MetaflowException +from .exception import MetaflowException, MetaflowInternalError from .parameters import ( DelayedEvaluationParameter, Parameter, + ParameterContext, current_flow, ) import functools @@ -37,21 +38,23 @@ # return tracefunc_closure -def dump_config_values(flow: FlowSpec): - if hasattr(flow, "_user_configs"): +def dump_config_values(flow: "FlowSpec"): + if flow._user_configs: return "user_configs", flow._user_configs return None, None -def load_config_values() -> Optional[Dict[str, Any]]: +def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[str, Any]]: + if info_file is None: + info_file = INFO_FILE try: - with open(INFO_FILE, encoding="utf-8") as contents: + with open(info_file, encoding="utf-8") as contents: return json.load(contents).get("user_configs", {}) except IOError: return None -class ConfigValue(object): +class ConfigValue: # Thin wrapper to allow configuration values to be accessed using a "." notation # as well as a [] notation. 
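+    # Note: values that are dictionaries come back wrapped as ConfigValue
+    # objects; everything else (lists included) is returned as-is.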
@@ -61,25 +64,47 @@ def __init__(self, data: Dict[str, Any]): for key, value in data.items(): if isinstance(value, dict): value = ConfigValue(value) - elif isinstance(value, list): - value = [ConfigValue(v) for v in value] setattr(self, key, value) def __getitem__(self, key): value = self._data[key] if isinstance(value, dict): value = ConfigValue(value) - elif isinstance(value, list): - value = [ConfigValue(v) for v in value] return value def __repr__(self): return repr(self._data) + def __str__(self): + return json.dumps(self._data) + -class ConfigInput(click.ParamType): +class PathOrStr(click.ParamType): name = "ConfigInput" + def convert(self, value, param, ctx): + if value is None: + return None + + if isinstance(value, dict): + return "converted:" + json.dumps(value) + + if value.startswith("converted:"): + return value + + if os.path.isfile(value): + try: + with open(value, "r") as f: + content = f.read() + except OSError as e: + raise click.UsageError( + "Could not read configuration file '%s'" % value + ) from e + return "converted:" + content + return "converted:" + value + + +class ConfigInput: # Contains the values loaded from the INFO file. We make this a class method # so that if there are multiple configs, we just need to read the file once. # It is OK to be globally unique because this is only evoked in scenario A.2 (see @@ -88,127 +113,81 @@ class ConfigInput(click.ParamType): # (ie: even if Runner is evoked in that task, we won't "share" this global value's # usage). loaded_configs = None # type: Optional[Dict[str, Dict[str, Any]]] + info_file = None # type: Optional[str] - def __init__(self): - self._flow_cls = getattr(current_flow, "flow_cls", None) - if self._flow_cls is None: - raise MetaflowException("ConfigInput can only be used inside a flow") - if not hasattr(self._flow_cls, "_user_configs"): - self._flow_cls._user_configs = {} + def __init__( + self, + req_configs: List[str], + parsers: Dict[str, Callable[[str], Dict[str, Any]]], + ): + self._req_configs = req_configs + self._parsers = parsers @staticmethod - def _make_key_name(name: str) -> str: + def make_key_name(name: str) -> str: return "kv." + name.lower() + @classmethod + def set_info_file(cls, info_file: str): + cls.info_file = info_file + @classmethod def get_config(cls, config_name: str) -> Optional[Dict[str, Any]]: if cls.loaded_configs is None: - all_configs = load_config_values() + all_configs = load_config_values(cls.info_file) if all_configs is None: raise MetaflowException( "Could not load expected configuration values " - "the INFO file. This is a Metaflow bug. Please contact support." + "from the INFO file. This is a Metaflow bug. Please contact support." ) cls.loaded_configs = all_configs return cls.loaded_configs.get(config_name, None) - def convert(self, value, param, ctx): - # Click can call convert multiple times, so we need to make sure to only - # convert once. - if isinstance(value, (ConfigValue, DelayedEvaluationParameter)): - return value - - # There are two paths we need to worry about: - # - Scenario A: deploying to a scheduler - # A.1 In this case, when deploying (using `step-functions create` for example), - # the value passed to click (or the default value) will be converted and we - # will: - # - store the configuration in the flow object under _user_configs (so that it - # can later be dumped to the INFO file when packaging) - # - return a DelayedEvaluationParameter object so that when the scheduler - # evaluates it (with return_str set to True), it gets back the *string* - # kv. 
which indicates that this - # configuration should be fetched from INFO - # A.2 When the scheduler runs the flow, the value returned in A.1 (the kv. - # string) will be passed to convert again. This time, we directly return a - # ConfigValue after having fetched/loaded the configuration from INFO. - # - # - Scenario B: running with the native Runtime - # The value passed in will be similarly stored under _user_configs. We also - # return a DelayedEvaluationParameter object but when the _set_constants in - # the runtime calls it, it calls it with return_str set to False and it will - # return a ConfigValue directly which can then be persisted in the artifact - # store. - - # The value we get in to convert can be: - # - a dictionary - # - a path to a YAML or JSON file - # - the string representation of a YAML or JSON file - # In all cases, we also store the configuration in the flow object under _user_configs. - # It will *not* be stored as an artifact but is a good place to store it so we - # can access it when packaging to store it in the INFO file. The config itself - # will be stored as regular artifacts (the ConfigValue object basically) - - def _delay_eval(name: str, value: ConfigValue, return_str=False): - if return_str: - # Scenario A.1 when deploy_time_eval is called by the scheduler - # (or, in some cases, some schedulers directly identify the - # DelayedEvaluationParameter value and call it directory with - # return_str=True) - return name - # Scenario B - return value - - if isinstance(value, dict): - # Scenario A.1 or B. - self._flow_cls._user_configs[self._make_key_name(param.name)] = value - return DelayedEvaluationParameter( - param.name, "value", functools.partial(_delay_eval, param.name, value) - ) - elif not isinstance(value, str): - raise MetaflowException( - "Configuration value for '%s' must be a string or a dictionary" - % param.name + def process_configs(self, ctx, param, value): + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + # This is an error + raise MetaflowInternalError( + "Config values should be processed for a FlowSpec" ) - # Here we are sure we have a string - if value.startswith("kv."): - # This is scenario A.2 - value = self.get_config(value) - if value is None: - raise MetaflowException( - "Could not find configuration '%s' in INFO file" % value - ) - return ConfigValue(value) + # First validate if we have all the required parameters + # Here value is a list of tuples. Each tuple has the name of the configuration + # and the string representation of the config (it was already read + # from a file if applicable). 
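+        # For illustration (hypothetical names), value may look like:
+        #   (("myconfig", 'converted:{"cpu": 1}'), ("other", "converted:kv.other"))
+        # where the "converted:" prefix was added by PathOrStr.convert above.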
+ missing = set(self._req_configs) - set([v[0] for v in value]) + if missing: + raise click.UsageError( + "Missing required configuration values: %s" % ", ".join(missing) + ) - elif os.path.isfile(value): - try: - with open(value, "r") as f: - content = f.read() - except OSError as e: - raise MetaflowException( - "Could not read configuration file '%s'" % value - ) from e - try: - value = json.loads(content) - except json.JSONDecodeError as e: - raise MetaflowException( - "Configuration file '%s' is not valid JSON" % value - ) from e - # TODO: Support YAML - self._flow_cls._user_configs[self._make_key_name(param.name)] = value - else: - try: - value = json.loads(value) - except json.JSONDecodeError as e: - raise MetaflowException( - "Configuration value for '%s' is not valid JSON" % param.name - ) from e - # TODO: Support YAML - self._flow_cls._user_configs[self._make_key_name(param.name)] = value - return DelayedEvaluationParameter( - param.name, "value", functools.partial(_delay_eval, param.name, value) - ) + to_return = {} + for name, val in value: + name = name.lower() + val = val[10:] # Remove the "converted:" prefix + if val.startswith("kv."): + # This means to load it from a file + read_value = self.get_config(val[3:]) + if read_value is None: + raise click.UsageError( + "Could not find configuration '%s' in INFO file" % val + ) + flow_cls._user_configs[name] = read_value + to_return[name] = ConfigValue(read_value) + else: + if self._parsers[name]: + read_value = self._parsers[name](val) + else: + try: + read_value = json.loads(val) + except json.JSONDecodeError as e: + raise click.UsageError( + "Configuration value for '%s' is not valid JSON" % name + ) from e + # TODO: Support YAML + flow_cls._user_configs[name] = read_value + to_return[name] = ConfigValue(read_value) + return to_return def __str__(self): return repr(self) @@ -217,9 +196,140 @@ def __repr__(self): return "ConfigInput" +class LocalFileInput(click.Path): + name = "LocalFileInput" + + def convert(self, value, param, ctx): + super().convert(value, param, ctx) + ConfigInput.set_info_file(value) + # This purposefully returns None which means it is *not* passed down + # when commands use ctx.parent.parent.params to get all the configuration + # values. + + def __str__(self): + return repr(self) + + def __repr__(self): + return "LocalFileInput" + + ConfigArgType = Union[str, Dict[str, Any]] +class DelayEvaluator: + """ + Small wrapper that allows the evaluation of a Config() value in a delayed manner. + This is used when we want to use config.* values in decorators for example. + """ + + def __init__(self, config_expr: str, is_var_only=True): + self._config_expr = config_expr + if is_var_only: + self._access = [] + else: + self._access = None + self._is_var_only = is_var_only + + def __getattr__(self, name): + if self._access is None: + raise AttributeError() + self._access.append(name) + return self + + def __call__(self): + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + # We are not executing inside a flow (ie: not the CLI) + raise MetaflowException( + "Config object can only be used directly in the FlowSpec defining them. " + "If using outside of the FlowSpec, please use ConfigEval" + ) + if self._access is not None: + # Build the final expression by adding all the fields in access as . 
fields
+            self._config_expr = ".".join([self._config_expr] + self._access)
+        # Evaluate the expression setting the config values as local variables
+        return eval(
+            self._config_expr,
+            globals(),
+            {k: ConfigValue(v) for k, v in flow_cls._user_configs.items()},
+        )
+
+
+def config_expr(expr: str) -> DelayEvaluator:
+    return DelayEvaluator(expr)
+
+
+def eval_config(
+    f: Callable[["FlowSpec"], "FlowSpec"]
+) -> Callable[["FlowSpec"], "FlowSpec"]:
+    """
+    Decorator to allow you to add Python decorators to a FlowSpec that make use of
+    user configurations.
+
+    As an example:
+
+    ```
+    def parameterize(f):
+        for s in f:
+            # Iterate over all the steps
+            if s.name in f.config.add_env_to_steps:
+                setattr(f, s.name, environment(vars={**f.config.env_vars})(s))
+        return f
+
+    @eval_config(parameterize)
+    class MyFlow(FlowSpec):
+        config = Config("config")
+        ...
+    ```
+
+    allows you to add an environment decorator to all steps in `add_env_to_steps`. Both
+    the steps to add this decorator to and the values to add are extracted from the
+    configuration passed to the Flow through config.
+
+    Parameters
+    ----------
+    f : Callable[[FlowSpec], FlowSpec]
+        Decorator function
+
+    Returns
+    -------
+    Callable[[FlowSpec], FlowSpec]
+        A decorator that registers `f` to run once user configurations are
+        available and returns the modified FlowSpec.
+    """
+
+    def _wrapper(flow_spec: "FlowSpec"):
+        flow_spec._config_funcs.append(f)
+        return flow_spec
+
+    return _wrapper
+
+
+class FlowConfig(DelayEvaluator):
+    def __init__(self, config_name: str):
+        """
+        Small wrapper to allow you to refer to a flow's configuration in a flow-level
+        decorator.
+
+        As an example:
+
+        ```
+        @project(name=FlowConfig("config").project.name)
+        class MyFlow(FlowSpec):
+            config = Config("config")
+            ...
+        ```
+
+        This will allow you to specify a `project.name` value in your configuration
+        and have it used in the flow-level decorator.
+
+        Without this construct, it would be difficult to access `config` inside the
+        arguments of the decorator.
+
+        Parameters
+        ----------
+        config_name : str
+            Name of the configuration being used. This should be the name given to
+            the `Config` constructor.
+        """
+        super().__init__(config_name, is_var_only=True)
+
+
 class Config(Parameter):
     """
     Includes a configuration for this flow.
@@ -233,34 +343,110 @@ class Config(Parameter):
     Parameters
     ----------
     name : str
-        User-visible parameter name.
-    default : Union[ConfigArgType, Callable[[ParameterContext], ConfigArgType]]
-        Default configuration either as a path to a file, the string representation of
-        a YAML or JSON file or a dictionary. If specified as a function, the function
-        will be evaluated to get the value to use.
+        User-visible configuration name.
+    default : Union[str, Dict[str, Any], Callable[[ParameterContext], Union[str, Dict[str, Any]]]], optional, default None
+        Default configuration either as a path to a file, the string representation
+        of a YAML or JSON file, or a dictionary. A function implies that the value
+        will be computed using that function.
+    help : str, optional, default None
+        Help text to show in `run --help`.
     required : bool, default False
         Require that the user specified a value for the parameter.
-        `required=True` implies that the `default` value is ignored.
-    help : str, optional
-        Help text to show in `run --help`.
+        `required=True` implies that the `default` is not used.
+    parser : Callable[[str], Dict[str, Any]], optional, default None
+        Function to parse the configuration string into a dictionary. If not
+        specified, the value is parsed as JSON.
     show_default : bool, default True
         If True, show the default value in the help text.
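+
+    Examples
+    --------
+    A minimal sketch, assuming a local file `myconfig.json` exists:
+    ```
+    class MyFlow(FlowSpec):
+        config = Config("config", default="myconfig.json")
+
+        @step
+        def start(self):
+            # Values support attribute access, e.g. a hypothetical `some_key`
+            print(self.config.some_key)
+            ...
+    ```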
""" + IS_FLOW_PARAMETER = True + def __init__( self, name: str, - required: bool = False, + default: Optional[ + Union[ + str, + Dict[str, Any], + Callable[[ParameterContext], Union[str, Dict[str, Any]]], + ] + ] = None, help: Optional[str] = None, - **kwargs: Dict[str, str] + required: bool = False, + parser: Optional[Callable[[str], Dict[str, Any]]] = None, + **kwargs: Dict[str, str], ): super(Config, self).__init__( name, + default=default, required=required, help=help, - type=ConfigInput(), + type=str, **kwargs, ) + if isinstance(kwargs.get("default", None), str): + kwargs["default"] = json.dumps(kwargs["default"]) + self.parser = parser def load_parameter(self, v): return v + + def __getattr__(self, name): + ev = DelayEvaluator(self.name, is_var_only=True) + return ev.__getattr__(name) + + +def config_options(cmd): + help_strs = [] + required_names = [] + defaults = [] + config_seen = set() + parsers = {} + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + return cmd + + parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] + # List all the configuration options + for arg in parameters[::-1]: + kwargs = arg.option_kwargs(False) + if arg.name.lower() in config_seen: + msg = ( + "Multiple configurations use the same name '%s'. Note that names are " + "case-insensitive. Please change the " + "names of some of your configurations" % arg.name + ) + raise MetaflowException(msg) + config_seen.add(arg.name.lower()) + if kwargs["required"]: + required_names.append(arg.name) + if kwargs.get("default") is not None: + defaults.append((arg.name.lower(), kwargs["default"])) + else: + defaults.append(None) + help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) + parsers[arg.name.lower()] = arg.parser + + print("DEFAULTS %s" % defaults) + if not config_seen: + # No configurations -- don't add anything + return cmd + + help_str = ( + "Configuration options for the flow. " + "Multiple configurations can be specified." + ) + help_str = "\n\n".join([help_str] + help_strs) + cmd.params.insert( + 0, + click.Option( + ["--config", "config_options"], + nargs=2, + multiple=True, + type=click.Tuple([click.Choice(config_seen), PathOrStr()]), + callback=ConfigInput(required_names, parsers).process_configs, + help=help_str, + envvar="METAFLOW_FLOW_CONFIG", + show_default=False, + default=defaults, + ), + ) + return cmd diff --git a/metaflow/util.py b/metaflow/util.py index 03add41ee0c..6243abd31b9 100644 --- a/metaflow/util.py +++ b/metaflow/util.py @@ -296,6 +296,9 @@ def get_metaflow_root(): def dict_to_cli_options(params): + # Prevent circular imports + from metaflow.user_configs import ConfigInput, ConfigValue + for k, v in params.items(): # Omit boolean options set to false or None, but preserve options with an empty # string argument. @@ -304,6 +307,14 @@ def dict_to_cli_options(params): # keyword in Python, so we call it 'decospecs' in click args if k == "decospecs": k = "with" + if k == "config_options": + # Special handling here since we gather them all in one option but actually + # need to send them one at a time using --config kv. 
+ for config_name in v.keys(): + yield "--config" + yield to_unicode(config_name) + yield to_unicode(ConfigInput.make_key_name(config_name)) + continue k = k.replace("_", "-") v = v if isinstance(v, (list, tuple, set)) else [v] for value in v: From fafa977ec63b05eb44114127594205d4f46ce36c Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 28 Aug 2024 23:51:01 -0700 Subject: [PATCH 08/30] Fix runner use of configs --- metaflow/runner/click_api.py | 2 ++ metaflow/user_configs.py | 9 ++------- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 83d7d3f9081..4b69d7b61e3 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -39,6 +39,7 @@ from metaflow.exception import MetaflowException from metaflow.includefile import FilePathClass from metaflow.parameters import JSONTypeClass, flow_context +from metaflow.user_configs import LocalFileInput # Define a recursive type alias for JSON JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None] @@ -56,6 +57,7 @@ File: str, JSONTypeClass: JSON, FilePathClass: str, + LocalFileInput: str, } diff --git a/metaflow/user_configs.py b/metaflow/user_configs.py index ed255e18996..bd3f7a176d5 100644 --- a/metaflow/user_configs.py +++ b/metaflow/user_configs.py @@ -372,15 +372,10 @@ def __init__( help: Optional[str] = None, required: bool = False, parser: Optional[Callable[[str], Dict[str, Any]]] = None, - **kwargs: Dict[str, str], + **kwargs: Dict[str, str] ): super(Config, self).__init__( - name, - default=default, - required=required, - help=help, - type=str, - **kwargs, + name, default=default, required=required, help=help, type=str, **kwargs ) if isinstance(kwargs.get("default", None), str): kwargs["default"] = json.dumps(kwargs["default"]) From b386b11033b9f99f22f9c1a621b80690a1284f30 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Fri, 6 Sep 2024 01:53:54 -0700 Subject: [PATCH 09/30] Multiple fix plus sample flow Several fixes: - fixed an issue with default values - better handling of parameter defaults as configs - handle config defaults as functions - ConfigValue is more "dict"-like - made .configs and .steps work properly - renamed resolve_configs to init --- metaflow/__init__.py | 2 +- metaflow/cli.py | 13 +- metaflow/decorators.py | 11 +- metaflow/flowspec.py | 7 +- metaflow/includefile.py | 2 + metaflow/parameters.py | 15 +- metaflow/plugins/aws/batch/batch_decorator.py | 4 +- .../kubernetes/kubernetes_decorator.py | 4 +- metaflow/plugins/pypi/conda_decorator.py | 16 +- metaflow/plugins/timeout_decorator.py | 4 +- metaflow/user_configs.py | 185 ++++++++++++------ test/core/metaflow_test/__init__.py | 1 + test/core/metaflow_test/formatter.py | 6 +- test_config/config2.json | 4 + test_config/helloconfig.py | 139 +++++++++++++ 15 files changed, 323 insertions(+), 90 deletions(-) create mode 100644 test_config/config2.json create mode 100644 test_config/helloconfig.py diff --git a/metaflow/__init__.py b/metaflow/__init__.py index 951b9acde0c..4f58772c78d 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -103,7 +103,7 @@ class and related decorators. 
from .parameters import Parameter, JSONTypeClass, JSONType -from .user_configs import Config, FlowConfig, config_expr, eval_config +from .user_configs import Config, config_expr, eval_config # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as diff --git a/metaflow/cli.py b/metaflow/cli.py index 962b80fa2f7..13c81082bb6 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -235,11 +235,15 @@ def version(obj): invoke_without_command=True, ) @tracing.cli_entrypoint("cli/start") +# Quiet is eager to make sure it is available when processing --config options since +# we need it to construct a context to pass to any DeployTimeField for the default +# value. @click.option( "--quiet/--not-quiet", show_default=True, default=False, help="Suppress unnecessary messages", + is_eager=True, ) @click.option( "--metadata", @@ -255,12 +259,14 @@ def version(obj): type=click.Choice(["local"] + [m.TYPE for m in ENVIRONMENTS]), help="Execution environment type", ) +# See comment for --quiet @click.option( "--datastore", default=DEFAULT_DATASTORE, show_default=True, type=click.Choice([d.TYPE for d in DATASTORES]), help="Data backend type", + is_eager=True, ) @click.option("--datastore-root", help="Root path for datastore") @click.option( @@ -400,7 +406,7 @@ def start( ctx.obj.config_options = config_options - decorators._resolve_configs(ctx.obj.flow) + decorators._init(ctx.obj.flow) # It is important to initialize flow decorators early as some of the # things they provide may be used by some of the objects initialized after. @@ -424,7 +430,10 @@ def start( # initialize current and parameter context for deploy-time parameters current._set_env(flow=ctx.obj.flow, is_running=False) parameters.set_parameter_context( - ctx.obj.flow.name, ctx.obj.echo, ctx.obj.flow_datastore + ctx.obj.flow.name, + ctx.obj.echo, + ctx.obj.flow_datastore, + dict(ctx.obj.flow.configs), ) if ctx.invoked_subcommand not in ("run", "resume"): diff --git a/metaflow/decorators.py b/metaflow/decorators.py index 8df702ebf10..b9bf32a21a7 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -124,9 +124,10 @@ def __init__(self, attributes=None, statically_defined=False): else: raise InvalidDecoratorAttribute(self.name, k, self.defaults) - def resolve_configs(self): + def init(self): """ - Resolve any configuration options that may be set in the decorator's attributes + Initializes the decorator. In general, any operation you would do in __init__ + should be done here. 
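+ + Note that, unlike __init__, this method is invoked once user configuration + values have been resolved, so attribute values that depend on configs can + safely be inspected here.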
""" def _resolve_delayed_evaluator(v): @@ -544,14 +545,14 @@ def _attach_decorators_to_step(step, decospecs): step.decorators.append(deco) -def _resolve_configs(flow): +def _init(flow): # We get the datastore for the _parameters step which can contain for decorators in flow._flow_decorators.values(): for deco in decorators: - deco.resolve_configs() + deco.init() for step in flow: for deco in step.decorators: - deco.resolve_configs() + deco.init() def _init_flow_decorators( diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 44e812dd342..0ea5eea2a22 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -238,7 +238,8 @@ def _process_config_funcs(self, config_options): if not param.IS_FLOW_PARAMETER: continue to_reset_params.append((var, param)) - val = config_options[param.name.replace("-", "_").lower()] + # Note that a config with no default and not required will be None + val = config_options.get(param.name.replace("-", "_").lower()) if isinstance(val, DelayedEvaluationParameter): val = val() setattr(current_cls, var, val) @@ -279,7 +280,7 @@ def _set_constants(self, graph, kwargs, config_options): for var, param in self._get_parameters(): seen.add(var) if param.IS_FLOW_PARAMETER: - val = config_options[param.name.replace("-", "_").lower()] + val = config_options.get(param.name.replace("-", "_").lower()) else: val = kwargs[param.name.replace("-", "_").lower()] # Support for delayed evaluation of parameters. @@ -361,6 +362,8 @@ def __iter__(self): return iter(self._steps) def __getattr__(self, name: str): + if name in ("configs", "steps"): + return getattr(self.__class__, name) if self._datastore and name in self._datastore: # load the attribute from the datastore... x = self._datastore[name] diff --git a/metaflow/includefile.py b/metaflow/includefile.py index 4bc16172863..b93f8c49224 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -20,6 +20,7 @@ ) from .plugins import DATACLIENTS +from .user_configs import ConfigValue from .util import get_username import functools @@ -136,6 +137,7 @@ def convert(self, value, param, ctx): parameter_name=param.name, logger=ctx.obj.echo, ds_type=ctx.obj.datastore_impl.TYPE, + configs=ConfigValue(dict(ctx.obj.flow.__class__.configs)), ) if len(value) > 0 and (value.startswith("{") or value.startswith('"{')): diff --git a/metaflow/parameters.py b/metaflow/parameters.py index ae6ff4168b4..927930ecc8a 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from threading import local -from typing import Any, Callable, Dict, NamedTuple, Optional, Type, Union +from typing import Any, Callable, Dict, NamedTuple, Optional, TYPE_CHECKING, Type, Union from metaflow._vendor import click @@ -14,6 +14,9 @@ MetaflowException, ) +if TYPE_CHECKING: + from .user_configs import ConfigValue + try: # Python2 strtype = basestring @@ -32,6 +35,7 @@ ("parameter_name", str), ("logger", Callable[..., None]), ("ds_type", str), + ("configs", "ConfigValue"), ], ) @@ -227,7 +231,9 @@ def deploy_time_eval(value): # this is called by cli.main -def set_parameter_context(flow_name, echo, datastore): +def set_parameter_context(flow_name, echo, datastore, configs): + from .user_configs import ConfigValue # Prevent circular dependency + global context_proto context_proto = ParameterContext( flow_name=flow_name, @@ -235,6 +241,7 @@ def set_parameter_context(flow_name, echo, datastore): parameter_name=None, logger=echo, ds_type=datastore.TYPE, + configs=ConfigValue(dict(configs)), ) @@ 
-303,8 +310,8 @@ class MyFlow(FlowSpec): help : str, optional Help text to show in `run --help`. required : bool, default False - Require that the user specified a value for the parameter. - `required=True` implies that the `default` is not used. + Require that the user specified a value for the parameter. If a non-None + default is specified, that default will be used if no other value is provided show_default : bool, default True If True, show the default value in the help text. """ diff --git a/metaflow/plugins/aws/batch/batch_decorator.py b/metaflow/plugins/aws/batch/batch_decorator.py index 16ca5c6f768..e24591dda32 100644 --- a/metaflow/plugins/aws/batch/batch_decorator.py +++ b/metaflow/plugins/aws/batch/batch_decorator.py @@ -138,8 +138,8 @@ class BatchDecorator(StepDecorator): supports_conda_environment = True target_platform = "linux-64" - def resolve_configs(self): - super(BatchDecorator, self).resolve_configs() + def init(self): + super(BatchDecorator, self).init() # If no docker image is explicitly specified, impute a default image. if not self.attributes["image"]: diff --git a/metaflow/plugins/kubernetes/kubernetes_decorator.py b/metaflow/plugins/kubernetes/kubernetes_decorator.py index 8f1d7be42d7..53f08daf051 100644 --- a/metaflow/plugins/kubernetes/kubernetes_decorator.py +++ b/metaflow/plugins/kubernetes/kubernetes_decorator.py @@ -151,8 +151,8 @@ class KubernetesDecorator(StepDecorator): supports_conda_environment = True target_platform = "linux-64" - def resolve_configs(self): - super(KubernetesDecorator, self).resolve_configs() + def init(self): + super(KubernetesDecorator, self).init() if not self.attributes["namespace"]: self.attributes["namespace"] = KUBERNETES_NAMESPACE diff --git a/metaflow/plugins/pypi/conda_decorator.py b/metaflow/plugins/pypi/conda_decorator.py index b6ac1b91d88..5c7859f24a2 100644 --- a/metaflow/plugins/pypi/conda_decorator.py +++ b/metaflow/plugins/pypi/conda_decorator.py @@ -49,8 +49,8 @@ class CondaStepDecorator(StepDecorator): # CONDA_CHANNELS in their environment. For pinning specific packages to specific # conda channels, users can specify channel::package as the package name. - def resolve_configs(self): - super(CondaStepDecorator, self).resolve_configs() + def init(self): + super(CondaStepDecorator, self).init() self._user_defined_attributes = ( attributes.copy() if attributes is not None else {} @@ -332,16 +332,12 @@ class CondaFlowDecorator(FlowDecorator): "disabled": None, } -<<<<<<< HEAD - def __init__(self, attributes=None, statically_defined=False): + def init(self): + super(CondaFlowDecorator, self).init() + self._user_defined_attributes = ( - attributes.copy() if attributes is not None else {} + self.attributes.copy() if self.attributes is not None else {} ) - super(CondaFlowDecorator, self).__init__(attributes, statically_defined) -======= - def resolve_configs(self): - super(CondaFlowDecorator, self).resolve_configs() ->>>>>>> 99d06ae8 (More WIP) # Support legacy 'libraries=' attribute for the decorator. 
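# For instance (illustrative): libraries={"pandas": ">=1.0"} is treated the # same as packages={"pandas": ">=1.0"}; the two mappings are merged below, with # explicit 'packages' entries taking precedence.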
self.attributes["packages"] = { diff --git a/metaflow/plugins/timeout_decorator.py b/metaflow/plugins/timeout_decorator.py index 648e318d36a..de50281b3ac 100644 --- a/metaflow/plugins/timeout_decorator.py +++ b/metaflow/plugins/timeout_decorator.py @@ -37,8 +37,8 @@ class TimeoutDecorator(StepDecorator): name = "timeout" defaults = {"seconds": 0, "minutes": 0, "hours": 0} - def resolve_configs(self): - super().resolve_configs() + def init(self): + super().init() # Initialize secs in __init__ so other decorators could safely use this # value without worrying about decorator order. # Convert values in attributes to type:int since they can be type:str diff --git a/metaflow/user_configs.py b/metaflow/user_configs.py index bd3f7a176d5..18f26ee6eec 100644 --- a/metaflow/user_configs.py +++ b/metaflow/user_configs.py @@ -1,5 +1,7 @@ +import collections.abc import json import os +import re from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING @@ -8,12 +10,13 @@ from .exception import MetaflowException, MetaflowInternalError from .parameters import ( - DelayedEvaluationParameter, + DeployTimeField, Parameter, ParameterContext, current_flow, ) -import functools + +from .util import get_username if TYPE_CHECKING: from metaflow import FlowSpec @@ -54,7 +57,7 @@ def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[str, An return None -class ConfigValue: +class ConfigValue(collections.abc.Mapping): # Thin wrapper to allow configuration values to be accessed using a "." notation # as well as a [] notation. @@ -72,6 +75,12 @@ def __getitem__(self, key): value = ConfigValue(value) return value + def __len__(self): + return len(self._data) + + def __iter__(self): + return iter(self._data) + def __repr__(self): return repr(self._data) @@ -80,9 +89,10 @@ def __str__(self): class PathOrStr(click.ParamType): - name = "ConfigInput" + name = "PathOrStr" - def convert(self, value, param, ctx): + @staticmethod + def convert_value(value): if value is None: return None @@ -94,7 +104,7 @@ def convert(self, value, param, ctx): if os.path.isfile(value): try: - with open(value, "r") as f: + with open(value, "r", encoding="utf-8") as f: content = f.read() except OSError as e: raise click.UsageError( @@ -103,6 +113,9 @@ def convert(self, value, param, ctx): return "converted:" + content return "converted:" + value + def convert(self, value, param, ctx): + return self.convert_value(value) + class ConfigInput: # Contains the values loaded from the INFO file. We make this a class method @@ -118,9 +131,11 @@ class ConfigInput: def __init__( self, req_configs: List[str], + defaults: Dict[str, Union[str, Dict[str, Any]]], parsers: Dict[str, Callable[[str], Dict[str, Any]]], ): - self._req_configs = req_configs + self._req_configs = set(req_configs) + self._defaults = defaults self._parsers = parsers @staticmethod @@ -144,6 +159,8 @@ def get_config(cls, config_name: str) -> Optional[Dict[str, Any]]: return cls.loaded_configs.get(config_name, None) def process_configs(self, ctx, param, value): + from .cli import echo_always, echo_dev_null # Prevent circular import + flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: # This is an error @@ -151,19 +168,51 @@ def process_configs(self, ctx, param, value): "Config values should be processed for a FlowSpec" ) - # First validate if we have all the required parameters - # Here value is a list of tuples. 
Each tuple has the name of the configuration - # and the string representation of the config (it was already read - # from a file if applicable). - missing = set(self._req_configs) - set([v[0] for v in value]) - if missing: - raise click.UsageError( - "Missing required configuration values: %s" % ", ".join(missing) - ) - + # value is a list of tuples (name, value). + # Click will provide: + # - all the defaults if nothing is provided on the command line + # - provide *just* the passed in value if anything is provided on the command + # line. + # + # We therefore "merge" the defaults with what we are provided by click to form + # a full set of values where: + # - the name will correspond to the configuration name + # - the value will be the default (including None if there is no default) or + # the string representation of the value (this will always include + # the "converted:" prefix as it will have gone through the PathOrStr + # conversion function). A value of None means that the config has + # no default and was not specified on the command line. to_return = {} + + merged_configs = dict(self._defaults) for name, val in value: + # Don't replace by None -- this is needed to avoid replacing a function + # default + if val: + merged_configs[name] = val + + print("PARAMS: %s" % str(ctx.params)) missing_configs = set() + for name, val in merged_configs.items(): name = name.lower() + # convert is idempotent so if it is already converted, it will just return + # the value. This is used to make sure we process the defaults + if isinstance(val, DeployTimeField): + # We will form our own context and pass it down + param_ctx = ParameterContext( + flow_name=ctx.obj.flow.name, + user_name=get_username(), + parameter_name=name, + logger=echo_dev_null if ctx.params["quiet"] else echo_always, + ds_type=ctx.params["datastore"], + configs=None, + ) + val = val.fun(param_ctx) + val = PathOrStr.convert_value(val) + if val is None: + missing_configs.add(name) + continue val = val[10:] # Remove the "converted:" prefix if val.startswith("kv."): # This means to load it from a file @@ -187,6 +236,11 @@ def process_configs(self, ctx, param, value): # TODO: Support YAML flow_cls._user_configs[name] = read_value to_return[name] = ConfigValue(read_value) + + if missing_configs.intersection(self._req_configs): + raise click.UsageError( + "Missing configuration values for %s" % ", ".join(missing_configs) + ) return to_return def __str__(self): @@ -222,13 +276,17 @@ class DelayEvaluator: This is used when we want to use config.* values in decorators for example. """ - def __init__(self, config_expr: str, is_var_only=True): - self._config_expr = config_expr - if is_var_only: + id_pattern = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") + + def __init__(self, ex: str): + self._config_expr = ex + if self.id_pattern.match(self._config_expr): + # This is a variable only so allow things like config_expr("config").var + self._is_var_only = True self._access = [] else: + self._is_var_only = False self._access = None - self._is_var_only = is_var_only def __getattr__(self, name): if self._access is None: @@ -236,7 +294,9 @@ def __getattr__(self, name): self._access.append(name) return self - def __call__(self): + def __call__(self, ctx=None, deploy_time=False): + # Two additional arguments are only used by DeployTimeField which will call + # this function with those two additional arguments. They are ignored.
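+ # Illustrative call sites: decorators evaluate with evaluator(), while + # DeployTimeField uses evaluator(ctx, deploy_time=True); both paths end in + # the same eval() below.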
flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: # We are not executing inside a flow (ie: not the CLI) @@ -256,6 +316,32 @@ def config_expr(expr: str) -> DelayEvaluator: + """ + Function to allow you to use an expression involving a config parameter in + places where it may not be directly accessible or if you want a more complicated + expression than just a single variable. + + You can use it as follows: + - When the config is not directly accessible: + + @project(name=config_expr("config").project.name) + class MyFlow(FlowSpec): + config = Config("config") + ... + - When you want a more complex expression: + class MyFlow(FlowSpec): + config = Config("config") + + @environment(vars={"foo": config_expr("config.bar.baz.lower()")}) + @step + def start(self): + ... + + Parameters + ---------- + expr : str + Expression using the config values. + """ return DelayEvaluator(expr) @@ -302,34 +388,6 @@ def _wrapper(flow_spec: "FlowSpec"): return _wrapper -class FlowConfig(DelayEvaluator): - def __init__(self, config_name: str): - """ - Small wrapper to allow you to refer to a flow's configuration in a flow-level - decorator. - - As an example: - - @project(name=FlowConfig("config").project.name) - class MyFlow(FlowSpec): - config = Config("config") - ... - - This will allow you to specify a `project.name` value in your configuration - and have it used in the flow-level decorator. - - Without this construct, it would be difficult to access `config` inside the - arguments of the decorator. - - Parameters - ---------- - config_name : str - Name of the configuration being used. This should be the name given to - the `Config` constructor. - """ - super().__init__(config_name, is_var_only=True) - - class Config(Parameter): """ Includes a configuration for this flow. @@ -374,9 +432,12 @@ def __init__( parser: Optional[Callable[[str], Dict[str, Any]]] = None, **kwargs: Dict[str, str] ): + + print("Config %s, default is %s" % (name, default)) super(Config, self).__init__( name, default=default, required=required, help=help, type=str, **kwargs ) + if isinstance(kwargs.get("default", None), str): kwargs["default"] = json.dumps(kwargs["default"]) self.parser = parser @@ -385,14 +446,14 @@ def load_parameter(self, v): return v def __getattr__(self, name): - ev = DelayEvaluator(self.name, is_var_only=True) + ev = DelayEvaluator(self.name) return ev.__getattr__(name) def config_options(cmd): help_strs = [] required_names = [] - defaults = [] + defaults = {} config_seen = set() parsers = {} flow_cls = getattr(current_flow, "flow_cls", None) @@ -400,8 +461,10 @@ def config_options(cmd): return cmd parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] + config_opt_required = False # List all the configuration options for arg in parameters[::-1]: + save_default = arg.kwargs.get("default", None) kwargs = arg.option_kwargs(False) if arg.name.lower() in config_seen: msg = ( @@ -413,14 +476,17 @@ def config_options(cmd): config_seen.add(arg.name.lower()) if kwargs["required"]: required_names.append(arg.name) - if kwargs.get("default") is not None: - defaults.append((arg.name.lower(), kwargs["default"])) - else: - defaults.append(None) + if save_default is None: + # We need at least one option if we have a required configuration.
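+ # (a required configuration with no default can only be satisfied on the + # command line, which is why the --config option itself is marked required + # below)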
+ config_opt_required = True + defaults[arg.name.lower()] = save_default help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) parsers[arg.name.lower()] = arg.parser - print("DEFAULTS %s" % defaults) + print( + "DEFAULTS %s" + % str(dict((k, v if not callable(v) else "FUNC") for k, v in defaults.items())) + ) if not config_seen: # No configurations -- don't add anything return cmd @@ -437,11 +503,12 @@ def config_options(cmd): nargs=2, multiple=True, type=click.Tuple([click.Choice(config_seen), PathOrStr()]), - callback=ConfigInput(required_names, parsers).process_configs, + callback=ConfigInput(required_names, defaults, parsers).process_configs, help=help_str, envvar="METAFLOW_FLOW_CONFIG", show_default=False, - default=defaults, + default=[(k, v if not callable(v) else None) for k, v in defaults.items()], + required=config_opt_required, ), ) return cmd diff --git a/test/core/metaflow_test/__init__.py b/test/core/metaflow_test/__init__.py index 19263df26ff..f2e574fe627 100644 --- a/test/core/metaflow_test/__init__.py +++ b/test/core/metaflow_test/__init__.py @@ -155,6 +155,7 @@ class MetaflowTest(object): PRIORITY = 999999999 PARAMETERS = {} INCLUDE_FILES = {} + CONFIGS = {} CLASS_VARS = {} HEADER = "" diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index 4461645c217..0da52b49501 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -85,7 +85,7 @@ def _flow_lines(self): tags.extend(tag.split("(")[0] for tag in step.tags) yield 0, "# -*- coding: utf-8 -*-" - yield 0, "from metaflow import FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" + yield 0, "from metaflow import Config, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" yield 0, "from metaflow_test import assert_equals, assert_equals_metadata, assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, TestRetry, try_to_get_card" if tags: yield 0, "from metaflow import %s" % ",".join(tags) @@ -104,6 +104,10 @@ def _flow_lines(self): kwargs = ["%s=%s" % (k, v) for k, v in include.items()] yield 1, '%s = IncludeFile("%s", %s)' % (var, var, ",".join(kwargs)) + for var, include in self.test.CONFIGS.items(): + kwargs = ["%s=%s" % (k, v) for k, v in include.items()] + yield 1, '%s = Config("%s", %s)' % (var, var, ",".join(kwargs)) + for name, node in self.graphspec["graph"].items(): step = self._choose_step(name, node) self.used.add(step) diff --git a/test_config/config2.json b/test_config/config2.json new file mode 100644 index 00000000000..12ec1d8f996 --- /dev/null +++ b/test_config/config2.json @@ -0,0 +1,4 @@ +{ + "default_param": 456, + "default_param2": 789 +} diff --git a/test_config/helloconfig.py b/test_config/helloconfig.py new file mode 100644 index 00000000000..be8246cc6b2 --- /dev/null +++ b/test_config/helloconfig.py @@ -0,0 +1,139 @@ +import os + +from metaflow import ( + Config, + FlowSpec, + Parameter, + environment, + step, + project, + config_expr, + eval_config, + titus, +) + + +def silly_parser(s): + k, v = s.split(":") + return {k: v} + + +def param_func(ctx): + return ctx.configs.config2.default_param2 + 1 + + +def config_func(ctx): + return {"val": 123} + + +default_config = { + "run_on_titus": ["hello"], + "cpu_count": 2, + "env_to_start": "Romain", + "magic_value": 42, + "project_name": "hirec", +} + +silly_config = "baz:awesome" + + +def titus_or_not(flow): + to_replace = [] + for name, s in flow.steps: + if name in flow.config.run_on_titus: + 
to_replace.append((name, titus(cpu=flow.config.cpu_count)(s))) + for name, val in to_replace: + setattr(flow, name, val) + return flow + + +def add_env_to_start(flow): + # Add a decorator directly to a step + flow.start = environment(vars={"hello": config_expr("config").env_to_start})( + flow.start + ) + return flow + + +@eval_config(titus_or_not) +@add_env_to_start +@project(name=config_expr("config").project_name) +class HelloConfig(FlowSpec): + """ + A flow where Metaflow prints 'Hi'. + + Run this flow to validate that Metaflow is installed correctly. + + """ + + default_from_config = Parameter( + "default_from_config", default=config_expr("config2").default_param, type=int + ) + + default_from_func = Parameter("default_from_func", default=param_func, type=int) + + config = Config("config", default=default_config, help="Help for config") + sconfig = Config( + "sconfig", + default="sillyconfig.txt", + parser=silly_parser, + help="Help for sconfig", + required=True, + ) + config2 = Config("config2", required=True) + + config3 = Config("config3", default=config_func) + + @step + def start(self): + """ + This is the 'start' step. All flows must have a step named 'start' that + is the first step in the flow. + + """ + print("HelloConfig is %s (should be awesome)" % self.sconfig.baz) + print( + "Environment variable hello %s (should be Romain)" % os.environ.get("hello") + ) + + print( + "Parameters are: default_from_config: %s, default_from_func: %s" + % (self.default_from_config, self.default_from_func) + ) + + print("Config3 has value: %s" % self.config3.val) + self.next(self.hello) + + @environment( + vars={ + "normal": config.env_to_start, + "stringify": config_expr("str(config.magic_value)"), + } + ) + @step + def hello(self): + """ + A step for metaflow to introduce itself. + + """ + print( + "In this step, we got a normal variable %s, one that is stringified %s" + % ( + os.environ.get("normal"), + os.environ.get("stringify"), + ) + ) + self.next(self.end) + + @step + def end(self): + """ + This is the 'end' step. All flows must have an 'end' step, which is the + last step in the flow. + + """ + print("HelloFlow is all done") + + +if __name__ == "__main__": + HelloConfig() From 7cc9c974eba87d1dd8d59134ce7912362a971fa8 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 10 Sep 2024 00:58:34 -0700 Subject: [PATCH 10/30] Addressed comments. Added more documentation/explanation Specifically: - moved things out of the INFO file - added to_dict - renamed user_configs to config_parameters --- metaflow/__init__.py | 2 +- metaflow/cli.py | 6 +- .../{user_configs.py => config_parameters.py} | 135 +++++++++++++----- metaflow/decorators.py | 2 +- metaflow/flowspec.py | 2 +- metaflow/includefile.py | 4 +- metaflow/metaflow_environment.py | 11 +- metaflow/package.py | 20 ++- metaflow/parameters.py | 4 +- metaflow/plugins/pypi/conda_decorator.py | 4 +- metaflow/runner/click_api.py | 2 +- metaflow/runtime.py | 10 +- metaflow/util.py | 2 +- 13 files changed, 139 insertions(+), 65 deletions(-) rename metaflow/{user_configs.py => config_parameters.py} (76%) diff --git a/metaflow/__init__.py b/metaflow/__init__.py index 4f58772c78d..fbc08342d9c 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -103,7 +103,7 @@ class and related decorators. 
from .parameters import Parameter, JSONTypeClass, JSONType -from .user_configs import Config, config_expr, eval_config +from .config_parameters import Config, config_expr, eval_config # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as diff --git a/metaflow/cli.py b/metaflow/cli.py index 13c81082bb6..da418ad20cd 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -35,7 +35,7 @@ from .pylint_wrapper import PyLint from .R import metaflow_r_version, use_r from .util import resolve_identity -from .user_configs import LocalFileInput, config_options +from .config_parameters import LocalFileInput, config_options ERASE_TO_EOL = "\033[K" HIGHLIGHT = "red" @@ -304,11 +304,11 @@ def version(obj): help="Monitoring backend type", ) @click.option( - "--local-info-file", + "--local-config-file", type=LocalFileInput(exists=True, readable=True, dir_okay=False, resolve_path=True), required=False, default=None, - help="A filename containing a subset of the INFO file. Internal use only.", + help="A filename containing the dumped configuration values. Internal use only.", hidden=True, is_eager=True, ) diff --git a/metaflow/user_configs.py b/metaflow/config_parameters.py similarity index 76% rename from metaflow/user_configs.py rename to metaflow/config_parameters.py index 18f26ee6eec..3a76500e6dd 100644 --- a/metaflow/user_configs.py +++ b/metaflow/config_parameters.py @@ -5,7 +5,6 @@ from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING -from metaflow import INFO_FILE from metaflow._vendor import click from .exception import MetaflowException, MetaflowInternalError @@ -40,16 +39,20 @@ # return tracefunc_closure +CONFIG_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "CONFIG_PARAMETERS" +) + def dump_config_values(flow: "FlowSpec"): if flow._user_configs: - return "user_configs", flow._user_configs - return None, None + return {"user_configs": flow._user_configs} + return {} -def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[str, Any]]: +def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[Any, Any]]: if info_file is None: - info_file = INFO_FILE + info_file = os.path.basename(CONFIG_FILE) try: with open(info_file, encoding="utf-8") as contents: return json.load(contents).get("user_configs", {}) @@ -58,10 +61,19 @@ def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[str, An class ConfigValue(collections.abc.Mapping): + """ + ConfigValue is a thin wrapper around an arbitrarily nested dictionary-like + configuration object. It allows you to access elements of this nested structure + using either a "." notation or a [] notation. As an example, if your configuration + object is: + {"foo": {"bar": 42}} + you can access the value 42 using either config["foo"]["bar"] or config.foo.bar. + """ + # Thin wrapper to allow configuration values to be accessed using a "." notation # as well as a [] notation. 
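# Example (illustrative): # cv = ConfigValue({"foo": {"bar": 42}}) # assert cv.foo.bar == cv["foo"]["bar"] == 42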
- def __init__(self, data: Dict[str, Any]): + def __init__(self, data: Dict[Any, Any]): self._data = data for key, value in data.items(): @@ -69,7 +81,20 @@ def __init__(self, data: Dict[str, Any]): value = ConfigValue(value) setattr(self, key, value) - def __getitem__(self, key): + def __getitem__(self, key: Any) -> Any: + """ + Access an element of this configuration + + Parameters + ---------- + key : Any + Element to access + + Returns + ------- + Any + Element of the configuration + """ value = self._data[key] if isinstance(value, dict): value = ConfigValue(value) @@ -87,12 +112,31 @@ def __repr__(self): def __str__(self): return json.dumps(self._data) + def to_dict(self) -> Dict[Any, Any]: + """ + Returns a dictionary representation of this configuration object. + + Returns + ------- + Dict[Any, Any] + Dictionary equivalent of this configuration object. + """ + return dict(self._data) + class PathOrStr(click.ParamType): + # Click parameter type for a configuration value -- it can either be the string + # representation of the configuration value (like a JSON string or any other + # string that the configuration parser can parse) or the path to a file containing + # such a content. The value will be initially assumed to be that of a file and will + # only be considered not a file if no file exists. name = "PathOrStr" @staticmethod def convert_value(value): + # Click requires this to be idempotent. We therefore check if the value + # starts with "converted:" which is our marker for "we already processed this + # value". if value is None: return None @@ -118,21 +162,22 @@ def convert(self, value, param, ctx): class ConfigInput: - # Contains the values loaded from the INFO file. We make this a class method - # so that if there are multiple configs, we just need to read the file once. - # It is OK to be globally unique because this is only evoked in scenario A.2 (see - # convert method) which means we are already just executing a single task and so - # there is no concern about it "leaking" to things running with Runner for example - # (ie: even if Runner is evoked in that task, we won't "share" this global value's - # usage). - loaded_configs = None # type: Optional[Dict[str, Dict[str, Any]]] - info_file = None # type: Optional[str] + # ConfigInput is an internal class responsible for processing all the --config + # options. It gathers information from the --local-config-file (to figure out + # where options are stored) and is also responsible for processing any `--config` + # options and processing the default value of `Config(...)` objects. + + # It will then store this information in the flow spec for use later in processing. + # It is stored in the flow spec to avoid being global to support the Runner. + + loaded_configs = None # type: Optional[Dict[str, Dict[Any, Any]]] + config_file = None # type: Optional[str] def __init__( self, req_configs: List[str], - defaults: Dict[str, Union[str, Dict[str, Any]]], - parsers: Dict[str, Callable[[str], Dict[str, Any]]], + defaults: Dict[str, Union[str, Dict[Any, Any]]], + parsers: Dict[str, Callable[[str], Dict[Any, Any]]], ): self._req_configs = set(req_configs) self._defaults = defaults @@ -140,20 +185,24 @@ def __init__( @staticmethod def make_key_name(name: str) -> str: + # Special mark to indicate that the configuration value is not content or a file + # name but a value that should be read in the config file (effectively where + # the value has already been materialized). return "kv." 
+ name.lower() @classmethod - def set_info_file(cls, info_file: str): - cls.info_file = info_file + def set_config_file(cls, config_file: str): + cls.config_file = config_file @classmethod - def get_config(cls, config_name: str) -> Optional[Dict[str, Any]]: + def get_config(cls, config_name: str) -> Optional[Dict[Any, Any]]: if cls.loaded_configs is None: - all_configs = load_config_values(cls.info_file) + all_configs = load_config_values(cls.config_file) if all_configs is None: raise MetaflowException( "Could not load expected configuration values " - "from the INFO file. This is a Metaflow bug. Please contact support." + "from the CONFIG_PARAMETERS file. This is a Metaflow bug. " + "Please contact support." ) cls.loaded_configs = all_configs return cls.loaded_configs.get(config_name, None) @@ -168,7 +217,8 @@ def process_configs(self, ctx, param, value): "Config values should be processed for a FlowSpec" ) - # value is a list of tuples (name, value). + # This function is called by click when processing all the --config options. + # The value passed in is a list of tuples (name, value). # Click will provide: # - all the defaults if nothing is provided on the command line # - provide *just* the passed in value if anything is provided on the command @@ -197,9 +247,15 @@ def process_configs(self, ctx, param, value): for name, val in merged_configs.items(): name = name.lower() # convert is idempotent so if it is already converted, it will just return - # the value. This is used to make sure we process the defaults + # the value. This is used to make sure we process the defaults which do + # NOT make it through the PathOrStr convert function if isinstance(val, DeployTimeField): - # We will form our own context and pass it down + # This supports a default value that is a deploy-time field (similar + # to Parameter).) + # We will form our own context and pass it down -- note that you cannot + # use configs in the default value of configs as this introduces a bit + # of circularity. Note also that quiet and datastore are *eager* + # options so are available here. param_ctx = ParameterContext( flow_name=ctx.obj.flow.name, user_name=get_username(), @@ -251,14 +307,19 @@ def __repr__(self): class LocalFileInput(click.Path): + # Small wrapper around click.Path to set the value from which to read configuration + # values. This is set immediately upon processing the --local-config-file + # option and will therefore then be available when processing any of the other + # --config options (which will call ConfigInput.process_configs name = "LocalFileInput" def convert(self, value, param, ctx): super().convert(value, param, ctx) - ConfigInput.set_info_file(value) + ConfigInput.set_config_file(value) # This purposefully returns None which means it is *not* passed down # when commands use ctx.parent.parent.params to get all the configuration - # values. + # values (it becomes hidden because its only purpose is to update the + # config file in ConfigInput) def __str__(self): return repr(self) @@ -267,7 +328,7 @@ def __repr__(self): return "LocalFileInput" -ConfigArgType = Union[str, Dict[str, Any]] +ConfigArgType = Union[str, Dict[Any, Any]] class DelayEvaluator: @@ -402,15 +463,17 @@ class Config(Parameter): ---------- name : str User-visible configuration name. 
- default : Union[str, Dict[str, Any], Callable[[ParameterContext], Union[str, Dict[str, Any]]]], optional, default None + default : Union[str, Dict[Any, Any], Callable[[ParameterContext], Union[str, Dict[Any, Any]]]], optional, default None Default value for the parameter. A function implies that the value will be computed using that function. help : str, optional, default None Help text to show in `run --help`. required : bool, default False - Require that the user specified a value for the parameter. - `required=True` implies that the `default` is not used. - parser : Callable[[str], Dict[str, Any]], optional, default None + Require that the user specified a value for the parameter. Note that if + a default is provided, the required flag is ignored. + parser : Callable[[str], Dict[Any, Any]], optional, default None + An optional function that can parse the configuration string into an arbitrarily + nested dictionary. show_default : bool, default True If True, show the default value in the help text. """ @@ -423,13 +486,13 @@ def __init__( default: Optional[ Union[ str, - Dict[str, Any], - Callable[[ParameterContext], Union[str, Dict[str, Any]]], + Dict[Any, Any], + Callable[[ParameterContext], Union[str, Dict[Any, Any]]], ] ] = None, help: Optional[str] = None, required: bool = False, - parser: Optional[Callable[[str], Dict[str, Any]]] = None, + parser: Optional[Callable[[str], Dict[Any, Any]]] = None, **kwargs: Dict[str, str] ): diff --git a/metaflow/decorators.py b/metaflow/decorators.py index b9bf32a21a7..b852ade200f 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -12,7 +12,7 @@ ) from .parameters import current_flow -from .user_configs import DelayEvaluator +from .config_parameters import DelayEvaluator from metaflow._vendor import click diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 0ea5eea2a22..ede2206eb60 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -20,7 +20,7 @@ from .graph import FlowGraph from .unbounded_foreach import UnboundedForeachInput -from .user_configs import ConfigInput, ConfigValue +from .config_parameters import ConfigInput, ConfigValue from .util import to_pod from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS diff --git a/metaflow/includefile.py b/metaflow/includefile.py index b93f8c49224..499b4cd6a90 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -20,7 +20,7 @@ ) from .plugins import DATACLIENTS -from .user_configs import ConfigValue +from .config_parameters import ConfigValue from .util import get_username import functools @@ -137,7 +137,7 @@ def convert(self, value, param, ctx): parameter_name=param.name, logger=ctx.obj.echo, ds_type=ctx.obj.datastore_impl.TYPE, - configs=ConfigValue(dict(ctx.obj.flow.__class__.configs)), + configs=ConfigValue(dict(ctx.obj.flow.configs)), ) if len(value) > 0 and (value.startswith("{") or value.startswith('"{')): diff --git a/metaflow/metaflow_environment.py b/metaflow/metaflow_environment.py index cea6e18697b..0ac1ca3266c 100644 --- a/metaflow/metaflow_environment.py +++ b/metaflow/metaflow_environment.py @@ -6,7 +6,6 @@ from . import metaflow_version from metaflow.exception import MetaflowException from metaflow.extension_support import dump_module_info -from metaflow.user_configs import dump_config_values from metaflow.mflog import BASH_MFLOG from . 
import R @@ -19,7 +18,7 @@ class MetaflowEnvironment(object): TYPE = "local" def __init__(self, flow): - self._flow = flow + pass def init_environment(self, echo): """ @@ -178,7 +177,7 @@ def get_package_commands(self, code_package_url, datastore_type): ] return cmds - def get_environment_info(self, full_info=False): + def get_environment_info(self, include_ext_info=False): # note that this dict goes into the code package # so variables here should be relatively stable (no # timestamps) so the hash won't change all the time @@ -199,14 +198,10 @@ def get_environment_info(self, full_info=False): env["metaflow_r_version"] = R.metaflow_r_version() env["r_version"] = R.r_version() env["r_version_code"] = R.r_version_code() - if full_info: + if include_ext_info: # Information about extension modules (to load them in the proper order) ext_key, ext_val = dump_module_info() env[ext_key] = ext_val - # Information about configurations (to be able to reload them) - user_configs = dump_config_values(self._flow) - if user_configs: - env[user_configs[0]] = user_configs[1] return env def executable(self, step_name, default=None): diff --git a/metaflow/package.py b/metaflow/package.py index 55968a03d65..9666d929d08 100644 --- a/metaflow/package.py +++ b/metaflow/package.py @@ -6,6 +6,7 @@ import json from io import BytesIO +from .config_parameters import CONFIG_FILE, dump_config_values from .extension_support import EXT_PKG, package_mfext_all from .metaflow_config import DEFAULT_PACKAGE_SUFFIXES from .exception import MetaflowException @@ -151,11 +152,23 @@ def path_tuples(self): for path_tuple in self._walk(flowdir, suffixes=self.suffixes): yield path_tuple + def _add_configs(self, tar): + buf = BytesIO() + buf.write(json.dumps(dump_config_values(self._flow)).encode("utf-8")) + self._add_file(tar, os.path.basename(CONFIG_FILE), buf) + def _add_info(self, tar): - info = tarfile.TarInfo(os.path.basename(INFO_FILE)) - env = self.environment.get_environment_info(full_info=True) buf = BytesIO() - buf.write(json.dumps(env).encode("utf-8")) + buf.write( + json.dumps( + self.environment.get_environment_info(include_ext_info=True) + ).encode("utf-8") + ) + self._add_file(tar, os.path.basename(INFO_FILE), buf) + + @staticmethod + def _add_file(tar, filename, buf): + info = tarfile.TarInfo(filename) buf.seek(0) info.size = len(buf.getvalue()) # Setting this default to Dec 3, 2019 @@ -175,6 +188,7 @@ def no_mtime(tarinfo): fileobj=buf, mode="w:gz", compresslevel=3, dereference=True ) as tar: self._add_info(tar) + self._add_configs(tar) for path, arcname in self.path_tuples(): tar.add(path, arcname=arcname, recursive=False, filter=no_mtime) diff --git a/metaflow/parameters.py b/metaflow/parameters.py index 927930ecc8a..f020a5cf406 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -15,7 +15,7 @@ ) if TYPE_CHECKING: - from .user_configs import ConfigValue + from .config_parameters import ConfigValue try: # Python2 @@ -232,7 +232,7 @@ def deploy_time_eval(value): # this is called by cli.main def set_parameter_context(flow_name, echo, datastore, configs): - from .user_configs import ConfigValue # Prevent circular dependency + from .config_parameters import ConfigValue # Prevent circular dependency global context_proto context_proto = ParameterContext( diff --git a/metaflow/plugins/pypi/conda_decorator.py b/metaflow/plugins/pypi/conda_decorator.py index 5c7859f24a2..49b42f7e55e 100644 --- a/metaflow/plugins/pypi/conda_decorator.py +++ b/metaflow/plugins/pypi/conda_decorator.py @@ -174,7 +174,9 @@ def 
runtime_init(self, flow, graph, package, run_id): encoding="utf-8", ) as f: f.write( - json.dumps(self.environment.get_environment_info(full_info=True)) + json.dumps( + self.environment.get_environment_info(include_ext_info=True) + ) ) # Support metaflow extensions. diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 4b69d7b61e3..0001ed39a95 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -39,7 +39,7 @@ from metaflow.exception import MetaflowException from metaflow.includefile import FilePathClass from metaflow.parameters import JSONTypeClass, flow_context -from metaflow.user_configs import LocalFileInput +from metaflow.config_parameters import LocalFileInput # Define a recursive type alias for JSON JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None] diff --git a/metaflow/runtime.py b/metaflow/runtime.py index d16e96728a5..2ea2e55f9d2 100644 --- a/metaflow/runtime.py +++ b/metaflow/runtime.py @@ -43,7 +43,7 @@ UBF_TASK, ) -from .user_configs import ConfigInput, dump_config_values +from .config_parameters import ConfigInput, dump_config_values import metaflow.tracing as tracing @@ -480,9 +480,9 @@ def execute(self): # Configurations are passed through a file to avoid overloading the # command-line. We only need to create this file once and it can be reused # for any task launch - config_key, config_value = dump_config_values(self._flow) + config_value = dump_config_values(self._flow) if config_value: - json.dump({config_key: config_value}, config_file) + json.dump(config_value, config_file) config_file.flush() self._config_file_name = config_file.name else: @@ -1522,7 +1522,7 @@ def __init__(self, task): self.top_level_options.update(deco.get_top_level_options()) # We also pass configuration options using the kv. 
syntax which will cause - # the configuration options to be loaded from the INFO file (or local-info-file + # the configuration options to be loaded from the CONFIG file (or local-config-file # in the case of the local runtime) if self.task.flow._user_configs: self.top_level_options["config"] = [ @@ -1639,7 +1639,7 @@ def _launch(self): # Add user configurations using a file to avoid using up too much space on the # command line if self._config_file_name: - args.top_level_options["local-info-file"] = self._config_file_name + args.top_level_options["local-config-file"] = self._config_file_name # Pass configuration options env.update(args.get_env()) env["PYTHONUNBUFFERED"] = "x" diff --git a/metaflow/util.py b/metaflow/util.py index 6243abd31b9..a7fcd315920 100644 --- a/metaflow/util.py +++ b/metaflow/util.py @@ -297,7 +297,7 @@ def get_metaflow_root(): def dict_to_cli_options(params): # Prevent circular imports - from metaflow.user_configs import ConfigInput, ConfigValue + from metaflow.config_parameters import ConfigInput, ConfigValue for k, v in params.items(): # Omit boolean options set to false or None, but preserve options with an empty From 6246f337e3c5702d83685d7eef21b12d87e9c14a Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 10 Sep 2024 12:36:14 -0700 Subject: [PATCH 11/30] Added test, more cleanup Specifically: - made config values immutable - cleaned up state stored in FlowSpec - added a test exercising configs in various places --- metaflow/config_parameters.py | 77 ++++++++++++--- metaflow/flowspec.py | 50 +++++----- metaflow/runtime.py | 6 +- test/core/metaflow_test/formatter.py | 2 +- test/core/tests/basic_config_parameters.py | 106 +++++++++++++++++++++ 5 files changed, 203 insertions(+), 38 deletions(-) create mode 100644 test/core/tests/basic_config_parameters.py diff --git a/metaflow/config_parameters.py b/metaflow/config_parameters.py index 3a76500e6dd..eb20fd6f1f2 100644 --- a/metaflow/config_parameters.py +++ b/metaflow/config_parameters.py @@ -8,6 +8,7 @@ from metaflow._vendor import click from .exception import MetaflowException, MetaflowInternalError + from .parameters import ( DeployTimeField, Parameter, @@ -45,8 +46,11 @@ def dump_config_values(flow: "FlowSpec"): - if flow._user_configs: - return {"user_configs": flow._user_configs} + from .flowspec import _FlowState # Prevent circular import + + configs = flow._flow_state.get(_FlowState.CONFIGS) + if configs: + return {"user_configs": configs} return {} @@ -76,10 +80,34 @@ class ConfigValue(collections.abc.Mapping): def __init__(self, data: Dict[Any, Any]): self._data = data - for key, value in data.items(): - if isinstance(value, dict): - value = ConfigValue(value) - setattr(self, key, value) + def __getattr__(self, key: str) -> Any: + """ + Access an element of this configuration + + Parameters + ---------- + key : str + Element to access + + Returns + ------- + Any + Element of the configuration + """ + if key == "_data": + # Called during unpickling. Special case to not run into infinite loop + # below. 
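+ # (self._data is not yet set at that point, so falling through to the + # lookup below would trigger __getattr__ again and recurse forever)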
+ raise AttributeError(key) + + if key in self._data: + return self[key] + raise AttributeError(key) + + def __setattr__(self, name: str, value: Any) -> None: + # Prevent configuration modification + if name == "_data": + return super().__setattr__(name, value) + raise TypeError("ConfigValue is immutable") def __getitem__(self, key: Any) -> Any: """ @@ -209,6 +237,7 @@ def get_config(cls, config_name: str) -> Optional[Dict[Any, Any]]: def process_configs(self, ctx, param, value): from .cli import echo_always, echo_dev_null # Prevent circular import + from .flowspec import _FlowState # Prevent circular import flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: @@ -216,7 +245,7 @@ def process_configs(self, ctx, param, value): raise MetaflowInternalError( "Config values should be processed for a FlowSpec" ) - + flow_cls._flow_state[_FlowState.CONFIGS] = {} # This function is called by click when processing all the --config options. # The value passed in is a list of tuples (name, value). # Click will provide: @@ -277,7 +306,7 @@ def process_configs(self, ctx, param, value): raise click.UsageError( "Could not find configuration '%s' in INFO file" % val ) - flow_cls._user_configs[name] = read_value + flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value to_return[name] = ConfigValue(read_value) else: if self._parsers[name]: @@ -290,7 +319,7 @@ def process_configs(self, ctx, param, value): "Configuration value for '%s' is not valid JSON" % name ) from e # TODO: Support YAML - flow_cls._user_configs[name] = read_value + flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value to_return[name] = ConfigValue(read_value) if missing_configs.intersection(self._req_configs): @@ -331,6 +360,23 @@ def __repr__(self): ConfigArgType = Union[str, Dict[Any, Any]] +class MultipleTuple(click.Tuple): + # Small wrapper around a click.Tuple to allow the environment variable for + # configurations to be a JSON string. Otherwise the default behavior is splitting + # by whitespace which is totally not what we want + # You can now pass multiple configuration options through an environment variable + # using something like: + # METAFLOW_FLOW_CONFIG='{"config1": "filenameforconfig1.json", "config2": {"key1": "value1"}}' + + def split_envvar_value(self, rv): + loaded = json.loads(rv) + return list( + item if isinstance(item, str) else json.dumps(item) + for pair in loaded.items() + for item in pair + ) + + class DelayEvaluator: """ Small wrapper that allows the evaluation of a Config() value in a delayed manner. @@ -356,6 +402,8 @@ def __getattr__(self, name): return self def __call__(self, ctx=None, deploy_time=False): + from .flowspec import _FlowState # Prevent circular import + # Two additional arguments are only used by DeployTimeField which will call # this function with those two additional arguments. They are ignored. 
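+ # e.g. (illustrative): config_expr("config.bar.baz.lower()") is evaluated + # below with {"config": ConfigValue(...)} supplied as the local namespace.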
flow_cls = getattr(current_flow, "flow_cls", None) @@ -372,7 +420,10 @@ def __call__(self, ctx=None, deploy_time=False): return eval( self._config_expr, globals(), - {k: ConfigValue(v) for k, v in flow_cls._user_configs.items()}, + { + k: ConfigValue(v) + for k, v in flow_cls._flow_state.get(_FlowState.CONFIGS, {}).items() + }, ) @@ -443,7 +494,9 @@ class MyFlow(FlowSpec): """ def _wrapper(flow_spec: "FlowSpec"): - flow_spec._config_funcs.append(f) + from .flowspec import _FlowState + + flow_spec._flow_state.setdefault(_FlowState.CONFIG_FUNCS, []).append(f) return flow_spec return _wrapper @@ -565,7 +618,7 @@ def config_options(cmd): ["--config", "config_options"], nargs=2, multiple=True, - type=click.Tuple([click.Choice(config_seen), PathOrStr()]), + type=MultipleTuple([click.Choice(config_seen), PathOrStr()]), callback=ConfigInput(required_names, defaults, parsers).process_configs, help=help_str, envvar="METAFLOW_FLOW_CONFIG", diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index ede2206eb60..3fd3f86aa0c 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -4,6 +4,7 @@ import traceback import reprlib +from enum import Enum from itertools import islice from types import FunctionType, MethodType from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Tuple @@ -66,23 +67,26 @@ def __getitem__(self, item): return item or 0 # item is None for the control task, but it is also split 0 +class _FlowState(Enum): + CONFIGS = 1 + CONFIG_FUNCS = 2 + CACHED_PARAMETERS = 3 + + class FlowSpecMeta(type): def __new__(cls, name, bases, dct): f = super().__new__(cls, name, bases, dct) - # This makes sure to give _flow_decorators to each - # child class (and not share it with the FlowSpec base - # class). This is important to not make a "global" - # _flow_decorators. Same deal with user configurations - f._flow_decorators = {} - f._user_configs = {} + # We store some state in the flow class itself. This is primarily used to + # attach global state to a flow. It is *not* an actual global because of + # Runner/NBRunner. This is also created here in the meta class to avoid it being + # shared between different children classes. - # We also cache parameter names to avoid having to recompute what is a parameter - # in the dir of a flow - f._cached_parameters = None + # We should move _flow_decorators into this structure as well but keeping it + # out to limit the changes for now. 
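+ # Illustrative shape once the CLI has processed the options: + # f._flow_state == {_FlowState.CONFIGS: {...}, _FlowState.CONFIG_FUNCS: [...]}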
+ f._flow_decorators = {} - # Finally attach all functions that need to be evaluated once user configurations - # are available - f._config_funcs = [] + # Keys are _FlowState enum values + f._flow_state = {} return f @@ -113,8 +117,8 @@ def start(self): Tuple[str, ConfigValue] Iterates over the configurations of the flow """ - # When configs are parsed, they are loaded in _user_configs - for name, value in cls._user_configs.items(): + # When configs are parsed, they are loaded in _flow_state[_FlowState.CONFIGS] + for name, value in cls._flow_state.get(_FlowState.CONFIGS, {}).items(): yield name, ConfigValue(value) @property @@ -153,9 +157,7 @@ class FlowSpec(metaclass=FlowSpecMeta): "_cached_input", "_graph", "_flow_decorators", - "_user_configs", - "_cached_parameters", - "_config_funcs", + "_flow_state", "_steps", "index", "input", @@ -224,7 +226,7 @@ def _process_config_funcs(self, config_options): current_cls = self.__class__ # Fast path for no user configurations - if not self._config_funcs: + if not self._flow_state.get(_FlowState.CONFIG_FUNCS): return self # We need to convert all the user configurations from DelayedEvaluationParameters @@ -246,7 +248,7 @@ def _process_config_funcs(self, config_options): # Run all the functions. They will now be able to access the configuration # values directly from the class - for func in self._config_funcs: + for func in self._flow_state[_FlowState.CONFIG_FUNCS]: current_cls = func(current_cls) # Reset all configs that were already present in the class. @@ -257,7 +259,8 @@ def _process_config_funcs(self, config_options): # We reset cached_parameters on the very off chance that the user added # more configurations based on the configuration - current_cls._cached_parameters = None + if _FlowState.CACHED_PARAMETERS in current_cls._flow_state: + del current_cls._flow_state[_FlowState.CACHED_PARAMETERS] # Set the current flow class we are in (the one we just created) parameters.replace_flow_context(current_cls) @@ -328,8 +331,9 @@ def _set_constants(self, graph, kwargs, config_options): @classmethod def _get_parameters(cls): - if cls._cached_parameters is not None: - for var in cls._cached_parameters: + cached = cls._flow_state.get(_FlowState.CACHED_PARAMETERS) + if cached is not None: + for var in cached: yield var, getattr(cls, var) return build_list = [] @@ -343,7 +347,7 @@ def _get_parameters(cls): if isinstance(val, Parameter): build_list.append(var) yield var, val - cls._cached_parameters = build_list + cls._flow_state[_FlowState.CACHED_PARAMETERS] = build_list def _set_datastore(self, datastore): self._datastore = datastore diff --git a/metaflow/runtime.py b/metaflow/runtime.py index 2ea2e55f9d2..6218e78a03a 100644 --- a/metaflow/runtime.py +++ b/metaflow/runtime.py @@ -34,6 +34,7 @@ from .datastore import TaskDataStoreSet from .debug import debug from .decorators import flow_decorators +from .flowspec import _FlowState from .mflog import mflog, RUNTIME_LOG_SOURCE from .util import to_unicode, compress_list, unicode_type from .clone_util import clone_task_helper @@ -1524,9 +1525,10 @@ def __init__(self, task): # We also pass configuration options using the kv. 
syntax which will cause # the configuration options to be loaded from the CONFIG file (or local-config-file # in the case of the local runtime) - if self.task.flow._user_configs: + configs = self.task.flow._flow_state.get(_FlowState.CONFIGS) + if configs: self.top_level_options["config"] = [ - (k, ConfigInput.make_key_name(k)) for k in self.task.flow._user_configs + (k, ConfigInput.make_key_name(k)) for k in configs ] self.commands = ["step"] diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index 0da52b49501..096afe78ebb 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -85,7 +85,7 @@ def _flow_lines(self): tags.extend(tag.split("(")[0] for tag in step.tags) yield 0, "# -*- coding: utf-8 -*-" - yield 0, "from metaflow import Config, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" + yield 0, "from metaflow import Config, config_expr, eval_config, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" yield 0, "from metaflow_test import assert_equals, assert_equals_metadata, assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, TestRetry, try_to_get_card" if tags: yield 0, "from metaflow import %s" % ",".join(tags) diff --git a/test/core/tests/basic_config_parameters.py b/test/core/tests/basic_config_parameters.py new file mode 100644 index 00000000000..dc367bef524 --- /dev/null +++ b/test/core/tests/basic_config_parameters.py @@ -0,0 +1,106 @@ +from metaflow_test import MetaflowTest, ExpectationFailed, steps, tag + + +class BasicConfigTest(MetaflowTest): + PRIORITY = 1 + PARAMETERS = { + "default_from_config": { + "default": "config_expr('config2').default_param", + "type": "int", + }, + "default_from_func": {"default": "param_default", "type": "int"}, + } + CONFIGS = { + "config": {"default": "default_config"}, + "silly_config": {"required": True, "parser": "silly_parser"}, + "config2": {}, + "config3": {"default": "config_default"}, + } + HEADER = """ +import json +import os + +os.environ['METAFLOW_FLOW_CONFIG'] = json.dumps( + { + "config2": {"default_param": 123}, + "silly_config": "baz:amazing" + } +) + +def silly_parser(s): + k, v = s.split(":") + return {k: v} + +default_config = { + "value": 42, + "str_value": "foobar", + "project_name": "test_config", + "nested": {"value": 43}, +} + +def param_default(ctx): + return ctx.configs.config2.default_param + 1 + +def config_default(ctx): + return {"val": 456} + +# Test flow-level decorator configurations +@project(name=config_expr("config").project_name) +""" + + # Test step level decorators with configs + @tag( + "environment(vars={'normal': config.str_value, 'stringify': config_expr('str(config.value)')})" + ) + @steps(0, ["all"]) + def step_all(self): + # Test flow-level decorator configs + assert_equals(current.project_name, "test_config") + + # Test step-level decorator configs + assert_equals(os.environ["normal"], "foobar") + assert_equals(os.environ["stringify"], "42") + + # Test parameters reading configs + assert_equals(self.default_from_config, 123) + assert_equals(self.default_from_func, 124) + + # Test configs are accessible as artifacts + assert_equals(self.config.value, 42) + assert_equals(self.config["value"], 42) + assert_equals(self.config.nested.value, 43) + assert_equals(self.config["nested"]["value"], 43) + assert_equals(self.config.nested["value"], 43) + assert_equals(self.config["nested"].value, 43) + + assert_equals(self.silly_config.baz, "amazing") + 
assert_equals(self.silly_config["baz"], "amazing") + + assert_equals(self.config3.val, 456) + + try: + self.config3["val"] = 5 + raise ExpectationFailed(TypeError, "configs should be immutable") + except TypeError: + pass + + try: + self.config3.val = 5 + raise ExpectationFailed(TypeError, "configs should be immutable") + except TypeError: + pass + + def check_results(self, flow, checker): + for step in flow: + checker.assert_artifact( + step.name, + "config", + { + "value": 42, + "str_value": "foobar", + "project_name": "test_config", + "nested": {"value": 43}, + }, + ) + checker.assert_artifact(step.name, "config2", {"default_param": 123}) + checker.assert_artifact(step.name, "silly_config", {"baz": "amazing"}) From 8c25d42a8d60ba9abd4d1b8053ff8c97381b90d3 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 10 Sep 2024 12:54:48 -0700 Subject: [PATCH 12/30] Fixup conda decorator --- metaflow/plugins/pypi/conda_decorator.py | 31 +++++++++++++++++++----- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/metaflow/plugins/pypi/conda_decorator.py b/metaflow/plugins/pypi/conda_decorator.py index 49b42f7e55e..f881f7f6b05 100644 --- a/metaflow/plugins/pypi/conda_decorator.py +++ b/metaflow/plugins/pypi/conda_decorator.py @@ -49,14 +49,23 @@ class CondaStepDecorator(StepDecorator): # CONDA_CHANNELS in their environment. For pinning specific packages to specific # conda channels, users can specify channel::package as the package name. - def init(self): - super(CondaStepDecorator, self).init() - + def __init__(self, attributes=None, statically_defined=False): self._user_defined_attributes = ( attributes.copy() if attributes is not None else {} ) super(CondaStepDecorator, self).__init__(attributes, statically_defined) + def init(self): + super(CondaStepDecorator, self).init() + + # We have to go back and fixup _user_defined_attributes for potential + # config resolution + self._user_defined_attributes = { + k: v + for k, v in self.attributes.items() + if k in self._user_defined_attributes + } + # Support legacy 'libraries=' attribute for the decorator. self.attributes["packages"] = { **self.attributes["libraries"], @@ -334,12 +343,22 @@ class CondaFlowDecorator(FlowDecorator): "disabled": None, } + def __init__(self, attributes=None, statically_defined=False): + self._user_defined_attributes = ( + attributes.copy() if attributes is not None else {} + ) + super(CondaFlowDecorator, self).__init__(attributes, statically_defined) + def init(self): super(CondaFlowDecorator, self).init() - self._user_defined_attributes = ( - self.attributes.copy() if self.attributes is not None else {} - ) + # We have to go back and fixup _user_defined_attributes for potential + # config resolution + self._user_defined_attributes = { + k: v + for k, v in self.attributes.items() + if k in self._user_defined_attributes + } # Support legacy 'libraries=' attribute for the decorator. 
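The two-phase capture above matters because `__init__` runs at class-definition time with the raw user-supplied attributes, while `init()` runs after configuration expressions in `self.attributes` have been resolved; only the resolved values for the keys the user actually passed should survive. A reduced sketch of that pattern (`DemoDecorator` is hypothetical, not the real decorator base class):

```
class DemoDecorator:
    # Hypothetical defaults; real decorators define many more.
    defaults = {"packages": {}, "python": None}

    def __init__(self, attributes=None):
        # Record which keys the *user* passed before any config resolution
        # rewrites self.attributes.
        self._user_defined_attributes = (
            attributes.copy() if attributes is not None else {}
        )
        self.attributes = {**self.defaults, **self._user_defined_attributes}

    def init(self):
        # After delayed config values are resolved, keep only the resolved
        # values for keys the user actually set.
        self._user_defined_attributes = {
            k: v
            for k, v in self.attributes.items()
            if k in self._user_defined_attributes
        }


deco = DemoDecorator({"python": "3.11"})
deco.init()
assert deco._user_defined_attributes == {"python": "3.11"}
```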
self.attributes["packages"] = {

From 83d89fa69b6804d63a10981660a12e74dabde6eb Mon Sep 17 00:00:00 2001
From: Romain Cledat
Date: Tue, 10 Sep 2024 15:15:48 -0700
Subject: [PATCH 13/30] Fix parallel tests

---
 metaflow/cli.py               |  2 +-
 metaflow/cli_args.py          | 15 +++++++++++++++
 metaflow/config_parameters.py |  7 ++-----
 metaflow/util.py              |  4 ++++
 4 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/metaflow/cli.py b/metaflow/cli.py
index da418ad20cd..e44baf9ecde 100644
--- a/metaflow/cli.py
+++ b/metaflow/cli.py
@@ -325,7 +325,7 @@ def start(
     pylint=None,
     event_logger=None,
     monitor=None,
-    local_info_file=None,
+    local_config_file=None,
     config_options=None,
     **deco_options
 ):
diff --git a/metaflow/cli_args.py b/metaflow/cli_args.py
index 40918f984ff..62f997b8566 100644
--- a/metaflow/cli_args.py
+++ b/metaflow/cli_args.py
@@ -12,7 +12,14 @@
 # well as the converting of options in runtime.py. We should make it so that we
 # can properly shlex things and un-shlex when using. Ideally this should all be
 # done in one place.
+#
+# NOTE: There is an important difference between these two as well:
+# - this one will include local_config_file whereas the other one WILL NOT.
+# This is because this is used when constructing the parallel UBF command which
+# executes locally and therefore needs the local_config_file but the other (remote)
+# commands do not.

+from .config_parameters import ConfigInput
 from .util import to_unicode

@@ -65,6 +72,14 @@ def _options(mapping):
             # keyword in Python, so we call it 'decospecs' in click args
             if k == "decospecs":
                 k = "with"
+            if k == "config_options":
+                # Special handling here since we gather them all in one option but
+                # actually need to send them one at a time as --config <name> kv.<name>
+                for config_name in v.keys():
+                    yield "--config"
+                    yield to_unicode(config_name)
+                    yield to_unicode(ConfigInput.make_key_name(config_name))
+                continue
             k = k.replace("_", "-")
             v = v if isinstance(v, (list, tuple, set)) else [v]
             for value in v:
diff --git a/metaflow/config_parameters.py b/metaflow/config_parameters.py
index eb20fd6f1f2..fe823d1e1d8 100644
--- a/metaflow/config_parameters.py
+++ b/metaflow/config_parameters.py
@@ -343,12 +343,9 @@ class LocalFileInput(click.Path):
     name = "LocalFileInput"

     def convert(self, value, param, ctx):
-        super().convert(value, param, ctx)
+        v = super().convert(value, param, ctx)
         ConfigInput.set_config_file(value)
-        # This purposefully returns None which means it is *not* passed down
-        # when commands use ctx.parent.parent.params to get all the configuration
-        # values (it becomes hidden because its only purpose is to update the
-        # config file in ConfigInput)
+        return v

     def __str__(self):
         return repr(self)
diff --git a/metaflow/util.py b/metaflow/util.py
index a7fcd315920..286dfe0580f 100644
--- a/metaflow/util.py
+++ b/metaflow/util.py
@@ -315,6 +315,10 @@ def dict_to_cli_options(params):
             yield to_unicode(config_name)
             yield to_unicode(ConfigInput.make_key_name(config_name))
             continue
+        if k == "local_config_file":
+            # Skip this value -- it should only be used locally and never when
+            # forming another command line
+            continue
         k = k.replace("_", "-")
         v = v if isinstance(v, (list, tuple, set)) else [v]
         for value in v:
From 6218b8b38cfc33b2945daa780acd03b0418113dc Mon Sep 17 00:00:00 2001
From: Romain Cledat
Date: Tue, 10 Sep 2024 15:48:50 -0700
Subject: [PATCH 14/30] Fix current singleton test (conflict with `steps`)

---
 test/core/tests/current_singleton.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git
a/test/core/tests/current_singleton.py b/test/core/tests/current_singleton.py index b8064b7f9ee..036005796e3 100644 --- a/test/core/tests/current_singleton.py +++ b/test/core/tests/current_singleton.py @@ -22,7 +22,7 @@ def step_start(self): self.flow_names = {current.flow_name} self.run_ids = {current.run_id} self.origin_run_ids = {current.origin_run_id} - self.steps = {current.step_name} + self.seen_steps = {current.step_name} self.step_name = current.step_name self.namespaces = {current.namespace} self.usernames = {current.username} @@ -49,7 +49,7 @@ def step_join(self): self.flow_names = set(chain(*(i.flow_names for i in inputs))) self.run_ids = set(chain(*(i.run_ids for i in inputs))) self.origin_run_ids = set(chain(*(i.origin_run_ids for i in inputs))) - self.steps = set(chain(*(i.steps for i in inputs))) + self.seen_steps = set(chain(*(i.seen_steps for i in inputs))) self.namespaces = set(chain(*(i.namespaces for i in inputs))) self.usernames = set(chain(*(i.usernames for i in inputs))) self.task_data = {} @@ -68,7 +68,7 @@ def step_join(self): self.origin_run_ids.add(current.origin_run_id) self.namespaces.add(current.namespace) self.usernames.add(current.username) - self.steps.add(current.step_name) + self.seen_steps.add(current.step_name) self.uuid = str(uuid4()) self.task_data[current.pathspec] = self.uuid self.tags.update(current.tags) @@ -90,7 +90,7 @@ def step_all(self): self.namespaces.add(current.namespace) self.usernames.add(current.username) self.step_name = current.step_name - self.steps.add(current.step_name) + self.seen_steps.add(current.step_name) self.uuid = str(uuid4()) self.task_data[current.pathspec] = self.uuid self.tags.update(current.tags) From cc23d7bb5ef92636ba54e63e5bbaa226671abd22 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 12 Sep 2024 08:24:24 -0700 Subject: [PATCH 15/30] Call decorator init method on non-static decorators --- metaflow/cli.py | 1 + metaflow/decorators.py | 12 +++++++----- metaflow/plugins/airflow/airflow_cli.py | 1 + metaflow/plugins/argo/argo_workflows_cli.py | 1 + .../plugins/aws/step_functions/step_functions_cli.py | 1 + metaflow/plugins/pypi/pypi_decorator.py | 1 + 6 files changed, 12 insertions(+), 5 deletions(-) diff --git a/metaflow/cli.py b/metaflow/cli.py index e44baf9ecde..55c38ab8a18 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -444,6 +444,7 @@ def start( ) if all_decospecs: decorators._attach_decorators(ctx.obj.flow, all_decospecs) + decorators._init(ctx.obj.flow, only_non_static=True) # Regenerate graph if we attached more decorators ctx.obj.graph = FlowGraph(ctx.obj.flow.__class__) diff --git a/metaflow/decorators.py b/metaflow/decorators.py index b852ade200f..11b7308c7f0 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -545,14 +545,16 @@ def _attach_decorators_to_step(step, decospecs): step.decorators.append(deco) -def _init(flow): +def _init(flow, only_non_static=False): # We get the datastore for the _parameters step which can contain for decorators in flow._flow_decorators.values(): for deco in decorators: - deco.init() - for step in flow: - for deco in step.decorators: - deco.init() + if not only_non_static or not deco.statically_defined: + deco.init() + for flowstep in flow: + for deco in flowstep.decorators: + if not only_non_static or not deco.statically_defined: + deco.init() def _init_flow_decorators( diff --git a/metaflow/plugins/airflow/airflow_cli.py b/metaflow/plugins/airflow/airflow_cli.py index b80d82fbcee..33e0e0f48d6 100644 --- a/metaflow/plugins/airflow/airflow_cli.py 
+++ b/metaflow/plugins/airflow/airflow_cli.py @@ -283,6 +283,7 @@ def make_flow( ): # Attach @kubernetes. decorators._attach_decorators(obj.flow, [KubernetesDecorator.name]) + decorators._init(obj.flow, only_non_static=True) decorators._init_step_decorators( obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger diff --git a/metaflow/plugins/argo/argo_workflows_cli.py b/metaflow/plugins/argo/argo_workflows_cli.py index bd755684102..734c50d21ee 100644 --- a/metaflow/plugins/argo/argo_workflows_cli.py +++ b/metaflow/plugins/argo/argo_workflows_cli.py @@ -470,6 +470,7 @@ def make_flow( decorators._attach_decorators( obj.flow, [KubernetesDecorator.name, EnvironmentDecorator.name] ) + decorators._init(obj.flow, only_non_static=True) decorators._init_step_decorators( obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger diff --git a/metaflow/plugins/aws/step_functions/step_functions_cli.py b/metaflow/plugins/aws/step_functions/step_functions_cli.py index efa4e7f35f4..373b5daef1c 100644 --- a/metaflow/plugins/aws/step_functions/step_functions_cli.py +++ b/metaflow/plugins/aws/step_functions/step_functions_cli.py @@ -326,6 +326,7 @@ def make_flow( # Attach AWS Batch decorator to the flow decorators._attach_decorators(obj.flow, [BatchDecorator.name]) + decorators._init(obj.flow, only_non_static=True) decorators._init_step_decorators( obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger ) diff --git a/metaflow/plugins/pypi/pypi_decorator.py b/metaflow/plugins/pypi/pypi_decorator.py index 17925446bde..3f650281dc5 100644 --- a/metaflow/plugins/pypi/pypi_decorator.py +++ b/metaflow/plugins/pypi/pypi_decorator.py @@ -140,6 +140,7 @@ def flow_init( from metaflow import decorators decorators._attach_decorators(flow, ["pypi"]) + decorators._init(flow, only_non_static=True) # @pypi uses a conda environment to create a virtual environment. # The conda environment can be created through micromamba. 
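Every call site that attaches decorators after command-line parsing (the `make_flow` helpers and `flow_init` above) now follows `_attach_decorators` with `decorators._init(flow, only_non_static=True)`: newly attached decorators get their `init()` hook, while statically defined ones are not initialized a second time. A reduced standalone sketch of that dispatch (stand-in classes, not Metaflow's):

```
class Deco:
    def __init__(self, name, statically_defined):
        self.name = name
        self.statically_defined = statically_defined
        self.inited = 0

    def init(self):
        self.inited += 1


def _init(decorators, only_non_static=False):
    # Mirrors the decorators._init logic: skip statically defined decorators
    # when asked, so a second pass only touches decorators attached later.
    for deco in decorators:
        if not only_non_static or not deco.statically_defined:
            deco.init()


decos = [Deco("environment", statically_defined=True)]
_init(decos)  # initial pass over the statically defined decorators

decos.append(Deco("kubernetes", statically_defined=False))  # e.g. attached via --with
_init(decos, only_non_static=True)  # only the newly attached decorator runs init()

assert [d.inited for d in decos] == [1, 1]
```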
From 74f1be88caa67a4482c8d752b13fae6d25b049ea Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Mon, 30 Sep 2024 13:43:07 -0700 Subject: [PATCH 16/30] Several fixes - Separate out value and file (so default and default_value and --config and --config-value) - Provide classes for step and flow config decorators with proxy objects - Split things into several files (it was getting too long) - Addressed all bugs discussed --- metaflow/__init__.py | 3 +- metaflow/cli.py | 21 +- metaflow/cli_args.py | 10 +- metaflow/config_parameters.py | 627 --------------------- metaflow/decorators.py | 27 +- metaflow/flowspec.py | 87 ++- metaflow/includefile.py | 2 +- metaflow/package.py | 2 +- metaflow/parameters.py | 10 +- metaflow/runner/click_api.py | 2 +- metaflow/runtime.py | 5 +- metaflow/user_configs/__init__.py | 0 metaflow/user_configs/config_decorators.py | 214 +++++++ metaflow/user_configs/config_options.py | 384 +++++++++++++ metaflow/user_configs/config_parameters.py | 350 ++++++++++++ metaflow/util.py | 10 +- test/core/metaflow_test/formatter.py | 2 +- test_config/helloconfig.py | 41 +- 18 files changed, 1071 insertions(+), 726 deletions(-) delete mode 100644 metaflow/config_parameters.py create mode 100644 metaflow/user_configs/__init__.py create mode 100644 metaflow/user_configs/config_decorators.py create mode 100644 metaflow/user_configs/config_options.py create mode 100644 metaflow/user_configs/config_parameters.py diff --git a/metaflow/__init__.py b/metaflow/__init__.py index fbc08342d9c..2ce16d0b909 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -103,7 +103,8 @@ class and related decorators. from .parameters import Parameter, JSONTypeClass, JSONType -from .config_parameters import Config, config_expr, eval_config +from .user_configs.config_parameters import Config, config_expr +from .user_configs.config_decorators import FlowConfigDecorator, StepConfigDecorator # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as diff --git a/metaflow/cli.py b/metaflow/cli.py index 55c38ab8a18..4f9d2759930 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -12,6 +12,7 @@ from .cli_components.utils import LazyGroup, LazyPluginCommandCollection from .datastore import FlowDataStore from .exception import CommandException, MetaflowException +from .flowspec import _FlowState from .graph import FlowGraph from .metaflow_config import ( DECOSPECS, @@ -35,7 +36,8 @@ from .pylint_wrapper import PyLint from .R import metaflow_r_version, use_r from .util import resolve_identity -from .config_parameters import LocalFileInput, config_options +from .user_configs.config_options import LocalFileInput, config_options +from .user_configs.config_parameters import ConfigValue ERASE_TO_EOL = "\033[K" HIGHLIGHT = "red" @@ -326,7 +328,8 @@ def start( event_logger=None, monitor=None, local_config_file=None, - config_options=None, + config_file_options=None, + config_value_options=None, **deco_options ): if quiet: @@ -347,7 +350,12 @@ def start( # process all those decorators that the user added that will modify the flow based # on those configurations. It is important to do this as early as possible since it # actually modifies the flow itself - ctx.obj.flow = ctx.obj.flow._process_config_funcs(config_options) + + # When we process the options, the first one processed will return None and the + # second one processed will return the actual options. The order of processing + # depends on what (and in what order) the user specifies on the command line. 
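Since click runs the same callback once per option and the processing order depends on the command line, the callback cannot assume which of the file options or the value options arrives first: the first call stashes its half and returns None, and only the second call can merge. A toy version of that two-pass merge (hypothetical names, loosely mimicking the behavior described above):

```
class TwoPassMerge:
    # Loosely mimics ConfigInput.process_configs: click calls the same
    # callback once per option; only the second call has both halves.
    def __init__(self):
        self.files = None
        self.values = None

    def process(self, name, value):
        first_call = self.files is None and self.values is None
        if name == "config_value_options":
            self.values = dict(value)
        else:
            self.files = dict(value)
        if first_call:
            return None  # wait for the other option to be processed
        merged = dict(self.files or {})
        merged.update(self.values or {})
        return merged


m = TwoPassMerge()
assert m.process("config_file_options", [("cfg", "cfg.json")]) is None
assert m.process("config_value_options", [("cfg2", '{"a": 1}')]) == {
    "cfg": "cfg.json",
    "cfg2": '{"a": 1}',
}
```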
+ config_options = config_file_options or config_value_options + ctx.obj.flow = ctx.obj.flow._process_config_decorators(config_options) cli_args._set_top_kwargs(ctx.params) ctx.obj.echo = echo @@ -433,7 +441,12 @@ def start( ctx.obj.flow.name, ctx.obj.echo, ctx.obj.flow_datastore, - dict(ctx.obj.flow.configs), + { + k: ConfigValue(v) + for k, v in ctx.obj.flow.__class__._flow_state.get( + _FlowState.CONFIGS, {} + ).items() + }, ) if ctx.invoked_subcommand not in ("run", "resume"): diff --git a/metaflow/cli_args.py b/metaflow/cli_args.py index 62f997b8566..3c411627395 100644 --- a/metaflow/cli_args.py +++ b/metaflow/cli_args.py @@ -19,7 +19,7 @@ # executes locally and therefore needs the local_config_file but the other (remote) # commands do not. -from .config_parameters import ConfigInput +from .user_configs.config_options import ConfigInput from .util import to_unicode @@ -72,11 +72,13 @@ def _options(mapping): # keyword in Python, so we call it 'decospecs' in click args if k == "decospecs": k = "with" - if k == "config_options": + if k in ("config_file_options", "config_value_options"): # Special handling here since we gather them all in one option but actually - # need to send them one at a time using --config kv. + # need to send them one at a time using --config-value kv.. + # Note it can be either config_file_options or config_value_options depending + # on click processing order. for config_name in v.keys(): - yield "--config" + yield "--config-value" yield to_unicode(config_name) yield to_unicode(ConfigInput.make_key_name(config_name)) continue diff --git a/metaflow/config_parameters.py b/metaflow/config_parameters.py deleted file mode 100644 index fe823d1e1d8..00000000000 --- a/metaflow/config_parameters.py +++ /dev/null @@ -1,627 +0,0 @@ -import collections.abc -import json -import os -import re - -from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING - -from metaflow._vendor import click - -from .exception import MetaflowException, MetaflowInternalError - -from .parameters import ( - DeployTimeField, - Parameter, - ParameterContext, - current_flow, -) - -from .util import get_username - -if TYPE_CHECKING: - from metaflow import FlowSpec - -# _tracefunc_depth = 0 - - -# def tracefunc(func): -# """Decorates a function to show its trace.""" - -# @functools.wraps(func) -# def tracefunc_closure(*args, **kwargs): -# global _tracefunc_depth -# """The closure.""" -# print(f"{_tracefunc_depth}: {func.__name__}(args={args}, kwargs={kwargs})") -# _tracefunc_depth += 1 -# result = func(*args, **kwargs) -# _tracefunc_depth -= 1 -# print(f"{_tracefunc_depth} => {result}") -# return result - -# return tracefunc_closure - -CONFIG_FILE = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "CONFIG_PARAMETERS" -) - - -def dump_config_values(flow: "FlowSpec"): - from .flowspec import _FlowState # Prevent circular import - - configs = flow._flow_state.get(_FlowState.CONFIGS) - if configs: - return {"user_configs": configs} - return {} - - -def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[Any, Any]]: - if info_file is None: - info_file = os.path.basename(CONFIG_FILE) - try: - with open(info_file, encoding="utf-8") as contents: - return json.load(contents).get("user_configs", {}) - except IOError: - return None - - -class ConfigValue(collections.abc.Mapping): - """ - ConfigValue is a thin wrapper around an arbitrarily nested dictionary-like - configuration object. It allows you to access elements of this nested structure - using either a "." 
notation or a [] notation. As an example, if your configuration - object is: - {"foo": {"bar": 42}} - you can access the value 42 using either config["foo"]["bar"] or config.foo.bar. - """ - - # Thin wrapper to allow configuration values to be accessed using a "." notation - # as well as a [] notation. - - def __init__(self, data: Dict[Any, Any]): - self._data = data - - def __getattr__(self, key: str) -> Any: - """ - Access an element of this configuration - - Parameters - ---------- - key : str - Element to access - - Returns - ------- - Any - Element of the configuration - """ - if key == "_data": - # Called during unpickling. Special case to not run into infinite loop - # below. - raise AttributeError(key) - - if key in self._data: - return self[key] - raise AttributeError(key) - - def __setattr__(self, name: str, value: Any) -> None: - # Prevent configuration modification - if name == "_data": - return super().__setattr__(name, value) - raise TypeError("ConfigValue is immutable") - - def __getitem__(self, key: Any) -> Any: - """ - Access an element of this configuration - - Parameters - ---------- - key : Any - Element to access - - Returns - ------- - Any - Element of the configuration - """ - value = self._data[key] - if isinstance(value, dict): - value = ConfigValue(value) - return value - - def __len__(self): - return len(self._data) - - def __iter__(self): - return iter(self._data) - - def __repr__(self): - return repr(self._data) - - def __str__(self): - return json.dumps(self._data) - - def to_dict(self) -> Dict[Any, Any]: - """ - Returns a dictionary representation of this configuration object. - - Returns - ------- - Dict[Any, Any] - Dictionary equivalent of this configuration object. - """ - return dict(self._data) - - -class PathOrStr(click.ParamType): - # Click parameter type for a configuration value -- it can either be the string - # representation of the configuration value (like a JSON string or any other - # string that the configuration parser can parse) or the path to a file containing - # such a content. The value will be initially assumed to be that of a file and will - # only be considered not a file if no file exists. - name = "PathOrStr" - - @staticmethod - def convert_value(value): - # Click requires this to be idempotent. We therefore check if the value - # starts with "converted:" which is our marker for "we already processed this - # value". - if value is None: - return None - - if isinstance(value, dict): - return "converted:" + json.dumps(value) - - if value.startswith("converted:"): - return value - - if os.path.isfile(value): - try: - with open(value, "r", encoding="utf-8") as f: - content = f.read() - except OSError as e: - raise click.UsageError( - "Could not read configuration file '%s'" % value - ) from e - return "converted:" + content - return "converted:" + value - - def convert(self, value, param, ctx): - return self.convert_value(value) - - -class ConfigInput: - # ConfigInput is an internal class responsible for processing all the --config - # options. It gathers information from the --local-config-file (to figure out - # where options are stored) and is also responsible for processing any `--config` - # options and processing the default value of `Config(...)` objects. - - # It will then store this information in the flow spec for use later in processing. - # It is stored in the flow spec to avoid being global to support the Runner. 
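This class is deleted here only to move, as part of this same commit, to `metaflow/user_configs/config_parameters.py`. The access pattern it implements, attribute and item lookup over one immutable nested mapping, is small enough to sketch standalone (`DotMap` below is an illustration, not the relocated class):

```
import collections.abc


class DotMap(collections.abc.Mapping):
    # Minimal immutable mapping supporting both cfg["a"]["b"] and cfg.a.b.
    def __init__(self, data):
        object.__setattr__(self, "_data", dict(data))

    def __getitem__(self, key):
        value = self._data[key]
        return DotMap(value) if isinstance(value, dict) else value

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key) from None

    def __setattr__(self, name, value):
        raise TypeError("DotMap is immutable")

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)


cfg = DotMap({"foo": {"bar": 42}})
assert cfg.foo.bar == cfg["foo"]["bar"] == 42
```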
- - loaded_configs = None # type: Optional[Dict[str, Dict[Any, Any]]] - config_file = None # type: Optional[str] - - def __init__( - self, - req_configs: List[str], - defaults: Dict[str, Union[str, Dict[Any, Any]]], - parsers: Dict[str, Callable[[str], Dict[Any, Any]]], - ): - self._req_configs = set(req_configs) - self._defaults = defaults - self._parsers = parsers - - @staticmethod - def make_key_name(name: str) -> str: - # Special mark to indicate that the configuration value is not content or a file - # name but a value that should be read in the config file (effectively where - # the value has already been materialized). - return "kv." + name.lower() - - @classmethod - def set_config_file(cls, config_file: str): - cls.config_file = config_file - - @classmethod - def get_config(cls, config_name: str) -> Optional[Dict[Any, Any]]: - if cls.loaded_configs is None: - all_configs = load_config_values(cls.config_file) - if all_configs is None: - raise MetaflowException( - "Could not load expected configuration values " - "from the CONFIG_PARAMETERS file. This is a Metaflow bug. " - "Please contact support." - ) - cls.loaded_configs = all_configs - return cls.loaded_configs.get(config_name, None) - - def process_configs(self, ctx, param, value): - from .cli import echo_always, echo_dev_null # Prevent circular import - from .flowspec import _FlowState # Prevent circular import - - flow_cls = getattr(current_flow, "flow_cls", None) - if flow_cls is None: - # This is an error - raise MetaflowInternalError( - "Config values should be processed for a FlowSpec" - ) - flow_cls._flow_state[_FlowState.CONFIGS] = {} - # This function is called by click when processing all the --config options. - # The value passed in is a list of tuples (name, value). - # Click will provide: - # - all the defaults if nothing is provided on the command line - # - provide *just* the passed in value if anything is provided on the command - # line. - # - # We therefore "merge" the defaults with what we are provided by click to form - # a full set of values - # We therefore get a full set of values where: - # - the name will correspond to the configuration name - # - the value will be the default (including None if there is no default) or - # the string representation of the value (this will always include - # the "converted:" prefix as it will have gone through the PathOrStr - # conversion function). A value of None basically means that the config has - # no default and was not specified on the command line. - to_return = {} - - merged_configs = dict(self._defaults) - for name, val in value: - # Don't replace by None -- this is needed to avoid replacing a function - # default - if val: - merged_configs[name] = val - - print("PARAMS: %s" % str(ctx.params)) - missing_configs = set() - for name, val in merged_configs.items(): - name = name.lower() - # convert is idempotent so if it is already converted, it will just return - # the value. This is used to make sure we process the defaults which do - # NOT make it through the PathOrStr convert function - if isinstance(val, DeployTimeField): - # This supports a default value that is a deploy-time field (similar - # to Parameter).) - # We will form our own context and pass it down -- note that you cannot - # use configs in the default value of configs as this introduces a bit - # of circularity. Note also that quiet and datastore are *eager* - # options so are available here. 
- param_ctx = ParameterContext( - flow_name=ctx.obj.flow.name, - user_name=get_username(), - parameter_name=name, - logger=echo_dev_null if ctx.params["quiet"] else echo_always, - ds_type=ctx.params["datastore"], - configs=None, - ) - val = val.fun(param_ctx) - val = PathOrStr.convert_value(val) - if val is None: - missing_configs.add(name) - continue - val = val[10:] # Remove the "converted:" prefix - if val.startswith("kv."): - # This means to load it from a file - read_value = self.get_config(val[3:]) - if read_value is None: - raise click.UsageError( - "Could not find configuration '%s' in INFO file" % val - ) - flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value - to_return[name] = ConfigValue(read_value) - else: - if self._parsers[name]: - read_value = self._parsers[name](val) - else: - try: - read_value = json.loads(val) - except json.JSONDecodeError as e: - raise click.UsageError( - "Configuration value for '%s' is not valid JSON" % name - ) from e - # TODO: Support YAML - flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value - to_return[name] = ConfigValue(read_value) - - if missing_configs.intersection(self._req_configs): - raise click.UsageError( - "Missing configuration values for %s" % ", ".join(missing_configs) - ) - return to_return - - def __str__(self): - return repr(self) - - def __repr__(self): - return "ConfigInput" - - -class LocalFileInput(click.Path): - # Small wrapper around click.Path to set the value from which to read configuration - # values. This is set immediately upon processing the --local-config-file - # option and will therefore then be available when processing any of the other - # --config options (which will call ConfigInput.process_configs - name = "LocalFileInput" - - def convert(self, value, param, ctx): - v = super().convert(value, param, ctx) - ConfigInput.set_config_file(value) - return v - - def __str__(self): - return repr(self) - - def __repr__(self): - return "LocalFileInput" - - -ConfigArgType = Union[str, Dict[Any, Any]] - - -class MultipleTuple(click.Tuple): - # Small wrapper around a click.Tuple to allow the environment variable for - # configurations to be a JSON string. Otherwise the default behavior is splitting - # by whitespace which is totally not what we want - # You can now pass multiple configuration options through an environment variable - # using something like: - # METAFLOW_FLOW_CONFIG='{"config1": "filenameforconfig1.json", "config2": {"key1": "value1"}}' - - def split_envvar_value(self, rv): - loaded = json.loads(rv) - return list( - item if isinstance(item, str) else json.dumps(item) - for pair in loaded.items() - for item in pair - ) - - -class DelayEvaluator: - """ - Small wrapper that allows the evaluation of a Config() value in a delayed manner. - This is used when we want to use config.* values in decorators for example. 
- """ - - id_pattern = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") - - def __init__(self, ex: str): - self._config_expr = ex - if self.id_pattern.match(self._config_expr): - # This is a variable only so allow things like config_expr("config").var - self._is_var_only = True - self._access = [] - else: - self._is_var_only = False - self._access = None - - def __getattr__(self, name): - if self._access is None: - raise AttributeError() - self._access.append(name) - return self - - def __call__(self, ctx=None, deploy_time=False): - from .flowspec import _FlowState # Prevent circular import - - # Two additional arguments are only used by DeployTimeField which will call - # this function with those two additional arguments. They are ignored. - flow_cls = getattr(current_flow, "flow_cls", None) - if flow_cls is None: - # We are not executing inside a flow (ie: not the CLI) - raise MetaflowException( - "Config object can only be used directly in the FlowSpec defining them. " - "If using outside of the FlowSpec, please use ConfigEval" - ) - if self._access is not None: - # Build the final expression by adding all the fields in access as . fields - self._config_expr = ".".join([self._config_expr] + self._access) - # Evaluate the expression setting the config values as local variables - return eval( - self._config_expr, - globals(), - { - k: ConfigValue(v) - for k, v in flow_cls._flow_state.get(_FlowState.CONFIGS, {}).items() - }, - ) - - -def config_expr(expr: str) -> DelayEvaluator: - """ - Function to allow you to use an expression involving a config parameter in - places where it may not be directory accessible or if you want a more complicated - expression than just a single variable. - - You can use it as follows: - - When the config is not directly accessible: - - @project(name=config_expr("config").project.name) - class MyFlow(FlowSpec): - config = Config("config") - ... - - When you want a more complex expression: - class MyFlow(FlowSpec): - config = Config("config") - - @environment(vars={"foo": config_expr("config.bar.baz.lower()")}) - @step - def start(self): - ... - - Parameters - ---------- - expr : str - Expression using the config values. - """ - return DelayEvaluator(expr) - - -def eval_config(f: Callable[["FlowSpec"], "FlowSpec"]) -> "FlowSpec": - """ - Decorator to allow you to add Python decorators to a FlowSpec that makes use of - user configurations. - - As an example: - - ``` - def parameterize(f): - for s in f: - # Iterate over all the steps - if s.name in f.config.add_env_to_steps: - setattr(f, s.name) = environment(vars={**f.config.env_vars})(s) - return f - - @eval_config(parameterize) - class MyFlow(FlowSpec): - config = Config("config") - ... - ``` - - allows you to add an environment decorator to all steps in `add_env_to_steps`. Both - the steps to add this decorator to and the values to add are extracted from the - configuration passed to the Flow through config. - - Parameters - ---------- - f : Callable[[FlowSpec], FlowSpec] - Decorator function - - Returns - ------- - FlowSpec - The modified FlowSpec - """ - - def _wrapper(flow_spec: "FlowSpec"): - from .flowspec import _FlowState - - flow_spec._flow_state.setdefault(_FlowState.CONFIG_FUNCS, []).append(f) - return flow_spec - - return _wrapper - - -class Config(Parameter): - """ - Includes a configuration for this flow. 
- - `Config` is a special type of `Parameter` but differs in a few key areas: - - it is immutable and determined at deploy time (or prior to running if not deploying - to a scheduler) - - as such, it can be used anywhere in your code including in Metaflow decorators - - - Parameters - ---------- - name : str - User-visible configuration name. - default : Union[str, Dict[Any, Any], Callable[[ParameterContext], Union[str, Dict[Any, Any]]]], optional, default None - Default value for the parameter. A function - implies that the value will be computed using that function. - help : str, optional, default None - Help text to show in `run --help`. - required : bool, default False - Require that the user specified a value for the parameter. Note that if - a default is provided, the required flag is ignored. - parser : Callable[[str], Dict[Any, Any]], optional, default None - An optional function that can parse the configuration string into an arbitrarily - nested dictionary. - show_default : bool, default True - If True, show the default value in the help text. - """ - - IS_FLOW_PARAMETER = True - - def __init__( - self, - name: str, - default: Optional[ - Union[ - str, - Dict[Any, Any], - Callable[[ParameterContext], Union[str, Dict[Any, Any]]], - ] - ] = None, - help: Optional[str] = None, - required: bool = False, - parser: Optional[Callable[[str], Dict[Any, Any]]] = None, - **kwargs: Dict[str, str] - ): - - print("Config %s, default is %s" % (name, default)) - super(Config, self).__init__( - name, default=default, required=required, help=help, type=str, **kwargs - ) - - if isinstance(kwargs.get("default", None), str): - kwargs["default"] = json.dumps(kwargs["default"]) - self.parser = parser - - def load_parameter(self, v): - return v - - def __getattr__(self, name): - ev = DelayEvaluator(self.name) - return ev.__getattr__(name) - - -def config_options(cmd): - help_strs = [] - required_names = [] - defaults = {} - config_seen = set() - parsers = {} - flow_cls = getattr(current_flow, "flow_cls", None) - if flow_cls is None: - return cmd - - parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] - config_opt_required = False - # List all the configuration options - for arg in parameters[::-1]: - save_default = arg.kwargs.get("default", None) - kwargs = arg.option_kwargs(False) - if arg.name.lower() in config_seen: - msg = ( - "Multiple configurations use the same name '%s'. Note that names are " - "case-insensitive. Please change the " - "names of some of your configurations" % arg.name - ) - raise MetaflowException(msg) - config_seen.add(arg.name.lower()) - if kwargs["required"]: - required_names.append(arg.name) - if save_default is None: - # We need at least one option if we have a required configuration. - config_opt_required = True - defaults[arg.name.lower()] = save_default - help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) - parsers[arg.name.lower()] = arg.parser - - print( - "DEFAULTS %s" - % str(dict((k, v if not callable(v) else "FUNC") for k, v in defaults.items())) - ) - if not config_seen: - # No configurations -- don't add anything - return cmd - - help_str = ( - "Configuration options for the flow. " - "Multiple configurations can be specified." 
-    )
-    help_str = "\n\n".join([help_str] + help_strs)
-    cmd.params.insert(
-        0,
-        click.Option(
-            ["--config", "config_options"],
-            nargs=2,
-            multiple=True,
-            type=MultipleTuple([click.Choice(config_seen), PathOrStr()]),
-            callback=ConfigInput(required_names, defaults, parsers).process_configs,
-            help=help_str,
-            envvar="METAFLOW_FLOW_CONFIG",
-            show_default=False,
-            default=[(k, v if not callable(v) else None) for k, v in defaults.items()],
-            required=config_opt_required,
-        ),
-    )
-    return cmd
diff --git a/metaflow/decorators.py b/metaflow/decorators.py
index 11b7308c7f0..b615d5744ac 100644
--- a/metaflow/decorators.py
+++ b/metaflow/decorators.py
@@ -12,7 +12,7 @@
 )

 from .parameters import current_flow
-from .config_parameters import DelayEvaluator
+from .user_configs.config_parameters import DelayEvaluator

 from metaflow._vendor import click
@@ -116,10 +116,12 @@ class Decorator(object):
     def __init__(self, attributes=None, statically_defined=False):
         self.attributes = self.defaults.copy()
         self.statically_defined = statically_defined
+        self._user_defined_attributes = set()

         if attributes:
             for k, v in attributes.items():
-                if k in self.defaults:
+                self._user_defined_attributes.add(k)
+                if k in self.defaults or k.startswith("_unpacked_delayed_"):
                     self.attributes[k] = v
                 else:
                     raise InvalidDecoratorAttribute(self.name, k, self.defaults)
@@ -146,7 +148,27 @@ def _resolve_delayed_evaluator(v):
                 return {_resolve_delayed_evaluator(x) for x in v}
             return v

+        # Expand any _unpacked_delayed_ attributes. These are special attributes
+        # that allow the delayed unpacking of configuration values.
+        delayed_unpack_keys = [
+            k for k in self.attributes if k.startswith("_unpacked_delayed_")
+        ]
+        if delayed_unpack_keys:
+            for k in delayed_unpack_keys:
+                unpacked = _resolve_delayed_evaluator(self.attributes[k])
+                for uk, uv in unpacked.items():
+                    if uk in self._user_defined_attributes:
+                        raise SyntaxError(
+                            "keyword argument repeated: %s" % uk, "", 0, ""
+                        )
+                    self._user_defined_attributes.add(uk)
+                    self.attributes[uk] = uv
+                del self.attributes[k]
+
+        # Now resolve all attributes
         for k, v in self.attributes.items():
+            # At this point, a value may still be a delayed evaluator (a config
+            # expression); resolve it now that configurations are available.
             self.attributes[k] = _resolve_delayed_evaluator(v)

     @classmethod
@@ -460,6 +482,7 @@ def _base_step_decorator(decotype, *args, **kwargs):
     Decorator prototype for all step decorators. This function gets specialized
     and imported for all decorators types by _import_plugin_decorators().
     """
+
     if args:
         # No keyword arguments specified for the decorator, e.g. @foobar.
         # The first argument is the function to be decorated.
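The `_unpacked_delayed_` machinery above lets a decorator invocation along the lines of `@resources(**config.resources)` defer the dictionary unpacking until configurations have been loaded, at which point the keys are spliced in as if the user had passed them directly. A self-contained sketch of the resolution step, with `Delayed` and `resolve` as illustrative stand-ins for `DelayEvaluator` and the `_resolve_delayed_evaluator` logic:

```
class Delayed:
    # Illustrative stand-in for DelayEvaluator: a value that can only be
    # computed once the user configurations have been loaded.
    def __init__(self, fetch):
        self._fetch = fetch

    def __call__(self):
        return self._fetch()


def resolve(attributes, user_defined):
    # First expand the "unpack later" markers (the _unpacked_delayed_ keys),
    # refusing duplicates, then resolve any remaining delayed values.
    for key in [k for k in attributes if k.startswith("_unpacked_delayed_")]:
        for uk, uv in attributes.pop(key)().items():
            if uk in user_defined:
                raise SyntaxError("keyword argument repeated: %s" % uk)
            user_defined.add(uk)
            attributes[uk] = uv
    return {k: v() if isinstance(v, Delayed) else v for k, v in attributes.items()}


configs = {"resources": {"cpu": 2, "memory": 4096}}
attrs = {
    "image": "py311",
    "_unpacked_delayed_0": Delayed(lambda: configs["resources"]),
}
assert resolve(attrs, {"image"}) == {"image": "py311", "cpu": 2, "memory": 4096}
```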
diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 3fd3f86aa0c..a501832914e 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -14,6 +14,7 @@ from .exception import ( MetaflowException, MissingInMergeArtifactsException, + MetaflowInternalError, UnhandledInMergeArtifactsException, ) @@ -21,7 +22,12 @@ from .graph import FlowGraph from .unbounded_foreach import UnboundedForeachInput -from .config_parameters import ConfigInput, ConfigValue +from .user_configs.config_decorators import ( + FlowConfigDecorator, + FlowSpecProxy, + StepConfigDecorator, + StepProxy, +) from .util import to_pod from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS @@ -69,7 +75,7 @@ def __getitem__(self, item): class _FlowState(Enum): CONFIGS = 1 - CONFIG_FUNCS = 2 + CONFIG_DECORATORS = 2 CACHED_PARAMETERS = 3 @@ -90,52 +96,6 @@ def __new__(cls, name, bases, dct): return f - @property - def configs(cls) -> Generator[Tuple[str, "ConfigValue"], None, None]: - """ - Iterate over all user configurations in this flow - - Use this to parameterize your flow based on configuration. As an example: - ``` - def parametrize(flow): - val = next(flow.configs)[1].steps.start.cpu - flow.start = environment(vars={'mycpu': val})(flow.start) - return flow - - @parametrize - class TestFlow(FlowSpec): - config = Config('myconfig.json') - - @step - def start(self): - pass - ``` - can be used to add an environment decorator to the `start` step. - - Yields - ------ - Tuple[str, ConfigValue] - Iterates over the configurations of the flow - """ - # When configs are parsed, they are loaded in _flow_state[_FlowState.CONFIGS] - for name, value in cls._flow_state.get(_FlowState.CONFIGS, {}).items(): - yield name, ConfigValue(value) - - @property - def steps(cls) -> Generator[Tuple[str, Any], None, None]: - """ - Iterate over all the steps in this flow - - Yields - ------ - Tuple[str, Any] - A tuple with the step name and the step itself - """ - for var in dir(cls): - potential_step = getattr(cls, var) - if callable(potential_step) and hasattr(potential_step, "is_step"): - yield var, potential_step - class FlowSpec(metaclass=FlowSpecMeta): """ @@ -222,11 +182,11 @@ def _check_parameters(self): ) seen.add(norm) - def _process_config_funcs(self, config_options): + def _process_config_decorators(self, config_options): current_cls = self.__class__ # Fast path for no user configurations - if not self._flow_state.get(_FlowState.CONFIG_FUNCS): + if not self._flow_state.get(_FlowState.CONFIG_DECORATORS): return self # We need to convert all the user configurations from DelayedEvaluationParameters @@ -246,10 +206,29 @@ def _process_config_funcs(self, config_options): val = val() setattr(current_cls, var, val) - # Run all the functions. 
They will now be able to access the configuration
-        # values directly from the class
-        for func in self._flow_state[_FlowState.CONFIG_FUNCS]:
-            current_cls = func(current_cls)
+        # Run all the decorators
+        for deco in self._flow_state[_FlowState.CONFIG_DECORATORS]:
+            if isinstance(deco, FlowConfigDecorator):
+                # Sanity check to make sure we are applying the decorator to the right
+                # class
+                if deco._flow_cls != current_cls and not issubclass(
+                    current_cls, deco._flow_cls
+                ):
+                    raise MetaflowInternalError(
+                        "FlowConfigDecorator registered on the wrong flow -- "
+                        "expected %s but got %s"
+                        % (deco._flow_cls.__name__, current_cls.__name__)
+                    )
+                deco.evaluate(FlowSpecProxy(current_cls))
+            elif isinstance(deco, StepConfigDecorator):
+                # Again some sanity checks
+                if deco._flow_cls != current_cls:
+                    raise MetaflowInternalError(
+                        "StepConfigDecorator registered on the wrong flow -- "
+                        "expected %s but got %s"
+                        % (deco._flow_cls.__name__, current_cls.__name__)
+                    )
+                deco.evaluate(StepProxy(deco._my_step))

         # Reset all configs that were already present in the class.
         # TODO: This means that users can't override configs directly. Not sure if this
diff --git a/metaflow/includefile.py b/metaflow/includefile.py
index 499b4cd6a90..c81a701dfb3 100644
--- a/metaflow/includefile.py
+++ b/metaflow/includefile.py
@@ -20,7 +20,7 @@
 )

 from .plugins import DATACLIENTS
-from .config_parameters import ConfigValue
+from .user_configs.config_parameters import ConfigValue
 from .util import get_username

 import functools
diff --git a/metaflow/package.py b/metaflow/package.py
index 9666d929d08..1385883d5a7 100644
--- a/metaflow/package.py
+++ b/metaflow/package.py
@@ -6,7 +6,7 @@
 import json
 from io import BytesIO

-from .config_parameters import CONFIG_FILE, dump_config_values
+from .user_configs.config_parameters import CONFIG_FILE, dump_config_values
 from .extension_support import EXT_PKG, package_mfext_all
 from .metaflow_config import DEFAULT_PACKAGE_SUFFIXES
 from .exception import MetaflowException
diff --git a/metaflow/parameters.py b/metaflow/parameters.py
index f020a5cf406..8a29a787808 100644
--- a/metaflow/parameters.py
+++ b/metaflow/parameters.py
@@ -15,7 +15,7 @@
 )

 if TYPE_CHECKING:
-    from .config_parameters import ConfigValue
+    from .user_configs.config_parameters import ConfigValue

 try:
     # Python2
@@ -232,7 +232,9 @@ def deploy_time_eval(value):

 # this is called by cli.main
 def set_parameter_context(flow_name, echo, datastore, configs):
-    from .config_parameters import ConfigValue  # Prevent circular dependency
+    from .user_configs.config_parameters import (
+        ConfigValue,
+    )  # Prevent circular dependency

     global context_proto
     context_proto = ParameterContext(
@@ -328,7 +330,9 @@ def __init__(
             int,
             bool,
             Dict[str, Any],
-            Callable[[], Union[str, float, int, bool, Dict[str, Any]]],
+            Callable[
+                [ParameterContext], Union[str, float, int, bool, Dict[str, Any]]
+            ],
         ]
     ] = None,
     type: Optional[
diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py
index 0001ed39a95..96faa235281 100644
--- a/metaflow/runner/click_api.py
+++ b/metaflow/runner/click_api.py
@@ -39,7 +39,7 @@
 from metaflow.exception import MetaflowException
 from metaflow.includefile import FilePathClass
 from metaflow.parameters import JSONTypeClass, flow_context
-from metaflow.config_parameters import LocalFileInput
+from metaflow.user_configs.config_options import LocalFileInput

 # Define a recursive type alias for JSON
 JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None]
diff --git
a/metaflow/runtime.py b/metaflow/runtime.py index 6218e78a03a..2feca2c2be0 100644 --- a/metaflow/runtime.py +++ b/metaflow/runtime.py @@ -44,7 +44,8 @@ UBF_TASK, ) -from .config_parameters import ConfigInput, dump_config_values +from .user_configs.config_options import ConfigInput +from .user_configs.config_parameters import dump_config_values import metaflow.tracing as tracing @@ -1527,7 +1528,7 @@ def __init__(self, task): # in the case of the local runtime) configs = self.task.flow._flow_state.get(_FlowState.CONFIGS) if configs: - self.top_level_options["config"] = [ + self.top_level_options["config-value"] = [ (k, ConfigInput.make_key_name(k)) for k in configs ] diff --git a/metaflow/user_configs/__init__.py b/metaflow/user_configs/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/metaflow/user_configs/config_decorators.py b/metaflow/user_configs/config_decorators.py new file mode 100644 index 00000000000..16e9476e08e --- /dev/null +++ b/metaflow/user_configs/config_decorators.py @@ -0,0 +1,214 @@ +from functools import partial +from typing import Any, Callable, Generator, TYPE_CHECKING, Tuple, Union + +from metaflow.exception import MetaflowException +from metaflow.parameters import Parameter +from metaflow.user_configs.config_parameters import ConfigValue + +if TYPE_CHECKING: + from metaflow.flowspec import FlowSpec + from metaflow.decorators import FlowSpecDerived, StepDecorator + + +class StepProxy: + """ + A StepProxy is a wrapper passed to the `StepConfigDecorator`'s `evaluate` method + to allow the decorator to interact with the step and providing easy methods to + modify the behavior of the step. + """ + + def __init__( + self, + step: Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ], + ): + self._my_step = step + + def remove_decorator(self, deco_name: str, all: bool = True, **kwargs) -> bool: + """ + Remove one or more Metaflow decorators from a step. + + Some decorators can be applied multiple times to a step. This method allows you + to choose which decorator to remove or just remove all of them or one of them. + + Parameters + ---------- + deco_name : str + Name of the decorator to remove + all : bool, default True + If True, remove all instances of the decorator that match the filters + passed using kwargs (or all the instances of the decorator if no filters are + passed). If False, removes only the first found instance of the decorator. + + Returns + ------- + bool + Returns True if at least one decorator was removed. + """ + new_deco_list = [] + did_remove = False + for deco in self._my_step.decorators: + if deco.name == deco_name: + # Check filters + match_ok = True + if kwargs: + for k, v in kwargs.items(): + match_ok = k in deco.attributes and deco.attributes[k] == v + if match_ok is False: + break + if match_ok: + did_remove = True + else: + new_deco_list.append(deco) + else: + new_deco_list.append(deco) + if did_remove and not all: + break + + self._my_step.decorators = new_deco_list + return did_remove + + def add_decorator(self, deco_type: partial, **kwargs) -> None: + """ + Add a Metaflow decorator to a step. 
+ + Parameters + ---------- + deco_type : partial + The decorator class to add to this step + """ + # Prevent circular import + from metaflow.decorators import DuplicateStepDecoratorException, StepDecorator + + # Validate deco_type + if ( + not isinstance(deco_type, partial) + or len(deco_type.args) != 1 + or not issubclass(deco_type.args[0], StepDecorator) + ): + raise TypeError("add_decorator takes a StepDecorator") + + deco_type = deco_type.args[0] + if ( + deco_type.name in [deco.name for deco in self._my_step.decorators] + and not deco_type.allow_multiple + ): + raise DuplicateStepDecoratorException(deco_type.name, self._my_step) + + self._my_step.decorators.append( + deco_type(attributes=kwargs, statically_defined=True) + ) + + +class FlowSpecProxy: + def __init__(self, flow_spec: "FlowSpec"): + self._flow_cls = flow_spec + + @property + def configs(self) -> Generator[Tuple[str, ConfigValue], None, None]: + """ + Iterate over all user configurations in this flow + + Use this to parameterize your flow based on configuration. As an example, the + `evaluate` method of your `FlowConfigDecorator` can use this to add an + environment decorator. + ``` + class MyDecorator(FlowConfigDecorator): + def evaluate(flow: FlowSpecProxy): + val = next(flow.configs)[1].steps.start.cpu + flow.start.add_decorator(environment, vars={'mycpu': val}) + return flow + + @MyDecorator() + class TestFlow(FlowSpec): + config = Config('myconfig.json') + + @step + def start(self): + pass + ``` + can be used to add an environment decorator to the `start` step. + + Yields + ------ + Tuple[str, ConfigValue] + Iterates over the configurations of the flow + """ + from metaflow.flowspec import _FlowState + + # When configs are parsed, they are loaded in _flow_state[_FlowState.CONFIGS] + for name, value in self._flow_cls._flow_state.get( + _FlowState.CONFIGS, {} + ).items(): + yield name, ConfigValue(value) + + @property + def steps(self) -> Generator[Tuple[str, StepProxy], None, None]: + """ + Iterate over all the steps in this flow + + Yields + ------ + Tuple[str, StepProxy] + A tuple with the step name and the step proxy + """ + for var in dir(self._flow_cls): + potential_step = getattr(self._flow_cls, var) + if callable(potential_step) and hasattr(potential_step, "is_step"): + yield var, StepProxy(potential_step) + + def __getattr__(self, name): + # We allow direct access to the steps, configs and parameters but nothing else + attr = getattr(self._flow_cls, name) + if attr: + # Steps + if callable(attr) and hasattr(attr, "is_step"): + return StepProxy(attr) + if name[0] == "_" or name in self._flow_cls._NON_PARAMETERS: + raise AttributeError(self, name) + return attr + raise AttributeError(self, name) + + +class FlowConfigDecorator: + def __call__(self, flow_spec: "FlowSpec") -> "FlowSpec": + from ..flowspec import _FlowState + + flow_spec._flow_state.setdefault(_FlowState.CONFIG_DECORATORS, []).append(self) + self._flow_cls = flow_spec + return flow_spec + + def evaluate(self, flow_proxy: FlowSpecProxy) -> None: + raise NotImplementedError() + + +class StepConfigDecorator: + def __call__( + self, + step: Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ], + ) -> Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ]: + from ..flowspec import _FlowState + + if not hasattr(step, "is_step"): + raise MetaflowException( + "StepConfigDecorators must be applied to a step function" + ) + self._my_step = step + # Get the flow + flow_spec = 
step.__globals__[step.__qualname__.rsplit(".", 1)[0]] + flow_spec._flow_state.setdefault(_FlowState.CONFIG_DECORATORS, []).append(self) + + self._flow_cls = flow_spec + + return step + + def evaluate(self, step_proxy: StepProxy) -> None: + raise NotImplementedError() diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py new file mode 100644 index 00000000000..2af0cc8d807 --- /dev/null +++ b/metaflow/user_configs/config_options.py @@ -0,0 +1,384 @@ +import json +import os + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +from metaflow._vendor import click + +from .config_parameters import CONFIG_FILE, ConfigValue +from ..exception import MetaflowException, MetaflowInternalError +from ..parameters import DeployTimeField, ParameterContext, current_flow +from ..util import get_username + + +def _load_config_values(info_file: Optional[str] = None) -> Optional[Dict[Any, Any]]: + if info_file is None: + info_file = os.path.basename(CONFIG_FILE) + try: + with open(info_file, encoding="utf-8") as contents: + return json.load(contents).get("user_configs", {}) + except IOError: + return None + + +class ConvertPath(click.Path): + name = "ConvertPath" + + def convert(self, value, param, ctx): + if isinstance(value, str) and value.startswith("converted:"): + return value + v = super().convert(value, param, ctx) + return self.convert_value(v) + + @staticmethod + def convert_value(value): + if value is None: + return None + try: + with open(value, "r", encoding="utf-8") as f: + content = f.read() + except OSError: + return "converted:!!NO_FILE!!%s" % value + return "converted:" + content + + +class ConvertDictOrStr(click.ParamType): + name = "ConvertDictOrStr" + + def convert(self, value, param, ctx): + return self.convert_value(value) + + @staticmethod + def convert_value(value): + if value is None: + return None + + if isinstance(value, dict): + return "converted:" + json.dumps(value) + + if value.startswith("converted:"): + return value + + return "converted:" + value + + +class MultipleTuple(click.Tuple): + # Small wrapper around a click.Tuple to allow the environment variable for + # configurations to be a JSON string. Otherwise the default behavior is splitting + # by whitespace which is totally not what we want + # You can now pass multiple configuration options through an environment variable + # using something like: + # METAFLOW_FLOW_CONFIG='{"config1": "filenameforconfig1.json", "config2": {"key1": "value1"}}' + + def split_envvar_value(self, rv): + loaded = json.loads(rv) + return list( + item if isinstance(item, str) else json.dumps(item) + for pair in loaded.items() + for item in pair + ) + + +class ConfigInput: + # ConfigInput is an internal class responsible for processing all the --config and + # --config-value options. + # It gathers information from the --local-config-file (to figure out + # where options are stored) and is also responsible for processing any `--config` or + # `--config-value` options. Note that the process_configs function will be called + # *twice* (once for the configs and another for the config-values). This makes + # this function a little bit more tricky. We need to wait for both calls before + # being able to process anything. + + # It will then store this information in the flow spec for use later in processing. + # It is stored in the flow spec to avoid being global to support the Runner. 
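Taken together, the `config_decorators.py` classes above support user-defined decorators that rewrite a flow once its configurations are known. A usage sketch, assuming the API lands as defined above (the config shape and file names here are hypothetical):

```
from metaflow import Config, FlowConfigDecorator, FlowSpec, environment, step


class ApplyEnvFromConfig(FlowConfigDecorator):
    # evaluate() is called with a FlowSpecProxy after configurations are
    # resolved but before the flow runs; here a config section becomes an
    # @environment decorator on the start step.
    def evaluate(self, flow_proxy):
        for _name, cfg in flow_proxy.configs:
            flow_proxy.start.add_decorator(environment, vars=dict(cfg.env))


@ApplyEnvFromConfig()
class ConfiguredFlow(FlowSpec):
    config = Config("config", required=True)

    @step
    def start(self):
        self.next(self.end)

    @step
    def end(self):
        pass
```

Invoked with something like `python configured_flow.py run --config config my_config.json`, where `my_config.json` contains `{"env": {"MODE": "prod"}}`.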
+ + loaded_configs = None # type: Optional[Dict[str, Dict[Any, Any]]] + config_file = None # type: Optional[str] + + def __init__( + self, + req_configs: List[str], + defaults: Dict[str, Tuple[Union[str, Dict[Any, Any]], bool]], + parsers: Dict[str, Callable[[str], Dict[Any, Any]]], + ): + self._req_configs = set(req_configs) + self._defaults = defaults + self._parsers = parsers + self._path_values = None + self._value_values = None + + @staticmethod + def make_key_name(name: str) -> str: + # Special mark to indicate that the configuration value is not content or a file + # name but a value that should be read in the config file (effectively where + # the value has already been materialized). + return "kv." + name.lower() + + @classmethod + def set_config_file(cls, config_file: str): + cls.config_file = config_file + + @classmethod + def get_config(cls, config_name: str) -> Optional[Dict[Any, Any]]: + if cls.loaded_configs is None: + all_configs = _load_config_values(cls.config_file) + if all_configs is None: + raise MetaflowException( + "Could not load expected configuration values " + "from the CONFIG_PARAMETERS file. This is a Metaflow bug. " + "Please contact support." + ) + cls.loaded_configs = all_configs + return cls.loaded_configs.get(config_name, None) + + def process_configs(self, ctx, param, value): + from ..cli import echo_always, echo_dev_null # Prevent circular import + from ..flowspec import _FlowState # Prevent circular import + + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + # This is an error + raise MetaflowInternalError( + "Config values should be processed for a FlowSpec" + ) + + # This function is called by click when processing all the --config and + # --config-value options. + # The value passed in is a list of tuples (name, value). + # Click will provide: + # - all the defaults if nothing is provided on the command line + # - provide *just* the passed in value if anything is provided on the command + # line. + # + # We need to get all config and config-value options and click will call this + # function twice. We will first get all the values on the command line and + # *then* merge with the defaults to form a full set of values. + # We therefore get a full set of values where: + # - the name will correspond to the configuration name + # - the value will be the default (including None if there is no default) or + # the string representation of the value (this will always include + # the "converted:" prefix as it will have gone through the ConvertPath or + # ConvertDictOrStr conversion function). + # A value of None basically means that the config has + # no default and was not specified on the command line. + + print("Got arg name %s and values %s" % (param.name, str(value))) + do_return = self._value_values is None and self._path_values is None + if param.name == "config_value_options": + self._value_values = {k.lower(): v for k, v in value if v is not None} + else: + self._path_values = {k.lower(): v for k, v in value if v is not None} + if do_return: + # One of config_value_options or config_file_options will be None + return None + + # The second go around, we process all the values and merge them. + # Check that the user didn't provide *both* a path and a value. We know that + # defaults are not both non None (this is an error) so if they are both + # non-None (and actual files) here, it means the user explicitly provided both. 
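# A toy trace of the two-pass merge described above (all values hypothetical).
# Click calls process_configs once for --config-value and once for --config;
# the first call only stashes its values and returns, the second merges both.
value_values = {"config_a": "converted:" + '{"x": 1}'}  # from --config-value
path_values = {"config_b": "converted:" + '{"y": 2}'}   # from --config (file read)
assert not set(value_values).intersection(path_values)  # same name in both is an error
all_values = dict(path_values)
all_values.update(value_values)
assert sorted(all_values) == ["config_a", "config_b"]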
+ common_keys = set(self._value_values or []).intersection( + [ + k + for k, v in self._path_values.items() + if v and not v.startswith("converted:!!NO_FILE!!") + ] + or [] + ) + if common_keys: + raise click.UsageError( + "Cannot provide both a value and a file for the same configuration. " + "Found such values for '%s'" % "', '".join(common_keys) + ) + + # NOTE: Important to start with _path_values as they can have the + # NO_FILE special value. They will be used (and trigger an error) iff there is + # no other value provided. + all_values = dict(self._path_values or {}) + all_values.update(self._value_values or {}) + + print("Got all values: %s" % str(all_values)) + flow_cls._flow_state[_FlowState.CONFIGS] = {} + + to_return = {} + merged_configs = {} + for name, (val, is_path) in self._defaults.items(): + n = name.lower() + if n in all_values: + merged_configs[n] = all_values[n] + else: + if isinstance(val, DeployTimeField): + # This supports a default value that is a deploy-time field (similar + # to Parameter).) + # We will form our own context and pass it down -- note that you cannot + # use configs in the default value of configs as this introduces a bit + # of circularity. Note also that quiet and datastore are *eager* + # options so are available here. + param_ctx = ParameterContext( + flow_name=ctx.obj.flow.name, + user_name=get_username(), + parameter_name=n, + logger=echo_dev_null if ctx.params["quiet"] else echo_always, + ds_type=ctx.params["datastore"], + configs=None, + ) + val = val.fun(param_ctx) + if is_path: + # This is a file path + merged_configs[n] = ConvertPath.convert_value(val) + else: + # This is a value + merged_configs[n] = ConvertDictOrStr.convert_value(val) + + missing_configs = set() + no_file = [] + msgs = [] + for name, val in merged_configs.items(): + if val is None: + missing_configs.add(name) + continue + if val.startswith("converted:!!NO_FILE!!"): + no_file.append(name) + continue + val = val[10:] # Remove the "converted:" prefix + if val.startswith("kv."): + # This means to load it from a file + read_value = self.get_config(val[3:]) + if read_value is None: + raise click.UsageError( + "Could not find configuration '%s' in INFO file" % val + ) + flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value + to_return[name] = ConfigValue(read_value) + else: + if self._parsers[name]: + read_value = self._parsers[name](val) + else: + try: + read_value = json.loads(val) + except json.JSONDecodeError as e: + msgs.append( + "configuration value for '%s' is not valid JSON: %s" + % (name, e) + ) + continue + # TODO: Support YAML + flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value + to_return[name] = ConfigValue(read_value) + + reqs = missing_configs.intersection(self._req_configs) + for missing in reqs: + msgs.append("missing configuration for '%s'" % missing) + for missing in no_file: + msgs.append( + "configuration file '%s' could not be read for '%s'" + % (merged_configs[missing][21:], missing) + ) + if msgs: + raise click.UsageError( + "Bad values passed for configuration options: %s" % ", ".join(msgs) + ) + return to_return + + def __str__(self): + return repr(self) + + def __repr__(self): + return "ConfigInput" + + +class LocalFileInput(click.Path): + # Small wrapper around click.Path to set the value from which to read configuration + # values. 
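# Resolution order for a materialized value, as a standalone sketch (assuming a
# hypothetical parser registered for "sconfig" and none for "config"):
import json

def silly_parser(txt):
    key, value = txt.split(":", 1)
    return {key: value}

parsers = {"sconfig": silly_parser, "config": None}

def resolve(name, raw):
    if parsers[name]:
        return parsers[name](raw)  # a registered parser wins
    return json.loads(raw)         # otherwise the value must be valid JSON

assert resolve("sconfig", "baz:awesome") == {"baz": "awesome"}
assert resolve("config", '{"cpu": 2}') == {"cpu": 2}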
This is set immediately upon processing the --local-config-file + # option and will therefore then be available when processing any of the other + # --config options (which will call ConfigInput.process_configs + name = "LocalFileInput" + + def convert(self, value, param, ctx): + v = super().convert(value, param, ctx) + ConfigInput.set_config_file(value) + return v + + def __str__(self): + return repr(self) + + def __repr__(self): + return "LocalFileInput" + + +def config_options(cmd): + help_strs = [] + required_names = [] + defaults = {} + config_seen = set() + parsers = {} + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + return cmd + + parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] + # List all the configuration options + for arg in parameters[::-1]: + save_default = arg.kwargs.get("default", None) + kwargs = arg.option_kwargs(False) + if arg.name.lower() in config_seen: + msg = ( + "Multiple configurations use the same name '%s'. Note that names are " + "case-insensitive. Please change the " + "names of some of your configurations" % arg.name + ) + raise MetaflowException(msg) + config_seen.add(arg.name.lower()) + if kwargs["required"]: + required_names.append(arg.name) + defaults[arg.name.lower()] = (save_default, arg._default_is_file) + help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) + parsers[arg.name.lower()] = arg.parser + + if not config_seen: + # No configurations -- don't add anything + return cmd + + help_str = ( + "Configuration options for the flow. " + "Multiple configurations can be specified." + ) + help_str = "\n\n".join([help_str] + help_strs) + cb_func = ConfigInput(required_names, defaults, parsers).process_configs + + cmd.params.insert( + 0, + click.Option( + ["--config-value", "config_value_options"], + nargs=2, + multiple=True, + type=MultipleTuple([click.Choice(config_seen), ConvertDictOrStr()]), + callback=cb_func, + help=help_str, + envvar="METAFLOW_FLOW_CONFIG_VALUE", + show_default=False, + default=[ + (k, v[0] if not callable(v[0]) and not v[1] else None) + for k, v in defaults.items() + ], + required=False, + ), + ) + cmd.params.insert( + 0, + click.Option( + ["--config", "config_file_options"], + nargs=2, + multiple=True, + type=MultipleTuple([click.Choice(config_seen), ConvertPath()]), + callback=cb_func, + help=help_str, + envvar="METAFLOW_FLOW_CONFIG", + show_default=False, + default=[ + (k, v[0] if not callable(v) and v[1] else None) + for k, v in defaults.items() + ], + required=False, + ), + ) + return cmd diff --git a/metaflow/user_configs/config_parameters.py b/metaflow/user_configs/config_parameters.py new file mode 100644 index 00000000000..4a312487829 --- /dev/null +++ b/metaflow/user_configs/config_parameters.py @@ -0,0 +1,350 @@ +import collections.abc +import json +import os +import re + +from typing import Any, Callable, Dict, Optional, TYPE_CHECKING, Union + + +from ..exception import MetaflowException + +from ..parameters import ( + Parameter, + ParameterContext, + current_flow, +) + +if TYPE_CHECKING: + from metaflow import FlowSpec + +# _tracefunc_depth = 0 + + +# def tracefunc(func): +# """Decorates a function to show its trace.""" + +# @functools.wraps(func) +# def tracefunc_closure(*args, **kwargs): +# global _tracefunc_depth +# """The closure.""" +# print(f"{_tracefunc_depth}: {func.__name__}(args={args}, kwargs={kwargs})") +# _tracefunc_depth += 1 +# result = func(*args, **kwargs) +# _tracefunc_depth -= 1 +# print(f"{_tracefunc_depth} => 
{result}") +# return result + +# return tracefunc_closure + +CONFIG_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "CONFIG_PARAMETERS" +) + +ID_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") + + +def dump_config_values(flow: "FlowSpec"): + from ..flowspec import _FlowState # Prevent circular import + + configs = flow._flow_state.get(_FlowState.CONFIGS) + if configs: + return {"user_configs": configs} + return {} + + +class ConfigValue(collections.abc.Mapping): + """ + ConfigValue is a thin wrapper around an arbitrarily nested dictionary-like + configuration object. It allows you to access elements of this nested structure + using either a "." notation or a [] notation. As an example, if your configuration + object is: + {"foo": {"bar": 42}} + you can access the value 42 using either config["foo"]["bar"] or config.foo.bar. + + All "keys"" need to be valid Python identifiers + """ + + # Thin wrapper to allow configuration values to be accessed using a "." notation + # as well as a [] notation. + + def __init__(self, data: Dict[str, Any]): + if any(not ID_PATTERN.match(k) for k in data.keys()): + raise MetaflowException( + "All keys in the configuration must be valid Python identifiers" + ) + self._data = data + + def __getattr__(self, key: str) -> Any: + """ + Access an element of this configuration + + Parameters + ---------- + key : str + Element to access + + Returns + ------- + Any + Element of the configuration + """ + if key == "_data": + # Called during unpickling. Special case to not run into infinite loop + # below. + raise AttributeError(key) + + if key in self._data: + return self[key] + raise AttributeError(key) + + def __setattr__(self, name: str, value: Any) -> None: + # Prevent configuration modification + if name == "_data": + return super().__setattr__(name, value) + raise TypeError("ConfigValue is immutable") + + def __getitem__(self, key: Any) -> Any: + """ + Access an element of this configuration + + Parameters + ---------- + key : Any + Element to access + + Returns + ------- + Any + Element of the configuration + """ + value = self._data[key] + if isinstance(value, dict): + value = ConfigValue(value) + return value + + def __len__(self): + return len(self._data) + + def __iter__(self): + return iter(self._data) + + def __repr__(self): + return repr(self._data) + + def __str__(self): + return json.dumps(self._data) + + def to_dict(self) -> Dict[Any, Any]: + """ + Returns a dictionary representation of this configuration object. + + Returns + ------- + Dict[Any, Any] + Dictionary equivalent of this configuration object. + """ + return dict(self._data) + + +class DelayEvaluator(collections.abc.Mapping): + """ + Small wrapper that allows the evaluation of a Config() value in a delayed manner. + This is used when we want to use config.* values in decorators for example. + + It also allows the following "delayed" access on an obj that is a DelayEvaluation + - obj.x.y.z (ie: accessing members of DelayEvaluator; acesses will be delayed until + the DelayEvaluator is evaluated) + - **obj (ie: unpacking the DelayEvaluator as a dictionary). 
Note that this requires + special handling in whatever this is being unpacked into, specifically the handling + of _unpacked_delayed_* + """ + + def __init__(self, ex: str): + self._config_expr = ex + if ID_PATTERN.match(self._config_expr): + # This is a variable only so allow things like config_expr("config").var + self._is_var_only = True + self._access = [] + else: + self._is_var_only = False + self._access = None + + def __iter__(self): + yield "_unpacked_delayed_%d" % id(self) + + def __getitem__(self, key): + if key == "_unpacked_delayed_%d" % id(self): + return self + raise KeyError(key) + + def __len__(self): + return 1 + + def __getattr__(self, name): + if self._access is None: + raise AttributeError() + self._access.append(name) + return self + + def __call__(self, ctx=None, deploy_time=False): + from ..flowspec import _FlowState # Prevent circular import + + # Two additional arguments are only used by DeployTimeField which will call + # this function with those two additional arguments. They are ignored. + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + # We are not executing inside a flow (ie: not the CLI) + raise MetaflowException( + "Config object can only be used directly in the FlowSpec defining them. " + "If using outside of the FlowSpec, please use ConfigEval" + ) + if self._access is not None: + # Build the final expression by adding all the fields in access as . fields + self._config_expr = ".".join([self._config_expr] + self._access) + # Evaluate the expression setting the config values as local variables + try: + return eval( + self._config_expr, + globals(), + { + k: ConfigValue(v) + for k, v in flow_cls._flow_state.get(_FlowState.CONFIGS, {}).items() + }, + ) + except NameError as e: + potential_config_name = self._config_expr.split(".")[0] + if potential_config_name not in flow_cls._flow_state.get( + _FlowState.CONFIGS, {} + ): + raise MetaflowException( + "Config '%s' not found in the flow (maybe not required and not " + "provided?)" % potential_config_name + ) from e + raise + + +def config_expr(expr: str) -> DelayEvaluator: + """ + Function to allow you to use an expression involving a config parameter in + places where it may not be directory accessible or if you want a more complicated + expression than just a single variable. + + You can use it as follows: + - When the config is not directly accessible: + + @project(name=config_expr("config").project.name) + class MyFlow(FlowSpec): + config = Config("config") + ... + - When you want a more complex expression: + class MyFlow(FlowSpec): + config = Config("config") + + @environment(vars={"foo": config_expr("config.bar.baz.lower()")}) + @step + def start(self): + ... + + Parameters + ---------- + expr : str + Expression using the config values. + """ + return DelayEvaluator(expr) + + +class Config(Parameter, collections.abc.Mapping): + """ + Includes a configuration for this flow. + + `Config` is a special type of `Parameter` but differs in a few key areas: + - it is immutable and determined at deploy time (or prior to running if not deploying + to a scheduler) + - as such, it can be used anywhere in your code including in Metaflow decorators + + The value of the configuration is determines as follows: + - use the user-provided file path or value. It is an error to provide both + - if none are present: + - if a default file path (default) is provided, attempt to read this file + - if the file is present, use that value. 
Note that the file will be used + even if it has an invalid syntax + - if the file is not present, and a default value is present, use that + - if still None and is required, this is an error. + + Parameters + ---------- + name : str + User-visible configuration name. + default : Union[str, Callable[[ParameterContext], str], optional, default None + Default path from where to read this configuration. A function implies that the + value will be computed using that function. + You can only specify default or default_value. + default_value : Union[str, Dict[str, Any], Callable[[ParameterContext, Union[str, Dict[str, Any]]], Any], optional, default None + Default value for the parameter. A function + implies that the value will be computed using that function. + You can only specify default or default_value. + help : str, optional, default None + Help text to show in `run --help`. + required : bool, default False + Require that the user specified a value for the parameter. Note that if + a default is provided, the required flag is ignored. + parser : Callable[[str], Dict[Any, Any]], optional, default None + An optional function that can parse the configuration string into an arbitrarily + nested dictionary. + show_default : bool, default True + If True, show the default value in the help text. + """ + + IS_FLOW_PARAMETER = True + + def __init__( + self, + name: str, + default: Optional[Union[str, Callable[[ParameterContext], str]]] = None, + default_value: Optional[ + Union[ + str, + Dict[str, Any], + Callable[[ParameterContext], Union[str, Dict[str, Any]]], + ] + ] = None, + help: Optional[str] = None, + required: bool = False, + parser: Optional[Callable[[str], Dict[Any, Any]]] = None, + **kwargs: Dict[str, str] + ): + + if default and default_value: + raise MetaflowException( + "For config '%s', you can only specify default or default_value, not both" + % name + ) + self._default_is_file = default is not None + default = default or default_value + + super(Config, self).__init__( + name, default=default, required=required, help=help, type=str, **kwargs + ) + + if isinstance(kwargs.get("default", None), str): + kwargs["default"] = json.dumps(kwargs["default"]) + self.parser = parser + + self._delay_self = DelayEvaluator(name.lower()) + + def load_parameter(self, v): + return v + + # Support . syntax + def __getattr__(self, name): + return self._delay_self.__getattr__(name) + + # Next three methods are to implement mapping to support ** syntax + def __iter__(self): + return iter(self._delay_self) + + def __len__(self): + return len(self._delay_self) + + def __getitem__(self, key): + return self._delay_self[key] diff --git a/metaflow/util.py b/metaflow/util.py index 286dfe0580f..7df4b83f2eb 100644 --- a/metaflow/util.py +++ b/metaflow/util.py @@ -297,7 +297,7 @@ def get_metaflow_root(): def dict_to_cli_options(params): # Prevent circular imports - from metaflow.config_parameters import ConfigInput, ConfigValue + from .user_configs.config_options import ConfigInput for k, v in params.items(): # Omit boolean options set to false or None, but preserve options with an empty @@ -307,11 +307,13 @@ def dict_to_cli_options(params): # keyword in Python, so we call it 'decospecs' in click args if k == "decospecs": k = "with" - if k == "config_options": + if k in ("config_file_options", "config_value_options"): # Special handling here since we gather them all in one option but actually - # need to send them one at a time using --config kv. + # need to send them one at a time using --config-value kv. 
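# Standalone sketch of the round-trip just above: already-resolved configs are
# re-sent to subprocesses as `--config-value <name> kv.<name>` so the child
# re-reads the materialized value instead of the original file or literal
# (parameter names hypothetical):
def dict_to_cli_options_sketch(params):
    for k, v in params.items():
        if k in ("config_file_options", "config_value_options"):
            for config_name in v:
                yield "--config-value"
                yield config_name
                # ConfigInput.make_key_name lower-cases and prefixes with "kv."
                yield "kv." + config_name.lower()

args = list(dict_to_cli_options_sketch({"config_value_options": {"config": {"cpu": 2}}}))
assert args == ["--config-value", "config", "kv.config"]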
+ # Note it can be either config_file_options or config_value_options depending + # on click processing order. for config_name in v.keys(): - yield "--config" + yield "--config-value" yield to_unicode(config_name) yield to_unicode(ConfigInput.make_key_name(config_name)) continue diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index 096afe78ebb..f9e39d9e983 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -85,7 +85,7 @@ def _flow_lines(self): tags.extend(tag.split("(")[0] for tag in step.tags) yield 0, "# -*- coding: utf-8 -*-" - yield 0, "from metaflow import Config, config_expr, eval_config, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" + yield 0, "from metaflow import Config, config_expr, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" yield 0, "from metaflow_test import assert_equals, assert_equals_metadata, assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, TestRetry, try_to_get_card" if tags: yield 0, "from metaflow import %s" % ",".join(tags) diff --git a/test_config/helloconfig.py b/test_config/helloconfig.py index be8246cc6b2..897d3167204 100644 --- a/test_config/helloconfig.py +++ b/test_config/helloconfig.py @@ -8,7 +8,8 @@ step, project, config_expr, - eval_config, + FlowConfigDecorator, + StepConfigDecorator, titus, ) @@ -37,26 +38,21 @@ def config_func(ctx): silly_config = "baz:awesome" -def titus_or_not(flow): - to_replace = [] - for name, s in flow.steps: - if name in flow.config.run_on_titus: - to_replace.append((name, titus(cpu=flow.config.cpu_count)(s))) - for name, val in to_replace: - setattr(flow, name, val) - return flow +class TitusOrNot(FlowConfigDecorator): + def evaluate(self, flow_proxy): + for name, s in flow_proxy.steps: + if name in flow_proxy.config.run_on_titus: + s.add_decorator(titus, cpu=flow_proxy.config.cpu_count) -def add_env_to_start(flow): - # Add a decorator directly to a step - flow.start = environment(vars={"hello": config_expr("config").env_to_start})( - flow.start - ) - return flow +class AddEnvToStart(FlowConfigDecorator): + def evaluate(self, flow_proxy): + s = flow_proxy.start + s.add_decorator(environment, vars={"hello": flow_proxy.config.env_to_start}) -@eval_config(titus_or_not) -@add_env_to_start +@TitusOrNot() +@AddEnvToStart() @project(name=config_expr("config").project_name) class HelloConfig(FlowSpec): """ @@ -72,7 +68,7 @@ class HelloConfig(FlowSpec): default_from_func = Parameter("default_from_func", default=param_func, type=int) - config = Config("config", default=default_config, help="Help for config") + config = Config("config", default_value=default_config, help="Help for config") sconfig = Config( "sconfig", default="sillyconfig.txt", @@ -80,9 +76,11 @@ class HelloConfig(FlowSpec): help="Help for sconfig", required=True, ) - config2 = Config("config2", required=True) + config2 = Config("config2") + + config3 = Config("config3", default_value=config_func) - config3 = Config("config3", default=config_func) + env_config = Config("env_config", default_value={"vars": {"name": "Romain"}}) @step def start(self): @@ -125,6 +123,7 @@ def hello(self): ) self.next(self.end) + @environment(**env_config) @step def end(self): """ @@ -132,7 +131,7 @@ def end(self): last step in the flow. 
""" - print("HelloFlow is all done") + print("HelloFlow is all done for %s" % os.environ.get("name")) if __name__ == "__main__": From 66d7b1a6c6764a2bed30e2ba1deafcf00bd28c11 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 16 Oct 2024 13:40:48 -0700 Subject: [PATCH 17/30] Better handling of default options --- metaflow/user_configs/config_options.py | 141 +++++++++++++++++------- 1 file changed, 101 insertions(+), 40 deletions(-) diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py index 2af0cc8d807..8a24905c137 100644 --- a/metaflow/user_configs/config_options.py +++ b/metaflow/user_configs/config_options.py @@ -11,6 +11,15 @@ from ..util import get_username +_CONVERT_PREFIX = "@!c!@:" +_DEFAULT_PREFIX = "@!d!@:" +_NO_FILE = "@!n!@" + +_CONVERTED_DEFAULT = _CONVERT_PREFIX + _DEFAULT_PREFIX +_CONVERTED_NO_FILE = _CONVERT_PREFIX + _NO_FILE +_CONVERTED_DEFAULT_NO_FILE = _CONVERTED_DEFAULT + _NO_FILE + + def _load_config_values(info_file: Optional[str] = None) -> Optional[Dict[Any, Any]]: if info_file is None: info_file = os.path.basename(CONFIG_FILE) @@ -25,41 +34,67 @@ class ConvertPath(click.Path): name = "ConvertPath" def convert(self, value, param, ctx): - if isinstance(value, str) and value.startswith("converted:"): + if isinstance(value, str) and value.startswith(_CONVERT_PREFIX): return value - v = super().convert(value, param, ctx) - return self.convert_value(v) + is_default = False + if value and value.startswith(_DEFAULT_PREFIX): + is_default = True + value = super().convert(value[len(_DEFAULT_PREFIX) :], param, ctx) + return self.convert_value(value, is_default) @staticmethod - def convert_value(value): + def mark_as_default(value): + if value is None: + return None + return _DEFAULT_PREFIX + str(value) + + @staticmethod + def convert_value(value, is_default): + default_str = _DEFAULT_PREFIX if is_default else "" if value is None: return None try: with open(value, "r", encoding="utf-8") as f: content = f.read() except OSError: - return "converted:!!NO_FILE!!%s" % value - return "converted:" + content + return _CONVERT_PREFIX + default_str + _NO_FILE + value + return _CONVERT_PREFIX + default_str + content class ConvertDictOrStr(click.ParamType): name = "ConvertDictOrStr" def convert(self, value, param, ctx): - return self.convert_value(value) + is_default = False + if isinstance(value, str): + if value.startswith(_CONVERT_PREFIX): + return value + if value.startswith(_DEFAULT_PREFIX): + is_default = True + + return self.convert_value(value, is_default) @staticmethod - def convert_value(value): + def convert_value(value, is_default): + default_str = _DEFAULT_PREFIX if is_default else "" if value is None: return None if isinstance(value, dict): - return "converted:" + json.dumps(value) + return _CONVERT_PREFIX + default_str + json.dumps(value) - if value.startswith("converted:"): + if value.startswith(_CONVERT_PREFIX): return value - return "converted:" + value + return _CONVERT_PREFIX + default_str + value + + @staticmethod + def mark_as_default(value): + if value is None: + return None + if isinstance(value, dict): + return _DEFAULT_PREFIX + json.dumps(value) + return _DEFAULT_PREFIX + str(value) class MultipleTuple(click.Tuple): @@ -155,34 +190,37 @@ def process_configs(self, ctx, param, value): # *then* merge with the defaults to form a full set of values. 
# We therefore get a full set of values where: # - the name will correspond to the configuration name - # - the value will be the default (including None if there is no default) or - # the string representation of the value (this will always include - # the "converted:" prefix as it will have gone through the ConvertPath or - # ConvertDictOrStr conversion function). - # A value of None basically means that the config has - # no default and was not specified on the command line. + # - the value will be: + # - the default (including None if there is no default). If the default is + # not None, it will start with _CONVERTED_DEFAULT since Click will make + # the value go through ConvertPath or ConvertDictOrStr + # - the actual value passed through prefixed with _CONVERT_PREFIX print("Got arg name %s and values %s" % (param.name, str(value))) do_return = self._value_values is None and self._path_values is None + # We only keep around non default values. We could simplify by checking just one + # value and if it is default it means all are but this doesn't seem much more effort + # and is clearer if param.name == "config_value_options": - self._value_values = {k.lower(): v for k, v in value if v is not None} + self._value_values = { + k.lower(): v + for k, v in value + if v is not None and not v.startswith(_CONVERTED_DEFAULT) + } else: - self._path_values = {k.lower(): v for k, v in value if v is not None} + self._path_values = { + k.lower(): v + for k, v in value + if v is not None and not v.startswith(_CONVERTED_DEFAULT) + } if do_return: # One of config_value_options or config_file_options will be None return None # The second go around, we process all the values and merge them. - # Check that the user didn't provide *both* a path and a value. We know that - # defaults are not both non None (this is an error) so if they are both - # non-None (and actual files) here, it means the user explicitly provided both. + # Check that the user didn't provide *both* a path and a value. common_keys = set(self._value_values or []).intersection( - [ - k - for k, v in self._path_values.items() - if v and not v.startswith("converted:!!NO_FILE!!") - ] - or [] + [k for k, v in self._path_values.items()] or [] ) if common_keys: raise click.UsageError( @@ -190,9 +228,6 @@ def process_configs(self, ctx, param, value): "Found such values for '%s'" % "', '".join(common_keys) ) - # NOTE: Important to start with _path_values as they can have the - # NO_FILE special value. They will be used (and trigger an error) iff there is - # no other value provided. 
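# A hypothetical invocation illustrating the UsageError above: supplying the
# same configuration both as a file and as a literal value is rejected once the
# defaults have been filtered out.
#
#   python helloconfig.py run \
#       --config config myconfig.json \
#       --config-value config '{"project_name": "x"}'
#
# This is expected to fail with the "Cannot provide both a value and a file for
# the same configuration" error raised below.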
all_values = dict(self._path_values or {}) all_values.update(self._value_values or {}) @@ -224,22 +259,26 @@ def process_configs(self, ctx, param, value): val = val.fun(param_ctx) if is_path: # This is a file path - merged_configs[n] = ConvertPath.convert_value(val) + merged_configs[n] = ConvertPath.convert_value(val, False) else: # This is a value - merged_configs[n] = ConvertDictOrStr.convert_value(val) + merged_configs[n] = ConvertDictOrStr.convert_value(val, False) missing_configs = set() no_file = [] + no_default_file = [] msgs = [] for name, val in merged_configs.items(): if val is None: missing_configs.add(name) continue - if val.startswith("converted:!!NO_FILE!!"): + if val.startswith(_CONVERTED_NO_FILE): no_file.append(name) continue - val = val[10:] # Remove the "converted:" prefix + if val.startswith(_CONVERTED_DEFAULT_NO_FILE): + no_default_file.append(name) + continue + val = val[len(_CONVERT_PREFIX) :] # Remove the _CONVERT_PREFIX if val.startswith("kv."): # This means to load it from a file read_value = self.get_config(val[3:]) @@ -271,7 +310,12 @@ def process_configs(self, ctx, param, value): for missing in no_file: msgs.append( "configuration file '%s' could not be read for '%s'" - % (merged_configs[missing][21:], missing) + % (merged_configs[missing][len(_CONVERTED_NO_FILE) :], missing) + ) + for missing in no_default_file: + msgs.append( + "default configuration file '%s' could not be read for '%s'" + % (merged_configs[missing][len(_CONVERTED_DEFAULT_NO_FILE) :], missing) ) if msgs: raise click.UsageError( @@ -318,7 +362,6 @@ def config_options(cmd): parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] # List all the configuration options for arg in parameters[::-1]: - save_default = arg.kwargs.get("default", None) kwargs = arg.option_kwargs(False) if arg.name.lower() in config_seen: msg = ( @@ -330,7 +373,11 @@ def config_options(cmd): config_seen.add(arg.name.lower()) if kwargs["required"]: required_names.append(arg.name) - defaults[arg.name.lower()] = (save_default, arg._default_is_file) + + defaults[arg.name.lower()] = ( + arg.kwargs.get("default", None), + arg._default_is_file, + ) help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) parsers[arg.name.lower()] = arg.parser @@ -357,7 +404,14 @@ def config_options(cmd): envvar="METAFLOW_FLOW_CONFIG_VALUE", show_default=False, default=[ - (k, v[0] if not callable(v[0]) and not v[1] else None) + ( + k, + ( + ConvertDictOrStr.mark_as_default(v[0]) + if not callable(v[0]) and not v[1] + else None + ), + ) for k, v in defaults.items() ], required=False, @@ -375,7 +429,14 @@ def config_options(cmd): envvar="METAFLOW_FLOW_CONFIG", show_default=False, default=[ - (k, v[0] if not callable(v) and v[1] else None) + ( + k, + ( + ConvertPath.mark_as_default(v[0]) + if not callable(v) and v[1] + else None + ), + ) for k, v in defaults.items() ], required=False, From 6544ca8c298de3e1d2b34afaf54b7272b723284a Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 17 Oct 2024 00:22:16 -0700 Subject: [PATCH 18/30] Changed names --- metaflow/__init__.py | 2 +- metaflow/flowspec.py | 20 +++++++-------- metaflow/user_configs/config_decorators.py | 30 +++++++++++----------- test_config/helloconfig.py | 8 +++--- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/metaflow/__init__.py b/metaflow/__init__.py index 2ce16d0b909..b4f0d8d2253 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -104,7 +104,7 @@ class and related decorators. 
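# A minimal sketch of a callable (deploy-time) default, assuming the
# DeployTimeField handling above; config_func mirrors the stand-in used in the
# test_config example elsewhere in this patch:
#
# def config_func(ctx):
#     # ctx is a ParameterContext (flow_name, user_name, parameter_name, ...)
#     return {"val": 123}
#
# class HelloConfig(FlowSpec):
#     config3 = Config("config3", default_value=config_func)
#
# Because the default is callable, it is excluded from the click option default
# list and only evaluated inside process_configs via val.fun(param_ctx).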
from .parameters import Parameter, JSONTypeClass, JSONType from .user_configs.config_parameters import Config, config_expr -from .user_configs.config_decorators import FlowConfigDecorator, StepConfigDecorator +from .user_configs.config_decorators import CustomFlowDecorator, CustomStepDecorator # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index a501832914e..621dc38f329 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -23,10 +23,10 @@ from .graph import FlowGraph from .unbounded_foreach import UnboundedForeachInput from .user_configs.config_decorators import ( - FlowConfigDecorator, - FlowSpecProxy, - StepConfigDecorator, - StepProxy, + CustomFlowDecorator, + CustomStepDecorator, + MutableFlow, + MutableStep, ) from .util import to_pod from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS @@ -208,27 +208,27 @@ def _process_config_decorators(self, config_options): # Run all the decorators for deco in self._flow_state[_FlowState.CONFIG_DECORATORS]: - if isinstance(deco, FlowConfigDecorator): + if isinstance(deco, CustomFlowDecorator): # Sanity check to make sure we are applying the decorator to the right # class if not deco._flow_cls == current_cls and not issubclass( current_cls, deco._flow_cls ): raise MetaflowInternalError( - "FlowConfigDecorator registered on the wrong flow -- " + "CustomFlowDecorator registered on the wrong flow -- " "expected %s but got %s" % (deco._flow_cls.__name__, current_cls.__name__) ) - deco.evaluate(FlowSpecProxy(current_cls)) - elif isinstance(deco, StepConfigDecorator): + deco.evaluate(MutableFlow(current_cls)) + elif isinstance(deco, CustomStepDecorator): # Again some sanity checks if deco._flow_cls != current_cls: raise MetaflowInternalError( - "StepConfigDecorator registered on the wrong flow -- " + "CustomStepDecorator registered on the wrong flow -- " "expected %s but got %s" % (deco._flow_cls.__name__, current_cls.__name__) ) - deco.evaluate(StepConfigDecorator(deco._my_step)) + deco.evaluate(CustomStepDecorator(deco._my_step)) # Reset all configs that were already present in the class. # TODO: This means that users can't override configs directly. Not sure if this diff --git a/metaflow/user_configs/config_decorators.py b/metaflow/user_configs/config_decorators.py index 16e9476e08e..58be6bc9c42 100644 --- a/metaflow/user_configs/config_decorators.py +++ b/metaflow/user_configs/config_decorators.py @@ -10,9 +10,9 @@ from metaflow.decorators import FlowSpecDerived, StepDecorator -class StepProxy: +class MutableStep: """ - A StepProxy is a wrapper passed to the `StepConfigDecorator`'s `evaluate` method + A MutableStep is a wrapper passed to the `CustomStepDecorator`'s `evaluate` method to allow the decorator to interact with the step and providing easy methods to modify the behavior of the step. """ @@ -102,7 +102,7 @@ def add_decorator(self, deco_type: partial, **kwargs) -> None: ) -class FlowSpecProxy: +class MutableFlow: def __init__(self, flow_spec: "FlowSpec"): self._flow_cls = flow_spec @@ -112,11 +112,11 @@ def configs(self) -> Generator[Tuple[str, ConfigValue], None, None]: Iterate over all user configurations in this flow Use this to parameterize your flow based on configuration. As an example, the - `evaluate` method of your `FlowConfigDecorator` can use this to add an + `evaluate` method of your `CustomFlowDecorator` can use this to add an environment decorator. 
``` - class MyDecorator(FlowConfigDecorator): - def evaluate(flow: FlowSpecProxy): + class MyDecorator(CustomFlowDecorator): + def evaluate(flow: MutableFlow): val = next(flow.configs)[1].steps.start.cpu flow.start.add_decorator(environment, vars={'mycpu': val}) return flow @@ -145,19 +145,19 @@ def start(self): yield name, ConfigValue(value) @property - def steps(self) -> Generator[Tuple[str, StepProxy], None, None]: + def steps(self) -> Generator[Tuple[str, MutableStep], None, None]: """ Iterate over all the steps in this flow Yields ------ - Tuple[str, StepProxy] + Tuple[str, MutableStep] A tuple with the step name and the step proxy """ for var in dir(self._flow_cls): potential_step = getattr(self._flow_cls, var) if callable(potential_step) and hasattr(potential_step, "is_step"): - yield var, StepProxy(potential_step) + yield var, MutableStep(potential_step) def __getattr__(self, name): # We allow direct access to the steps, configs and parameters but nothing else @@ -165,14 +165,14 @@ def __getattr__(self, name): if attr: # Steps if callable(attr) and hasattr(attr, "is_step"): - return StepProxy(attr) + return MutableStep(attr) if name[0] == "_" or name in self._flow_cls._NON_PARAMETERS: raise AttributeError(self, name) return attr raise AttributeError(self, name) -class FlowConfigDecorator: +class CustomFlowDecorator: def __call__(self, flow_spec: "FlowSpec") -> "FlowSpec": from ..flowspec import _FlowState @@ -180,11 +180,11 @@ def __call__(self, flow_spec: "FlowSpec") -> "FlowSpec": self._flow_cls = flow_spec return flow_spec - def evaluate(self, flow_proxy: FlowSpecProxy) -> None: + def evaluate(self, mutable_flow: MutableFlow) -> None: raise NotImplementedError() -class StepConfigDecorator: +class CustomStepDecorator: def __call__( self, step: Union[ @@ -199,7 +199,7 @@ def __call__( if not hasattr(step, "is_step"): raise MetaflowException( - "StepConfigDecorators must be applied to a step function" + "CustomStepDecorator must be applied to a step function" ) self._my_step = step # Get the flow @@ -210,5 +210,5 @@ def __call__( return step - def evaluate(self, step_proxy: StepProxy) -> None: + def evaluate(self, mutable_step: MutableStep) -> None: raise NotImplementedError() diff --git a/test_config/helloconfig.py b/test_config/helloconfig.py index 897d3167204..c357685a837 100644 --- a/test_config/helloconfig.py +++ b/test_config/helloconfig.py @@ -8,8 +8,8 @@ step, project, config_expr, - FlowConfigDecorator, - StepConfigDecorator, + CustomFlowDecorator, + CustomStepDecorator, titus, ) @@ -38,14 +38,14 @@ def config_func(ctx): silly_config = "baz:awesome" -class TitusOrNot(FlowConfigDecorator): +class TitusOrNot(CustomFlowDecorator): def evaluate(self, flow_proxy): for name, s in flow_proxy.steps: if name in flow_proxy.config.run_on_titus: s.add_decorator(titus, cpu=flow_proxy.config.cpu_count) -class AddEnvToStart(FlowConfigDecorator): +class AddEnvToStart(CustomFlowDecorator): def evaluate(self, flow_proxy): s = flow_proxy.start s.add_decorator(environment, vars={"hello": flow_proxy.config.env_to_start}) From 49bc4a8c2a12ee646acbc3b149c26cc22c994bfe Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 17 Oct 2024 13:13:17 -0700 Subject: [PATCH 19/30] Fix includefile --- metaflow/includefile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metaflow/includefile.py b/metaflow/includefile.py index c81a701dfb3..d599de3f1a8 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -137,7 +137,7 @@ def convert(self, value, param, ctx): 
parameter_name=param.name, logger=ctx.obj.echo, ds_type=ctx.obj.datastore_impl.TYPE, - configs=ConfigValue(dict(ctx.obj.flow.configs)), + configs=None, ) if len(value) > 0 and (value.startswith("{") or value.startswith('"{')): From 86ed067ad058c8a949db270ec756bbdba408ab56 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 17 Oct 2024 17:54:27 -0700 Subject: [PATCH 20/30] Remove more old code --- metaflow/flowspec.py | 2 -- metaflow/parameters.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 621dc38f329..27754d7725c 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -345,8 +345,6 @@ def __iter__(self): return iter(self._steps) def __getattr__(self, name: str): - if name in ("configs", "steps"): - return getattr(self.__class__, name) if self._datastore and name in self._datastore: # load the attribute from the datastore... x = self._datastore[name] diff --git a/metaflow/parameters.py b/metaflow/parameters.py index 8a29a787808..d852c42e8e8 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -35,7 +35,7 @@ ("parameter_name", str), ("logger", Callable[..., None]), ("ds_type", str), - ("configs", "ConfigValue"), + ("configs", Optional["ConfigValue"]), ], ) From 15a08122ed82e4cadc37e292a45b5e89cb30a09a Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 23 Oct 2024 00:52:30 -0700 Subject: [PATCH 21/30] Bug fixes and better Custom*Decorator behavior. Fixed some typos and updated test to reflect latest code. Fixed a few other issues: - fixed an issue where a config was used in different decorators causing it to produce an incorrect access string - made the decorators work with or without arguments --- metaflow/runner/click_api.py | 2 +- metaflow/user_configs/config_decorators.py | 109 +++++++++++++++++++-- metaflow/user_configs/config_options.py | 8 +- metaflow/user_configs/config_parameters.py | 15 ++- test/core/metaflow_test/formatter.py | 12 ++- test/core/tests/basic_config_parameters.py | 6 +- test_config/helloconfig.py | 18 ++-- 7 files changed, 133 insertions(+), 37 deletions(-) diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 96faa235281..2d946fe7133 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -39,7 +39,7 @@ from metaflow.exception import MetaflowException from metaflow.includefile import FilePathClass from metaflow.parameters import JSONTypeClass, flow_context -from metaflow.config_options import LocalFileInput +from metaflow.user_configs.config_options import LocalFileInput # Define a recursive type alias for JSON JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None] diff --git a/metaflow/user_configs/config_decorators.py b/metaflow/user_configs/config_decorators.py index 58be6bc9c42..2e5202406ce 100644 --- a/metaflow/user_configs/config_decorators.py +++ b/metaflow/user_configs/config_decorators.py @@ -1,13 +1,13 @@ from functools import partial -from typing import Any, Callable, Generator, TYPE_CHECKING, Tuple, Union +from typing import Any, Callable, Generator, Optional, TYPE_CHECKING, Tuple, Union from metaflow.exception import MetaflowException from metaflow.parameters import Parameter from metaflow.user_configs.config_parameters import ConfigValue if TYPE_CHECKING: - from metaflow.flowspec import FlowSpec - from metaflow.decorators import FlowSpecDerived, StepDecorator + from metaflow.flowspec import _FlowSpecMeta + from metaflow.decorators import FlowSpecDerived class MutableStep: @@ 
-173,19 +173,105 @@ def __getattr__(self, name): class CustomFlowDecorator: - def __call__(self, flow_spec: "FlowSpec") -> "FlowSpec": + def __init__(self, *args, **kwargs): + from ..flowspec import FlowSpec, _FlowSpecMeta + + if args and isinstance(args[0], (CustomFlowDecorator, _FlowSpecMeta)): + # This means the decorator is bare like @MyDecorator + # and the first argument is the FlowSpec or another decorator (they + # can be stacked) + if isinstance(args[0], _FlowSpecMeta): + self._set_flow_cls(args[0]) + else: + self._set_flow_cls(args[0]._flow_cls) + else: + # The arguments are actually passed to the init function for this decorator + self._args = args + self._kwargs = kwargs + + def __call__(self, flow_spec: Optional["_FlowSpecMeta"] = None) -> "_FlowSpecMeta": + if flow_spec: + # This is the case of a decorator @MyDecorator(foo=1, bar=2) and so + # we already called __init__ and saved foo and bar and are now calling + # this on the flow itself. + self.init(*self._args, **self._kwargs) + return self._set_flow_cls(flow_spec) + elif not self._flow_cls: + # This means that somehow the initialization did not happen properly + # so this may have been applied to a non flow + raise MetaflowException( + "A CustomFlowDecorator can only be applied to a FlowSpec" + ) + return self._flow_cls() + + def _set_flow_cls(self, flow_spec: "_FlowSpecMeta") -> "_FlowSpecMeta": from ..flowspec import _FlowState flow_spec._flow_state.setdefault(_FlowState.CONFIG_DECORATORS, []).append(self) self._flow_cls = flow_spec return flow_spec + def init(self, *args, **kwargs): + """ + This method is intended to be optionally overridden if you need to + have an initializer. + + Raises + ------ + NotImplementedError: If the method is not overridden in a subclass. + """ + raise NotImplementedError() + def evaluate(self, mutable_flow: MutableFlow) -> None: raise NotImplementedError() class CustomStepDecorator: + def __init__(self, *args, **kwargs): + if args and ( + isinstance(args[0], CustomStepDecorator) + or callable(args[0]) + and hasattr(args[0], "is_step") + ): + # This means the decorator is bare like @MyDecorator + # and the first argument is the step or another decorator (they + # can be stacked) + if isinstance(args[0], CustomStepDecorator): + self._set_my_step(args[0]._my_step) + else: + self._set_my_step(args[0]) + else: + # The arguments are actually passed to the init function for this decorator + self._args = args + self._kwargs = kwargs + def __call__( + self, + step: Optional[ + Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ] + ] = None, + ) -> Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ]: + if step: + # This is the case of a decorator @MyDecorator(foo=1, bar=2) and so + # we already called __init__ and saved foo and bar and are now calling + # this on the step itself. 
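# A usage sketch of the bare-vs-parameterized dispatch above (TitusOrNot comes
# from the test_config example in this patch; ProjectPrefix is hypothetical):
#
# @TitusOrNot                    # bare: __init__ receives the flow class itself
# class FlowA(FlowSpec): ...
#
# class ProjectPrefix(CustomFlowDecorator):
#     def init(self, prefix="dev"):
#         self._prefix = prefix
#     def evaluate(self, mutable_flow):
#         ...
#
# @ProjectPrefix(prefix="prod")  # parameterized: __init__ stores the kwargs;
# class FlowB(FlowSpec): ...     # __call__(flow) then runs init() and registers
#                                # the decorator, and evaluate() runs later when
#                                # config decorators are processed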
+ self.init(*self._args, **self._kwargs) + return self._set_my_step(step) + elif not self._my_step: + # This means that somehow the initialization did not happen properly + # so this may have been applied to a non step + raise MetaflowException( + "A CustomStepDecorator can only be applied to a step function" + ) + return self._my_step + + def _set_my_step( self, step: Union[ Callable[["FlowSpecDerived"], None], @@ -197,10 +283,6 @@ def __call__( ]: from ..flowspec import _FlowState - if not hasattr(step, "is_step"): - raise MetaflowException( - "CustomStepDecorator must be applied to a step function" - ) self._my_step = step # Get the flow flow_spec = step.__globals__[step.__qualname__.rsplit(".", 1)[0]] @@ -208,7 +290,16 @@ def __call__( self._flow_cls = flow_spec - return step + def init(self, *args, **kwargs): + """ + This method is intended to be optionally overridden if you need to + have an initializer. + + Raises + ------ + NotImplementedError: If the method is not overridden in a subclass. + """ + raise NotImplementedError() def evaluate(self, mutable_step: MutableStep) -> None: raise NotImplementedError() diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py index 8a24905c137..21cf2fcfaec 100644 --- a/metaflow/user_configs/config_options.py +++ b/metaflow/user_configs/config_options.py @@ -196,7 +196,7 @@ def process_configs(self, ctx, param, value): # the value go through ConvertPath or ConvertDictOrStr # - the actual value passed through prefixed with _CONVERT_PREFIX - print("Got arg name %s and values %s" % (param.name, str(value))) + # print("Got arg name %s and values %s" % (param.name, str(value))) do_return = self._value_values is None and self._path_values is None # We only keep around non default values. We could simplify by checking just one # value and if it is default it means all are but this doesn't seem much more effort @@ -231,7 +231,7 @@ def process_configs(self, ctx, param, value): all_values = dict(self._path_values or {}) all_values.update(self._value_values or {}) - print("Got all values: %s" % str(all_values)) + # print("Got all values: %s" % str(all_values)) flow_cls._flow_state[_FlowState.CONFIGS] = {} to_return = {} @@ -263,7 +263,7 @@ def process_configs(self, ctx, param, value): else: # This is a value merged_configs[n] = ConvertDictOrStr.convert_value(val, False) - + # print("Merged configs: %s" % str(merged_configs)) missing_configs = set() no_file = [] no_default_file = [] @@ -433,7 +433,7 @@ def config_options(cmd): k, ( ConvertPath.mark_as_default(v[0]) - if not callable(v) and v[1] + if not callable(v[0]) and v[1] else None ), ) diff --git a/metaflow/user_configs/config_parameters.py b/metaflow/user_configs/config_parameters.py index 4a312487829..e6173bd762d 100644 --- a/metaflow/user_configs/config_parameters.py +++ b/metaflow/user_configs/config_parameters.py @@ -320,31 +320,28 @@ def __init__( % name ) self._default_is_file = default is not None - default = default or default_value - + kwargs["default"] = default or default_value super(Config, self).__init__( - name, default=default, required=required, help=help, type=str, **kwargs + name, required=required, help=help, type=str, **kwargs ) if isinstance(kwargs.get("default", None), str): kwargs["default"] = json.dumps(kwargs["default"]) self.parser = parser - self._delay_self = DelayEvaluator(name.lower()) - def load_parameter(self, v): return v # Support . 
syntax def __getattr__(self, name): - return self._delay_self.__getattr__(name) + return DelayEvaluator(self.name.lower()).__getattr__(name) # Next three methods are to implement mapping to support ** syntax def __iter__(self): - return iter(self._delay_self) + return iter(DelayEvaluator(self.name.lower())) def __len__(self): - return len(self._delay_self) + return len(DelayEvaluator(self.name.lower())) def __getitem__(self, key): - return self._delay_self[key] + return DelayEvaluator(self.name.lower())[key] diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index f9e39d9e983..f7df9660f6b 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -85,8 +85,16 @@ def _flow_lines(self): tags.extend(tag.split("(")[0] for tag in step.tags) yield 0, "# -*- coding: utf-8 -*-" - yield 0, "from metaflow import Config, config_expr, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" - yield 0, "from metaflow_test import assert_equals, assert_equals_metadata, assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, TestRetry, try_to_get_card" + yield 0, ( + "from metaflow import Config, config_expr, FlowSpec, step, Parameter, " + "project, IncludeFile, JSONType, current, parallel, CustomFlowDecorator, " + "CustomStepDecorator" + ) + yield 0, ( + "from metaflow_test import assert_equals, assert_equals_metadata, " + "assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, " + "TestRetry, try_to_get_card" + ) if tags: yield 0, "from metaflow import %s" % ",".join(tags) diff --git a/test/core/tests/basic_config_parameters.py b/test/core/tests/basic_config_parameters.py index dc367bef524..e333b645a73 100644 --- a/test/core/tests/basic_config_parameters.py +++ b/test/core/tests/basic_config_parameters.py @@ -11,16 +11,16 @@ class BasicConfigTest(MetaflowTest): "default_from_func": {"default": "param_default", "type": "int"}, } CONFIGS = { - "config": {"default": "default_config"}, + "config": {"default_value": "default_config"}, "silly_config": {"required": True, "parser": "silly_parser"}, "config2": {}, - "config3": {"default": "config_default"}, + "config3": {"default_value": "config_default"}, } HEADER = """ import json import os -os.environ['METAFLOW_FLOW_CONFIG'] = json.dumps( +os.environ['METAFLOW_FLOW_CONFIG_VALUE'] = json.dumps( { "config2": {"default_param": 123}, "silly_config": "baz:amazing" diff --git a/test_config/helloconfig.py b/test_config/helloconfig.py index c357685a837..e62e795f615 100644 --- a/test_config/helloconfig.py +++ b/test_config/helloconfig.py @@ -39,20 +39,20 @@ def config_func(ctx): class TitusOrNot(CustomFlowDecorator): - def evaluate(self, flow_proxy): - for name, s in flow_proxy.steps: - if name in flow_proxy.config.run_on_titus: - s.add_decorator(titus, cpu=flow_proxy.config.cpu_count) + def evaluate(self, mutable_flow): + for name, s in mutable_flow.steps: + if name in mutable_flow.config.run_on_titus: + s.add_decorator(titus, cpu=mutable_flow.config.cpu_count) class AddEnvToStart(CustomFlowDecorator): - def evaluate(self, flow_proxy): - s = flow_proxy.start - s.add_decorator(environment, vars={"hello": flow_proxy.config.env_to_start}) + def evaluate(self, mutable_flow): + s = mutable_flow.start + s.add_decorator(environment, vars={"hello": mutable_flow.config.env_to_start}) -@TitusOrNot() -@AddEnvToStart() +@TitusOrNot +@AddEnvToStart @project(name=config_expr("config").project_name) class HelloConfig(FlowSpec): """ From 
d17901637b90c663dcce2c6bb07d85a09f46bb3d Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 23 Oct 2024 13:39:36 -0700 Subject: [PATCH 22/30] do not map config parameters to CLI command for argo/step functions --- metaflow/decorators.py | 61 ++++-------- metaflow/flowspec.py | 23 +++-- metaflow/includefile.py | 65 ++++++++----- metaflow/parameters.py | 63 +++++++++---- metaflow/plugins/argo/argo_workflows.py | 42 ++++++++- .../aws/step_functions/step_functions.py | 32 +++++++ metaflow/runner/click_api.py | 2 +- metaflow/user_configs/config_decorators.py | 94 ++++++++++++++----- metaflow/user_configs/config_options.py | 2 +- metaflow/user_configs/config_parameters.py | 38 +++++++- test/core/tests/basic_config_parameters.py | 35 ++++++- test/core/tests/basic_config_silly.txt | 1 + test/core/tests/custom_decorators.py | 0 13 files changed, 334 insertions(+), 124 deletions(-) create mode 100644 test/core/tests/basic_config_silly.txt create mode 100644 test/core/tests/custom_decorators.py diff --git a/metaflow/decorators.py b/metaflow/decorators.py index b615d5744ac..11743278d13 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -12,7 +12,11 @@ ) from .parameters import current_flow -from .user_configs.config_parameters import DelayEvaluator +from .user_configs.config_parameters import ( + UNPACK_KEY, + resolve_delayed_evaluator, + unpack_delayed_evaluator, +) from metaflow._vendor import click @@ -117,12 +121,14 @@ def __init__(self, attributes=None, statically_defined=False): self.attributes = self.defaults.copy() self.statically_defined = statically_defined self._user_defined_attributes = set() + self._ran_init = False if attributes: for k, v in attributes.items(): - self._user_defined_attributes.add(k) - if k in self.defaults or k.startswith("_unpacked_delayed_"): + if k in self.defaults or k.startswith(UNPACK_KEY): self.attributes[k] = v + if not k.startswith(UNPACK_KEY): + self._user_defined_attributes.add(k) else: raise InvalidDecoratorAttribute(self.name, k, self.defaults) @@ -132,44 +138,17 @@ def init(self): should be done here. """ - def _resolve_delayed_evaluator(v): - if isinstance(v, DelayEvaluator): - return v() - if isinstance(v, dict): - return { - _resolve_delayed_evaluator(k): _resolve_delayed_evaluator(v) - for k, v in v.items() - } - if isinstance(v, list): - return [_resolve_delayed_evaluator(x) for x in v] - if isinstance(v, tuple): - return tuple(_resolve_delayed_evaluator(x) for x in v) - if isinstance(v, set): - return {_resolve_delayed_evaluator(x) for x in v} - return v - - # Expand any eventual _unpacked_delayed_ attributes. These are special attributes - # that allow the delay unpacking of configuration values. - delayed_upack_keys = [ - k for k in self.attributes if k.startswith("_unpacked_delayed_") - ] - if delayed_upack_keys: - for k in delayed_upack_keys: - unpacked = _resolve_delayed_evaluator(self.attributes[k]) - for uk, uv in unpacked.items(): - if uk in self._user_defined_attributes: - raise SyntaxError( - "keyword argument repeated: %s" % uk, "", 0, "" - ) - self._user_defined_attributes.add(uk) - self.attributes[uk] = uv - del self.attributes[k] - - # Now resolve all attributes - for k, v in self.attributes.items(): - # This is a special attribute that means we are going to unpack - # the configuration valu - self.attributes[k] = _resolve_delayed_evaluator(v) + # In some cases (specifically when using remove_decorator), we may need to call + # init multiple times. Short-circuit re-evaluating. 
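# Sketch of the delayed unpacking this enables, using the env_config example
# from test_config/helloconfig.py (the Config holds {"vars": {"name": "Romain"}}):
#
# env_config = Config("env_config", default_value={"vars": {"name": "Romain"}})
#
# @environment(**env_config)  # stored under an UNPACK_KEY attribute at parse
# @step                       # time; init() later calls unpack_delayed_evaluator
# def end(self):              # and resolve_delayed_evaluator to expand it into
#     ...                     # vars={"name": "Romain"}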
+ if self._ran_init: + return + + # Note that by design, later values override previous ones. + self.attributes = unpack_delayed_evaluator(self.attributes) + self._user_defined_attributes.update(self.attributes.keys()) + self.attributes = resolve_delayed_evaluator(self.attributes) + + self._ran_init = True @classmethod def _parse_decorator_spec(cls, deco_spec): diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 27754d7725c..b24a659dbde 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -170,9 +170,11 @@ def script_name(self) -> str: fname = fname[:-1] return os.path.basename(fname) - def _check_parameters(self): + def _check_parameters(self, config_parameters=False): seen = set() - for var, param in self._get_parameters(): + for _, param in self._get_parameters(): + if param.IS_CONFIG_PARAMETER != config_parameters: + continue norm = param.name.lower() if norm in seen: raise MetaflowException( @@ -190,20 +192,20 @@ def _process_config_decorators(self, config_options): return self # We need to convert all the user configurations from DelayedEvaluationParameters - # to actual values so they can be used as is in the config functions. + # to actual values so they can be used as is in the config decorators. # We then reset them to be proper parameters so they can be re-evaluated in # _set_constants to_reset_params = [] - self._check_parameters() + self._check_parameters(config_parameters=True) for var, param in self._get_parameters(): - if not param.IS_FLOW_PARAMETER: + if not param.IS_CONFIG_PARAMETER: continue - to_reset_params.append((var, param)) # Note that a config with no default and not required will be None val = config_options.get(param.name.replace("-", "_").lower()) if isinstance(val, DelayedEvaluationParameter): val = val() + to_reset_params.append((var, param, val)) setattr(current_cls, var, val) # Run all the decorators @@ -230,6 +232,11 @@ def _process_config_decorators(self, config_options): ) deco.evaluate(CustomStepDecorator(deco._my_step)) + # Process parameters to allow them to also use config values easily + for var, param in self._get_parameters(): + if param.IS_CONFIG_PARAMETER: + continue + param.init() # Reset all configs that were already present in the class. # TODO: This means that users can't override configs directly. Not sure if this # is a pattern we want to support @@ -253,7 +260,7 @@ def _set_constants(self, graph, kwargs, config_options): # Persist values for parameters and other constants (class level variables) # only once. This method is called before persist_constants is called to # persist all values set using setattr - self._check_parameters() + self._check_parameters(config_parameters=False) seen = set() self._success = True @@ -261,7 +268,7 @@ def _set_constants(self, graph, kwargs, config_options): parameters_info = [] for var, param in self._get_parameters(): seen.add(var) - if param.IS_FLOW_PARAMETER: + if param.IS_CONFIG_PARAMETER: val = config_options.get(param.name.replace("-", "_").lower()) else: val = kwargs[param.name.replace("-", "_").lower()] diff --git a/metaflow/includefile.py b/metaflow/includefile.py index d599de3f1a8..3c5863485af 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -245,29 +245,58 @@ class IncludeFile(Parameter): default : Union[str, Callable[ParameterContext, str]] Default path to a local file. A function implies that the parameter corresponds to a *deploy-time parameter*. 
- is_text : bool, default True + is_text : bool, optional, default None Convert the file contents to a string using the provided `encoding`. - If False, the artifact is stored in `bytes`. - encoding : str, optional, default 'utf-8' - Use this encoding to decode the file contexts if `is_text=True`. - required : bool, default False + If False, the artifact is stored in `bytes`. A value of None is equivalent to + True. + encoding : str, optional, default None + Use this encoding to decode the file contents if `is_text=True`. A value of None + is equivalent to "utf-8". + required : bool, optional, default None Require that the user specified a value for the parameter. - `required=True` implies that the `default` is not used. + `required=True` implies that the `default` is not used. A value of None is + equivalent to False. help : str, optional Help text to show in `run --help`. show_default : bool, default True - If True, show the default value in the help text. + If True, show the default value in the help text. A value of None is equivalent + to True. """ def __init__( self, name: str, - required: bool = False, - is_text: bool = True, - encoding: str = "utf-8", + required: Optional[bool] = None, + is_text: Optional[bool] = None, + encoding: Optional[str] = None, help: Optional[str] = None, **kwargs: Dict[str, str] ): + self._includefile_overrides = {} + if is_text is not None: + self._includefile_overrides["is_text"] = is_text + if encoding is not None: + self._includefile_overrides["encoding"] = encoding + super(IncludeFile, self).__init__( + name, + required=required, + help=help, + type=FilePathClass(is_text, encoding), + **kwargs, + ) + + def init(self): + super(IncludeFile, self).init() + + # Use the value set explicitly in the arguments if present; otherwise pop it + # from kwargs if present there; otherwise fall back to True/utf-8 + is_text = self._includefile_overrides.get( + "is_text", self.kwargs.pop("is_text", True) + ) + encoding = self._includefile_overrides.get( + "encoding", self.kwargs.pop("encoding", "utf-8") + ) + # If a default is specified, it needs to be uploaded when the flow is deployed # (for example when doing a `step-functions create`) so we make the default # be a DeployTimeField. This means that it will be evaluated in two cases: # In the first case, we will need to fully upload the file whereas in the # second case, we can just return the string as the FilePath.convert method # will take care of evaluating things. 
- v = kwargs.get("default") + v = self.kwargs.get("default") if v is not None: # If the default is a callable, we have two DeployTimeField: # - the callable nature of the default will require us to "call" the default @@ -290,23 +319,15 @@ def __init__( # (call the default) if callable(v) and not isinstance(v, DeployTimeField): # If default is a callable, make it a DeployTimeField (the inner one) - v = DeployTimeField(name, str, "default", v, return_str=True) - kwargs["default"] = DeployTimeField( - name, + v = DeployTimeField(self.name, str, "default", v, return_str=True) + self.kwargs["default"] = DeployTimeField( + self.name, str, "default", IncludeFile._eval_default(is_text, encoding, v), print_representation=v, ) - super(IncludeFile, self).__init__( - name, - required=required, - help=help, - type=FilePathClass(is_text, encoding), - **kwargs, - ) - def load_parameter(self, v): if v is None: return v diff --git a/metaflow/parameters.py b/metaflow/parameters.py index d852c42e8e8..ddcf43263c5 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -311,14 +311,15 @@ class MyFlow(FlowSpec): to the type of `default` or `str` if none specified. help : str, optional Help text to show in `run --help`. - required : bool, default False - Require that the user specified a value for the parameter. If a non-None - default is specified, that default will be used if no other value is provided - show_default : bool, default True - If True, show the default value in the help text. + required : bool, optional, default None + Require that the user specified a value for the parameter. `required=True` implies + that `default` is not used. A value of None is equivalent to False. + show_default : bool, optional, default None + If True, show the default value in the help text. A value of None is equivalent + to True. """ - IS_FLOW_PARAMETER = False + IS_CONFIG_PARAMETER = False def __init__( self, @@ -339,24 +340,44 @@ def __init__( Union[Type[str], Type[float], Type[int], Type[bool], JSONTypeClass] ] = None, help: Optional[str] = None, - required: bool = False, - show_default: bool = True, + required: Optional[bool] = None, + show_default: Optional[bool] = None, **kwargs: Dict[str, Any] ): self.name = name self.kwargs = kwargs - for k, v in { + self._override_kwargs = { "default": default, "type": type, "help": help, "required": required, "show_default": show_default, - }.items(): - if v is not None: - self.kwargs[k] = v + } + + def init(self): + # Prevent circular import + from .user_configs.config_parameters import ( + resolve_delayed_evaluator, + unpack_delayed_evaluator, + ) + + # Resolve any value from configurations + self.kwargs = unpack_delayed_evaluator(self.kwargs) + self.kwargs = resolve_delayed_evaluator(self.kwargs) + + # This was the behavior before configs: values specified in args would override + # stuff in kwargs which is what we implement here as well + for key, value in self._override_kwargs.items(): + if value is not None: + self.kwargs[key] = value + # Set two default values if no-one specified them + self.kwargs.setdefault("required", False) + self.kwargs.setdefault("show_default", True) + + # Continue processing kwargs free of any configuration values :) # TODO: check that the type is one of the supported types - param_type = self.kwargs["type"] = self._get_type(kwargs) + param_type = self.kwargs["type"] = self._get_type(self.kwargs) reserved_params = [ "params", @@ -381,23 +402,27 @@ def __init__( raise MetaflowException( "Parameter name '%s' is a reserved " "word. 
Please use a different " - "name for your parameter." % (name) + "name for your parameter." % (self.name) ) # make sure the user is not trying to pass a function in one of the # fields that don't support function-values yet for field in ("show_default", "separator", "required"): - if callable(kwargs.get(field)): + if callable(self.kwargs.get(field)): raise MetaflowException( "Parameter *%s*: Field '%s' cannot " - "have a function as its value" % (name, field) + "have a function as its value" % (self.name, field) ) # default can be defined as a function default_field = self.kwargs.get("default") if callable(default_field) and not isinstance(default_field, DeployTimeField): self.kwargs["default"] = DeployTimeField( - name, param_type, "default", self.kwargs["default"], return_str=True + self.name, + param_type, + "default", + self.kwargs["default"], + return_str=True, ) # note that separator doesn't work with DeployTimeFields unless you @@ -406,7 +431,7 @@ def __init__( if self.separator and not self.is_string_type: raise MetaflowException( "Parameter *%s*: Separator is only allowed " - "for string parameters." % name + "for string parameters." % self.name ) def __repr__(self): @@ -463,7 +488,7 @@ def wrapper(cmd): if flow_cls is None: return cmd parameters = [ - p for _, p in flow_cls._get_parameters() if not p.IS_FLOW_PARAMETER + p for _, p in flow_cls._get_parameters() if not p.IS_CONFIG_PARAMETER ] for arg in parameters[::-1]: kwargs = arg.option_kwargs(deploy_mode) diff --git a/metaflow/plugins/argo/argo_workflows.py b/metaflow/plugins/argo/argo_workflows.py index eee17b243ca..1214545522a 100644 --- a/metaflow/plugins/argo/argo_workflows.py +++ b/metaflow/plugins/argo/argo_workflows.py @@ -60,6 +60,7 @@ ) from metaflow.plugins.kubernetes.kubernetes_jobsets import KubernetesArgoJobSet from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK +from metaflow.user_configs.config_options import ConfigInput from metaflow.util import ( compress_list, dict_to_cli_options, @@ -168,6 +169,7 @@ def __init__( self.enable_heartbeat_daemon = enable_heartbeat_daemon self.enable_error_msg_capture = enable_error_msg_capture self.parameters = self._process_parameters() + self.config_parameters = self._process_config_parameters() self.triggers, self.trigger_options = self._process_triggers() self._schedule, self._timezone = self._get_schedule() @@ -455,6 +457,10 @@ def _process_parameters(self): "case-insensitive." % param.name ) seen.add(norm) + # NOTE: We skip config parameters as these do not have dynamic values, + # and need to be treated differently. + if param.IS_FLOW_PARAMETER: + continue extra_attrs = {} if param.kwargs.get("type") == JSONType: @@ -488,6 +494,7 @@ def _process_parameters(self): # execution - which needs to be fixed imminently. if not is_required or default_value is not None: default_value = json.dumps(default_value) + parameters[param.name] = dict( name=param.name, value=default_value, @@ -498,6 +505,27 @@ def _process_parameters(self): ) return parameters + def _process_config_parameters(self): + parameters = [] + seen = set() + for var, param in self.flow._get_parameters(): + if not param.IS_FLOW_PARAMETER: + continue + # Throw an exception if the parameter is specified twice. + norm = param.name.lower() + if norm in seen: + raise MetaflowException( + "Parameter *%s* is specified twice. " + "Note that parameter names are " + "case-insensitive." 
% param.name + ) + seen.add(norm) + + parameters.append( + dict(name=param.name, kv_name=ConfigInput.make_key_name(param.name)) + ) + return parameters + def _process_triggers(self): # Impute triggers for Argo Workflow Template specified through @trigger and # @trigger_on_finish decorators @@ -520,8 +548,13 @@ def _process_triggers(self): # convert them to lower case since Metaflow parameters are case # insensitive. seen = set() + # NOTE: We skip config parameters as their values can not be set through event payloads params = set( - [param.name.lower() for var, param in self.flow._get_parameters()] + [ + param.name.lower() + for var, param in self.flow._get_parameters() + if not param.IS_FLOW_PARAMETER + ] ) trigger_deco = self.flow._flow_decorators.get("trigger")[0] trigger_deco.format_deploytime_value() @@ -1720,6 +1753,13 @@ def _container_templates(self): metaflow_version["production_token"] = self.production_token env["METAFLOW_VERSION"] = json.dumps(metaflow_version) + # map config values + cfg_env = { + param["name"]: param["kv_name"] for param in self.config_parameters + } + if cfg_env: + env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env) + # Set the template inputs and outputs for passing state. Very simply, # the container template takes in input-paths as input and outputs # the task-id (which feeds in as input-paths to the subsequent task). diff --git a/metaflow/plugins/aws/step_functions/step_functions.py b/metaflow/plugins/aws/step_functions/step_functions.py index 0534cd2179d..61862183883 100644 --- a/metaflow/plugins/aws/step_functions/step_functions.py +++ b/metaflow/plugins/aws/step_functions/step_functions.py @@ -18,6 +18,7 @@ SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH, ) from metaflow.parameters import deploy_time_eval +from metaflow.user_configs.config_options import ConfigInput from metaflow.util import dict_to_cli_options, to_pascalcase from ..batch.batch import Batch @@ -71,6 +72,7 @@ def __init__( self.username = username self.max_workers = max_workers self.workflow_timeout = workflow_timeout + self.config_parameters = self._process_config_parameters() # https://aws.amazon.com/blogs/aws/step-functions-distributed-map-a-serverless-solution-for-large-scale-parallel-data-processing/ self.use_distributed_map = use_distributed_map @@ -485,6 +487,10 @@ def _process_parameters(self): "case-insensitive." % param.name ) seen.add(norm) + # NOTE: We skip config parameters as these do not have dynamic values, + # and need to be treated differently. + if param.IS_FLOW_PARAMETER: + continue is_required = param.kwargs.get("required", False) # Throw an exception if a schedule is set for a flow with required @@ -501,6 +507,27 @@ def _process_parameters(self): parameters.append(dict(name=param.name, value=value)) return parameters + def _process_config_parameters(self): + parameters = [] + seen = set() + for var, param in self.flow._get_parameters(): + if not param.IS_FLOW_PARAMETER: + continue + # Throw an exception if the parameter is specified twice. + norm = param.name.lower() + if norm in seen: + raise MetaflowException( + "Parameter *%s* is specified twice. " + "Note that parameter names are " + "case-insensitive." % param.name + ) + seen.add(norm) + + parameters.append( + dict(name=param.name, kv_name=ConfigInput.make_key_name(param.name)) + ) + return parameters + def _batch(self, node): attrs = { # metaflow.user is only used for setting the AWS Job Name. 
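On both Argo Workflows and Step Functions, the hunks here and just below take the same approach: rather than exposing each config as a CLI parameter, they build a name-to-key mapping and hand it to the task in a single JSON environment variable. A minimal sketch of that hand-off, where make_key_name is a hypothetical stand-in for ConfigInput.make_key_name (whose real key scheme is not shown in this patch):

import json

def make_key_name(name):
    # hypothetical stand-in for ConfigInput.make_key_name
    return "kv." + name.lower()

# mirrors the dicts built by _process_config_parameters in the hunks above
config_parameters = [
    {"name": n, "kv_name": make_key_name(n)} for n in ("config", "silly_config")
]

env = {}
cfg_env = {param["name"]: param["kv_name"] for param in config_parameters}
if cfg_env:
    env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env)

print(env["METAFLOW_FLOW_CONFIG_VALUE"])
# {"config": "kv.config", "silly_config": "kv.silly_config"}

The intent is that the task-side config machinery reads this variable back and resolves each key to the stored configuration value instead of expecting it on the command line.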
@@ -747,6 +774,11 @@ def _batch(self, node): metaflow_version["production_token"] = self.production_token env["METAFLOW_VERSION"] = json.dumps(metaflow_version) + # map config values + cfg_env = {param["name"]: param["kv_name"] for param in self.config_parameters} + if cfg_env: + env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env) + # Set AWS DynamoDb Table Name for state tracking for for-eaches. # There are three instances when metaflow runtime directly interacts # with AWS DynamoDB. diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 2d946fe7133..49f5d1902fb 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -227,7 +227,7 @@ def name(self): def from_cli(cls, flow_file: str, cli_collection: Callable) -> Callable: flow_cls = extract_flow_class_from_file(flow_file) flow_parameters = [ - p for _, p in flow_cls._get_parameters() if not p.IS_FLOW_PARAMETER + p for _, p in flow_cls._get_parameters() if not p.IS_CONFIG_PARAMETER ] with flow_context(flow_cls) as _: add_decorator_options(cli_collection) diff --git a/metaflow/user_configs/config_decorators.py b/metaflow/user_configs/config_decorators.py index 2e5202406ce..2360f7ea969 100644 --- a/metaflow/user_configs/config_decorators.py +++ b/metaflow/user_configs/config_decorators.py @@ -2,12 +2,14 @@ from typing import Any, Callable, Generator, Optional, TYPE_CHECKING, Tuple, Union from metaflow.exception import MetaflowException -from metaflow.parameters import Parameter -from metaflow.user_configs.config_parameters import ConfigValue +from metaflow.user_configs.config_parameters import ( + ConfigValue, + resolve_delayed_evaluator, +) if TYPE_CHECKING: - from metaflow.flowspec import _FlowSpecMeta - from metaflow.decorators import FlowSpecDerived + import metaflow.flowspec + import metaflow.decorators class MutableStep: @@ -20,8 +22,8 @@ class MutableStep: def __init__( self, step: Union[ - Callable[["FlowSpecDerived"], None], - Callable[["FlowSpecDerived", Any], None], + Callable[["metaflow.decorators.FlowSpecDerived"], None], + Callable[["metaflow.decorators.FlowSpecDerived", Any], None], ], ): self._my_step = step @@ -51,6 +53,9 @@ def remove_decorator(self, deco_name: str, all: bool = True, **kwargs) -> bool: did_remove = False for deco in self._my_step.decorators: if deco.name == deco_name: + # Evaluate all the configuration values if any + deco.init() + # Check filters match_ok = True if kwargs: @@ -144,6 +149,10 @@ def start(self): ).items(): yield name, ConfigValue(value) + @property + def parameters(self) -> Generator[Tuple[str, Any], None, None]: + pass + @property def steps(self) -> Generator[Tuple[str, MutableStep], None, None]: """ @@ -174,13 +183,22 @@ def __getattr__(self, name): class CustomFlowDecorator: def __init__(self, *args, **kwargs): - from ..flowspec import FlowSpec, _FlowSpecMeta + from ..flowspec import FlowSpecMeta - if args and isinstance(args[0], (CustomFlowDecorator, _FlowSpecMeta)): + if args and isinstance(args[0], (CustomFlowDecorator, FlowSpecMeta)): # This means the decorator is bare like @MyDecorator # and the first argument is the FlowSpec or another decorator (they # can be stacked) + + # If we have an init function, we call it with no arguments -- this can + # happen if the user defines a function with default parameters, for example + try: + self.init() + except NotImplementedError: + pass + + # Now set the flow class we apply to + if isinstance(args[0], FlowSpecMeta): self._set_flow_cls(args[0]) else: 
self._set_flow_cls(args[0]._flow_cls) @@ -189,12 +207,25 @@ def __init__(self, *args, **kwargs): self._args = args self._kwargs = kwargs - def __call__(self, flow_spec: Optional["_FlowSpecMeta"] = None) -> "_FlowSpecMeta": + def __call__( + self, flow_spec: Optional["metaflow.flowspec.FlowSpecMeta"] = None + ) -> "metaflow.flowspec.FlowSpecMeta": if flow_spec: # This is the case of a decorator @MyDecorator(foo=1, bar=2) and so - # we already called __init__ and saved foo and bar and are now calling - # this on the flow itself. - self.init(*self._args, **self._kwargs) + # we already called __init__ and saved foo and bar in self._args and + # self._kwargs and are now calling this on the flow itself. + + # You can use config values in the arguments to a CustomFlowDecorator + # so we resolve those as well + new_args = [resolve_delayed_evaluator(arg) for arg in self._args] + try: + self.init(*self._args, **self._kwargs) + except NotImplementedError as e: + raise MetaflowException( + "CustomFlowDecorator '%s' is used with arguments " + "but does not implement init" % str(self.__class__) + ) from e + return self._set_flow_cls(flow_spec) elif not self._flow_cls: # This means that somehow the initialization did not happen properly @@ -204,7 +235,9 @@ def __call__(self, flow_spec: Optional["_FlowSpecMeta"] = None) -> "_FlowSpecMet ) return self._flow_cls() - def _set_flow_cls(self, flow_spec: "_FlowSpecMeta") -> "_FlowSpecMeta": + def _set_flow_cls( + self, flow_spec: "metaflow.flowspec.FlowSpecMeta" + ) -> "metaflow.flowspec.FlowSpecMeta": from ..flowspec import _FlowState flow_spec._flow_state.setdefault(_FlowState.CONFIG_DECORATORS, []).append(self) @@ -223,6 +256,19 @@ def init(self, *args, **kwargs): raise NotImplementedError() def evaluate(self, mutable_flow: MutableFlow) -> None: + """ + Implement this method to act on the flow and modify it as needed. + + Parameters + ---------- + mutable_flow : MutableFlow + The flow, exposed as a MutableFlow, that this decorator can inspect + and modify + + Raises + ------ + NotImplementedError + If this method is not overridden in a subclass + """ raise NotImplementedError() @@ -249,18 +295,18 @@ def __call__( self, step: Optional[ Union[ - Callable[["FlowSpecDerived"], None], - Callable[["FlowSpecDerived", Any], None], + Callable[["metaflow.decorators.FlowSpecDerived"], None], + Callable[["metaflow.decorators.FlowSpecDerived", Any], None], ] ] = None, ) -> Union[ - Callable[["FlowSpecDerived"], None], - Callable[["FlowSpecDerived", Any], None], + Callable[["metaflow.decorators.FlowSpecDerived"], None], + Callable[["metaflow.decorators.FlowSpecDerived", Any], None], ]: if step: # This is the case of a decorator @MyDecorator(foo=1, bar=2) and so - # we already called __init__ and saved foo and bar and are now calling - # this on the step itself. + # we already called __init__ and saved foo and bar into self._args and + # self._kwargs and are now calling this on the step itself. 
self.init(*self._args, **self._kwargs) return self._set_my_step(step) elif not self._my_step: @@ -274,12 +320,12 @@ def __call__( def _set_my_step( self, step: Union[ - Callable[["FlowSpecDerived"], None], - Callable[["FlowSpecDerived", Any], None], + Callable[["metaflow.decorators.FlowSpecDerived"], None], + Callable[["metaflow.decorators.FlowSpecDerived", Any], None], ], ) -> Union[ - Callable[["FlowSpecDerived"], None], - Callable[["FlowSpecDerived", Any], None], + Callable[["metaflow.decorators.FlowSpecDerived"], None], + Callable[["metaflow.decorators.FlowSpecDerived", Any], None], ]: from ..flowspec import _FlowState diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py index 21cf2fcfaec..9502e0eab66 100644 --- a/metaflow/user_configs/config_options.py +++ b/metaflow/user_configs/config_options.py @@ -359,7 +359,7 @@ def config_options(cmd): if flow_cls is None: return cmd - parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] + parameters = [p for _, p in flow_cls._get_parameters() if p.IS_CONFIG_PARAMETER] # List all the configuration options for arg in parameters[::-1]: kwargs = arg.option_kwargs(False) diff --git a/metaflow/user_configs/config_parameters.py b/metaflow/user_configs/config_parameters.py index e6173bd762d..81e1a0f8738 100644 --- a/metaflow/user_configs/config_parameters.py +++ b/metaflow/user_configs/config_parameters.py @@ -42,6 +42,8 @@ ID_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") +UNPACK_KEY = "_unpacked_delayed_" + def dump_config_values(flow: "FlowSpec"): from ..flowspec import _FlowState # Prevent circular import @@ -152,7 +154,7 @@ class DelayEvaluator(collections.abc.Mapping): This is used when we want to use config.* values in decorators for example. It also allows the following "delayed" access on an obj that is a DelayEvaluation - - obj.x.y.z (ie: accessing members of DelayEvaluator; acesses will be delayed until + - obj.x.y.z (ie: accessing members of DelayEvaluator; accesses will be delayed until the DelayEvaluator is evaluated) - **obj (ie: unpacking the DelayEvaluator as a dictionary). Note that this requires special handling in whatever this is being unpacked into, specifically the handling @@ -170,10 +172,10 @@ def __init__(self, ex: str): self._access = None def __iter__(self): - yield "_unpacked_delayed_%d" % id(self) + yield "%s%d" % (UNPACK_KEY, id(self)) def __getitem__(self, key): - if key == "_unpacked_delayed_%d" % id(self): + if key == "%s%d" % (UNPACK_KEY, id(self)): return self raise KeyError(key) @@ -295,7 +297,7 @@ class Config(Parameter, collections.abc.Mapping): If True, show the default value in the help text. 
""" - IS_FLOW_PARAMETER = True + IS_CONFIG_PARAMETER = True def __init__( self, @@ -345,3 +347,31 @@ def __len__(self): def __getitem__(self, key): return DelayEvaluator(self.name.lower())[key] + + +def resolve_delayed_evaluator(v: Any) -> Any: + if isinstance(v, DelayEvaluator): + return v() + if isinstance(v, dict): + return { + resolve_delayed_evaluator(k): resolve_delayed_evaluator(v) + for k, v in v.items() + } + if isinstance(v, list): + return [resolve_delayed_evaluator(x) for x in v] + if isinstance(v, tuple): + return tuple(resolve_delayed_evaluator(x) for x in v) + if isinstance(v, set): + return {resolve_delayed_evaluator(x) for x in v} + return v + + +def unpack_delayed_evaluator(to_unpack: Dict[str, Any]) -> Dict[str, Any]: + result = {} + for k, v in to_unpack.items(): + if not isinstance(k, str) or not k.startswith(UNPACK_KEY): + result[k] = v + else: + # k.startswith(UNPACK_KEY) + result.update(resolve_delayed_evaluator(v[k])) + return result diff --git a/test/core/tests/basic_config_parameters.py b/test/core/tests/basic_config_parameters.py index e333b645a73..b8e91a0f259 100644 --- a/test/core/tests/basic_config_parameters.py +++ b/test/core/tests/basic_config_parameters.py @@ -11,19 +11,36 @@ class BasicConfigTest(MetaflowTest): "default_from_func": {"default": "param_default", "type": "int"}, } CONFIGS = { + # Test a default value as a dict "config": {"default_value": "default_config"}, - "silly_config": {"required": True, "parser": "silly_parser"}, + # Test parser, various arguments and overriden default + "silly_config": { + "required": True, + "parser": "silly_parser", + "default": "silly.txt", + }, "config2": {}, + # Test using a function to get the value "config3": {"default_value": "config_default"}, + # Test ** notation + "config_env": {}, } HEADER = """ import json import os +# Test passing values directly on the command line os.environ['METAFLOW_FLOW_CONFIG_VALUE'] = json.dumps( { - "config2": {"default_param": 123}, - "silly_config": "baz:amazing" + "config2": {"default_param": 123} + "config_env": {"vars": {"var1": "value1", "var2": "value2"}} + } +) + +# Test overriding a file (the default one) +os.environ['METAFLOW_FLOW_CONFIG'] = json.dumps( + { + "silly_config": "basic_config_silly.txt" } ) @@ -73,6 +90,7 @@ def step_all(self): assert_equals(self.config.nested["value"], 43) assert_equals(self.config["nested"].value, 43) + # Test parser assert_equals(self.silly_config.baz, "amazing") assert_equals(self.silly_config["baz"], "amazing") @@ -90,6 +108,13 @@ def step_all(self): except TypeError: pass + @tag("environment(**config_env)") + @steps(0, ["start"]) + def step_start(self): + # Here we check the environment based on the ** notation + assert_equals(os.environ["var1"], "value1") + assert_equals(os.environ["var2"], "value2") + def check_results(self, flow, checker): for step in flow: checker.assert_artifact( @@ -103,4 +128,8 @@ def check_results(self, flow, checker): }, ) checker.assert_artifact(step.name, "config2", {"default_param": 123}) + checker.assert_artifact(step.name, "config3", {"val": 456}) checker.assert_artifact(step.name, "silly_config", {"baz": "amazing"}) + checker.assert_artifact( + step.name, "config_env", {"vars": {"var1": "value1", "var2": "value2"}} + ) diff --git a/test/core/tests/basic_config_silly.txt b/test/core/tests/basic_config_silly.txt new file mode 100644 index 00000000000..c438d89d5e0 --- /dev/null +++ b/test/core/tests/basic_config_silly.txt @@ -0,0 +1 @@ +baz:amazing diff --git a/test/core/tests/custom_decorators.py 
b/test/core/tests/custom_decorators.py new file mode 100644 index 00000000000..e69de29bb2d From 84892fbde68006e92055c23a2d29b8c08844259b Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Sun, 24 Nov 2024 10:35:17 -0800 Subject: [PATCH 23/30] Added string parsers. Added test for mutable flow/step. Added debug messages Use METAFLOW_DEBUG_USERCONF=1 to get a bit more detail. Should be feature complete now. --- metaflow/cli.py | 2 +- metaflow/cli_components/utils.py | 55 +++ metaflow/decorators.py | 5 +- metaflow/flowspec.py | 66 ++-- metaflow/metaflow_config.py | 2 +- metaflow/parameters.py | 13 +- metaflow/plugins/argo/argo_workflows.py | 6 +- .../aws/step_functions/step_functions.py | 4 +- metaflow/runtime.py | 1 + metaflow/user_configs/config_decorators.py | 356 ++++++++++++++---- metaflow/user_configs/config_options.py | 40 +- metaflow/user_configs/config_parameters.py | 25 +- test/core/metaflow_test/formatter.py | 5 +- test_config/config_simple.json | 1 + test_config/config_simple.py | 61 +++ test_config/mutable_flow.py | 165 ++++++++ 16 files changed, 688 insertions(+), 119 deletions(-) create mode 100644 test_config/config_simple.json create mode 100644 test_config/config_simple.py create mode 100644 test_config/mutable_flow.py diff --git a/metaflow/cli.py b/metaflow/cli.py index 4f9d2759930..8d355f8465a 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -449,7 +449,7 @@ def start( }, ) - if ctx.invoked_subcommand not in ("run", "resume"): + if ctx.protected_args and ctx.protected_args[0] not in ("run", "resume"): # run/resume are special cases because they can add more decorators with --with, # so they have to take care of themselves. all_decospecs = ctx.obj.tl_decospecs + list( diff --git a/metaflow/cli_components/utils.py b/metaflow/cli_components/utils.py index 5de8d25f084..d56eab486cc 100644 --- a/metaflow/cli_components/utils.py +++ b/metaflow/cli_components/utils.py @@ -12,6 +12,61 @@ def __init__(self, *args, lazy_sources=None, **kwargs): self.lazy_sources = lazy_sources or {} self._lazy_loaded = {} + def invoke(self, ctx): + # NOTE: This is copied from MultiCommand.invoke. The change is that we + # behave like chain in the sense that we evaluate the subcommand *after* + # invoking the base command but we don't chain the commands like self.chain + # would otherwise indicate. + # The goal of this is to make sure that the first command is properly executed + # *first* prior to loading the other subcommands. It's more a lazy_subcommand_load + # than a chain. + # Look for CHANGE HERE in this code to see where the changes are made. + # If click is updated, this may also need to be updated. This version is for + # click 7.1.2. + def _process_result(value): + if self.result_callback is not None: + value = ctx.invoke(self.result_callback, value, **ctx.params) + return value + + if not ctx.protected_args: + # If we are invoked without command the chain flag controls + # how this happens. If we are not in chain mode, the return + # value here is the return value of the command. + # If however we are in chain mode, the return value is the + # return value of the result processor invoked with an empty + # list (which means that no subcommand actually was executed). 
+ if self.invoke_without_command: + # CHANGE HERE: We behave like self.chain = False here + + # if not self.chain: + return click.Command.invoke(self, ctx) + # with ctx: + # click.Command.invoke(self, ctx) + # return _process_result([]) + + ctx.fail("Missing command.") + + # Fetch args back out + args = ctx.protected_args + ctx.args + ctx.args = [] + ctx.protected_args = [] + + # If we're not in chain mode, we only allow the invocation of a + # single command but we also inform the current context about the + # name of the command to invoke. + # CHANGE HERE: We change this block to do the invoke *before* the resolve_command + # Make sure the context is entered so we do not clean up + # resources until the result processor has worked. + with ctx: + ctx.invoked_subcommand = "*" if args else None + click.Command.invoke(self, ctx) + cmd_name, cmd, args = self.resolve_command(ctx, args) + sub_ctx = cmd.make_context(cmd_name, args, parent=ctx) + with sub_ctx: + return _process_result(sub_ctx.command.invoke(sub_ctx)) + + # CHANGE HERE: Removed all the part of chain mode. + def list_commands(self, ctx): base = super().list_commands(ctx) for source_name, source in self.lazy_sources.items(): diff --git a/metaflow/decorators.py b/metaflow/decorators.py index 11743278d13..35ce578db59 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -12,6 +12,7 @@ ) from .parameters import current_flow +from .user_configs.config_decorators import CustomStepDecorator from .user_configs.config_parameters import ( UNPACK_KEY, resolve_delayed_evaluator, @@ -466,6 +467,8 @@ def _base_step_decorator(decotype, *args, **kwargs): # No keyword arguments specified for the decorator, e.g. @foobar. # The first argument is the function to be decorated. func = args[0] + if isinstance(func, CustomStepDecorator): + func = func._my_step if not hasattr(func, "is_step"): raise BadStepDecoratorException(decotype.name, func) @@ -548,7 +551,6 @@ def _attach_decorators_to_step(step, decospecs): def _init(flow, only_non_static=False): - # We get the datastore for the _parameters step which can contain for decorators in flow._flow_decorators.values(): for deco in decorators: if not only_non_static or not deco.statically_defined: @@ -669,6 +671,7 @@ def foo(self): """ f.is_step = True f.decorators = [] + f.config_decorators = [] try: # python 3 f.name = f.__name__ diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index b24a659dbde..2db958c2ff8 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Tuple from . import cmd_with_io, parameters +from .debug import debug from .parameters import DelayedEvaluationParameter, Parameter from .exception import ( MetaflowException, @@ -191,12 +192,13 @@ def _process_config_decorators(self, config_options): if not self._flow_state.get(_FlowState.CONFIG_DECORATORS): return self + debug.userconf_exec("Processing mutating step/flow decorators") # We need to convert all the user configurations from DelayedEvaluationParameters # to actual values so they can be used as is in the config decorators. 
- # We then reset them to be proper parameters so they can be re-evaluated in + # We then reset them to be proper configs so they can be re-evaluated in # _set_constants - to_reset_params = [] + to_reset_configs = [] self._check_parameters(config_parameters=True) for var, param in self._get_parameters(): if not param.IS_CONFIG_PARAMETER: @@ -205,10 +207,32 @@ def _process_config_decorators(self, config_options): val = config_options.get(param.name.replace("-", "_").lower()) if isinstance(val, DelayedEvaluationParameter): val = val() - to_reset_params.append((var, param, val)) + # We store the value as well so that in _set_constants, we don't try + # to recompute (no guarantee that it is stable) + param._store_value(val) + to_reset_configs.append((var, param)) + debug.userconf_exec("Setting config %s to %s" % (var, str(val))) setattr(current_cls, var, val) - # Run all the decorators + # Run all the decorators. Step decorators are directly in the step and + # we will run those first and *then* we run all the flow level decorators + for step in self._steps: + for deco in step.config_decorators: + if isinstance(deco, CustomStepDecorator): + debug.userconf_exec( + "Evaluating step level decorator %s for %s" + % (deco.__class__.__name__, step.name) + ) + deco.evaluate(MutableStep(current_cls, step)) + else: + raise MetaflowInternalError( + "A non-CustomStepDecorator found in step config decorators" + ) + if step.config_decorators: + # We remove all mention of the custom step decorator + setattr(current_cls, step.name, step) + + mutable_flow = MutableFlow(current_cls) for deco in self._flow_state[_FlowState.CONFIG_DECORATORS]: if isinstance(deco, CustomFlowDecorator): # Sanity check to make sure we are applying the decorator to the right @@ -221,16 +245,18 @@ def _process_config_decorators(self, config_options): "expected %s but got %s" % (deco._flow_cls.__name__, current_cls.__name__) ) - deco.evaluate(MutableFlow(current_cls)) - elif isinstance(deco, CustomStepDecorator): - # Again some sanity checks - if deco._flow_cls != current_cls: - raise MetaflowInternalError( - "CustomStepDecorator registered on the wrong flow -- " - "expected %s but got %s" - % (deco._flow_cls.__name__, current_cls.__name__) - ) - deco.evaluate(CustomStepDecorator(deco._my_step)) + debug.userconf_exec( + "Evaluating flow level decorator %s" % deco.__class__.__name__ ) + deco.evaluate(mutable_flow) + # We reset cached_parameters on the very off chance that the user added + # more configurations based on the configuration + if _FlowState.CACHED_PARAMETERS in current_cls._flow_state: + del current_cls._flow_state[_FlowState.CACHED_PARAMETERS] + else: + raise MetaflowInternalError( + "A non-CustomFlowDecorator found in flow config decorators" + ) # Process parameters to allow them to also use config values easily for var, param in self._get_parameters(): @@ -240,14 +266,9 @@ def _process_config_decorators(self, config_options): # Reset all configs that were already present in the class. # TODO: This means that users can't override configs directly. 
Not sure if this # is a pattern we want to support - for var, param in to_reset_params: + for var, param in to_reset_configs: setattr(current_cls, var, param) - # We reset cached_parameters on the very off chance that the user added - # more configurations based on the configuration - if _FlowState.CACHED_PARAMETERS in current_cls._flow_state: - del current_cls._flow_state[_FlowState.CACHED_PARAMETERS] - # Set the current flow class we are in (the one we just created) parameters.replace_flow_context(current_cls) return current_cls(use_cli=False) @@ -269,7 +290,10 @@ def _set_constants(self, graph, kwargs, config_options): for var, param in self._get_parameters(): seen.add(var) if param.IS_CONFIG_PARAMETER: - val = config_options.get(param.name.replace("-", "_").lower()) + # Use computed value if already evaluated, else get from config_options + val = param._computed_value or config_options.get( + param.name.replace("-", "_").lower() + ) else: val = kwargs[param.name.replace("-", "_").lower()] # Support for delayed evaluation of parameters. diff --git a/metaflow/metaflow_config.py b/metaflow/metaflow_config.py index 9b13e2c0d28..a590b40527b 100644 --- a/metaflow/metaflow_config.py +++ b/metaflow/metaflow_config.py @@ -440,7 +440,7 @@ ### # Debug configuration ### -DEBUG_OPTIONS = ["subcommand", "sidecar", "s3client", "tracing", "stubgen"] +DEBUG_OPTIONS = ["subcommand", "sidecar", "s3client", "tracing", "stubgen", "userconf"] for typ in DEBUG_OPTIONS: vars()["DEBUG_%s" % typ.upper()] = from_conf("DEBUG_%s" % typ.upper(), False) diff --git a/metaflow/parameters.py b/metaflow/parameters.py index ddcf43263c5..99fe869ff17 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -300,7 +300,11 @@ class MyFlow(FlowSpec): ---------- name : str User-visible parameter name. - default : str or float or int or bool or `JSONType` or a function. + default : Union[str, float, int, bool, Dict[str, Any], + Callable[ + [ParameterContext], Union[str, float, int, bool, Dict[str, Any]] + ], + ], optional, default None Default value for the parameter. Use a special `JSONType` class to indicate that the value must be a valid JSON object. A function implies that the parameter corresponds to a *deploy-time parameter*. @@ -309,11 +313,12 @@ class MyFlow(FlowSpec): If `default` is not specified, define the parameter type. Specify one of `str`, `float`, `int`, `bool`, or `JSONType`. If None, defaults to the type of `default` or `str` if none specified. - help : str, optional + help : str, optional, default None Help text to show in `run --help`. required : bool, optional, default None - Require that the user specified a value for the parameter. `required=True` implies - that `default` is not used. A value of None is equivalent to False. + Require that the user specified a value for the parameter. Note that if + a default is provided, the required flag is ignored. + A value of None is equivalent to False. show_default : bool, optional, default None If True, show the default value in the help text. A value of None is equivalent to True. diff --git a/metaflow/plugins/argo/argo_workflows.py b/metaflow/plugins/argo/argo_workflows.py index 1214545522a..ea0d6c6798e 100644 --- a/metaflow/plugins/argo/argo_workflows.py +++ b/metaflow/plugins/argo/argo_workflows.py @@ -459,7 +459,7 @@ def _process_parameters(self): seen.add(norm) # NOTE: We skip config parameters as these do not have dynamic values, # and need to be treated differently. 
- if param.IS_FLOW_PARAMETER: + if param.IS_CONFIG_PARAMETER: continue extra_attrs = {} @@ -509,7 +509,7 @@ def _process_config_parameters(self): parameters = [] seen = set() for var, param in self.flow._get_parameters(): - if not param.IS_FLOW_PARAMETER: + if not param.IS_CONFIG_PARAMETER: continue # Throw an exception if the parameter is specified twice. norm = param.name.lower() @@ -553,7 +553,7 @@ def _process_triggers(self): [ param.name.lower() for var, param in self.flow._get_parameters() - if not param.IS_FLOW_PARAMETER + if not param.IS_CONFIG_PARAMETER ] ) trigger_deco = self.flow._flow_decorators.get("trigger")[0] diff --git a/metaflow/plugins/aws/step_functions/step_functions.py b/metaflow/plugins/aws/step_functions/step_functions.py index 61862183883..6154192a6f8 100644 --- a/metaflow/plugins/aws/step_functions/step_functions.py +++ b/metaflow/plugins/aws/step_functions/step_functions.py @@ -489,7 +489,7 @@ def _process_parameters(self): seen.add(norm) # NOTE: We skip config parameters as these do not have dynamic values, # and need to be treated differently. - if param.IS_FLOW_PARAMETER: + if param.IS_CONFIG_PARAMETER: continue is_required = param.kwargs.get("required", False) @@ -511,7 +511,7 @@ def _process_config_parameters(self): parameters = [] seen = set() for var, param in self.flow._get_parameters(): - if not param.IS_FLOW_PARAMETER: + if not param.IS_CONFIG_PARAMETER: continue # Throw an exception if the parameter is specified twice. norm = param.name.lower() diff --git a/metaflow/runtime.py b/metaflow/runtime.py index 2feca2c2be0..c301b1a9ce5 100644 --- a/metaflow/runtime.py +++ b/metaflow/runtime.py @@ -1573,6 +1573,7 @@ def _options(mapping): args.extend(_options(self.top_level_options)) args.extend(self.commands) args.extend(self.command_args) + args.extend(_options(self.command_options)) return args diff --git a/metaflow/user_configs/config_decorators.py b/metaflow/user_configs/config_decorators.py index 2360f7ea969..afcec2ba077 100644 --- a/metaflow/user_configs/config_decorators.py +++ b/metaflow/user_configs/config_decorators.py @@ -1,6 +1,7 @@ from functools import partial from typing import Any, Callable, Generator, Optional, TYPE_CHECKING, Tuple, Union +from metaflow.debug import debug from metaflow.exception import MetaflowException from metaflow.user_configs.config_parameters import ( ConfigValue, @@ -8,8 +9,9 @@ ) if TYPE_CHECKING: - import metaflow.flowspec import metaflow.decorators + import metaflow.flowspec + import metaflow.parameters class MutableStep: @@ -21,13 +23,76 @@ class MutableStep: def __init__( self, + flow_spec: "metaflow.flowspec.FlowSpec", step: Union[ Callable[["metaflow.decorators.FlowSpecDerived"], None], Callable[["metaflow.decorators.FlowSpecDerived", Any], None], ], ): + self._mutable_container = MutableFlow(flow_spec) self._my_step = step + @property + def flow(self) -> "MutableFlow": + """ + The flow that contains this step + + Returns + ------- + MutableFlow + The flow that contains this step + """ + return self._mutable_container + + @property + def decorators(self) -> Generator["metaflow.decorators.StepDecorator", None, None]: + """ + Iterate over all the decorators of this step. Note that the same type of decorator + may be present multiple times and no order is guaranteed. + + Yields + ------ + metaflow.decorators.StepDecorator + A decorator of the step + """ + for deco in self._my_step.decorators: + yield deco + + def add_decorator(self, deco_type: partial, **kwargs) -> None: + """ + Add a Metaflow decorator to a step. 
+ + Parameters + ---------- + deco_type : partial + The decorator class to add to this step + """ + # Prevent circular import + from metaflow.decorators import DuplicateStepDecoratorException, StepDecorator + + # Validate deco_type + if ( + not isinstance(deco_type, partial) + or len(deco_type.args) != 1 + or not issubclass(deco_type.args[0], StepDecorator) + ): + raise TypeError("add_decorator takes a StepDecorator") + + deco_type = deco_type.args[0] + if ( + deco_type.name in [deco.name for deco in self._my_step.decorators] + and not deco_type.allow_multiple + ): + raise DuplicateStepDecoratorException(deco_type.name, self._my_step) + + debug.userconf_exec( + "Mutable decorator adding step decorator %s to step %s" + % (deco_type.name, self._my_step.name) + ) + self._my_step.decorators.append( + deco_type(attributes=kwargs, statically_defined=True) + ) + def remove_decorator(self, deco_name: str, all: bool = True, **kwargs) -> bool: """ Remove one or more Metaflow decorators from a step. @@ -65,6 +130,10 @@ def remove_decorator(self, deco_name: str, all: bool = True, **kwargs) -> bool: break if match_ok: did_remove = True + debug.userconf_exec( + "Mutable decorator removing step decorator %s from step %s" + % (deco.name, self._my_step.name) + ) else: new_deco_list.append(deco) else: @@ -75,42 +144,26 @@ def remove_decorator(self, deco_name: str, all: bool = True, **kwargs) -> bool: self._my_step.decorators = new_deco_list return did_remove - def add_decorator(self, deco_type: partial, **kwargs) -> None: - """ - Add a Metaflow decorator to a step. - - Parameters - ---------- - deco_type : partial - The decorator class to add to this step - """ - # Prevent circular import - from metaflow.decorators import DuplicateStepDecoratorException, StepDecorator - - # Validate deco_type - if ( - not isinstance(deco_type, partial) - or len(deco_type.args) != 1 - or not issubclass(deco_type.args[0], StepDecorator) - ): - raise TypeError("add_decorator takes a StepDecorator") - - deco_type = deco_type.args[0] - if ( - deco_type.name in [deco.name for deco in self._my_step.decorators] - and not deco_type.allow_multiple - ): - raise DuplicateStepDecoratorException(deco_type.name, self._my_step) - - self._my_step.decorators.append( - deco_type(attributes=kwargs, statically_defined=True) - ) - class MutableFlow: - def __init__(self, flow_spec: "FlowSpec"): + def __init__(self, flow_spec: "metaflow.flowspec.FlowSpec"): self._flow_cls = flow_spec + @property + def decorators(self) -> Generator["metaflow.decorators.FlowDecorator", None, None]: + """ + Iterate over all the decorators of this flow. Note that the same type of decorator + may be present multiple times and no order is guaranteed. + + Yields + ------ + metaflow.decorators.FlowDecorator + A decorator of the flow + """ + for decos in self._flow_cls._flow_decorators.values(): + for deco in decos: + yield deco + @property def configs(self) -> Generator[Tuple[str, ConfigValue], None, None]: """ @@ -151,12 +204,16 @@ def start(self): @property def parameters(self) -> Generator[Tuple[str, Any], None, None]: - pass + for var, param in self._flow_cls._get_parameters(): + if param.IS_CONFIG_PARAMETER: + continue + yield var, param @property def steps(self) -> Generator[Tuple[str, MutableStep], None, None]: """ - Iterate over all the steps in this flow + Iterate over all the steps in this flow. The order of the steps + returned is not guaranteed. 
Yields ------ MutableStep A step in the flow """ for var in dir(self._flow_cls): potential_step = getattr(self._flow_cls, var) if callable(potential_step) and hasattr(potential_step, "is_step"): - yield var, MutableStep(potential_step) + yield var, MutableStep(self._flow_cls, potential_step) + + def add_parameter( + self, name: str, value: "metaflow.parameters.Parameter", overwrite: bool = False + ) -> None: + from metaflow.parameters import Parameter + + if hasattr(self._flow_cls, name) and not overwrite: + raise MetaflowException( + "Flow '%s' already has a class member '%s' -- " + "set overwrite=True in add_parameter to overwrite it." + % (self._flow_cls.__name__, name) + ) + if not isinstance(value, Parameter) or value.IS_CONFIG_PARAMETER: + raise MetaflowException( + "Only a Parameter or an IncludeFile can be added using `add_parameter`" + "; got %s" % type(value) + ) + debug.userconf_exec("Mutable flow decorator adding parameter %s to flow" % name) + setattr(self._flow_cls, name, value) + + def remove_parameter(self, parameter_name: str) -> bool: + """ + Remove a parameter from the flow. + + The name given should match the name of the parameter (which can be different + from the name of the variable in the flow). You cannot remove config parameters. + + Parameters + ---------- + parameter_name : str + Name of the parameter + + Returns + ------- + bool + Returns True if the parameter was removed + """ + from metaflow.flowspec import _FlowState + + for var, param in self._flow_cls._get_parameters(): + if param.IS_CONFIG_PARAMETER: + continue + if param.name == parameter_name: + delattr(self._flow_cls, var) + debug.userconf_exec( + "Mutable flow decorator removing parameter %s from flow" % var + ) + # Reset so that we don't list it again + del self._flow_cls._flow_state[_FlowState.CACHED_PARAMETERS] + return True + return False + + def add_decorator(self, deco_type: partial, **kwargs) -> None: + """ + Add a Metaflow decorator to a flow. + + Parameters + ---------- + deco_type : partial + The decorator class to add to this flow + """ + # Prevent circular import + from metaflow.decorators import DuplicateFlowDecoratorException, FlowDecorator + + # Validate deco_type + if ( + not isinstance(deco_type, partial) + or len(deco_type.args) != 1 + or not issubclass(deco_type.args[0], FlowDecorator) + ): + raise TypeError("add_decorator takes a FlowDecorator") + + deco_type = deco_type.args[0] + if ( + deco_type.name in self._flow_cls._flow_decorators + and not deco_type.allow_multiple + ): + raise DuplicateFlowDecoratorException(deco_type.name) + + self._flow_cls._flow_decorators.setdefault(deco_type.name, []).append( + deco_type(attributes=kwargs, statically_defined=True) + ) + debug.userconf_exec( + "Mutable flow decorator adding decorator %s to flow" % deco_type.name + ) + + def remove_decorator(self, deco_name: str, all: bool = True, **kwargs) -> bool: + """ + Remove one or more Metaflow decorators from a flow. + + Some decorators can be applied multiple times to a flow. This method allows you + to choose whether to remove all matching instances or only the first one. + + Parameters + ---------- + deco_name : str + Name of the decorator to remove + all : bool, default True + If True, remove all instances of the decorator that match the filters + passed using kwargs (or all the instances of the decorator if no filters are + passed). If False, removes only the first found instance of the decorator. 
+ + Returns + ------- + bool + Returns True if at least one decorator was removed. + """ + new_deco_list = [] + old_deco_list = self._flow_cls._flow_decorators.get(deco_name) + if old_deco_list is None: + return False + + did_remove = False + for deco in old_deco_list: + # Evaluate all the configuration values if any + deco.init() + + # Check filters + match_ok = True + if kwargs: + for k, v in kwargs.items(): + match_ok = k in deco.attributes and deco.attributes[k] == v + if match_ok is False: + break + if match_ok: + did_remove = True + debug.userconf_exec( + "Mutable flow decorator removing decorator %s from flow" % deco.name + ) + else: + new_deco_list.append(deco) + if did_remove and not all: + break + + if new_deco_list: + self._flow_cls._flow_decorators[deco_name] = new_deco_list + else: + del self._flow_cls._flow_decorators[deco_name] + return did_remove def __getattr__(self, name): # We allow direct access to the steps, configs and parameters but nothing else + from metaflow.parameters import Parameter + attr = getattr(self._flow_cls, name) if attr: # Steps if callable(attr) and hasattr(attr, "is_step"): - return MutableStep(attr) + return MutableStep(self._flow_cls, attr) if name[0] == "_" or name in self._flow_cls._NON_PARAMETERS: raise AttributeError(self, name) - return attr + if isinstance(attr, (Parameter, ConfigValue)): + return attr raise AttributeError(self, name) @@ -207,6 +406,19 @@ def __init__(self, *args, **kwargs): self._args = args self._kwargs = kwargs + def __get__(self, instance, owner): + # Required so that we "present" as a FlowSpec when the flow decorator is + # of the form + # @MyFlowDecorator + # class MyFlow(FlowSpec): + # pass + # + # In that case, if we don't have __get__, the object is a CustomFlowDecorator + # and not a FlowSpec. This is more critical for steps (and CustomStepDecorator) + # because other parts of the code rely on steps having is_step. There are + # other ways to solve this but this allowed for minimal changes going forward. + return self() + def __call__( self, flow_spec: Optional["metaflow.flowspec.FlowSpecMeta"] = None ) -> "metaflow.flowspec.FlowSpecMeta": @@ -218,13 +430,15 @@ def __call__( # You can use config values in the arguments to a CustomFlowDecorator # so we resolve those as well new_args = [resolve_delayed_evaluator(arg) for arg in self._args] - try: - self.init(*self._args, **self._kwargs) - except NotImplementedError as e: + new_kwargs = { + k: resolve_delayed_evaluator(v) for k, v in self._kwargs.items() + } + self.init(*new_args, **new_kwargs) + if hasattr(self, "_empty_init"): raise MetaflowException( "CustomFlowDecorator '%s' is used with arguments " "but does not implement init" % str(self.__class__) - ) from e + ) return self._set_flow_cls(flow_spec) elif not self._flow_cls: @@ -248,12 +462,8 @@ def init(self, *args, **kwargs): """ This method is intended to be optionally overridden if you need to have an initializer. - - Raises - ------ - NotImplementedError: If the method is not overridden in a subclass. 
""" - raise NotImplementedError() + self._empty_init = True def evaluate(self, mutable_flow: MutableFlow) -> None: """ @@ -274,23 +484,25 @@ def evaluate(self, mutable_flow: MutableFlow) -> None: class CustomStepDecorator: def __init__(self, *args, **kwargs): - if args and ( - isinstance(args[0], CustomStepDecorator) - or callable(args[0]) - and hasattr(args[0], "is_step") - ): - # This means the decorator is bare like @MyDecorator - # and the first argument is the step or another decorator (they - # can be stacked) + arg = None + if args: if isinstance(args[0], CustomStepDecorator): - self._set_my_step(args[0]._my_step) + arg = args[0]._my_step else: - self._set_my_step(args[0]) + arg = args[0] + if arg and callable(arg) and hasattr(arg, "is_step"): + # This means the decorator is bare like @MyDecorator + # and the first argument is the step + self._set_my_step(arg) else: # The arguments are actually passed to the init function for this decorator self._args = args self._kwargs = kwargs + def __get__(self, instance, owner): + # See explanation in CustomFlowDecorator.__get__ + return self() + def __call__( self, step: Optional[ @@ -304,10 +516,18 @@ def __call__( Callable[["metaflow.decorators.FlowSpecDerived", Any], None], ]: if step: - # This is the case of a decorator @MyDecorator(foo=1, bar=2) and so - # we already called __init__ and saved foo and bar into self._args and - # self._kwargs and are now calling this on the step itself. - self.init(*self._args, **self._kwargs) + # You can use config values in the arguments to a CustomFlowDecorator + # so we resolve those as well + new_args = [resolve_delayed_evaluator(arg) for arg in self._args] + new_kwargs = { + k: resolve_delayed_evaluator(v) for k, v in self._kwargs.items() + } + self.init(*new_args, **new_kwargs) + if hasattr(self, "_empty_init"): + raise MetaflowException( + "CustomStepDecorator '%s' is used with arguments " + "but does not implement init" % str(self.__class__) + ) return self._set_my_step(step) elif not self._my_step: # This means that somehow the initialization did not happen properly @@ -327,25 +547,17 @@ def _set_my_step( Callable[["metaflow.decorators.FlowSpecDerived"], None], Callable[["metaflow.decorators.FlowSpecDerived", Any], None], ]: - from ..flowspec import _FlowState self._my_step = step - # Get the flow - flow_spec = step.__globals__[step.__qualname__.rsplit(".", 1)[0]] - flow_spec._flow_state.setdefault(_FlowState.CONFIG_DECORATORS, []).append(self) - - self._flow_cls = flow_spec + self._my_step.config_decorators.append(self) + return self._my_step def init(self, *args, **kwargs): """ This method is intended to be optionally overridden if you need to have an initializer. - - Raises - ------ - NotImplementedError: If the method is not overridden in a subclass. 
""" - raise NotImplementedError() + self._empty_init = True def evaluate(self, mutable_step: MutableStep) -> None: raise NotImplementedError() diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py index 9502e0eab66..380ea7555f4 100644 --- a/metaflow/user_configs/config_options.py +++ b/metaflow/user_configs/config_options.py @@ -1,9 +1,11 @@ +import importlib import json import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union from metaflow._vendor import click +from metaflow.debug import debug from .config_parameters import CONFIG_FILE, ConfigValue from ..exception import MetaflowException, MetaflowInternalError @@ -134,7 +136,7 @@ def __init__( self, req_configs: List[str], defaults: Dict[str, Tuple[Union[str, Dict[Any, Any]], bool]], - parsers: Dict[str, Callable[[str], Dict[Any, Any]]], + parsers: Dict[str, Union[str, Callable[[str], Dict[Any, Any]]]], ): self._req_configs = set(req_configs) self._defaults = defaults @@ -196,7 +198,11 @@ def process_configs(self, ctx, param, value): # the value go through ConvertPath or ConvertDictOrStr # - the actual value passed through prefixed with _CONVERT_PREFIX - # print("Got arg name %s and values %s" % (param.name, str(value))) + debug.userconf_exec( + "Processing configs for %s -- incoming values: %s" + % (param.name, str(value)) + ) + do_return = self._value_values is None and self._path_values is None # We only keep around non default values. We could simplify by checking just one # value and if it is default it means all are but this doesn't seem much more effort @@ -215,6 +221,7 @@ def process_configs(self, ctx, param, value): } if do_return: # One of config_value_options or config_file_options will be None + debug.userconf_exec("Incomplete config options; waiting for more") return None # The second go around, we process all the values and merge them. 
@@ -231,7 +238,8 @@ def process_configs(self, ctx, param, value): all_values = dict(self._path_values or {}) all_values.update(self._value_values or {}) - # print("Got all values: %s" % str(all_values)) + debug.userconf_exec("All config values: %s" % str(all_values)) + flow_cls._flow_state[_FlowState.CONFIGS] = {} to_return = {} @@ -263,7 +271,9 @@ def process_configs(self, ctx, param, value): else: # This is a value merged_configs[n] = ConvertDictOrStr.convert_value(val, False) - # print("Merged configs: %s" % str(merged_configs)) + + debug.userconf_exec("Configs merged with defaults: %s" % str(merged_configs)) + missing_configs = set() no_file = [] no_default_file = [] @@ -290,7 +300,7 @@ def process_configs(self, ctx, param, value): to_return[name] = ConfigValue(read_value) else: if self._parsers[name]: - read_value = self._parsers[name](val) + read_value = self._call_parser(self._parsers[name], val) else: try: read_value = json.loads(val) @@ -321,6 +331,8 @@ def process_configs(self, ctx, param, value): raise click.UsageError( "Bad values passed for configuration options: %s" % ", ".join(msgs) ) + + debug.userconf_exec("Finalized configs: %s" % str(to_return)) return to_return def __str__(self): @@ -329,6 +341,24 @@ def __str__(self): def __repr__(self): return "ConfigInput" + @staticmethod + def _call_parser(parser, val): + if isinstance(parser, str): + if len(parser) and parser[0] == ".": + parser = "metaflow" + parser + path, func = parser.rsplit(".", 1) + try: + func_module = importlib.import_module(path) + except ImportError as e: + raise ValueError("Cannot locate parser %s" % parser) from e + parser = getattr(func_module, func, None) + if parser is None or not callable(parser): + raise ValueError( + "Parser %s is either not part of %s or not a callable" + % (func, path) + ) + return parser(val) + class LocalFileInput(click.Path): # Small wrapper around click.Path to set the value from which to read configuration diff --git a/metaflow/user_configs/config_parameters.py b/metaflow/user_configs/config_parameters.py index 81e1a0f8738..18b27d84615 100644 --- a/metaflow/user_configs/config_parameters.py +++ b/metaflow/user_configs/config_parameters.py @@ -287,12 +287,16 @@ class Config(Parameter, collections.abc.Mapping): You can only specify default or default_value. help : str, optional, default None Help text to show in `run --help`. - required : bool, default False - Require that the user specified a value for the parameter. Note that if - a default is provided, the required flag is ignored. - parser : Callable[[str], Dict[Any, Any]], optional, default None - An optional function that can parse the configuration string into an arbitrarily - nested dictionary. + required : bool, optional, default None + Require that the user specified a value for the configuration. Note that if + a default is provided, the required flag is ignored. A value of None is + equivalent to False. + parser : Union[str, Callable[[str], Dict[Any, Any]]], optional, default None + If a callable, it is a function that can parse the configuration string + into an arbitrarily nested dictionary. If a string, the string should refer to + a function (like "my_parser_package.my_parser.my_parser_function") which should + be able to parse the configuration string into an arbitrarily nested dictionary. + If the name starts with a ".", it is assumed to be relative to "metaflow". show_default : bool, default True If True, show the default value in the help text. 
""" @@ -311,8 +315,8 @@ def __init__( ] ] = None, help: Optional[str] = None, - required: bool = False, - parser: Optional[Callable[[str], Dict[Any, Any]]] = None, + required: Optional[bool] = None, + parser: Optional[Union[str, Callable[[str], Dict[Any, Any]]]] = None, **kwargs: Dict[str, str] ): @@ -326,14 +330,19 @@ def __init__( super(Config, self).__init__( name, required=required, help=help, type=str, **kwargs ) + super(Config, self).init() if isinstance(kwargs.get("default", None), str): kwargs["default"] = json.dumps(kwargs["default"]) self.parser = parser + self._computed_value = None def load_parameter(self, v): return v + def _store_value(self, v: Any) -> None: + self._computed_value = v + # Support . syntax def __getattr__(self, name): return DelayEvaluator(self.name.lower()).__getattr__(name) diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index f7df9660f6b..1ba08d5a45f 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -1,6 +1,7 @@ -import sys import inspect +from metaflow.user_configs.config_decorators import CustomStepDecorator + INDENT = 4 @@ -49,6 +50,8 @@ def _index_steps(self, test): steps = [] for attr in dir(test): obj = getattr(test, attr) + if isinstance(obj, CustomStepDecorator): + steps.append(obj._my_step) if hasattr(obj, "is_step"): steps.append(obj) return list(sorted(steps, key=lambda x: x.prio)) diff --git a/test_config/config_simple.json b/test_config/config_simple.json new file mode 100644 index 00000000000..73d35ad496a --- /dev/null +++ b/test_config/config_simple.json @@ -0,0 +1 @@ +{"some": {"value": 5}} diff --git a/test_config/config_simple.py b/test_config/config_simple.py new file mode 100644 index 00000000000..e125836d551 --- /dev/null +++ b/test_config/config_simple.py @@ -0,0 +1,61 @@ +import os + +from metaflow import Config, FlowSpec, config_expr, environment, project, step + + +def audit(run, parameters, stdout_path): + # We should only have one run here + if len(run) != 1: + raise RuntimeError("Expected only one run; got %d" % len(run)) + run = run[0] + + # Check successful run + if not run.successful: + raise RuntimeError("Run was not successful") + # Check that we have the proper project name + if "project:config_project" not in run.tags: + raise RuntimeError("Project name is incorrect.") + + # Check the value of the artifacts in the end step + end_task = run["end"].task + if ( + end_task.config_val != 5 + or end_task.config_val_2 != "41" + or end_task.config_from_env != "5" + or end_task.config_from_env_2 != "41" + ): + raise RuntimeError("Config values are incorrect.") + + return None + + +@project(name=config_expr("cfg_default_value.a.project_name")) +class ConfigSimple(FlowSpec): + + cfg = Config("cfg", default="config_simple.json") + cfg_default_value = Config( + "cfg_default_value", + default_value={"a": {"b": "41", "project_name": "config_project"}}, + ) + + @environment( + vars={ + "TSTVAL": config_expr("str(cfg.some.value)"), + "TSTVAL2": cfg_default_value.a.b, + } + ) + @step + def start(self): + self.config_from_env = os.environ.get("TSTVAL") + self.config_from_env_2 = os.environ.get("TSTVAL2") + self.config_val = self.cfg.some.value + self.config_val_2 = self.cfg_default_value.a.b + self.next(self.end) + + @step + def end(self): + pass + + +if __name__ == "__main__": + ConfigSimple() diff --git a/test_config/mutable_flow.py b/test_config/mutable_flow.py new file mode 100644 index 00000000000..ef22f06c64b --- /dev/null +++ 
b/test_config/mutable_flow.py @@ -0,0 +1,165 @@ +import os + +from metaflow import ( + Config, + CustomFlowDecorator, + CustomStepDecorator, + FlowSpec, + Parameter, + config_expr, + environment, + project, + step, +) + + +def audit(run, parameters, stdout_path): + # We should only have one run here + if len(run) != 1: + raise RuntimeError("Expected only one run; got %d" % len(run)) + run = run[0] + + # Check successful run + if not run.successful: + raise RuntimeError("Run was not successful") + # Check that we have the proper project name + if "project:config_project" not in run.tags: + raise RuntimeError("Project name is incorrect.") + + return None + + +class ModifyFlow(CustomFlowDecorator): + def evaluate(self, mutable_flow): + steps = ["start", "end"] + count = 0 + for name, s in mutable_flow.steps: + assert name in steps, "Unexpected step name" + steps.remove(name) + count += 1 + assert count == 2, "Unexpected number of steps" + + count = 0 + parameters = [] + for name, c in mutable_flow.configs: + assert name == "config", "Unexpected config name" + parameters = c["parameters"] + count += 1 + assert count == 1, "Unexpected number of configs" + + count = 0 + for name, p in mutable_flow.parameters: + assert name == parameters[count]["name"], "Unexpected parameter name" + count += 1 + + # Do some actual modification, we are going to update an environment decorator. + # Note that in this flow, we have an environment decorator which is then + to_add = mutable_flow.config["flow_add_environment"]["vars"] + for name, s in mutable_flow.steps: + if name == "start": + decos = [deco for deco in s.decorators] + assert len(decos) == 1, "Unexpected number of decorators" + assert decos[0].name == "environment", "Unexpected decorator" + for k, v in to_add.items(): + decos[0].attributes["vars"][k] = v + else: + s.add_decorator( + environment, **mutable_flow.config["flow_add_environment"].to_dict() + ) + + +class ModifyFlowWithArgs(CustomFlowDecorator): + def init(self, *args, **kwargs): + self._field_to_check = args[0] + + def evaluate(self, mutable_flow): + parameters = mutable_flow.config.get(self._field_to_check, []) + for param in parameters: + mutable_flow.add_parameter( + param["name"], + Parameter(param["name"], type=str, default=param["default"]), + overwrite=True, + ) + + +class ModifyStep(CustomStepDecorator): + def evaluate(self, mutable_step): + mutable_step.remove_decorator("environment") + + for deco in mutable_step.decorators: + assert deco.name != "environment", "Unexpected decorator" + + mutable_step.add_decorator( + environment, **mutable_step.flow.config["step_add_environment"].to_dict() + ) + + +class ModifyStep2(CustomStepDecorator): + def evaluate(self, mutable_step): + to_add = mutable_step.flow.config["step_add_environment_2"]["vars"] + for deco in mutable_step.decorators: + if deco.name == "environment": + for k, v in to_add.items(): + deco.attributes["vars"][k] = v + + +@ModifyFlow +@ModifyFlowWithArgs("parameters") +@project(name=config_expr("config.project_name")) +class ConfigMutableFlow(FlowSpec): + + config = Config( + "config", + default_value={ + "parameters": [ + {"name": "param1", "default": "41"}, + {"name": "param2", "default": "42"}, + ], + "step_add_environment": {"vars": {"STEP_LEVEL": "2"}}, + "step_add_environment_2": {"vars": {"STEP_LEVEL_2": "3"}}, + "flow_add_environment": {"vars": {"FLOW_LEVEL": "4"}}, + "project_name": "config_project", + }, + ) + + def _check(self, step_decorators): + assert self.param1 == "41", "param1 does not match expected value" + 
assert self.param2 == "42", "param2 does not match expected value" + assert ( + os.environ.get("SHOULD_NOT_EXIST") is None + ), "Unexpected environment variable" + assert ( + os.environ.get("FLOW_LEVEL") == "4" + ), "Flow level environment variable not set" + if step_decorators: + assert os.environ.get("STEP_LEVEL") == "2", "Missing step_level decorator" + assert ( + os.environ.get("STEP_LEVEL_2") == "3" + ), "Missing step_level_2 decorator" + else: + assert ( + os.environ.get("STEP_LEVEL") is None + ), "Step level environment variable set" + assert ( + os.environ.get("STEP_LEVEL_2") is None + ), "Step level 2 environment variable set" + + @ModifyStep2 + @ModifyStep + @environment(vars={"SHOULD_NOT_EXIST": "1"}) + @step + def start(self): + print("Starting start step...") + self._check(step_decorators=True) + print("All checks are good.") + self.next(self.end) + + @step + def end(self): + print("Starting end step...") + self._check(step_decorators=False) + print("All checks are good.") + + +if __name__ == "__main__": + ConfigMutableFlow() From 41e42fb3363867490dee328f23f5c6ab648c332b Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Mon, 25 Nov 2024 02:22:43 -0800 Subject: [PATCH 24/30] Call param init even when no config present --- metaflow/flowspec.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 2db958c2ff8..d331f7d25aa 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -190,6 +190,11 @@ def _process_config_decorators(self, config_options): # Fast path for no user configurations if not self._flow_state.get(_FlowState.CONFIG_DECORATORS): + # Process parameters to allow them to also use config values easily + for var, param in self._get_parameters(): + if param.IS_CONFIG_PARAMETER: + continue + param.init() return self debug.userconf_exec("Processing mutating step/flow decorators") From fe5300b029ff97ebf3e01dbb8d9700c874d9ff2a Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Mon, 25 Nov 2024 12:20:04 -0800 Subject: [PATCH 25/30] Reset cached parameters properly --- metaflow/flowspec.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index d331f7d25aa..e73f8613138 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -274,6 +274,10 @@ def _process_config_decorators(self, config_options): for var, param in to_reset_configs: setattr(current_cls, var, param) + # Reset cached parameters again since we added back the config parameters + if _FlowState.CACHED_PARAMETERS in current_cls._flow_state: + del current_cls._flow_state[_FlowState.CACHED_PARAMETERS] + # Set the current flow class we are in (the one we just created) parameters.replace_flow_context(current_cls) return current_cls(use_cli=False) From 05e5700d71f11a4762be580d6d767140b641d7a7 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Mon, 25 Nov 2024 12:21:26 -0800 Subject: [PATCH 26/30] Updated tests --- test_config/config_simple.py | 23 ++++++--- test_config/mutable_flow.py | 97 ++++++++++++++++++++++++++++-------- 2 files changed, 93 insertions(+), 27 deletions(-) diff --git a/test_config/config_simple.py b/test_config/config_simple.py index e125836d551..067eb5ab44c 100644 --- a/test_config/config_simple.py +++ b/test_config/config_simple.py @@ -1,9 +1,12 @@ +import json import os from metaflow import Config, FlowSpec, config_expr, environment, project, step +default_config = {"a": {"b": "41", "project_name": "config_project"}} -def audit(run, parameters, stdout_path): + +def audit(run, parameters, 
configs, stdout_path): # We should only have one run here if len(run) != 1: raise RuntimeError("Expected only one run; got %d" % len(run)) @@ -12,17 +15,23 @@ def audit(run, parameters, stdout_path): # Check successful run if not run.successful: raise RuntimeError("Run was not successful") + + if configs and configs.get("cfg_default_value"): + config = json.loads(configs["cfg_default_value"]) + else: + config = default_config + # Check that we have the proper project name - if "project:config_project" not in run.tags: + if f"project:{config['a']['project_name']}" not in run.tags: raise RuntimeError("Project name is incorrect.") # Check the value of the artifacts in the end step end_task = run["end"].task if ( - end_task.config_val != 5 - or end_task.config_val_2 != "41" - or end_task.config_from_env != "5" - or end_task.config_from_env_2 != "41" + end_task.data.config_val != 5 + or end_task.data.config_val_2 != config["a"]["b"] + or end_task.data.config_from_env != "5" + or end_task.data.config_from_env_2 != config["a"]["b"] ): raise RuntimeError("Config values are incorrect.") @@ -35,7 +44,7 @@ class ConfigSimple(FlowSpec): cfg = Config("cfg", default="config_simple.json") cfg_default_value = Config( "cfg_default_value", - default_value={"a": {"b": "41", "project_name": "config_project"}}, + default_value=default_config, ) @environment( diff --git a/test_config/mutable_flow.py b/test_config/mutable_flow.py index ef22f06c64b..0ba5d7aefcd 100644 --- a/test_config/mutable_flow.py +++ b/test_config/mutable_flow.py @@ -1,3 +1,4 @@ +import json import os from metaflow import ( @@ -12,8 +13,30 @@ step, ) +default_config = { + "parameters": [ + {"name": "param1", "default": "41"}, + {"name": "param2", "default": "42"}, + ], + "step_add_environment": {"vars": {"STEP_LEVEL": "2"}}, + "step_add_environment_2": {"vars": {"STEP_LEVEL_2": "3"}}, + "flow_add_environment": {"vars": {"FLOW_LEVEL": "4"}}, + "project_name": "config_project", +} + + +def find_param_in_parameters(parameters, name): + for param in parameters: + splits = param.split(" ") + try: + idx = splits.index("--" + name) + return splits[idx + 1] + except ValueError: + continue + return None + -def audit(run, parameters, stdout_path): +def audit(run, parameters, configs, stdout_path): # We should only have one run here if len(run) != 1: raise RuntimeError("Expected only one run; got %d" % len(run)) @@ -22,10 +45,44 @@ def audit(run, parameters, stdout_path): # Check successful run if not run.successful: raise RuntimeError("Run was not successful") + + if configs: + # We should have one config called "config" + if len(configs) != 1 or not configs.get("config"): + raise RuntimeError("Expected one config called 'config'") + config = json.loads(configs["config"]) + else: + config = default_config + # Check that we have the proper project name - if "project:config_project" not in run.tags: + if f"project:{config['project_name']}" not in run.tags: raise RuntimeError("Project name is incorrect.") + # Check the start step that all values are properly set. 
We don't need + # to check end step as it would be a duplicate + start_task_data = run["start"].task.data + for param in config["parameters"]: + value = find_param_in_parameters(parameters, param["name"]) or param["default"] + if not hasattr(start_task_data, param["name"]): + raise RuntimeError(f"Missing parameter {param['name']}") + if getattr(start_task_data, param["name"]) != value: + raise RuntimeError( + f"Parameter {param['name']} has incorrect value %s versus %s expected" + % (getattr(start_task_data, param["name"]), value) + ) + assert ( + start_task_data.flow_level + == config["flow_add_environment"]["vars"]["FLOW_LEVEL"] + ) + assert ( + start_task_data.step_level + == config["step_add_environment"]["vars"]["STEP_LEVEL"] + ) + assert ( + start_task_data.step_level_2 + == config["step_add_environment_2"]["vars"]["STEP_LEVEL_2"] + ) + return None @@ -108,34 +165,34 @@ def evaluate(self, mutable_step): @project(name=config_expr("config.project_name")) class ConfigMutableFlow(FlowSpec): - config = Config( - "config", - default_value={ - "parameters": [ - {"name": "param1", "default": "41"}, - {"name": "param2", "default": "42"}, - ], - "step_add_environment": {"vars": {"STEP_LEVEL": "2"}}, - "step_add_environment_2": {"vars": {"STEP_LEVEL_2": "3"}}, - "flow_add_environment": {"vars": {"FLOW_LEVEL": "4"}}, - "project_name": "config_project", - }, - ) + config = Config("config", default_value=default_config) def _check(self, step_decorators): - assert self.param1 == "41", "param1 does not match expected value" - assert self.param2 == "42", "param2 does not match expected value" + for p in self.config.parameters: + assert hasattr(self, p["name"]), "Missing parameter" + assert ( os.environ.get("SHOULD_NOT_EXIST") is None ), "Unexpected environment variable" + assert ( - os.environ.get("FLOW_LEVEL") == "4" + os.environ.get("FLOW_LEVEL") + == self.config.flow_add_environment["vars"]["FLOW_LEVEL"] ), "Flow level environment variable not set" + self.flow_level = os.environ.get("FLOW_LEVEL") + if step_decorators: - assert os.environ.get("STEP_LEVEL") == "2", "Missing step_level decorator" assert ( - os.environ.get("STEP_LEVEL_2") == "3" + os.environ.get("STEP_LEVEL") + == self.config.step_add_environment.vars.STEP_LEVEL + ), "Missing step_level decorator" + assert ( + os.environ.get("STEP_LEVEL_2") + == self.config["step_add_environment_2"]["vars"].STEP_LEVEL_2 ), "Missing step_level_2 decorator" + + self.step_level = os.environ.get("STEP_LEVEL") + self.step_level_2 = os.environ.get("STEP_LEVEL_2") else: assert ( os.environ.get("STEP_LEVEL") is None From e1162fed070018e3860a93c035192dea7f1f4201 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 26 Nov 2024 00:09:21 -0800 Subject: [PATCH 27/30] More tests and a few tweaks --- metaflow/cli.py | 3 +- metaflow/cli_components/utils.py | 3 + test_config/config_parser.py | 103 +++++++++++++++++ test_config/config_parser_requirements.txt | 2 + test_config/config_simple.py | 30 ++++- test_config/mutable_flow.py | 30 ++++- test_config/test.py | 122 +++++++++++++++++++++ 7 files changed, 290 insertions(+), 3 deletions(-) create mode 100644 test_config/config_parser.py create mode 100644 test_config/config_parser_requirements.txt create mode 100644 test_config/test.py diff --git a/metaflow/cli.py b/metaflow/cli.py index 8d355f8465a..de875033768 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -449,7 +449,7 @@ def start( }, ) - if ctx.protected_args and ctx.protected_args[0] not in ("run", "resume"): + if ctx.saved_args and ctx.saved_args[0] not 
in ("run", "resume"): # run/resume are special cases because they can add more decorators with --with, # so they have to take care of themselves. all_decospecs = ctx.obj.tl_decospecs + list( @@ -471,6 +471,7 @@ def start( # TODO (savin): Enable lazy instantiation of package ctx.obj.package = None + if ctx.invoked_subcommand is None: ctx.invoke(check) diff --git a/metaflow/cli_components/utils.py b/metaflow/cli_components/utils.py index d56eab486cc..88c0684c1f4 100644 --- a/metaflow/cli_components/utils.py +++ b/metaflow/cli_components/utils.py @@ -50,6 +50,9 @@ def _process_result(value): args = ctx.protected_args + ctx.args ctx.args = [] ctx.protected_args = [] + # CHANGE HERE: Add saved_args so we have access to it in the command to be + # able to infer what we are calling next + ctx.saved_args = args # If we're not in chain mode, we only allow the invocation of a # single command but we also inform the current context about the diff --git a/test_config/config_parser.py b/test_config/config_parser.py new file mode 100644 index 00000000000..f081e48d7b7 --- /dev/null +++ b/test_config/config_parser.py @@ -0,0 +1,103 @@ +import json +import os + +from metaflow import ( + Config, + FlowSpec, + Parameter, + config_expr, + current, + environment, + project, + pypi_base, + req_parser, + step, +) + +default_config = {"project_name": "config_parser"} + + +def audit(run, parameters, configs, stdout_path): + # We should only have one run here + if len(run) != 1: + raise RuntimeError("Expected only one run; got %d" % len(run)) + run = run[0] + + # Check successful run + if not run.successful: + raise RuntimeError("Run was not successful") + + if len(parameters) > 1: + expected_tokens = parameters[-1].split() + if len(expected_tokens) < 8: + raise RuntimeError("Unexpected parameter list: %s" % str(expected_tokens)) + expected_token = expected_tokens[7] + else: + expected_token = "" + + # Check that we have the proper project name + if f"project:{default_config['project_name']}" not in run.tags: + raise RuntimeError("Project name is incorrect.") + + # Check the value of the artifacts in the end step + end_task = run["end"].task + assert end_task.data.trigger_param == expected_token + + if end_task.data.lib_version != "2.5.148": + raise RuntimeError("Library version is incorrect.") + + # Check we properly parsed the requirements file + if len(end_task.data.req_config) != 2: + raise RuntimeError( + "Requirements file is incorrect -- expected 2 keys, saw %s" + % str(end_task.data.req_config) + ) + if end_task.data.req_config["python"] != "3.10.*": + raise RuntimeError( + "Requirements file is incorrect -- got python version %s" + % end_task.data.req_config["python"] + ) + + if end_task.data.req_config["packages"] != {"regex": "2024.11.6"}: + raise RuntimeError( + "Requirements file is incorrect -- got packages %s" + % end_task.data.req_config["packages"] + ) + + return None + + +def trigger_name_func(ctx): + return [current.project_flow_name + "Trigger"] + + +@project(name=config_expr("cfg.project_name")) +@pypi_base(**config_expr("req_config")) +class ConfigParser(FlowSpec): + + trigger_param = Parameter( + "trigger_param", + default="", + external_trigger=True, + external_artifact=trigger_name_func, + ) + cfg = Config("cfg", default_value=default_config) + + req_config = Config( + "req_config", default="config_parser_requirements.txt", parser=req_parser + ) + + @step + def start(self): + import regex + + self.lib_version = regex.__version__ # Should be '2.5.148' + self.next(self.end) + + @step + def 
end(self): + pass + + +if __name__ == "__main__": + ConfigParser() diff --git a/test_config/config_parser_requirements.txt b/test_config/config_parser_requirements.txt new file mode 100644 index 00000000000..b692401b2f4 --- /dev/null +++ b/test_config/config_parser_requirements.txt @@ -0,0 +1,2 @@ +python==3.10.* +regex==2024.11.6 diff --git a/test_config/config_simple.py b/test_config/config_simple.py index 067eb5ab44c..d4637e7c679 100644 --- a/test_config/config_simple.py +++ b/test_config/config_simple.py @@ -1,7 +1,16 @@ import json import os -from metaflow import Config, FlowSpec, config_expr, environment, project, step +from metaflow import ( + Config, + FlowSpec, + Parameter, + config_expr, + current, + environment, + project, + step, +) default_config = {"a": {"b": "41", "project_name": "config_project"}} @@ -21,12 +30,21 @@ def audit(run, parameters, configs, stdout_path): else: config = default_config + if len(parameters) > 1: + expected_tokens = parameters[-1].split() + if len(expected_tokens) < 8: + raise RuntimeError("Unexpected parameter list: %s" % str(expected_tokens)) + expected_token = expected_tokens[7] + else: + expected_token = "" + # Check that we have the proper project name if f"project:{config['a']['project_name']}" not in run.tags: raise RuntimeError("Project name is incorrect.") # Check the value of the artifacts in the end step end_task = run["end"].task + assert end_task.data.trigger_param == expected_token if ( end_task.data.config_val != 5 or end_task.data.config_val_2 != config["a"]["b"] @@ -38,9 +56,19 @@ def audit(run, parameters, configs, stdout_path): return None +def trigger_name_func(ctx): + return [current.project_flow_name + "Trigger"] + + @project(name=config_expr("cfg_default_value.a.project_name")) class ConfigSimple(FlowSpec): + trigger_param = Parameter( + "trigger_param", + default="", + external_trigger=True, + external_artifact=trigger_name_func, + ) cfg = Config("cfg", default="config_simple.json") cfg_default_value = Config( "cfg_default_value", diff --git a/test_config/mutable_flow.py b/test_config/mutable_flow.py index 0ba5d7aefcd..05ccd2b21bb 100644 --- a/test_config/mutable_flow.py +++ b/test_config/mutable_flow.py @@ -8,6 +8,7 @@ FlowSpec, Parameter, config_expr, + current, environment, project, step, @@ -54,6 +55,14 @@ def audit(run, parameters, configs, stdout_path): else: config = default_config + if len(parameters) > 1: + expected_tokens = parameters[-1].split() + if len(expected_tokens) < 8: + raise RuntimeError("Unexpected parameter list: %s" % str(expected_tokens)) + expected_token = expected_tokens[7] + else: + expected_token = "" + # Check that we have the proper project name if f"project:{config['project_name']}" not in run.tags: raise RuntimeError("Project name is incorrect.") @@ -61,6 +70,8 @@ def audit(run, parameters, configs, stdout_path): # Check the start step that all values are properly set. 
We don't need # to check end step as it would be a duplicate start_task_data = run["start"].task.data + + assert start_task_data.trigger_param == expected_token for param in config["parameters"]: value = find_param_in_parameters(parameters, param["name"]) or param["default"] if not hasattr(start_task_data, param["name"]): @@ -106,6 +117,8 @@ def evaluate(self, mutable_flow): count = 0 for name, p in mutable_flow.parameters: + if name == "trigger_param": + continue assert name == parameters[count]["name"], "Unexpected parameter name" count += 1 @@ -134,7 +147,12 @@ def evaluate(self, mutable_flow): for param in parameters: mutable_flow.add_parameter( param["name"], - Parameter(param["name"], type=str, default=param["default"]), + Parameter( + param["name"], + type=str, + default=param["default"], + external_artifact=trigger_name_func, + ), overwrite=True, ) @@ -160,11 +178,21 @@ def evaluate(self, mutable_step): deco.attributes["vars"][k] = v +def trigger_name_func(ctx): + return [current.project_flow_name + "Trigger"] + + @ModifyFlow @ModifyFlowWithArgs("parameters") @project(name=config_expr("config.project_name")) class ConfigMutableFlow(FlowSpec): + trigger_param = Parameter( + "trigger_param", + default="", + external_trigger=True, + external_artifact=trigger_name_func, + ) config = Config("config", default_value=default_config) def _check(self, step_decorators): diff --git a/test_config/test.py b/test_config/test.py new file mode 100644 index 00000000000..64b63a43a0f --- /dev/null +++ b/test_config/test.py @@ -0,0 +1,122 @@ +import json +import os +import uuid + +from typing import Any, Dict, List, Optional + +maestro_rand = str(uuid.uuid4())[:8] +scheduler_cluster = os.environ.get("NETFLIX_ENVIRONMENT", "sandbox") +# Use sandbox for tests +if scheduler_cluster == "prod": + scheduler_cluster = "sandbox" + + +# Generates tests for regular, titus and maestro invocations +def all_three_options( + id_base: str, + flow: str, + config_values: Optional[List[Dict[str, Any]]] = None, + configs: Optional[List[Dict[str, str]]] = None, + addl_params: Optional[List[str]] = None, +): + result = [] + if config_values is None: + config_values = [{}] + if configs is None: + configs = [{}] + if addl_params is None: + addl_params = [] + + if len(config_values) < len(configs): + config_values.extend([{}] * (len(configs) - len(config_values))) + if len(configs) < len(config_values): + configs.extend([{}] * (len(config_values) - len(configs))) + if len(addl_params) < len(config_values): + addl_params.extend([""] * (len(config_values) - len(addl_params))) + + for idx, (config_value, config) in enumerate(zip(config_values, configs)): + # Regular run + result.append( + { + "id": f"{id_base}_{idx}", + "flow": flow, + "config_values": config_value, + "configs": config, + "params": "run " + addl_params[idx], + } + ) + + # Titus run + result.append( + { + "id": f"{id_base}_titus_{idx}", + "flow": flow, + "config_values": config_value, + "configs": config, + "params": "run --with titus " + addl_params[idx], + } + ) + + # Maestro run + result.append( + { + "id": f"{id_base}_maestro_{idx}", + "flow": flow, + "config_values": config_value, + "configs": config, + "params": [ + # Create the flow + f"--branch {maestro_rand}_{id_base}_maestro_{idx} maestro " + f"--cluster {scheduler_cluster} create", + # Trigger the run + f"--branch {maestro_rand}_{id_base}_maestro_{idx} maestro " + f"--cluster {scheduler_cluster} trigger --trigger_param " + f"{maestro_rand} --force " + addl_params[idx], + ], + "user_environment": 
{"METAFLOW_SETUP_GANDALF_POLICY": "0"}, + } + ) + return result + + +TESTS = [ + *all_three_options( + "config_simple", + "config_simple.py", + [ + {}, + { + "cfg_default_value": json.dumps( + {"a": {"project_name": "config_project_2", "b": "56"}} + ) + }, + ], + ), + *all_three_options( + "mutable_flow", + "mutable_flow.py", + [ + {}, + { + "config": json.dumps( + { + "parameters": [ + {"name": "param3", "default": "43"}, + {"name": "param4", "default": "44"}, + ], + "step_add_environment": {"vars": {"STEP_LEVEL": "5"}}, + "step_add_environment_2": {"vars": {"STEP_LEVEL_2": "6"}}, + "flow_add_environment": {"vars": {"FLOW_LEVEL": "7"}}, + "project_name": "config_project_2", + } + ) + }, + ], + addl_params=["", "--param3 45"], + ), + *all_three_options( + "config_parser_flow", + "config_parser.py", + [{}], + ), +] From 1539cb49ca0db175464cd22af72826fdc0306fb0 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Mon, 2 Dec 2024 00:09:49 -0800 Subject: [PATCH 28/30] Fix test --- metaflow/includefile.py | 7 +- metaflow/runner/click_api.py | 42 +++++++-- metaflow/user_configs/config_options.py | 104 ++++++++++++--------- test/core/metaflow_test/formatter.py | 1 + test/core/run_tests.py | 4 + test/core/tests/basic_config_parameters.py | 7 +- 6 files changed, 112 insertions(+), 53 deletions(-) diff --git a/metaflow/includefile.py b/metaflow/includefile.py index 3c5863485af..e5b136f2ff7 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -277,11 +277,16 @@ def __init__( self._includefile_overrides["is_text"] = is_text if encoding is not None: self._includefile_overrides["encoding"] = encoding + # NOTA: Right now, there is an issue where these can't be overridden by config + # in all circumstances. Ignoring for now. super(IncludeFile, self).__init__( name, required=required, help=help, - type=FilePathClass(is_text, encoding), + type=FilePathClass( + self._includefile_overrides.get("is_text", True), + self._includefile_overrides.get("encoding", "utf-8"), + ), **kwargs, ) diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 49f5d1902fb..88e276bcfea 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -128,11 +128,20 @@ def _method_sanity_check( def _lazy_load_command( - cli_collection: click.Group, flow_parameters: List[Parameter], _self, name: str + cli_collection: click.Group, + flow_parameters: Union[str, List[Parameter]], + _self, + name: str, ): # Context is not used in get_command so we can pass None. Since we pin click, # this won't change from under us. 
+ + if isinstance(flow_parameters, str): + # Resolve flow_parameters -- for start, this is a function which we + # need to call to figure out the actual parameters (may be changed by configs) + flow_parameters = getattr(_self, flow_parameters)() + cmd_obj = cli_collection.get_command(None, name) if cmd_obj: if isinstance(cmd_obj, click.Group): @@ -205,9 +214,11 @@ def extract_flow_class_from_file(flow_file: str) -> FlowSpec: class MetaflowAPI(object): - def __init__(self, parent=None, **kwargs): + def __init__(self, parent=None, flow_cls=None, **kwargs): self._parent = parent self._chain = [{self._API_NAME: kwargs}] + self._flow_cls = flow_cls + self._cached_computed_parameters = None @property def parent(self): @@ -226,9 +237,7 @@ def name(self): @classmethod def from_cli(cls, flow_file: str, cli_collection: Callable) -> Callable: flow_cls = extract_flow_class_from_file(flow_file) - flow_parameters = [ - p for _, p in flow_cls._get_parameters() if not p.IS_CONFIG_PARAMETER - ] + with flow_context(flow_cls) as _: add_decorator_options(cli_collection) @@ -240,7 +249,7 @@ def getattr_wrapper(_self, name): "__module__": "metaflow", "_API_NAME": flow_file, "_internal_getattr": functools.partial( - _lazy_load_command, cli_collection, flow_parameters + _lazy_load_command, cli_collection, "_compute_flow_parameters" ), "__getattr__": getattr_wrapper, } @@ -264,7 +273,7 @@ def _method(_self, **kwargs): defaults, **kwargs, ) - return to_return(parent=None, **method_params) + return to_return(parent=None, flow_cls=flow_cls, **method_params) m = _method m.__name__ = cli_collection.name @@ -314,6 +323,25 @@ def execute(self) -> List[str]: return components + def _compute_flow_parameters(self): + if self._flow_cls is None or self._parent is not None: + raise RuntimeError( + "Computing flow-level parameters for a non start API. " + "Please report to the Metaflow team." + ) + # TODO: We need to actually compute the new parameters (based on configs) which + # would involve processing the options at least partially. 
We will do this + # before GA but for now making it work for regular parameters + if self._cached_computed_parameters is not None: + return self._cached_computed_parameters + self._cached_computed_parameters = [] + for _, param in self._flow_cls._get_parameters(): + if param.IS_CONFIG_PARAMETER: + continue + param.init() + self._cached_computed_parameters.append(param) + return self._cached_computed_parameters + def extract_all_params(cmd_obj: Union[click.Command, click.Group]): arg_params_sigs = OrderedDict() diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py index 380ea7555f4..c15e4173a04 100644 --- a/metaflow/user_configs/config_options.py +++ b/metaflow/user_configs/config_options.py @@ -73,6 +73,7 @@ def convert(self, value, param, ctx): return value if value.startswith(_DEFAULT_PREFIX): is_default = True + value = value[len(_DEFAULT_PREFIX) :] return self.convert_value(value, is_default) @@ -105,7 +106,8 @@ class MultipleTuple(click.Tuple): # by whitespace which is totally not what we want # You can now pass multiple configuration options through an environment variable # using something like: - # METAFLOW_FLOW_CONFIG='{"config1": "filenameforconfig1.json", "config2": {"key1": "value1"}}' + # METAFLOW_FLOW_CONFIG_VALUE='{"config1": {"key0": "value0"}, "config2": {"key1": "value1"}}' + # or METAFLOW_FLOW_CONFIG='{"config1": "file1", "config2": "file2"}' def split_envvar_value(self, rv): loaded = json.loads(rv) @@ -225,53 +227,71 @@ def process_configs(self, ctx, param, value): return None # The second go around, we process all the values and merge them. - # Check that the user didn't provide *both* a path and a value. - common_keys = set(self._value_values or []).intersection( - [k for k, v in self._path_values.items()] or [] + + # If we are processing options that start with kv., we know we are in a subprocess + # and ignore other stuff. In particular, environment variables used to pass + # down configurations (like METAFLOW_FLOW_CONFIG) could still be present and + # would cause an issue -- we can ignore those as the kv. values should trump + # everything else. + all_keys = set(self._value_values).union(self._path_values) + # Make sure we have at least some keys (ie: some non default values) + has_all_kv = all_keys and all( + self._value_values.get(k, "").startswith(_CONVERT_PREFIX + "kv.") + for k in all_keys ) - if common_keys: - raise click.UsageError( - "Cannot provide both a value and a file for the same configuration. " - "Found such values for '%s'" % "', '".join(common_keys) - ) - all_values = dict(self._path_values or {}) - all_values.update(self._value_values or {}) + flow_cls._flow_state[_FlowState.CONFIGS] = {} + to_return = {} - debug.userconf_exec("All config values: %s" % str(all_values)) + if not has_all_kv: + # Check that the user didn't provide *both* a path and a value. + common_keys = set(self._value_values or []).intersection( + [k for k, v in self._path_values.items()] or [] + ) + if common_keys: + raise click.UsageError( + "Cannot provide both a value and a file for the same configuration. 
" + "Found such values for '%s'" % "', '".join(common_keys) + ) - flow_cls._flow_state[_FlowState.CONFIGS] = {} + all_values = dict(self._path_values or {}) + all_values.update(self._value_values or {}) - to_return = {} - merged_configs = {} - for name, (val, is_path) in self._defaults.items(): - n = name.lower() - if n in all_values: - merged_configs[n] = all_values[n] - else: - if isinstance(val, DeployTimeField): - # This supports a default value that is a deploy-time field (similar - # to Parameter).) - # We will form our own context and pass it down -- note that you cannot - # use configs in the default value of configs as this introduces a bit - # of circularity. Note also that quiet and datastore are *eager* - # options so are available here. - param_ctx = ParameterContext( - flow_name=ctx.obj.flow.name, - user_name=get_username(), - parameter_name=n, - logger=echo_dev_null if ctx.params["quiet"] else echo_always, - ds_type=ctx.params["datastore"], - configs=None, - ) - val = val.fun(param_ctx) - if is_path: - # This is a file path - merged_configs[n] = ConvertPath.convert_value(val, False) - else: - # This is a value - merged_configs[n] = ConvertDictOrStr.convert_value(val, False) + debug.userconf_exec("All config values: %s" % str(all_values)) + merged_configs = {} + for name, (val, is_path) in self._defaults.items(): + n = name.lower() + if n in all_values: + merged_configs[n] = all_values[n] + else: + if isinstance(val, DeployTimeField): + # This supports a default value that is a deploy-time field (similar + # to Parameter).) + # We will form our own context and pass it down -- note that you cannot + # use configs in the default value of configs as this introduces a bit + # of circularity. Note also that quiet and datastore are *eager* + # options so are available here. 
+ param_ctx = ParameterContext( + flow_name=ctx.obj.flow.name, + user_name=get_username(), + parameter_name=n, + logger=( + echo_dev_null if ctx.params["quiet"] else echo_always + ), + ds_type=ctx.params["datastore"], + configs=None, + ) + val = val.fun(param_ctx) + if is_path: + # This is a file path + merged_configs[n] = ConvertPath.convert_value(val, False) + else: + # This is a value + merged_configs[n] = ConvertDictOrStr.convert_value(val, False) + else: + debug.userconf_exec("Fast path due to pre-processed values") + merged_configs = self._value_values debug.userconf_exec("Configs merged with defaults: %s" % str(merged_configs)) missing_configs = set() diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index 1ba08d5a45f..b366f381ea2 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -18,6 +18,7 @@ def __init__(self, graphspec, test): self.steps = self._index_steps(test) self.flow_code = self._pretty_print(self._flow_lines()) self.check_code = self._pretty_print(self._check_lines()) + self.copy_files = getattr(test, "REQUIRED_FILES", []) self.valid = True for step in self.steps: diff --git a/test/core/run_tests.py b/test/core/run_tests.py index 0cece96de17..35487f17616 100644 --- a/test/core/run_tests.py +++ b/test/core/run_tests.py @@ -172,6 +172,10 @@ def construct_arg_dicts_from_click_api(): os.path.join(cwd, "metaflow_test"), os.path.join(tempdir, "metaflow_test") ) + # Copy files required by the test + for file in formatter.copy_files: + shutil.copy2(os.path.join(cwd, "tests", file), os.path.join(tempdir, file)) + path = os.path.join(tempdir, "test_flow.py") original_env = os.environ.copy() diff --git a/test/core/tests/basic_config_parameters.py b/test/core/tests/basic_config_parameters.py index b8e91a0f259..64f953a3ab8 100644 --- a/test/core/tests/basic_config_parameters.py +++ b/test/core/tests/basic_config_parameters.py @@ -3,6 +3,7 @@ class BasicConfigTest(MetaflowTest): PRIORITY = 1 + REQUIRED_FILES = ["basic_config_silly.txt"] PARAMETERS = { "default_from_config": { "default": "config_expr('config2').default_param", @@ -17,7 +18,7 @@ class BasicConfigTest(MetaflowTest): "silly_config": { "required": True, "parser": "silly_parser", - "default": "silly.txt", + "default": "'silly.txt'", }, "config2": {}, # Test using a function to get the value @@ -32,7 +33,7 @@ class BasicConfigTest(MetaflowTest): # Test passing values directly on the command line os.environ['METAFLOW_FLOW_CONFIG_VALUE'] = json.dumps( { - "config2": {"default_param": 123} + "config2": {"default_param": 123}, "config_env": {"vars": {"var1": "value1", "var2": "value2"}} } ) @@ -46,7 +47,7 @@ class BasicConfigTest(MetaflowTest): def silly_parser(s): k, v = s.split(":") - return {k: v} + return {k: v.strip()} default_config = { "value": 42, From 02c4243c337715b004b875c38f8ebf88184d83d6 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Fri, 6 Dec 2024 02:17:47 -0800 Subject: [PATCH 29/30] Trigger tests --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 999ee14e3f1..f1a0b13ef23 100644 --- a/README.md +++ b/README.md @@ -67,3 +67,4 @@ There are several ways to get in touch with us: ## Contributing We welcome contributions to Metaflow. Please see our [contribution guide](https://docs.metaflow.org/introduction/contributing-to-metaflow) for more details. 
+ From 7a3a3beb7daf73bce418698a2a5c972726750c53 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Fri, 6 Dec 2024 14:24:26 -0800 Subject: [PATCH 30/30] Fix no command case; fix deployer --- metaflow/cli.py | 6 +++++- metaflow/runner/click_api.py | 28 +++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/metaflow/cli.py b/metaflow/cli.py index de875033768..1999c3c29f9 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -449,7 +449,11 @@ def start( }, ) - if ctx.saved_args and ctx.saved_args[0] not in ("run", "resume"): + if ( + hasattr(ctx, "saved_args") + and ctx.saved_args + and ctx.saved_args[0] not in ("run", "resume") + ): # run/resume are special cases because they can add more decorators with --with, # so they have to take care of themselves. all_decospecs = ctx.obj.tl_decospecs + list( diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 88e276bcfea..b89ddae6831 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -141,12 +141,11 @@ def _lazy_load_command( # Resolve flow_parameters -- for start, this is a function which we # need to call to figure out the actual parameters (may be changed by configs) flow_parameters = getattr(_self, flow_parameters)() - cmd_obj = cli_collection.get_command(None, name) if cmd_obj: if isinstance(cmd_obj, click.Group): # TODO: possibly check for fake groups with cmd_obj.name in ["cli", "main"] - result = extract_group(cmd_obj, flow_parameters) + result = functools.partial(extract_group(cmd_obj, flow_parameters), _self) elif isinstance(cmd_obj, click.Command): result = functools.partial(extract_command(cmd_obj, flow_parameters), _self) else: @@ -379,18 +378,17 @@ def extract_all_params(cmd_obj: Union[click.Command, click.Group]): def extract_group(cmd_obj: click.Group, flow_parameters: List[Parameter]) -> Callable: - def getattr_wrapper(_self, name): - # Functools.partial do not automatically bind self (no __get__) - return _self._internal_getattr(_self, name) - - class_dict = { - "__module__": "metaflow", - "_API_NAME": cmd_obj.name, - "_internal_getattr": functools.partial( - _lazy_load_command, cmd_obj, flow_parameters - ), - "__getattr__": getattr_wrapper, - } + class_dict = {"__module__": "metaflow", "_API_NAME": cmd_obj.name} + for _, sub_cmd_obj in cmd_obj.commands.items(): + if isinstance(sub_cmd_obj, click.Group): + # recursion + class_dict[sub_cmd_obj.name] = extract_group(sub_cmd_obj, flow_parameters) + elif isinstance(sub_cmd_obj, click.Command): + class_dict[sub_cmd_obj.name] = extract_command(sub_cmd_obj, flow_parameters) + else: + raise RuntimeError( + "Cannot handle %s of type %s" % (sub_cmd_obj.name, type(sub_cmd_obj)) + ) resulting_class = type(cmd_obj.name, (MetaflowAPI,), class_dict) resulting_class.__name__ = cmd_obj.name @@ -407,7 +405,7 @@ def _method(_self, **kwargs): method_params = _method_sanity_check( possible_arg_params, possible_opt_params, annotations, defaults, **kwargs ) - return resulting_class(parent=_self, **method_params) + return resulting_class(parent=_self, flow_cls=None, **method_params) m = _method m.__name__ = cmd_obj.name
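
Taken together, patches 28-30 leave the programmatic click API computing flow parameters lazily (through `_compute_flow_parameters`) and expanding sub-groups eagerly and recursively in `extract_group`. A hedged sketch of how a caller drives the result — the entry point (`start` from `metaflow.cli`) and the option names are assumptions drawn from how the Runner uses this API, not part of this diff:

    from metaflow.cli import start
    from metaflow.runner.click_api import MetaflowAPI

    # Build the API facade from a flow file plus the top-level click group.
    api = MetaflowAPI.from_cli("config_simple.py", start)

    # Calling the facade binds flow-level options; attribute access then
    # lazily loads the sub-command, and flow parameters are computed only at
    # that point (i.e. after any configs have been taken into account).
    command = api(quiet=True, datastore="local").run(max_workers=2)

    # execute() walks the parent chain and returns the argv-style tokens.
    print(command.execute())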