morphology-pipeline

#!/usr/bin/env python3

"""
morphology-pipeline is used to generate complex morphological plots for one or
multiple COLIBRE simulation runs. These plots require a relatively expensive
analysis of galaxies in the simulation on an individual galaxy basis and are
therefore more challenging to generate than the plots in the conventional
pipeline.

Usage:
  ./morphology-pipeline \
    -C/--config      <configuration directory> \
    -i/--input       <input folder that contains the snapshots and catalogues> \
    -s/--snapshots   <input swift snapshot containing particle properties> \
    -g/--groups      <input swift snapshot containing particle membership> \
    -c/--catalogues  <input catalogue> \
    -o/--output      <directory where output images and web pages are stored> \
   [-n/--run-names   <name for the run>] \
   [-j/--num-of-cpus <number of parallel processes to use>] \
   [-m/--metadata    <prefix for the meta-data output file>] \
   [-d/--debug] \
   [-l/--lazy]
(the arguments in square brackets are optional). See the README for more details.
"""

import argparse as ap
import multiprocessing as mp
import os

from typing import Dict, Union

# set up the command line argument parser
#
# The parser is created at module level (and hence also in every subprocess
# that imports this file), but it is only used to parse the command line in
# the main block at the bottom of the file.
#
# All file/directory arguments that accept multiple values use nargs="+", so
# that passing the flag without any value is reported by argparse itself
# rather than causing an IndexError further down the line.
parser = ap.ArgumentParser(
    prog="morphology-pipeline",
    description=(
        "Compute morphological quantities for a simulation"
        " and create corresponding plots and a web page."
    ),
    epilog=(
        "Example usage:\n"
        "  ./morphology-pipeline \\ \n"
        "    -C/--config      <configuration directory> \\ \n"
        "    -i/--input       <input folder that contains the snapshots and catalogues> \\ \n"
        "    -s/--snapshots   <input swift snapshot containing particle properties> \\ \n"
        "    -g/--groups      <input swift snapshot containing particle membership> \\ \n"
        "    -c/--catalogues  <input catalogue> \\ \n"
        "    -o/--output      <directory where output images and web pages are stored> \\ \n"
        "   [-n/--run-names   <name for the run>] \\ \n"
        "   [-j/--num-of-cpus <number of parallel processes to use>] \\ \n"
        "   [-m/--metadata    <prefix for the meta-data output file>] \\ \n"
        "   [-d/--debug] \\ \n"
        "   [-l/--lazy]"
    ),
    # we need this to preserve the line breaks in the epilog
    formatter_class=ap.RawDescriptionHelpFormatter,
)

# configuration directory
parser.add_argument(
    "-C",
    "--config",
    type=str,
    required=True,
    help=("Configuration directory, containing config.yml."),
)

# catalogue file(s), one per run
parser.add_argument(
    "-c",
    "--catalogues",
    type=str,
    required=True,
    help="Name of the SOAP HDF5 properties file(s). Required.",
    nargs="+",
)

# membership file(s), one per run
parser.add_argument(
    "-g",
    "--groups",
    type=str,
    required=True,
    help="Name of the SOAP HDF5 memberships file(s). Required.",
    nargs="+",
)

# snapshot file(s), one per run
parser.add_argument(
    "-s",
    "--snapshots",
    required=True,
    type=str,
    help="Name of the snapshot file(s). Required.",
    nargs="+",
)

# output directory
parser.add_argument(
    "-o",
    "--output",
    type=str,
    required=True,
    help="Output directory for figures. Required.",
)

# input directory/directories
# the default needs to be a list, so that args.input has the same type
# regardless of whether or not the argument was given on the command line
parser.add_argument(
    "-i",
    "--input",
    type=str,
    required=False,
    default=["."],
    help=(
        "Input directory where the snapshot(s) and properties file(s) are located. "
        "Default is the current working directory. If you are running for comparison "
        "purposes you will need to ensure that the metadata yaml files have been "
        "generated in these folders and have the same basename (--metadata) as is "
        "given here."
    ),
    nargs="+",
)

# debug mode?
parser.add_argument(
    "-d",
    "--debug",
    required=False,
    default=False,
    action="store_true",
    help="Run in debug mode if this flag is present. Default: no.",
)


# base name of the metadata file(s)
parser.add_argument(
    "-m",
    "--metadata",
    required=False,
    default="morphology_data",
    help=(
        "Base name of the written metadata file in the input directory. "
        "By default this is morphology_data, leading to morphology_data_XXXX.yml"
    ),
)

# custom run names
parser.add_argument(
    "-n",
    "--run-names",
    required=False,
    default=None,
    nargs="*",
    help=(
        "Overwrite the names given to each run? If not present, the default names "
        "from the snapshots are used, and in the case where there are multiple "
        "redshifts, we append the redshift."
    ),
)

# number of parallel processes
parser.add_argument(
    "-j",
    "--num-of-cpus",
    required=False,
    type=int,
    default=None,
    help=(
        "Number of CPUs to use for running scripts in parallel. If not specified, uses "
        "the maximum number of CPUs available in the system."
    ),
)

# lazy mode?
parser.add_argument(
    "-l",
    "--lazy",
    required=False,
    action="store_true",
    help=(
        "Run in lazy mode: do not recompute anything, but simply read the data"
        " from the existing metadata file (assuming it does exist)."
    ),
)


def init_child_process(
    env: Dict, temp_folder: str, stylesheet_path: Union[str, None]
) -> None:
    """
    Initialise a subprocess of the parallel pool.

    Matplotlib with LaTeX support is not safe to use in parallel.
    The reason is that Matplotlib saves some intermediary files in a temporary
    directory (MPLCONFIGDIR environment variable) in order to post-process
    figures that use LaTeX. All these temporary files end up in the same
    directory and apparently can have the same names when you run multiple
    Python processes that try to save a figure at the same time.
    The only way around this is to use a different temporary directory for each
    subprocess, which is what this function does: it is meant to be used as
    the 'initializer' of a multiprocessing.Pool and sets the MPLCONFIGDIR
    environment variable for this subprocess to a unique value.

    Parameters:
     - env: environment variables (name -> value, both strings) that should
       make up the environment of this process. The MPLCONFIGDIR entry is
       added to (or overwritten in) this dictionary.
     - temp_folder: directory in which the process specific Matplotlib
       directory is located.
     - stylesheet_path: path to the Matplotlib style sheet that should be
       used, or None to keep the default style.
    """
    # the PID of the current process makes the directory name unique
    env["MPLCONFIGDIR"] = f"{temp_folder}/temp_folder_pid_{mp.current_process().pid}"

    # Make the process environment identical to 'env'.
    # We deliberately update os.environ in place instead of rebinding it
    # (os.environ = env): os.environ is a special mapping that forwards every
    # change to the actual process environment, while a plain dictionary does
    # not. Rebinding it would hide the new variables from any program that is
    # launched by this process (and from code that kept a reference to the
    # original mapping).
    # We take a snapshot first, in case 'env' is os.environ itself.
    new_environment = dict(env)
    os.environ.clear()
    os.environ.update(new_environment)

    # only import Matplotlib now that MPLCONFIGDIR has been set
    import matplotlib

    # non-interactive backend
    matplotlib.use("Agg")
    import matplotlib.pyplot as pl

    if stylesheet_path is not None:
        pl.style.use(stylesheet_path)


if __name__ == "__main__":
    """
    Main entry point.

    Note that this block is not executed by subprocesses, while everything above
    that is not in a function is. All large memory allocations done below will
    only affect the main process.

    Overview of the steps performed below:
     1. parse the command line arguments and read the configuration
     2. (single run, non-lazy mode only) process all selected galaxies using a
        parallel pool and write the result to the metadata file
     3. read the metadata file(s) and create the global plots
     4. create the web page in the output directory
    The temporary directory used for the Matplotlib caches is removed at the
    end, also if one of the steps fails.
    """

    from velociraptor import load as load_catalogue
    from swiftsimio import load as load_snapshot

    from morpholopy.filtered_catalogue import FilteredCatalogue
    from morpholopy.galaxy_data import process_galaxy, AllGalaxyData

    from morpholopy.HI_size import plot_HI_size_mass
    from morpholopy.morphology import plot_morphology
    from morpholopy.morphology import plot_scaleheights
    from morpholopy.KS import plot_KS_relations

    from morpholopy.logging import MainLog

    from morpholopy.config import MorphologyConfig
    from swiftpipeline.html import WebpageCreator

    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as pl

    import unyt
    import os
    import tempfile
    import shutil

    # minimise the memory footprint of parallel processes by ensuring only
    # relevant data in memory are copied
    mp.set_start_method("forkserver")

    # parse the command line arguments
    args = parser.parse_args()

    # normalise the list of input directories:
    #  - a bare string (e.g. a string default value) is treated as a single
    #    directory rather than as a sequence of characters
    #  - a single directory is reused for all snapshots
    #  - fewer directories than snapshots is an error (zip() below would
    #    otherwise silently ignore some of the snapshots)
    input_dirs = [args.input] if isinstance(args.input, str) else list(args.input)
    if len(input_dirs) == 1 and len(args.snapshots) > 1:
        input_dirs = input_dirs * len(args.snapshots)
    if len(input_dirs) < len(args.snapshots):
        parser.error(
            f"got {len(args.snapshots)} snapshots but only {len(input_dirs)} input"
            " directories; provide one input directory per snapshot (or a single"
            " one that is used for all snapshots)"
        )

    # set up the logging
    # there should only be one MainLog; subprocesses should obtain their own
    # GalaxyLog from the MainLog using MainLog.get_galaxy_log().
    main_log = MainLog(log_level="WORKERDEBUG" if args.debug else "WORKER")

    # create a temporary directory for the per subprocess Matplotlib cache
    # directories
    # see the documentation of init_child_process() above
    tmpdir = tempfile.mkdtemp()
    main_log.debug(f"Will save temporary files in {tmpdir}")

    # everything that follows is wrapped in a try block, so that the
    # temporary directory is always removed, even if something goes wrong
    try:
        # read the configuration file
        # we use a custom subclass of swiftpipeline.Config, since we need some
        # additional configuration options (and we need to use a hack to add
        # our images to the web page)
        config = MorphologyConfig(config_directory=args.config)

        # set the Matplotlib style
        stylesheet_path = None
        if config.matplotlib_stylesheet != "default":
            stylesheet_path = (
                f"{config.config_directory}/{config.matplotlib_stylesheet}"
            )
            pl.style.use(stylesheet_path)

        # list the input snapshots
        snapshots = [
            load_snapshot(f"{input_dir}/{snapshot}")
            for input_dir, snapshot in zip(input_dirs, args.snapshots)
        ]
        # create names for the different runs if not provided
        if args.run_names is not None:
            run_names = args.run_names
        else:
            # First, check if the snapshots are all at the same redshift
            redshifts = {data.metadata.redshift for data in snapshots}
            # If the size of the set is one, then all redshifts are the same
            if len(redshifts) == 1:
                # All redshifts are the same! No need to modify runs' names
                run_names = [data.metadata.run_name for data in snapshots]
            # If the size of the set > 1, then at least two runs have
            # different redshifts
            else:
                # Need to append appropriate redshifts to names.
                run_names = [
                    f"{data.metadata.run_name} (z={data.metadata.redshift:1.3f})"
                    for data in snapshots
                ]

        # get the observational data path
        observational_data_path = (
            f"{config.config_directory}/{config.observational_data_directory}/data"
        )
        main_log.debug(f"Observational data path: {observational_data_path}")

        # create the output directory (and any missing parent directory) if
        # it does not exist
        os.makedirs(args.output, exist_ok=True)

        # check if we are running in single mode or comparison mode
        is_comparison = len(args.snapshots) > 1

        # create empty lists to store images and galaxy data
        all_galaxies_list = []
        all_images = {}
        if not is_comparison:
            # compose the meta-data file name
            # the [-9:-5] slice selects the 4 characters in front of a 5
            # character suffix of the snapshot name (the XXXX in
            # <metadata>_XXXX.yml)
            metadata_filename = (
                f"{input_dirs[0]}/{args.metadata}_{args.snapshots[0][-9:-5]}.yml"
            )

            if not args.lazy:
                all_gallery_images = {}
                # compute the galaxy properties
                # start by getting the catalogue and snapshot file name
                halo_catalogue_filename = f"{input_dirs[0]}/{args.catalogues[0]}"
                halo_membership_filename = f"{input_dirs[0]}/{args.groups[0]}"
                snapshot_filename = f"{input_dirs[0]}/{args.snapshots[0]}"

                # load the catalogue and create the filtered catalogue that only
                # contains galaxies above our threshold mass limits
                catalogue = load_catalogue(
                    halo_catalogue_filename, disregard_units=True
                )
                filtered_catalogue = FilteredCatalogue(
                    catalogue,
                    config.mass_limit_stars_in_Msun,
                    config.mass_variable_stars,
                    config.mass_limit_gas_in_Msun,
                    config.mass_variable_gas,
                    config.plotting_lower_mass_limit_in_Msun,
                    config.plotting_upper_mass_limit_in_Msun,
                    config.plotting_number_of_galaxies,
                    config.plotting_random_seed,
                )
                # close the catalogue by unloading it explicitly
                catalogue = None

                # create an empty object to store morphological data for all
                # galaxies
                number_of_galaxies = len(filtered_catalogue.galaxy_indices)
                all_galaxies = AllGalaxyData(number_of_galaxies)

                # create a list of arguments for parallel processing
                # every element is the tuple that is passed on to
                # process_galaxy() for one galaxy and contains (in this order):
                #   0. the index in the global galaxy list
                #   1. the catalogue index of the galaxy
                #   2. the catalogue file name
                #   3. the halo membership file name
                #   4. the snapshot file name
                #   5. the output directory name
                #   6. the observational data path
                #   7. config.scaleheight_binsize_kpc
                #   8. config.scaleheight_lower_gasmass_limit_in_number_of_particles
                #   9. config.make_individual_KS_plots
                #  10. the orientation method
                #  11. whether or not to make individual plots for this galaxy
                #  12. the MainLog
                arglist = [
                    (
                        index,
                        galaxy_index,
                        halo_catalogue_filename,
                        halo_membership_filename,
                        snapshot_filename,
                        args.output,
                        observational_data_path,
                        config.scaleheight_binsize_kpc,
                        config.scaleheight_lower_gasmass_limit_in_number_of_particles,
                        config.make_individual_KS_plots,
                        config.orientation_method,
                        make_plots,
                        main_log,
                    )
                    for index, (galaxy_index, make_plots) in enumerate(
                        zip(
                            filtered_catalogue.galaxy_indices,
                            filtered_catalogue.plot_galaxy,
                        )
                    )
                ]

                # make sure galaxies with plots are processed first, since they
                # take longer than average
                # element 11 of the tuple is the make_plots flag (see above)
                arglist.sort(key=lambda x: x[11], reverse=True)

                # determine the appropriate number of parallel processes to use:
                #  - the number requested by the user
                #  - in the absence thereof, the available number of threads
                #    reported by multiprocessing.cpu_count()
                num_proc = args.num_of_cpus
                if num_proc is None:
                    num_proc = mp.cpu_count()
                # make sure we don't use more processes than there are galaxies,
                # but always use at least one: a pool cannot be created with
                # zero processes, which would otherwise happen if no galaxy
                # passes the filter
                num_proc = max(1, min(num_proc, number_of_galaxies))
                main_log.message(
                    f"Computing properties for {number_of_galaxies} galaxies using"
                    f" {num_proc} parallel processes..."
                )
                # create the pool of subprocesses that will analyse individual
                # galaxies. We initialise every subprocess using
                # init_child_process(), which requires a copy of the environment
                # variables (so that we can change them), the name of the
                # temporary directory we created, and the Matplotlib style
                # sheet.
                # The pool is used as a context manager, so that the
                # subprocesses are terminated if something goes wrong while
                # processing the galaxies.
                parent_env = os.environ.copy()
                with mp.Pool(
                    num_proc,
                    initializer=init_child_process,
                    initargs=(parent_env, tmpdir, stylesheet_path),
                ) as pool:
                    # counter used to display progress
                    # we cannot simply use 'index', since the order of
                    # processing is not fixed
                    galaxy_count = 0
                    # process all galaxies using the parallel pool
                    for (
                        index,
                        galaxy_data,
                        images,
                        gallery_images,
                    ) in pool.imap_unordered(process_galaxy, arglist):
                        galaxy_count += 1
                        main_log.debug(
                            f"Finished processing galaxy {index} [{galaxy_count}/{number_of_galaxies}]"
                        )
                        # add galaxy contribution to global data
                        all_galaxies[index] = galaxy_data
                        # add galaxy images to the images, if there are any
                        if images is not None:
                            main_log.debug(f"Adding figures for galaxy {index}")
                            all_images.update(images)
                        if gallery_images is not None:
                            all_gallery_images.update(gallery_images)
                    # properly terminate the parallel pool
                    # while not strictly necessary, not doing this sometimes
                    # spawns confusing warning messages
                    pool.close()
                    pool.join()

                all_images["YYY - Gallery"] = all_gallery_images

                main_log.debug("Creating metadata output file")
                # save plot data using the given metadata name
                all_galaxies.output(metadata_filename)

            main_log.debug("Regenerating metadata from output file")
            # regenerate the data from the file, for consistency with the
            # comparison case (and because we did not compute anything if we
            # run in lazy mode)
            all_galaxies_list = [AllGalaxyData.fromfile(metadata_filename)]
        else:
            # Need to generate our data again from the existing meta-data files.
            metadata_filenames = [
                f"{input_dir}/{args.metadata}_{snapshot[-9:-5]}.yml"
                for input_dir, snapshot in zip(input_dirs, args.snapshots)
            ]

            main_log.debug("Regenerating metadata from files")
            # recreate data from metadata files
            all_galaxies_list = [
                AllGalaxyData.fromfile(filename) for filename in metadata_filenames
            ]

        # create global plots
        # all plotting functions take the same arguments and return the images
        # they created
        main_log.debug("Making global plots")
        all_images.update(
            plot_HI_size_mass(
                args.output, observational_data_path, run_names, all_galaxies_list
            )
        )
        all_images.update(
            plot_morphology(
                args.output, observational_data_path, run_names, all_galaxies_list
            )
        )
        all_images.update(
            plot_scaleheights(
                args.output, observational_data_path, run_names, all_galaxies_list
            )
        )
        all_images.update(
            plot_KS_relations(
                args.output, observational_data_path, run_names, all_galaxies_list
            )
        )

        # now add the images to the configuration file
        # this is a bit of a hack: we pretend that the images are "scripts" and
        # reuse the existing pipeline support for those. This is done by our
        # custom swiftpipeline.Config subclass
        config.add_images(all_images)

        # Create the webpage
        # This simply uses the swiftpipeline mechanism
        main_log.debug("Creating webpage")
        webpage = WebpageCreator()
        webpage.add_config_metadata(config=config, is_comparison=is_comparison)
        webpage.add_metadata(page_name=" | ".join(run_names))
        webpage.add_run_metadata(config=config, snapshots=snapshots)
        webpage.render_webpage()
        webpage.save_html(f"{args.output}/index.html")
    finally:
        # Remove the temporary directory hosting the Matplotlib caches
        main_log.debug(f"Removing temporary directory {tmpdir}")
        shutil.rmtree(tmpdir)

    # Tell the user we are done
    main_log.message("Done running pipeline.")