From 54cae9ecf2118efb3a4ee5f08e14319438eac084 Mon Sep 17 00:00:00 2001 From: porteratzo <44075849+porteratzo@users.noreply.github.com> Date: Thu, 15 Feb 2024 13:58:52 -0600 Subject: [PATCH] Add ray grouped runtime (#910) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix flake8 error in local runtime (#764) * Removes unnecessary dict comprehension Signed-off-by: Patrick Foley * Removes unnecessary dict comprehension Signed-off-by: Patrick Foley --------- Signed-off-by: Patrick Foley Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Update ROADMAP.md (#765) Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Update README.md Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Update GOVERNANCE.md Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Update ROADMAP.md (#785) Typos Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Updated integrations to GaNDLF (#781) * renaming loader and runner Signed-off-by: sarthakpati * updated plan to pick the new names Signed-off-by: sarthakpati * new key name Signed-off-by: sarthakpati * allow the ability to pass a file to `gandlf_config_dict` in addition to fully-fledged parameters Signed-off-by: sarthakpati * checking this differently Signed-off-by: sarthakpati * rename variable for clarity Signed-off-by: sarthakpati --------- Signed-off-by: sarthakpati Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Update README.md Removed references to Intel's ownship, given it's now owned by the LF AI and Data. 
Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Fix Flake8 C419 for Ubuntu CI (#800) C419 Unnecessary list comprehension passed to any()/all() prevents short-circuiting - rewrite as a generator Signed-off-by: Aleksander Kantak Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Introduced shard descriptor based collaborator private attributes Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Adding batch size for train, and test in config.yaml file Files modified: 1. config.yaml 2. mnist_shard_descriptor.py 3. Workflow_Interface_101_MNIST.ipynb 4. participants.py Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Introducing multiple config yaml files Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Removing unnecessary config.yaml file. Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Added collaborator private atribute delayed initialization for local_runtime Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Incorporated review comments Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Added multi-pricessing ray backend support and, aggregator yaml file Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Updated multi-processing code Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * RayExecutor class moved from participants.py to localruntime.py Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * RayExecytor moved from 
interface/participants.py to runtime/local_runtime.py Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Added Aggregator private attribute initialization in runtime Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Removed unnecessary import statements Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Code cleaned up, validated checkpoints manually Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Refactored, and added some new doc string Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Resolved Flake8 instructions Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Recursion removal + Serialization removal integrated Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Incorporated Review Comments Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Removed configuration YAML files, and added functionality to initialize private attributes by calling a callback function created by end-user Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Removed commented code Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Implemented new approach, two example files given 1. 
Workflow_Interface_301_MNIST_Watermarking.py Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Internal Review Comments Incorporated Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * No private attributes are required. If private attributes are not provided, an empty dictionary is used by default; there is no need to pass a callable function. Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Update participants.py Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Added a check for GPU Resource Allocation Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Modified error message for resource allocation Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Resolved bug found during testing phase Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Modified all the test cases, and following tutorials 1. Privacy Meter 2. FedProx Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Added following test cases: 1. Workflow_Interface_101_MNIST.ipynb 2. Workflow_Interface_102_Aggregator_Validation.ipynb 3. Workflow_Interface_301_MNIST_Watermarking.ipynb 4. Workflow_Interface_201_Exclusive_GPUs_with_Ray.ipynb 5. Workflow_Interface_103_Cyclic_Institutional_Incremental_Learning.ipynb Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Modified and Added Global_DP tutorials. 
Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Modified and Added tutorial Workflow-Interface_201_Exclusive_GPUs_with_Ray.ipynb Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Modified documentation for Workflow_Interface_201 tutorial. Signed-off-by: Parth Mandaliya Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * fixed flake-8 errors Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * reverted import module code Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya * Resolved merge conflicts in local_runtime.py --------- Fix flake8 error in local runtime (#764) * Removes unnecessary dict comprehension Signed-off-by: Patrick Foley * Removes unnecessary dict comprehension Signed-off-by: Patrick Foley --------- Signed-off-by: Patrick Foley --------- Signed-off-by: Parth Mandaliya * Update README.md Signed-off-by: Parth Mandaliya * Fix warnings and issues in docs (#825) * Fix warnings and issues in docs Signed-off-by: Aleksander Kantak * fixup! Fix warnings and issues in docs Signed-off-by: Aleksander Kantak --------- Signed-off-by: Aleksander Kantak Signed-off-by: Parth Mandaliya * Add Logo (#827) * Add Logo * Update README.md Signed-off-by: Parth Mandaliya * Change OpenFL documentation font to improve accessibility (#809) This replaces the font of OpenFL documents with Intel One Mono font for low vision developers. Known issues: 1. The text font within the images has not been changed. 2. Some icons that do not exist in the new font cannot be displayed properly. 
Fixes securefederatedai#799 Co-authored-by: Wang, Le Signed-off-by: He, Dan H Signed-off-by: Jiang, Jiaqiu Signed-off-by: Li, Qingqing Signed-off-by: Wang, Le Signed-off-by: Wu, Caili Signed-off-by: He, Dan H Co-authored-by: He, Dan H Signed-off-by: Parth Mandaliya * Update unit tests to improve code coverage (#821) * Update ci config Signed-off-by: Fang, Xiaoran * Add unit test for following files - openfl/federated/plan/plan.py - openfl/interface/aggregation_functions/core/adaptive_aggregation.py Signed-off-by: Fang, Xiaoran * Add some test cases for databases module Signed-off-by: Fang, Xiaoran * Fix bugs for databases module unittest Signed-off-by: Fang, Xiaoran * Update unit tests for component module Signed-off-by: Fang, Xiaoran * Restore workflow config and update some comments Signed-off-by: Fang, Xiaoran * Enable save_ test case. Add yaml under test dir for unit test usage. Signed-off-by: Fang, Xiaoran * Remove plan to new dir. Signed-off-by: Fang, Xiaoran * Remove plan to new dir. Signed-off-by: Fang, Xiaoran * Add aggregator start test cases. Signed-off-by: Fang, Xiaoran * Add 2 aggregator test cases. Signed-off-by: Fang, Xiaoran * Add 1 aggregator test case. Signed-off-by: Fang, Xiaoran * Format code. Signed-off-by: Fang, Xiaoran * Refactor code. Signed-off-by: Fang, Xiaoran * Add collaborator start test cases. Signed-off-by: Fang, Xiaoran * Add 1 collaborator test case. Signed-off-by: Fang, Xiaoran * Format with flake8 Signed-off-by: Fang, Xiaoran * Remove TODO comments Signed-off-by: Fang, Xiaoran --------- Signed-off-by: Fang, Xiaoran Co-authored-by: Wang, Wenjie Co-authored-by: Lei5 Chen Signed-off-by: Parth Mandaliya * Add PyTorch linear regression example (#808) This adds a new tutorial example on distributing a linear regression task over OpenFL cluster. The model is defined by Pytorch which is able to run over both cpu (by default) and gpu. The dataset is generated by make_regression from sklearn.datasets with pre-defined parameters. 
Fixes #797 Co-authored-by: Jiang, Jiaqiu Signed-off-by: He, Dan H Signed-off-by: Jiang, Jiaqiu Signed-off-by: Li, Qingqing Signed-off-by: Wang, Le Signed-off-by: Wu, Caili Signed-off-by: He, Dan H Signed-off-by: Parth Mandaliya * This prints out the hash of the CSR to disk for both the aggregator and (#813) * This prints out the hash of the CSR to disk for both the aggregator and collaborator. The user then compares and approves this hash with the hash printed out of the file to validate the CSR. In addition, a warning message is pritned if certify is run in silent mode. Fixes securefederatedai#692 Signed-off-by: Grant Baker * Refactor read_csr function to use get_csr_hash Signed-off-by: Grant Baker * Ask to check hashes before prompt --------- Signed-off-by: Grant Baker Co-authored-by: Grant Baker Signed-off-by: Parth Mandaliya * Improve workspace requirements import (#810) Remove the dump_requirement_file operation in export_ method. Fixes securefederatedai#767 Co-authored-by: Li, Qingqing Co-authored-by: Wu, Caili Signed-off-by: He, Dan H Signed-off-by: Jiang, Jiaqiu Signed-off-by: Li, Qingqing Signed-off-by: Wang, Le Signed-off-by: Wu, Caili Signed-off-by: He, Dan H Signed-off-by: Parth Mandaliya * Issue 506 Added Example using FedProx (#818) * created new ineractive_api dir to hold pytorch fedprox mnist example corrected files changed to FedProxOptimizer and ran set_old_weights for new FedProx Pytorch example renamed FedProx notebook used mode.parrameters() to get pytorch model weights got weights using state_dict changed old wieghts to list (for serialization) and fixed README input wieghts before zero_grad [Enhancement: 506] Add an example that uses the FedProx optimizer in the interative_api This duplicates the MedNIST_2D example in the interative api but changes it to use the FedProx optimizer. 
Fixes: #506 Signed-off-by: Klemme, Beverly Signed-off-by: Baker, Grant Signed-off-by: ELizabeth Simon, Neethu Signed-off-by: Jillela, Emmanuel Signed-off-by: Beverly Klemme * [Enhancement: 506] Add an example that uses the FedProx optimizer in the interative_api This duplicates the MedNIST_2D example in the interative api but changes it to use the FedProx optimizer. Fixes: securefederatedai#506 Signed-off-by: Klemme, Beverly Signed-off-by: Baker, Grant Signed-off-by: Elizabeth Simon, Neethu Signed-off-by: Jillela, Emmanuel Signed-off-by: Beverly Klemme * Update README.md Signed-off-by: Beverly Klemme * addressed comments by psfoley: corrected words in the jupyter notebook metadata and added a link to the FedProx paper in the README. Signed-off-by: Beverly Klemme --------- Signed-off-by: Klemme, Beverly Signed-off-by: Baker, Grant Signed-off-by: ELizabeth Simon, Neethu Signed-off-by: Jillela, Emmanuel Signed-off-by: Beverly Klemme Signed-off-by: Elizabeth Simon, Neethu Signed-off-by: Parth Mandaliya * [Bug: 768] FX CLI: Separate create, cert gen commands (#807) This change separates existing command "fx collaborator.py generate-cert-request" command into two commands. "fx collaborator create -n {NAME} -d {DATA_PATH: optional}". "fx collaborator generate-cert-request -n {NAME}". Fixes #768 Signed-off-by: Emmanuel Jillela Co-authored-by: Emmanuel Jillela Signed-off-by: Parth Mandaliya * Add new tutorial example to OpenFL interactive API (#812) * Add new tutorial example to OpenFL interactive API This adds a new tutorial example on distributing a linear regression task over OpenFL cluster The model is defined by scikit-learn which is able to run over both cpu (by default) and gpu. The dataset is 1-dimensional noisy data of sinusoid with pre-defined parameters. 
Fixes #798 Co-authored-by: Beverly Klemme Co-authored-by: Grant Baker Signed-off-by: Yi CAO * reduced requirements.txt in workspace Signed-off-by: Beverly Klemme --------- Signed-off-by: Yi CAO Signed-off-by: Beverly Klemme Co-authored-by: Yi CAO Signed-off-by: Parth Mandaliya * build(deps): bump tensorflow in /openfl-workspace/tf_cnn_histology (#776) Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.9.3 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.9.3...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Parth Mandaliya * build(deps): bump tensorflow (#777) Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.9.3 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.9.3...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Parth Mandaliya * Running a federation with GaNDLF Documentation (#794) * Initial commit of Running the federation with GaNDLF Documentation Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update docs/running_the_federation_with_gandlf.rst Co-authored-by: Sarthak Pati Signed-off-by: Patrick Foley * Update README.md Removed references to Intel's ownship, given it's now owned by the LF AI and Data. Signed-off-by: Patrick Foley * Fix Flake8 C419 for Ubuntu CI (#800) C419 Unnecessary list comprehension passed to any()/all() prevents short-circuiting - rewrite as a generator Signed-off-by: Aleksander Kantak Signed-off-by: Patrick Foley * Update README.md Signed-off-by: Patrick Foley * Fix warnings and issues in docs (#825) * Fix warnings and issues in docs Signed-off-by: Aleksander Kantak * fixup! 
Fix warnings and issues in docs Signed-off-by: Aleksander Kantak --------- Signed-off-by: Aleksander Kantak Signed-off-by: Patrick Foley * Add Logo (#827) * Add Logo * Update README.md Signed-off-by: Patrick Foley * Change OpenFL documentation font to improve accessibility (#809) This replaces the font of OpenFL documents with Intel One Mono font for low vision developers. Known issues: 1. The text font within the images has not been changed. 2. Some icons that do not exist in the new font cannot be displayed properly. Fixes securefederatedai#799 Co-authored-by: Wang, Le Signed-off-by: He, Dan H Signed-off-by: Jiang, Jiaqiu Signed-off-by: Li, Qingqing Signed-off-by: Wang, Le Signed-off-by: Wu, Caili Signed-off-by: He, Dan H Co-authored-by: He, Dan H Signed-off-by: Patrick Foley * Update unit tests to improve code coverage (#821) * Update ci config Signed-off-by: Fang, Xiaoran * Add unit test for following files - openfl/federated/plan/plan.py - openfl/interface/aggregation_functions/core/adaptive_aggregation.py Signed-off-by: Fang, Xiaoran * Add some test cases for databases module Signed-off-by: Fang, Xiaoran * Fix bugs for databases module unittest Signed-off-by: Fang, Xiaoran * Update unit tests for component module Signed-off-by: Fang, Xiaoran * Restore workflow config and update some comments Signed-off-by: Fang, Xiaoran * Enable save_ test case. Add yaml under test dir for unit test usage. Signed-off-by: Fang, Xiaoran * Remove plan to new dir. Signed-off-by: Fang, Xiaoran * Remove plan to new dir. Signed-off-by: Fang, Xiaoran * Add aggregator start test cases. Signed-off-by: Fang, Xiaoran * Add 2 aggregator test cases. Signed-off-by: Fang, Xiaoran * Add 1 aggregator test case. Signed-off-by: Fang, Xiaoran * Format code. Signed-off-by: Fang, Xiaoran * Refactor code. Signed-off-by: Fang, Xiaoran * Add collaborator start test cases. Signed-off-by: Fang, Xiaoran * Add 1 collaborator test case. 
Signed-off-by: Fang, Xiaoran * Format with flake8 Signed-off-by: Fang, Xiaoran * Remove TODO comments Signed-off-by: Fang, Xiaoran --------- Signed-off-by: Fang, Xiaoran Co-authored-by: Wang, Wenjie Co-authored-by: Lei5 Chen Signed-off-by: Patrick Foley * Add PyTorch linear regression example (#808) This adds a new tutorial example on distributing a linear regression task over OpenFL cluster. The model is defined by Pytorch which is able to run over both cpu (by default) and gpu. The dataset is generated by make_regression from sklearn.datasets with pre-defined parameters. Fixes #797 Co-authored-by: Jiang, Jiaqiu Signed-off-by: He, Dan H Signed-off-by: Jiang, Jiaqiu Signed-off-by: Li, Qingqing Signed-off-by: Wang, Le Signed-off-by: Wu, Caili Signed-off-by: He, Dan H Signed-off-by: Patrick Foley * This prints out the hash of the CSR to disk for both the aggregator and (#813) * This prints out the hash of the CSR to disk for both the aggregator and collaborator. The user then compares and approves this hash with the hash printed out of the file to validate the CSR. In addition, a warning message is pritned if certify is run in silent mode. Fixes securefederatedai#692 Signed-off-by: Grant Baker * Refactor read_csr function to use get_csr_hash Signed-off-by: Grant Baker * Ask to check hashes before prompt --------- Signed-off-by: Grant Baker Co-authored-by: Grant Baker Signed-off-by: Patrick Foley * Improve workspace requirements import (#810) Remove the dump_requirement_file operation in export_ method. 
Fixes securefederatedai#767 Co-authored-by: Li, Qingqing Co-authored-by: Wu, Caili Signed-off-by: He, Dan H Signed-off-by: Jiang, Jiaqiu Signed-off-by: Li, Qingqing Signed-off-by: Wang, Le Signed-off-by: Wu, Caili Signed-off-by: He, Dan H Signed-off-by: Patrick Foley * Issue 506 Added Example using FedProx (#818) * created new ineractive_api dir to hold pytorch fedprox mnist example corrected files changed to FedProxOptimizer and ran set_old_weights for new FedProx Pytorch example renamed FedProx notebook used mode.parrameters() to get pytorch model weights got weights using state_dict changed old wieghts to list (for serialization) and fixed README input wieghts before zero_grad [Enhancement: 506] Add an example that uses the FedProx optimizer in the interative_api This duplicates the MedNIST_2D example in the interative api but changes it to use the FedProx optimizer. Fixes: #506 Signed-off-by: Klemme, Beverly Signed-off-by: Baker, Grant Signed-off-by: ELizabeth Simon, Neethu Signed-off-by: Jillela, Emmanuel Signed-off-by: Beverly Klemme * [Enhancement: 506] Add an example that uses the FedProx optimizer in the interative_api This duplicates the MedNIST_2D example in the interative api but changes it to use the FedProx optimizer. Fixes: securefederatedai#506 Signed-off-by: Klemme, Beverly Signed-off-by: Baker, Grant Signed-off-by: Elizabeth Simon, Neethu Signed-off-by: Jillela, Emmanuel Signed-off-by: Beverly Klemme * Update README.md Signed-off-by: Beverly Klemme * addressed comments by psfoley: corrected words in the jupyter notebook metadata and added a link to the FedProx paper in the README. 
Signed-off-by: Beverly Klemme --------- Signed-off-by: Klemme, Beverly Signed-off-by: Baker, Grant Signed-off-by: ELizabeth Simon, Neethu Signed-off-by: Jillela, Emmanuel Signed-off-by: Beverly Klemme Signed-off-by: Elizabeth Simon, Neethu Signed-off-by: Patrick Foley * [Bug: 768] FX CLI: Separate create, cert gen commands (#807) This change separates existing command "fx collaborator.py generate-cert-request" command into two commands. "fx collaborator create -n {NAME} -d {DATA_PATH: optional}". "fx collaborator generate-cert-request -n {NAME}". Fixes #768 Signed-off-by: Emmanuel Jillela Co-authored-by: Emmanuel Jillela Signed-off-by: Patrick Foley * Add new tutorial example to OpenFL interactive API (#812) * Add new tutorial example to OpenFL interactive API This adds a new tutorial example on distributing a linear regression task over OpenFL cluster The model is defined by scikit-learn which is able to run over both cpu (by default) and gpu. The dataset is 1-dimensional noisy data of sinusoid with pre-defined parameters. Fixes #798 Co-authored-by: Beverly Klemme Co-authored-by: Grant Baker Signed-off-by: Yi CAO * reduced requirements.txt in workspace Signed-off-by: Beverly Klemme --------- Signed-off-by: Yi CAO Signed-off-by: Beverly Klemme Co-authored-by: Yi CAO Signed-off-by: Patrick Foley * build(deps): bump tensorflow in /openfl-workspace/tf_cnn_histology (#776) Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.9.3 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.9.3...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Patrick Foley * build(deps): bump tensorflow (#777) Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.9.3 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.9.3...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Patrick Foley * Update GaNDLF repo location and test CI with master branch Signed-off-by: Patrick Foley * Update GaNDLF repo location and test CI with master branch Signed-off-by: Patrick Foley * Update GaNDLF repo location and test CI with master branch Signed-off-by: Patrick Foley * Fix documentation links. 
Change path names and templates for CI Signed-off-by: Patrick Foley * Fix paths Signed-off-by: Patrick Foley * Fix paths Signed-off-by: Patrick Foley * Fix breaking tests Signed-off-by: Patrick Foley * Add compatible onnx version to requirements.txt file Signed-off-by: Patrick Foley * Fix wrong csv file name Signed-off-by: Patrick Foley * Fix wrong csv file name Signed-off-by: Patrick Foley * Fix wrong names in workflow file Signed-off-by: Patrick Foley * Fix wrong data path Signed-off-by: Patrick Foley * Fix lint in test_gandlf.py Signed-off-by: Patrick Foley * Fix lint errors Signed-off-by: Patrick Foley --------- Signed-off-by: Patrick Foley Signed-off-by: Aleksander Kantak Signed-off-by: He, Dan H Signed-off-by: Fang, Xiaoran Signed-off-by: Klemme, Beverly Signed-off-by: Baker, Grant Signed-off-by: ELizabeth Simon, Neethu Signed-off-by: Jillela, Emmanuel Signed-off-by: Beverly Klemme Signed-off-by: Elizabeth Simon, Neethu Signed-off-by: Emmanuel Jillela Signed-off-by: Yi CAO Co-authored-by: Sarthak Pati Co-authored-by: Prashant Shah <40899779+SprashAI@users.noreply.github.com> Co-authored-by: akantak Co-authored-by: wangleflex <106506636+wangleflex@users.noreply.github.com> Co-authored-by: He, Dan H Co-authored-by: xiaoranf Co-authored-by: Wang, Wenjie Co-authored-by: Lei5 Chen Co-authored-by: Beverly Klemme <35578090+bjklemme-intel@users.noreply.github.com> Co-authored-by: Emmanuel Jillela Co-authored-by: Yi CAO Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Parth Mandaliya * Fixed GaNDLF rst issues. Add sphinxcontrib-mermaid (#841) Signed-off-by: Patrick Foley Signed-off-by: Parth Mandaliya * Fix GaNDLF documentation links (#842) * Fixed GaNDLF rst issues. Add sphinxcontrib-mermaid Signed-off-by: Patrick Foley * Fix links in GaNDLF Documentation * Fixed GaNDLF rst issues. 
Add sphinxcontrib-mermaid Signed-off-by: Patrick Foley * Fix links in GaNDLF Documentation Signed-off-by: Patrick Foley --------- Signed-off-by: Patrick Foley Signed-off-by: Parth Mandaliya * Fix incorrectly formatted link in docs (#839) Signed-off-by: Francis Storr Signed-off-by: Parth Mandaliya * Resolving merge conflicts in local-runtime.py Integrated aggregator as stateful actor branch, tested. Signed-off-by: Parth Mandaliya -------- Signed-off-by: Parth Mandaliya * build(deps): bump onnx in /openfl-workspace/gandlf_seg_test (#840) Bumps [onnx](https://github.com/onnx/onnx) from 1.12 to 1.13.0. - [Release notes](https://github.com/onnx/onnx/releases) - [Changelog](https://github.com/onnx/onnx/blob/main/docs/Changelog.md) - [Commits](https://github.com/onnx/onnx/compare/v1.12.0...v1.13.0) --- updated-dependencies: - dependency-name: onnx dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Parth Mandaliya * Merged changes of remove-torch-dependency branch Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Update setup.py Upgrading protobuf to 3.20.3 as per tensorboard requirement Signed-off-by: Parth Mandaliya * Resolving merge conflicts Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Accessibility updates (#861) * Fix incorrectly formatted link in docs Signed-off-by: Francis Storr * Font styling, color contrast, other accessibility updates This update: 1. Restores Roboto and Lato fonts for most body copy, leaving Intel One Mono for code samples. 2. Adds colors (in `colors.css`) 3. Adds a new `accessibility_overrides.css` file containing CSS that improves the accessibility of the documentation and, where possible, Read The Docs. These updates remediate numerous non-conforming WCAG 2.x Level AA bugs. 
The use of a separate file for this hopefully makes these changes easier to manage and less likely to be accessibility overwritten in the future. Closes #848 Signed-off-by: Francis Storr --------- Signed-off-by: Francis Storr Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * build(deps): bump tensorflow from 2.8.4 to 2.11.1 in /openfl-workspace/keras_nlp (#773) * build(deps): bump tensorflow in /openfl-workspace/keras_nlp Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.8.4 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.8.4...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: Patrick Foley * Update RMSProp optimizer import Signed-off-by: Patrick Foley * flake8 Signed-off-by: Patrick Foley --------- Signed-off-by: Patrick Foley Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Patrick Foley Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * build(deps): bump tensorflow from 2.8.4 to 2.11.1 in /openfl-workspace/keras_cnn_mnist (#771) * build(deps): bump tensorflow in /openfl-workspace/keras_cnn_mnist Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.8.4 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.8.4...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * revert experimental Adam to legacy (#863) Signed-off-by: kta-intel --------- Signed-off-by: dependabot[bot] Signed-off-by: kta-intel Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Kevin Ta <116312994+kta-intel@users.noreply.github.com> Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * build(deps): bump tensorflow from 2.8.4 to 2.11.1 in /openfl-workspace/keras_cnn_with_compression (#770) * build(deps): bump tensorflow Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.8.4 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.8.4...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: Patrick Foley * Update Adam Optimizer import Signed-off-by: Patrick Foley --------- Signed-off-by: Patrick Foley Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Patrick Foley Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * build(deps): bump tensorflow from 2.9.3 to 2.11.1 in /openfl-tutorials/interactive_api/Flax_CNN_CIFAR (#775) * build(deps): bump tensorflow Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.9.3 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.9.3...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * Fixed breaking backages Signed-off-by: Patrick Foley * Add quiet flag back to pip install Signed-off-by: Patrick Foley --------- Signed-off-by: dependabot[bot] Signed-off-by: Patrick Foley Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Patrick Foley Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * build(deps): bump tensorflow-cpu from 2.8.4 to 2.11.1 in /openfl-workspace/keras_nlp_gramine_ready (#769) * build(deps): bump tensorflow-cpu Bumps [tensorflow-cpu](https://github.com/tensorflow/tensorflow) from 2.8.4 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.8.4...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow-cpu dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Use legacy RMSprop optimizer Signed-off-by: Patrick Foley --------- Signed-off-by: dependabot[bot] Signed-off-by: Patrick Foley Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Patrick Foley Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Accessibility color contrast fixes (#864) * Fix incorrectly formatted link in docs Signed-off-by: Francis Storr * Font styling, color contrast, other accessibility updates This update: 1. Restores Roboto and Lato fonts for most body copy, leaving Intel One Mono for code samples. 2. Adds colors (in `colors.css`) 3. Adds a new `accessibility_overrides.css` file containing CSS that improves the accessibility of the documentation and, where possible, Read The Docs. These updates remediate numerous non-conforming WCAG 2.x Level AA bugs. The use of a separate file for this hopefully makes these changes easier to manage and less likely to be accessibility overwritten in the future. 
Closes #848 Signed-off-by: Francis Storr * Color contrast updates for accessibility Color contrast updates for accessibility - update generic `a` element - update color of links in the toggle-able read-the-docs panel - update the color of the text in search results - update the color of notes headers Signed-off-by: Francis Storr --------- Signed-off-by: Francis Storr Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Tweak link color so it’s not so aggressive (#865) Signed-off-by: Francis Storr Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * build(deps): bump tensorflow from 2.8.4 to 2.11.1 in /tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy (#772) * build(deps): bump tensorflow Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.8.4 to 2.11.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.8.4...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Update sd_requirements.txt * revert to legacy SGD and install tensorflow==2.11 for workflow Signed-off-by: kta-intel --------- Signed-off-by: dependabot[bot] Signed-off-by: kta-intel Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Patrick Foley Co-authored-by: kta-intel Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * build(deps): bump tensorflow from 2.8.4 to 2.11.1 in /openfl-workspace/tf_2dunet (#774) * build(deps): bump tensorflow in /openfl-workspace/tf_2dunet Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.8.4 to 2.11.1. 
- [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.8.4...v2.11.1) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Update requirements.txt to retrigger CI * Update requirements.txt --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Kevin Ta <116312994+kta-intel@users.noreply.github.com> Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Update Tensorflow, gRPC, Protobuf dependencies (#868) * Update Tensorflow to latest, finally update grpcio/protobuf Signed-off-by: Patrick Foley * Lint issue fix and missing tf reference Signed-off-by: Patrick Foley * pyzmq version fixed * fix taskrunner tests for windows Signed-off-by: Mansi Sharma * fix taskrunner test syntax for windows Signed-off-by: Mansi Sharma * adding user option to workspace pip install requirements for windows Signed-off-by: Mansi Sharma * fix windows CI test Signed-off-by: Mansi Sharma * testing virtual env for windows github actions Signed-off-by: Mansi Sharma * testing virtual env for windows github actions Signed-off-by: Mansi Sharma * testing virtual env for windows github actions Signed-off-by: Mansi Sharma * testing venv for windows Signed-off-by: Mansi Sharma * test venv for windows * test venv for windows * Added new KerasSerializer. 
Fixed other Interactive API experiments * Update taskrunner.yml * Update taskrunner.yml * Update workspace.py * Update workspace.py * Update taskrunner.yml * Remove get_model import from global namespace so dependencies are not loaded into memory unnecessarily (breaking windows build) * Refactoring and cleaning up imports to support Windows install * Fixed logger import paths * Fix missing imports * Fix native import * Fix lint errors * Fix keras optimizer patch. Remove irrelevant unit test * Format logs in UTF-8 for windows * Update interactive-kvasir.yml * Consolidate github actions python versions to single file * Update python versions * Update python versions * Update python versions * Reduce # of DataLoader workers for Pytorch Kvasir CI test * Fix Windows encoding * Fix Windows encoding and limit rounds so Github Actions CI doesn't run out of memory Signed-off-by: Patrick Foley * Fix windows encoding * Fix Windows encoding --------- Signed-off-by: Patrick Foley Signed-off-by: Mansi Sharma Co-authored-by: Mansi Sharma <77758170+mansishr@users.noreply.github.com> Co-authored-by: Mansi Sharma Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Add FL plan description to documentation (#872) * Add plan description to documentation Signed-off-by: Mansi Sharma * fix indentation Signed-off-by: Mansi Sharma * Apply suggestions from code review Co-authored-by: Patrick Foley --------- Signed-off-by: Mansi Sharma Co-authored-by: Patrick Foley Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Resolved flake8 issues Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * GPU Added for aggregator Fixed issue in 103 Cyclic Institutional Incremental Learning tutorial Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Resolve Coverity Issues (#874) * Fix coverity issues * Resolve remaining coverity issues Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Migrate to Ubuntu 22.04 LTS release (supported through 2027) 
(#875) Signed-off-by: Patrick Foley Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Updated documentation: docs/workflow_interface.rst Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Update Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Update Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Update Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Updated Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Updated Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Updated documentation Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Update workflow_interface.rst Fixing typo Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Added best model and last model extraction technique in docs/workflow_interface.rst Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Added GPU for aggregator Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Resolving merge conflicts in 103 cyclic tutorial notebook fixing FedAvg in workflow interface tutorials to be compatible with latest numpy stable release (1.24.3) (#833) * fixing FedAvg averaging in order to be compatible with numpy v1.24+ Signed-off-by: kta-intel * uncommenting installations for consistency with other tutorials Signed-off-by: kta-intel * fixing 301_MNIST_Watermarking tutorial FedAvg Signed-off-by: kta-intel * fixing 301_MNIST_Watermarki ng tutorial FedAvg Signed-off-by: kta-intel * Switching to py38 kernel and clearing cell outputs Signed-off-by: kta-intel --------- Signed-off-by: kta-intel --------- Signed-off-by: Parth Mandaliya * Resolved merge conflicts in tests/github/experimental/testflow_datastore_cli.py Testflow for verifying stdout redirection to Metaflow datastore (#758) * implemented ray.wait * reverted changes back after testing * adding datastore cli test case * removed unused variables * removed stderr validation * fixed lint suggestions Signed-off-by: 
Parth Mandaliya * Added weighted_average aggregation function under openfl.experimental.interface.{keras,torch}.aggregation_funtions Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya * Update EdenPipeline in the documentation (#877) Signed-off-by: Amit Portnoy <1131991+amitport@users.noreply.github.com> Signed-off-by: Parth Mandaliya * WIP: CI Scans (#873) * Initial scans commit for bandit, hadolint, trivy Signed-off-by: Patrick Foley * Address bandit scan results Signed-off-by: Patrick Foley * Fix Trivy action Signed-off-by: Patrick Foley * Fix linting Signed-off-by: Patrick Foley * Add Coverity Badge Signed-off-by: Patrick Foley * Update Hadolint threshold to flag errors only Signed-off-by: Patrick Foley * Update Hadolint threshold to flag errors only Signed-off-by: Patrick Foley --------- Signed-off-by: Patrick Foley Signed-off-by: Parth Mandaliya * Update ROADMAP.md (#878) Signed-off-by: Parth Mandaliya * initial commit * add docstrings * change importlib to import * remove unnecesary files, replace ray with ray_grouped * remove max concurency, add number of actors * Trigger CI * run tests * lint changes * flake * changed number of actors to num_actors, added docs * Fixed workflow API tests Signed-off-by: Patrick Foley * lint fixes Signed-off-by: Patrick Foley --------- Signed-off-by: Patrick Foley Signed-off-by: ParthM-GitHub Signed-off-by: Parth Mandaliya Signed-off-by: sarthakpati Signed-off-by: Aleksander Kantak Signed-off-by: Parth Mandaliya Signed-off-by: Parth Mandaliya Signed-off-by: He, Dan H Signed-off-by: Fang, Xiaoran Signed-off-by: Grant Baker Signed-off-by: Klemme, Beverly Signed-off-by: Baker, Grant Signed-off-by: ELizabeth Simon, Neethu Signed-off-by: Jillela, Emmanuel Signed-off-by: Beverly Klemme Signed-off-by: Elizabeth Simon, Neethu Signed-off-by: Emmanuel Jillela Signed-off-by: Yi CAO Signed-off-by: dependabot[bot] Signed-off-by: Francis Storr Signed-off-by: kta-intel Signed-off-by: Mansi Sharma Signed-off-by: Amit Portnoy 
<1131991+amitport@users.noreply.github.com> Co-authored-by: Patrick Foley Co-authored-by: Olga Perepelkina Co-authored-by: Joe Devon <138038+joedevon@users.noreply.github.com> Co-authored-by: Sarthak Pati Co-authored-by: Prashant Shah <40899779+SprashAI@users.noreply.github.com> Co-authored-by: akantak Co-authored-by: Parth Mandaliya Co-authored-by: Parth Mandaliya Co-authored-by: Parth Mandaliya Co-authored-by: Keerti Talwar Co-authored-by: KeertiX Co-authored-by: wangleflex <106506636+wangleflex@users.noreply.github.com> Co-authored-by: He, Dan H Co-authored-by: xiaoranf Co-authored-by: Wang, Wenjie Co-authored-by: Lei5 Chen Co-authored-by: Beverly Klemme <35578090+bjklemme-intel@users.noreply.github.com> Co-authored-by: Grant Baker Co-authored-by: Emmanuel Jillela Co-authored-by: Yi CAO Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Sarthak Pati Co-authored-by: Francis Storr Co-authored-by: Patrick Foley Co-authored-by: Kevin Ta <116312994+kta-intel@users.noreply.github.com> Co-authored-by: kta-intel Co-authored-by: Mansi Sharma <77758170+mansishr@users.noreply.github.com> Co-authored-by: Mansi Sharma Co-authored-by: Sachin Gupta Co-authored-by: Keerti Prakash Talwar <115972088+KeertiX@users.noreply.github.com> Co-authored-by: Amit Portnoy <1131991+amitport@users.noreply.github.com> Signed-off-by: nammbash --- .gitignore | 1 - docs/workflow_interface.rst | 189 +++- ...rkflow_Interface_Mnist_Implementation_1.py | 95 +- ...rkflow_Interface_Mnist_Implementation_2.py | 82 +- .../Global_DP/requirements_global_dp.txt | 2 +- .../experimental/Privacy_Meter/cifar10_PM.py | 87 +- .../requirements_privacy_meter.txt | 4 +- .../Workflow_Interface_VFL_Two_Party.ipynb | 58 +- .../Workflow_Interface_Vertical_FL.ipynb | 39 +- .../Workflow_Interface_101_MNIST.ipynb | 182 ++-- ..._Interface_102_Aggregator_Validation.ipynb | 79 +- ...c_Institutional_Incremental_Learning.ipynb | 173 ++-- 
...w_Interface_104_Keras_MNIST_with_GPU.ipynb | 363 +++++++ ...nterface_201_Exclusive_GPUs_with_Ray.ipynb | 122 +-- ...low_Interface_301_MNIST_Watermarking.ipynb | 113 ++- ...ce_401_FedProx_with_Synthetic_nonIID.ipynb | 822 ++++++++++++++++ .../requirements_workflow_interface.txt | 9 +- openfl/experimental/interface/__init__.py | 4 +- openfl/experimental/interface/fl_spec.py | 46 +- .../experimental/interface/keras/__init__.py | 7 + .../keras/aggregation_functions/__init__.py | 7 + .../aggregation_functions/weighted_average.py | 13 + openfl/experimental/interface/participants.py | 283 ++++-- .../experimental/interface/torch/__init__.py | 7 + .../torch/aggregation_functions/__init__.py | 7 + .../aggregation_functions/weighted_average.py | 77 ++ openfl/experimental/placement/__init__.py | 4 +- openfl/experimental/placement/placement.py | 67 +- openfl/experimental/runtime/__init__.py | 3 +- openfl/experimental/runtime/local_runtime.py | 918 +++++++++++++----- openfl/experimental/utilities/__init__.py | 11 +- openfl/experimental/utilities/exceptions.py | 8 +- .../experimental/utilities/metaflow_utils.py | 2 - openfl/experimental/utilities/resources.py | 27 +- .../experimental/utilities/runtime_utils.py | 69 +- .../experimental/utilities/stream_redirect.py | 3 + openfl/experimental/utilities/ui.py | 7 +- openfl/experimental/utilities/utils.py | 8 - setup.py | 4 + ...ements_experimental_localruntime_tests.txt | 5 + .../experimental/testflow_datastore_cli.py | 41 +- tests/github/experimental/testflow_exclude.py | 39 +- tests/github/experimental/testflow_include.py | 38 +- .../experimental/testflow_include_exclude.py | 24 +- .../experimental/testflow_internalloop.py | 24 +- .../testflow_privateattributes.py | 58 +- .../github/experimental/testflow_reference.py | 101 +- .../testflow_reference_with_exclude.py | 162 +--- .../testflow_reference_with_include.py | 154 +-- .../testflow_subset_of_collaborators.py | 40 +- 50 files changed, 3406 insertions(+), 1282 deletions(-) 
create mode 100644 openfl-tutorials/experimental/Workflow_Interface_104_Keras_MNIST_with_GPU.ipynb create mode 100644 openfl-tutorials/experimental/Workflow_Interface_401_FedProx_with_Synthetic_nonIID.ipynb create mode 100644 openfl/experimental/interface/keras/__init__.py create mode 100644 openfl/experimental/interface/keras/aggregation_functions/__init__.py create mode 100644 openfl/experimental/interface/keras/aggregation_functions/weighted_average.py create mode 100644 openfl/experimental/interface/torch/__init__.py create mode 100644 openfl/experimental/interface/torch/aggregation_functions/__init__.py create mode 100644 openfl/experimental/interface/torch/aggregation_functions/weighted_average.py delete mode 100644 openfl/experimental/utilities/utils.py create mode 100644 tests/github/experimental/requirements_experimental_localruntime_tests.txt diff --git a/.gitignore b/.gitignore index 8c0419b1a9e..578b6ed1123 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ venv/* .idea *_pb2.py *_pb2_grpc.py - *.jpg *.crt *.key diff --git a/docs/workflow_interface.rst b/docs/workflow_interface.rst index 53feb71efdf..0886aa3fc96 100644 --- a/docs/workflow_interface.rst +++ b/docs/workflow_interface.rst @@ -146,30 +146,60 @@ A :code:`Runtime` defines where the flow will be executed, who the participants .. 
code-block:: python - # Setup participants - aggregator = Aggregator() - aggregator.private_attributes = {} - - # Setup collaborators with private attributes - collaborator_names = ['Portland', 'Seattle', 'Chandler','Bangalore'] - collaborators = [Collaborator(name=name) for name in collaborator_names] - for idx, collaborator in enumerate(collaborators): - local_train = deepcopy(mnist_train) - local_test = deepcopy(mnist_test) - local_train.data = mnist_train.data[idx::len(collaborators)] - local_train.targets = mnist_train.targets[idx::len(collaborators)] - local_test.data = mnist_test.data[idx::len(collaborators)] - local_test.targets = mnist_test.targets[idx::len(collaborators)] - collaborator.private_attributes = { - 'train_loader': torch.utils.data.DataLoader(local_train,batch_size=batch_size_train, shuffle=True), - 'test_loader': torch.utils.data.DataLoader(local_test,batch_size=batch_size_train, shuffle=True) + # Aggregator + aggregator_ = Aggregator() + + collaborator_names = ["Portland", "Seattle", "Chandler", "Bangalore"] + + def callable_to_initialize_collaborator_private_attributes(index, n_collaborators, batch_size, train_dataset, test_dataset): + train = deepcopy(train_dataset) + test = deepcopy(test_dataset) + train.data = train_dataset.data[index::n_collaborators] + train.targets = train_dataset.targets[index::n_collaborators] + test.data = test_dataset.data[index::n_collaborators] + test.targets = test_dataset.targets[index::n_collaborators] + + return { + "train_loader": torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True), + "test_loader": torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=True), } - # This is equivalent to: - # local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend='single_process') - local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators) - -Let's break this down, starting with the :code:`Aggregator` and :code:`Collaborator` 
placeholders. These placeholders represent the nodes where tasks will be executed. Each participant placeholder has its own set of :code:`private_attributes`; a dictionary where the key is the name of the attribute, and the value is the object. In the above example, each of the four collaborators ('Portland', 'Seattle', 'Chandler', and 'Bangalore'), have a :code:`train_loader` and `test_loader` that they can access. These private attributes can be named anything, and do not necessarily need to be the same across each participant. + # Setup collaborators private attributes via callable function + collaborators = [] + for idx, collaborator_name in enumerate(collaborator_names): + collaborators.append( + Collaborator( + name=collaborator_name, + private_attributes_callable=callable_to_initialize_collaborator_private_attributes, + index=idx, + n_collaborators=len(collaborator_names), + train_dataset=mnist_train, + test_dataset=mnist_test, + batch_size=64 + ) + ) + + local_runtime = LocalRuntime(aggregator=aggregator_, collaborators=collaborators) + +Let's break this down, starting with the :code:`Aggregator` and :code:`Collaborator` components. These components represent the *Participants* in a Federated Learning experiment. Each participant has its own set of *private attributes* that represent the information / data specific to its role or requirements. As the name suggests these *private attributes* are accessible only to the particular participant, and are appropriately inserted into or filtered out of current Flow state when transferring between Participants. For e.g. Collaborator private attributes are inserted into :code:`flow` when transitioning from Aggregator to Collaborator and are filtered out when transitioning from Collaborator to Aggregator. + +In the above :code:`FederatedFlow`, each collaborator accesses train and test datasets via *private attributes* :code:`train_loader` and :code:`test_loader`.
These *private attributes* need to be set using a (user defined) callback function while instantiating the participant. Participant *private attributes* are returned by the callback function in form of a dictionary, where the key is the name of the attribute and the value is the object. + +In this example callback function :code:`callable_to_initialize_collaborator_private_attributes()` returns the collaborator private attributes :code:`train_loader` and :code:`test_loader` that are accessed by collaborator steps (:code:`aggregated_model_validation`, :code:`train` and :code:`local_model_validation`). Some important points to remember while creating callback function and private attributes are: + + - Callback Function needs to be defined by the user and should return the *private attributes* required by the participant in form of a key/value pair + - In above example multiple collaborators have the same callback function. Depending on the Federated Learning requirements, user can specify unique callback functions for each Participant + - If no Callback Function is specified then the Participant shall not have any *private attributes* + - Callback function can be provided with any parameters required as arguments. In this example, parameters essential for the callback function are supplied with corresponding values bearing *same names* during the instantiation of the Collaborator + + * :code:`index`: Index of the particular collaborator needed to shard the dataset + * :code:`n_collaborators`: Total number of collaborators in which the dataset is sharded + * :code:`batch_size`: For the train and test loaders + * :code:`train_dataset`: Train Dataset to be sharded between n_collaborators + * :code:`test_dataset`: Test Dataset to be sharded between n_collaborators + + - Callback function needs to be specified by user while instantiating the participant. 
Callback function is invoked by the OpenFL runtime at the time participant is created and once created these attributes cannot be modified + - Private attributes are accessible only in the Participant steps Now let's see how the runtime for a flow is assigned, and the flow gets run: @@ -184,23 +214,43 @@ And that's it! This will run an instance of the :code:`FederatedFlow` on a singl Runtime Backends ================ -The Runtime defines where code will run, but the Runtime has a :code:`Backend` - which defines the underlying implementation of *how* the flow will be executed. :code:`'single_process'` is the default in the :code:`LocalRuntime`: it executes all code sequentially within a single python process, and is well suited to run both on high spec and low spec hardware. For users with large servers or multiple GPUs they wish to take advantage of, we also provide a `Ray ` backend. The Ray backend enables parallel task execution for collaborators, and optionally allows users to request dedicated GPUs for collaborator tasks in the placement decorator, as follows: +The Runtime defines where code will run, but the Runtime has a :code:`Backend` - which defines the underlying implementation of *how* the flow will be executed. :code:`single_process` is the default in the :code:`LocalRuntime`: it executes all code sequentially within a single python process, and is well suited to run both on high spec and low spec hardware + +For users with large servers or multiple GPUs they wish to take advantage of, we also provide a :code:`ray` `` backend. The Ray backend enables parallel task execution for collaborators, and optionally allows users to request dedicated CPU / GPUs for Participants by using the :code:`num_cpus` and :code:`num_gpus` arguments while instantiating the Participant in following manner: .. code-block:: python - ExampleDedicatedGPUFlow(FLSpec): - ... 
- # We request one dedicated GPU for this task - @collaborator(num_gpus=1) - def training(self): - print(f'CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')) - self.loss = train_func(self.model, self.train_loader) - self.next(self.validation) - ... - + # Aggregator + aggregator_ = Aggregator(num_gpus=0.2) + + collaborator_names = ["Portland", "Seattle", "Chandler", "Bangalore"] + + def callable_to_initialize_collaborator_private_attributes(index, n_collaborators, batch_size, train_dataset, test_dataset): + ... + + # Setup collaborators private attributes via callable function + collaborators = [] + for idx, collaborator_name in enumerate(collaborator_names): + collaborators.append( + Collaborator( + name=collaborator_name, + num_gpus=0.2, # Number of the GPU allocated to Participant + private_attributes_callable=callable_to_initialize_collaborator_private_attributes, + index=idx, + n_collaborators=len(collaborator_names), + train_dataset=mnist_train, + test_dataset=mnist_test, + batch_size=64 + ) + ) + # The Ray Backend will now be used for local execution local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend='ray') +In the above example, we have used :code:`num_gpus=0.2` while instantiating Aggregator and Collaborator to specify that each participant shall use 1/5th of GPU - this results in one GPU being dedicated for a total of 4 collaborators and 1 Aggregator. Users can tune these arguments based on their Federated Learning requirements and available hardware resources. Configurations where one Participant is shared across GPUs are not supported. For e.g. trying to run 5 participants on 2 GPU hardware with :code:`num_gpus=0.4` will not work since 80% of each GPU is allocated to 4 participants and 5th participant does not have any available GPU remaining for use. + +**Note:** It is not necessary to have ALL the participants use GPUs. For e.g. only the Collaborator are allocated to GPUs.
In this scenario user should ensure that the artifacts returned by Collaborators to Aggregator (e.g. locally trained model object) should be loaded back to CPU before exiting the collaborator step (i.e. before the join step). As Tensorflow manages the object allocation by default therefore this step is needed only for Pytorch. + Debugging with the Metaflow Client ================================== @@ -218,19 +268,38 @@ After the flow has started running, you can use the Metaflow Client to get inter .. code-block:: python - from metaflow import Flow, Run, Task, Step - + from metaflow import Metaflow, Flow, Step, Task + + # Initialize Metaflow object and obtain list of executed flows: + m = Metaflow() + list(m) + > [Flow('FederatedFlow'), Flow('AggregatorValidationFlow'), Flow('FederatedFlow_MNIST_Watermarking')] + # The name of the flow is the name of the class - flow = Flow('FederatedFlow') - run = flow.latest_run + # Identify the Flow name + flow_name = 'FederatedFlow' + + # List all instances of Federatedflow executed under distinct run IDs + flow = Flow(flow_name) + list(flow) + > [Run('FederatedFlow/1692946840822001'), + Run('FederatedFlow/1692946796234386'), + Run('FederatedFlow/1692902602941163'), + Run('FederatedFlow/1692902559123920'),] + + # To Retrieve the latest run of the Federatedflow + run = Flow(flow_name).latest_run + print(run) + > Run('FederatedFlow/1692946840822001') + list(run) - > [Step('FederatedFlow/1671152854447797/end'), - Step('FederatedFlow/1671152854447797/join'), - Step('FederatedFlow/1671152854447797/local_model_validation'), - Step('FederatedFlow/1671152854447797/train'), - Step('FederatedFlow/1671152854447797/aggregated_model_validation'), - Step('FederatedFlow/1671152854447797/start')] - step = Step('FederatedFlow/1671152854447797/aggregated_model_validation') + > [Step('FederatedFlow/1692946840822001/end'), + Step('FederatedFlow/1692946840822001/join'), + Step('FederatedFlow/1692946840822001/local_model_validation'), + 
Step('FederatedFlow/1692946840822001/train'), + Step('FederatedFlow/1692946840822001/aggregated_model_validation'), + Step('FederatedFlow/1692946840822001/start')] + step = Step('FederatedFlow/1692946840822001/aggregated_model_validation') for task in step: if task.data.input == 'Portland': print(task.data) @@ -260,6 +329,37 @@ And if we wanted to get log or error message for that task, you can just run: print(portland_task.stderr) > [No output] +Also, if we wanted to get the best model and the last model, you can just run: + +.. code-block:: python + + # Choose the specific step containing the desired models (e.g., 'join' step): + step = Step('FederatedFlow/1692946840822001/join') + list(step) + > [Task('FederatedFlow/1692946840822001/join/12'),--> Round 3 + Task('FederatedFlow/1692946840822001/join/9'), --> Round 2 + Task('FederatedFlow/1692946840822001/join/6'), --> Round 1 + Task('FederatedFlow/1692946840822001/join/3')] --> Round 0 + + """The sequence of tasks represents each round, with the most recent task corresponding to the final round and the preceding tasks indicating the previous rounds + in chronological order. + To determine the best model, analyze the command line logs and model accuracy for each round. Then, provide the corresponding task ID associated with that Task""" + task = Task('FederatedFlow/1692946840822001/join/9') + + # Access the best model and its associated data + best_model = task.data.model + best_local_model_accuracy = task.data.local_model_accuracy + best_aggregated_model_accuracy = task.data.aggregated_model_accuracy + + # To retrieve the last model, select the most recent Task i.e last round.
+ task = Task('FederatedFlow/1692946840822001/join/12') + last_model = task.data.model + + # Save the chosen models using a suitable framework (e.g., PyTorch in this example): + import torch + torch.save(last_model.state_dict(), PATH) + torch.save(best_model.state_dict(), PATH) + While this information is useful for debugging, depending on your workflow it may require significant disk space. For this reason, `checkpoint` is disabled by default. Runtimes: Future Plans @@ -279,4 +379,3 @@ Our goal is to make it a one line change to configure where and how a flow is ex federated_runtime = FederatedRuntime(...) flow.runtime = federated_runtime flow.run() - diff --git a/openfl-tutorials/experimental/Global_DP/Workflow_Interface_Mnist_Implementation_1.py b/openfl-tutorials/experimental/Global_DP/Workflow_Interface_Mnist_Implementation_1.py index eae51d76c3a..8b9dbf4722d 100644 --- a/openfl-tutorials/experimental/Global_DP/Workflow_Interface_Mnist_Implementation_1.py +++ b/openfl-tutorials/experimental/Global_DP/Workflow_Interface_Mnist_Implementation_1.py @@ -39,7 +39,7 @@ # Fixing the seed for result repeatation: remove below to stop repeatable runs # ---------------------------------- random_seed = 5495300300540669060 -g_device = torch.Generator(device='cuda') +g_device = torch.Generator(device="cuda") # Uncomment the line below to use g_cpu if not using cuda # g_device = torch.Generator() # noqa: E800 # NOTE: remove below to stop repeatable runs @@ -93,7 +93,6 @@ def forward(self, x): def default_optimizer(model): - """ Return a new optimizer: we have only tested torch.optim.SGD w/ momentum however, we encouraging users to test other optimizers (i.e. torch.optim.Adam) @@ -106,7 +105,6 @@ def default_optimizer(model): def FedAvg(models, previous_global_model=None, dp_params=None): # NOQA: N802 - """ Return a Federated average model based on Fedavg algorithm: H. B. Mcmahan, E. Moore, D. Ramage, S. Hampson, and B. A. 
Y.Arcas, @@ -152,7 +150,7 @@ def FedAvg(models, previous_global_model=None, dp_params=None): # NOQA: N802 if len(state_dicts) > 1: for key in models[0].state_dict(): state_dict[key] = np.sum( - [state[key] for state in state_dicts], axis=0 + np.array([state[key] for state in state_dicts], dtype=object), axis=0 ) / len(models) new_model.load_state_dict(state_dict) return new_model @@ -181,7 +179,6 @@ def inference(network, test_loader, device): def optimizer_to_device(optimizer, device): - """ Sending the "torch.optim.Optimizer" object into the specified device for model training and inference @@ -244,7 +241,6 @@ def clip_testing_on_optimizer_parameters( def validate_dp_params(dp_params): - """ The differential privacy block should have the exact keys as provided below. @@ -275,7 +271,6 @@ def validate_dp_params(dp_params): def parse_config(config_path): - """ Parse "test_config.yml". @@ -288,7 +283,6 @@ def parse_config(config_path): def add_noise_on_aggegated_parameters(collaborators, model, dp_params): - """ Adds noise on aggregated model parameters performed at the aggregator. 
@@ -404,10 +398,7 @@ def start(self): print(f"No collaborator selected for training at Round: {self.round}") self.next(self.check_round_completion) - # Uncomment this if you don't have GPU in the machine and - # want this application to run on CPU instead - # @collaborator - @collaborator(num_gpus=1) # Assuming GPU(s) is available in the machine + @collaborator def aggregated_model_validation(self): print(f"Performing aggregated model validation for collaborator {self.input}") self.model = self.model.to(self.device) @@ -422,10 +413,7 @@ def aggregated_model_validation(self): self.collaborator_name = self.input self.next(self.train) - # Uncomment this if you don't have GPU in the machine - # and want this application to run on CPU instead - # @collaborator - @collaborator(num_gpus=1) # Assuming GPU(s) is available in the machine + @collaborator def train(self): print(f"Performing model training for collaborator {self.input}") self.optimizer = ClipOptimizer( @@ -490,10 +478,7 @@ def train(self): torch.cuda.empty_cache() self.next(self.local_model_validation) - # Uncomment this if you don't have GPU in the machine - # and want this application to run on CPU instead - # @collaborator - @collaborator(num_gpus=1) # Assuming GPU(s) is available in the machine + @collaborator def local_model_validation(self): print(f"Performing local model validation for collaborator {self.input}") self.local_validation_score = inference( @@ -615,7 +600,6 @@ def end(self): if __name__ == "__main__": - argparser = argparse.ArgumentParser(description=__doc__) argparser.add_argument( "--config_path", help="Absolute path to the flow configuration file" @@ -628,11 +612,16 @@ def end(self): args = argparser.parse_args() + if torch.cuda.is_available(): + device = torch.device("cuda:0") + else: + device = torch.device("cpu") + # Setup participants - aggregator = Aggregator() - aggregator.private_attributes = {} + # Set `num_gpus=0.09` to `num_gpus=0.0` in order to run this tutorial on CPU + 
aggregator = Aggregator(num_gpus=0.09) - # Setup collaborators with private attributes + # Collaborator names collaborator_names = [ "Portland", "Seattle", @@ -645,34 +634,46 @@ def end(self): "CostaRica", "Guadalajara", ] - collaborators = [Collaborator(name=name) for name in collaborator_names] - - if torch.cuda.is_available(): - device = torch.device( - "cuda:0" - ) # This will enable Ray library to reserve available GPU(s) for the task - else: - # Uncomment appropriate collaborator decorators in FederatedFlow class if - # you want the application to run on CPU - device = torch.device("cpu") - for idx, collab in enumerate(collaborators): - local_train = deepcopy(mnist_train) - local_test = deepcopy(mnist_test) - local_train.data = mnist_train.data[idx::len(collaborators)] - local_train.targets = mnist_train.targets[idx::len(collaborators)] - local_test.data = mnist_test.data[idx::len(collaborators)] - local_test.targets = mnist_test.targets[idx::len(collaborators)] - collab.private_attributes = { - "train_loader": DataLoader( - local_train, batch_size=batch_size_train, shuffle=True + def callable_to_initialize_collaborator_private_attributes( + index, n_collaborators, batch_size, train_dataset, test_dataset + ): + train = deepcopy(train_dataset) + test = deepcopy(test_dataset) + train.data = train_dataset.data[index::n_collaborators] + train.targets = train_dataset.targets[index::n_collaborators] + test.data = test_dataset.data[index::n_collaborators] + test.targets = test_dataset.targets[index::n_collaborators] + + return { + "train_loader": torch.utils.data.DataLoader( + train, batch_size=batch_size, shuffle=True ), - "test_loader": DataLoader( - local_test, batch_size=batch_size_train, shuffle=True + "test_loader": torch.utils.data.DataLoader( + test, batch_size=batch_size, shuffle=True ), } - local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators) + collaborators = [] + for idx, collaborator_name in enumerate(collaborator_names): + 
collaborators.append( + Collaborator( + name=collaborator_name, + private_attributes_callable=callable_to_initialize_collaborator_private_attributes, + # Set `num_gpus=0.09` to `num_gpus=0.0` in order to run this tutorial on CPU + num_cpus=0.0, + num_gpus=0.09, # Assuming GPU(s) is available in the machine + index=idx, + n_collaborators=len(collaborator_names), + batch_size=batch_size_train, + train_dataset=mnist_train, + test_dataset=mnist_test, + ) + ) + + local_runtime = LocalRuntime( + aggregator=aggregator, collaborators=collaborators, backend="ray" + ) print(f"Local runtime collaborators = {local_runtime.collaborators}") top_model_accuracy = 0 diff --git a/openfl-tutorials/experimental/Global_DP/Workflow_Interface_Mnist_Implementation_2.py b/openfl-tutorials/experimental/Global_DP/Workflow_Interface_Mnist_Implementation_2.py index 88dab0269d9..e9f7a16a1b9 100644 --- a/openfl-tutorials/experimental/Global_DP/Workflow_Interface_Mnist_Implementation_2.py +++ b/openfl-tutorials/experimental/Global_DP/Workflow_Interface_Mnist_Implementation_2.py @@ -35,7 +35,7 @@ random_seed = 5495300300540669060 -g_device = torch.Generator(device='cuda') +g_device = torch.Generator(device="cuda") # Uncomment the line below to use g_cpu if not using cuda # g_device = torch.Generator() # noqa: E800 # NOTE: remove below to stop repeatable runs @@ -400,10 +400,7 @@ def start(self): self.round += 1 self.next(self.start) - # Uncomment below line if you are using the ray backend and - # do not have a GPU accessible - # @collaborator - @collaborator(num_gpus=1) # Assuming GPU(s) is available in the machine + @collaborator def aggregated_model_validation(self): print(f"Performing aggregated model validation for collaborator {self.input}") self.model = self.model.to(self.device) @@ -418,10 +415,7 @@ def aggregated_model_validation(self): self.collaborator_name = self.input self.next(self.train) - # Uncomment below line if you are using the ray backend and - # do not have a GPU accessible - 
# @collaborator - @collaborator(num_gpus=1) # Assuming GPU(s) is available in the machine + @collaborator def train(self): print(f"Performing model training for collaborator {self.input}") @@ -487,10 +481,7 @@ def train(self): torch.cuda.empty_cache() self.next(self.local_model_validation) - # Uncomment below line if you are using the ray backend and - # do not have a GPU accessible - # @collaborator - @collaborator(num_gpus=1) # Assuming GPU(s) is available in the machine + @collaborator def local_model_validation(self): print(f"Performing local model validation for collaborator {self.input}") self.local_validation_score = inference( @@ -588,7 +579,6 @@ def end(self): if __name__ == "__main__": - argparser = argparse.ArgumentParser(description=__doc__) argparser.add_argument( "--config_path", help="Absolute path to the flow configuration file." @@ -601,9 +591,14 @@ def end(self): args = argparser.parse_args() + if torch.cuda.is_available(): + device = torch.device("cuda:0") + else: + device = torch.device("cpu") + # Setup participants - aggregator = Aggregator() - aggregator.private_attributes = {} + # Set `num_gpus=0.09` to `num_gpus=0.0` in order to run this tutorial on CPU + aggregator = Aggregator(num_gpus=0.09) # Setup collaborators with private attributes collaborator_names = [ @@ -618,35 +613,48 @@ def end(self): "CostaRica", "Guadalajara", ] - collaborators = [Collaborator(name=name) for name in collaborator_names] - if torch.cuda.is_available(): - device = torch.device( - "cuda:0" - ) # This will enable Ray library to reserve available GPU(s) for the task - else: - # Uncomment appropriate collaborator decorators in FederatedFlow class if - # you want the application to run on CPU - device = torch.device("cpu") - - for idx, collab in enumerate(collaborators): - local_train = deepcopy(mnist_train) - local_test = deepcopy(mnist_test) - local_train.data = mnist_train.data[idx::len(collaborators)] - local_train.targets = 
mnist_train.targets[idx::len(collaborators)] - local_test.data = mnist_test.data[idx::len(collaborators)] - local_test.targets = mnist_test.targets[idx::len(collaborators)] - collab.private_attributes = { + def callable_to_initialize_collaborator_private_attributes( + index, n_collaborators, batch_size, train_dataset, test_dataset + ): + train = deepcopy(train_dataset) + test = deepcopy(test_dataset) + train.data = train_dataset.data[index::n_collaborators] + train.targets = train_dataset.targets[index::n_collaborators] + test.data = test_dataset.data[index::n_collaborators] + test.targets = test_dataset.targets[index::n_collaborators] + + return { "train_loader": torch.utils.data.DataLoader( - local_train, batch_size=batch_size_train, shuffle=True + train, batch_size=batch_size, shuffle=True ), "test_loader": torch.utils.data.DataLoader( - local_test, batch_size=batch_size_train, shuffle=True + test, batch_size=batch_size, shuffle=True ), } - local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators) + collaborators = [] + for idx, collaborator_name in enumerate(collaborator_names): + collaborators.append( + Collaborator( + name=collaborator_name, + private_attributes_callable=callable_to_initialize_collaborator_private_attributes, + # Set `num_gpus=0.09` to `num_gpus=0.0` in order to run this tutorial on CPU + num_cpus=0.0, + num_gpus=0.09, # Assuming GPU(s) is available in the machine + index=idx, + n_collaborators=len(collaborator_names), + batch_size=batch_size_train, + train_dataset=mnist_train, + test_dataset=mnist_test, + ) + ) + + local_runtime = LocalRuntime( + aggregator=aggregator, collaborators=collaborators, backend="ray" + ) print(f"Local runtime collaborators = {local_runtime.collaborators}") + best_model = None initial_model = Net() top_model_accuracy = 0 diff --git a/openfl-tutorials/experimental/Global_DP/requirements_global_dp.txt b/openfl-tutorials/experimental/Global_DP/requirements_global_dp.txt index 
1a1c19a1773..fafb9bf8eb0 100644 --- a/openfl-tutorials/experimental/Global_DP/requirements_global_dp.txt +++ b/openfl-tutorials/experimental/Global_DP/requirements_global_dp.txt @@ -1,6 +1,6 @@ numpy==1.23.3 torch==1.13.1 -torchvision==0.13.1 +torchvision==0.14.1 opacus==1.2.0 matplotlib==3.6.0 pillow==10.2.0 diff --git a/openfl-tutorials/experimental/Privacy_Meter/cifar10_PM.py b/openfl-tutorials/experimental/Privacy_Meter/cifar10_PM.py index 84fb8ecd568..8a185065f28 100644 --- a/openfl-tutorials/experimental/Privacy_Meter/cifar10_PM.py +++ b/openfl-tutorials/experimental/Privacy_Meter/cifar10_PM.py @@ -146,7 +146,6 @@ def inference(network, test_loader, device): def optimizer_to_device(optimizer, device): - """ Sending the "torch.optim.Optimizer" object into the specified device for model training and inference @@ -345,8 +344,7 @@ def start(self): exclude=["private"], ) - # @collaborator # Uncomment if you want ro run on CPU - @collaborator(num_gpus=1) # Assuming GPU(s) is available in the machine + @collaborator def aggregated_model_validation(self): print( ( @@ -359,8 +357,7 @@ def aggregated_model_validation(self): self.collaborator_name = self.input self.next(self.train) - # @collaborator # Uncomment if you want ro run on CPU - @collaborator(num_gpus=1) # Assuming GPU(s) is available on the machine + @collaborator def train(self): print(20 * "#") print( @@ -417,8 +414,7 @@ def train(self): torch.cuda.empty_cache() self.next(self.local_model_validation) - # @collaborator # Uncomment if you want ro run on CPU - @collaborator(num_gpus=1) # Assuming GPU(s) is available in the machine + @collaborator def local_model_validation(self): print( ( @@ -456,8 +452,7 @@ def local_model_validation(self): else: self.next(self.join, exclude=["training_completed"]) - # @collaborator # Uncomment if you want ro run on CPU - @collaborator(num_gpus=1) # Assuming GPU(s) is available in the machine + @collaborator def audit(self): print( ( @@ -585,7 +580,6 @@ def end(self): if 
__name__ == "__main__": - argparser = argparse.ArgumentParser(description=__doc__) argparser.add_argument( "--audit_dataset_ratio", @@ -662,27 +656,24 @@ def end(self): args = argparser.parse_args() # Setup participants - aggregator = Aggregator() - aggregator.private_attributes = {} + # Set `num_gpus=0.0` to `num_gpus=0.3` to run on GPU + aggregator = Aggregator(num_gpus=0.0) - # Setup collaborators with private attributes collaborator_names = ["Portland", "Seattle"] - collaborators = [Collaborator(name=name) for name in collaborator_names] if torch.cuda.is_available(): - device = torch.device( - "cuda:0" - ) # This will enable Ray library to reserve available GPU(s) for the task + device = torch.device("cuda:0") else: device = torch.device("cpu") + # Download and setup the train, and test dataset transform = transforms.Compose([transforms.ToTensor()]) cifar_train = CIFAR10(root="./data", train=True, download=True, transform=transform) cifar_test = CIFAR10(root="./data", train=False, download=True, transform=transform) - # split the dataset + # Split the dataset in train, test, and population dataset N_total_samples = len(cifar_test) + len(cifar_train) train_dataset_size = int(N_total_samples * args.train_dataset_ratio) test_dataset_size = int(N_total_samples * args.test_dataset_ratio) @@ -699,9 +690,9 @@ def end(self): train_dataset.targets = Y[:train_dataset_size] test_dataset = deepcopy(cifar_test) - test_dataset.data = X[train_dataset_size:train_dataset_size + test_dataset_size] + test_dataset.data = X[train_dataset_size: train_dataset_size + test_dataset_size] test_dataset.targets = Y[ - train_dataset_size:train_dataset_size + test_dataset_size + train_dataset_size: train_dataset_size + test_dataset_size ] population_dataset = deepcopy(cifar_test) @@ -717,22 +708,25 @@ def end(self): ) ) - # partition the dataset for clients - for idx, collab in enumerate(collaborators): - + # Split train, test, and population dataset among collaborators + # this function 
will be called before executing collaborator steps + # which will return private attributes dictionary for each collaborator + def callable_to_initialize_collaborator_private_attributes( + index, n_collaborators, train_ds, test_ds, population_ds, args + ): # construct the training and test and population dataset - local_train = deepcopy(train_dataset) - local_test = deepcopy(test_dataset) - local_population = deepcopy(population_dataset) + local_train = deepcopy(train_ds) + local_test = deepcopy(test_ds) + local_population = deepcopy(population_ds) - local_train.data = train_dataset.data[idx::len(collaborators)] - local_train.targets = train_dataset.targets[idx::len(collaborators)] + local_train.data = train_ds.data[index::n_collaborators] + local_train.targets = train_ds.targets[index::n_collaborators] - local_test.data = test_dataset.data[idx::len(collaborators)] - local_test.targets = test_dataset.targets[idx::len(collaborators)] + local_test.data = test_ds.data[index::n_collaborators] + local_test.targets = test_ds.targets[index::n_collaborators] - local_population.data = population_dataset.data[idx::len(collaborators)] - local_population.targets = population_dataset.targets[idx::len(collaborators)] + local_population.data = population_ds.data[index::n_collaborators] + local_population.targets = population_ds.targets[index::n_collaborators] # initialize pm report to track the privacy loss during the training local_pm_info = PM_report( @@ -763,7 +757,7 @@ def end(self): Path(local_pm_info.log_dir).mkdir(parents=True, exist_ok=True) Path(global_pm_info.log_dir).mkdir(parents=True, exist_ok=True) - collab.private_attributes = { + return { "local_pm_info": local_pm_info, "global_pm_info": global_pm_info, "train_dataset": local_train, @@ -777,9 +771,30 @@ def end(self): ), } - # To activate the ray backend with parallel collaborator tasks run in their own process - # and exclusive GPUs assigned to tasks, set LocalRuntime with backend='ray': - local_runtime = 
LocalRuntime(aggregator=aggregator, collaborators=collaborators) + collaborators = [] + for idx, collab_name in enumerate(collaborator_names): + collaborators.append( + Collaborator( + name=collab_name, + private_attributes_callable=callable_to_initialize_collaborator_private_attributes, + # If 1 GPU is available in the machine + # Set `num_gpus=0.0` to `num_gpus=0.3` to run on GPU + # with ray backend with 2 collaborators + num_cpus=0.0, + num_gpus=0.0, + index=idx, + n_collaborators=len(collaborator_names), + train_ds=train_dataset, + test_ds=test_dataset, + population_ds=population_dataset, + args=args, + ) + ) + + # Set backend='ray' to use ray-backend + local_runtime = LocalRuntime( + aggregator=aggregator, collaborators=collaborators, backend="single_process" + ) print(f"Local runtime collaborators = {local_runtime.collaborators}") diff --git a/openfl-tutorials/experimental/Privacy_Meter/requirements_privacy_meter.txt b/openfl-tutorials/experimental/Privacy_Meter/requirements_privacy_meter.txt index 8de0fc16604..6021e8f14c1 100644 --- a/openfl-tutorials/experimental/Privacy_Meter/requirements_privacy_meter.txt +++ b/openfl-tutorials/experimental/Privacy_Meter/requirements_privacy_meter.txt @@ -1,9 +1,9 @@ torch==1.13.1 -torchvision==0.14.0 +torchvision==0.14.1 matplotlib pillow opacus==1.3.0 numpy==1.23.5 cloudpickle scikit-learn -git+https://github.com/privacytrustlab/ml_privacy_meter.git@ac181a885815f85b3809317c247f422e6596cb4a \ No newline at end of file +git+https://github.com/privacytrustlab/ml_privacy_meter.git@ac181a885815f85b3809317c247f422e6596cb4a diff --git a/openfl-tutorials/experimental/Vertical_FL/Workflow_Interface_VFL_Two_Party.ipynb b/openfl-tutorials/experimental/Vertical_FL/Workflow_Interface_VFL_Two_Party.ipynb index 7e68bf0e0ef..5ba5a6fbce2 100644 --- a/openfl-tutorials/experimental/Vertical_FL/Workflow_Interface_VFL_Two_Party.ipynb +++ b/openfl-tutorials/experimental/Vertical_FL/Workflow_Interface_VFL_Two_Party.ipynb @@ -150,21 +150,47 
@@ "source": [ "# Setup participants\n", "aggregator = Aggregator()\n", - "aggregator.private_attributes['trainloader'] = trainloader\n", - "aggregator.private_attributes['label_model'] = label_model\n", - "aggregator.private_attributes['label_model_optimizer'] = label_model_optimizer\n", "\n", - "# Setup collaborators with private attributes\n", + "def callable_to_initialize_aggregator_private_attributes(train_loader,label_model,label_model_optimizer):\n", + " return {\"trainloader\": train_loader,\n", + " \"label_model\" : label_model,\n", + " \"label_model_optimizer\":label_model_optimizer\n", + " } \n", + "\n", + "# Setup aggregator private attributes via callable function\n", + "aggregator = Aggregator(\n", + " name=\"agg\",\n", + " private_attributes_callable=callable_to_initialize_aggregator_private_attributes,\n", + " train_loader = trainloader,\n", + " label_model=label_model,\n", + " label_model_optimizer=label_model_optimizer\n", + ")\n", + "\n", + "# Setup collaborators private attributes via callable function\n", "collaborator_names = ['Portland']\n", - "collaborators = [Collaborator(name=name) for name in collaborator_names]\n", "\n", - "for idx, collaborator in enumerate(collaborators):\n", - " collaborator.private_attributes['data_model'] = data_model\n", - " collaborator.private_attributes['data_model_optimizer'] = data_model_optimizer\n", - " collaborator.private_attributes['trainloader'] = deepcopy(trainloader)\n", + "def callable_to_initialize_collaborator_private_attributes(index,data_model,data_model_optimizer,train_loader):\n", + " return {\n", + " \"data_model\": data_model,\n", + " \"data_model_optimizer\": data_model_optimizer,\n", + " \"trainloader\" : deepcopy(train_loader)\n", + " }\n", + "\n", + "collaborators = []\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " collaborators.append(\n", + " Collaborator(\n", + " name=collaborator_name,\n", + " 
private_attributes_callable=callable_to_initialize_collaborator_private_attributes,\n", + " index=idx,\n", + " data_model = data_model,\n", + " data_model_optimizer = data_model_optimizer,\n", + " train_loader = trainloader\n", + " )\n", + " )\n", "\n", "local_runtime = LocalRuntime(\n", - " aggregator=aggregator, collaborators=collaborators, backend='single_process')\n", + " aggregator=aggregator, collaborators=collaborators, backend='ray')\n", "print(f'Local runtime collaborators = {local_runtime.collaborators}')\n", "\n", "epochs = 100\n", @@ -192,16 +218,6 @@ "run_id = vflow._run_id" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "composed-burst", - "metadata": {}, - "outputs": [], - "source": [ - "import metaflow" - ] - }, { "cell_type": "code", "execution_count": null, @@ -250,7 +266,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/openfl-tutorials/experimental/Vertical_FL/Workflow_Interface_Vertical_FL.ipynb b/openfl-tutorials/experimental/Vertical_FL/Workflow_Interface_Vertical_FL.ipynb index 599d670b7a4..03bd4581938 100644 --- a/openfl-tutorials/experimental/Vertical_FL/Workflow_Interface_Vertical_FL.ipynb +++ b/openfl-tutorials/experimental/Vertical_FL/Workflow_Interface_Vertical_FL.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "aedbb436", "metadata": {}, @@ -122,14 +123,15 @@ "source": [ "# Setup participants\n", "aggregator = Aggregator()\n", - "aggregator.private_attributes = {}\n", "\n", - "# Setup collaborators with private attributes\n", + "# Setup collaborators private attributes via callable function\n", "collaborator_names = ['Portland', 'Seattle', 'Chandler', 'Bangalore']\n", - "collaborators = [Collaborator(name=name) for name in collaborator_names]\n", + "collaborators = []\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " 
collaborators.append(Collaborator(name=collaborator_name))\n", "\n", "local_runtime = LocalRuntime(\n", - " aggregator=aggregator, collaborators=collaborators)\n", + " aggregator=aggregator, collaborators=collaborators,backend='ray')\n", "print(f'Local runtime collaborators = {local_runtime.collaborators}')\n", "\n", "vflow = VerticalFlow(checkpoint=True)\n", @@ -139,6 +141,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "308b5d35", "metadata": {}, @@ -156,16 +159,6 @@ "run_id = vflow._run_id" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "composed-burst", - "metadata": {}, - "outputs": [], - "source": [ - "import metaflow" - ] - }, { "cell_type": "code", "execution_count": null, @@ -286,22 +279,6 @@ "source": [ "t.data.round" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "auburn-working", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ca61148", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -320,7 +297,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/openfl-tutorials/experimental/Workflow_Interface_101_MNIST.ipynb b/openfl-tutorials/experimental/Workflow_Interface_101_MNIST.ipynb index f861522f956..156fd56ddec 100644 --- a/openfl-tutorials/experimental/Workflow_Interface_101_MNIST.ipynb +++ b/openfl-tutorials/experimental/Workflow_Interface_101_MNIST.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "14821d97", "metadata": {}, @@ -10,6 +11,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bd059520", "metadata": {}, @@ -23,6 +25,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "39c3d86a", "metadata": {}, @@ -31,6 +34,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a7989e72", "metadata": {}, @@ -39,6 +43,7 @@ ] }, { 
+ "attachments": {}, "cell_type": "markdown", "id": "fc8e35da", "metadata": {}, @@ -47,6 +52,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4dbb89b6", "metadata": {}, @@ -63,14 +69,17 @@ "source": [ "!pip install git+https://github.com/intel/openfl.git\n", "!pip install -r requirements_workflow_interface.txt\n", + "!pip install torch\n", + "!pip install torchvision\n", "\n", "# Uncomment this if running in Google Colab\n", - "#!pip install -r https://raw.githubusercontent.com/intel/openfl/develop/openfl-tutorials/experimental/requirements_workflow_interface.txt\n", - "#import os\n", - "#os.environ[\"USERNAME\"] = \"colab\"" + "# !pip install -r https://raw.githubusercontent.com/intel/openfl/develop/openfl-tutorials/experimental/requirements_workflow_interface.txt\n", + "# import os\n", + "# os.environ[\"USERNAME\"] = \"colab\"" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7237eac4", "metadata": {}, @@ -103,19 +112,29 @@ "torch.backends.cudnn.enabled = False\n", "torch.manual_seed(random_seed)\n", "\n", - "mnist_train = torchvision.datasets.MNIST('files/', train=True, download=True,\n", - " transform=torchvision.transforms.Compose([\n", - " torchvision.transforms.ToTensor(),\n", - " torchvision.transforms.Normalize(\n", - " (0.1307,), (0.3081,))\n", - " ]))\n", + "mnist_train = torchvision.datasets.MNIST(\n", + " \"./files/\",\n", + " train=True,\n", + " download=True,\n", + " transform=torchvision.transforms.Compose(\n", + " [\n", + " torchvision.transforms.ToTensor(),\n", + " torchvision.transforms.Normalize((0.1307,), (0.3081,)),\n", + " ]\n", + " ),\n", + ")\n", "\n", - "mnist_test = torchvision.datasets.MNIST('files/', train=False, download=True,\n", - " transform=torchvision.transforms.Compose([\n", - " torchvision.transforms.ToTensor(),\n", - " torchvision.transforms.Normalize(\n", - " (0.1307,), (0.3081,))\n", - " ]))\n", + "mnist_test = torchvision.datasets.MNIST(\n", + " \"./files/\",\n", + " train=False,\n", + " 
download=True,\n", + " transform=torchvision.transforms.Compose(\n", + " [\n", + " torchvision.transforms.ToTensor(),\n", + " torchvision.transforms.Normalize((0.1307,), (0.3081,)),\n", + " ]\n", + " ),\n", + ")\n", "\n", "class Net(nn.Module):\n", " def __init__(self):\n", @@ -154,6 +173,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cd268911", "metadata": {}, @@ -217,7 +237,7 @@ "source": [ "class FederatedFlow(FLSpec):\n", "\n", - " def __init__(self, model = None, optimizer = None, rounds=3, **kwargs):\n", + " def __init__(self, model=None, optimizer=None, rounds=3, **kwargs):\n", " super().__init__(**kwargs)\n", " if model is not None:\n", " self.model = model\n", @@ -225,7 +245,7 @@ " else:\n", " self.model = Net()\n", " self.optimizer = optim.SGD(self.model.parameters(), lr=learning_rate,\n", - " momentum=momentum)\n", + " momentum=momentum)\n", " self.rounds = rounds\n", "\n", " @aggregator\n", @@ -234,12 +254,12 @@ " self.collaborators = self.runtime.collaborators\n", " self.private = 10\n", " self.current_round = 0\n", - " self.next(self.aggregated_model_validation,foreach='collaborators',exclude=['private'])\n", + " self.next(self.aggregated_model_validation, foreach='collaborators', exclude=['private'])\n", "\n", " @collaborator\n", " def aggregated_model_validation(self):\n", " print(f'Performing aggregated model validation for collaborator {self.input}')\n", - " self.agg_validation_score = inference(self.model,self.test_loader)\n", + " self.agg_validation_score = inference(self.model, self.test_loader)\n", " print(f'{self.input} value of {self.agg_validation_score}')\n", " self.next(self.train)\n", "\n", @@ -250,32 +270,35 @@ " momentum=momentum)\n", " train_losses = []\n", " for batch_idx, (data, target) in enumerate(self.train_loader):\n", - " self.optimizer.zero_grad()\n", - " output = self.model(data)\n", - " loss = F.nll_loss(output, target)\n", - " loss.backward()\n", - " self.optimizer.step()\n", - " if batch_idx % 
log_interval == 0:\n", - " print('Train Epoch: 1 [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n", - " batch_idx * len(data), len(self.train_loader.dataset),\n", - " 100. * batch_idx / len(self.train_loader), loss.item()))\n", - " self.loss = loss.item()\n", - " torch.save(self.model.state_dict(), 'model.pth')\n", - " torch.save(self.optimizer.state_dict(), 'optimizer.pth')\n", + " self.optimizer.zero_grad()\n", + " output = self.model(data)\n", + " loss = F.nll_loss(output, target)\n", + " loss.backward()\n", + " self.optimizer.step()\n", + " if batch_idx % log_interval == 0:\n", + " print('Train Epoch: 1 [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n", + " batch_idx * len(data), len(self.train_loader.dataset),\n", + " 100. * batch_idx / len(self.train_loader), loss.item()))\n", + " self.loss = loss.item()\n", + " torch.save(self.model.state_dict(), 'model.pth')\n", + " torch.save(self.optimizer.state_dict(), 'optimizer.pth')\n", " self.training_completed = True\n", " self.next(self.local_model_validation)\n", "\n", " @collaborator\n", " def local_model_validation(self):\n", - " self.local_validation_score = inference(self.model,self.test_loader)\n", - " print(f'Doing local model validation for collaborator {self.input}: {self.local_validation_score}')\n", + " self.local_validation_score = inference(self.model, self.test_loader)\n", + " print(\n", + " f'Doing local model validation for collaborator {self.input}: {self.local_validation_score}')\n", " self.next(self.join, exclude=['training_completed'])\n", "\n", " @aggregator\n", - " def join(self,inputs):\n", - " self.average_loss = sum(input.loss for input in inputs)/len(inputs)\n", - " self.aggregated_model_accuracy = sum(input.agg_validation_score for input in inputs)/len(inputs)\n", - " self.local_model_accuracy = sum(input.local_validation_score for input in inputs)/len(inputs)\n", + " def join(self, inputs):\n", + " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", + " 
self.aggregated_model_accuracy = sum(\n", + " input.agg_validation_score for input in inputs) / len(inputs)\n", + " self.local_model_accuracy = sum(\n", + " input.local_validation_score for input in inputs) / len(inputs)\n", " print(f'Average aggregated model validation values = {self.aggregated_model_accuracy}')\n", " print(f'Average training loss = {self.average_loss}')\n", " print(f'Average local model validation values = {self.local_model_accuracy}')\n", @@ -283,23 +306,27 @@ " self.optimizer = [input.optimizer for input in inputs][0]\n", " self.current_round += 1\n", " if self.current_round < self.rounds:\n", - " self.next(self.aggregated_model_validation, foreach='collaborators', exclude=['private'])\n", + " self.next(self.aggregated_model_validation,\n", + " foreach='collaborators', exclude=['private'])\n", " else:\n", " self.next(self.end)\n", - " \n", + "\n", " @aggregator\n", " def end(self):\n", - " print(f'This is the end of the flow') " + " print(f'This is the end of the flow')" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2aabf61e", "metadata": {}, "source": [ - "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_loader` and `test_loader` for each of the collaborators. These are **private_attributes** that are exposed only throught he runtime. Each participant has it's own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task. \n", + "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_loader` and `test_loader` for each of the collaborators. These are **private attributes** of the participant which are specified via a callback function while instantiating the participant. 
The callback function returns the private attributes in form of a dictionary where the key is the attribute name, and the value is the object that will be made accessible to that participant's task\n", + "\n", + "The callback function, `callable_to_initialize_collaborator_private_attributes`, segment shards of the MNIST dataset for four collaborators: `Portland`, `Seattle`, `Chandler`, and `Bangalore`. Each collaborator has their own slice of the dataset that is accessible through the `train_loader` and `test_loader` attributes. Parameters required by the callback function `index`, `n_collaborators`, `train_dataset`, `test_dataset` and `batch_size` are passed appropriate values with the same names in the Collaborator constructor\n", "\n", - "Below, we segment shards of the MNIST dataset for **four collaborators**: Portland, Seattle, Chandler, and Portland. Each has their own slice of the dataset that's accessible via the `train_loader` or `test_loader` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transfering from collaborator to aggregator, or vice versa. " + "Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). 
These private attributes will always be filtered out of the current state when transfering from collaborator to aggregator, or vice versa" ] }, { @@ -309,30 +336,46 @@ "metadata": {}, "outputs": [], "source": [ - "# Setup participants\n", - "aggregator = Aggregator()\n", - "aggregator.private_attributes = {}\n", + "# Aggregator\n", + "aggregator_ = Aggregator()\n", "\n", - "# Setup collaborators with private attributes\n", - "collaborator_names = ['Portland', 'Seattle', 'Chandler','Bangalore']\n", - "collaborators = [Collaborator(name=name) for name in collaborator_names]\n", - "for idx, collaborator in enumerate(collaborators):\n", - " local_train = deepcopy(mnist_train)\n", - " local_test = deepcopy(mnist_test)\n", - " local_train.data = mnist_train.data[idx::len(collaborators)]\n", - " local_train.targets = mnist_train.targets[idx::len(collaborators)]\n", - " local_test.data = mnist_test.data[idx::len(collaborators)]\n", - " local_test.targets = mnist_test.targets[idx::len(collaborators)]\n", - " collaborator.private_attributes = {\n", - " 'train_loader': torch.utils.data.DataLoader(local_train,batch_size=batch_size_train, shuffle=True),\n", - " 'test_loader': torch.utils.data.DataLoader(local_test,batch_size=batch_size_train, shuffle=True)\n", + "collaborator_names = [\"Portland\", \"Seattle\", \"Chandler\", \"Bangalore\"]\n", + "\n", + "def callable_to_initialize_collaborator_private_attributes(index, n_collaborators, batch_size, train_dataset, test_dataset):\n", + " train = deepcopy(train_dataset)\n", + " test = deepcopy(test_dataset)\n", + " train.data = train_dataset.data[index::n_collaborators]\n", + " train.targets = train_dataset.targets[index::n_collaborators]\n", + " test.data = test_dataset.data[index::n_collaborators]\n", + " test.targets = test_dataset.targets[index::n_collaborators]\n", + "\n", + " return {\n", + " \"train_loader\": torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True),\n", + " \"test_loader\": 
torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=True),\n", " }\n", "\n", - "local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend='single_process')\n", + "# Setup collaborators private attributes via callable function\n", + "collaborators = []\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " collaborators.append(\n", + " Collaborator(\n", + " name=collaborator_name,\n", + " private_attributes_callable=callable_to_initialize_collaborator_private_attributes,\n", + " index=idx, \n", + " n_collaborators=len(collaborator_names),\n", + " train_dataset=mnist_train, \n", + " test_dataset=mnist_test, \n", + " batch_size=64\n", + " )\n", + " )\n", + "\n", + "local_runtime = LocalRuntime(aggregator=aggregator_, collaborators=collaborators,\n", + " backend=\"ray\")\n", "print(f'Local runtime collaborators = {local_runtime.collaborators}')" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "278ad46b", "metadata": {}, @@ -343,24 +386,25 @@ { "cell_type": "code", "execution_count": null, - "id": "16937a65", + "id": "a175b4d6", "metadata": {}, "outputs": [], "source": [ "model = None\n", "best_model = None\n", "optimizer = None\n", - "flflow = FederatedFlow(model,optimizer)\n", + "flflow = FederatedFlow(model, optimizer, checkpoint=True)\n", "flflow.runtime = local_runtime\n", "flflow.run()" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "c32e0844", + "id": "86b3dd2e", "metadata": {}, "source": [ - "Now that the flow has completed, let's get the final model and accuracy:" + "Now that the flow has completed, let's get the final model and accuracy" ] }, { @@ -376,6 +420,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5dd1558c", "metadata": {}, @@ -392,12 +437,13 @@ "metadata": {}, "outputs": [], "source": [ - "flflow2 = FederatedFlow(model=flflow.model,optimizer=flflow.optimizer,checkpoint=True)\n", + "flflow2 = FederatedFlow(model=flflow.model, 
optimizer=flflow.optimizer, checkpoint=True)\n", "flflow2.runtime = local_runtime\n", "flflow2.run()" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a61a876d", "metadata": {}, @@ -447,6 +493,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b55ccb19", "metadata": {}, @@ -475,6 +522,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e5efa1ff", "metadata": {}, @@ -493,6 +541,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3292b2e0", "metadata": {}, @@ -531,6 +580,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "eb1866b7", "metadata": {}, @@ -559,6 +609,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ef877a50", "metadata": {}, @@ -587,6 +638,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "9826c45f", "metadata": {}, @@ -605,6 +657,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "dd962ddc", "metadata": {}, @@ -623,6 +676,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "426f2395", "metadata": {}, diff --git a/openfl-tutorials/experimental/Workflow_Interface_102_Aggregator_Validation.ipynb b/openfl-tutorials/experimental/Workflow_Interface_102_Aggregator_Validation.ipynb index af8e9513dc9..79e9ec7ec04 100644 --- a/openfl-tutorials/experimental/Workflow_Interface_102_Aggregator_Validation.ipynb +++ b/openfl-tutorials/experimental/Workflow_Interface_102_Aggregator_Validation.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "14821d97", "metadata": {}, @@ -10,6 +11,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bd059520", "metadata": {}, @@ -18,6 +20,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fc8e35da", "metadata": {}, @@ -26,6 +29,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4dbb89b6", "metadata": {}, @@ -42,6 +46,8 @@ "source": [ "!pip install git+https://github.com/intel/openfl.git\n", "!pip install -r 
requirements_workflow_interface.txt\n", + "!pip install torch\n", + "!pip install torchvision\n", "\n", "# Uncomment this if running in Google Colab\n", "#!pip install -r https://raw.githubusercontent.com/intel/openfl/develop/openfl-tutorials/experimental/requirements_workflow_interface.txt\n", @@ -50,6 +56,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7237eac4", "metadata": {}, @@ -133,6 +140,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cd268911", "metadata": {}, @@ -171,6 +179,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b2e45614", "metadata": { @@ -279,13 +288,14 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7a133f9f", "metadata": {}, "source": [ - "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_loader` and `test_loader` for each of the collaborators. These are **private_attributes** that are exposed only throught he runtime. Each participant has it's own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task. \n", + "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not initialized with, namely the `train_loader` and `test_loader` for each of the collaborators. Each participant has it's own set of private attributes which can be set using callback function while instantiating the participant. The callback function returns the private attributes (`train_loader` & `test_loader`) in form of a dictionary where the key is the attribute name, and the value is the object that will be made accessible to that participant's task\n", "\n", - "Below, we segment shards of the MNIST dataset for **four collaborators**: Portland, Seattle, Chandler, and Portland. 
Each has their own slice of the dataset that's accessible via the `train_loader` or `test_loader` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transfering from collaborator to aggregator, or vice versa. " + "Callback function, `callable_to_initialize_collaborator_private_attributes`, segment shards of the MNIST dataset for four collaborators: `Portland`, `Seattle`, `Chandler`, and `Bangalore`. Callback function, `callable_to_initialize_aggregator_private_attributes`, returns the private attribute `test_loader` of the Aggregator." ] }, { @@ -295,37 +305,55 @@ "metadata": {}, "outputs": [], "source": [ - "# Setup participants\n", - "aggregator = Aggregator()\n", - "\n", - "# Setup collaborators with private attributes\n", "collaborator_names = ['Portland', 'Seattle', 'Chandler','Bangalore']\n", - "collaborators = [Collaborator(name=name) for name in collaborator_names]\n", "\n", - "aggregator_test = deepcopy(mnist_test)\n", - "aggregator_test.targets = mnist_test.targets[len(collaborators)::len(collaborators)+1]\n", - "aggregator_test.data = mnist_test.data[len(collaborators)::len(collaborators)+1]\n", - "aggregator.private_attributes = {\n", - " 'test_loader': torch.utils.data.DataLoader(aggregator_test,batch_size=batch_size_train, shuffle=True)\n", - "}\n", + "def callable_to_initialize_aggregator_private_attributes(n_collaborators, test_dataset, batch_size_train):\n", + " aggregator_test = deepcopy(test_dataset)\n", + " aggregator_test.targets = test_dataset.targets[n_collaborators::n_collaborators+1]\n", + " aggregator_test.data = test_dataset.data[n_collaborators::n_collaborators+1]\n", + " return {\n", + " 'test_loader': torch.utils.data.DataLoader(aggregator_test,batch_size=batch_size_train, shuffle=True)\n", + " }\n", + "\n", + "# 
Setup Aggregator private attributes via callable function\n", + "aggregator = Aggregator(\n", + " name=\"agg\",\n", + " private_attributes_callable=callable_to_initialize_aggregator_private_attributes,\n", + " n_collaborators=len(collaborator_names),\n", + " test_dataset=mnist_test, batch_size_train=batch_size_train\n", + ")\n", "\n", - "for idx, collaborator in enumerate(collaborators):\n", - " local_train = deepcopy(mnist_train)\n", - " local_test = deepcopy(mnist_test)\n", - " local_train.data = mnist_train.data[idx::len(collaborators)]\n", - " local_train.targets = mnist_train.targets[idx::len(collaborators)]\n", - " local_test.data = mnist_test.data[idx::len(collaborators)+1]\n", - " local_test.targets = mnist_test.targets[idx::len(collaborators)+1]\n", - " collaborator.private_attributes = {\n", - " 'train_loader': torch.utils.data.DataLoader(local_train,batch_size=batch_size_train, shuffle=True),\n", - " 'test_loader': torch.utils.data.DataLoader(local_test,batch_size=batch_size_train, shuffle=True)\n", + "# Setup collaborators private attributes via callable function\n", + "def callable_to_initialize_collaborator_private_attributes(index, n_collaborators, train_dataset, test_dataset, batch_size_train):\n", + " local_train = deepcopy(train_dataset)\n", + " local_test = deepcopy(test_dataset)\n", + " local_train.data = train_dataset.data[index::n_collaborators]\n", + " local_train.targets = train_dataset.targets[index::n_collaborators]\n", + " local_test.data = test_dataset.data[index::n_collaborators]\n", + " local_test.targets = test_dataset.targets[index::n_collaborators]\n", + " \n", + " return {\n", + " 'train_loader': torch.utils.data.DataLoader(local_train,batch_size=batch_size_train, shuffle=True),\n", + " 'test_loader': torch.utils.data.DataLoader(local_test,batch_size=batch_size_train, shuffle=True)\n", " }\n", "\n", - "local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend='single_process')\n", + 
"collaborators=[]\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " collaborators.append(\n", + " Collaborator(\n", + " name=collaborator_name, num_cpus=0, num_gpus=0,\n", + " private_attributes_callable=callable_to_initialize_collaborator_private_attributes,\n", + " index=idx, n_collaborators=len(collaborator_names),\n", + " train_dataset=mnist_train, test_dataset=mnist_test, batch_size_train=batch_size_train,\n", + " )\n", + " )\n", + "\n", + "local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend='ray')\n", "print(f'Local runtime collaborators = {local_runtime.collaborators}')" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0525eaa9", "metadata": {}, @@ -343,12 +371,13 @@ "model = None\n", "best_model = None\n", "optimizer = None\n", - "flflow = AggregatorValidationFlow(model,optimizer)\n", + "flflow = AggregatorValidationFlow(model, optimizer)\n", "flflow.runtime = local_runtime\n", "flflow.run()" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8b9f8d25", "metadata": {}, diff --git a/openfl-tutorials/experimental/Workflow_Interface_103_Cyclic_Institutional_Incremental_Learning.ipynb b/openfl-tutorials/experimental/Workflow_Interface_103_Cyclic_Institutional_Incremental_Learning.ipynb index 00c643542af..11e6f891049 100644 --- a/openfl-tutorials/experimental/Workflow_Interface_103_Cyclic_Institutional_Incremental_Learning.ipynb +++ b/openfl-tutorials/experimental/Workflow_Interface_103_Cyclic_Institutional_Incremental_Learning.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "14821d97", "metadata": {}, @@ -28,6 +29,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fc8e35da", "metadata": {}, @@ -36,6 +38,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4dbb89b6", "metadata": {}, @@ -52,6 +55,8 @@ "source": [ "!pip install git+https://github.com/intel/openfl.git\n", "!pip install -r 
requirements_workflow_interface.txt\n", + "!pip install torch\n", + "!pip install torchvision\n", "\n", "# Uncomment this if running in Google Colab\n", "#!pip install -r https://raw.githubusercontent.com/intel/openfl/develop/openfl-tutorials/experimental/requirements_workflow_interface.txt\n", @@ -60,6 +65,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7237eac4", "metadata": {}, @@ -143,6 +149,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cd268911", "metadata": {}, @@ -181,6 +188,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b2e45614", "metadata": { @@ -298,13 +306,24 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7a133f9f", "metadata": {}, "source": [ - "You'll notice in the `CyclicFlow` definition above that each collaborator performs **aggregated_model_validation**, **training**, and **local_model_validation** before passing it's model on to the next collaborator (through the aggregator). \n", + "You'll notice in the `CyclicFlow` definition above that each collaborator performs **aggregated_model_validation**, **training**, and **local_model_validation** before passing it's model on to the next collaborator (through the aggregator)" + ] + }, + { + "cell_type": "markdown", + "id": "a7f4c614", + "metadata": {}, + "source": [ + "Let's define the Participants and runtime now ! Each participant has it's own set of private attributes which can be set using callback function while instantiating the participant. The callback function returns the private attributes in form of a dictionary where the key is the attribute name, and the value is the object that will be made accessible to that participant's task\n", "\n", - "Below, we segment shards of the MNIST dataset for **four collaborators**: Portland, Seattle, Chandler, and Portland **equally and IID**. Each has their own slice of the dataset that's accessible via the `train_loader` or `test_loader` attribute. 
Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transfering from collaborator to aggregator, or vice versa. " + "Callback function, `callable_to_initialize_aggregator_private_attributes`, returns the private attribute `test_loader` of the Aggregator.\n", + "\n", + "Callback function, `callable_to_initialize_collaborator_private_attributes`, segment shards of the MNIST dataset for four collaborators: `Portland`, `Seattle`, `Chandler`, and `Bangalore` and returns the private attribute `train_loader` and `test_loader`" ] }, { @@ -314,37 +333,58 @@ "metadata": {}, "outputs": [], "source": [ - "# Setup participants\n", - "agg = Aggregator()\n", + "collaborator_names = ['Portland', 'Seattle','Chandler','Bangalore']\n", "\n", - "# Setup collaborators with private attributes\n", - "collaborator_names = ['Portland', 'Seattle', 'Chandler','Bangalore']\n", - "collaborators = [Collaborator(name=name) for name in collaborator_names]\n", - "\n", - "aggregator_test = deepcopy(mnist_test)\n", - "aggregator_test.targets = mnist_test.targets[len(collaborators)::len(collaborators)+1]\n", - "aggregator_test.data = mnist_test.data[len(collaborators)::len(collaborators)+1]\n", - "aggregator.private_attributes = {\n", - " 'test_loader': torch.utils.data.DataLoader(aggregator_test,batch_size=batch_size_train, shuffle=True)\n", - "}\n", - "\n", - "for idx, col in enumerate(collaborators):\n", - " local_train = deepcopy(mnist_train)\n", - " local_test = deepcopy(mnist_test)\n", - " local_train.data = mnist_train.data[idx::len(collaborators)]\n", - " local_train.targets = mnist_train.targets[idx::len(collaborators)]\n", - " local_test.data = mnist_test.data[idx::len(collaborators)+1]\n", - " local_test.targets = mnist_test.targets[idx::len(collaborators)+1]\n", - " 
col.private_attributes = {\n", - " 'train_loader': torch.utils.data.DataLoader(local_train,batch_size=batch_size_train, shuffle=True),\n", - " 'test_loader': torch.utils.data.DataLoader(local_test,batch_size=batch_size_train, shuffle=True)\n", + "def callable_to_initialize_aggregator_private_attributes(n_collaborators, test_dataset,\n", + " batch_size):\n", + " aggregator_test = deepcopy(test_dataset)\n", + " aggregator_test.targets = test_dataset.targets[n_collaborators::n_collaborators+1]\n", + " aggregator_test.data = test_dataset.data[n_collaborators::n_collaborators+1]\n", + "\n", + " return {\n", + " 'test_loader': torch.utils.data.DataLoader(aggregator_test, batch_size=batch_size, shuffle=True)\n", + " }\n", + "\n", + "# Setup Aggregator private attributes via callable function\n", + "agg = Aggregator(\n", + " name=\"agg\",\n", + " private_attributes_callable=callable_to_initialize_aggregator_private_attributes,\n", + " n_collaborators=len(collaborator_names), test_dataset=mnist_test,\n", + " batch_size=batch_size_test\n", + ")\n", + "\n", + "def callable_to_initialize_collaborator_private_attributes(index, n_collaborators,\n", + " batch_size_train, train_dataset, test_dataset):\n", + " local_train = deepcopy(train_dataset)\n", + " local_test = deepcopy(test_dataset)\n", + " local_train.data = train_dataset.data[index::n_collaborators]\n", + " local_train.targets = train_dataset.targets[index::n_collaborators]\n", + " local_test.data = test_dataset.data[index::n_collaborators+1]\n", + " local_test.targets = test_dataset.targets[index::n_collaborators+1]\n", + "\n", + " return {\n", + " 'train_loader': torch.utils.data.DataLoader(local_train, batch_size=batch_size_train, shuffle=True),\n", + " 'test_loader': torch.utils.data.DataLoader(local_test, batch_size=batch_size_train, shuffle=True)\n", " }\n", "\n", - "local_runtime = LocalRuntime(aggregator=agg, collaborators=collaborators, backend='single_process')\n", + "# Setup collaborators private attributes via 
callable function\n", + "collaborators=[]\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " collaborators.append(\n", + " Collaborator( \n", + " name=collaborator_name, num_cpus=0, num_gpus=0,\n", + " private_attributes_callable=callable_to_initialize_collaborator_private_attributes,\n", + " index=idx, n_collaborators=len(collaborator_names), batch_size_train=batch_size_train,\n", + " train_dataset=mnist_train, test_dataset=mnist_test\n", + " )\n", + " )\n", + "\n", + "local_runtime = LocalRuntime(aggregator=agg, collaborators=collaborators, backend='ray') \n", "print(f'Local runtime collaborators = {local_runtime.collaborators}')" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0525eaa9", "metadata": {}, @@ -368,6 +408,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ad93c508", "metadata": {}, @@ -482,6 +523,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e58fe9cd", "metadata": {}, @@ -496,45 +538,50 @@ "metadata": {}, "outputs": [], "source": [ - "# Setup participants\n", - "agg = Aggregator()\n", - "\n", - "# Setup collaborators with private attributes\n", "collaborator_names = ['Portland', 'Seattle', 'Chandler','Bangalore']\n", - "collaborators = [Collaborator(name=name) for name in collaborator_names]\n", - "\n", - "aggregator_test = deepcopy(mnist_test)\n", - "aggregator_test.targets = mnist_test.targets[len(collaborators)::len(collaborators)+1]\n", - "aggregator_test.data = mnist_test.data[len(collaborators)::len(collaborators)+1]\n", - "aggregator.private_attributes = {\n", - " 'test_loader': torch.utils.data.DataLoader(aggregator_test,batch_size=batch_size_train, shuffle=True)\n", - "}\n", - "\n", - "for idx, col in enumerate(collaborators):\n", - " local_train = deepcopy(mnist_train)\n", - " local_test = deepcopy(mnist_test)\n", - " local_train.data = mnist_train.data[idx::len(collaborators)]\n", - " local_train.targets = mnist_train.targets[idx::len(collaborators)]\n", - " if col.name 
== 'Portland':\n", + "\n", + "def callable_to_initialize_aggregator_private_attributes(n_collaborators, test_dataset, batch_size):\n", + " aggregator_test = deepcopy(test_dataset)\n", + " aggregator_test.targets = test_dataset.targets[n_collaborators::n_collaborators+1]\n", + " aggregator_test.data = test_dataset.data[n_collaborators::n_collaborators+1]\n", + "\n", + " return {\n", + " 'test_loader': torch.utils.data.DataLoader(aggregator_test, batch_size=batch_size, shuffle=True)\n", + " }\n", + "\n", + "# Setup Aggregator private attributes via callable function\n", + "agg = Aggregator(\n", + " name=\"agg\",\n", + " private_attributes_callable=callable_to_initialize_aggregator_private_attributes,\n", + " n_collaborators = len(collaborator_names), test_dataset=mnist_test,\n", + " batch_size=batch_size_test\n", + ")\n", + "\n", + "def callable_to_initialize_collaborator_private_attributes(index, collaborator_name, n_collaborators, batch_size_train, train_dataset, test_dataset):\n", + " local_train = deepcopy(train_dataset)\n", + " local_test = deepcopy(test_dataset)\n", + " local_train.data = train_dataset.data[index::n_collaborators]\n", + " local_train.targets = train_dataset.targets[index::n_collaborators]\n", + " if collaborator_name == 'Portland':\n", " # Remove the 0 class from Portland\n", " mask = local_train.targets != 1\n", " local_train.data = local_train.data[mask]\n", " local_train.targets = local_train.targets[mask]\n", - " if col.name == 'Seattle':\n", + " if collaborator_name == 'Seattle':\n", " # Seattle has 500 samples of class 1 (exclusively)\n", " mask = local_train.targets == 1\n", " local_train.data = local_train.data[mask]\n", " local_train.targets = local_train.targets[mask]\n", " local_train.data = local_train.data[:500]\n", " local_train.targets = local_train.targets[:500]\n", - " if col.name == 'Chandler':\n", + " if collaborator_name == 'Chandler':\n", " # Chandler has 300 samples of class 2 (exclusively)\n", " mask = local_train.targets 
== 2\n", " local_train.data = local_train.data[mask]\n", " local_train.targets = local_train.targets[mask]\n", " local_train.data = local_train.data[:300]\n", " local_train.targets = local_train.targets[:300]\n", - " if col.name == 'Bangalore':\n", + " if collaborator_name == 'Bangalore':\n", " # Bangalore has 300 samples of class 3 (exclusively)\n", " mask = local_train.targets == 3\n", " local_train.data = local_train.data[mask]\n", @@ -542,14 +589,26 @@ " local_train.data = local_train.data[:500]\n", " local_train.targets = local_train.targets[:500]\n", " # Test data is left unchanged (all classes represented)\n", - " local_test.data = mnist_test.data[idx::len(collaborators)+1]\n", - " local_test.targets = mnist_test.targets[idx::len(collaborators)+1]\n", - " col.private_attributes = {\n", - " 'train_loader': torch.utils.data.DataLoader(local_train,batch_size=batch_size_train, shuffle=True),\n", - " 'test_loader': torch.utils.data.DataLoader(local_test,batch_size=batch_size_train, shuffle=True)\n", + " local_test.data = test_dataset.data[index::n_collaborators+1]\n", + " local_test.targets = test_dataset.targets[index::n_collaborators+1]\n", + " return {\n", + " 'train_loader': torch.utils.data.DataLoader(local_train,batch_size=batch_size_train, shuffle=True),\n", + " 'test_loader': torch.utils.data.DataLoader(local_test,batch_size=batch_size_train, shuffle=True)\n", " }\n", "\n", - "local_runtime = LocalRuntime(aggregator=agg, collaborators=collaborators, backend='single_process')\n", + "# Setup collaborators private attributes via callable function\n", + "collaborators=[]\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " collaborators.append(\n", + " Collaborator(\n", + " name=collaborator_name, num_cpus=0, num_gpus=0,\n", + " private_attributes_callable=callable_to_initialize_collaborator_private_attributes,\n", + " index=idx, collaborator_name=collaborator_name, n_collaborators=len(collaborator_names),\n", + " 
batch_size_train=batch_size_train, train_dataset=mnist_train, test_dataset=mnist_test,\n", + " )\n", + " )\n", + "\n", + "local_runtime = LocalRuntime(aggregator=agg, collaborators=collaborators, backend='ray')\n", "print(f'Local runtime collaborators = {local_runtime.collaborators}')" ] }, @@ -563,7 +622,7 @@ "model = None\n", "best_model = None\n", "optimizer = None\n", - "clflow2 = CyclicLearningFlow(model,optimizer,rounds=4)\n", + "clflow2 = CyclicLearningFlow(model, optimizer, rounds=4)\n", "clflow2.runtime = local_runtime\n", "clflow2.run()" ] @@ -578,12 +637,13 @@ "model = None\n", "best_model = None\n", "optimizer = None\n", - "flflow2 = FederatedFlow(model,optimizer,rounds=4)\n", + "flflow2 = FederatedFlow(model, optimizer, rounds=4)\n", "flflow2.runtime = local_runtime\n", "flflow2.run()" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "285d63a9", "metadata": {}, @@ -592,6 +652,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8b9f8d25", "metadata": {}, diff --git a/openfl-tutorials/experimental/Workflow_Interface_104_Keras_MNIST_with_GPU.ipynb b/openfl-tutorials/experimental/Workflow_Interface_104_Keras_MNIST_with_GPU.ipynb new file mode 100644 index 00000000000..5046f373cab --- /dev/null +++ b/openfl-tutorials/experimental/Workflow_Interface_104_Keras_MNIST_with_GPU.ipynb @@ -0,0 +1,363 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Workflow Interface 104: Working with Keras\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/openfl/blob/develop/openfl-tutorials/experimental/Workflow_Interface_104_Keras_MNIST_with_GPU.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we will demonstrate how to use Keras with Workflow Interface. 
Even though this tutorial is meant to be run with GPU, in case GPU is not available this can be run on CPU as well by changing `num_gpus=0.3` to `num_gpus=0`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting Started" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we start by installing the necessary dependencies for the workflow interface. Please note if you intend to run this tutorial on GPU then install CUDA and cuDNN versions for TensorFlow 2.7 as mentioned [here](https://www.tensorflow.org/install/source#gpu)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install git+https://github.com/intel/openfl.git\n", + "!pip install -r requirements_workflow_interface.txt\n", + "!pip install tensorflow==2.7.0\n", + "\n", + "# Uncomment this if running in Google Colab\n", + "# !pip install -r https://raw.githubusercontent.com/intel/openfl/develop/openfl-tutorials/experimental/requirements_workflow_interface.txt\n", + "# import os\n", + "# os.environ[\"USERNAME\"] = \"colab\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Configure TensorFlow to allocate GPU memory as it is needed by the processes (instead of TF default policy to allocate nearly all of the memory on GPUs). Refer to [Limiting GPU memory growth](https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth)\n", + "\n", + "IMPORTANT NOTE: This is needed to demonstrate fractional usage of GPUs by Ray backend and avoid conflict between Ray and TensorFlow while allocating GPU memory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"TF_FORCE_GPU_ALLOW_GROWTH\"] = \"true\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We begin with the quintessential example of a small Keras CNN model trained on the MNIST dataset. 
Let's start by defining our dataloaders, model, optimizer, and some helper functions like we would for any other deep learning experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.keras.layers import Flatten, Dense, Dropout, Conv2D, MaxPool2D\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.datasets import mnist\n", + "from tensorflow.keras.utils import to_categorical\n", + "\n", + "nb_classes = 10\n", + "batch_size=32\n", + "(X_train, y_train), (X_test, y_test) = mnist.load_data()\n", + "print(\"X_train original shape\", X_train.shape)\n", + "print(\"y_train original shape\", y_train.shape)\n", + "\n", + "X_train = X_train.astype(\"float32\")\n", + "X_test = X_test.astype(\"float32\")\n", + "X_train /= 255.0\n", + "X_test /= 255.0\n", + "print(\"Training matrix shape\", X_train.shape)\n", + "print(\"Testing matrix shape\", X_test.shape)\n", + "\n", + "Y_train = to_categorical(y_train, nb_classes)\n", + "Y_test = to_categorical(y_test, nb_classes)\n", + "\n", + "train_dataset=(X_train, Y_train)\n", + "test_dataset=(X_test, Y_test)\n", + "\n", + "model = Sequential([\n", + " Conv2D(filters=32, kernel_size=(3, 3), activation=\"relu\", input_shape=(28, 28, 1)),\n", + " MaxPool2D(),\n", + " Flatten(),\n", + " Dense(512, activation=\"relu\"),\n", + " Dropout(0.2),\n", + " Dense(512, activation=\"relu\"),\n", + " Dropout(0.2),\n", + " Dense(nb_classes, activation=\"softmax\"),\n", + "])\n", + "\n", + "model.compile(optimizer=\"SGD\", loss=\"categorical_crossentropy\", metrics=[\"accuracy\"])\n", + "print(model.summary())\n", + "\n", + "\n", + "def inference(model, test_loader, batch_size):\n", + " x_test, y_test = test_loader\n", + " loss, accuracy = model.evaluate(\n", + " x_test,\n", + " y_test,\n", + " batch_size=batch_size,\n", + " verbose=0\n", + " )\n", + " accuracy_percentage = accuracy * 100\n", + " print(f\"Test set: Avg. 
loss: {loss}, Accuracy: {accuracy_percentage:.2f}%\")\n", + " return accuracy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we import the `FLSpec`, `LocalRuntime`, and placement decorators.\n", + "\n", + "- `FLSpec` – Defines the flow specification. User defined flows are subclasses of this.\n", + "- `Runtime` – Defines where the flow runs, infrastructure for task transitions (how information gets sent). The `LocalRuntime` runs the flow on a single node.\n", + "- `aggregator/collaborator` - placement decorators that define where the task will be assigned" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from openfl.experimental.interface import FLSpec\n", + "from openfl.experimental.runtime import LocalRuntime\n", + "from openfl.experimental.placement import aggregator, collaborator\n", + "import numpy as np\n", + "\n", + "\n", + "def FedAvg(models):\n", + " new_model = models[0]\n", + " state_dicts = [model.weights for model in models]\n", + " state_dict = new_model.weights\n", + " for idx, _ in enumerate(models[1].weights):\n", + " state_dict[idx] = np.sum(np.array([state[idx]\n", + " for state in state_dicts], dtype=object), axis=0) / len(models)\n", + " new_model.set_weights(state_dict)\n", + " return new_model" + ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we come to the flow definition. The OpenFL Workflow Interface adopts the conventions set by Metaflow, that every workflow begins with `start` and concludes with the `end` task. The aggregator begins with an optionally passed in model and optimizer. 
The aggregator begins the flow with the `start` task, where the list of collaborators is extracted from the runtime (`self.collaborators = self.runtime.collaborators`) and is then used as the list of participants to run the task listed in `self.next`, `aggregated_model_validation`. The model, optimizer, and anything that is not explicitly excluded from the next function will be passed from the `start` function on the aggregator to the `aggregated_model_validation` task on the collaborator. Where the tasks run is determined by the placement decorator that precedes each task definition (`@aggregator` or `@collaborator`). Once each of the collaborators (defined in the runtime) complete the `aggregated_model_validation` task, they pass their current state onto the `train` task, from `train` to `local_model_validation`, and then finally to `join` at the aggregator. It is in `join` that an average is taken of the model weights, and the next round can begin.\n", + "\n", + "![image.png](attachment:image.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class KerasMNISTFlow(FLSpec):\n", + " def __init__(self, model, rounds=3, **kwargs):\n", + " super().__init__(**kwargs)\n", + " self.model = model\n", + " self.n_rounds = rounds\n", + " self.current_round = 1\n", + "\n", + " @aggregator\n", + " def start(self):\n", + " self.collaborators = self.runtime.collaborators\n", + " self.next(self.aggregated_model_validation, foreach='collaborators')\n", + "\n", + " @collaborator\n", + " def aggregated_model_validation(self):\n", + " print(f'Performing aggregated model validation for collaborator {self.input}')\n", + " self.agg_validation_score = inference(self.model, self.test_loader, self.batch_size)\n", + " print(f'{self.input} value of {self.agg_validation_score}')\n", + " self.next(self.train)\n", + "\n", + " @collaborator\n", + " def train(self):\n", + " x_train, y_train = self.train_loader\n", + " history = 
self.model.fit(\n", + " x_train, y_train,\n", + " batch_size=self.batch_size,\n", + " epochs=1,\n", + " verbose=1,\n", + " )\n", + " self.loss = history.history[\"loss\"][0]\n", + " self.next(self.local_model_validation)\n", + "\n", + " @collaborator\n", + " def local_model_validation(self):\n", + " self.local_validation_score = inference(self.model, self.test_loader, self.batch_size)\n", + " print(\n", + " f'Doing local model validation for collaborator {self.input}: {self.local_validation_score}')\n", + " self.next(self.join)\n", + "\n", + " @aggregator\n", + " def join(self, inputs):\n", + " self.average_loss = sum(input.loss for input in inputs) / len(inputs)\n", + " self.aggregated_model_accuracy = sum(\n", + " input.agg_validation_score for input in inputs) / len(inputs)\n", + " self.local_model_accuracy = sum(\n", + " input.local_validation_score for input in inputs) / len(inputs)\n", + " print(f'Average aggregated model validation values = {self.aggregated_model_accuracy}')\n", + " print(f'Average training loss = {self.average_loss}')\n", + " print(f'Average local model validation values = {self.local_model_accuracy}')\n", + " print(\"Taking FedAvg of models of all collaborators\")\n", + " self.model = FedAvg([input.model for input in inputs])\n", + "\n", + " self.next(self.internal_loop)\n", + "\n", + " @aggregator\n", + " def internal_loop(self):\n", + " if self.current_round == self.n_rounds:\n", + " self.next(self.end)\n", + " else:\n", + " self.current_round += 1\n", + " self.next(self.aggregated_model_validation, foreach='collaborators')\n", + "\n", + " @aggregator\n", + " def end(self):\n", + " print(f'This is the end of the flow')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define the Participants and runtime now! Each participant has its own set of private attributes, which can be set using a callback function while instantiating the participant. 
The callback function returns the private attributes in the form of a dictionary where the key is the attribute name, and the value is the object that will be made accessible to that participant's task.\n", + "\n", + "The callback function, `callable_to_initialize_collaborator_private_attributes`, segments shards of the MNIST dataset for two collaborators, `Portland` and `Seattle`, and returns the private attributes `train_loader`, `test_loader`, and `batch_size`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from openfl.experimental.interface import Aggregator, Collaborator\n", + "\n", + "# Aggregator\n", + "agg = Aggregator()\n", + "\n", + "# Setup collaborators with private attributes\n", + "collaborator_names = [\"Portland\", \"Seattle\"]\n", + "def callable_to_initialize_collaborator_private_attributes(n_collaborators, index, train_dataset, test_dataset, batch_size):\n", + " from openfl.utilities.data_splitters import EqualNumPyDataSplitter\n", + " train_splitter = EqualNumPyDataSplitter()\n", + " test_splitter = EqualNumPyDataSplitter()\n", + "\n", + " X_train, y_train = train_dataset\n", + " X_test, y_test = test_dataset\n", + "\n", + " train_idx = train_splitter.split(y_train, n_collaborators)\n", + " valid_idx = test_splitter.split(y_test, n_collaborators)\n", + "\n", + " train_dataset = X_train[train_idx[index]], y_train[train_idx[index]]\n", + " test_dataset = X_test[valid_idx[index]], y_test[valid_idx[index]]\n", + "\n", + " return {\n", + " \"train_loader\": train_dataset, \"test_loader\": test_dataset,\n", + " \"batch_size\": batch_size\n", + " }\n", + "\n", + "collaborators = []\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " collaborators.append(\n", + " Collaborator(\n", + " name=collaborator_name, num_cpus=0, num_gpus=0.3,\n", + " private_attributes_callable=callable_to_initialize_collaborator_private_attributes,\n", + " n_collaborators=len(collaborator_names), index=idx, 
train_dataset=(X_train, Y_train),\n", + " test_dataset=(X_test, Y_test), batch_size=32\n", + " )\n", + " )\n", + "\n", + "local_runtime = LocalRuntime(aggregator=agg, collaborators=collaborators, backend=\"ray\")\n", + "print(f'Local runtime collaborators = {local_runtime.collaborators}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our flow and runtime defined, let's run the experiment! " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "flflow = KerasMNISTFlow(model, rounds=3, checkpoint=True)\n", + "flflow.runtime = local_runtime\n", + "flflow.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Congratulations!\n", + "Now that you've completed this notebook, see some of the more advanced things you can do in our [other tutorials](broken_link), including:\n", + "\n", + "- Using the LocalRuntime Ray Backend for dedicated GPU access\n", + "- Vertical Federated Learning\n", + "- Model Watermarking\n", + "- Differential Privacy\n", + "- And More!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "keras_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/openfl-tutorials/experimental/Workflow_Interface_201_Exclusive_GPUs_with_Ray.ipynb b/openfl-tutorials/experimental/Workflow_Interface_201_Exclusive_GPUs_with_Ray.ipynb index 0c5f2592724..34e7adcb4c8 100644 --- a/openfl-tutorials/experimental/Workflow_Interface_201_Exclusive_GPUs_with_Ray.ipynb +++ b/openfl-tutorials/experimental/Workflow_Interface_201_Exclusive_GPUs_with_Ray.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "14821d97", "metadata": {}, @@ -10,6 +11,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bd059520", "metadata": {}, @@ -18,6 +20,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fc8e35da", "metadata": {}, @@ -26,6 +29,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4dbb89b6", "metadata": {}, @@ -42,6 +46,8 @@ "source": [ "!pip install git+https://github.com/intel/openfl.git\n", "!pip install -r requirements_workflow_interface.txt\n", + "!pip install torch\n", + "!pip install torchvision\n", "\n", "# Uncomment this if running in Google Colab\n", "#!pip install -r https://raw.githubusercontent.com/intel/openfl/develop/openfl-tutorials/experimental/requirements_workflow_interface.txt\n", @@ -50,6 +56,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7237eac4", "metadata": {}, @@ -138,6 +145,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cd268911", "metadata": {}, @@ -178,13 +186,14 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b2e45614", "metadata": { 
"scrolled": true }, "source": [ - "Now we come to the updated flow definition. Here we request `@collaborator(num_gpus=1)` as the placement decorator, which will require a dedicated GPU for each collaborator task. Tune this based on your use case, but because this uses Ray internally, you can also pass through a [fraction of a GPU](https://docs.ray.io/en/latest/ray-core/tasks/using-ray-with-gpus.html#fractional-gpus), which will allow more than one task to run on each GPU (i.e. `@collaborator(num_gpus=0.5)` would result in two tasks per GPU). " + "Now we come to the updated flow definition." ] }, { @@ -215,14 +224,14 @@ " self.current_round = 0\n", " self.next(self.aggregated_model_validation,foreach='collaborators',exclude=['private'])\n", "\n", - " @collaborator(num_gpus=1)\n", + " @collaborator\n", " def aggregated_model_validation(self):\n", " print(f'Performing aggregated model validation for collaborator {self.input}')\n", " self.agg_validation_score = inference(self.model,self.test_loader)\n", " print(f'{self.input} value of {self.agg_validation_score}')\n", " self.next(self.train)\n", "\n", - " @collaborator(num_gpus=1)\n", + " @collaborator\n", " def train(self):\n", " \"\"\"\n", " Train the model.\n", @@ -247,7 +256,7 @@ " self.loss = loss.item()\n", " self.next(self.local_model_validation)\n", "\n", - " @collaborator(num_gpus=1)\n", + " @collaborator\n", " def local_model_validation(self):\n", " self.local_validation_score = inference(self.model,self.test_loader)\n", " print(f'Doing local model validation for collaborator {self.input}: {self.local_validation_score}')\n", @@ -271,19 +280,18 @@ " \n", " @aggregator\n", " def end(self):\n", - " print(f'This is the end of the flow') " + " print(f'This is the end of the flow')" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "7a133f9f", + "id": "49c4afa8", "metadata": {}, "source": [ - "You'll notice in the `FederatedFlow` definition above that there were certain attributes that the flow was not 
initialized with, namely the `train_loader` and `test_loader` for each of the collaborators. These are **private_attributes** that are exposed only throught he runtime. Each participant has it's own set of private attributes: a dictionary where the key is the attribute name, and the value is the object that will be made accessible through that participant's task. \n", + "In this step we define entities necessary to run the flow and create a function which returns dataset as private attributes of collaborator. As described in [quickstart](https://github.com/securefederatedai/openfl/blob/develop/openfl-tutorials/experimental/Workflow_Interface_101_MNIST.ipynb) we define entities necessary for the flow.\n", "\n", - "Below, we segment shards of the MNIST dataset for **four collaborators**: Portland, Seattle, Chandler, and Portland. Each has their own slice of the dataset that's accessible via the `train_loader` or `test_loader` attribute. Note that the private attributes are flexible, and you can choose to pass in a completely different type of object to any of the collaborators or aggregator (with an arbitrary name). These private attributes will always be filtered out of the current state when transfering from collaborator to aggregator, or vice versa. \n", - "\n", - "The LocalRuntime is now initialized **without** the backend argument. Now the LocalRuntime will default to `backend='ray'`, which allows passing through the `num_gpus` argument in the placement decorator" + "To request GPU(s) with ray-backend, we specify `num_gpus=0.3` as the argument while instantiating Aggregator and Collaborator, this will reserve 0.3 GPU for each of the 2 collaborators and the aggregator and therefore require a dedicated GPU for the experiment. Tune this based on your use case, for example `num_gpus=0.4` for an experiment with 4 collaborators and the aggregator will require 2 dedicated GPUs. 
**NOTE:** Collaborator cannot span over multiple GPUs, for example `num_gpus=0.4` with 5 collaborators will require 3 dedicated GPUs. In this case collaborator 1 and 2 use GPU#1, collaborator 3 and 4 use GPU#2, and collaborator 5 uses GPU#3." ] }, { @@ -293,39 +301,52 @@ "metadata": {}, "outputs": [], "source": [ - "# Setup participants\n", - "aggregator = Aggregator()\n", - "aggregator.private_attributes = {}\n", + "# Setup Aggregator private attributes via callable function\n", + "aggregator = Aggregator(num_gpus=0.3)\n", + "\n", + "collaborator_names = ['Portland', 'Seattle']\n", + "\n", + "def callable_to_initialize_collaborator_private_attributes(index, n_collaborators,\n", + " train_dataset, test_dataset, batch_size_train):\n", + " local_train = deepcopy(train_dataset)\n", + " local_test = deepcopy(test_dataset)\n", + " local_train.data = train_dataset.data[index::n_collaborators]\n", + " local_train.targets = train_dataset.targets[index::n_collaborators]\n", + " local_test.data = test_dataset.data[index::n_collaborators]\n", + " local_test.targets = test_dataset.targets[index::n_collaborators]\n", "\n", - "# Setup collaborators with private attributes\n", - "collaborator_names = ['Portland', 'Seattle', 'Chandler','Bangalore']\n", - "collaborators = [Collaborator(name=name) for name in collaborator_names]\n", - "for idx, collaborator in enumerate(collaborators):\n", - " local_train = deepcopy(mnist_train)\n", - " local_test = deepcopy(mnist_test)\n", - " local_train.data = mnist_train.data[idx::len(collaborators)]\n", - " local_train.targets = mnist_train.targets[idx::len(collaborators)]\n", - " local_test.data = mnist_test.data[idx::len(collaborators)]\n", - " local_test.targets = mnist_test.targets[idx::len(collaborators)]\n", - " collaborator.private_attributes = {\n", - " 'train_loader': torch.utils.data.DataLoader(local_train,batch_size=batch_size_train, shuffle=True),\n", - " 'test_loader': 
torch.utils.data.DataLoader(local_test,batch_size=batch_size_train, shuffle=True)\n", + " return {\n", + " 'train_loader': torch.utils.data.DataLoader(local_train,batch_size=batch_size_train, shuffle=True),\n", + " 'test_loader': torch.utils.data.DataLoader(local_test, batch_size=batch_size_train, shuffle=True)\n", " }\n", "\n", + "# Setup collaborators private attributes via callable function\n", + "collaborators = []\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " collaborators.append(\n", + " Collaborator(\n", + " name=collaborator_name, num_cpus=0, num_gpus=0.3,\n", + " private_attributes_callable=callable_to_initialize_collaborator_private_attributes,\n", + " index=idx, n_collaborators=len(collaborator_names),\n", + " train_dataset=mnist_train, test_dataset=mnist_test, batch_size_train=batch_size_train\n", + " )\n", + " )\n", + " \n", "# The following is equivalent to\n", "# local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, **backend='ray'**)\n", - "local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", + "local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend='ray')\n", "print(f'Local runtime collaborators = {local_runtime.collaborators}')" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0525eaa9", "metadata": {}, "source": [ "Now that we have our flow and runtime defined, let's run the experiment! 
\n", "\n", - "(If you run this example on Google Colab with the GPU Runtime, you should see one task executing at a time.)" + "(If you run this example on Google Colab with the GPU Runtime, you should see two tasks executing at a time.)" ] }, { @@ -338,12 +359,13 @@ "model = None\n", "best_model = None\n", "optimizer = None\n", - "flflow = CollaboratorGPUFlow(model,optimizer,checkpoint=True)\n", + "flflow = CollaboratorGPUFlow(model, optimizer, checkpoint=True)\n", "flflow.runtime = local_runtime\n", "flflow.run()" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "10616d60", "metadata": {}, @@ -364,6 +386,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8e084b41", "metadata": {}, @@ -381,16 +404,6 @@ "run_id = flflow._run_id" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "composed-burst", - "metadata": {}, - "outputs": [], - "source": [ - "import metaflow" - ] - }, { "cell_type": "code", "execution_count": null, @@ -413,6 +426,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f8f7d05f", "metadata": {}, @@ -441,6 +455,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a206b36c", "metadata": {}, @@ -459,6 +474,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bf4ec317", "metadata": {}, @@ -497,11 +513,12 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0c5522b7", "metadata": {}, "source": [ - "Now we see **12** steps: **4** collaborators each performed **3** rounds of model training " + "Now we see **6** steps: **2** collaborators each performed **3** rounds of model training " ] }, { @@ -511,7 +528,7 @@ "metadata": {}, "outputs": [], "source": [ - "t = Task(f'FederatedFlow/{run_id}/train/9')" + "t = Task(f'CollaboratorGPUFlow/{run_id}/train/11')" ] }, { @@ -525,6 +542,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "efd5da76", "metadata": {}, @@ -553,6 +571,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3e92fab0", "metadata": {}, 
@@ -571,6 +590,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ced6e90e", "metadata": {}, @@ -589,6 +609,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8b9f8d25", "metadata": {}, @@ -601,21 +622,13 @@ "- Differential Privacy\n", "- And More!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "34fcbaa6", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "workflow-interface-py38", + "display_name": "runtime-env", "language": "python", - "name": "workflow-interface-py38" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -627,12 +640,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "vscode": { - "interpreter": { - "hash": "a9b3ea793f0a9a343a81a73b472831cd604f7c2c0cb7677aa4f9120271015e80" - } + "version": "3.8.0" } }, "nbformat": 4, diff --git a/openfl-tutorials/experimental/Workflow_Interface_301_MNIST_Watermarking.ipynb b/openfl-tutorials/experimental/Workflow_Interface_301_MNIST_Watermarking.ipynb index 72d9eacd49b..b6c17061629 100644 --- a/openfl-tutorials/experimental/Workflow_Interface_301_MNIST_Watermarking.ipynb +++ b/openfl-tutorials/experimental/Workflow_Interface_301_MNIST_Watermarking.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "dc13070c", "metadata": {}, @@ -11,6 +12,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8f28c451", "metadata": {}, @@ -27,6 +29,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a4394089", "metadata": {}, @@ -35,6 +38,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "857f9995", "metadata": {}, @@ -51,8 +55,9 @@ "source": [ "!pip install git+https://github.com/intel/openfl.git\n", "!pip install -r requirements_workflow_interface.txt\n", - "!pip install matplotlib\n", + "!pip install torch\n", "!pip install torchvision\n", + "!pip install matplotlib\n", "!pip 
install git+https://github.com/pyviz-topics/imagen.git@master\n", "\n", "\n", @@ -63,6 +68,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7bd566df", "metadata": {}, @@ -195,6 +201,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f0c55175", "metadata": {}, @@ -375,6 +382,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d82d34fd", "metadata": {}, @@ -430,7 +438,7 @@ "\n", "Notice that both the PRE-TRAIN and RE-TRAIN tasks are defined as Aggregator processing tasks\n", "\n", - "![image.png](attachment:image.png)\\\n", + "![image.png](attachment:image.png)\n", "\n", "
Workflow for Watermarking" ] @@ -479,6 +487,7 @@ " self.model.parameters(), lr=watermark_retrain_learning_rate\n", " )\n", " self.round_number = round_number\n", + " self.watermark_pretraining_completed = False\n", "\n", " @aggregator\n", " def start(self):\n", @@ -668,6 +677,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c6da2c42", "metadata": {}, @@ -676,7 +686,7 @@ "\n", "- Collaborator attributes are created in the same manner as described in [quickstart](https://github.com/psfoley/openfl/blob/experimental-workflow-interface/openfl-tutorials/experimental/Workflow_Interface_101_MNIST.ipynb)\n", "\n", - "- `watermark_data_loader` is created as a **private attribute** of the Aggregator and it is exposed only via the runtime. This property enables the Watermark dataset to be hidden from the collaborators as Aggregator private attributes are filtered before the state is transferred to Collaborators (in the same manner as Collaborator private attributes are hidden from Aggregator)\n", + "- `watermark_data_loader` is created as a **private attribute** of the Aggregator which is set by `callable_to_initialize_aggregator_private_attributes` callable function. It is exposed only via the runtime. 
This property enables the Watermark dataset to be hidden from the collaborators as Aggregator private attributes are filtered before the state is transferred to Collaborators (in the same manner as Collaborator private attributes are hidden from Aggregator)\n", "\n", "Lets define these attributes along with some other parameters (seed, batch-sizes, optimizer parameters) and create the LocalRuntime" ] @@ -708,15 +718,18 @@ "watermark_pretrain_learning_rate = 1e-1\n", "watermark_pretrain_momentum = 5e-1\n", "watermark_pretrain_weight_decay = 5e-05\n", - "watermark_retrain_learning_rate = 5e-3\n" + "watermark_retrain_learning_rate = 5e-3" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3d7ce52f", "metadata": {}, "source": [ - "## Setup Federation" + "Private attributes can be set using a callback function while instantiating the participant.\n", + "\n", + "The Aggregator callable function `callable_to_initialize_aggregator_private_attributes` returns `watermark_data_loader`, `pretrain_epochs`, `retrain_epochs`, and `watermark_acc_threshold`. The Collaborator callable function `callable_to_initialize_collaborator_private_attributes` returns the `train_loader` and `test_loader` of the collaborator." 
] }, { @@ -726,19 +739,24 @@ "metadata": {}, "outputs": [], "source": [ - "# Setup Aggregator with private attributes\n", - "aggregator = Aggregator()\n", - "aggregator.private_attributes = {\n", - " \"watermark_data_loader\": torch.utils.data.DataLoader(\n", - " watermark_data, batch_size=batch_size_watermark, shuffle=True\n", - " ),\n", - " \"pretrain_epochs\": 25,\n", - " \"retrain_epochs\": 25,\n", - " \"watermark_acc_threshold\": 0.98,\n", - " \"watermark_pretraining_completed\": False,\n", - "}\n", + "def callable_to_initialize_aggregator_private_attributes(watermark_data, batch_size):\n", + " return {\n", + " \"watermark_data_loader\": torch.utils.data.DataLoader(\n", + " watermark_data, batch_size=batch_size, shuffle=True\n", + " ),\n", + " \"pretrain_epochs\": 25,\n", + " \"retrain_epochs\": 25,\n", + " \"watermark_acc_threshold\": 0.98,\n", + " }\n", + "\n", + "# Setup Aggregator private attributes via callable function\n", + "aggregator = Aggregator(\n", + " name=\"agg\",\n", + " private_attributes_callable=callable_to_initialize_aggregator_private_attributes,\n", + " watermark_data=watermark_data,\n", + " batch_size=batch_size_watermark,\n", + " )\n", "\n", - "# Setup Collaborators with private attributes\n", "collaborator_names = [\n", " \"Portland\",\n", " \"Seattle\",\n", @@ -746,30 +764,38 @@ " \"Bangalore\",\n", " \"New Delhi\",\n", "]\n", - "print(f\"Creating collaborators {collaborator_names}\")\n", - "collaborators = [Collaborator(name=name) for name in collaborator_names]\n", - "\n", - "for idx, collaborator in enumerate(collaborators):\n", - " local_train = deepcopy(mnist_train)\n", - " local_test = deepcopy(mnist_test)\n", - " local_train.data = mnist_train.data[idx :: len(collaborators)]\n", - " local_train.targets = mnist_train.targets[idx :: len(collaborators)]\n", - " local_test.data = mnist_test.data[idx :: len(collaborators)]\n", - " local_test.targets = mnist_test.targets[idx :: len(collaborators)]\n", - " 
collaborator.private_attributes = {\n", - " \"train_loader\": torch.utils.data.DataLoader(\n", - " local_train, batch_size=batch_size_train, shuffle=True\n", - " ),\n", - " \"test_loader\": torch.utils.data.DataLoader(\n", - " local_test, batch_size=batch_size_train, shuffle=True\n", - " ),\n", + "\n", + "def callable_to_initialize_collaborator_private_attributes(index, n_collaborators, batch_size, train_dataset, test_dataset):\n", + " train = deepcopy(train_dataset)\n", + " test = deepcopy(test_dataset)\n", + " train.data = train_dataset.data[index::n_collaborators]\n", + " train.targets = train_dataset.targets[index::n_collaborators]\n", + " test.data = test_dataset.data[index::n_collaborators]\n", + " test.targets = test_dataset.targets[index::n_collaborators]\n", + "\n", + " return {\n", + " \"train_loader\": torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True),\n", + " \"test_loader\": torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=True),\n", " }\n", "\n", - "local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators)\n", + "# Setup Collaborators private attributes via callable function\n", + "collaborators = []\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " collaborators.append(\n", + " Collaborator(\n", + " name=collaborator_name, num_cpus=0, num_gpus=0,\n", + " private_attributes_callable=callable_to_initialize_collaborator_private_attributes,\n", + " index=idx, n_collaborators=len(collaborator_names),\n", + " train_dataset=mnist_train, test_dataset=mnist_test, batch_size=64\n", + " )\n", + " )\n", + "\n", + "local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators, backend=\"ray\")\n", "print(f\"Local runtime collaborators = {local_runtime.collaborators}\")" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "02935ccf", "metadata": {}, @@ -818,6 +844,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bf66c1cd", "metadata": {}, 
@@ -836,22 +863,6 @@ "if flflow._checkpoint:\n", " InspectFlow(flflow, flflow._run_id, show_html=True)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f60118f1", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fefef69c", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/openfl-tutorials/experimental/Workflow_Interface_401_FedProx_with_Synthetic_nonIID.ipynb b/openfl-tutorials/experimental/Workflow_Interface_401_FedProx_with_Synthetic_nonIID.ipynb new file mode 100644 index 00000000000..6149abbb3b0 --- /dev/null +++ b/openfl-tutorials/experimental/Workflow_Interface_401_FedProx_with_Synthetic_nonIID.ipynb @@ -0,0 +1,822 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Workflow Interface 401: Synthetic non-IID Dataset with FedProx Optimizer\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/openfl/blob/develop/openfl-tutorials/experimental/Workflow_Interface_401_FedProx_with_Synthetic_nonIID.ipynb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this OpenFL workflow interface tutorial, we shall learn how to implement FedProx and compare its performance with FedAvg algorithm using a Synthetic non-IID dataset. Reference: [Federated Optimization in Heterogeneous Networks](https://arxiv.org/pdf/1812.06127.pdf)." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting Started" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we start by installing the necessary dependencies for the workflow interface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install git+https://github.com/intel/openfl.git\n", + "!pip install -r requirements_workflow_interface.txt\n", + "!pip install torch\n", + "!pip install torchvision\n", + "!pip install matplotlib\n", + "!pip install seaborn\n", + "\n", + "# Uncomment following lines if running in Google Colab\n", + "# import os\n", + "# os.environ[\"USERNAME\"] = \"colab\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we import necessary libraries, and define Synthetic non-iid dataset as described in [Federated Optimization in Heterogeneous Networks](https://arxiv.org/pdf/1812.06127.pdf)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch as pt\n", + "import torch.utils.data as data\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "import numpy as np\n", + "\n", + "import random\n", + "import collections\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RANDOM_SEED = 10\n", + "batch_size = 10\n", + "\n", + "# Sets seed to reproduce the results\n", + "def set_seed(seed):\n", + " pt.manual_seed(seed)\n", + " pt.cuda.manual_seed_all(seed)\n", + " pt.use_deterministic_algorithms(True)\n", + " pt.backends.cudnn.deterministic = True\n", + " pt.backends.cudnn.benchmark = False\n", + " pt.backends.cudnn.enabled = False\n", + " np.random.seed(seed)\n", + " random.seed(seed)\n", + "\n", + "set_seed(RANDOM_SEED)\n", + "\n", + "\n", + "def one_hot(labels, classes):\n", + " return np.eye(classes)[labels]\n", + "\n", + "\n", + "def softmax(x):\n", + " ex = np.exp(x)\n", + " sum_ex = np.sum(np.exp(x))\n", + " return ex / sum_ex\n", + "\n", + "\n", + "def generate_synthetic(alpha, beta, iid, num_collaborators, num_classes):\n", + " dimension = 60\n", + " NUM_CLASS = num_classes\n", + " NUM_USER = num_collaborators\n", + "\n", + " samples_per_user = np.random.lognormal(4, 2, (NUM_USER)).astype(int) + 50\n", + " num_samples = np.sum(samples_per_user)\n", + "\n", + " X_split = [[] for _ in range(NUM_USER)]\n", + " y_split = [[] for _ in range(NUM_USER)]\n", + "\n", + " #### define some eprior ####\n", + " mean_W = np.random.normal(0, alpha, NUM_USER)\n", + " mean_b = mean_W\n", + " B = np.random.normal(0, beta, NUM_USER)\n", + " mean_x = np.zeros((NUM_USER, dimension))\n", + "\n", + " diagonal = np.zeros(dimension)\n", + " for j in range(dimension):\n", + " 
diagonal[j] = np.power((j + 1), -1.2)\n", + " cov_x = np.diag(diagonal)\n", + "\n", + " for i in range(NUM_USER):\n", + " if iid == 1:\n", + " mean_x[i] = np.ones(dimension) * B[i] # all zeros\n", + " else:\n", + " mean_x[i] = np.random.normal(B[i], 1, dimension)\n", + "\n", + " if iid == 1:\n", + " W_global = np.random.normal(0, 1, (dimension, NUM_CLASS))\n", + " b_global = np.random.normal(0, 1, NUM_CLASS)\n", + "\n", + " for i in range(NUM_USER):\n", + "\n", + " W = np.random.normal(mean_W[i], 1, (dimension, NUM_CLASS))\n", + " b = np.random.normal(mean_b[i], 1, NUM_CLASS)\n", + "\n", + " if iid == 1:\n", + " W = W_global\n", + " b = b_global\n", + "\n", + " xx = np.random.multivariate_normal(\n", + " mean_x[i], cov_x, samples_per_user[i])\n", + " yy = np.zeros(samples_per_user[i])\n", + "\n", + " for j in range(samples_per_user[i]):\n", + " tmp = np.dot(xx[j], W) + b\n", + " yy[j] = np.argmax(softmax(tmp))\n", + "\n", + " X_split[i] = xx.tolist()\n", + " y_split[i] = yy.tolist()\n", + "\n", + " return X_split, y_split\n", + "\n", + "\n", + "class SyntheticFederatedDataset:\n", + " def __init__(self, num_collaborators, batch_size=1, num_classes=10, **kwargs):\n", + " self.batch_size = batch_size\n", + " X, y = generate_synthetic(0.0, 0.0, 0, num_collaborators, num_classes)\n", + " X = [np.array([np.array(sample).astype(np.float32)\n", + " for sample in col]) for col in X]\n", + " y = [np.array([np.array(one_hot(int(sample), num_classes))\n", + " for sample in col]) for col in y]\n", + " self.X_train_all = np.array([col[:int(0.9 * len(col))] for col in X], dtype=np.ndarray)\n", + " self.X_valid_all = np.array([col[int(0.9 * len(col)):] for col in X], dtype=np.ndarray)\n", + " self.y_train_all = np.array([col[:int(0.9 * len(col))] for col in y], dtype=np.ndarray)\n", + " self.y_valid_all = np.array([col[int(0.9 * len(col)):] for col in y], dtype=np.ndarray)\n", + "\n", + " def split(self, index):\n", + " return {\n", + " \"train_loader\":\n", + " 
data.DataLoader(\n", + " data.TensorDataset(\n", + " pt.from_numpy(self.X_train_all[index]),\n", + " pt.from_numpy(self.y_train_all[index])\n", + " ), \n", + " batch_size=batch_size, shuffle=True\n", + " ),\n", + " \"test_loader\":\n", + " data.DataLoader(\n", + " data.TensorDataset(\n", + " pt.from_numpy(self.X_valid_all[index]),\n", + " pt.from_numpy(self.y_valid_all[index])\n", + " ), \n", + " batch_size=batch_size, shuffle=True\n", + " )\n", + " }" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have defined the dataset class, let us define the model, optimizer, and some helper functions like we would for any other deep learning experiment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from openfl.interface.aggregation_functions.weighted_average import weighted_average as wa\n", + "\n", + "\n", + "class Net(nn.Module):\n", + " \"\"\"\n", + " Model to train the dataset\n", + "\n", + " Args:\n", + " None\n", + " \n", + " Returns:\n", + " model: class Net object\n", + " \"\"\"\n", + " def __init__(self):\n", + " # Set RANDOM_STATE to reproduce same model\n", + " pt.set_rng_state(pt.manual_seed(RANDOM_SEED).get_state())\n", + " super(Net, self).__init__()\n", + " self.linear1 = nn.Linear(60, 100)\n", + " self.linear2 = nn.Linear(100, 10)\n", + "\n", + " def forward(self, x):\n", + " x = self.linear1(x)\n", + " x = self.linear2(x)\n", + " return x\n", + "\n", + "\n", + "def cross_entropy(output, target):\n", + " \"\"\"\n", + " cross-entropy metric\n", + "\n", + " Args:\n", + " output: model ouput,\n", + " target: target label\n", + "\n", + " Returns:\n", + " crossentropy_loss: float\n", + " \"\"\"\n", + " return F.cross_entropy(output, pt.max(target, 1)[1])\n", + "\n", + "\n", + "def compute_loss_and_acc(network, dataloader):\n", + " \"\"\"\n", + " Model test method\n", + "\n", + " Args:\n", + " network: class Net object (model)\n", + " dataloader:
torch.utils.data.DataLoader\n", + "\n", + " Returns:\n", + " (accuracy,\n", + " loss,\n", + " correct,\n", + " dataloader_size)\n", + " \"\"\"\n", + " network.eval()\n", + " test_loss = 0\n", + " correct = 0\n", + " with pt.no_grad():\n", + " for data, target in dataloader:\n", + " output = network(data)\n", + " test_loss += cross_entropy(output, target).item()\n", + " tar = target.argmax(dim=1, keepdim=True)\n", + " pred = output.argmax(dim=1, keepdim=True)\n", + " correct += pred.eq(tar).sum().cpu().numpy()\n", + " dataloader_size = len(dataloader.dataset)\n", + " test_loss /= dataloader_size\n", + " accuracy = float(correct / dataloader_size)\n", + " return accuracy, test_loss, correct\n", + "\n", + "\n", + "def weighted_average(tensors, weights):\n", + " \"\"\"\n", + " Take weighted average of models / optimizers / loss / accuracy\n", + " Incase of taking weighted average of optimizer do the following steps:\n", + " 1. Call \"_get_optimizer_state\" (openfl.federated.task.runner_pt._get_optimizer_state)\n", + " pass optimizer to it, to take optimizer state dictionary.\n", + " 2. Pass optimizer state dictionaries list to here.\n", + " 3. 
To set the weighted average optimizer state dictionary back to optimizer,\n", + " call \"_set_optimizer_state\" (openfl.federated.task.runner_pt._set_optimizer_state)\n", + " and pass optimizer, device, and optimizer dictionary received in step 2.\n", + "\n", + " Args:\n", + " tensors: Models state_dict list or optimizers state_dict list or loss list or accuracy list\n", + " weights: Weight for each element in the list\n", + "\n", + " Returns:\n", + " dict: Incase model list / optimizer list OR\n", + " float: Incase of loss list or accuracy list\n", + " \"\"\"\n", + " # Check the type of first element of tensors list\n", + " if type(tensors[0]) in (dict, collections.OrderedDict):\n", + " optimizer = False\n", + " # If __opt_state_needed found then optimizer state dictionary is passed\n", + " if \"__opt_state_needed\" in tensors[0]:\n", + " optimizer = True\n", + " # Remove __opt_state_needed from all state dictionary in list\n", + " [tensor.pop(\"__opt_state_needed\") for tensor in tensors]\n", + " tmp_list = []\n", + " # Take keys in order to rebuild the state dictionary taking keys back up\n", + " input_state_dict_keys = tensors[0].keys()\n", + " for tensor in tensors:\n", + " # Append values of each state dictionary in list\n", + " # If type(value) is Tensor then it needs to be detached\n", + " tmp_list.append(np.array([value.detach() if type(value) is pt.Tensor else value for value in tensor.values()], dtype=object))\n", + " # Take weighted average of list of arrays\n", + " # new_params passed is weighted average of each array in tmp_list\n", + " new_params = wa(tmp_list, weights)\n", + " new_state = {}\n", + " # Take weighted average parameters and building a dictionary\n", + " [new_state.update({k:new_params[i]}) if optimizer else new_state.update({k:pt.from_numpy(new_params[i].numpy())}) \\\n", + " for i, k in enumerate(input_state_dict_keys)]\n", + " return new_state\n", + " else:\n", + " return wa(tensors, weights)" + ] + }, + { + "attachments": { + 
"federated-flow-diagram.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us now define the Workflow for our experiment. Here we use the methodology as provided in [quickstart](https://github.com/intel/openfl/blob/develop/openfl-tutorials/experimental/Workflow_Interface_101_MNIST.ipynb), and define the workflow consisting of the following steps:\n", + "-\t`start`: Start of the flow \n", + "-\t`compute_loss_and_accuracy`: Compute Train Loss and Test Accuracy on aggregated model. Performed *foreach collaborator* in Federation\n", + "-\t`gather_results_and_take_weighted_average`: Collect train loss, and test accuracy metrics for each collaborator and take weighted average to compute the *Aggregated* Train Loss and Test Accuracy. Performed on Aggregator\n", + "-\t`select_collaborators`: Randomly select *n_selected_collaborators* from the entire set of collaborators in Federation. Performed on Aggregator\n", + "-\t`train_selected_collaborators`: Train selected collaborators on their individual datasets for *local_epoch* number of times. Performed on *n_selected_collaborators*\n", + "-\t`join`: Take weighted average of the model. Performed on Aggregator\n", + "-\t`end`: End of one round of flow.
Flow can be run for *n_epochs* to obtain the desired results\n", + "\n", + "We also import the FedProxOptimizer from openfl.utilities.optimizer\n", + "\n", + "![federated-flow-diagram.png](attachment:federated-flow-diagram.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from openfl.experimental.interface import FLSpec, Aggregator, Collaborator\n", + "from openfl.experimental.runtime import LocalRuntime\n", + "from openfl.experimental.placement import aggregator, collaborator\n", + "from openfl.utilities.optimizers.torch import FedProxOptimizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class FedProxFlow(FLSpec):\n", + "\n", + " def __init__(self, model=None, optimizer=None, n_selected_collaborators=10, n_rounds=10, **kwargs):\n", + " super(FedProxFlow, self).__init__(**kwargs)\n", + " self.round_number = 1\n", + " self.n_selected_collaborators = n_selected_collaborators\n", + " self.n_rounds = n_rounds\n", + " self.loss_and_acc = {\"Train Loss\": [], \"Test Accuracy\": []}\n", + " if model is not None:\n", + " self.model = model\n", + " self.optimizer = optimizer\n", + " else:\n", + " self.model = Net()\n", + " self.optimizer = FedProxOptimizer(\n", + " self.model.parameters(), lr=learning_rate, mu=mu, weight_decay=weight_decay)\n", + "\n", + " @aggregator\n", + " def start(self):\n", + " \"\"\"\n", + " Start of the flow. Call compute_loss_and_accuracy step for each collaborator\n", + " \"\"\"\n", + " print(f'\\nStarting round number {self.round_number} .... 
\\n')\n", + " self.collaborators = self.runtime.collaborators\n", + " self.next(self.compute_loss_and_accuracy, foreach='collaborators')\n", + "\n", + " @collaborator\n", + " def compute_loss_and_accuracy(self):\n", + " \"\"\"\n", + " Compute training accuracy, training loss, aggregated validation accuracy,\n", + " aggregated validation loss, \n", + " \"\"\"\n", + " # Compute Train Loss and Train Acc\n", + " self.training_accuracy, self.training_loss, _, = compute_loss_and_acc(\n", + " self.model, self.train_loader)\n", + " \n", + " # Compute Test Loss and Test Acc\n", + " self.agg_validation_score, self.agg_validation_loss, test_correct = compute_loss_and_acc(\n", + " self.model, self.test_loader)\n", + "\n", + " self.train_dataset_length = len(self.train_loader.dataset)\n", + " self.test_dataset_length = len(self.test_loader.dataset)\n", + "\n", + " print(\n", + " \" | Train Round: {:<5} : Train Loss {:<.6f}, Test Acc: {:<.6f} [{}/{}]\".format(\n", + " self.input,\n", + " self.round_number,\n", + " self.training_loss,\n", + " self.agg_validation_score,\n", + " test_correct, \n", + " self.test_dataset_length\n", + " )\n", + " )\n", + "\n", + " self.next(self.gather_results_and_take_weighted_average)\n", + "\n", + " @aggregator\n", + " def gather_results_and_take_weighted_average(self, inputs):\n", + " \"\"\"\n", + " Gather results of all collaborators computed in previous \n", + " step.\n", + " Compute train and test weightes, and compute weighted average of \n", + " aggregated training loss, and aggregated test accuracy\n", + " \"\"\"\n", + " # Calculate train_weights and test_weights\n", + " train_datasize, test_datasize = [], []\n", + " for input_ in inputs:\n", + " train_datasize.append(input_.train_dataset_length)\n", + " test_datasize.append(input_.test_dataset_length)\n", + "\n", + " self.train_weights, self.test_weights = [], []\n", + " for input_ in inputs:\n", + " self.train_weights.append(input_.train_dataset_length / sum(train_datasize))\n", + " 
self.test_weights.append(input_.test_dataset_length / sum(test_datasize))\n", + "\n", + " aggregated_model_accuracy_list, aggregated_model_loss_list = [], []\n", + " for input_ in inputs:\n", + " aggregated_model_loss_list.append(input_.training_loss)\n", + " aggregated_model_accuracy_list.append(input_.agg_validation_score)\n", + "\n", + " # Weighted average of training loss\n", + " self.aggregated_model_training_loss = weighted_average(aggregated_model_loss_list, self.train_weights)\n", + " # Weighted average of aggregated model accuracy\n", + " self.aggregated_model_test_accuracy = weighted_average(aggregated_model_accuracy_list, self.test_weights)\n", + "\n", + " # Store experiment results\n", + " self.loss_and_acc[\"Train Loss\"].append(self.aggregated_model_training_loss)\n", + " self.loss_and_acc[\"Test Accuracy\"].append(self.aggregated_model_test_accuracy)\n", + "\n", + " print(\n", + " \" | Train Round: {:<5} : Agg Train Loss {:<.6f}, Agg Test Acc: {:<.6f}\".format(\n", + " self.round_number,\n", + " self.aggregated_model_training_loss,\n", + " self.aggregated_model_test_accuracy\n", + " )\n", + " )\n", + "\n", + " self.next(self.select_collaborators)\n", + "\n", + " @aggregator\n", + " def select_collaborators(self):\n", + " \"\"\"\n", + " Randomly select n_selected_collaborators collaborator\n", + " \"\"\"\n", + " np.random.seed(self.round_number)\n", + " self.selected_collaborator_indices = np.random.choice(range(len(self.collaborators)), \\\n", + " self.n_selected_collaborators, replace=False)\n", + " self.selected_collaborators = [self.collaborators[idx] for idx in self.selected_collaborator_indices]\n", + "\n", + " self.next(self.train_selected_collaborators, foreach=\"selected_collaborators\")\n", + "\n", + " @collaborator\n", + " def train_selected_collaborators(self):\n", + " \"\"\"\n", + " Train selected collaborators\n", + " \"\"\"\n", + " self.model.train(mode=True)\n", + "\n", + " self.train_dataset_length = len(self.train_loader.dataset)\n", 
+ "\n", + " # Rebuild the optimizer with global model parameters\n", + " self.optimizer = FedProxOptimizer(\n", + " self.model.parameters(), lr=learning_rate, mu=mu, weight_decay=weight_decay)\n", + " # Set global model parameters as old weights to enable computation of proximal term\n", + " self.optimizer.set_old_weights([p.clone().detach() for p in self.model.parameters()])\n", + "\n", + " for epoch in range(local_epoch):\n", + " train_loss = []\n", + " correct = 0\n", + " for data, target in self.train_loader:\n", + " self.optimizer.zero_grad()\n", + " output = self.model(data)\n", + " loss = cross_entropy(output, target)\n", + " loss.backward()\n", + " self.optimizer.step()\n", + " pred = output.argmax(dim=1, keepdim=True)\n", + " tar = target.argmax(dim=1, keepdim=True)\n", + " correct += pred.eq(tar).sum().cpu().numpy()\n", + " train_loss.append(loss.item())\n", + " training_accuracy = float(correct / self.train_dataset_length)\n", + " training_loss = np.mean(train_loss)\n", + " print(\n", + " \" | Train Round: {:<5} | Local Epoch: {:<3}: FedProx Optimization Train Loss {:<.6f}, Train Acc: {:<.6f} [{}/{}]\".format(\n", + " self.input,\n", + " self.round_number,\n", + " epoch,\n", + " training_loss,\n", + " training_accuracy,\n", + " correct, \n", + " len(self.train_loader.dataset)\n", + " )\n", + " )\n", + "\n", + " self.next(self.join)\n", + " \n", + " @aggregator\n", + " def join(self, inputs):\n", + " \"\"\"\n", + " Compute train dataset, and take weighted average of model.\n", + " \"\"\"\n", + " train_datasize = sum([input_.train_dataset_length for input_ in inputs])\n", + "\n", + " train_weights, model_state_dict_list = [], [] \n", + " for input_ in inputs:\n", + " train_weights.append(input_.train_dataset_length / train_datasize)\n", + " model_state_dict_list.append(input_.model.state_dict())\n", + "\n", + " avg_model_dict = weighted_average(model_state_dict_list, train_weights)\n", + " self.model.load_state_dict(avg_model_dict)\n", + "\n", + " 
self.next(self.internal_loop)\n", + "\n", + " @aggregator\n", + " def internal_loop(self):\n", + " \"\"\"\n", + " Check if training is finished for `self.n_rounds`\n", + " if finished move to end step. Otherwise, go back to start\n", + " step for next round of training.\n", + " \"\"\"\n", + " if self.round_number < self.n_rounds:\n", + " self.round_number += 1\n", + " self.next(self.start)\n", + " else:\n", + " self.next(self.end)\n", + "\n", + " @aggregator\n", + " def end(self):\n", + " \"\"\"\n", + " This is the 'end' step.\n", + " \"\"\"\n", + " self.round_number += 1\n", + " print('This is end of the flow')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup Federation\n", + "\n", + "In this step we define entities necessary to run the flow and create a function which returns dataset as private attributes of collaborator. As described in [quickstart](https://github.com/securefederatedai/openfl/blob/develop/openfl-tutorials/experimental/Workflow_Interface_101_MNIST.ipynb) we define entities necessary for the flow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_collaborators = 30\n", + "\n", + "# Setup aggregator\n", + "aggregator = Aggregator()\n", + "\n", + "# Setup collaborators with private attributes\n", + "collaborator_names = [f\"col{i}\" for i in range(num_collaborators)]\n", + "\n", + "synthetic_federated_dataset = SyntheticFederatedDataset(\n", + " batch_size=batch_size, num_classes=10, num_collaborators=len(collaborator_names), seed=RANDOM_SEED)\n", + "\n", + "def callable_to_initialize_collaborator_private_attributes(index):\n", + " return synthetic_federated_dataset.split(index)\n", + "\n", + "collaborators = []\n", + "for idx, collaborator_name in enumerate(collaborator_names):\n", + " collaborators.append(\n", + " Collaborator(\n", + " name=collaborator_name, num_cpus=0.0, num_gpus=0.0,\n", + " private_attributes_callable=callable_to_initialize_collaborator_private_attributes,\n", + " index=idx\n", + " )\n", + " )\n", + "\n", + "local_runtime = LocalRuntime(\n", + " aggregator=aggregator, collaborators=collaborators, backend=\"single_process\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define `loss_and_acc` dictionary to store the test results of our experiment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loss_and_acc = {\n", + " \"FedProx\": {\n", + " \"Train Loss\": [], \"Test Accuracy\": []\n", + " },\n", + " \"FedAvg\": {\n", + " \"Train Loss\": [], \"Test Accuracy\": []\n", + " }\n", + "}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Distribution\n", + "\n", + "Now that our Federation is setup and actors (Aggregator & Collaborators) are initialized, let us take a moment to analyze the *Synthetic non-IID dataset*. 
We check how the targets for individual collaborators are distributed across each of the classes by computing and plotting the heat-map distribution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "from matplotlib.colors import LogNorm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "targets_for_collaborators = []\n", + "\n", + "for idx, collab in enumerate(collaborators):\n", + " # Train, and Test dataset is divided into 9:1 ratio\n", + " _, train_y = callable_to_initialize_collaborator_private_attributes(idx)[\"train_loader\"].dataset[:]\n", + " _, test_y = callable_to_initialize_collaborator_private_attributes(idx)[\"test_loader\"].dataset[:]\n", + " # Append train, and test into 1 tensor array\n", + " y = pt.cat((train_y, test_y))\n", + " targets = np.argmax(y.numpy(), axis = 1)\n", + " # Count number of samples for each class\n", + " frequency = np.zeros(10, dtype=np.int32)\n", + " for i, item in enumerate(targets):\n", + " frequency[item] += 1\n", + " targets_for_collaborators.append(frequency)\n", + "\n", + "result_arr = np.array(targets_for_collaborators).T.tolist()\n", + "fig, ax = plt.subplots(figsize=(20, 5))\n", + "ax = sns.heatmap(result_arr, annot=True, fmt=\"d\", annot_kws={\"fontsize\": 7}, ax=ax, norm=LogNorm(), cbar=False)\n", + "ax.set_title('Distribution of Classes in Dataset across Collaborators', fontsize=12)\n", + "ax.set_xlabel('Collaborator ID', fontsize=10)\n", + "ax.set_ylabel('Classes (0 - 9)', fontsize=10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FedProx\n", + "\n", + "Now that we have flow and runtime defined, let's define our parameters and run the experiment with FedProxOptimizer (mu > 0)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Randomly select `n_selected_collaborators` collaborators\n", + "# Must be less than total collaborators\n", + "n_selected_collaborators = 10\n", + "n_epochs = 100\n", + "learning_rate = 0.01\n", + "weight_decay = 0.001\n", + "local_epoch = 20\n", + "\n", + "# Set `mu` to `1.0` for FedProx\n", + "mu = 1.0\n", + "\n", + "flflow = FedProxFlow(n_selected_collaborators=n_selected_collaborators, n_rounds=n_epochs, checkpoint=False)\n", + "flflow.runtime = local_runtime\n", + "\n", + "flflow.run()\n", + "loss_and_acc[\"FedProx\"][\"Train Loss\"] = flflow.loss_and_acc[\"Train Loss\"][:]\n", + "loss_and_acc[\"FedProx\"][\"Test Accuracy\"] = flflow.loss_and_acc[\"Test Accuracy\"][:]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FedAvg\n", + "\n", + "Now that we have obtained FedProx results, let's define the parameters for FedAvg and run experiment. Note that for comparison we only change the parameter mu to 0.0 (i.e. FedProxOptimizer with mu = 0.0)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mu = 0.0\n", + "\n", + "flflow = FedProxFlow(n_selected_collaborators=n_selected_collaborators, n_rounds=n_epochs, checkpoint=False)\n", + "flflow.runtime = local_runtime\n", + "\n", + "flflow.run()\n", + "loss_and_acc[\"FedAvg\"][\"Train Loss\"] = flflow.loss_and_acc[\"Train Loss\"][:]\n", + "loss_and_acc[\"FedAvg\"][\"Test Accuracy\"] = flflow.loss_and_acc[\"Test Accuracy\"][:]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compare Results\n", + "\n", + "Now that we have obtained results for both the optimizers available we conclude the tutorial by comparing the Aggregated Training Loss and Aggregated Test Accuracy. 
Reference: Appendix C.3.2, Figure 6 of [Federated Optimization in Heterogeneous Networks](https://arxiv.org/pdf/1812.06127.pdf) for Synthetic (0,0) dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 6))\n", + "fig.subplots_adjust(hspace=0.4, top=0.8)\n", + "\n", + "fedprox_loss = loss_and_acc[\"FedProx\"][\"Train Loss\"]\n", + "fedavg_loss = loss_and_acc[\"FedAvg\"][\"Train Loss\"]\n", + "ax1.plot(fedprox_loss,'gv-', label='FedProx (mu=1.0)')\n", + "ax1.plot(fedavg_loss,'rs-', label='FedAvg (mu=0.0)')\n", + "ax1.legend()\n", + "ax1.minorticks_on()\n", + "ax1.grid(which='major',linestyle='-',color='0.5')\n", + "ax1.grid(which='minor',linestyle='--',color='0.25')\n", + "ax1.set_title('Train Loss')\n", + "ax1.set_xlabel('Training Round')\n", + "ax1.set_ylabel('Training Loss')\n", + "\n", + "fedprox_accuracy = loss_and_acc[\"FedProx\"][\"Test Accuracy\"]\n", + "fedavg_accuracy = loss_and_acc[\"FedAvg\"][\"Test Accuracy\"]\n", + "ax2.plot(fedprox_accuracy,'gv-', label='FedProx (mu=1.0)')\n", + "ax2.plot(fedavg_accuracy, 'rs-', label='FedAvg (mu=0.0)')\n", + "ax2.legend()\n", + "ax2.minorticks_on()\n", + "ax2.grid(which='major',linestyle='-',color='0.5')\n", + "ax2.grid(which='minor',linestyle='--',color='0.25')\n", + "ax2.set_title('Test Accuracy')\n", + "ax2.set_xlabel('Training Round')\n", + "ax2.set_ylabel('Test Accuracy')\n", + "\n", + "fig.suptitle('Comparison of FedProx (mu > 0) and FedAvg (mu = 0)', fontsize='18')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env_fedprox_example", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "orig_nbformat": 4, + "vscode": { + 
"interpreter": { + "hash": "c96b31a6dd4c6365f3cc206f3a3aedb434a4eb5a8aa6c7dc735a6d54c4b635a9" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/openfl-tutorials/experimental/requirements_workflow_interface.txt b/openfl-tutorials/experimental/requirements_workflow_interface.txt index e687320c826..f091055e250 100644 --- a/openfl-tutorials/experimental/requirements_workflow_interface.txt +++ b/openfl-tutorials/experimental/requirements_workflow_interface.txt @@ -1,4 +1,5 @@ -dill==0.3.6 -metaflow==2.7.15 -ray==2.2.0 -numpy==1.21.6 \ No newline at end of file +dill==0.3.6 +metaflow==2.7.15 +ray==2.2.0 +torch +torchvision diff --git a/openfl/experimental/interface/__init__.py b/openfl/experimental/interface/__init__.py index 0942123eed3..fc03bd8459e 100644 --- a/openfl/experimental/interface/__init__.py +++ b/openfl/experimental/interface/__init__.py @@ -3,7 +3,7 @@ """openfl.experimental.interface package.""" -from .fl_spec import FLSpec, final_attributes +from .fl_spec import FLSpec from .participants import Aggregator, Collaborator -__all__ = ["FLSpec", "final_attributes", "Aggregator", "Collaborator"] +__all__ = ["FLSpec", "Aggregator", "Collaborator"] diff --git a/openfl/experimental/interface/fl_spec.py b/openfl/experimental/interface/fl_spec.py index aec582580b6..170e9cd9ae3 100644 --- a/openfl/experimental/interface/fl_spec.py +++ b/openfl/experimental/interface/fl_spec.py @@ -19,11 +19,8 @@ ) from openfl.experimental.runtime import Runtime -final_attributes = [] - class FLSpec: - _clones = [] _initial_state = None @@ -39,7 +36,7 @@ def _create_clones(cls, instance: Type[FLSpec], names: List[str]) -> None: @classmethod def _reset_clones(cls): """Reset clones""" - cls._clones = [] + cls._clones = {} @classmethod def save_initial_state(cls, instance: Type[FLSpec]) -> None: @@ -54,17 +51,23 @@ def run(self) -> None: ) self._run_id = self._metaflow_interface.create_run() if str(self._runtime) == "LocalRuntime": - # Setup any necessary 
ShardDescriptors through the LocalEnvoys - # Assume that first task always runs on the aggregator - self._setup_aggregator() + # Initialize aggregator private attributes + self.runtime.initialize_aggregator() self._foreach_methods = [] FLSpec._reset_clones() FLSpec._create_clones(self, self.runtime.collaborators) - # the start function can just be invoked locally + # Initialize collaborator private attributes + self.runtime.initialize_collaborators() if self._checkpoint: print(f"Created flow {self.__class__.__name__}") try: - self.start() + # Execute all Participant (Aggregator & Collaborator) tasks and + # retrieve the final attributes + # start step is the first task & invoked on aggregator through runtime.execute_task + final_attributes = self.runtime.execute_task( + self, + self.start, + ) except Exception as e: if "cannot pickle" in str(e) or "Failed to unpickle" in str(e): msg = ( @@ -74,7 +77,8 @@ def run(self) -> None: "\nLocalRuntime(...,backend='single_process')\n" "\n or for more information about the original error," "\nPlease see the official Ray documentation" - "\nhttps://docs.ray.io/en/latest/ray-core/objects/serialization.html" + "\nhttps://docs.ray.io/en/releases-2.2.0/ray-core/\ + objects/serialization.html" ) raise SerializationError(str(e) + msg) else: @@ -86,11 +90,6 @@ def run(self) -> None: else: raise Exception("Runtime not implemented") - def _setup_aggregator(self): - """Sets aggregator private attributes as self attributes""" - for name, attr in self.runtime._aggregator.private_attributes.items(): - setattr(self, name, attr) - @property def runtime(self) -> Type[Runtime]: """Returns flow runtime""" @@ -130,9 +129,7 @@ def _is_at_transition_point(self, f: Callable, parent_func: Callable) -> bool: if parent_func.__name__ in self._foreach_methods: self._foreach_methods.append(f.__name__) if should_transfer(f, parent_func): - print( - f"Should transfer from {parent_func.__name__} to {f.__name__}" - ) + print(f"Should transfer from 
{parent_func.__name__} to {f.__name__}") self.execute_next = f.__name__ return True return False @@ -171,16 +168,7 @@ def next(self, f: Callable, **kwargs) -> None: # Remove included / excluded attributes from next task filter_attributes(self, f, **kwargs) - if self._is_at_transition_point(f, parent_func): - # Collaborator is done executing for now - return - self._display_transition_logs(f, parent_func) - self._runtime.execute_task( - self, - f, - parent_func, - instance_snapshot=agg_to_collab_ss, - **kwargs, - ) + # update parameters required to execute execute_task function + self.execute_task_args = [f, parent_func, agg_to_collab_ss, kwargs] diff --git a/openfl/experimental/interface/keras/__init__.py b/openfl/experimental/interface/keras/__init__.py new file mode 100644 index 00000000000..1d7d84eb7f1 --- /dev/null +++ b/openfl/experimental/interface/keras/__init__.py @@ -0,0 +1,7 @@ +# Copyright (C) 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""openfl.experimental.interface.keras package.""" + +from .aggregation_functions import WeightedAverage + +__all__ = ["WeightedAverage", ] diff --git a/openfl/experimental/interface/keras/aggregation_functions/__init__.py b/openfl/experimental/interface/keras/aggregation_functions/__init__.py new file mode 100644 index 00000000000..94708487bcd --- /dev/null +++ b/openfl/experimental/interface/keras/aggregation_functions/__init__.py @@ -0,0 +1,7 @@ +# Copyright (C) 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""openfl.experimenal.interface.keras.aggregation_functions package.""" + +from .weighted_average import WeightedAverage + +__all__ = ["WeightedAverage", ] diff --git a/openfl/experimental/interface/keras/aggregation_functions/weighted_average.py b/openfl/experimental/interface/keras/aggregation_functions/weighted_average.py new file mode 100644 index 00000000000..326e57aeced --- /dev/null +++ 
b/openfl/experimental/interface/keras/aggregation_functions/weighted_average.py @@ -0,0 +1,13 @@ +# Copyright (C) 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""openfl.experimental.interface.keras.aggregation_functions.weighted_average package.""" + + +class WeightedAverage: + """Weighted average aggregation for keras or tensorflow.""" + + def __init__(self) -> None: + """ + WeightedAverage class for Keras or Tensorflow library. + """ + raise NotImplementedError("WeightedAverage for keras will be implemented in the future.") diff --git a/openfl/experimental/interface/participants.py b/openfl/experimental/interface/participants.py index 32581758d06..8ff54523b2c 100644 --- a/openfl/experimental/interface/participants.py +++ b/openfl/experimental/interface/participants.py @@ -1,55 +1,228 @@ -# Copyright (C) 2020-2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -"""openfl.experimental.interface.participants module.""" - -from typing import Dict -from typing import Any - - -class Participant: - def __init__(self, name: str = ""): - self.private_attributes = {} - self._name = name - - @property - def name(self): - return self._name - - @name.setter - def name(self, name: str): - self._name = name - - def private_attributes(self, attrs: Dict[str, Any]) -> None: - """ - Set the private attributes of the participant. These attributes will - only be available within the tasks performed by the participants and - will be filtered out prior to the task's state being transfered. - - Args: - attrs: dictionary of ATTRIBUTE_NAME (str) -> object that will be accessible - within the participant's task. 
- - Example: - {'train_loader' : torch.utils.data.DataLoader(...)} - - In any task performed by this participant performed within the flow, - this attribute could be referenced with self.train_loader - """ - self.private_attributes = attrs - - -class Collaborator(Participant): - """ - Defines a collaborator participant - """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - - -class Aggregator(Participant): - """ - Defines an aggregator participant - """ - def __init__(self, **kwargs): - super().__init__(**kwargs) +# Copyright (C) 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""openfl.experimental.interface.participants module.""" + +from typing import Dict, Any +from typing import Callable, Optional + + +class Participant: + def __init__(self, name: str = ""): + self.private_attributes = {} + self._name = name + + @property + def name(self): + return self._name + + @name.setter + def name(self, name: str): + self._name = name + + def private_attributes(self, attrs: Dict[str, Any]) -> None: + """ + Set the private attributes of the participant. These attributes will + only be available within the tasks performed by the participants and + will be filtered out prior to the task's state being transfered. + + Args: + attrs: dictionary of ATTRIBUTE_NAME (str) -> object that will be accessible + within the participant's task. 
+ + Example: + {'train_loader' : torch.utils.data.DataLoader(...)} + + In any task performed by this participant performed within the flow, + this attribute could be referenced with self.train_loader + """ + self.private_attributes = attrs + + +class Collaborator(Participant): + """ + Defines a collaborator participant + """ + def __init__(self, name: str = "", private_attributes_callable: Callable = None, + num_cpus: int = 0, num_gpus: int = 0.0, **kwargs): + """ + Create collaborator object with custom resources and a callable + function to assign private attributes + + Parameters: + name (str): Name of the collaborator. default="" + + private_attributes_callable (Callable): A function which returns collaborator + private attributes for each collaborator. In case private_attributes are not + required this can be omitted. default=None + + num_cpus (int): Specifies how many cores to use for the collaborator step exection. + This will only be used if backend is set to ray. default=0 + + num_gpus (float): Specifies how many GPUs to use to accerlerate the collaborator + step exection. This will only be used if backend is set to ray. default=0 + + kwargs (dict): Parameters required to call private_attributes_callable function. + The key of the dictionary must match the arguments to the private_attributes_callable. 
+ default={} + """ + super().__init__(name=name) + self.num_cpus = num_cpus + self.num_gpus = num_gpus + self.kwargs = kwargs + + if private_attributes_callable is None: + self.private_attributes_callable = private_attributes_callable + else: + if not callable(private_attributes_callable): + raise Exception("private_attributes_callable parameter must be a callable") + else: + self.private_attributes_callable = private_attributes_callable + + def get_name(self) -> str: + """Get collaborator name""" + return self._name + + def initialize_private_attributes(self) -> None: + """ + initialize private attributes of Collaborator object by invoking + the callable specified by user + """ + if self.private_attributes_callable is not None: + self.private_attributes = self.private_attributes_callable(**self.kwargs) + + def __set_collaborator_attrs_to_clone(self, clone: Any) -> None: + """ + Set collaborator private attributes to FLSpec clone before transitioning + from Aggregator step to collaborator steps + """ + # set collaborator private attributes as + # clone attributes + for name, attr in self.private_attributes.items(): + setattr(clone, name, attr) + + def __delete_collab_attrs_from_clone(self, clone: Any) -> None: + """ + Remove collaborator private attributes from FLSpec clone before + transitioning from Collaborator step to Aggregator step + """ + # Update collaborator private attributes by taking latest + # parameters from clone, then delete attributes from clone. 
+ for attr_name in self.private_attributes: + if hasattr(clone, attr_name): + self.private_attributes.update( + {attr_name: getattr(clone, attr_name)} + ) + delattr(clone, attr_name) + + def execute_func(self, ctx: Any, f_name: str, callback: Callable) -> Any: + """ + Execute remote function f + """ + self.__set_collaborator_attrs_to_clone(ctx) + + callback(ctx, f_name) + + self.__delete_collab_attrs_from_clone(ctx) + + return ctx + + +class Aggregator(Participant): + """ + Defines an aggregator participant + """ + + def __init__( + self, + name: str = "", + private_attributes_callable: Callable = None, + num_cpus: int = 0, + num_gpus: int = 0.0, + **kwargs + ): + """ + Create aggregator object with custom resources and a callable + function to assign private attributes + + Parameters: + name (str): Name of the aggregator. default="" + + private_attributes_callable (Callable): A function which returns aggregator + private attributes. In case private_attributes are not required this can be omitted. + default=None + + num_cpus (int): Specifies how many cores to use for the aggregator step exection. + This will only be used if backend is set to ray. default=0 + + num_gpus (float): Specifies how many GPUs to use to accerlerate the aggregator + step exection. This will only be used if backend is set to ray. default=0 + + kwargs (dict): Parameters required to call private_attributes_callable function. + The key of the dictionary must match the arguments to the private_attributes_callable. 
+ default={} + """ + super().__init__(name=name) + self.num_cpus = num_cpus + self.num_gpus = num_gpus + self.kwargs = kwargs + + if private_attributes_callable is None: + self.private_attributes_callable = private_attributes_callable + else: + if not callable(private_attributes_callable): + raise Exception( + "private_attributes_callable parameter must be a callable" + ) + else: + self.private_attributes_callable = private_attributes_callable + + def get_name(self) -> str: + """Get aggregator name""" + return self.name + + def initialize_private_attributes(self) -> None: + """ + initialize private attributes of Aggregator object by invoking + the callable specified by user + """ + if self.private_attributes_callable is not None: + self.private_attributes = self.private_attributes_callable(**self.kwargs) + + def __set_agg_attrs_to_clone(self, clone: Any) -> None: + """ + Set aggregator private attributes to FLSpec clone before transition + from Aggregator step to collaborator steps + """ + # set aggregator private attributes as + # clone attributes + for name, attr in self.private_attributes.items(): + setattr(clone, name, attr) + + def __delete_agg_attrs_from_clone(self, clone: Any) -> None: + """ + Remove aggregator private attributes from FLSpec clone before + transition from Aggregator step to collaborator steps + """ + # Update aggregator private attributes by taking latest + # parameters from clone, then delete attributes from clone. 
+ for attr_name in self.private_attributes: + if hasattr(clone, attr_name): + self.private_attributes.update({attr_name: getattr(clone, attr_name)}) + delattr(clone, attr_name) + + def execute_func(self, ctx: Any, f_name: str, callback: Callable, + clones: Optional[Any] = None) -> Any: + """ + Execute remote function f + """ + self.__set_agg_attrs_to_clone(ctx) + + if clones is not None: + callback(ctx, f_name, clones) + else: + callback(ctx, f_name) + + self.__delete_agg_attrs_from_clone(ctx) + + return ctx diff --git a/openfl/experimental/interface/torch/__init__.py b/openfl/experimental/interface/torch/__init__.py new file mode 100644 index 00000000000..969f47b43a4 --- /dev/null +++ b/openfl/experimental/interface/torch/__init__.py @@ -0,0 +1,7 @@ +# Copyright (C) 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""openfl.experimental.interface.torch package.""" + +from .aggregation_functions import WeightedAverage + +__all__ = ["WeightedAverage", ] diff --git a/openfl/experimental/interface/torch/aggregation_functions/__init__.py b/openfl/experimental/interface/torch/aggregation_functions/__init__.py new file mode 100644 index 00000000000..2afa83b219f --- /dev/null +++ b/openfl/experimental/interface/torch/aggregation_functions/__init__.py @@ -0,0 +1,7 @@ +# Copyright (C) 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""openfl.experimenal.interface.torch.aggregation_functions package.""" + +from .weighted_average import WeightedAverage + +__all__ = ["WeightedAverage", ] diff --git a/openfl/experimental/interface/torch/aggregation_functions/weighted_average.py b/openfl/experimental/interface/torch/aggregation_functions/weighted_average.py new file mode 100644 index 00000000000..a91cadfa0dd --- /dev/null +++ b/openfl/experimental/interface/torch/aggregation_functions/weighted_average.py @@ -0,0 +1,77 @@ +# Copyright (C) 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 
+"""openfl.experimental.interface.torch.aggregation_functions.weighted_average package.""" + +import collections +import numpy as np +import torch as pt + + +def weighted_average(tensors, weights): + """Compute weighted average.""" + return np.average(tensors, weights=weights, axis=0) + + +class WeightedAverage: + """Weighted average aggregation.""" + + def __call__(self, objects_list, weights_list) -> np.ndarray: + """ + Compute weighted average of models, optimizers, loss, or accuracy metrics. + For taking weighted average of optimizer do the following steps: + 1. Call "_get_optimizer_state" (openfl.federated.task.runner_pt._get_optimizer_state) + pass optimizer to it, to take optimizer state dictionary. + 2. Pass optimizer state dictionaries list to here. + 3. To set the weighted average optimizer state dictionary back to optimizer, + call "_set_optimizer_state" (openfl.federated.task.runner_pt._set_optimizer_state) + and pass optimizer, device, and optimizer dictionary received in step 2. + + Args: + objects_list: List of objects for which weighted average is to be computed. + - List of Model state dictionaries , or + - List of Metrics (Loss or accuracy), or + - List of optimizer state dictionaries (following steps need to be performed) + 1. Obtain optimizer state dictionary by invoking "_get_optimizer_state" + (openfl.federated.task.runner_pt._get_optimizer_state). + 2. Create a list of optimizer state dictionary obtained in step - 1 + Invoke WeightedAverage on this list. + 3. Invoke "_set_optimizer_state" to set weighted average of optimizer + state back to optimizer (openfl.federated.task.runner_pt._set_optimizer_state). + weights_list: Weight for each element in the list. 
+ + Returns: + dict: For model or optimizer + float: For Loss or Accuracy metrics + """ + # Check the type of first element of tensors list + if type(objects_list[0]) in (dict, collections.OrderedDict): + optimizer = False + # If __opt_state_needed found then optimizer state dictionary is passed + if "__opt_state_needed" in objects_list[0]: + optimizer = True + # Remove __opt_state_needed from all state dictionary in list, and + # check if weightedaverage of optimizer can be taken. + for tensor in objects_list: + error_msg = "Optimizer is stateless, WeightedAverage cannot be taken" + assert tensor.pop("__opt_state_needed") == "true", error_msg + + tmp_list = [] + # # Take keys in order to rebuild the state dictionary taking keys back up + for tensor in objects_list: + # Append values of each state dictionary in list + # If type(value) is Tensor then it needs to be detached + tmp_list.append(np.array([value.detach() if isinstance(value, pt.Tensor) else value + for value in tensor.values()], dtype=object)) + # Take weighted average of list of arrays + # new_params passed is weighted average of each array in tmp_list + new_params = weighted_average(tmp_list, weights_list) + new_state = {} + # Take weighted average parameters and building a dictionary + for i, k in enumerate(objects_list[0].keys()): + if optimizer: + new_state[k] = new_params[i] + else: + new_state[k] = pt.from_numpy(new_params[i].numpy()) + return new_state + else: + return weighted_average(objects_list, weights_list) diff --git a/openfl/experimental/placement/__init__.py b/openfl/experimental/placement/__init__.py index 1e2f9d42c77..05b12d50bb4 100644 --- a/openfl/experimental/placement/__init__.py +++ b/openfl/experimental/placement/__init__.py @@ -3,6 +3,6 @@ """openfl.experimental.placement package.""" -from .placement import RayExecutor, make_remote, aggregator, collaborator +from .placement import aggregator, collaborator -__all__ = ["RayExecutor", "make_remote", "aggregator", "collaborator"] 
+__all__ = ["aggregator", "collaborator"] diff --git a/openfl/experimental/placement/placement.py b/openfl/experimental/placement/placement.py index 0662137add7..810994043ab 100644 --- a/openfl/experimental/placement/placement.py +++ b/openfl/experimental/placement/placement.py @@ -2,59 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import functools -import ray -from copy import deepcopy -from openfl.experimental.utilities import ( - RedirectStdStreamContext, - GPUResourcesNotAvailableError, - get_number_of_gpus, -) +from openfl.experimental.utilities import RedirectStdStreamContext from typing import Callable -class RayExecutor: - def __init__(self): - self.remote_functions = [] - self.remote_contexts = [] - - def ray_call_put(self, ctx, func): - remote_to_exec = make_remote(func, num_gpus=func.num_gpus) - ref_ctx = ray.put(ctx) - self.remote_contexts.append(ref_ctx) - self.remote_functions.append(remote_to_exec.remote(ref_ctx, func.__name__)) - del remote_to_exec - del ref_ctx - - def get_remote_clones(self): - clones = deepcopy(ray.get(self.remote_functions)) - del self.remote_functions - # Remove clones from ray object store - for ctx in self.remote_contexts: - ray.cancel(ctx) - return clones - - -def make_remote(f: Callable, num_gpus: int) -> Callable: - """ - Assign function to run in its own process using - Ray - - Args: - num_gpus: Defines the number of GPUs to request for a task - """ - f = ray.put(f) - - @functools.wraps(f) - @ray.remote(num_gpus=num_gpus, max_calls=1) - def wrapper(*args, **kwargs): - f = getattr(args[0], args[1]) - print(f"\nRunning {f.__name__} in a new process") - f() - return args[0] - - return wrapper - - def aggregator(f: Callable = None) -> Callable: """ Placement decorator that designates that the task will @@ -79,7 +30,6 @@ def agg_task(self): f.collaborator_step = False if f.__doc__: f.__doc__ = "" + f.__doc__ - f.num_gpus = 0 @functools.wraps(f) def wrapper(*args, **kwargs): @@ -92,11 +42,7 @@ def wrapper(*args, **kwargs): 
return wrapper -def collaborator( - f: Callable = None, - *, - num_gpus: float = 0 -) -> Callable: +def collaborator(f: Callable = None) -> Callable: """ Placement decorator that designates that the task will run at the collaborator node @@ -120,9 +66,8 @@ def collaborator_task(self): will result in sharing of GPUs between tasks. 1 >= results in exclusive GPU access for the task. """ - if f is None: - return functools.partial(collaborator, num_gpus=num_gpus) + return functools.partial(collaborator) print(f'Collaborator step "{f.__name__}" registered') f.is_step = True @@ -133,12 +78,6 @@ def collaborator_task(self): f.collaborator_step = True if f.__doc__: f.__doc__ = "" + f.__doc__ - total_gpus = get_number_of_gpus() - if total_gpus < num_gpus: - GPUResourcesNotAvailableError( - f"cannot assign more than available GPUs ({total_gpus} < {num_gpus})." - ) - f.num_gpus = num_gpus @functools.wraps(f) def wrapper(*args, **kwargs): diff --git a/openfl/experimental/runtime/__init__.py b/openfl/experimental/runtime/__init__.py index 9703eb398b3..488e4b53bba 100644 --- a/openfl/experimental/runtime/__init__.py +++ b/openfl/experimental/runtime/__init__.py @@ -5,6 +5,7 @@ from .runtime import Runtime from .local_runtime import LocalRuntime +from .federated_runtime import FederatedRuntime -__all__ = ["LocalRuntime", "Runtime"] +__all__ = ["FederatedRuntime", "LocalRuntime", "Runtime"] diff --git a/openfl/experimental/runtime/local_runtime.py b/openfl/experimental/runtime/local_runtime.py index efac60efae1..2788cf7a6ac 100644 --- a/openfl/experimental/runtime/local_runtime.py +++ b/openfl/experimental/runtime/local_runtime.py @@ -1,232 +1,686 @@ -# Copyright (C) 2020-2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -""" openfl.experimental.runtime package LocalRuntime class.""" - -from __future__ import annotations -from copy import deepcopy -import ray -import gc -from openfl.experimental.runtime import Runtime -from typing import TYPE_CHECKING -if 
TYPE_CHECKING: - from openfl.experimental.interface import Aggregator, Collaborator, FLSpec -from openfl.experimental.placement import RayExecutor -from openfl.experimental.utilities import ( - aggregator_to_collaborator, - generate_artifacts, - filter_attributes, - checkpoint, -) -from typing import List -from typing import Type -from typing import Callable - - -class LocalRuntime(Runtime): - def __init__( - self, - aggregator: Type[Aggregator] = None, - collaborators: List[Type[Collaborator]] = None, - backend: str = "single_process", - **kwargs, - ) -> None: - """ - Use single node to run the flow - - Args: - aggregator: The aggregator instance that holds private attributes - collaborators: A list of collaborators; each with their own private attributes - backend: The backend that will execute the tasks. Available options are: - - 'single_process': (default) Executes every task within the same process - - 'ray': Executes tasks using the Ray library. Each participant - runs tasks in their own isolated process. Also - supports GPU isolation using Ray's 'num_gpus' - argument, which can be passed in through the - collaborator placement decorator. - - Example: - @collaborator(num_gpus=1) - def some_collaborator_task(self): - ... - - - By selecting num_gpus=1, the task is guaranteed - exclusive GPU access. If the system has one GPU, - collaborator tasks will run sequentially. 
- """ - super().__init__() - if backend not in ["ray", "single_process"]: - raise ValueError( - f"Invalid 'backend' value '{backend}', accepted values are " - + "'ray', or 'single_process'" - ) - if backend == "ray": - if not ray.is_initialized(): - dh = kwargs.get("dashboard_host", "127.0.0.1") - dp = kwargs.get("dashboard_port", 5252) - ray.init(dashboard_host=dh, dashboard_port=dp) - self.backend = backend - if aggregator is not None: - self.aggregator = aggregator - # List of envoys should be iterable, so that a subset can be selected at runtime - # The envoys is the superset of envoys that can be selected during the experiment - if collaborators is not None: - self.collaborators = collaborators - - @property - def aggregator(self) -> str: - """Returns name of _aggregator""" - return self._aggregator.name - - @aggregator.setter - def aggregator(self, aggregator: Type[Aggregator]): - """Set LocalRuntime _aggregator""" - self._aggregator = aggregator - - @property - def collaborators(self) -> List[str]: - """ - Return names of collaborators. 
Don't give direct access to private attributes - """ - return list(self.__collaborators.keys()) - - @collaborators.setter - def collaborators(self, collaborators: List[Type[Collaborator]]): - """Set LocalRuntime collaborators""" - self.__collaborators = { - collaborator.name: collaborator for collaborator in collaborators - } - - def restore_instance_snapshot( - self, - ctx: Type[FLSpec], - instance_snapshot: List[Type[FLSpec]] - ): - """Restores attributes from backup (in instance snapshot) to ctx""" - for backup in instance_snapshot: - artifacts_iter, _ = generate_artifacts(ctx=backup) - for name, attr in artifacts_iter(): - if not hasattr(ctx, name): - setattr(ctx, name, attr) - - def execute_task( - self, - flspec_obj: Type[FLSpec], - f: Callable, - parent_func: Callable, - instance_snapshot: List[Type[FLSpec]] = [], - **kwargs - ): - """ - Performs the execution of a task as defined by the - implementation and underlying backend (single_process, ray, etc) - on a single node - - Args: - flspec_obj: Reference to the FLSpec (flow) object. Contains information - about task sequence, flow attributes, that are needed to - execute a future task - f: The next task to be executed within the flow - parent_func: The prior task executed in the flow - instance_snapshot: A prior FLSpec state that needs to be restored from - (i.e. 
restoring aggregator state after collaborator - execution) - """ - from openfl.experimental.interface import ( - FLSpec, - final_attributes, - ) - - global final_attributes - - if "foreach" in kwargs: - flspec_obj._foreach_methods.append(f.__name__) - selected_collaborators = flspec_obj.__getattribute__( - kwargs["foreach"] - ) - - for col in selected_collaborators: - clone = FLSpec._clones[col] - if ( - "exclude" in kwargs and hasattr(clone, kwargs["exclude"][0]) - ) or ( - "include" in kwargs and hasattr(clone, kwargs["include"][0]) - ): - filter_attributes(clone, f, **kwargs) - artifacts_iter, _ = generate_artifacts(ctx=flspec_obj) - for name, attr in artifacts_iter(): - setattr(clone, name, deepcopy(attr)) - clone._foreach_methods = flspec_obj._foreach_methods - - for col in selected_collaborators: - clone = FLSpec._clones[col] - clone.input = col - if aggregator_to_collaborator(f, parent_func): - # remove private aggregator state - for attr in self._aggregator.private_attributes: - self._aggregator.private_attributes[attr] = getattr( - flspec_obj, attr - ) - if hasattr(clone, attr): - delattr(clone, attr) - - func = None - if self.backend == "ray": - ray_executor = RayExecutor() - for col in selected_collaborators: - clone = FLSpec._clones[col] - # Set new LocalRuntime for clone as it is required - # for calling execute_task and also new runtime - # object will not contain private attributes of - # aggregator or other collaborators - clone.runtime = LocalRuntime(backend="single_process") - for name, attr in self.__collaborators[ - clone.input - ].private_attributes.items(): - setattr(clone, name, attr) - to_exec = getattr(clone, f.__name__) - # write the clone to the object store - # ensure clone is getting latest _metaflow_interface - clone._metaflow_interface = flspec_obj._metaflow_interface - if self.backend == "ray": - ray_executor.ray_call_put(clone, to_exec) - else: - to_exec() - if self.backend == "ray": - clones = ray_executor.get_remote_clones() - 
FLSpec._clones.update(zip(selected_collaborators, clones)) - del ray_executor - del clones - gc.collect() - for col in selected_collaborators: - clone = FLSpec._clones[col] - func = clone.execute_next - for attr in self.__collaborators[ - clone.input - ].private_attributes: - if hasattr(clone, attr): - self.__collaborators[clone.input].private_attributes[ - attr - ] = getattr(clone, attr) - delattr(clone, attr) - # Restore the flspec_obj state if back-up is taken - self.restore_instance_snapshot(flspec_obj, instance_snapshot) - del instance_snapshot - - g = getattr(flspec_obj, func) - # remove private collaborator state - gc.collect() - g([FLSpec._clones[col] for col in selected_collaborators]) - else: - to_exec = getattr(flspec_obj, f.__name__) - to_exec() - if f.__name__ == "end": - checkpoint(flspec_obj, f) - artifacts_iter, _ = generate_artifacts(ctx=flspec_obj) - final_attributes = artifacts_iter() - - def __repr__(self): - return "LocalRuntime" +# Copyright (C) 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +""" openfl.experimental.runtime package LocalRuntime class.""" + +from __future__ import annotations +from copy import deepcopy +import importlib +import ray +import os +import gc +from openfl.experimental.runtime import Runtime +from typing import TYPE_CHECKING, Optional +import math + +if TYPE_CHECKING: + from openfl.experimental.interface import Aggregator, Collaborator, FLSpec + +from openfl.experimental.utilities import ( + ResourcesNotAvailableError, + aggregator_to_collaborator, + generate_artifacts, + filter_attributes, + checkpoint, + get_number_of_gpus, + check_resource_allocation, +) +from typing import List, Any +from typing import Dict, Type, Callable + + +class RayExecutor: + def __init__(self): + """Create RayExecutor object""" + self.__remote_contexts = [] + + def ray_call_put( + self, + participant: Any, + ctx: Any, + f_name: str, + callback: Callable, + clones: Optional[Any] = None, + ) -> None: + """ + Execute 
f_name from inside participant (Aggregator or Collaborator) class with the context + of clone (ctx) + """ + if clones is not None: + self.__remote_contexts.append( + participant.execute_func.remote(ctx, f_name, callback, clones) + ) + else: + self.__remote_contexts.append( + participant.execute_func.remote(ctx, f_name, callback) + ) + + def ray_call_get(self) -> List[Any]: + """ + Get remote clones and delete ray references of clone (ctx) and, + reclaim memory + """ + clones = ray.get(self.__remote_contexts) + del self.__remote_contexts + self.__remote_contexts = [] + + return clones + + +def ray_group_assign(collaborators, num_actors=1): + """ + Assigns collaborators to resource groups which share a CUDA context. + + Args: + collaborators (list): The list of collaborators. + num_actors (int, optional): Number of actors to distribute collaborators to. + Defaults to 3. + + Returns: + list: A list of GroupMember instances. + """ + + class GroupMember: + """ + A utility class that manages the collaborator and its group. + + This class maintains compatibility with runtime execution by assigning attributes for each + function in the Collaborator interface in conjunction with RemoteHelper. + """ + + def __init__(self, collaborator_actor, collaborator): + """ + Initializes a new instance of the GroupMember class. + + Args: + collaborator_actor: The collaborator actor. + collaborator: The collaborator. + """ + from openfl.experimental.interface import Collaborator + + all_methods = [ + method + for method in dir(Collaborator) + if callable(getattr(Collaborator, method)) + ] + external_methods = [method for method in all_methods if (method[0] != "_")] + self.collaborator_actor = collaborator_actor + self.collaborator = collaborator + for method in external_methods: + setattr( + self, + method, + RemoteHelper(self.collaborator_actor, self.collaborator, method), + ) + + class RemoteHelper: + """ + A utility class to maintain compatibility with RayExecutor. 
+ + This class returns a lambda function that uses collaborator_actor.execute_from_col to run + a given function from the given collaborator. + """ + + # once ray_grouped replaces the current ray runtime this class can be replaced with a + # funtion that returns the lambda funtion, using a funtion is necesary because this is used + # in setting multiple funtions in a loop and lambda takes the reference to self.f_name and + # not the value so we need to change scope to avoid self.f_name from changing as the loop + # progresses + def __init__(self, collaborator_actor, collaborator, f_name) -> None: + """ + Initializes a new instance of the RemoteHelper class. + + Args: + collaborator_actor: The collaborator actor. + collaborator: The collaborator. + f_name (str): The name of the function. + """ + self.f_name = f_name + self.collaborator_actor = collaborator_actor + self.collaborator = collaborator + self.f = ( + lambda *args, **kwargs: self.collaborator_actor.execute_from_col.remote( + self.collaborator, self.f_name, *args, **kwargs + ) + ) + + def remote(self, *args, **kwargs): + """ + Executes the function with the given arguments and keyword arguments. + + Args: + *args: The arguments to pass to the function. + **kwargs: The keyword arguments to pass to the function. + + Returns: + The result of the function execution. 
+ """ + return self.f(*args, *kwargs) + + collaborator_ray_refs = [] + collaborators_per_group = math.ceil(len(collaborators) / num_actors) + times_called = 0 + # logic to sort collaborators by gpus, if collaborators have the same number of gpu then they + # are sorted by cpu + cpu_magnitude = len(str(abs(max([i.num_cpus for i in collaborators])))) + min_gpu = min([i.num_gpus for i in collaborators]) + min_gpu = max(min_gpu, 0.0001) + collaborators_sorted_by_gpucpu = sorted( + collaborators, + key=lambda x: x.num_gpus / min_gpu * 10**cpu_magnitude + x.num_cpus, + ) + initializations = [] + + for collaborator in collaborators_sorted_by_gpucpu: + # initialize actor group + if times_called % collaborators_per_group == 0: + max_num_cpus = max( + [ + i.num_cpus + for i in collaborators_sorted_by_gpucpu[ + times_called: times_called + collaborators_per_group + ] + ] + ) + max_num_gpus = max( + [ + i.num_gpus + for i in collaborators_sorted_by_gpucpu[ + times_called: times_called + collaborators_per_group + ] + ] + ) + print(f"creating actor with {max_num_cpus}, {max_num_gpus}") + collaborator_actor = ( + ray.remote(RayGroup) + .options( + num_cpus=max_num_cpus, num_gpus=max_num_gpus + ) # max_concurrency=max_concurrency) + .remote() + ) + # add collaborator to actor group + initializations.append( + collaborator_actor.append.remote( + collaborator.get_name(), + private_attributes_callable=collaborator.private_attributes_callable, + **collaborator.kwargs, + ) + ) + + times_called += 1 + + # append GroupMember to output list + collaborator_ray_refs.append( + GroupMember(collaborator_actor, collaborator.get_name()) + ) + # Wait for all collaborators to be created on actors + ray.get(initializations) + + return collaborator_ray_refs + + +class RayGroup: + """ + A Ray actor that manages a group of collaborators. + + This class allows for the execution of functions from a specified collaborator + using the execute_from_col method. 
The collaborators are stored in a dictionary + where the key is the collaborator's name. + """ + + def __init__(self): + """ + Initializes a new instance of the RayGroup class. + """ + self.collaborators = {} + + def append( + self, + name: str = "", + private_attributes_callable: Callable = None, + **kwargs, + ): + """ + Appends a new collaborator to the group. + + Args: + name (str): The name of the collaborator. + private_attributes_callable (Callable): A callable that sets the private attributes of + the collaborator. + **kwargs: Additional keyword arguments. + """ + from openfl.experimental.interface import Collaborator + + self.collaborators[name] = Collaborator( + name=name, + private_attributes_callable=private_attributes_callable, + **kwargs, + ) + + def execute_from_col(self, name, internal_f_name, *args, **kwargs): + """ + Executes a function from a specified collaborator. + + Args: + name (str): The name of the collaborator. + internal_f_name (str): The name of the function to execute. + *args: Additional arguments to pass to the function. + **kwargs: Additional keyword arguments to pass to the function. + + Returns: + The result of the function execution. + """ + f = getattr(self.collaborators[name], internal_f_name) + return f(*args, **kwargs) + + def get_collaborator(self, name): + """ + Retrieves a collaborator from the group by name. + + Args: + name (str): The name of the collaborator. + + Returns: + The collaborator instance. + """ + return self.collaborators[name] + + +class LocalRuntime(Runtime): + def __init__( + self, + aggregator: Dict = None, + collaborators: Dict = None, + backend: str = "single_process", + **kwargs, + ) -> None: + """ + Use single node to run the flow + + Args: + aggregator: The aggregator instance that holds private attributes + collaborators: A list of collaborators; each with their own private attributes + backend: The backend that will execute the tasks. 
Available options are: + + 'single_process': (default) Executes every task within the same process + + 'ray': Executes tasks using the Ray library. We use ray + actors called RayGroups to runs tasks in their own + isolated process. Each participant is distributed + into a ray group. The RayGroups run concurrently + while participants in the group run serially. + The default is 1 RayGroup and can be changed by using + the num_actors=1 kwarg. By using more RayGroups more + concurency is allowed with the trade off being that + each RayGroup has extra memory overhead in the form + of extra CUDA CONTEXTS. + + Also the ray runtime supports GPU isolation using + Ray's 'num_gpus' argument, which can be passed in + through the collaborator placement decorator. + + Example: + @collaborator(num_gpus=1) + def some_collaborator_task(self): + ... + + + By selecting num_gpus=1, the task is guaranteed + exclusive GPU access. If the system has one GPU, + collaborator tasks will run sequentially. + + """ + super().__init__() + if backend not in ["ray", "single_process"]: + raise ValueError( + f"Invalid 'backend' value '{backend}', accepted values are " + + "'ray', or 'single_process'" + ) + if backend == "ray": + if not ray.is_initialized(): + dh = kwargs.get("dashboard_host", "127.0.0.1") + dp = kwargs.get("dashboard_port", 5252) + ray.init(dashboard_host=dh, dashboard_port=dp) + + self.num_actors = kwargs.get("num_actors", 1) + self.backend = backend + if aggregator is not None: + self.aggregator = self.__get_aggregator_object(aggregator) + + if collaborators is not None: + self.collaborators = self.__get_collaborator_object(collaborators) + + def __get_aggregator_object(self, aggregator: Type[Aggregator]) -> Any: + """Get aggregator object based on localruntime backend""" + + if self.backend == "single_process": + return aggregator + + total_available_cpus = os.cpu_count() + total_available_gpus = get_number_of_gpus() + + agg_cpus = aggregator.num_cpus + agg_gpus = 
aggregator.num_gpus + + if agg_gpus > 0: + check_resource_allocation( + total_available_gpus, + {aggregator.get_name(): agg_gpus}, + ) + + if total_available_gpus < agg_gpus: + raise ResourcesNotAvailableError( + f"cannot assign more than available GPUs \ + ({agg_gpus} > {total_available_gpus})." + ) + if total_available_cpus < agg_cpus: + raise ResourcesNotAvailableError( + f"cannot assign more than available CPUs \ + ({agg_cpus} > {total_available_cpus})." + ) + + interface_module = importlib.import_module("openfl.experimental.interface") + aggregator_class = getattr(interface_module, "Aggregator") + + aggregator_actor = ray.remote(aggregator_class).options( + num_cpus=agg_cpus, num_gpus=agg_gpus + ) + aggregator_actor_ref = aggregator_actor.remote( + name=aggregator.get_name(), + private_attributes_callable=aggregator.private_attributes_callable, + **aggregator.kwargs, + ) + + return aggregator_actor_ref + + def __get_collaborator_object(self, collaborators: List) -> Any: + """Get collaborator object based on localruntime backend""" + + if self.backend == "single_process": + return collaborators + + total_available_cpus = os.cpu_count() + total_required_cpus = sum( + [collaborator.num_cpus for collaborator in collaborators] + ) + if total_available_cpus < total_required_cpus: + raise ResourcesNotAvailableError( + f"cannot assign more than available CPUs \ + ({total_required_cpus} > {total_available_cpus})." + ) + + if self.backend == "ray": + collaborator_ray_refs = ray_group_assign( + collaborators, num_actors=self.num_actors + ) + return collaborator_ray_refs + + @property + def aggregator(self) -> str: + """Returns name of _aggregator""" + return self._aggregator.name + + @aggregator.setter + def aggregator(self, aggregator: Type[Aggregator]): + """Set LocalRuntime _aggregator""" + self._aggregator = aggregator + + @property + def collaborators(self) -> List[str]: + """ + Return names of collaborators.
Don't give direct access to private attributes + """ + return list(self.__collaborators.keys()) + + @collaborators.setter + def collaborators(self, collaborators: List[Type[Collaborator]]): + """Set LocalRuntime collaborators""" + if self.backend == "single_process": + + def get_collab_name(collab): + return collab.get_name() + + else: + + def get_collab_name(collab): + return ray.get(collab.get_name.remote()) + + self.__collaborators = { + get_collab_name(collaborator): collaborator + for collaborator in collaborators + } + + def initialize_aggregator(self): + """initialize aggregator private attributes""" + if self.backend == "single_process": + self._aggregator.initialize_private_attributes() + else: + ray.get(self._aggregator.initialize_private_attributes.remote()) + + def initialize_collaborators(self): + """initialize collaborator private attributes""" + if self.backend == "single_process": + + def init_private_attrs(collab): + return collab.initialize_private_attributes() + + else: + + def init_private_attrs(collab): + return ray.get(collab.initialize_private_attributes.remote()) + + for collaborator in self.__collaborators.values(): + init_private_attrs(collaborator) + + def restore_instance_snapshot( + self, ctx: Type[FLSpec], instance_snapshot: List[Type[FLSpec]] + ): + """Restores attributes from backup (in instance snapshot) to ctx""" + for backup in instance_snapshot: + artifacts_iter, _ = generate_artifacts(ctx=backup) + for name, attr in artifacts_iter(): + if not hasattr(ctx, name): + setattr(ctx, name, attr) + + def execute_agg_steps(self, ctx: Any, f_name: str, clones: Optional[Any] = None): + """ + Execute aggregator steps until at transition point + """ + if clones is not None: + f = getattr(ctx, f_name) + f(clones) + else: + not_at_transition_point = True + while not_at_transition_point: + f = getattr(ctx, f_name) + f() + + f, parent_func = ctx.execute_task_args[:2] + if aggregator_to_collaborator(f, parent_func) or f.__name__ == "end": + 
not_at_transition_point = False + + f_name = f.__name__ + + def execute_collab_steps(self, ctx: Any, f_name: str): + """ + Execute collaborator steps until at transition point + """ + not_at_transition_point = True + while not_at_transition_point: + f = getattr(ctx, f_name) + f() + + f, parent_func = ctx.execute_task_args[:2] + if ctx._is_at_transition_point(f, parent_func): + not_at_transition_point = False + + f_name = f.__name__ + + def execute_task(self, flspec_obj: Type[FLSpec], f: Callable, **kwargs): + """ + Defines which function to be executed based on name and kwargs + Updates the arguments and executes until end is not reached + + Args: + flspec_obj: Reference to the FLSpec (flow) object. Contains information + about task sequence, flow attributes. + f: The next task to be executed within the flow + + Returns: + artifacts_iter: Iterator with updated sequence of values + """ + parent_func = None + instance_snapshot = None + self.join_step = False + + while f.__name__ != "end": + if "foreach" in kwargs: + flspec_obj = self.execute_collab_task( + flspec_obj, f, parent_func, instance_snapshot, **kwargs + ) + else: + flspec_obj = self.execute_agg_task(flspec_obj, f) + f, parent_func, instance_snapshot, kwargs = flspec_obj.execute_task_args + else: + flspec_obj = self.execute_agg_task(flspec_obj, f) + f = flspec_obj.execute_task_args[0] + + checkpoint(flspec_obj, f) + artifacts_iter, _ = generate_artifacts(ctx=flspec_obj) + return artifacts_iter() + + def execute_agg_task(self, flspec_obj, f): + """ + Performs execution of aggregator task + Args: + flspec_obj : Reference to the FLSpec (flow) object + f : The task to be executed within the flow + + Returns: + flspec_obj: updated FLSpec (flow) object + """ + from openfl.experimental.interface import FLSpec + + aggregator = self._aggregator + clones = None + + if self.join_step: + clones = [FLSpec._clones[col] for col in self.selected_collaborators] + self.join_step = False + + if self.backend == "ray": + 
ray_executor = RayExecutor() + ray_executor.ray_call_put( + aggregator, flspec_obj, f.__name__, self.execute_agg_steps, clones + ) + flspec_obj = ray_executor.ray_call_get()[0] + del ray_executor + else: + aggregator.execute_func( + flspec_obj, f.__name__, self.execute_agg_steps, clones + ) + + gc.collect() + return flspec_obj + + def execute_collab_task( + self, flspec_obj, f, parent_func, instance_snapshot, **kwargs + ): + """ + Performs + 1. Filter include/exclude + 2. Set runtime, collab private attributes , metaflow_interface + 3. Execution of all collaborator for each task + 4. Remove collaborator private attributes + 5. Execute the next function after transition + + Args: + flspec_obj : Reference to the FLSpec (flow) object + f : The task to be executed within the flow + parent_func : The prior task executed in the flow + instance_snapshot : A prior FLSpec state that needs to be restored + + Returns: + flspec_obj: updated FLSpec (flow) object + """ + + from openfl.experimental.interface import ( + FLSpec, + ) + + flspec_obj._foreach_methods.append(f.__name__) + selected_collaborators = getattr(flspec_obj, kwargs["foreach"]) + self.selected_collaborators = selected_collaborators + + # filter exclude/include attributes for clone + self.filter_exclude_include(flspec_obj, f, selected_collaborators, **kwargs) + + if self.backend == "ray": + ray_executor = RayExecutor() + # set runtime,collab private attributes and metaflowinterface + for col in selected_collaborators: + clone = FLSpec._clones[col] + # Set new LocalRuntime for clone as it is required + # new runtime object will not contain private attributes of + # aggregator or other collaborators + clone.runtime = LocalRuntime(backend="single_process") + + # write the clone to the object store + # ensure clone is getting latest _metaflow_interface + clone._metaflow_interface = flspec_obj._metaflow_interface + + for collab_name in selected_collaborators: + clone = FLSpec._clones[collab_name] + collaborator = 
self.__collaborators[collab_name] + + if self.backend == "ray": + ray_executor.ray_call_put( + collaborator, clone, f.__name__, self.execute_collab_steps + ) + else: + collaborator.execute_func(clone, f.__name__, self.execute_collab_steps) + + if self.backend == "ray": + clones = ray_executor.ray_call_get() + FLSpec._clones.update(zip(selected_collaborators, clones)) + clone = clones[0] + del clones + + flspec_obj.execute_task_args = clone.execute_task_args + + # Restore the flspec_obj state if back-up is taken + self.restore_instance_snapshot(flspec_obj, instance_snapshot) + del instance_snapshot + + gc.collect() + # Setting the join_step to indicate to aggregator to collect clones + self.join_step = True + return flspec_obj + + def filter_exclude_include(self, flspec_obj, f, selected_collaborators, **kwargs): + """ + This function filters exclude/include attributes + Args: + flspec_obj : Reference to the FLSpec (flow) object + f : The task to be executed within the flow + selected_collaborators : all collaborators + """ + + from openfl.experimental.interface import ( + FLSpec, + ) + + for col in selected_collaborators: + clone = FLSpec._clones[col] + clone.input = col + if ("exclude" in kwargs and hasattr(clone, kwargs["exclude"][0])) or ( + "include" in kwargs and hasattr(clone, kwargs["include"][0]) + ): + filter_attributes(clone, f, **kwargs) + artifacts_iter, _ = generate_artifacts(ctx=flspec_obj) + for name, attr in artifacts_iter(): + setattr(clone, name, deepcopy(attr)) + clone._foreach_methods = flspec_obj._foreach_methods + + def __repr__(self): + return "LocalRuntime" diff --git a/openfl/experimental/utilities/__init__.py b/openfl/experimental/utilities/__init__.py index 1f16c5ba2c4..2272d1459a4 100644 --- a/openfl/experimental/utilities/__init__.py +++ b/openfl/experimental/utilities/__init__.py @@ -9,7 +9,11 @@ aggregator_to_collaborator, collaborator_to_aggregator, ) -from .exceptions import SerializationError, GPUResourcesNotAvailableError +from 
.exceptions import ( + SerializationError, + ResourcesNotAvailableError, + ResourcesAllocationError, +) from .stream_redirect import ( RedirectStdStreamBuffer, RedirectStdStream, @@ -21,6 +25,7 @@ generate_artifacts, filter_attributes, checkpoint, + check_resource_allocation, ) @@ -30,7 +35,8 @@ "aggregator_to_collaborator", "collaborator_to_aggregator", "SerializationError", - "GPUResourcesNotAvailableError", + "ResourcesNotAvailableError", + "ResourcesAllocationError", "RedirectStdStreamBuffer", "RedirectStdStream", "RedirectStdStreamContext", @@ -39,4 +45,5 @@ "generate_artifacts", "filter_attributes", "checkpoint", + "check_resource_allocation", ] diff --git a/openfl/experimental/utilities/exceptions.py b/openfl/experimental/utilities/exceptions.py index 7d49ee5ad9c..12a307d271e 100644 --- a/openfl/experimental/utilities/exceptions.py +++ b/openfl/experimental/utilities/exceptions.py @@ -7,7 +7,13 @@ def __init__(self, *args: object) -> None: pass -class GPUResourcesNotAvailableError(Exception): +class ResourcesNotAvailableError(Exception): + def __init__(self, *args: object) -> None: + super().__init__(*args) + pass + + +class ResourcesAllocationError(Exception): def __init__(self, *args: object) -> None: super().__init__(*args) pass diff --git a/openfl/experimental/utilities/metaflow_utils.py b/openfl/experimental/utilities/metaflow_utils.py index 9dccca14871..77112df15ce 100644 --- a/openfl/experimental/utilities/metaflow_utils.py +++ b/openfl/experimental/utilities/metaflow_utils.py @@ -379,8 +379,6 @@ def __init__(self, flow: Type[FLSpec], backend: str = "ray"): """ self.backend = backend self.flow_name = flow.__name__ - self._graph = FlowGraph(flow) - self._steps = [getattr(flow, node.name) for node in self._graph] if backend == "ray": self.counter = Counter.remote() else: diff --git a/openfl/experimental/utilities/resources.py b/openfl/experimental/utilities/resources.py index 126c6b0e175..08df76c941a 100644 --- 
a/openfl/experimental/utilities/resources.py +++ b/openfl/experimental/utilities/resources.py @@ -3,9 +3,28 @@ """openfl.experimental.utilities.resources module.""" -from torch.cuda import device_count +from logging import getLogger +from subprocess import run, PIPE +logger = getLogger(__name__) -def get_number_of_gpus(): - # TODO remove pytorch dependency - return device_count() + +def get_number_of_gpus() -> int: + """ + Returns number of NVIDIA GPUs attached to the machine. + + Args: + None + Returns: + int: Number of NVIDIA GPUs + """ + # Execute the nvidia-smi command. + command = "nvidia-smi --list-gpus" + try: + op = run(command.strip().split(), shell=False, stdout=PIPE, stderr=PIPE) + stdout = op.stdout.decode().strip() + return sum(line.startswith("GPU ") for line in stdout.splitlines()) + except FileNotFoundError: + logger.warning(f'No GPUs found! If this is a mistake please try running "{command}" ' + + 'manually.') + return 0 diff --git a/openfl/experimental/utilities/runtime_utils.py b/openfl/experimental/utilities/runtime_utils.py index dc038e3b4e3..e122f754381 100644 --- a/openfl/experimental/utilities/runtime_utils.py +++ b/openfl/experimental/utilities/runtime_utils.py @@ -3,8 +3,11 @@ """openfl.experimental.utilities package.""" +import itertools import inspect +import numpy as np from types import MethodType +from openfl.experimental.utilities import ResourcesAllocationError def parse_attrs(ctx, exclude=[], reserved_words=["next", "runtime", "input"]): @@ -82,7 +85,9 @@ def checkpoint(ctx, parent_func, chkpnt_reserved_words=["next", "runtime"]): if ctx._checkpoint: # all objects will be serialized using Metaflow interface print(f"Saving data artifacts for {parent_func.__name__}") - artifacts_iter, _ = generate_artifacts(ctx=ctx, reserved_words=chkpnt_reserved_words) + artifacts_iter, _ = generate_artifacts( + ctx=ctx, reserved_words=chkpnt_reserved_words + ) task_id = ctx._metaflow_interface.create_task(parent_func.__name__) ctx._metaflow_interface.save_artifacts( artifacts_iter(),
@@ -92,3 +97,65 @@ def checkpoint(ctx, parent_func, chkpnt_reserved_words=["next", "runtime"]): buffer_err=step_stderr, ) print(f"Saved data artifacts for {parent_func.__name__}") + + +def old_check_resource_allocation(num_gpus, each_participant_gpu_usage): + remaining_gpu_memory = {} + # TODO for each GPU the funtion tries see if all participant usages fit into a GPU, it it + # doesn't it removes that + # participant from the participant list, and adds it to the remaining_gpu_memory dict. So any + # sum of GPU requirements above 1 + # triggers this. + # But at this point the funtion will raise an error because remaining_gpu_memory is never + # cleared. + # The participant list should remove the participant if it fits in the gpu and save the + # partipant if it doesn't and continue + # to the next GPU to see if it fits in that one, only if we run out of GPUs should this + # funtion raise an error. + for gpu in np.ones(num_gpus, dtype=int): + for i, (participant_name, participant_gpu_usage) in enumerate( + each_participant_gpu_usage.items() + ): + if gpu == 0: + break + if gpu < participant_gpu_usage: + remaining_gpu_memory.update({participant_name: gpu}) + each_participant_gpu_usage = dict( + itertools.islice(each_participant_gpu_usage.items(), i) + ) + else: + gpu -= participant_gpu_usage + if len(remaining_gpu_memory) > 0: + raise ResourcesAllocationError( + f"Failed to allocate Participant {list(remaining_gpu_memory.keys())} " + + "to specified GPU. 
Please try allocating lesser GPU resources to participants" + ) + + +def check_resource_allocation(num_gpus, each_participant_gpu_usage): + # copy participant dict + need_assigned = each_participant_gpu_usage.copy() + # cycle through all available GPU availability + for gpu in np.ones(num_gpus, dtype=int): + # buffer to cycle though since need_assigned will change sizes as we assign participants + current_dict = need_assigned.copy() + for i, (participant_name, participant_gpu_usage) in enumerate( + current_dict.items() + ): + if gpu == 0: + break + if gpu < participant_gpu_usage: + # participant doesn't fitm break to next GPU + break + else: + # if participant fits remove from need_assigned + need_assigned.pop(participant_name) + gpu -= participant_gpu_usage + + # raise error if after going though all gpus there are still participants that needed to be + # assigned + if len(need_assigned) > 0: + raise ResourcesAllocationError( + f"Failed to allocate Participant {list(need_assigned.keys())} " + + "to specified GPU. 
Please try allocating lesser GPU resources to participants" + ) diff --git a/openfl/experimental/utilities/stream_redirect.py b/openfl/experimental/utilities/stream_redirect.py index c2fc841ba67..0a3d8b99426 100644 --- a/openfl/experimental/utilities/stream_redirect.py +++ b/openfl/experimental/utilities/stream_redirect.py @@ -48,6 +48,9 @@ def write(self, message): self.__stdDestination.write(message) self.__stdBuffer.write(message) + def flush(self): + pass + class RedirectStdStreamContext: """ diff --git a/openfl/experimental/utilities/ui.py b/openfl/experimental/utilities/ui.py index 3d44a4e1dee..983060be047 100644 --- a/openfl/experimental/utilities/ui.py +++ b/openfl/experimental/utilities/ui.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openfl.experimental.utilities.metaflow_utils import DefaultCard +from openfl.experimental.utilities.metaflow_utils import DefaultCard, FlowGraph from pathlib import Path import os import webbrowser @@ -19,7 +19,10 @@ def __init__( self.show_html = show_html self.run_id = run_id self.flow_name = flow_obj.__class__.__name__ - self.graph_dict, _ = flow_obj._metaflow_interface._graph.output_steps() + self._graph = FlowGraph(flow_obj.__class__) + self._steps = [getattr(flow_obj, node.name) for node in self._graph] + + self.graph_dict, _ = self._graph.output_steps() self.show_ui() def get_pathspec(self): diff --git a/openfl/experimental/utilities/utils.py b/openfl/experimental/utilities/utils.py deleted file mode 100644 index 4627e0a0626..00000000000 --- a/openfl/experimental/utilities/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (C) 2020-2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from torch.cuda import device_count - - -def get_number_of_gpus() -> int: - return device_count() diff --git a/setup.py b/setup.py index 845d9608132..1b3b14ac74b 100644 --- a/setup.py +++ b/setup.py @@ -103,6 +103,10 @@ def run(self): 'openfl.databases.utilities', 
'openfl.experimental', 'openfl.experimental.interface', + 'openfl.experimental.interface.keras', + 'openfl.experimental.interface.keras.aggregation_functions', + 'openfl.experimental.interface.torch', + 'openfl.experimental.interface.torch.aggregation_functions', 'openfl.experimental.placement', 'openfl.experimental.runtime', 'openfl.experimental.utilities', diff --git a/tests/github/experimental/requirements_experimental_localruntime_tests.txt b/tests/github/experimental/requirements_experimental_localruntime_tests.txt new file mode 100644 index 00000000000..3e2ac0622f7 --- /dev/null +++ b/tests/github/experimental/requirements_experimental_localruntime_tests.txt @@ -0,0 +1,5 @@ +dill==0.3.6 +metaflow==2.7.15 +ray==2.2.0 +torch +torchvision \ No newline at end of file diff --git a/tests/github/experimental/testflow_datastore_cli.py b/tests/github/experimental/testflow_datastore_cli.py index 40c728f0c05..9b40f765cf6 100644 --- a/tests/github/experimental/testflow_datastore_cli.py +++ b/tests/github/experimental/testflow_datastore_cli.py @@ -284,31 +284,42 @@ def display_validate_errors(validate_flow_error): if __name__ == "__main__": # Setup participants - aggregator = Aggregator() - aggregator.private_attributes = {} + aggregator_ = Aggregator() # Setup collaborators with private attributes collaborator_names = ["Portland", "Seattle", "Chandler", "Bangalore"] - collaborators = [Collaborator(name=name) for name in collaborator_names] - - for idx, collab in enumerate(collaborators): - local_train = deepcopy(mnist_train) - local_test = deepcopy(mnist_test) - local_train.data = mnist_train.data[idx:: len(collaborators)] - local_train.targets = mnist_train.targets[idx:: len(collaborators)] - local_test.data = mnist_test.data[idx:: len(collaborators)] - local_test.targets = mnist_test.targets[idx:: len(collaborators)] - collab.private_attributes = { + + def callable_to_initialize_collaborator_private_attributes( + n_collaborators, index, train_dataset, test_dataset, 
batch_size + ): + local_train = deepcopy(train_dataset) + local_test = deepcopy(test_dataset) + local_train.data = mnist_train.data[index::n_collaborators] + local_train.targets = mnist_train.targets[index::n_collaborators] + local_test.data = mnist_test.data[index::n_collaborators] + local_test.targets = mnist_test.targets[index::n_collaborators] + return { "train_loader": torch.utils.data.DataLoader( - local_train, batch_size=batch_size_train, shuffle=True + local_train, batch_size=batch_size, shuffle=True ), "test_loader": torch.utils.data.DataLoader( - local_test, batch_size=batch_size_train, shuffle=True + local_test, batch_size=batch_size, shuffle=True ), } + collaborators = [] + for idx, collaborator_name in enumerate(collaborator_names): + collaborators.append( + Collaborator( + name=collaborator_name, num_cpus=0, num_gpus=0.0, + private_attributes_callable=callable_to_initialize_collaborator_private_attributes, + n_collaborators=len(collaborator_names), index=idx, train_dataset=mnist_train, + test_dataset=mnist_test, batch_size=32 + ) + ) + local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators, backend="ray" + aggregator=aggregator_, collaborators=collaborators, backend="ray" ) print(f"Local runtime collaborators = {local_runtime.collaborators}") num_rounds = 5 diff --git a/tests/github/experimental/testflow_exclude.py b/tests/github/experimental/testflow_exclude.py index c2c180a1127..1b5b14f1ff7 100644 --- a/tests/github/experimental/testflow_exclude.py +++ b/tests/github/experimental/testflow_exclude.py @@ -85,9 +85,7 @@ def test_exclude_agg_to_collab(self): + f"{bcolors.ENDC}" ) else: - TestFlowExclude.exclude_error_list.append( - "test_exclude_agg_to_collab" - ) + TestFlowExclude.exclude_error_list.append("test_exclude_agg_to_collab") print( f"{bcolors.FAIL} ... 
Exclude test failed in test_exclude_agg_to_collab " + f"{bcolors.ENDC}" @@ -119,9 +117,7 @@ def test_exclude_collab_to_collab(self): + f"{bcolors.ENDC}" ) else: - TestFlowExclude.exclude_error_list.append( - "test_exclude_collab_to_collab" - ) + TestFlowExclude.exclude_error_list.append("test_exclude_collab_to_collab") print( f"{bcolors.FAIL} ... Exclude test failed in test_exclude_collab_to_collab " + f"{bcolors.ENDC}" @@ -154,23 +150,15 @@ def join(self, inputs): ) if validation: - print( - f"{bcolors.OKGREEN} ... Exclude test passed in join {bcolors.ENDC}" - ) + print(f"{bcolors.OKGREEN} ... Exclude test passed in join {bcolors.ENDC}") else: TestFlowExclude.exclude_error_list.append("join") - print( - f"{bcolors.FAIL} ... Exclude test failed in join {bcolors.ENDC}" - ) + print(f"{bcolors.FAIL} ... Exclude test failed in join {bcolors.ENDC}") - print( - f"\n{bcolors.UNDERLINE}Exclude attribute test summary: {bcolors.ENDC}\n" - ) + print(f"\n{bcolors.UNDERLINE}Exclude attribute test summary: {bcolors.ENDC}\n") if TestFlowExclude.exclude_error_list: - validated_exclude_variables = ", ".join( - TestFlowExclude.exclude_error_list - ) + validated_exclude_variables = ", ".join(TestFlowExclude.exclude_error_list) print( f"{bcolors.FAIL}...Test case failed for {validated_exclude_variables} " + f"{bcolors.ENDC}" @@ -199,20 +187,19 @@ def end(self): if __name__ == "__main__": # Setup participants aggregator = Aggregator() - aggregator.private_attributes = {} - # Setup collaborators with private attributes + # Setup collaborators collaborator_names = ["Portland", "Chandler", "Bangalore", "Delhi"] - collaborators = [Collaborator(name=name) for name in collaborator_names] + collaborators = [] + for collaborator_name in collaborator_names: + collaborators.append(Collaborator(name=collaborator_name)) - local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators - ) + local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators) if 
len(sys.argv) > 1: - if sys.argv[1] == 'ray': + if sys.argv[1] == "ray": local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators, backend='ray' + aggregator=aggregator, collaborators=collaborators, backend="ray" ) print(f"Local runtime collaborators = {local_runtime.collaborators}") diff --git a/tests/github/experimental/testflow_include.py b/tests/github/experimental/testflow_include.py index ea4455df85c..8403b557c33 100644 --- a/tests/github/experimental/testflow_include.py +++ b/tests/github/experimental/testflow_include.py @@ -87,9 +87,7 @@ def test_include_agg_to_collab(self): + f"{bcolors.ENDC}" ) else: - TestFlowInclude.include_error_list.append( - "test_include_agg_to_collab" - ) + TestFlowInclude.include_error_list.append("test_include_agg_to_collab") print( f"{bcolors.FAIL} ... Include test failed in test_include_agg_to_collab " + f"{bcolors.ENDC}" @@ -119,9 +117,7 @@ def test_include_collab_to_collab(self): + f"{bcolors.ENDC}" ) else: - TestFlowInclude.include_error_list.append( - "test_include_collab_to_collab" - ) + TestFlowInclude.include_error_list.append("test_include_collab_to_collab") print( f"{bcolors.FAIL} ... Include test failed in test_include_collab_to_collab " + f"{bcolors.ENDC}" @@ -154,23 +150,15 @@ def join(self, inputs): ) if validation: - print( - f"{bcolors.OKGREEN} ... Include test passed in join {bcolors.ENDC}" - ) + print(f"{bcolors.OKGREEN} ... Include test passed in join {bcolors.ENDC}") else: TestFlowInclude.include_error_list.append("join") - print( - f"{bcolors.FAIL} ... Include test failed in join {bcolors.ENDC}" - ) + print(f"{bcolors.FAIL} ... 
Include test failed in join {bcolors.ENDC}") - print( - f"\n{bcolors.UNDERLINE}Include attribute test summary: {bcolors.ENDC}\n" - ) + print(f"\n{bcolors.UNDERLINE}Include attribute test summary: {bcolors.ENDC}\n") if TestFlowInclude.include_error_list: - validated_include_variables = ",".join( - TestFlowInclude.include_error_list - ) + validated_include_variables = ",".join(TestFlowInclude.include_error_list) print( f"{bcolors.FAIL} ...Test case failed for {validated_include_variables} " + f"{bcolors.ENDC}" @@ -199,20 +187,22 @@ def end(self): if __name__ == "__main__": # Setup participants aggregator = Aggregator() - aggregator.private_attributes = {} - # Setup collaborators with private attributes + # Setup collaborators collaborator_names = ["Portland", "Chandler", "Bangalore", "Delhi"] - collaborators = [Collaborator(name=name) for name in collaborator_names] + collaborators = [] + for collaborator_name in collaborator_names: + collaborators.append(Collaborator(name=collaborator_name)) local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators + aggregator=aggregator, + collaborators=collaborators, ) if len(sys.argv) > 1: - if sys.argv[1] == 'ray': + if sys.argv[1] == "ray": local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators, backend='ray' + aggregator=aggregator, collaborators=collaborators, backend="ray" ) print(f"Local runtime collaborators = {local_runtime.collaborators}") diff --git a/tests/github/experimental/testflow_include_exclude.py b/tests/github/experimental/testflow_include_exclude.py index d88bc2a2bcb..7561605d10d 100644 --- a/tests/github/experimental/testflow_include_exclude.py +++ b/tests/github/experimental/testflow_include_exclude.py @@ -39,9 +39,7 @@ def start(self): self.exclude_agg_to_agg = 10 self.include_agg_to_agg = 100 - self.next( - self.test_include_exclude_agg_to_agg, exclude=["exclude_agg_to_agg"] - ) + self.next(self.test_include_exclude_agg_to_agg, 
exclude=["exclude_agg_to_agg"]) @aggregator def test_include_exclude_agg_to_agg(self): @@ -202,24 +200,30 @@ def end(self): if __name__ == "__main__": # Setup participants aggregator = Aggregator() - aggregator.private_attributes = {} - # Setup collaborators with private attributes + # Setup collaborators collaborator_names = ["Portland", "Chandler", "Bangalore", "Delhi"] - collaborators = [Collaborator(name=name) for name in collaborator_names] + collaborators = [] + for collaborator_name in collaborator_names: + collaborators.append( + Collaborator( + name=collaborator_name, + ) + ) local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators + aggregator=aggregator, + collaborators=collaborators, ) if len(sys.argv) > 1: - if sys.argv[1] == 'ray': + if sys.argv[1] == "ray": local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators, backend='ray' + aggregator=aggregator, collaborators=collaborators, backend="ray" ) print(f"Local runtime collaborators = {local_runtime.collaborators}") - flflow = TestFlowIncludeExclude(checkpoint=False) + flflow = TestFlowIncludeExclude(checkpoint=True) flflow.runtime = local_runtime for i in range(5): print(f"Starting round {i}...") diff --git a/tests/github/experimental/testflow_internalloop.py b/tests/github/experimental/testflow_internalloop.py index 6c39bd6d7f5..140bae5214a 100644 --- a/tests/github/experimental/testflow_internalloop.py +++ b/tests/github/experimental/testflow_internalloop.py @@ -50,9 +50,7 @@ def agg_model_mean(self): """ self.agg_mean_value = np.mean(self.model) - print( - f": {self.input} Mean of Agg model: {self.agg_mean_value} " - ) + print(f": {self.input} Mean of Agg model: {self.agg_mean_value} ") self.next(self.collab_model_update) @collaborator @@ -78,9 +76,7 @@ def join(self, inputs): """ Joining inputs from collaborators """ - self.agg_mean = sum(input.local_mean_value for input in inputs) / len( - inputs - ) + self.agg_mean = sum(input.local_mean_value 
for input in inputs) / len(inputs) print(f"Aggregated mean : {self.agg_mean}") self.next(self.internal_loop) @@ -212,12 +208,10 @@ def display_validate_errors(validate_flow_error): if __name__ == "__main__": - # Setup participants aggregator = Aggregator() - aggregator.private_attributes = {} - # Setup collaborators with private attributes + # Setup collaborators collaborator_names = [ "Portland", "Seattle", @@ -228,16 +222,16 @@ def display_validate_errors(validate_flow_error): "London", "New York", ] - collaborators = [Collaborator(name=name) for name in collaborator_names] + collaborators = [] + for collaborator_name in collaborator_names: + collaborators.append(Collaborator(name=collaborator_name)) - local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators - ) + local_runtime = LocalRuntime(aggregator=aggregator, collaborators=collaborators) if len(sys.argv) > 1: - if sys.argv[1] == 'ray': + if sys.argv[1] == "ray": local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators, backend='ray' + aggregator=aggregator, collaborators=collaborators, backend="ray" ) print(f"Local runtime collaborators = {local_runtime.collaborators}") diff --git a/tests/github/experimental/testflow_privateattributes.py b/tests/github/experimental/testflow_privateattributes.py index 30320e15e32..11173a84323 100644 --- a/tests/github/experimental/testflow_privateattributes.py +++ b/tests/github/experimental/testflow_privateattributes.py @@ -1,10 +1,11 @@ # Copyright (C) 2020-2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import sys +import numpy as np from openfl.experimental.interface import FLSpec, Aggregator, Collaborator from openfl.experimental.runtime import LocalRuntime from openfl.experimental.placement import aggregator, collaborator -import numpy as np class bcolors: # NOQA: N801 @@ -72,9 +73,7 @@ def collaborator_step_a(self): self.exclude_collab_to_collab = 2 self.include_collab_to_collab = 22 - self.next( - 
self.collaborator_step_b, exclude=["exclude_collab_to_collab"] - ) + self.next(self.collaborator_step_b, exclude=["exclude_collab_to_collab"]) @collaborator def collaborator_step_b(self): @@ -105,17 +104,17 @@ def join(self, inputs): + f" not accessible {bcolors.ENDC}" ) - for input in enumerate(inputs): + for idx, collab in enumerate(inputs): if ( - hasattr(input, "train_loader") is True - or hasattr(input, "test_loader") is True + hasattr(collab, "train_loader") is True + or hasattr(collab, "test_loader") is True ): # Error - we are able to access collaborator attributes TestFlowPrivateAttributes.error_list.append( "join_collaborator_attributes_found" ) print( - f"{bcolors.FAIL} ... Attribute test failed in Join - COllaborator: {collab}" + f"{bcolors.FAIL} ... Attribute test failed in Join - Collaborator: {collab}" + f" private attributes accessible {bcolors.ENDC}" ) @@ -175,10 +174,7 @@ def validate_collab_private_attr(self, private_attr, step_name): def validate_agg_private_attrs(self, private_attr_1, private_attr_2, step_name): # Collaborator should only be able to access its own attributes - if ( - hasattr(self, private_attr_1) is False - or hasattr(self, private_attr_2) is False - ): + if hasattr(self, private_attr_1) is False or hasattr(self, private_attr_2) is False: TestFlowPrivateAttributes.error_list.append( step_name + "collab_attributes_not_found" ) @@ -199,13 +195,17 @@ def validate_agg_private_attrs(self, private_attr_1, private_attr_2, step_name): if __name__ == "__main__": - # Setup Aggregator with private attributes + # Setup Aggregator with private attributes via callable function aggregator = Aggregator() - aggregator.private_attributes = { - "test_loader": np.random.rand(10, 28, 28) # Random data - } - # Setup collaborators with private attributes + def callable_to_initialize_aggregator_private_attributes(): + return {"test_loader": np.random.rand(10, 28, 28)} # Random data + + aggregator = Aggregator( + name="agg", + 
private_attributes_callable=callable_to_initialize_aggregator_private_attributes, + ) + # Setup collaborators with private attributes via callable function collaborator_names = [ "Portland", "Seattle", @@ -218,19 +218,33 @@ def validate_agg_private_attrs(self, private_attr_1, private_attr_2, step_name): "Beijing", "Tokyo", ] - collaborators = [Collaborator(name=name) for name in collaborator_names] - for idx, collab in enumerate(collaborators): - collab.private_attributes = { + + def callable_to_initialize_collaborator_private_attributes(index): + return { "train_loader": np.random.rand(idx * 50, 28, 28), "test_loader": np.random.rand(idx * 10, 28, 28), } + collaborators = [] + for idx, collaborator_name in enumerate(collaborator_names): + collaborators.append( + Collaborator( + name=collaborator_name, + private_attributes_callable=callable_to_initialize_collaborator_private_attributes, + index=idx, + ) + ) + + backend = "single_process" + if len(sys.argv) > 1 and sys.argv[1] == "ray": + backend = "ray" + local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators + aggregator=aggregator, collaborators=collaborators, backend=backend ) print(f"Local runtime collaborators = {local_runtime.collaborators}") - flflow = TestFlowPrivateAttributes(checkpoint=False) + flflow = TestFlowPrivateAttributes(checkpoint=True) flflow.runtime = local_runtime for i in range(5): print(f"Starting round {i}...") diff --git a/tests/github/experimental/testflow_reference.py b/tests/github/experimental/testflow_reference.py index 8dab8a37bb6..ed93d1c7caf 100644 --- a/tests/github/experimental/testflow_reference.py +++ b/tests/github/experimental/testflow_reference.py @@ -65,7 +65,6 @@ def start(self): @aggregator def test_create_agg_attr(self): - """ Create different types of objects. """ @@ -93,7 +92,6 @@ def test_create_agg_attr(self): @collaborator def test_create_collab_attr(self): - """ Modify the attirbutes of aggregator to validate the references. 
Create different types of objects. @@ -109,16 +107,14 @@ def test_create_collab_attr(self): self.collab_attr_str_one = "Test string data in collab " + self.input self.collab_attr_list_one = [1, 2, 5, 6, 7, 8] self.collab_attr_dict_one = {key: key for key in range(5)} - self.collab_attr_file_one = io.StringIO( - "Test file data in collaborator" - ) + self.collab_attr_file_one = io.StringIO("Test file data in collaborator") self.collab_attr_math_one = math.sqrt(self.index) self.collab_attr_complex_num_one = complex(self.index, self.index) self.collab_attr_log_one = logging.getLogger( "Test logger data in collaborator " + self.input ) - # append self attributes of collaborators + # append attributes of collaborator TestFlowReference.step_one_collab_attrs.append(self) if len(TestFlowReference.step_one_collab_attrs) >= 2: @@ -180,7 +176,7 @@ def join(self, inputs): all_shared_attr = "" print(f"\n{bcolors.UNDERLINE}Reference test summary: {bcolors.ENDC}\n") - for key, val in TestFlowReference.all_ref_error_dict.items(): + for val in TestFlowReference.all_ref_error_dict.values(): all_shared_attr = all_shared_attr + ",".join(val) if all_shared_attr: print( @@ -230,33 +226,36 @@ def filter_attrs(attr_list): return valid_attrs -def find_matched_references(collab_attr_list, all_collborators): +def find_matched_references(collab_attr_list, all_collaborators): """ Iterate attributes of collborator and capture the duplicate reference + return: dict: { + 'Portland': ['failed attributes'], 'Seattle': [], + } """ matched_ref_dict = {} - previous_collaborator = "" - # Initialize dictionary with collborator as key and value as empty list - # to hold duplicated attr list - for collborator_name in all_collborators: - matched_ref_dict[collborator_name.input] = [] - - # Iterate the attributes and get duplicate attribute id - for attr in collab_attr_list: - di = {attr: []} - for collab in all_collborators: - attr_id = id(getattr(collab, attr)) - collaborator_name = collab.input - if 
attr_id not in di.get(attr): - di.get(attr).append(attr_id) - else: - # append the dict with collabartor as key and attrs as value having same reference - matched_ref_dict.get(collaborator_name).append(attr) - print( - f"{bcolors.FAIL} ... Reference test failed - {collaborator_name} sharing same " - + f"{attr} reference with {previous_collaborator} {bcolors.ENDC}" - ) - previous_collaborator = collaborator_name + for i in range(len(all_collaborators)): + matched_ref_dict[all_collaborators[i].input] = [] + + # For each attribute in the collaborator attribute list, check if any of the collaborator + # attributes are shared with another collaborator + for attr_name in collab_attr_list: + for i, curr_collab in enumerate(all_collaborators): + # Compare the current collaborator with the collaborator(s) that come(s) after it. + for next_collab in all_collaborators[i + 1:]: + # Check if both collaborators have the current attribute + if hasattr(curr_collab, attr_name) and hasattr(next_collab, attr_name): + # Check if both collaborators are sharing same reference + if getattr(curr_collab, attr_name) is getattr( + next_collab, attr_name + ): + matched_ref_dict[curr_collab.input].append(attr_name) + print( + f"{bcolors.FAIL} ... 
Reference test failed - {curr_collab.input} \ + sharing same " + + f"{attr_name} reference with {next_collab.input} {bcolors.ENDC}" + ) + return matched_ref_dict @@ -274,9 +273,9 @@ def validate_collab_references(matched_ref_dict): if collborators_sharing_ref: for collab in collborators_sharing_ref: if collab not in TestFlowReference.all_ref_error_dict: - TestFlowReference.all_ref_error_dict[ + TestFlowReference.all_ref_error_dict[collab] = matched_ref_dict.get( collab - ] = matched_ref_dict.get(collab) + ) if not reference_flag: print( @@ -291,9 +290,7 @@ def validate_agg_attr_ref(agg_attrs, agg_obj): """ attr_flag = False for attr in agg_attrs: - if TestFlowReference.agg_attr_dict.get(attr) == id( - getattr(agg_obj, attr) - ): + if TestFlowReference.agg_attr_dict.get(attr) == id(getattr(agg_obj, attr)): attr_flag = True if not attr_flag: print( @@ -308,7 +305,6 @@ def validate_agg_attr_ref(agg_attrs, agg_obj): def validate_agg_collab_references(all_collborators, agg_obj, agg_attrs): - """ Iterate attributes of aggregator and collborator to capture the mismatched references. 
""" @@ -322,7 +318,7 @@ def validate_agg_collab_references(all_collborators, agg_obj, agg_attrs): agg_attr_id = id(getattr(agg_obj, attr)) for collab in all_collborators: collab_attr_id = id(getattr(collab, attr)) - if agg_attr_id == collab_attr_id: + if agg_attr_id is collab_attr_id: attr_ref_flag = True mis_matched_ref.get(collab).append(attr) @@ -338,26 +334,34 @@ def validate_agg_collab_references(all_collborators, agg_obj, agg_attrs): if __name__ == "__main__": - # Setup participants aggregator = Aggregator() - aggregator.private_attributes = {} - ref_exception_list = [] + # Setup collaborators private attributes via callable function + collaborator_names = ["Portland", "Seattle", "Chandler", "Bangalore"] - # Setup collaborators with private attributes - collaborator_names = ["Portland", "Seattle"] # , 'Chandler', 'Bangalore'] - collaborators = [Collaborator(name=name) for name in collaborator_names] - collaborator.private_attributes = {} + def callable_to_initialize_collaborator_private_attributes(index): + return {"index": index + 1} + + collaborators = [] + for idx, collaborator_name in enumerate(collaborator_names): + collaborators.append( + Collaborator( + name=collaborator_name, + private_attributes_callable=callable_to_initialize_collaborator_private_attributes, + index=idx, + ) + ) local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators + aggregator=aggregator, + collaborators=collaborators, ) if len(sys.argv) > 1: - if sys.argv[1] == 'ray': + if sys.argv[1] == "ray": local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators, backend='ray' + aggregator=aggregator, collaborators=collaborators, backend="ray" ) print(f"Local runtime collaborators = {local_runtime.collaborators}") @@ -365,9 +369,6 @@ def validate_agg_collab_references(all_collborators, agg_obj, agg_attrs): testflow = TestFlowReference(checkpoint=True) testflow.runtime = local_runtime - for idx, collab in enumerate(collaborators): - 
collab.private_attributes = {"index": idx + 1} - for i in range(2): print(f"Starting round {i}...") testflow.run() diff --git a/tests/github/experimental/testflow_reference_with_exclude.py b/tests/github/experimental/testflow_reference_with_exclude.py index acc3f03759c..0b5ffa93b75 100644 --- a/tests/github/experimental/testflow_reference_with_exclude.py +++ b/tests/github/experimental/testflow_reference_with_exclude.py @@ -46,8 +46,8 @@ class TestFlowReferenceWithExclude(FLSpec): """ - step_one_collab_attrs = {} - step_two_collab_attrs = {} + step_one_collab_attrs = [] + step_two_collab_attrs = [] all_ref_error_dict = {} @aggregator @@ -65,7 +65,6 @@ def start(self): @aggregator def test_create_agg_attr(self): - """ Create different types of objects """ @@ -86,52 +85,43 @@ def test_create_agg_attr(self): @collaborator def test_create_collab_attr(self): - """ Create different types of objects """ self.collab_attr_list_one = [1, 2, 3, 5, 6, 8] self.collab_attr_dict_one = {key: key for key in range(5)} - attr_collab_dict, collab_attr_list = create_collab_dict(self) - TestFlowReferenceWithExclude.step_one_collab_attrs.update( - attr_collab_dict - ) + TestFlowReferenceWithExclude.step_one_collab_attrs.append(self) if ( len(TestFlowReferenceWithExclude.step_one_collab_attrs) >= MIN_COLLECTION_COUNT ): - matched_ref_dict = find_match_ref_at_step( + collab_attr_list = filter_attrs(inspect.getmembers(self)) + matched_ref_dict = find_matched_references( collab_attr_list, TestFlowReferenceWithExclude.step_one_collab_attrs, ) validate_references(matched_ref_dict) - self.next( - self.test_create_more_collab_attr, exclude=["collab_attr_dict_one"] - ) + self.next(self.test_create_more_collab_attr, exclude=["collab_attr_dict_one"]) @collaborator def test_create_more_collab_attr(self): - """ Create different types of objects """ - self.collab_attr_list_two = [1, 2, 3, 5, 6, 8] self.collab_attr_dict_two = {key: key for key in range(5)} - attr_collab_dict, collab_attr_list = 
create_collab_dict(self) - TestFlowReferenceWithExclude.step_two_collab_attrs.update( - attr_collab_dict - ) + TestFlowReferenceWithExclude.step_two_collab_attrs.append(self) if ( len(TestFlowReferenceWithExclude.step_two_collab_attrs) >= MIN_COLLECTION_COUNT ): - matched_ref_dict = find_match_ref_at_step( + collab_attr_list = filter_attrs(inspect.getmembers(self)) + matched_ref_dict = find_matched_references( collab_attr_list, TestFlowReferenceWithExclude.step_two_collab_attrs, ) @@ -156,10 +146,7 @@ def join(self, inputs): f"\n{bcolors.UNDERLINE}Reference with exclude keyword test summary: {bcolors.ENDC}\n" ) - for ( - key, - val, - ) in TestFlowReferenceWithExclude.all_ref_error_dict.items(): + for val in TestFlowReferenceWithExclude.all_ref_error_dict.values(): all_shared_attr = all_shared_attr + ",".join(val) if all_shared_attr: @@ -167,9 +154,7 @@ def join(self, inputs): f"{bcolors.FAIL}...Test case failed for {all_shared_attr} {bcolors.ENDC}" ) else: - print( - f"{bcolors.OKGREEN}...Test case passed for all the attributes." 
- ) + print(f"{bcolors.OKGREEN}...Test case passed for all the attributes.") self.next(self.end) @aggregator @@ -186,8 +171,8 @@ def end(self): ) ) - TestFlowReferenceWithExclude.step_one_collab_attrs = {} - TestFlowReferenceWithExclude.step_two_collab_attrs = {} + TestFlowReferenceWithExclude.step_one_collab_attrs = [] + TestFlowReferenceWithExclude.step_two_collab_attrs = [] TestFlowReferenceWithExclude.all_ref_error_dict = {} @@ -205,33 +190,36 @@ def filter_attrs(attr_list): return valid_attrs -def find_matched_references(collab_attr_list, all_collborators): +def find_matched_references(collab_attr_list, all_collaborators): """ Iterate attributes of collborator and capture the duplicate reference + return: dict: { + 'Portland': ['failed attributes'], 'Seattle': [], + } """ matched_ref_dict = {} - previous_collaborator = "" - # Initialize dictionary with collborator as key and value as empty list to hold - # duplicated attr list - for collborator_name in all_collborators: - matched_ref_dict[collborator_name.input] = [] - - # Iterate the attributes and get duplicate attribute id - for attr in collab_attr_list: - attr_dict = {attr: []} - for collab in all_collborators: - attr_id = id(getattr(collab, attr)) - collaborator_name = collab.input - if attr_id not in attr_dict.get(attr): - attr_dict.get(attr).append(attr_id) - else: - # append the dict with collabartor as key and attrs as value having same reference - matched_ref_dict.get(collaborator_name).append(attr) - print( - f"{bcolors.FAIL} ... 
Reference test failed - {collaborator_name} sharing same " - + f"{attr} reference with {previous_collaborator} {bcolors.ENDC}" - ) - previous_collaborator = collaborator_name + for i in range(len(all_collaborators)): + matched_ref_dict[all_collaborators[i].input] = [] + + # For each attribute in the collaborator attribute list, check if any of the collaborator + # attributes are shared with another collaborator + for attr_name in collab_attr_list: + for i, curr_collab in enumerate(all_collaborators): + # Compare the current collaborator with the collaborator(s) that come(s) after it. + for next_collab in all_collaborators[i + 1:]: + # Check if both collaborators have the current attribute + if hasattr(curr_collab, attr_name) and hasattr(next_collab, attr_name): + # Check if both collaborators are sharing same reference + if getattr(curr_collab, attr_name) is getattr( + next_collab, attr_name + ): + matched_ref_dict[curr_collab.input].append(attr_name) + print( + f"{bcolors.FAIL} ... Reference test failed - {curr_collab.input} \ + sharing same " + + f"{attr_name} reference with {next_collab.input} {bcolors.ENDC}" + ) + return matched_ref_dict @@ -254,85 +242,33 @@ def validate_references(matched_ref_dict): ] = matched_ref_dict.get(collab) if not reference_flag: - print( - f"{bcolors.OKGREEN} Pass : Reference test passed {bcolors.ENDC}" - ) - - -def create_collab_dict(collab): - """ - saving the collaborator and its attributes to compare with other collaborator references. 
- return : dict ({ - 'Portland': {'collab_attr_dict_one': 140512653871680}, - 'Seattle': {'collab_attr_dict_one': 140512653871936} - }) - """ - attr_collab_dict = {} - collab_attr_list = filter_attrs(inspect.getmembers(collab)) - for attr in collab_attr_list: - attr_id = id(getattr(collab, attr)) - if attr_collab_dict.get(collab.input): - attr_collab_dict.get(collab.input)[attr] = attr_id - else: - attr_collab_dict[collab.input] = {} - attr_collab_dict.get(collab.input)[attr] = attr_id - return attr_collab_dict, collab_attr_list - - -def find_match_ref_at_step(collab_attr_list, all_collborators): - """ - Determines whether the current attributes are shared with - other participant attributes. If attributes are shared, - the test fails - """ - collab_names = all_collborators.keys() - matched_ref_dict = {} - for collborator_name in collab_names: - matched_ref_dict[collborator_name] = [] - - previous_collaborator = "" - for attr in collab_attr_list: - attr_dict = {attr: []} - for collborator_name in all_collborators.keys(): - attr_id = all_collborators[collborator_name][attr] - if attr_id not in attr_dict.get(attr): - attr_dict.get(attr).append(attr_id) - else: - matched_ref_dict.get(collborator_name).append(attr) - print( - f"{bcolors.FAIL} ... 
Reference test failed - {collborator_name} sharing same " - + f"{attr} reference with {previous_collaborator} {bcolors.ENDC}" - ) - - previous_collaborator = collborator_name - - return matched_ref_dict + print(f"{bcolors.OKGREEN} Pass : Reference test passed {bcolors.ENDC}") if __name__ == "__main__": - # Setup participants aggregator = Aggregator() - aggregator.private_attributes = {} - # Setup collaborators with private attributes + # Setup collaborators collaborator_names = ["Portland", "Seattle", "Chandler", "Bangalore"] - collaborators = [Collaborator(name=name) for name in collaborator_names] - collaborator.private_attributes = {} + collaborators = [] + for idx, collaborator_name in enumerate(collaborator_names): + collaborators.append(Collaborator(name=collaborator_name)) local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators + aggregator=aggregator, + collaborators=collaborators, ) if len(sys.argv) > 1: - if sys.argv[1] == 'ray': + if sys.argv[1] == "ray": local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators, backend='ray' + aggregator=aggregator, collaborators=collaborators, backend="ray" ) print(f"Local runtime collaborators = {local_runtime.collaborators}") - testflow = TestFlowReferenceWithExclude(checkpoint=False) + testflow = TestFlowReferenceWithExclude(checkpoint=True) testflow.runtime = local_runtime for i in range(5): diff --git a/tests/github/experimental/testflow_reference_with_include.py b/tests/github/experimental/testflow_reference_with_include.py index 6e1ff763272..954b20ae5a5 100644 --- a/tests/github/experimental/testflow_reference_with_include.py +++ b/tests/github/experimental/testflow_reference_with_include.py @@ -45,8 +45,8 @@ class TestFlowReferenceWithInclude(FLSpec): """ - step_one_collab_attrs = {} - step_two_collab_attrs = {} + step_one_collab_attrs = [] + step_two_collab_attrs = [] all_ref_error_dict = {} @aggregator @@ -64,7 +64,6 @@ def start(self): @aggregator def 
test_create_agg_attr(self): - """ Create different types of objects """ @@ -85,7 +84,6 @@ def test_create_agg_attr(self): @collaborator def test_create_collab_attr(self): - """ Modify the attirbutes of aggregator to validate the references. Create different types of objects. @@ -95,25 +93,20 @@ def test_create_collab_attr(self): self.collab_attr_dict_one = {key: key for key in range(5)} # append self attributes of collaborators - attr_collab_dict, collab_attr_list = create_collab_dict(self) - TestFlowReferenceWithInclude.step_one_collab_attrs.update( - attr_collab_dict - ) + TestFlowReferenceWithInclude.step_one_collab_attrs.append(self) if ( len(TestFlowReferenceWithInclude.step_one_collab_attrs) >= MIN_COLLECTION_COUNT ): - matched_ref_dict = find_match_ref_at_step( + collab_attr_list = filter_attrs(inspect.getmembers(self)) + matched_ref_dict = find_matched_references( collab_attr_list, TestFlowReferenceWithInclude.step_one_collab_attrs, ) validate_references(matched_ref_dict) - # must be tested with include functionality - self.next( - self.test_create_more_collab_attr, include=["collab_attr_dict_one"] - ) + self.next(self.test_create_more_collab_attr, include=["collab_attr_dict_one"]) @collaborator def test_create_more_collab_attr(self): @@ -124,16 +117,14 @@ def test_create_more_collab_attr(self): self.collab_attr_list_two = [1, 2, 3, 5, 6, 8] self.collab_attr_dict_two = {key: key for key in range(5)} - attr_collab_dict, collab_attr_list = create_collab_dict(self) - TestFlowReferenceWithInclude.step_two_collab_attrs.update( - attr_collab_dict - ) + TestFlowReferenceWithInclude.step_two_collab_attrs.append(self) if ( len(TestFlowReferenceWithInclude.step_two_collab_attrs) >= MIN_COLLECTION_COUNT ): - matched_ref_dict = find_match_ref_at_step( + collab_attr_list = filter_attrs(inspect.getmembers(self)) + matched_ref_dict = find_matched_references( collab_attr_list, TestFlowReferenceWithInclude.step_two_collab_attrs, ) @@ -154,16 +145,14 @@ def join(self, 
inputs): validate_references(matched_ref_dict) all_shared_attr = "" print(f"\n{bcolors.UNDERLINE}Reference test summary: {bcolors.ENDC}\n") - for key, val in TestFlowReferenceWithInclude.all_ref_error_dict.items(): + for val in TestFlowReferenceWithInclude.all_ref_error_dict.values(): all_shared_attr = all_shared_attr + ",".join(val) if all_shared_attr: print( f"{bcolors.FAIL}...Test case failed for {all_shared_attr} {bcolors.ENDC}" ) else: - print( - f"{bcolors.OKGREEN}...Test case passed for all the attributes." - ) + print(f"{bcolors.OKGREEN}...Test case passed for all the attributes.") self.next(self.end) @aggregator @@ -179,8 +168,8 @@ def end(self): ) ) - TestFlowReferenceWithInclude.step_one_collab_attrs = {} - TestFlowReferenceWithInclude.step_two_collab_attrs = {} + TestFlowReferenceWithInclude.step_one_collab_attrs = [] + TestFlowReferenceWithInclude.step_two_collab_attrs = [] TestFlowReferenceWithInclude.all_ref_error_dict = {} @@ -198,33 +187,36 @@ def filter_attrs(attr_list): return valid_attrs -def find_matched_references(collab_attr_list, all_collborators): +def find_matched_references(collab_attr_list, all_collaborators): """ Iterate attributes of collborator and capture the duplicate reference + return: dict: { + 'Portland': ['failed attributes'], 'Seattle': [], + } """ matched_ref_dict = {} - previous_collaborator = "" - # Initialize dictionary with collborator as key and value as empty list to hold - # duplicated attr list - for collborator_name in all_collborators: - matched_ref_dict[collborator_name.input] = [] - - # Iterate the attributes and get duplicate attribute id - for attr in collab_attr_list: - attr_dict = {attr: []} - for collab in all_collborators: - attr_id = id(getattr(collab, attr)) - collaborator_name = collab.input - if attr_id not in attr_dict.get(attr): - attr_dict.get(attr).append(attr_id) - else: - # append the dict with collabartor as key and attrs as value having same reference - 
matched_ref_dict.get(collaborator_name).append(attr) - print( - f"{bcolors.FAIL} ... Reference test failed - {collaborator_name} sharing same " - + f"{attr} reference with {previous_collaborator} {bcolors.ENDC}" - ) - previous_collaborator = collaborator_name + for i in range(len(all_collaborators)): + matched_ref_dict[all_collaborators[i].input] = [] + + # For each attribute in the collaborator attribute list, check if any of the collaborator + # attributes are shared with another collaborator + for attr_name in collab_attr_list: + for i, curr_collab in enumerate(all_collaborators): + # Compare the current collaborator with the collaborator(s) that come(s) after it. + for next_collab in all_collaborators[i + 1:]: + # Check if both collaborators have the current attribute + if hasattr(curr_collab, attr_name) and hasattr(next_collab, attr_name): + # Check if both collaborators are sharing same reference + if getattr(curr_collab, attr_name) is getattr( + next_collab, attr_name + ): + matched_ref_dict[curr_collab.input].append(attr_name) + print( + f"{bcolors.FAIL} ... Reference test failed - {curr_collab.input} \ + sharing same " + + f"{attr_name} reference with {next_collab.input} {bcolors.ENDC}" + ) + return matched_ref_dict @@ -247,81 +239,33 @@ def validate_references(matched_ref_dict): ] = matched_ref_dict.get(collab) if not reference_flag: - print( - f"{bcolors.OKGREEN} Pass : Reference test passed {bcolors.ENDC}" - ) - - -def create_collab_dict(collab): - """ - saving the collaborator and its attributes to compare with other collaborator refences. 
- return : dict ({ - 'Portland': {'collab_attr_dict_one': 140512653871680}, - 'Seattle': {'collab_attr_dict_one': 140512653871936} - }) - """ - attr_collab_dict = {} - collab_attr_list = filter_attrs(inspect.getmembers(collab)) - for attr in collab_attr_list: - attr_id = id(getattr(collab, attr)) - if attr_collab_dict.get(collab.input): - attr_collab_dict.get(collab.input)[attr] = attr_id - else: - attr_collab_dict[collab.input] = {} - attr_collab_dict.get(collab.input)[attr] = attr_id - return attr_collab_dict, collab_attr_list - - -def find_match_ref_at_step(collab_attr_list, all_collborators): - collab_names = all_collborators.keys() - - matched_ref_dict = {} - for collborator_name in collab_names: - matched_ref_dict[collborator_name] = [] - - previous_collaborator = "" - for attr in collab_attr_list: - attr_dict = {attr: []} - for collborator_name in all_collborators.keys(): - attr_id = all_collborators[collborator_name][attr] - if attr_id not in attr_dict.get(attr): - attr_dict.get(attr).append(attr_id) - else: - matched_ref_dict.get(collborator_name).append(attr) - print( - f"{bcolors.FAIL} ... 
Reference test failed - {collborator_name} sharing same " - + f"{attr} reference with {previous_collaborator} {bcolors.ENDC}" - ) - - previous_collaborator = collborator_name - - return matched_ref_dict + print(f"{bcolors.OKGREEN} Pass : Reference test passed {bcolors.ENDC}") if __name__ == "__main__": - # Setup participants aggregator = Aggregator() - aggregator.private_attributes = {} - # Setup collaborators with private attributes + # Setup collaborators collaborator_names = ["Portland", "Seattle", "Chandler", "Bangalore"] - collaborators = [Collaborator(name=name) for name in collaborator_names] - collaborator.private_attributes = {} + collaborators = [] + for idx, collaborator_name in enumerate(collaborator_names): + collaborators.append(Collaborator(name=collaborator_name)) local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators + aggregator=aggregator, + collaborators=collaborators, ) if len(sys.argv) > 1: - if sys.argv[1] == 'ray': + if sys.argv[1] == "ray": local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators, backend='ray' + aggregator=aggregator, collaborators=collaborators, backend="ray" ) print(f"Local runtime collaborators = {local_runtime.collaborators}") - testflow = TestFlowReferenceWithInclude(checkpoint=False) + testflow = TestFlowReferenceWithInclude(checkpoint=True) testflow.runtime = local_runtime for i in range(5): diff --git a/tests/github/experimental/testflow_subset_of_collaborators.py b/tests/github/experimental/testflow_subset_of_collaborators.py index a4081b43ee7..12fea10a921 100644 --- a/tests/github/experimental/testflow_subset_of_collaborators.py +++ b/tests/github/experimental/testflow_subset_of_collaborators.py @@ -47,9 +47,7 @@ def start(self): self.collaborators = self.runtime.collaborators # select subset of collaborators - self.subset_collabrators = self.collaborators[ - : random.choice(self.random_ints) - ] + self.subset_collabrators = self.collaborators[: 
random.choice(self.random_ints)] print( f"... Executing flow for {len(self.subset_collabrators)} collaborators out of Total: " @@ -84,15 +82,14 @@ def end(self): End of the flow """ - print( - f"End of the test case {TestFlowSubsetCollaborators.__name__} reached." - ) + print(f"End of the test case {TestFlowSubsetCollaborators.__name__} reached.") if __name__ == "__main__": + # Setup participants aggregator = Aggregator() - aggregator.private_attributes = {} + # Setup collaborators private attributes via callable function collaborator_names = [ "Portland", "Seattle", @@ -103,26 +100,31 @@ def end(self): "London", "New York", ] + + def callable_to_initialize_collaborator_private_attributes(collab_name): + return {"name": collab_name} + collaborators = [] - for name in collaborator_names: - temp_collab_obj = Collaborator(name=name) - temp_collab_obj.private_attributes = {"name": name} - collaborators.append(temp_collab_obj) - del temp_collab_obj + for idx, collaborator_name in enumerate(collaborator_names): + collaborators.append( + Collaborator( + name=collaborator_name, + private_attributes_callable=callable_to_initialize_collaborator_private_attributes, + collab_name=collaborator_name, + ) + ) local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators + aggregator=aggregator, + collaborators=collaborators, ) - if len(sys.argv) > 1: - if sys.argv[1] == 'ray': + if sys.argv[1] == "ray": local_runtime = LocalRuntime( - aggregator=aggregator, collaborators=collaborators, backend='ray' + aggregator=aggregator, collaborators=collaborators, backend="ray" ) - random_ints = random.sample( - range(1, len(collaborators) + 1), len(collaborators) - ) + random_ints = random.sample(range(1, len(collaborators) + 1), len(collaborators)) tc_pass_fail = {"passed": [], "failed": []} for round_num in range(len(collaborators)): print(f"{bcolors.OKBLUE}Starting round {round_num}...{bcolors.ENDC}")