From 4cdb100265f3de218beb75b487512eff2c4bd9b0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 14 Nov 2023 21:11:01 +0800 Subject: [PATCH 01/39] Release GIL for Python APIs with wait (#190) --- python/mscclpp/core_py.cpp | 10 ++-- python/mscclpp/semaphore_py.cpp | 3 +- python/test/test_mscclpp.py | 82 ++++++++++++++++++++++++++++++++- 3 files changed, 88 insertions(+), 7 deletions(-) diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 60ceb96cc..d1df8fd31 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -64,10 +64,10 @@ void register_core(nb::module_& m) { nb::arg("nRanks")) .def("create_unique_id", &TcpBootstrap::createUniqueId) .def("get_unique_id", &TcpBootstrap::getUniqueId) - .def("initialize", (void (TcpBootstrap::*)(UniqueId, int64_t)) & TcpBootstrap::initialize, nb::arg("uniqueId"), - nb::arg("timeoutSec") = 30) - .def("initialize", (void (TcpBootstrap::*)(const std::string&, int64_t)) & TcpBootstrap::initialize, - nb::arg("ifIpPortTrio"), nb::arg("timeoutSec") = 30); + .def("initialize", static_cast(&TcpBootstrap::initialize), + nb::call_guard(), nb::arg("uniqueId"), nb::arg("timeoutSec") = 30) + .def("initialize", static_cast(&TcpBootstrap::initialize), + nb::call_guard(), nb::arg("ifIpPortTrio"), nb::arg("timeoutSec") = 30); nb::enum_(m, "Transport") .value("Unknown", Transport::Unknown) @@ -120,7 +120,7 @@ void register_core(nb::module_& m) { self->updateAndSync(dst, dstOffset, (uint64_t*)src, newValue); }, nb::arg("dst"), nb::arg("dstOffset"), nb::arg("src"), nb::arg("newValue")) - .def("flush", &Connection::flush, nb::arg("timeoutUsec") = (int64_t)3e7) + .def("flush", &Connection::flush, nb::call_guard(), nb::arg("timeoutUsec") = (int64_t)3e7) .def("transport", &Connection::transport) .def("remote_transport", &Connection::remoteTransport); diff --git a/python/mscclpp/semaphore_py.cpp b/python/mscclpp/semaphore_py.cpp index 67e02184a..a616a89da 100644 --- a/python/mscclpp/semaphore_py.cpp +++ 
b/python/mscclpp/semaphore_py.cpp @@ -30,7 +30,8 @@ void register_semaphore(nb::module_& m) { .def("connection", &Host2HostSemaphore::connection) .def("signal", &Host2HostSemaphore::signal) .def("poll", &Host2HostSemaphore::poll) - .def("wait", &Host2HostSemaphore::wait, nb::arg("max_spin_count") = 10000000); + .def("wait", &Host2HostSemaphore::wait, nb::call_guard(), + nb::arg("max_spin_count") = 10000000); nb::class_ smDevice2DeviceSemaphore(m, "SmDevice2DeviceSemaphore"); smDevice2DeviceSemaphore diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 3af1580a4..4fc00e3b3 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -3,13 +3,22 @@ from concurrent.futures import ThreadPoolExecutor import time +import threading import cupy as cp import numpy as np import netifaces as ni import pytest -from mscclpp import Fifo, Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, SmDevice2DeviceSemaphore, Transport +from mscclpp import ( + TcpBootstrap, + Fifo, + Host2DeviceSemaphore, + Host2HostSemaphore, + ProxyService, + SmDevice2DeviceSemaphore, + Transport, +) from ._cpp import _ext from .mscclpp_group import MscclppGroup from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group @@ -63,6 +72,50 @@ def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str): assert np.array_equal(memory, memory_expected) +@parametrize_mpi_groups(2, 4, 8, 16) +def test_bootstrap_init_gil_release(mpi_group: MpiGroup): + bootstrap = TcpBootstrap.create(mpi_group.comm.rank, mpi_group.comm.size) + uniq_id = None + if mpi_group.comm.rank == 0: + # similar to NCCL's unique id + uniq_id = bootstrap.create_unique_id() + uniq_id_global = mpi_group.comm.bcast(uniq_id, 0) + + if mpi_group.comm.rank == 0: + # rank 0 never initializes the bootstrap, making other ranks block + pass + else: + check_list = [] + + def check_target(): + check_list.append("this thread could run.") + + def init_target(): + try: + # expected to raise a timeout 
after 3 seconds + bootstrap.initialize(uniq_id_global, 3) + except: + pass + + init_thread = threading.Thread(target=init_target) + check_thread = threading.Thread(target=check_target) + init_thread.start() + + time.sleep(0.1) + + # check that the check thread is not blocked + s = time.time() + check_thread.start() + check_thread.join() + e = time.time() + assert e - s < 0.1 + assert len(check_list) == 1 + + init_thread.join() + + mpi_group.comm.barrier() + + def create_and_connect(mpi_group: MpiGroup, transport: str): if transport == "NVLink" and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink for cross node") @@ -186,6 +239,33 @@ def test_h2h_semaphores(mpi_group: MpiGroup): group.barrier() +@parametrize_mpi_groups(2, 4, 8, 16) +def test_h2h_semaphores_gil_release(mpi_group: MpiGroup): + group, connections = create_and_connect(mpi_group, "IB") + + semaphores = group.make_semaphore(connections, Host2HostSemaphore) + + def target_wait(sems, conns): + for rank in conns: + sems[rank].wait(-1) + + def target_signal(sems, conns): + # sleep 1 sec to let target_wait() starts a bit earlier + time.sleep(1) + # if wait() doesn't release GIL, this will block forever + for rank in conns: + sems[rank].signal() + + wait_thread = threading.Thread(target=target_wait, args=(semaphores, connections)) + signal_thread = threading.Thread(target=target_signal, args=(semaphores, connections)) + wait_thread.start() + signal_thread.start() + signal_thread.join() + wait_thread.join() + + group.barrier() + + class MscclppKernel: def __init__( self, From e7107017282e5148f64351ae28c5be519748112f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 15 Nov 2023 16:03:29 +0800 Subject: [PATCH 02/39] Warning ahead of CQ being full (#202) --- src/connection.cc | 22 +++++++++------------- src/ib.cc | 31 +++++++++++++++++-------------- src/include/connection.hpp | 1 - src/include/ib.hpp | 4 +++- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git 
a/src/connection.cc b/src/connection.cc index 7d8b4b7f7..834a1456c 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -97,7 +97,6 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) : transport_(localEndpoint.transport()), remoteTransport_(remoteEndpoint.transport()), - numSignaledSends(0), dummyAtomicSource_(std::make_unique(0)) { qp = getImpl(localEndpoint)->ibQp_; qp->rtr(getImpl(remoteEndpoint)->ibQpInfo_); @@ -131,7 +130,6 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem qp->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, /*signaled=*/true); - numSignaledSends++; qp->postSend(); INFO(MSCCLPP_NET, "IBConnection write: from %p to %p, size %lu", (uint8_t*)srcMr->getBuff() + srcOffset, @@ -152,7 +150,6 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6 *src = newValue; qp->stageAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, /*signaled=*/true); - numSignaledSends++; qp->postSend(); INFO(MSCCLPP_NET, "IBConnection atomic Write: from %p to %p, %lu -> %lu", src, (uint8_t*)dstMrInfo.addr + dstOffset, @@ -161,24 +158,23 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6 void IBConnection::flush(int64_t timeoutUsec) { Timer timer; - while (numSignaledSends) { + while (qp->getNumCqItems()) { int wcNum = qp->pollCq(); if (wcNum < 0) { throw mscclpp::IbError("pollCq failed: error no " + std::to_string(errno), errno); - } - - auto elapsed = timer.elapsed(); - if ((timeoutUsec >= 0) && (elapsed > timeoutUsec)) { - throw Error("pollCq is stuck: waited for " + std::to_string(elapsed / 1e6) + " seconds. 
Expected " + - std::to_string(numSignaledSends) + " signals", - ErrorCode::InternalError); + } else if (timeoutUsec >= 0) { + auto elapsed = timer.elapsed(); + if (elapsed > timeoutUsec) { + throw Error("pollCq timed out: waited for " + std::to_string(elapsed / 1e6) + " seconds. Expected " + + std::to_string(qp->getNumCqItems()) + " signals", + ErrorCode::Timeout); + } } for (int i = 0; i < wcNum; ++i) { const ibv_wc* wc = qp->getWc(i); if (wc->status != IBV_WC_SUCCESS) { - throw mscclpp::IbError("pollCq failed: status " + std::to_string(wc->status), wc->status); + throw mscclpp::IbError("a work item failed: status " + std::to_string(wc->status), wc->status); } - numSignaledSends--; } } INFO(MSCCLPP_NET, "IBConnection flushing connection"); diff --git a/src/ib.cc b/src/ib.cc index 4cac11fe0..1d25b68b6 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -54,7 +54,7 @@ uint32_t IbMr::getLkey() const { return this->mr->lkey; } IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int port, int maxCqSize, int maxCqPollNum, int maxSendWr, int maxRecvWr, int maxWrPerSend) - : maxCqPollNum(maxCqPollNum), maxWrPerSend(maxWrPerSend) { + : numSignaledPostedItems(0), numSignaledStagedItems(0), maxCqPollNum(maxCqPollNum), maxWrPerSend(maxWrPerSend) { this->cq = ibv_create_cq(ctx, maxCqSize, nullptr, nullptr, 0); if (this->cq == nullptr) { std::stringstream err; @@ -212,6 +212,7 @@ void IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64 wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; wrInfo.sge->length = size; wrInfo.sge->lkey = mr->getLkey(); + if (signaled) (this->numSignaledStagedItems)++; } void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, @@ -226,6 +227,7 @@ void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, u wrInfo.sge->addr = (uint64_t)(mr->getBuff()); wrInfo.sge->length = sizeof(uint64_t); // atomic op is always on uint64_t wrInfo.sge->lkey = mr->getLkey(); + if 
(signaled) (this->numSignaledStagedItems)++; } void IbQp::stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, @@ -240,6 +242,7 @@ void IbQp::stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; wrInfo.sge->length = size; wrInfo.sge->lkey = mr->getLkey(); + if (signaled) (this->numSignaledStagedItems)++; } void IbQp::postSend() { @@ -254,28 +257,28 @@ void IbQp::postSend() { throw mscclpp::IbError(err.str(), errno); } this->wrn = 0; + this->numSignaledPostedItems += this->numSignaledStagedItems; + this->numSignaledStagedItems = 0; + if (this->numSignaledPostedItems + 4 > this->cq->cqe) { + WARN("IB: CQ is almost full ( %d / %d ). The connection needs to be flushed to prevent timeout errors.", + this->numSignaledPostedItems, this->cq->cqe); + } } -void IbQp::postRecv(uint64_t wrId) { - struct ibv_recv_wr wr, *bad_wr; - wr.wr_id = wrId; - wr.sg_list = nullptr; - wr.num_sge = 0; - wr.next = nullptr; - int ret = ibv_post_recv(this->qp, &wr, &bad_wr); - if (ret != 0) { - std::stringstream err; - err << "ibv_post_recv failed (errno " << errno << ")"; - throw mscclpp::IbError(err.str(), errno); +int IbQp::pollCq() { + int wcNum = ibv_poll_cq(this->cq, this->maxCqPollNum, this->wcs.get()); + if (wcNum > 0) { + this->numSignaledPostedItems -= wcNum; } + return wcNum; } -int IbQp::pollCq() { return ibv_poll_cq(this->cq, this->maxCqPollNum, this->wcs.get()); } - IbQpInfo& IbQp::getInfo() { return this->info; } const ibv_wc* IbQp::getWc(int idx) const { return &this->wcs[idx]; } +int IbQp::getNumCqItems() const { return this->numSignaledPostedItems; } + IbCtx::IbCtx(const std::string& devName) : devName(devName) { int num; struct ibv_device** devices = ibv_get_device_list(&num); diff --git a/src/include/connection.hpp b/src/include/connection.hpp index d073d96b3..fffdc2086 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ 
-36,7 +36,6 @@ class IBConnection : public Connection { Transport transport_; Transport remoteTransport_; IbQp* qp; - int numSignaledSends; std::unique_ptr dummyAtomicSource_; // not used anywhere but IB needs a source RegisteredMemory dummyAtomicSourceMem_; mscclpp::TransportInfo dstTransportInfo_; diff --git a/src/include/ib.hpp b/src/include/ib.hpp index db2c426e6..0ea661617 100644 --- a/src/include/ib.hpp +++ b/src/include/ib.hpp @@ -68,11 +68,11 @@ class IbQp { void stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, unsigned int immData); void postSend(); - void postRecv(uint64_t wrId); int pollCq(); IbQpInfo& getInfo(); const ibv_wc* getWc(int idx) const; + int getNumCqItems() const; private: struct WrInfo { @@ -92,6 +92,8 @@ class IbQp { std::unique_ptr wrs; std::unique_ptr sges; int wrn; + int numSignaledPostedItems; + int numSignaledStagedItems; const int maxCqPollNum; const int maxWrPerSend; From 060fda12e651ee6c94768aa21bd9f24177a96410 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 16 Nov 2023 12:45:25 +0800 Subject: [PATCH 03/39] mscclpp-test in Python (#204) Co-authored-by: Binyang Li Co-authored-by: Saeed Maleki Co-authored-by: Esha Choukse --- .azure-pipelines/integration-test.yml | 17 + .azure-pipelines/multi-nodes-test.yml | 25 +- .azure-pipelines/ut.yml | 6 +- include/mscclpp/concurrency.hpp | 4 +- include/mscclpp/semaphore_device.hpp | 17 +- include/mscclpp/sm_channel_device.hpp | 65 +- python/benchmark/__init__.py | 0 python/benchmark/allreduce.cu | 769 ++++++++++++++++++ python/benchmark/allreduce_bench.py | 215 +++++ python/benchmark/mscclpp_op.py | 344 ++++++++ python/benchmark/nccl_op.py | 23 + python/mscclpp/__init__.py | 3 + .../mscclpp_group.py => mscclpp/comm.py} | 78 +- python/{test => mscclpp}/utils.py | 29 +- python/requirements_cu11.txt | 7 + python/requirements_cu12.txt | 7 + python/test/mscclpp_mpi.py | 9 +- 
python/test/requirements_cu11.txt | 6 - python/test/requirements_cu12.txt | 6 - python/test/test_mscclpp.py | 34 +- test/deploy/deploy.sh | 20 +- test/deploy/run_tests.sh | 50 +- test/deploy/setup.sh | 8 +- test/mscclpp-test/allreduce_test.cu | 2 +- 24 files changed, 1589 insertions(+), 155 deletions(-) create mode 100644 python/benchmark/__init__.py create mode 100644 python/benchmark/allreduce.cu create mode 100644 python/benchmark/allreduce_bench.py create mode 100644 python/benchmark/mscclpp_op.py create mode 100644 python/benchmark/nccl_op.py rename python/{test/mscclpp_group.py => mscclpp/comm.py} (66%) rename python/{test => mscclpp}/utils.py (79%) create mode 100644 python/requirements_cu11.txt create mode 100644 python/requirements_cu12.txt delete mode 100644 python/test/requirements_cu11.txt delete mode 100644 python/test/requirements_cu12.txt diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index 4d96581ad..a433553bc 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -112,3 +112,20 @@ jobs: set -e python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl workingDirectory: '$(System.DefaultWorkingDirectory)' + + - task: Bash@3 + name: PythonAllReduceBenchmark + displayName: Python Allreduce Benchmark + inputs: + targetType: 'inline' + script: | + set -e + export PATH=/usr/local/mpi/bin:$PATH + python3 -m pip install . 
+ if [[ '$(containerImage)' == *'cuda11'* ]]; then + pip3 install -r ./python/requirements_cu11.txt + else + pip3 install -r ./python/requirements_cu12.txt + fi + mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/benchmark/allreduce_bench.py + workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index bb158646e..44b7bb3b4 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -83,7 +83,7 @@ jobs: tail -f output/mscclit-000000 & CHILD_PID=$! parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mscclpp-test' + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' kill $CHILD_PID - task: Bash@3 @@ -102,7 +102,7 @@ jobs: tail -f output/mscclit-000000 & CHILD_PID=$! parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mp-ut' + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' kill $CHILD_PID - task: Bash@3 @@ -121,7 +121,26 @@ jobs: tail -f output/mscclit-000000 & CHILD_PID=$! 
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh pytests' + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' + kill $CHILD_PID + + - task: Bash@3 + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-000000 + tail -f output/mscclit-000000 & + CHILD_PID=$! + parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' kill $CHILD_PID - task: AzureCLI@2 diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 31b8091cd..c9ea5e1c6 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -81,9 +81,9 @@ jobs: export PATH=/usr/local/mpi/bin:$PATH cd build && make pylib-copy if [[ '$(containerImage)' == *'cuda11'* ]]; then - pip3 install -r ../python/test/requirements_cu11.txt + pip3 install -r ../python/requirements_cu11.txt else - pip3 install -r ../python/test/requirements_cu12.txt + pip3 install -r ../python/requirements_cu12.txt fi - mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x + mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/include/mscclpp/concurrency.hpp b/include/mscclpp/concurrency.hpp index ab2a5bd38..cccab8be4 100644 --- a/include/mscclpp/concurrency.hpp +++ b/include/mscclpp/concurrency.hpp @@ -33,11 +33,13 @@ struct DeviceSyncer { if (tmpIsAdd) { 
if (atomicAdd(&count_, 1) == maxOldCnt) { flag_ = 1; + count_ = 0; } POLL_MAYBE_JAILBREAK(!flag_, maxSpinCount); } else { - if (atomicSub(&count_, 1) == 1) { + if (atomicAdd(&count_, 1) == maxOldCnt) { flag_ = 0; + count_ = 0; } POLL_MAYBE_JAILBREAK(flag_, maxSpinCount); } diff --git a/include/mscclpp/semaphore_device.hpp b/include/mscclpp/semaphore_device.hpp index 3c6f22fd3..4ed5fbeec 100644 --- a/include/mscclpp/semaphore_device.hpp +++ b/include/mscclpp/semaphore_device.hpp @@ -23,7 +23,7 @@ struct Host2DeviceSemaphoreDeviceHandle { } /// Wait for the host to signal. - __forceinline__ __device__ void wait(int64_t maxSpinCount = 10000000) { + __forceinline__ __device__ void wait(int64_t maxSpinCount = 100000000) { (*expectedInboundSemaphoreId) += 1; POLL_MAYBE_JAILBREAK((cuda::atomic_ref{*inboundSemaphoreId}.load( cuda::memory_order_acquire) < (*expectedInboundSemaphoreId)), @@ -48,7 +48,7 @@ struct SmDevice2DeviceSemaphoreDeviceHandle { } /// Wait for the remote device to signal. - __forceinline__ __device__ void wait(int64_t maxSpinCount = 10000000) { + __forceinline__ __device__ void wait(int64_t maxSpinCount = 100000000) { (*expectedInboundSemaphoreId) += 1; POLL_MAYBE_JAILBREAK((cuda::atomic_ref{*inboundSemaphoreId}.load( cuda::memory_order_acquire) < (*expectedInboundSemaphoreId)), @@ -68,6 +68,19 @@ struct SmDevice2DeviceSemaphoreDeviceHandle { cuda::memory_order_seq_cst); } + /// Signal the remote device. + /// + /// This function is a relaxed version of signal() and provides no guarantee on the completion of memory operations. + /// User requires to call proper fencing before using this function. + /// + __forceinline__ __device__ void relaxedSignal() { + // This fence ensures that preceding writes are visible on the peer GPU before the incremented + // `outboundSemaphoreId` is visible. 
+ semaphoreIncrement(); + cuda::atomic_ref{*remoteInboundSemaphoreId}.store(semaphoreGetLocal(), + cuda::memory_order_relaxed); + } + /// Signal the remote device for copied packets. /// /// Unlike @ref signal(), this function provides no guarantee on the completion of memory operations. This is diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp index 5c11ecd6b..e4e945c1c 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/sm_channel_device.hpp @@ -16,30 +16,22 @@ namespace Element { /// Load an element from DRAM. /// -/// This is a warpper of ld.volatile.global.* PTX instruction. Address alignment is not this function's -/// responsibility. -/// /// @param v The value to be loaded. /// @param p The address of the value to be loaded. /// template __forceinline__ __device__ void load(T& v, const T* p) { - // We should only use the specialized functions. - __assert_fail("Unsupported type", __FILE__, __LINE__, __PRETTY_FUNCTION__); + v = *p; } /// Write an element on DRAM. /// -/// This is a wrapper of st.volatile.global.* PTX instruction. Address alignment is not this function's -/// responsibility. -/// /// @param p The address of the value to be written. /// @param v The value to be written. /// template __forceinline__ __device__ void store(T* p, const T& v) { - // We should only use the specialized functions. - __assert_fail("Unsupported type", __FILE__, __LINE__, __PRETTY_FUNCTION__); + *p = v; } /// Copy aligned elements from the source memory to the destination memory. 
@@ -64,52 +56,6 @@ __forceinline__ __device__ void copy(T* dst, T* src, uint64_t numElems, uint32_t } } -template <> -__forceinline__ __device__ void load(long long& v, const long long* p) { - asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(v) : "l"(p) : "memory"); -} - -template <> -__forceinline__ __device__ void store(long long* p, const long long& v) { - asm volatile("st.volatile.global.u64 [%0], %1;" : : "l"(p), "l"(v) : "memory"); -} - -template <> -__forceinline__ __device__ void load(int& v, const int* p) { - asm volatile("ld.volatile.global.u32 %0, [%1];" : "=r"(v) : "l"(p) : "memory"); -} - -template <> -__forceinline__ __device__ void store(int* p, const int& v) { - asm volatile("st.volatile.global.u32 [%0], %1;" : : "l"(p), "r"(v) : "memory"); -} - -template <> -__forceinline__ __device__ void load(longlong2& v, const longlong2* p) { - asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory"); -} - -template <> -__forceinline__ __device__ void store(longlong2* p, const longlong2& v) { - asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" : : "l"(p), "l"(v.x), "l"(v.y) : "memory"); -} - -template <> -__forceinline__ __device__ void load(int4& v, const int4* p) { - asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" - : "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w) - : "l"(p) - : "memory"); -} - -template <> -__forceinline__ __device__ void store(int4* p, const int4& v) { - asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" - : - : "l"(p), "r"(v.x), "r"(v.y), "r"(v.z), "r"(v.w) - : "memory"); -} - } // namespace Element #endif // __CUDACC__ @@ -315,6 +261,13 @@ struct SmChannelDeviceHandle { /// __forceinline__ __device__ void signal() { semaphore_.signal(); } + /// Signal the remote semaphore. + /// + /// This function is a relaxed version of signal() and provides no guarantee on the completion of memory operations. 
+ /// User requires to call proper fencing before using this function. + /// + __forceinline__ __device__ void relaxedSignal() { semaphore_.relaxedSignal(); } + /// Signal the remote semaphore for copied packets. /// /// Unlike @ref signal(), this function provides no guarantee on the completion of memory operations. This is diff --git a/python/benchmark/__init__.py b/python/benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/benchmark/allreduce.cu b/python/benchmark/allreduce.cu new file mode 100644 index 000000000..4dc2b0c45 --- /dev/null +++ b/python/benchmark/allreduce.cu @@ -0,0 +1,769 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include + +#include +#include +#include + +__device__ mscclpp::DeviceSyncer deviceSyncer; +__device__ mscclpp::DeviceSyncer allGatherDeviceSyncer; +__device__ mscclpp::DeviceSyncer reduceScatterDeviceSyncer; +__device__ mscclpp::DeviceSyncer ibDeviceSyncer; + +#ifndef TYPE +#define TYPE float +#endif + +#define VECTOR_SIZE (sizeof(int4) / sizeof(TYPE)) + +template +__forceinline__ __device__ To bit_cast(const From& src) { + static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); + + union { + From f; + To t; + } u; + u.f = src; + return u.t; +} + +template +__forceinline__ __device__ T add_elements(T a, T b) { + return a + b; +} + +template <> +__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { + return __hadd2(a, b); +} + +template +__forceinline__ __device__ int4 add_vectors_helper(int4 a, int4 b) { + int4 ret; + ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); + return ret; +} + +template +__forceinline__ __device__ int4 add_vectors(int4 a, int4 b) { + return add_vectors_helper(a, b); +} + +template <> 
+__forceinline__ __device__ int4 add_vectors<__half>(int4 a, int4 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +__forceinline__ __device__ uint2 add_vectors_helper(uint2 a, uint2 b) { + uint2 ret; + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + return ret; +} + +template +__forceinline__ __device__ uint2 add_vectors(uint2 a, uint2 b) { + return add_vectors_helper(a, b); +} + +template <> +__forceinline__ __device__ uint2 add_vectors<__half>(uint2 a, uint2 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +__forceinline__ __device__ int add_vectors_helper(int a, int b) { + return bit_cast(add_elements(bit_cast(a), bit_cast(b))); +} + +template +__forceinline__ __device__ int add_vectors(int a, int b) { + return add_vectors_helper(a, b); +} + +template <> +__forceinline__ __device__ int add_vectors<__half>(int a, int b) { + return add_vectors_helper<__half2>(a, b); +} + +__forceinline__ __device__ void vectorSum(TYPE* dst, TYPE* src, size_t nElem, int blockId, int nBlocks) { + size_t nInt4 = nElem / 4; + size_t nLastInts = nElem % 4; + int4* dst4 = (int4*)dst; + int4* src4 = (int4*)src; + for (int i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) { + dst4[i] = add_vectors(dst4[i], src4[i]); + } + if (nLastInts > 0) { + int* dstLast = ((int*)dst) + nInt4 * 4; + int* srcLast = ((int*)src) + nInt4 * 4; + for (int i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { + dstLast[i] = add_vectors(dstLast[i], srcLast[i]); + } + } +} + +__forceinline__ __device__ void vectorSum(TYPE* dst, TYPE* src, size_t nElem) { + vectorSum(dst, src, nElem, blockIdx.x, gridDim.x); +} + +// ------------------------------------------- +// AllReduce1 +// ------------------------------------------- + +#ifndef READ_ONLY +#define READ_ONLY 0 +#endif + +extern "C" __global__ void __launch_bounds__(1024, 1) + 
allreduce1(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nranks, size_t nelems) { + const size_t chunkSize = nelems / nranks; + if (nranks == 1) return; + const int nPeer = nranks - 1; + const size_t indexOffset = rank * chunkSize; + const size_t indexOffset4 = indexOffset / VECTOR_SIZE; + int4* buff4 = (int4*)buff; + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // synchronize everyone + if (tid == 0) { + __threadfence_system(); + } + __syncthreads(); + if (tid < nPeer) { + smChans[tid].relaxedSignal(); + } + if (tid >= nPeer && tid < nPeer * 2) { + smChans[tid - nPeer].wait(); + } + deviceSyncer.sync(gridDim.x); + + // use int4 as much as possible + const size_t nInt4 = chunkSize / VECTOR_SIZE; + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * gridDim.x) { + int4 tmp = buff4[indexOffset4 + idx]; + for (int index = 0; index < nPeer; ++index) { + int4 val; + int peerIdx = (index + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + val = smChans[peerIdx].read(indexOffset4 + idx); + tmp = add_vectors(tmp, val); + } + if (READ_ONLY == 0) { + for (int index = 0; index < nPeer; ++index) { + int peerIdx = (index + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + smChans[peerIdx].write(indexOffset4 + idx, tmp); + } + } + buff4[indexOffset4 + idx] = tmp; + } + + // use the given TYPE for the rest + size_t processed = nInt4 * VECTOR_SIZE * nranks; + const size_t nRemElems = nelems - processed; + const size_t startIdx = processed + (nRemElems * rank) / nranks; + const size_t endIdx = processed + (nRemElems * (rank + 1)) / nranks; + for (int idx = threadIdx.x + blockIdx.x * blockDim.x + startIdx; idx < endIdx; idx += blockDim.x * gridDim.x) { + TYPE tmp = buff[idx]; + for (int index = 0; index < nPeer; ++index) { + int peerIdx = (index + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + TYPE val = smChans[peerIdx].read(idx); + tmp += val; + } + for (int index = 0; index < nPeer; ++index) { + int 
peerIdx = (index + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + smChans[peerIdx].write(idx, tmp); + } + buff[idx] = tmp; + } + + // synchronize everyone again + deviceSyncer.sync(gridDim.x); + if (tid == 0) { + __threadfence_system(); + } + __syncthreads(); + if (tid < nPeer) { + smChans[tid].relaxedSignal(); + } + if (tid >= nPeer && tid < nPeer * 2) { + smChans[tid - nPeer].wait(); + } + + if (READ_ONLY) { + for (int i = 0; i < nPeer; ++i) { + int peerIdx = (i + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + const int remoteRank = (peerIdx < rank ? peerIdx : peerIdx + 1); + size_t offset = chunkSize * remoteRank * sizeof(TYPE); + smChans[peerIdx].get(offset, chunkSize * sizeof(TYPE), tid, blockDim.x * gridDim.x); + } + } +} + +// ------------------------------------------- +// AllReduce2 +// ------------------------------------------- + +__device__ uint64_t globalFlag = 1; + +extern "C" __global__ void __launch_bounds__(512, 1) + allreduce2(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, TYPE* scratch, void* resultBuff, int rank, + int worldSize, size_t nelems) { + nelems = nelems / (sizeof(int) / sizeof(TYPE)); + // This version of allreduce only works for single nodes + const int nPeers = worldSize - 1; + const int nPkts = nelems / 2; + const int nelemsPerRank = nelems / worldSize; + const int nPktsPerRank = nelemsPerRank / 2; + // flag for packets. Initially 1 + const uint32_t flag = (uint32_t)globalFlag; + // thread block & channel info + const int nBlocksPerPeer = gridDim.x / nPeers; + const int localBlockIdx = blockIdx.x % nBlocksPerPeer; + const int peerIdx = blockIdx.x / nBlocksPerPeer; + const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; + mscclpp::SmChannelDeviceHandle smChan = smChans[peerIdx]; + const int tid = threadIdx.x + localBlockIdx * blockDim.x; + // double buffering + size_t scratchBaseOffset = (flag & 1) ? 
0 : nPkts * sizeof(mscclpp::LLPacket); + void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); + size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket); + size_t scratchResultOffset = + (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LLPacket) : 3 * nPkts * sizeof(mscclpp::LLPacket); + size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); + uint2* src = (uint2*)((char*)buff + rank * nelemsPerRank * sizeof(int)); + uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); + + // step 1: write to scratch buffer + smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); + // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { + uint2 data = make_uint2(0, 0); + for (int index = 0; index < nPeers; index++) { + const int remoteRank = index < rank ? 
index : index + 1; + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; + uint2 val = dstPkt[idx].read(flag); + data = add_vectors(val, data); + } + data = add_vectors(data, src[idx]); + dst[idx] = data; + + mscclpp::LLPacket packet; + packet.data1 = data.x; + packet.flag1 = flag; + packet.data2 = data.y; + packet.flag2 = flag; + size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank); + for (int index = 0; index < nPeers; index++) { + smChans[index].write(offset, packet); + } + } + // step 3: get data result from scratch buffer + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)scratch + scratchResultOffset); + const int dstOffset = remoteRank * nPktsPerRank; + uint2* result = (uint2*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int)); + for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) { + uint2 data = dstPkt[idx + dstOffset].read(flag); + result[idx].x = data.x; + result[idx].y = data.y; + } + if (threadIdx.x == 0 && blockIdx.x == 0) { + globalFlag += 1; + } +} + +// ------------------------------------------- +// AllReduce3 +// ------------------------------------------- + +extern "C" __global__ void __launch_bounds__(1024, 1) + allreduce3(mscclpp::SimpleProxyChannelDeviceHandle* fstRoundChans, + mscclpp::SimpleProxyChannelDeviceHandle* sndRoundChans, TYPE* buff, TYPE* scratch, int rank, + int worldSize, size_t nelems) { + nelems = nelems / (sizeof(int) / sizeof(TYPE)); + + int isComm = (threadIdx.x == 0) && (blockIdx.x == 0); + int remoteSendRank = (rank + 1) % worldSize; + int remoteRecvRank = (rank + worldSize - 1) % worldSize; + int peerSendId = (remoteSendRank < rank) ? remoteSendRank : remoteSendRank - 1; + int peerRecvId = (remoteRecvRank < rank) ? 
remoteRecvRank : remoteRecvRank - 1; + + mscclpp::SimpleProxyChannelDeviceHandle& devFstSendChan = fstRoundChans[peerSendId]; + mscclpp::SimpleProxyChannelDeviceHandle& devFstRecvChan = fstRoundChans[peerRecvId]; + mscclpp::SimpleProxyChannelDeviceHandle& devSndSendChan = sndRoundChans[peerSendId]; + mscclpp::SimpleProxyChannelDeviceHandle& devSndRecvChan = sndRoundChans[peerRecvId]; + + // Step 1 + size_t chunkIndex = (rank + worldSize - 1) % worldSize; + size_t chunkNelem = nelems / worldSize; + size_t offset = chunkIndex * chunkNelem * sizeof(int); + if (isComm) { + if (chunkNelem > 1) { + devFstSendChan.putWithSignal(offset, chunkNelem / 2 * sizeof(int)); + } + } + + // Step 2 ~ Step n-1 + for (int step = 2; step < worldSize; ++step) { + if (isComm) { + if (chunkNelem > 1) { + devFstRecvChan.wait(); + devFstSendChan.flush(); + } + devFstSendChan.putWithSignal(offset + chunkNelem / 2 * sizeof(int), (chunkNelem - chunkNelem / 2) * sizeof(int)); + } + deviceSyncer.sync(gridDim.x); + + // Reduce + chunkIndex = (rank + worldSize - step) % worldSize; + offset = chunkIndex * chunkNelem * sizeof(int); + int* dst = (int*)((char*)buff + offset); + int* src = (int*)((char*)scratch + offset); + vectorSum((TYPE*)dst, (TYPE*)src, chunkNelem / 2); + + if (isComm) { + devFstRecvChan.wait(); + devFstSendChan.flush(); + if (chunkNelem > 1) { + devFstSendChan.putWithSignal(offset, chunkNelem / 2 * sizeof(int)); + } + } + deviceSyncer.sync(gridDim.x); + + dst += chunkNelem / 2; + src += chunkNelem / 2; + vectorSum((TYPE*)dst, (TYPE*)src, chunkNelem - chunkNelem / 2); + } + + // Step n + if (isComm) { + if (chunkNelem > 1) { + devFstRecvChan.wait(); + devFstSendChan.flush(); + } + devFstSendChan.putWithSignal(offset + chunkNelem / 2 * sizeof(int), (chunkNelem - chunkNelem / 2) * sizeof(int)); + } + deviceSyncer.sync(gridDim.x); + + offset = rank * chunkNelem * sizeof(int); + int* dst = (int*)((char*)buff + offset); + int* src = (int*)((char*)scratch + offset); + 
vectorSum((TYPE*)dst, (TYPE*)src, chunkNelem / 2); + + if (isComm) { + devFstRecvChan.wait(); + devFstSendChan.flush(); + if (chunkNelem > 1) { + devSndSendChan.putWithSignal(offset, chunkNelem / 2 * sizeof(int)); + } + } + deviceSyncer.sync(gridDim.x); + + dst += chunkNelem / 2; + src += chunkNelem / 2; + vectorSum((TYPE*)dst, (TYPE*)src, chunkNelem - chunkNelem / 2); + + if (isComm) { + if (chunkNelem > 1) { + devSndRecvChan.wait(); + devSndSendChan.flush(); + } + devSndSendChan.putWithSignalAndFlush(offset + chunkNelem / 2 * sizeof(int), + (chunkNelem - chunkNelem / 2) * sizeof(int)); + } + + // Step n+1 ~ Step 2n-2 + for (int i = 1; i < worldSize - 1; ++i) { + if (isComm) { + devSndRecvChan.wait(); + } + deviceSyncer.sync(gridDim.x); + + // Copy + chunkIndex = (rank + worldSize - i) % worldSize; + if (isComm) { + devSndSendChan.putWithSignalAndFlush(chunkIndex * chunkNelem * sizeof(int), chunkNelem * sizeof(int)); + } + } + + // Final receive + if (isComm) { + devSndRecvChan.wait(); + } +} + +// ------------------------------------------- +// AllReduce4 +// 2-node +// ------------------------------------------- +__device__ void localReduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nRanksPerNode, + int startChunkIndex, size_t offsetInChunk, size_t chunkSize, size_t nelems, + int nBlocks) { + if (nRanksPerNode == 1) return; + if (blockIdx.x >= nBlocks) return; + const int nPeer = nRanksPerNode - 1; + + const size_t localRankIndexInNode = rank % nRanksPerNode; + const size_t indexOffset = ((localRankIndexInNode + startChunkIndex) * chunkSize + offsetInChunk); + const size_t indexOffset4 = indexOffset / 4; + + int4* buff4 = (int4*)buff; + + for (int peerIdx = threadIdx.x + blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) { + smChans[peerIdx].relaxedSignal(); + } + for (int peerIdx = threadIdx.x + blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) { + smChans[peerIdx].wait(); + } + 
reduceScatterDeviceSyncer.sync(nBlocks); + + const size_t nInt4 = nelems / 4; + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) { + int4 tmp = buff4[indexOffset4 + idx]; + for (int index = 0; index < nPeer; ++index) { + int4 val; + int peerIdx = index + localRankIndexInNode; + if (peerIdx >= nPeer) peerIdx -= nPeer; + val = smChans[peerIdx].read(indexOffset4 + idx); + tmp = add_vectors(tmp, val); + } + buff4[indexOffset4 + idx] = tmp; + } + + // TODO: deal with rest elements +} + +// This kernel is the most performant when the number of blocks is a multiple of (nRanksPerNode - 1). +__device__ void localAllGatherSm(mscclpp::SmChannelDeviceHandle* smChans, int rank, int nRanksPerNode, + int startRankChunkIndex, uint64_t offsetInRankChunk, uint64_t rankChunkSize, + uint64_t size, size_t nBlocks) { + if (nRanksPerNode == 1) return; + if (blockIdx.x >= nBlocks) return; + const size_t nPeer = nRanksPerNode - 1; + const size_t peerIdx = blockIdx.x % nPeer; + const size_t nBlockForThisPeer = nBlocks / nPeer + (nBlocks % nPeer > peerIdx ? 1 : 0); + const size_t peerLocalBlockIdx = blockIdx.x / nPeer; + const size_t rankLocalIndex = rank % nRanksPerNode; + const int remoteRankLocalIndex = (peerIdx < rankLocalIndex ? peerIdx : peerIdx + 1); + + // Split the data into chunks for aligned data access. Ignore the remainder here and let the last block handle it. 
+ constexpr size_t chunkBytes = 128; // heuristic value + const size_t nChunk = size / chunkBytes; + const size_t nMinChunkPerBlock = nChunk / nBlockForThisPeer; + const size_t nRemainderChunk = nChunk % nBlockForThisPeer; + + // Distribute chunks to blocks + size_t nChunkForThisBlock; + size_t offsetForThisBlock; + if (peerLocalBlockIdx < nRemainderChunk) { + nChunkForThisBlock = nMinChunkPerBlock + 1; + offsetForThisBlock = (nMinChunkPerBlock + 1) * peerLocalBlockIdx; + } else { + nChunkForThisBlock = nMinChunkPerBlock; + offsetForThisBlock = + (nMinChunkPerBlock + 1) * nRemainderChunk + (peerLocalBlockIdx - nRemainderChunk) * nMinChunkPerBlock; + } + offsetForThisBlock *= chunkBytes; + + // Calculate the size of the data for this block + size_t sizeForThisBlock = nChunkForThisBlock * chunkBytes; + const size_t lastChunkSize = size - nChunk * chunkBytes; + if (lastChunkSize > 0 && peerLocalBlockIdx == nBlockForThisPeer - 1) { + sizeForThisBlock += lastChunkSize; + } + if (threadIdx.x == 0 && peerLocalBlockIdx == 0) { + smChans[peerIdx].relaxedSignal(); + smChans[peerIdx].wait(); + } + allGatherDeviceSyncer.sync(nBlocks); + size_t offset = rankChunkSize * (startRankChunkIndex + remoteRankLocalIndex) + offsetInRankChunk; + smChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); +} + +__device__ void localAllGatherAllPairsSm(mscclpp::SmChannelDeviceHandle* smChans, int rank, int nRanksPerNode, + uint64_t size, size_t nBlocks) { + if (nRanksPerNode == 1) return; + if (blockIdx.x >= nBlocks) return; + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int nPeer = nRanksPerNode - 1; + + if (tid < nPeer) { + smChans[tid].signal(); + } + int waitStart = nBlocks * blockDim.x - nPeer; + if (tid >= waitStart && tid < nBlocks * blockDim.x) { + smChans[tid - waitStart].wait(); + } + allGatherDeviceSyncer.sync(nBlocks); + for (int i = 0; i < nPeer; ++i) { + int peerIdx = (i + rank) % nPeer; + const int remoteRankLocalIndex = 
(peerIdx < rank ? peerIdx : peerIdx + 1); + size_t offset = size * remoteRankLocalIndex; + smChans[peerIdx].get(offset, size, tid, blockDim.x * nBlocks); + } +} + +// This is an allgather4 equivalent +__device__ void allGatherSm(mscclpp::SmChannelDeviceHandle* smChans, + mscclpp::SimpleProxyChannelDeviceHandle* proxyChans, int rank, int worldSize, + int nRanksPerNode, size_t nelemsPerGPU, int pipelineDepth) { + // this allgather is a pipelined and hierarchical one and only works for two nodes + // it is implemented as follows: + // Step 1: each node does a local allgather and concurrently, + // local GPU i exchange (piplineSize-1)/pipelineSize portion of their data with + // its cross-node neighbor (local GPU i on the other node) via IB + // Step 2: each node does a local allgather again with the data just received from its + // cross-node neighbor in step 1, and concurrently, exchange the rest of the data with + // its cross-node neighbor + // Step 3: each node does a local allgather for the last time with the rest of the data + + int pipelineSize = pipelineDepth; + int peerRank = (rank + nRanksPerNode) % worldSize; + int peerNodeId = peerRank / nRanksPerNode; + int peer = (peerRank < rank) ? 
peerRank : peerRank - 1; + mscclpp::SimpleProxyChannelDeviceHandle proxyChan = proxyChans[peer]; + const size_t nBlocksForLocalAllGather = gridDim.x / (nRanksPerNode - 1) * (nRanksPerNode - 1); + const size_t rankChunkSize = nelemsPerGPU * sizeof(int); + const int startRankIndexInLocalNode = (rank / nRanksPerNode) * nRanksPerNode; + const int startRankIndexInPeerNode = (peerRank / nRanksPerNode) * nRanksPerNode; + + if (peerNodeId == rank / nRanksPerNode) { + localAllGatherSm(smChans, rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, gridDim.x); + return; + } + + constexpr size_t alignment = 128; + size_t step1Bytes = (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int); + step1Bytes = step1Bytes / alignment * alignment; + const size_t step2Bytes = nelemsPerGPU * sizeof(int) - step1Bytes; + + // Step 1 + if (threadIdx.x == 0 && blockIdx.x == 0 && step1Bytes > 0) { + proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes); + } + localAllGatherSm(smChans, rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize, + nBlocksForLocalAllGather); + if (threadIdx.x == 0 && blockIdx.x == 0 && step1Bytes > 0) { + proxyChan.wait(); + proxyChan.flush(); + } + deviceSyncer.sync(gridDim.x); + // Step 2 + if (threadIdx.x == 0 && blockIdx.x == 0) { + proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes); + } + if (step1Bytes > 0) + localAllGatherSm(smChans, rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes, + nBlocksForLocalAllGather); + if (threadIdx.x == 0 && blockIdx.x == 0) { + proxyChan.wait(); + proxyChan.flush(); + } + deviceSyncer.sync(gridDim.x); + // Step 3 + localAllGatherSm(smChans, rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes, + nBlocksForLocalAllGather); +} + +__device__ void reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, + mscclpp::SimpleProxyChannelDeviceHandle* proxyChans, TYPE* buff, TYPE* scratch, + int 
rank, int nRanksPerNode, int worldSize, + size_t nelems, // must be divisible by 3 + int pipelineDepth) { + // this reduce-scatter algorithm works as follows: + // Step 1: each node does a local reduce-scatter on peer node data chunks with 1/pipeline portion of chunk data. For + // example, 2 nodes and each node has 2 ranks. rank 0 and rank 1 perform reduce-scatter on chunk 2 and chunk 3, with + // 1/pipeline portion of the data. + // Step 2: each node does a local reduce-scatter on peers data chunks with (pipeline-1)/pipeline portion of chunk + // data. Meanwhile, exchange the reduced data of the previous step with its cross-node neighbor (same local rank + // number on the other node) via IB. Then performs a reduce operation. + // Step 3: each node does a local reduce-scatter on local ranks, meanwhile exchange the reduced data of the previous + // step with its cross-node neighbor (same local rank number on the other node) via IB. Then performs a reduce + // operation. + int pipelineSize = pipelineDepth; + float nBlocksForReduceScatterRatio = 0.8; + const size_t chunkSize = nelems / worldSize; + const int peerRank = (rank + nRanksPerNode) % worldSize; + int peerNodeId = peerRank / nRanksPerNode; + int nBlocksForReduceScatter = + (int)(nBlocksForReduceScatterRatio * gridDim.x) / (nRanksPerNode - 1) * (nRanksPerNode - 1); + int isComm = (threadIdx.x == 0) && (blockIdx.x == nBlocksForReduceScatter); + int peer = (peerRank < rank) ? 
peerRank : peerRank - 1; + int nBlocksRemain = gridDim.x - nBlocksForReduceScatter; + mscclpp::SimpleProxyChannelDeviceHandle proxyChan = proxyChans[peer]; + if (peerNodeId == rank / nRanksPerNode) { + localReduceScatterSm(smChans, buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x); + return; + } + + // step 1: local reduce + int startChunkIndex = peerNodeId * nRanksPerNode; + localReduceScatterSm(smChans, buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize, + nBlocksForReduceScatter); + deviceSyncer.sync(gridDim.x); + + // step 2: local reduce and exchange data with neighbor + if (isComm) { + size_t offset = (peerRank * chunkSize) * sizeof(int); + // opposite side + proxyChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int))); + } + if (pipelineSize > 1) + localReduceScatterSm(smChans, buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize, + (pipelineSize - 1) * chunkSize / pipelineSize, nBlocksForReduceScatter); + if (isComm) { + proxyChan.wait(); + } + if (blockIdx.x >= nBlocksForReduceScatter) { + ibDeviceSyncer.sync(nBlocksRemain); + // reduce data received from peer to related rank + size_t offset = rank * chunkSize * sizeof(int); + int* dst = (int*)((char*)buff + offset); + int* src = (int*)((char*)scratch + offset); + vectorSum((TYPE*)dst, (TYPE*)src, chunkSize / pipelineSize, blockIdx.x - nBlocksForReduceScatter, nBlocksRemain); + } + if (isComm) { + proxyChan.flush(); + } + deviceSyncer.sync(gridDim.x); + + // step 3: local reduce and exchange data with neighbor + startChunkIndex = (rank / nRanksPerNode) * nRanksPerNode; + if (isComm && pipelineSize > 1) { + size_t offset = (peerRank * chunkSize + chunkSize / pipelineSize) * sizeof(int); + proxyChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int)); + } + localReduceScatterSm(smChans, buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, + nBlocksForReduceScatter); + if 
(isComm && pipelineSize > 1) { + proxyChan.wait(); + } + deviceSyncer.sync(gridDim.x); + // reduce to related rank, can not overlap since localReduceScatter also calculate the sum + size_t offset = (rank * chunkSize + chunkSize / pipelineSize) * sizeof(int); + int* dst = (int*)((char*)buff + offset); + int* src = (int*)((char*)scratch + offset); + if (pipelineSize > 1) vectorSum((TYPE*)dst, (TYPE*)src, (pipelineSize - 1) * chunkSize / pipelineSize); + if (isComm) { + proxyChan.flush(); + } +} + +extern "C" __global__ void __launch_bounds__(1024, 1) __global__ + allreduce4(mscclpp::SmChannelDeviceHandle* smChans, + mscclpp::SimpleProxyChannelDeviceHandle* reduceScatterProxyChans, + mscclpp::SimpleProxyChannelDeviceHandle* allGatherProxyChans, TYPE* buff, TYPE* scratch, int rank, + int nRanksPerNode, int worldSize, size_t nelems, int pipelineDepth) { + nelems = nelems / (sizeof(int) / sizeof(TYPE)); + reduceScatterSm(smChans, reduceScatterProxyChans, buff, scratch, rank, nRanksPerNode, worldSize, nelems, + pipelineDepth); + deviceSyncer.sync(gridDim.x); + allGatherSm(smChans, allGatherProxyChans, rank, worldSize, nRanksPerNode, nelems / worldSize, pipelineDepth); +} + +// allreduce 5 for 2-nodes +extern "C" __global__ void __launch_bounds__(1024, 1) + allreduce5(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SimpleProxyChannelDeviceHandle* proxyChans, TYPE* buff, + TYPE* scratch, TYPE* putBuff, TYPE* resultBuff, int rank, int nRanksPerNode, int worldSize, + size_t nelems) { + nelems = nelems / (sizeof(int) / sizeof(TYPE)); + // This version of allreduce only works for single nodes + const int nPeersInNode = nRanksPerNode - 1; + const int nPkts = nelems / 2; + const int nelemsPerLocalRank = nelems / nRanksPerNode; + const int nPktsPerLocalRank = nelemsPerLocalRank / 2; + const int localRankId = rank % nRanksPerNode; + // flag for packets. 
Initially 1 + const uint32_t flag = (uint32_t)globalFlag; + // thread block & channel info + const int nBlocksPerPeer = gridDim.x / nPeersInNode; + const int localBlockIdx = blockIdx.x % nBlocksPerPeer; + const int peerIdx = blockIdx.x / nBlocksPerPeer; + const int remoteRankIdx = peerIdx < localRankId ? peerIdx : peerIdx + 1; + mscclpp::SmChannelDeviceHandle smChan = smChans[peerIdx]; + mscclpp::SimpleProxyChannelDeviceHandle proxyChan = proxyChans[localRankId]; + const int tid = threadIdx.x + localBlockIdx * blockDim.x; + // double buffering + size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket); + size_t putBaseOffset = (flag & 1) ? 0 : nPktsPerLocalRank * sizeof(mscclpp::LLPacket); + void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); + size_t scratchOffset = scratchBaseOffset + localRankId * nPktsPerLocalRank * sizeof(mscclpp::LLPacket); + size_t scratchResultOffset = + (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LLPacket) : 3 * nPkts * sizeof(mscclpp::LLPacket); + size_t srcOffset = remoteRankIdx * nelemsPerLocalRank * sizeof(int); + uint2* src = (uint2*)((char*)buff + localRankId * nelemsPerLocalRank * sizeof(int)); + uint2* dst = (uint2*)((char*)resultBuff + localRankId * nelemsPerLocalRank * sizeof(int)); + + // step 1: write to scratch buffer + if (nRanksPerNode > 1) { + smChan.putPackets(scratchOffset, srcOffset, nelemsPerLocalRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, + flag); + } + // step 2: get data from scratch buffer, do local reduce-scatter in each node. + mscclpp::LLPacket* putPkt = (mscclpp::LLPacket*)((char*)putBuff + putBaseOffset); + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerLocalRank; idx += blockDim.x * gridDim.x) { + uint2 data = make_uint2(0, 0); + for (int index = 0; index < nPeersInNode; index++) { + const int remoteRank = index < localRankId ? 
index : index + 1; + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerLocalRank; + uint2 val = dstPkt[idx].read(flag); + data = add_vectors(val, data); + } + data = add_vectors(data, src[idx]); + putPkt[idx].write(data.x, data.y, flag); + dst[idx] = data; + } + deviceSyncer.sync(gridDim.x); + // step 3. send local reduced data to remote node. + if (threadIdx.x == 0 && blockIdx.x == 0) { + proxyChan.put(scratchOffset, putBaseOffset, nPktsPerLocalRank * sizeof(mscclpp::LLPacket)); + if ((flag & 63) == 0) { + proxyChan.flush(); + } + } + // step 4. try to read the data from scratch buffer and write to local peers + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + localRankId * nPktsPerLocalRank; + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerLocalRank; idx += blockDim.x * gridDim.x) { + uint2 res = dst[idx]; + uint2 val = dstPkt[idx].read(flag); + res = add_vectors(res, val); + + mscclpp::LLPacket packet; + packet.data1 = res.x; + packet.flag1 = flag; + packet.data2 = res.y; + packet.flag2 = flag; + size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + localRankId * nPktsPerLocalRank); + for (int index = 0; index < nPeersInNode; index++) { + smChans[index].write(offset, packet); + } + dst[idx] = res; + } + + // step 5: get data result from scratch buffer + dstPkt = (mscclpp::LLPacket*)((char*)scratch + scratchResultOffset); + const int dstOffset = remoteRankIdx * nPktsPerLocalRank; + uint2* result = (uint2*)((char*)resultBuff + remoteRankIdx * nelemsPerLocalRank * sizeof(int)); + if (nRanksPerNode > 1) { + for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerLocalRank; + idx += blockDim.x * nBlocksPerPeer) { + uint2 data = dstPkt[idx + dstOffset].read(flag); + result[idx] = data; + } + } + if (threadIdx.x == 0 && blockIdx.x == 0) { + globalFlag += 1; + } +} diff --git a/python/benchmark/allreduce_bench.py b/python/benchmark/allreduce_bench.py new file mode 100644 
index 000000000..aa2c096ed --- /dev/null +++ b/python/benchmark/allreduce_bench.py @@ -0,0 +1,215 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import cupy as cp +from mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5 +from nccl_op import NcclAllReduce +from mpi4py import MPI +import cupy.cuda.nccl as nccl +import mscclpp.comm as mscclpp_comm +from mscclpp import ProxyService +from prettytable import PrettyTable +import netifaces as ni + +data_type = cp.float16 + +if data_type == cp.float16: + dtype_str = "fp16" +elif data_type == cp.float32: + dtype_str = "fp32" +elif data_type == cp.int32: + dtype_str = "int32" +else: + raise RuntimeError("Unknown data type") + + +def human_readable_size(size, decimal_places=1): + for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]: + if size < 1024.0 or unit == "PiB": + break + size /= 1024.0 + return f"{size:.{decimal_places}f} {unit}" + + +def check_correctness(memory, func): + rand_gen = cp.random.default_rng(seed=MPI.COMM_WORLD.rank) + memory[:] = rand_gen.random(memory.shape).astype(data_type) + cp.cuda.runtime.deviceSynchronize() + output_memory = func(0) + cp.cuda.runtime.deviceSynchronize() + expected = cp.zeros_like(memory) + for i in range(MPI.COMM_WORLD.size): + rand_gen = cp.random.default_rng(seed=i) + expected += rand_gen.random(memory.shape).astype(data_type) + + if data_type == cp.float16: + ac = cp.allclose(output_memory, expected, rtol=1.0e-2, atol=1.0e-4) + else: + ac = cp.allclose(output_memory, expected, rtol=1.0e-2, atol=1.0e-4) + + ac = MPI.COMM_WORLD.allreduce(ac, op=MPI.SUM) + if not ac: + print(output_memory, expected) + return ac + + +def bench_time(niter: int, func): + # capture cuda graph for nites of the kernel launch + stream = cp.cuda.Stream(non_blocking=True) + with stream: + stream.begin_capture() + for i in range(niter): + func(stream.ptr) + graph = stream.end_capture() + + # now run a warm up 
round + graph.launch(stream) + + # now run the benchmark and measure time + start = cp.cuda.Event() + end = cp.cuda.Event() + + start.record(stream) + graph.launch(stream) + end.record(stream) + end.synchronize() + + return cp.cuda.get_elapsed_time(start, end) / niter * 1000.0 + + +def find_best_config(mscclpp_call, niter): + best_time = 10000000.0 + for config in mscclpp_call.auto_tune(): + cur_time = bench_time(niter, mscclpp_call) + if cur_time < best_time: + best_time = cur_time + best_config = config + if MPI.COMM_WORLD.rank == 0: + print("t", end="", flush=True) + best_config = MPI.COMM_WORLD.bcast(best_config, root=0) + if MPI.COMM_WORLD.rank == 0: + print(best_config, end="", flush=True) + return best_config + + +def run_benchmark( + mscclpp_group: mscclpp_comm.CommGroup, nccl_op: nccl.NcclCommunicator, table: PrettyTable, niter: int, nelem: int +): + memory = cp.zeros(nelem, dtype=data_type) + memory_out = cp.zeros(nelem, dtype=data_type) + cp.cuda.runtime.deviceSynchronize() + + if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: + if memory.nbytes < 2**20: + mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) + elif memory.nbytes < 2**29: + if memory.nbytes >= 2**20 and memory.nbytes <= 2**22: + read_only = 0 + else: + read_only = 1 + mscclpp_call = MscclppAllReduce1(mscclpp_group, memory, read_only=read_only) + else: + proxy_service = ProxyService() + mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) + proxy_service.start_proxy() + else: + if memory.nbytes < 2**22: + proxy_service = ProxyService() + mscclpp_call = MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service) + proxy_service.start_proxy() + best_config = find_best_config(mscclpp_call, 100) + mscclpp_call.set_params(*best_config) + else: + proxy_service = ProxyService() + mscclpp_call = MscclppAllReduce4(mscclpp_group, memory, N_GPUS_PER_NODE, proxy_service) + proxy_service.start_proxy() + best_config = find_best_config(mscclpp_call, 
20) + mscclpp_call.set_params(*best_config) + + nccl_call = NcclAllReduce(nccl_op, memory) + + memory_nbytes = memory.nbytes + mscclpp_time = bench_time(niter, mscclpp_call) + mscclpp_algBw = memory_nbytes / mscclpp_time / 1e3 + mscclpp_check = "PASS" if check_correctness(memory, mscclpp_call) else "FAIL" + + nccl_time = bench_time(niter, nccl_call) + nccl_algBw = memory_nbytes / nccl_time / 1e3 + nccl_check = "PASS" if check_correctness(memory, nccl_call) else "FAIL" + + if ( + isinstance(mscclpp_call, MscclppAllReduce3) + or isinstance(mscclpp_call, MscclppAllReduce5) + or isinstance(mscclpp_call, MscclppAllReduce4) + ): + MPI.COMM_WORLD.barrier() + proxy_service.stop_proxy() + + if MPI.COMM_WORLD.rank == 0: + table.add_row( + [ + human_readable_size(memory_nbytes), + "{:.2f}".format(mscclpp_time), + "{:.2f}".format(mscclpp_algBw), + mscclpp_check, + "{:.2f}".format(nccl_time), + "{:.2f}".format(nccl_algBw), + nccl_check, + "{:.2f}".format(nccl_time / mscclpp_time), + ] + ) + if MPI.COMM_WORLD.rank == 0: + print(".", end="", flush=True) + + +if __name__ == "__main__": + shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL) + N_GPUS_PER_NODE = shm_comm.size + shm_comm.Free() + cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use() + + # create a MscclppGroup + network_interface = "eth0" + my_ip = ni.ifaddresses(network_interface)[ni.AF_INET][0]["addr"] + root_ip = MPI.COMM_WORLD.bcast(my_ip, root=0) + ifIpPortTrio = network_interface + ":" + root_ip + ":50000" # some random port + mscclpp_group = mscclpp_comm.CommGroup( + interfaceIpPortTrio=ifIpPortTrio, rank=MPI.COMM_WORLD.rank, size=MPI.COMM_WORLD.size + ) + + # create a NcclComm + if MPI.COMM_WORLD.rank == 0: + uid = nccl.get_unique_id() + else: + uid = None + uid = MPI.COMM_WORLD.bcast(uid, root=0) + nccl_comm = nccl.NcclCommunicator(MPI.COMM_WORLD.size, uid, MPI.COMM_WORLD.rank) + + table = None + if MPI.COMM_WORLD.rank == 0: + # Set table headers + table = PrettyTable() + 
table.field_names = [ + f"Size ({dtype_str})", + "Time (us)", + "AlgBW (GB/s)", + "Correctness", + "NCCL Time (us)", + "NCCL AlgBW (GB/s)", + "NCCL Correctness", + "Speed Up", + ] + + for i in range(10, 28): + if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: + run_benchmark(mscclpp_group, nccl_comm, table, 100, 2**i) + elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: + run_benchmark(mscclpp_group, nccl_comm, table, 100, 3 * 2**i) + else: + raise RuntimeError("Only support one node/two nodes communication") + + if MPI.COMM_WORLD.rank == 0: + print() + print(table) + mscclpp_group = None + nccl_comm = None diff --git a/python/benchmark/mscclpp_op.py b/python/benchmark/mscclpp_op.py new file mode 100644 index 000000000..92d17e2f7 --- /dev/null +++ b/python/benchmark/mscclpp_op.py @@ -0,0 +1,344 @@ +import os +import cupy as cp +import ctypes +from mscclpp import Transport, ProxyService +import mscclpp.comm as mscclpp_comm +from mscclpp.utils import KernelBuilder, pack + + +IB_TRANSPORTS = [ + Transport.IB0, + Transport.IB1, + Transport.IB2, + Transport.IB3, + Transport.IB4, + Transport.IB5, + Transport.IB6, + Transport.IB7, +] + + +def type_to_str(dtype): + if dtype == cp.float16: + return "__half" + elif dtype == cp.float32: + return "float" + elif dtype == cp.int32: + return "int" + else: + raise RuntimeError("Unknown data type") + + +class MscclppAllReduce1: + def __init__( + self, + group: mscclpp_comm.CommGroup, + memory: cp.ndarray, + read_only: int = 1, + nthreads: int = 1024, + nblocks: int = 24, + ): + self.group = group + self.memory = memory + remote_nghrs = list(range(self.group.nranks)) + remote_nghrs.remove(self.group.my_rank) + + self.group.barrier() + # create a connection for each remote neighbor + self.connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) + type_str = type_to_str(memory.dtype) + + # create a sm_channel for each remote neighbor + self.sm_channels = self.group.make_sm_channels(self.memory, self.connections) + 
file_dir = os.path.dirname(os.path.abspath(__file__)) + self.kernel = KernelBuilder( + file="allreduce.cu", + kernel_name="allreduce1", + file_dir=file_dir, + macro_dict={"TYPE": type_str, "READ_ONLY": str(read_only)}, + ).get_compiled_kernel() + self.params = b"" + self.device_handles = [] + for rank in range(self.group.nranks): + if rank != self.group.my_rank: + self.device_handles.append(self.sm_channels[rank].device_handle().raw) + self.params += pack( + cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8), + self.memory, + self.group.my_rank, + self.group.nranks, + ctypes.c_size_t(self.memory.size), + ) + self.nthreads = nthreads + self.nblocks = nblocks + + def __call__(self, stream_ptr): + self.kernel.launch_kernel(self.params, self.nblocks, self.nthreads, 0, stream_ptr) + return self.memory + + +class MscclppAllReduce2: + def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, memory_out: cp.ndarray): + self.group = group + self.memory = memory + self.memory_out = memory_out + remote_nghrs = list(range(self.group.nranks)) + remote_nghrs.remove(self.group.my_rank) + + self.group.barrier() + # create a connection for each remote neighbor + self.connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) + type_str = type_to_str(memory.dtype) + + self.scratch = cp.zeros(self.memory.size * 8, dtype=self.memory.dtype) + # create a sm_channel for each remote neighbor + self.sm_channels = self.group.make_sm_channels_with_scratch(self.memory, self.scratch, self.connections) + file_dir = os.path.dirname(os.path.abspath(__file__)) + self.kernel = KernelBuilder( + file="allreduce.cu", kernel_name="allreduce2", file_dir=file_dir, macro_dict={"TYPE": type_str} + ).get_compiled_kernel() + self.params = b"" + self.device_handles = [] + for rank in range(self.group.nranks): + if rank != self.group.my_rank: + self.device_handles.append(self.sm_channels[rank].device_handle().raw) + self.params += pack( + 
cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8), + self.memory, + self.scratch, + self.memory_out, + self.group.my_rank, + self.group.nranks, + ctypes.c_size_t(self.memory.size), + ) + + def __call__(self, stream_ptr): + self.kernel.launch_kernel(self.params, 21, 512, 0, stream_ptr) + return self.memory_out + + +class MscclppAllReduce3: + def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, proxy_service: ProxyService): + self.group = group + self.memory = memory + remote_nghrs = list(range(self.group.nranks)) + remote_nghrs.remove(self.group.my_rank) + + self.group.barrier() + # create a connection for each remote neighbor + self.connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) + type_str = type_to_str(memory.dtype) + + self.proxy_service = proxy_service + self.scratch = cp.zeros(self.memory.size, dtype=self.memory.dtype) + + # create a sm_channel for each remote neighbor + self.fst_round_proxy_chans = self.group.make_proxy_channels_with_scratch( + self.proxy_service, self.memory, self.scratch, self.connections + ) + self.snd_round_proxy_chans = self.group.make_proxy_channels(self.proxy_service, self.memory, self.connections) + file_dir = os.path.dirname(os.path.abspath(__file__)) + self.kernel = KernelBuilder( + file="allreduce.cu", kernel_name="allreduce3", file_dir=file_dir, macro_dict={"TYPE": type_str} + ).get_compiled_kernel() + self.params = b"" + self.fst_device_handles = [] + self.snd_device_handles = [] + for rank in range(self.group.nranks): + if rank != self.group.my_rank: + self.fst_device_handles.append(self.fst_round_proxy_chans[rank].device_handle().raw) + self.snd_device_handles.append(self.snd_round_proxy_chans[rank].device_handle().raw) + self.params += pack( + cp.asarray(memoryview(b"".join(self.fst_device_handles)), dtype=cp.uint8), + cp.asarray(memoryview(b"".join(self.snd_device_handles)), dtype=cp.uint8), + self.memory, + self.scratch, + self.group.my_rank, + self.group.nranks, 
+ ctypes.c_size_t(self.memory.size), + ) + + def __call__(self, stream_ptr): + self.kernel.launch_kernel(self.params, 24, 1024, 0, stream_ptr) + return self.memory + + +class MscclppAllReduce4: + def __init__( + self, + group: mscclpp_comm.CommGroup, + memory: cp.ndarray, + nranks_per_node: int, + proxy_service: ProxyService, + nblocks: int = 45, + block_size: int = 512, + pipeline_depth: int = 3, + ): + self.group = group + self.memory = memory + + self.nranks_per_node = nranks_per_node + in_same_node = lambda rank: rank // nranks_per_node == self.group.my_rank // nranks_per_node + remote_nghrs = list(range(self.group.nranks)) + remote_nghrs.remove(self.group.my_rank) + transports = {} + for rank in remote_nghrs: + if in_same_node(rank): + transports[rank] = Transport.CudaIpc + else: + transports[rank] = IB_TRANSPORTS[rank % nranks_per_node] + + self.group.barrier() + # create a connection for each remote neighbor + self.connections = self.group.make_connection(remote_nghrs, transports) + type_str = type_to_str(memory.dtype) + + self.proxy_service = proxy_service + self.scratch = cp.zeros(self.memory.size, dtype=self.memory.dtype) + same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)} + # create a sm_channel for each remote neighbor + self.sm_channels = self.group.make_sm_channels(self.memory, same_node_connections) + self.reduce_scatter_proxy_channels = self.group.make_proxy_channels_with_scratch( + self.proxy_service, self.memory, self.scratch, self.connections + ) + self.all_gather_proxy_channels = self.group.make_proxy_channels( + self.proxy_service, self.memory, self.connections + ) + file_dir = os.path.dirname(os.path.abspath(__file__)) + self.kernel = KernelBuilder( + file="allreduce.cu", kernel_name="allreduce4", file_dir=file_dir, macro_dict={"TYPE": type_str} + ).get_compiled_kernel() + self.sm_device_handles = [] + self.reduce_sactter_proxy_device_handles = [] + self.all_gather_proxy_device_handles = [] + 
for rank in range(self.group.nranks): + if rank != self.group.my_rank and in_same_node(rank): + self.sm_device_handles.append(self.sm_channels[rank].device_handle().raw) + if rank != self.group.my_rank: + self.reduce_sactter_proxy_device_handles.append( + self.reduce_scatter_proxy_channels[rank].device_handle().raw + ) + self.all_gather_proxy_device_handles.append(self.all_gather_proxy_channels[rank].device_handle().raw) + + self.set_params(nblocks, block_size, pipeline_depth) + + def __call__(self, stream_ptr): + self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr) + return self.memory + + def set_params(self, nblocks, block_size, pipeline_depth): + self.nblocks = nblocks + self.block_size = block_size + self.pipeline_depth = pipeline_depth + + self.params = b"" + self.params += pack( + cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8), + cp.asarray(memoryview(b"".join(self.reduce_sactter_proxy_device_handles)), dtype=cp.uint8), + cp.asarray(memoryview(b"".join(self.all_gather_proxy_device_handles)), dtype=cp.uint8), + self.memory, + self.scratch, + self.group.my_rank, + self.nranks_per_node, + self.group.nranks, + bytes(4), # padding for memory alignment + ctypes.c_size_t(self.memory.size), + self.pipeline_depth, + ) + + def auto_tune(self): + nblocks_to_try = [24, 32, 40, 45, 48, 64, 72, 90, 96, 108] + block_size_to_try = [256, 512, 1024] + pipeline_depth_to_try = [1, 2, 3, 4] + for nblocks in nblocks_to_try: + for block_size in block_size_to_try: + for pipeline_depth in pipeline_depth_to_try: + self.set_params(nblocks, block_size, pipeline_depth) + yield nblocks, block_size, pipeline_depth + + +class MscclppAllReduce5: + def __init__( + self, + group: mscclpp_comm.CommGroup, + memory: cp.ndarray, + memory_out: cp.ndarray, + nranks_per_node: int, + proxy_service: ProxyService, + nblocks: int = 21, + block_size: int = 512, + ): + self.group = group + self.memory = memory + self.memory_out = memory_out + + 
self.nranks_per_node = nranks_per_node + in_same_node = lambda rank: rank // nranks_per_node == self.group.my_rank // nranks_per_node + remote_nghrs = list(range(self.group.nranks)) + remote_nghrs.remove(self.group.my_rank) + transports = {} + for rank in remote_nghrs: + if in_same_node(rank): + transports[rank] = Transport.CudaIpc + else: + transports[rank] = IB_TRANSPORTS[rank % nranks_per_node] + + self.group.barrier() + # create a connection for each remote neighbor + self.connections = self.group.make_connection(remote_nghrs, transports) + type_str = type_to_str(memory.dtype) + + self.proxy_service = proxy_service + self.scratch = cp.zeros(self.memory.size * 8, dtype=self.memory.dtype) + self.put_buff = cp.zeros(self.memory.size * 8 // nranks_per_node, dtype=self.memory.dtype) + same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)} + across_node_connections = {rank: conn for rank, conn in self.connections.items() if not in_same_node(rank)} + # create a sm_channel for each remote neighbor + self.sm_channels = self.group.make_sm_channels_with_scratch(self.memory, self.scratch, same_node_connections) + self.proxy_channels = self.group.make_proxy_channels_with_scratch( + self.proxy_service, self.put_buff, self.scratch, across_node_connections + ) + file_dir = os.path.dirname(os.path.abspath(__file__)) + self.kernel = KernelBuilder( + file="allreduce.cu", kernel_name="allreduce5", file_dir=file_dir, macro_dict={"TYPE": type_str} + ).get_compiled_kernel() + self.sm_device_handles = [] + self.proxy_device_handles = [] + for rank in range(self.group.nranks): + if rank != self.group.my_rank and in_same_node(rank): + self.sm_device_handles.append(self.sm_channels[rank].device_handle().raw) + if rank != self.group.my_rank and not in_same_node(rank): + self.proxy_device_handles.append(self.proxy_channels[rank].device_handle().raw) + + self.set_params(nblocks, block_size) + + def __call__(self, stream_ptr): + 
self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr) + return self.memory_out + + def set_params(self, nblocks, block_size): + self.nblocks = nblocks + self.block_size = block_size + + self.params = b"" + self.params += pack( + cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8), + cp.asarray(memoryview(b"".join(self.proxy_device_handles)), dtype=cp.uint8), + self.memory, + self.scratch, + self.put_buff, + self.memory_out, + self.group.my_rank, + self.nranks_per_node, + self.group.nranks, + bytes(4), # padding for memory alignment + ctypes.c_size_t(self.memory.size), + ) + + def auto_tune(self): + nblocks_to_try = [21, 42, 84] + block_size_to_try = [256, 512, 1024] + for nblocks in nblocks_to_try: + for block_size in block_size_to_try: + self.set_params(nblocks, block_size) + yield nblocks, block_size diff --git a/python/benchmark/nccl_op.py b/python/benchmark/nccl_op.py new file mode 100644 index 000000000..8a2ff56aa --- /dev/null +++ b/python/benchmark/nccl_op.py @@ -0,0 +1,23 @@ +import cupy.cuda.nccl as nccl +from mpi4py import MPI +import cupy as cp + + +class NcclAllReduce: + def __init__(self, nccl_comm: nccl.NcclCommunicator, memory: cp.ndarray): + self.nccl_comm = nccl_comm + self.memory = memory + if memory.dtype == cp.float32: + self.nccl_dtype = nccl.NCCL_FLOAT32 + elif memory.dtype == cp.float16: + self.nccl_dtype = nccl.NCCL_FLOAT16 + elif memory.dtype == cp.int32: + self.nccl_dtype = nccl.NCCL_INT32 + else: + raise RuntimeError("Make sure that the data type is mapped to the correct NCCL data type") + + def __call__(self, stream_ptr): + self.nccl_comm.allReduce( + self.memory.data.ptr, self.memory.data.ptr, self.memory.size, self.nccl_dtype, nccl.NCCL_SUM, stream_ptr + ) + return self.memory diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 5165e95cb..d411bc1b0 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -23,6 +23,9 @@ __version__ = version() 
+if _os.environ.get("MSCCLPP_HOME", None) is None: + _os.environ["MSCCLPP_HOME"] = _os.path.abspath(_os.path.dirname(__file__)) + def get_include(): """Return the directory that contains the MSCCL++ headers.""" diff --git a/python/test/mscclpp_group.py b/python/mscclpp/comm.py similarity index 66% rename from python/test/mscclpp_group.py rename to python/mscclpp/comm.py index 7a7c7b017..c01c04a2d 100644 --- a/python/test/mscclpp_group.py +++ b/python/mscclpp/comm.py @@ -2,11 +2,10 @@ # Licensed under the MIT license. from __future__ import annotations -import logging from typing import Type import cupy as cp -from mscclpp import ( +from ._mscclpp import ( Communicator, Connection, Host2DeviceSemaphore, @@ -20,26 +19,32 @@ Transport, TransportFlags, ) +import mpi4py import numpy as np -from .mscclpp_mpi import MpiGroup -logger = logging.getLogger(__name__) - - -class MscclppGroup: - def __init__(self, mpi_group: MpiGroup, interfaceIpPortTrio=""): - self.bootstrap = TcpBootstrap.create(mpi_group.comm.rank, mpi_group.comm.size) +class CommGroup: + def __init__( + self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None + ): if interfaceIpPortTrio == "": + self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size) uniq_id = None - if mpi_group.comm.rank == 0: + if mpi_comm.rank == 0: # similar to NCCL's unique id uniq_id = self.bootstrap.create_unique_id() - uniq_id_global = mpi_group.comm.bcast(uniq_id, 0) + uniq_id_global = mpi_comm.bcast(uniq_id, 0) self.bootstrap.initialize(uniq_id_global) - else: + elif mpi_comm: # use this instead + self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size) + self.bootstrap.initialize(interfaceIpPortTrio) + elif not interfaceIpPortTrio == "": + assert rank >= 0 and size >= 1 + self.bootstrap = TcpBootstrap.create(rank, size) self.bootstrap.initialize(interfaceIpPortTrio) + else: + raise RuntimeError("Either the interface or mpi_group need to be specified") 
self.communicator = Communicator(self.bootstrap) self.my_rank = self.bootstrap.get_rank() self.nranks = self.bootstrap.get_n_ranks() @@ -73,9 +78,15 @@ def my_ib_device(self, local_rank: int) -> Transport: else: assert False # only 8 IBs are supported - def make_connection(self, remote_ranks: list[int], transport: Transport) -> dict[int, Connection]: + def make_connection( + self, remote_ranks: list[int], transports: Transport | dict[int, Transport] + ) -> dict[int, Connection]: connections = {} for rank in remote_ranks: + if type(transports) is dict: + transport = transports[rank] + else: + transport = transports connections[rank] = self.communicator.connect_on_setup(rank, 0, transport) self.communicator.setup() connections = {rank: connections[rank].get() for rank in connections} @@ -119,19 +130,19 @@ def make_sm_channels(self, tensor: cp.ndarray, connections: dict[int, Connection channels[rank] = SmChannel(semaphores[rank], registered_memories[rank], tensor.data.ptr) return channels - def make_sm_channels_with_packet( - self, tensor: cp.ndarray, packetTensor: cp.ndarray, connections: dict[int, Connection] + def make_sm_channels_with_scratch( + self, tensor: cp.ndarray, scratchTensor: cp.ndarray, connections: dict[int, Connection] ) -> dict[int, SmChannel]: semaphores = self.make_semaphore(connections, SmDevice2DeviceSemaphore) - registered_memories = self.register_tensor_with_connections(packetTensor, connections) + registered_memories = self.register_tensor_with_connections(scratchTensor, connections) channels = {} for rank in connections: channels[rank] = SmChannel( - semaphores[rank], registered_memories[rank], tensor.data.ptr, packetTensor.data.ptr + semaphores[rank], registered_memories[rank], tensor.data.ptr, scratchTensor.data.ptr ) return channels - def make_proxy_channels_with_packet( + def make_proxy_channels( self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] ) -> dict[int, SmChannel]: semaphores = 
self.make_semaphore(connections, Host2DeviceSemaphore) @@ -148,3 +159,34 @@ def make_proxy_channels_with_packet( proxy_service.proxy_channel(semaphore_ids[rank]), memory_ids[rank], memory_ids[self.my_rank] ) return channels + + def make_proxy_channels_with_scratch( + self, + proxy_service: ProxyService, + tensor: cp.ndarray, + scratchTensor: cp.ndarray, + connections: dict[int, Connection], + ) -> dict[int, SmChannel]: + transport_flags = TransportFlags() + for rank in connections: + transport_flags |= connections[rank].transport() + data_ptr = tensor.data.ptr if isinstance(tensor, cp.ndarray) else tensor.ctypes.data + local_reg_memory = self.communicator.register_memory(data_ptr, tensor.size * tensor.itemsize, transport_flags) + + semaphores = self.make_semaphore(connections, Host2DeviceSemaphore) + registered_memories = self.register_tensor_with_connections(scratchTensor, connections) + memory_ids = {} + semaphore_ids = {} + for rank in registered_memories: + if rank == self.my_rank: + memory_ids[self.my_rank] = proxy_service.add_memory(local_reg_memory) + else: + memory_ids[rank] = proxy_service.add_memory(registered_memories[rank]) + for rank in semaphores: + semaphore_ids[rank] = proxy_service.add_semaphore(semaphores[rank]) + channels = {} + for rank in semaphores: + channels[rank] = SimpleProxyChannel( + proxy_service.proxy_channel(semaphore_ids[rank]), memory_ids[rank], memory_ids[self.my_rank] + ) + return channels diff --git a/python/test/utils.py b/python/mscclpp/utils.py similarity index 79% rename from python/test/utils.py rename to python/mscclpp/utils.py index ca11407d1..9f71b70c4 100644 --- a/python/test/utils.py +++ b/python/mscclpp/utils.py @@ -74,19 +74,27 @@ def __del__(self): class KernelBuilder: kernel_map: dict = {} - def __init__(self, file: str, kernel_name: str): - if kernel_name in self.kernel_map: - self._kernel = self.kernel_map[kernel_name] + def get_key(self, kernel_name, macro_dict): + return kernel_name + 
"-".join(f"{key}={macro_dict[key]}" for key in sorted(macro_dict)) + + def __init__(self, file: str, kernel_name: str, file_dir: str = None, macro_dict: dict = {}): + kernel_key = self.get_key(kernel_name, macro_dict) + if kernel_key in self.kernel_map: + self._kernel = self.kernel_map[kernel_key] return self._tempdir = tempfile.TemporaryDirectory(suffix=f"{os.getpid()}") - self._current_file_dir = os.path.dirname(os.path.abspath(__file__)) + self._current_file_dir = file_dir if file_dir else os.path.dirname(os.path.abspath(__file__)) + self.macros = None + if file_dir: + self.macros = ["-D{}={}".format(macro, value) for macro, value in macro_dict.items()] device_id = cp.cuda.Device().id ptx = self._compile_cuda(os.path.join(self._current_file_dir, file), f"{kernel_name}.ptx", device_id) self._kernel = Kernel(ptx, kernel_name, device_id) - self.kernel_map[kernel_name] = self._kernel + self.kernel_map[kernel_key] = self._kernel def _compile_cuda(self, source_file, output_file, device_id, std_version="c++17"): - include_dir = os.path.join(self._current_file_dir, "../../include") + mscclpp_home = os.environ.get("MSCCLPP_HOME", "/usr/local/mscclpp") + include_dir = os.path.join(mscclpp_home, "include") major = _check_cuda_errors( cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device_id) ) @@ -108,12 +116,15 @@ def _compile_cuda(self, source_file, output_file, device_id, std_version="c++17" "-o", f"{self._tempdir.name}/{output_file}", ] + if self.macros: + command += self.macros try: subprocess.run(command, capture_output=True, text=True, check=True, bufsize=1) with open(f"{self._tempdir.name}/{output_file}", "rb") as f: return f.read() except subprocess.CalledProcessError as e: - raise RuntimeError("Compilation failed:", e.stderr, " ".join(command)) + print(e.stderr, end="") + raise RuntimeError("Compilation failed: ", " ".join(command)) def get_compiled_kernel(self): return self._kernel @@ -128,6 +139,8 @@ def pack(*args): for 
arg in list(args): if isinstance(arg, int): res += struct.pack("i", arg) + elif isinstance(arg, ctypes.c_size_t): + res += struct.pack("N", arg.value) elif isinstance(arg, np.ndarray): res += struct.pack("P", arg.ctypes.data) elif isinstance(arg, cp.ndarray): @@ -135,6 +148,8 @@ def pack(*args): # use int to represent bool, which can avoid CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error elif isinstance(arg, bool): res += struct.pack("i", arg) + elif isinstance(arg, bytes): + res += struct.pack(f"{len(arg)}s", arg) else: raise RuntimeError(f"Unsupported type: {type(arg)}") return res diff --git a/python/requirements_cu11.txt b/python/requirements_cu11.txt new file mode 100644 index 000000000..47285da3e --- /dev/null +++ b/python/requirements_cu11.txt @@ -0,0 +1,7 @@ +mpi4py +cupy-cuda11x +prettytable +cuda-python +netifaces +pytest +numpy diff --git a/python/requirements_cu12.txt b/python/requirements_cu12.txt new file mode 100644 index 000000000..094dff8de --- /dev/null +++ b/python/requirements_cu12.txt @@ -0,0 +1,7 @@ +mpi4py +cupy-cuda12x +prettytable +cuda-python +netifaces +pytest +numpy diff --git a/python/test/mscclpp_mpi.py b/python/test/mscclpp_mpi.py index 1f37eb9c6..21a7fff44 100644 --- a/python/test/mscclpp_mpi.py +++ b/python/test/mscclpp_mpi.py @@ -38,10 +38,13 @@ def finalize_mpi(): class MpiGroup: - def __init__(self, ranks: list): + def __init__(self, ranks: list = []): world_group = MPI.COMM_WORLD.group - group = world_group.Incl(ranks) - self.comm = MPI.COMM_WORLD.Create(group) + if len(ranks) == 0: + self.comm = MPI.COMM_WORLD + else: + group = world_group.Incl(ranks) + self.comm = MPI.COMM_WORLD.Create(group) @pytest.fixture diff --git a/python/test/requirements_cu11.txt b/python/test/requirements_cu11.txt deleted file mode 100644 index 2b79ab977..000000000 --- a/python/test/requirements_cu11.txt +++ /dev/null @@ -1,6 +0,0 @@ -cuda-python==12.1.0 -mpi4py==3.1.4 -netifaces==0.11.0 -numpy==1.22.2 -pytest==7.2.2 -cupy-cuda11x diff --git 
a/python/test/requirements_cu12.txt b/python/test/requirements_cu12.txt deleted file mode 100644 index 0061438d2..000000000 --- a/python/test/requirements_cu12.txt +++ /dev/null @@ -1,6 +0,0 @@ -cuda-python==12.1.0 -mpi4py==3.1.4 -netifaces==0.11.0 -numpy==1.22.2 -pytest==7.2.2 -cupy-cuda12x diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 4fc00e3b3..f3a7f9dd6 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. from concurrent.futures import ThreadPoolExecutor +import os import time import threading @@ -11,18 +12,18 @@ import pytest from mscclpp import ( - TcpBootstrap, Fifo, Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, SmDevice2DeviceSemaphore, + TcpBootstrap, Transport, ) +import mscclpp.comm as mscclpp_comm +from mscclpp.utils import KernelBuilder, pack from ._cpp import _ext -from .mscclpp_group import MscclppGroup from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group -from .utils import KernelBuilder, pack ethernet_interface_name = "eth0" @@ -50,7 +51,7 @@ def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str): # ranks are on different nodes pytest.skip("this case is not supported as localhost will be different for different nodes") - group = MscclppGroup(mpi_group, ifIpPortTrio) + group = mscclpp_comm.CommGroup(mpi_group.comm, ifIpPortTrio) nelem = 1024 memory = np.zeros(nelem, dtype=np.int32) @@ -119,7 +120,7 @@ def init_target(): def create_and_connect(mpi_group: MpiGroup, transport: str): if transport == "NVLink" and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink for cross node") - group = MscclppGroup(mpi_group) + group = mscclpp_comm.CommGroup(mpi_group.comm) remote_nghrs = list(range(mpi_group.comm.size)) remote_nghrs.remove(mpi_group.comm.rank) @@ -278,33 +279,40 @@ def __init__( scratch=None, fifo=None, ): + file_dir = os.path.dirname(os.path.abspath(__file__)) if test_name == 
"h2d_semaphore": self._kernel = KernelBuilder( - file="h2d_semaphore_test.cu", kernel_name="h2d_semaphore" + file="h2d_semaphore_test.cu", kernel_name="h2d_semaphore", file_dir=file_dir ).get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks elif test_name == "d2d_semaphore": self._kernel = KernelBuilder( - file="d2d_semaphore_test.cu", kernel_name="d2d_semaphore" + file="d2d_semaphore_test.cu", kernel_name="d2d_semaphore", file_dir=file_dir ).get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks elif test_name == "sm_channel": - self._kernel = KernelBuilder(file="sm_channel_test.cu", kernel_name="sm_channel").get_compiled_kernel() + self._kernel = KernelBuilder( + file="sm_channel_test.cu", kernel_name="sm_channel", file_dir=file_dir + ).get_compiled_kernel() self.nblocks = nranks self.nthreads = 1024 elif test_name == "fifo": - self._kernel = KernelBuilder(file="fifo_test.cu", kernel_name="fifo").get_compiled_kernel() + self._kernel = KernelBuilder( + file="fifo_test.cu", kernel_name="fifo", file_dir=file_dir + ).get_compiled_kernel() self.nblocks = 1 self.nthreads = 1 elif test_name == "proxy": - self._kernel = KernelBuilder(file="proxy_test.cu", kernel_name="proxy").get_compiled_kernel() + self._kernel = KernelBuilder( + file="proxy_test.cu", kernel_name="proxy", file_dir=file_dir + ).get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks elif test_name == "simple_proxy_channel": self._kernel = KernelBuilder( - file="simple_proxy_channel_test.cu", kernel_name="simple_proxy_channel" + file="simple_proxy_channel_test.cu", kernel_name="simple_proxy_channel", file_dir=file_dir ).get_compiled_kernel() self.nblocks = 1 self.nthreads = 1024 @@ -393,7 +401,7 @@ def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): memory_expected[(nelemPerRank * rank) : (nelemPerRank * (rank + 1))] = rank + 1 if use_packet: - channels = group.make_sm_channels_with_packet(memory, scratch, connections) + channels = 
group.make_sm_channels_with_scratch(memory, scratch, connections) else: channels = group.make_sm_channels(memory, connections) kernel = MscclppKernel("sm_channel", group.my_rank, group.nranks, channels, memory, use_packet, scratch) @@ -496,7 +504,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u memory_to_register = scratch else: memory_to_register = memory - simple_channels = group.make_proxy_channels_with_packet(proxy_service, memory_to_register, connections) + simple_channels = group.make_proxy_channels(proxy_service, memory_to_register, connections) kernel = MscclppKernel( "simple_proxy_channel", diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh index 248c09c43..dee5af2d6 100644 --- a/test/deploy/deploy.sh +++ b/test/deploy/deploy.sh @@ -1,14 +1,10 @@ set -e KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -SRC_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/build" -SRC_INCLUDE_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/include" -PYTHON_SRC_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/python" +ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/" DST_DIR="/tmp/mscclpp" HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile" -DEPLOY_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy" SSH_OPTION="StrictHostKeyChecking=no" -MSCCLPP_TEST_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/mscclpp-test" chmod 400 ${KeyFilePath} ssh-keygen -t rsa -f sshkey -P "" @@ -25,23 +21,15 @@ done set -e parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "rm -rf ${DST_DIR}" -parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "mkdir -p ${DST_DIR}" -parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${SRC_DIR} ${DST_DIR} -parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${PYTHON_SRC_DIR} ${DST_DIR} -parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${SRC_INCLUDE_DIR} ${DST_DIR} - -parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION 
sshkey ${DST_DIR} -parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION sshkey.pub ${DST_DIR} -parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${DEPLOY_DIR}/* ${DST_DIR} -parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${MSCCLPP_TEST_DIR}/check_perf_result.py ${DST_DIR} +parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR} # force to pull the latest image parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker pull ${CONTAINERIMAGE}" parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \ - -w /root -v ${DST_DIR}:/root/mscclpp --name=mscclpp-test \ + -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --name=mscclpp-test \ --entrypoint /bin/bash ${CONTAINERIMAGE}" parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/setup.sh'" + "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'" diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index d862d39bb..fb9701797 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -1,62 +1,63 @@ set -e +HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi function run_mscclpp_test() { echo "=================Run allgather_test_perf on 2 nodes=========================" - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \ + /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl # For kernel 2, the message size must can be divided by 3 - 
/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \ + /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \ + /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl echo "==================Run allreduce_test_perf on 2 nodes=========================" - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \ + /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \ + /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \ + /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 
/root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \ + /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \ + /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl echo "==================Run alltoall_test_perf on 2 nodes=========================" - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \ + /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl echo "========================Run performance check===============================" - python3 /root/mscclpp/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \ - --baseline-file /root/mscclpp/perf_ndmv4.jsonl + python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \ + --baseline-file /root/mscclpp/test/deploy/perf_ndmv4.jsonl } function run_mp_ut() { echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)=========================" /usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \ 
- -hostfile /root/mscclpp/hostfile_mpi -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ + -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003 echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)=========================" /usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ - -hostfile /root/mscclpp/hostfile_mpi -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ + -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003 } @@ -64,12 +65,23 @@ function run_pytests() { echo "==================Run python tests================================" /usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ - -hostfile /root/mscclpp/hostfile_mpi -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ - -npernode 8 bash /root/mscclpp/pytest.sh + -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ + -x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh +} + +function run_py_benchmark() +{ + echo "==================Run python benchmark================================" + /usr/local/mpi/bin/mpirun -allow-run-as-root -np 16 --bind-to numa \ + -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ + -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \ + -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \ + -x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x 
NCCL_SHM_DISABLE=0 \ + -x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/benchmark/allreduce_bench.py } if [ $# -lt 1 ]; then - echo "Usage: $0 " + echo "Usage: $0 " exit 1 fi test_name=$1 @@ -83,9 +95,13 @@ case $test_name in run_mp_ut ;; pytests) - echo "==================Run python tests================================" + echo "==================Run python tests====================================" run_pytests ;; + py-benchmark) + echo "==================Run python benchmark================================" + run_py_benchmark + ;; *) echo "Unknown test name: $test_name" exit 1 diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index 2b2c7f7e8..1d0641773 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -3,7 +3,7 @@ set -e mkdir -p /root/.ssh mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys chown root:root /root/.ssh/authorized_keys -mv /root/mscclpp/config /root/.ssh/config +mv /root/mscclpp/test/deploy/config /root/.ssh/config chown root:root /root/.ssh/config chmod 400 /root/mscclpp/sshkey chown root:root /root/mscclpp/sshkey @@ -14,10 +14,12 @@ for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do done if [[ "${CUDA_VERSION}" == *"11."* ]]; then - pip3 install -r /root/mscclpp/python/test/requirements_cu11.txt + pip3 install -r /root/mscclpp/python/requirements_cu11.txt else - pip3 install -r /root/mscclpp/python/test/requirements_cu12.txt + pip3 install -r /root/mscclpp/python/requirements_cu12.txt fi +cd /root/mscclpp && pip3 install . + mkdir -p /var/run/sshd /usr/sbin/sshd -p 22345 diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 4df3f09e9..73f23a1b1 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -890,7 +890,7 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, size_t scratchResultOffset = (flag & 1) ? 
2 * nPkts * sizeof(mscclpp::LLPacket) : 3 * nPkts * sizeof(mscclpp::LLPacket); size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); - uint2* src = (uint2*)((char*)buff + srcOffset); + uint2* src = (uint2*)((char*)buff + rank * nelemsPerRank * sizeof(int)); uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer From 1d1199703a8ad2ec087212970ce573639dedccfc Mon Sep 17 00:00:00 2001 From: Saeed Maleki <30272783+saeedmaleki@users.noreply.github.com> Date: Fri, 17 Nov 2023 05:42:05 -0800 Subject: [PATCH 04/39] Auto-tune single-node AllReduce (#219) single node auto-tuner + graph plotter + bug fix for illegal memory access --------- Co-authored-by: Changho Hwang --- python/benchmark/allreduce.cu | 19 ++-- python/benchmark/allreduce_bench.py | 78 ++++++++++++++--- python/benchmark/mscclpp_op.py | 130 ++++++++++++++++++++++------ python/requirements_cu11.txt | 1 + python/requirements_cu12.txt | 1 + 5 files changed, 182 insertions(+), 47 deletions(-) diff --git a/python/benchmark/allreduce.cu b/python/benchmark/allreduce.cu index 4dc2b0c45..e90dc1474 100644 --- a/python/benchmark/allreduce.cu +++ b/python/benchmark/allreduce.cu @@ -118,12 +118,9 @@ __forceinline__ __device__ void vectorSum(TYPE* dst, TYPE* src, size_t nElem) { // AllReduce1 // ------------------------------------------- -#ifndef READ_ONLY -#define READ_ONLY 0 -#endif - -extern "C" __global__ void __launch_bounds__(1024, 1) - allreduce1(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nranks, size_t nelems) { +template +__device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nranks, + size_t nelems) { const size_t chunkSize = nelems / nranks; if (nranks == 1) return; const int nPeer = nranks - 1; @@ -211,13 +208,21 @@ extern "C" __global__ void __launch_bounds__(1024, 1) } } +extern "C" __global__ void __launch_bounds__(1024, 1) allreduce1(mscclpp::SmChannelDeviceHandle* smChans, 
TYPE* buff, + int rank, int nranks, size_t nelems, int read_only) { + if (read_only) + allreduce1_helper<1>(smChans, buff, rank, nranks, nelems); + else + allreduce1_helper<0>(smChans, buff, rank, nranks, nelems); +} + // ------------------------------------------- // AllReduce2 // ------------------------------------------- __device__ uint64_t globalFlag = 1; -extern "C" __global__ void __launch_bounds__(512, 1) +extern "C" __global__ void __launch_bounds__(1024, 1) allreduce2(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, TYPE* scratch, void* resultBuff, int rank, int worldSize, size_t nelems) { nelems = nelems / (sizeof(int) / sizeof(TYPE)); diff --git a/python/benchmark/allreduce_bench.py b/python/benchmark/allreduce_bench.py index aa2c096ed..2cf09cba0 100644 --- a/python/benchmark/allreduce_bench.py +++ b/python/benchmark/allreduce_bench.py @@ -23,6 +23,46 @@ raise RuntimeError("Unknown data type") +def plot_graph(sizes, mscclpp_algbw, nccl_algbw, speed_ups): + import matplotlib.pyplot as plt + + human_readable_sizes = [human_readable_size(size) for size in sizes] + + fig, ax1 = plt.subplots(figsize=(10, 6)) + + # Plotting AlgBW for MSCCLPP and NCCL on the primary y-axis + (line1,) = ax1.plot(sizes, mscclpp_algbw, marker="o", color="blue", label="MSCCLPP AlgBW") + (line2,) = ax1.plot(sizes, nccl_algbw, marker="x", color="red", label="NCCL AlgBW") + ax1.set_ylabel("AlgBW (GB/s)") + ax1.set_xlabel("Data Size") + + # Logarithmic x-axis + ax1.set_xscale("log", base=2) + ax1.set_xticks(sizes) + ax1.set_xticklabels(human_readable_sizes, rotation=45) + + # Adding secondary y-axis for Speed Up + ax2 = ax1.twinx() + (line3,) = ax2.plot(sizes, speed_ups, marker="^", color="green", label="Speed Up") + ax2.set_ylabel("Speed Up (NCCL Time / MSCCLPP Time)", color="green") + ax2.tick_params(axis="y", labelcolor="green") + + # Set the lower bound of the secondary y-axis to 0 + ax2.set_ylim(bottom=0) + + # Creating legends + lines = [line1, line2, line3] + labels = 
[line.get_label() for line in lines] + ax1.legend(lines, labels, loc="upper left") + + # Setting title and grid + ax1.set_title("MSCCLPP vs NCCL -- " + str(MPI.COMM_WORLD.size // N_GPUS_PER_NODE) + " Nodes") + ax2.grid(True, which="both", ls="--") + + # Saving the plot + plt.savefig("mscclpp_vs_nccl_comparison.pdf", format="pdf") + + def human_readable_size(size, decimal_places=1): for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]: if size < 1024.0 or unit == "PiB": @@ -99,15 +139,12 @@ def run_benchmark( memory_out = cp.zeros(nelem, dtype=data_type) cp.cuda.runtime.deviceSynchronize() + proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: if memory.nbytes < 2**20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) elif memory.nbytes < 2**29: - if memory.nbytes >= 2**20 and memory.nbytes <= 2**22: - read_only = 0 - else: - read_only = 1 - mscclpp_call = MscclppAllReduce1(mscclpp_group, memory, read_only=read_only) + mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: proxy_service = ProxyService() mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) @@ -117,14 +154,13 @@ def run_benchmark( proxy_service = ProxyService() mscclpp_call = MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service) proxy_service.start_proxy() - best_config = find_best_config(mscclpp_call, 100) - mscclpp_call.set_params(*best_config) else: proxy_service = ProxyService() mscclpp_call = MscclppAllReduce4(mscclpp_group, memory, N_GPUS_PER_NODE, proxy_service) proxy_service.start_proxy() - best_config = find_best_config(mscclpp_call, 20) - mscclpp_call.set_params(*best_config) + + best_config = find_best_config(mscclpp_call, 20) + mscclpp_call.set_params(*best_config) nccl_call = NcclAllReduce(nccl_op, memory) @@ -145,6 +181,7 @@ def run_benchmark( MPI.COMM_WORLD.barrier() proxy_service.stop_proxy() + speed_up = nccl_time / mscclpp_time if MPI.COMM_WORLD.rank == 0: table.add_row( [ @@ -155,12 +192,14 @@ 
def run_benchmark( "{:.2f}".format(nccl_time), "{:.2f}".format(nccl_algBw), nccl_check, - "{:.2f}".format(nccl_time / mscclpp_time), + "{:.2f}".format(speed_up), ] ) if MPI.COMM_WORLD.rank == 0: print(".", end="", flush=True) + return memory.nbytes, mscclpp_algBw, nccl_algBw, speed_up + if __name__ == "__main__": shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL) @@ -200,16 +239,29 @@ def run_benchmark( "Speed Up", ] - for i in range(10, 28): + sizes = [] + mscclpp_algbw = [] + nccl_algbw = [] + speed_ups = [] + for i in range(10, 30): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - run_benchmark(mscclpp_group, nccl_comm, table, 100, 2**i) + nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: - run_benchmark(mscclpp_group, nccl_comm, table, 100, 3 * 2**i) + nelems = 3 * 2**i else: raise RuntimeError("Only support one node/two nodes communication") + size, mscclpp_algBw, nccl_algBw, speed_up = run_benchmark(mscclpp_group, nccl_comm, table, 100, nelems) + sizes.append(size) + mscclpp_algbw.append(mscclpp_algBw) + nccl_algbw.append(nccl_algBw) + speed_ups.append(speed_up) + if MPI.COMM_WORLD.rank == 0: print() print(table) + + plot_graph(sizes, mscclpp_algbw, nccl_algbw, speed_ups) + mscclpp_group = None nccl_comm = None diff --git a/python/benchmark/mscclpp_op.py b/python/benchmark/mscclpp_op.py index 92d17e2f7..ab51f7c84 100644 --- a/python/benchmark/mscclpp_op.py +++ b/python/benchmark/mscclpp_op.py @@ -35,7 +35,7 @@ def __init__( group: mscclpp_comm.CommGroup, memory: cp.ndarray, read_only: int = 1, - nthreads: int = 1024, + block_size: int = 1024, nblocks: int = 24, ): self.group = group @@ -55,30 +55,55 @@ def __init__( file="allreduce.cu", kernel_name="allreduce1", file_dir=file_dir, - macro_dict={"TYPE": type_str, "READ_ONLY": str(read_only)}, + macro_dict={"TYPE": type_str}, ).get_compiled_kernel() - self.params = b"" self.device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank: 
self.device_handles.append(self.sm_channels[rank].device_handle().raw) + + self.device_handles_cp = cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8) + + self.set_params(nblocks, block_size, read_only) + + def __call__(self, stream_ptr): + self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr) + return self.memory + + def set_params(self, nblocks, block_size, read_only): + self.nblocks = nblocks + self.block_size = block_size + self.read_only = read_only + self.params = b"" self.params += pack( - cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8), + self.device_handles_cp, self.memory, self.group.my_rank, self.group.nranks, ctypes.c_size_t(self.memory.size), + self.read_only, ) - self.nthreads = nthreads - self.nblocks = nblocks - def __call__(self, stream_ptr): - self.kernel.launch_kernel(self.params, self.nblocks, self.nthreads, 0, stream_ptr) - return self.memory + def auto_tune(self): + nblocks_to_try = [8, 12, 16, 24, 32, 48, 64, 72, 96, 108] + block_size_to_try = [256, 512, 1024] + read_only_to_try = [0, 1] + for nblocks in nblocks_to_try: + for block_size in block_size_to_try: + for read_only in read_only_to_try: + self.set_params(nblocks, block_size, read_only) + yield nblocks, block_size, read_only class MscclppAllReduce2: - def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, memory_out: cp.ndarray): + def __init__( + self, + group: mscclpp_comm.CommGroup, + memory: cp.ndarray, + memory_out: cp.ndarray, + block_size: int = 512, + nblocks: int = 21, + ): self.group = group self.memory = memory self.memory_out = memory_out @@ -97,13 +122,26 @@ def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, memory_out self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce2", file_dir=file_dir, macro_dict={"TYPE": type_str} ).get_compiled_kernel() - self.params = b"" self.device_handles = [] for rank in range(self.group.nranks): if rank != 
self.group.my_rank: self.device_handles.append(self.sm_channels[rank].device_handle().raw) + + self.device_handles_cp = cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8) + + self.set_params(nblocks, block_size) + + def __call__(self, stream_ptr): + self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr) + return self.memory_out + + def set_params(self, nblocks, block_size): + self.nblocks = nblocks + self.block_size = block_size + + self.params = b"" self.params += pack( - cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8), + self.device_handles_cp, self.memory, self.scratch, self.memory_out, @@ -112,13 +150,24 @@ def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, memory_out ctypes.c_size_t(self.memory.size), ) - def __call__(self, stream_ptr): - self.kernel.launch_kernel(self.params, 21, 512, 0, stream_ptr) - return self.memory_out + def auto_tune(self): + nblocks_to_try = [21, 42, 63, 84, 105] + block_size_to_try = [256, 512, 1024] + for nblocks in nblocks_to_try: + for block_size in block_size_to_try: + self.set_params(nblocks, block_size) + yield nblocks, block_size class MscclppAllReduce3: - def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, proxy_service: ProxyService): + def __init__( + self, + group: mscclpp_comm.CommGroup, + memory: cp.ndarray, + proxy_service: ProxyService, + block_size: int = 1024, + nblocks: int = 24, + ): self.group = group self.memory = memory remote_nghrs = list(range(self.group.nranks)) @@ -141,16 +190,28 @@ def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, proxy_serv self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce3", file_dir=file_dir, macro_dict={"TYPE": type_str} ).get_compiled_kernel() - self.params = b"" self.fst_device_handles = [] self.snd_device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank: 
self.fst_device_handles.append(self.fst_round_proxy_chans[rank].device_handle().raw) self.snd_device_handles.append(self.snd_round_proxy_chans[rank].device_handle().raw) + self.fst_device_handles_cp = cp.asarray(memoryview(b"".join(self.fst_device_handles)), dtype=cp.uint8) + self.snd_device_handles_cp = cp.asarray(memoryview(b"".join(self.snd_device_handles)), dtype=cp.uint8) + + self.set_params(nblocks, block_size) + + def __call__(self, stream_ptr): + self.kernel.launch_kernel(self.params, 24, 1024, 0, stream_ptr) + return self.memory + + def set_params(self, nblocks, block_size): + self.nblocks = nblocks + self.block_size = block_size + self.params = b"" self.params += pack( - cp.asarray(memoryview(b"".join(self.fst_device_handles)), dtype=cp.uint8), - cp.asarray(memoryview(b"".join(self.snd_device_handles)), dtype=cp.uint8), + self.fst_device_handles_cp, + self.snd_device_handles_cp, self.memory, self.scratch, self.group.my_rank, @@ -158,9 +219,13 @@ def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, proxy_serv ctypes.c_size_t(self.memory.size), ) - def __call__(self, stream_ptr): - self.kernel.launch_kernel(self.params, 24, 1024, 0, stream_ptr) - return self.memory + def auto_tune(self): + nblocks_to_try = [8, 12, 16, 24, 32, 48, 64, 72, 96, 108] + block_size_to_try = [256, 512, 1024] + for nblocks in nblocks_to_try: + for block_size in block_size_to_try: + self.set_params(nblocks, block_size) + yield nblocks, block_size class MscclppAllReduce4: @@ -220,6 +285,14 @@ def __init__( ) self.all_gather_proxy_device_handles.append(self.all_gather_proxy_channels[rank].device_handle().raw) + self.sm_device_handles_cp = cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8) + self.reduce_sactter_proxy_device_handles_cp = cp.asarray( + memoryview(b"".join(self.reduce_sactter_proxy_device_handles)), dtype=cp.uint8 + ) + self.all_gather_proxy_device_handles_cp = cp.asarray( + memoryview(b"".join(self.all_gather_proxy_device_handles)), 
dtype=cp.uint8 + ) + self.set_params(nblocks, block_size, pipeline_depth) def __call__(self, stream_ptr): @@ -233,9 +306,9 @@ def set_params(self, nblocks, block_size, pipeline_depth): self.params = b"" self.params += pack( - cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8), - cp.asarray(memoryview(b"".join(self.reduce_sactter_proxy_device_handles)), dtype=cp.uint8), - cp.asarray(memoryview(b"".join(self.all_gather_proxy_device_handles)), dtype=cp.uint8), + self.sm_device_handles_cp, + self.reduce_sactter_proxy_device_handles_cp, + self.all_gather_proxy_device_handles_cp, self.memory, self.scratch, self.group.my_rank, @@ -310,6 +383,9 @@ def __init__( if rank != self.group.my_rank and not in_same_node(rank): self.proxy_device_handles.append(self.proxy_channels[rank].device_handle().raw) + self.sm_device_handles_cp = cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8) + self.proxy_device_handles_cp = cp.asarray(memoryview(b"".join(self.proxy_device_handles)), dtype=cp.uint8) + self.set_params(nblocks, block_size) def __call__(self, stream_ptr): @@ -322,8 +398,8 @@ def set_params(self, nblocks, block_size): self.params = b"" self.params += pack( - cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8), - cp.asarray(memoryview(b"".join(self.proxy_device_handles)), dtype=cp.uint8), + self.sm_device_handles_cp, + self.proxy_device_handles_cp, self.memory, self.scratch, self.put_buff, diff --git a/python/requirements_cu11.txt b/python/requirements_cu11.txt index 47285da3e..7f4b4ea15 100644 --- a/python/requirements_cu11.txt +++ b/python/requirements_cu11.txt @@ -5,3 +5,4 @@ cuda-python netifaces pytest numpy +matplotlib diff --git a/python/requirements_cu12.txt b/python/requirements_cu12.txt index 094dff8de..aa657eac3 100644 --- a/python/requirements_cu12.txt +++ b/python/requirements_cu12.txt @@ -5,3 +5,4 @@ cuda-python netifaces pytest numpy +matplotlib From 70eb6d73287cef41ba3ec7b71626b1156f0b068a Mon Sep 
17 00:00:00 2001 From: Saeed Maleki <30272783+saeedmaleki@users.noreply.github.com> Date: Sat, 18 Nov 2023 10:34:52 -0800 Subject: [PATCH 05/39] Fixing the bug in allreduce1 (#220) --- python/benchmark/allreduce.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/benchmark/allreduce.cu b/python/benchmark/allreduce.cu index e90dc1474..c22966fd3 100644 --- a/python/benchmark/allreduce.cu +++ b/python/benchmark/allreduce.cu @@ -176,10 +176,12 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* TYPE val = smChans[peerIdx].read(idx); tmp += val; } - for (int index = 0; index < nPeer; ++index) { - int peerIdx = (index + rank); - if (peerIdx >= nPeer) peerIdx -= nPeer; - smChans[peerIdx].write(idx, tmp); + if (READ_ONLY == 0) { + for (int index = 0; index < nPeer; ++index) { + int peerIdx = (index + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + smChans[peerIdx].write(idx, tmp); + } } buff[idx] = tmp; } @@ -198,6 +200,7 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* } if (READ_ONLY) { + deviceSyncer.sync(gridDim.x); for (int i = 0; i < nPeer; ++i) { int peerIdx = (i + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; From 3431f370671fad8b3f4922b86e08f186f35a4950 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 21 Nov 2023 09:15:18 +0800 Subject: [PATCH 06/39] Fix DeviceSyncer (#222) --- include/mscclpp/concurrency.hpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/include/mscclpp/concurrency.hpp b/include/mscclpp/concurrency.hpp index cccab8be4..61299d891 100644 --- a/include/mscclpp/concurrency.hpp +++ b/include/mscclpp/concurrency.hpp @@ -23,27 +23,25 @@ struct DeviceSyncer { /// @param blockNum The number of blocks that will synchronize. /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative. 
__forceinline__ __device__ void sync(int blockNum, int64_t maxSpinCount = 100000000) { - int maxOldCnt = blockNum - 1; + unsigned int maxOldCnt = blockNum - 1; __syncthreads(); if (blockNum == 1) return; if (threadIdx.x == 0) { // Need a `__threadfence()` before to flip `flag`. __threadfence(); - int tmpIsAdd = isAdd_ ^ 1; - if (tmpIsAdd) { - if (atomicAdd(&count_, 1) == maxOldCnt) { + int tmp = isIncFlag_ ^ 1; + if (tmp) { + if (atomicInc(&count_, maxOldCnt) == maxOldCnt) { flag_ = 1; - count_ = 0; } POLL_MAYBE_JAILBREAK(!flag_, maxSpinCount); } else { - if (atomicAdd(&count_, 1) == maxOldCnt) { + if (atomicInc(&count_, maxOldCnt) == maxOldCnt) { flag_ = 0; - count_ = 0; } POLL_MAYBE_JAILBREAK(flag_, maxSpinCount); } - isAdd_ = tmpIsAdd; + isIncFlag_ = tmp; } // We need sync here because only a single thread is checking whether // the flag is flipped. @@ -55,9 +53,9 @@ struct DeviceSyncer { /// The flag to indicate whether the barrier is reached by the latest thread. volatile int flag_; /// The counter of synchronized blocks. - int count_; - /// The flag to indicate whether to increase or decrease @ref count_. - int isAdd_; + unsigned int count_; + /// The flag to indicate whether to increase or decrease @ref flag_. 
+ int isIncFlag_; }; } // namespace mscclpp From 7bd66a938c7e1b8bff4e86b6df754ed6e87ac351 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 22 Nov 2023 12:06:50 +0800 Subject: [PATCH 07/39] Robust correctness test (#221) Co-authored-by: Aashaka Shah --- .azure-pipelines/integration-test.yml | 2 +- pyproject.toml | 2 +- python/benchmark/__init__.py | 0 python/mscclpp_benchmark/__init__.py | 1 + .../allreduce.cu | 2 +- .../allreduce_bench.py | 39 ++++++++++--------- .../mscclpp_op.py | 0 .../nccl_op.py | 0 8 files changed, 25 insertions(+), 21 deletions(-) delete mode 100644 python/benchmark/__init__.py create mode 100644 python/mscclpp_benchmark/__init__.py rename python/{benchmark => mscclpp_benchmark}/allreduce.cu (99%) rename python/{benchmark => mscclpp_benchmark}/allreduce_bench.py (89%) rename python/{benchmark => mscclpp_benchmark}/mscclpp_op.py (100%) rename python/{benchmark => mscclpp_benchmark}/nccl_op.py (100%) diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index a433553bc..fa40876a5 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -127,5 +127,5 @@ jobs: else pip3 install -r ./python/requirements_cu12.txt fi - mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/benchmark/allreduce_bench.py + mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/pyproject.toml b/pyproject.toml index 5902c9464..7421790d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ version = "0.3.0" [tool.scikit-build] cmake.minimum-version = "3.25.0" build-dir = "build/{wheel_tag}" -wheel.packages = ["python/mscclpp"] +wheel.packages = ["python/mscclpp", "python/mscclpp_benchmark"] wheel.install-dir = "mscclpp" [tool.scikit-build.cmake.define] diff --git a/python/benchmark/__init__.py 
b/python/benchmark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/mscclpp_benchmark/__init__.py b/python/mscclpp_benchmark/__init__.py new file mode 100644 index 000000000..3f0560ca7 --- /dev/null +++ b/python/mscclpp_benchmark/__init__.py @@ -0,0 +1 @@ +from .mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5 diff --git a/python/benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu similarity index 99% rename from python/benchmark/allreduce.cu rename to python/mscclpp_benchmark/allreduce.cu index c22966fd3..b4623afff 100644 --- a/python/benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -231,7 +231,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) nelems = nelems / (sizeof(int) / sizeof(TYPE)); // This version of allreduce only works for single nodes const int nPeers = worldSize - 1; - const int nPkts = nelems / 2; + const size_t nPkts = nelems / 2; const int nelemsPerRank = nelems / worldSize; const int nPktsPerRank = nelemsPerRank / 2; // flag for packets. 
Initially 1 diff --git a/python/benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py similarity index 89% rename from python/benchmark/allreduce_bench.py rename to python/mscclpp_benchmark/allreduce_bench.py index 2cf09cba0..9a9286a7e 100644 --- a/python/benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -11,7 +11,7 @@ from prettytable import PrettyTable import netifaces as ni -data_type = cp.float16 +data_type = cp.float32 if data_type == cp.float16: dtype_str = "fp16" @@ -71,25 +71,28 @@ def human_readable_size(size, decimal_places=1): return f"{size:.{decimal_places}f} {unit}" -def check_correctness(memory, func): - rand_gen = cp.random.default_rng(seed=MPI.COMM_WORLD.rank) - memory[:] = rand_gen.random(memory.shape).astype(data_type) - cp.cuda.runtime.deviceSynchronize() - output_memory = func(0) - cp.cuda.runtime.deviceSynchronize() - expected = cp.zeros_like(memory) - for i in range(MPI.COMM_WORLD.size): - rand_gen = cp.random.default_rng(seed=i) - expected += rand_gen.random(memory.shape).astype(data_type) - - if data_type == cp.float16: - ac = cp.allclose(output_memory, expected, rtol=1.0e-2, atol=1.0e-4) - else: - ac = cp.allclose(output_memory, expected, rtol=1.0e-2, atol=1.0e-4) +def check_correctness(memory, func, niter=100): + ac = True + for p in range(niter): + memory[:] = cp.ones(memory.shape).astype(data_type) * (p * MPI.COMM_WORLD.size + MPI.COMM_WORLD.rank) + cp.cuda.runtime.deviceSynchronize() + output_memory = func(0) + cp.cuda.runtime.deviceSynchronize() + expected = cp.zeros_like(memory) + for i in range(MPI.COMM_WORLD.size): + expected += cp.ones(memory.shape).astype(data_type) * (p * MPI.COMM_WORLD.size + i) + + is_close = cp.isclose(output_memory, expected, rtol=1.0e-2, atol=2) + icf = is_close == 0 + all_close = cp.all(is_close) + ac = ac and all_close + if not all_close: + print( + f"not close: p={p}, rank={MPI.COMM_WORLD.rank}, output={output_memory[icf][0]}, 
expected={expected[icf][0]}", + flush=True, + ) ac = MPI.COMM_WORLD.allreduce(ac, op=MPI.SUM) - if not ac: - print(output_memory, expected) return ac diff --git a/python/benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py similarity index 100% rename from python/benchmark/mscclpp_op.py rename to python/mscclpp_benchmark/mscclpp_op.py diff --git a/python/benchmark/nccl_op.py b/python/mscclpp_benchmark/nccl_op.py similarity index 100% rename from python/benchmark/nccl_op.py rename to python/mscclpp_benchmark/nccl_op.py From 15f6dcca498eda0e73feaefac069c58366b97c4c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 23 Nov 2023 04:58:04 +0800 Subject: [PATCH 08/39] Update documentation (#217) Co-authored-by: Saeed Maleki --- CITATION.cff | 47 ++++++++ CMakeLists.txt | 2 +- README.md | 70 ++++++----- cmake/AddFormatTargets.cmake | 4 +- docs/.gitignore | 2 + docs/Doxyfile | 2 +- docs/Makefile | 20 ++++ docs/README.md | 27 +++++ docs/conf.py | 29 +++++ docs/figs/abstractions.png | Bin 0 -> 63939 bytes ...scclpp_vs_nccl_comparison_num_nodes_1.jpeg | Bin 0 -> 66241 bytes ...scclpp_vs_nccl_comparison_num_nodes_2.jpeg | Bin 0 -> 67652 bytes docs/index.rst | 26 +++++ docs/make.bat | 35 ++++++ docs/performance-ndmv4.md | 49 +------- docs/quickstart.md | 59 ++++++---- include/mscclpp/core.hpp | 2 +- pyproject.toml | 2 +- python/examples/bootstrap.py | 109 ------------------ python/examples/send_recv.py | 82 ------------- python/examples/utils.py | 17 --- python/mscclpp_benchmark/allreduce_bench.py | 7 +- 22 files changed, 280 insertions(+), 311 deletions(-) create mode 100644 CITATION.cff create mode 100644 docs/Makefile create mode 100644 docs/README.md create mode 100644 docs/conf.py create mode 100644 docs/figs/abstractions.png create mode 100644 docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg create mode 100644 docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg create mode 100644 docs/index.rst create mode 100644 docs/make.bat delete mode 100644 
python/examples/bootstrap.py delete mode 100644 python/examples/send_recv.py delete mode 100644 python/examples/utils.py diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..01d77a159 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,47 @@ +cff-version: 1.2.0 +title: "MSCCL++: A GPU-driven communication stack for scalable AI applications" +version: 0.4.0 +message: >- + If you use this project in your research, please cite it as below. +authors: + - given-names: Peng + family-names: Cheng + affiliation: Microsoft Research + - given-names: Changho + family-names: Hwang + affiliation: Microsoft Research + - given-names: Abhinav + family-names: Jangda + affiliation: Microsoft Research + - given-names: Suriya + family-names: Kalivardhan + affiliation: Microsoft Azure + - given-names: Binyang + family-names: Li + affiliation: Microsoft Azure + - given-names: Shuguang + family-names: Liu + affiliation: Microsoft Azure + - given-names: Saeed + family-names: Maleki + affiliation: Microsoft Research + - given-names: Madan + family-names: Musuvathi + affiliation: Microsoft Research + - given-names: Olli + family-names: Saarikivi + affiliation: Microsoft Research + - given-names: Wei + family-names: Tsui + affiliation: Microsoft Research + - given-names: Ziyue + family-names: Yang + affiliation: Microsoft Research + +repository-code: 'https://github.com/microsoft/mscclpp' +abstract: >- + MSCCL++ redefines the interface for inter-GPU communication, thereby + delivering a highly efficient and customizable communication stack + tailored for distributed GPU applications. +license: MIT +license-url: https://github.com/microsoft/mscclpp/blob/main/LICENSE diff --git a/CMakeLists.txt b/CMakeLists.txt index 2aef18aa3..5c32d47ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # Licensed under the MIT license. 
set(MSCCLPP_MAJOR "0") -set(MSCCLPP_MINOR "3") +set(MSCCLPP_MINOR "4") set(MSCCLPP_PATCH "0") set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR}) diff --git a/README.md b/README.md index 7f0112ec1..ff97ad887 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,54 @@ # MSCCL++ -GPU-driven computation & communication stack. +[![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest) +[![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE) +[![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml) -See [Quick Start](docs/quickstart.md) to quickly get started. +| Pipelines | Build Status | +|--------------------------|-------------------| +| Unit Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-ut?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=4&branchName=main) | +| Integration Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-test?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=3&branchName=main) | -See the latest performance evaluation on Azure [NDmv4](docs/performance-ndmv4.md). +A GPU-driven communication stack for scalable AI applications. -Build our Doxygen document by running `doxygen` in [`docs/`](docs/) directory. Run `python3 -m http.server ` in `docs/doxygen/html/` directory to serve the generated HTML files. +See [Quick Start](docs/quickstart.md) to quickly get started. ## Overview -MSCCL++ is a development kit for implementing highly optimized distributed GPU applications, in terms of both inter-GPU communication and GPU computation. 
MSCCL++ is specially designed for developers who want to fine-tune inter-GPU communication of their applications at the GPU kernel level, without awareness of detailed communication mechanisms. The key underlying concept of MSCCL++ is GPU-driven execution, where both communication and computation tasks are initiated by GPU not by CPU. That is, the communication and computation interfaces of MSCCL++ are provided as device-side APIs (called inside a GPU kernel), while the host-side APIs of MSCCL++ are for bootstrapping, initial connection setups, or background host threads for inter-GPU DMA and RDMA (called proxies). By using MSCCL++, we expect: +MSCCL++ redefines inter-GPU communication interfaces, thereby delivering a highly efficient and customizable communication stack for distributed GPU applications. Its design is specifically tailored to accommodate diverse performance optimization scenarios often encountered in state-of-the-art AI applications. Figure below provides a high-level overview of MSCCL++ abstractions in CUDA, C, and Python. + +|
MSCCL++ Abstractions Overview | +|-------------------------------| +| MSCCL++ Abstractions | + +The followings highlight the key features of MSCCL++. + +* **Light-weight and multi-layer abstractions.** MSCCL++ provides communication abstractions at lowest level close to hardware and at the highest level close to application API. The lowest level of abstraction is ultra light weight which enables a user to implement logics of data movement for a collective operation such as AllReduce inside a GPU kernel extremely efficiently without worrying about memory ordering of different ops. The modularity of MSCCL++ enables a user to construct the building blocks of MSCCL++ in a high level abstraction in Python and feed them to a CUDA kernel in order to facilitate the user's productivity. + +* **1-sided 0-copy synchronous and asynchronous abstracts.** MSCCL++ provides fine-grained synchronous and asynchronous 0-copy 1-sided abstracts for communication primitives such as `put()`, `get()`, `signal()`, `flush()`, and `wait()`. The 1-sided abstractions allows a user to asynchronously `put()` their data on the remote GPU as soon as it is ready without requiring the remote side to issue any receive instruction. This enables users to easily implement flexible communication logics, such as overlapping communication with computation, or implementing customized collective communication algorithms without worrying about potential deadlocks. Additionally, the 0-copy capability enables MSCCL++ to directly transfer data between user's buffers without using intermediate internal buffers which saves GPU bandwidth and memory capacity. + +* **Unified abstractions for different interconnection hardware.** MSCCL++ provides consistent abstractions regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink/xGMI or InfiniBand). 
This simplifies the code for inter-GPU communication, which is often complex due to memory ordering of GPU/CPU read/writes and therefore, is error-prone. -* **Holistic Optimization for High GPU Utilization.** As both communication and computation are scheduled inside a GPU kernel at the same time, we can optimize end-to-end performance of distributed GPU applications from a global view. For example, we can minimize the GPU resource contention between communication and computation, which is known to often substantially degrade throughput of distributed deep learning applications. +## Performance -* **Fully Pipelined System to Reduce Overhead from the Control Plane.** We can eliminate control overhead from CPU by allowing GPU to autonomously schedule both communication and computation. This significantly reduces GPU scheduling overhead and CPU-GPU synchronization overhead. For example, this allows us to implement a highly fine-grained system pipelining (i.e., hiding communication delays by overlapping with computation), which has been difficult for CPU-controlled applications due to the large control/scheduling overhead. +While the power of MSCCL++ is fully realized with application-specific optimization, it still delivers performance benefits even for collective communication operations. The following figures provide a comparison of the AllReduce throughput of MSCCL++ against NCCL 2.19.3. This benchmark was tested over two [Azure NDmv4 SKUs](https://learn.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series) (8 A100-80G GPUs per node). -* **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logics, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime. 
+The key motivation behind these results is scaling of inference for LLMs using tensor parallelism. LLM requests are usually executed in two phases: prompt processing and token sampling. The prompt processing uses a large batch size that is usually equal to a request context length and the corresponding AllReduce size is `len_context*dim_hidden*sizeof(fp16)`. For a context length of 2048 with a hidden dimension of 12288 (GPT-3 size), the AllReduce size is 48MB. The token sampling uses a smaller batch size which corresponds to concurrent user requests in the system and therefore, the AllReduce size is `batch_size*dim_hidden*sizeof(fp16)`. For a concurrency of 16 users, the AllReduce size is 384KB. As the figures below demonstrate, MSCCL++ provides a significant speedup over NCCL, which is crucial for efficiency of serving LLMs at large scale. -## Key Features (v0.3) +|
Single-node AllReduce |
Two-node AllReduce | +|-------------------------------|----------------------------| +| MSCCL++ vs NCCL AllReduce (Single-node) | MSCCL++ vs NCCL AllReduce (Two-node) | -MSCCL++ v0.3 supports the following features. +## Key Concepts -### In-Kernel Communication Interfaces +The following highlights key concepts of MSCCL++. -MSCCL++ provides inter-GPU communication interfaces to be called by a GPU thread. For example, the `put()` method in the following example copies 1KB data from the local GPU to a remote GPU. `channel` is a peer-to-peer communication channel between two GPUs, which consists of information on send/receive buffers. `channel` is initialized from the host side before the kernel execution. +### On-GPU Communication Interfaces: Channels + +MSCCL++ provides peer-to-peer communication methods between GPUs. A peer-to-peer connection between two GPUs is called a *Channel*. Channels are constructed by MSCCL++ host-side interfaces and copied to GPUs during initialization. Channels provide *GPU-side interfaces*, which means that all communication methods are defined as a device function to be called from a GPU kernel code. For example, the `put()` method in the following example copies 1KB data from the local GPU to a remote GPU. ```cpp +// `ProxyChannel` will be explained in the following section. __device__ mscclpp::DeviceHandle channel; __global__ void gpuKernel() { ... @@ -53,11 +75,17 @@ __device__ void barrier() { } ``` -MSCCL++ provides consistent in-kernel interfaces, i.e., the above interfaces are used regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink or InfiniBand). +MSCCL++ provides consistent interfaces, i.e., the above interfaces are used regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink or InfiniBand). 
+ +### ProxyChannel and SmChannel + +MSCCL++ delivers two types of channels, **ProxyChannel** and **SmChannel**. `ProxyChannel` provides (R)DMA-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy (hence the name `ProxyChannel`), which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, `ProxyChannel` requires only a single GPU thread to call its methods. See all `ProxyChannel` methods [here](./include/mscclpp/proxy_channel_device.hpp). + +On the other hand, `SmChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space. Compared to `ProxyChannel`, `SmChannel` is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all `SmChannel` methods [here](./include/mscclpp/sm_channel_device.hpp). ### Host-Side Communication Proxy -Some in-kernel communication interfaces of MSCCL++ send requests (called triggers) to a GPU-external helper that conducts key functionalities such as DMA or RDMA. This helper is called a proxy service or a proxy in short. MSCCL++ provides a default implementation of a proxy, which is a background host thread that busy polls triggers from GPUs and conducts functionalities accordingly. For example, the following is a typical host-side code for MSCCL++. +MSCCL++ provides a default implementation of a host-side proxy for ProxyChannels, which is a background host thread that busy-polls triggers from GPUs and conducts functionalities accordingly. For example, the following is typical host-side code for MSCCL++. 
```cpp // Bootstrap: initialize control-plane connections between all ranks @@ -120,19 +148,9 @@ public: Customized proxies can be used for conducting a series of pre-defined data transfers within only a single trigger from GPU at runtime. This would be more efficient than sending a trigger for each data transfer one by one. -### Flexible Customization - -Most of key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt a new software / hardware technology and lets users implement algorithms optimized for their own use cases. - -### New in MSCCL++ v0.3 (Latest Release) -* Updated interfaces -* Add Python bindings and interfaces -* Add Python unit tests -* Add more configurable parameters -* Add a new single-node AllReduce kernel -* Fix bugs +### Python Interfaces -See details from https://github.com/microsoft/mscclpp/issues/89. +MSCCL++ provides Python bindings and interfaces, which simplifies integration with Python applications. ## Contributing diff --git a/cmake/AddFormatTargets.cmake b/cmake/AddFormatTargets.cmake index 71c3ef4ab..b95ad447b 100644 --- a/cmake/AddFormatTargets.cmake +++ b/cmake/AddFormatTargets.cmake @@ -26,11 +26,11 @@ find_program(BLACK black) if (BLACK) message(STATUS "Found black: ${BLACK}") add_custom_target(check-format-py - COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml --check ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test + COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml --check ${PROJECT_SOURCE_DIR} ) add_dependencies(check-format check-format-py) add_custom_target(format-py - COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test + COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR} ) add_dependencies(format format-py) else() diff --git a/docs/.gitignore b/docs/.gitignore index 94f90d1e6..00d9344fb 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1 
+1,3 @@ doxygen/ +_build/ +sphinx/ diff --git a/docs/Doxyfile b/docs/Doxyfile index 0fa68bf2a..b2d5528e7 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -2043,7 +2043,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = NO +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..d4bb2cbb9 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..2bb9c1efb --- /dev/null +++ b/docs/README.md @@ -0,0 +1,27 @@ +## How to build docs + +1. Install `doxygen`. + + ```bash + $ sudo apt-get install doxygen + ``` + +2. Install the Python packages below. If you install them into the user's local directory, you need to add `~/.local/bin` to `$PATH` (to use `sphinx-build`). + + ```bash + $ sudo python3 -m pip install sphinx sphinx_rtd_theme breathe + ``` + +3. Create Doxygen documents. + + ```bash + $ doxygen + ``` + +4. Create Sphinx documents. + + ```bash + $ sphinx-build -b html -Dbreathe_projects.mscclpp=$PWD/doxygen/xml $PWD $PWD/sphinx + ``` + +5. Done. The HTML files will be in the `sphinx/` directory. 
diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..a8f28c2ed --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,29 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "mscclpp" +copyright = "2023, MSCCL++ Team" +author = "MSCCL++ Team" +release = "v0.4.0" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["breathe"] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# Breathe configuration +breathe_default_project = "mscclpp" + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] diff --git a/docs/figs/abstractions.png b/docs/figs/abstractions.png new file mode 100644 index 0000000000000000000000000000000000000000..e6183aa919a89efd1cdce757e99631d4c005ee12 GIT binary patch literal 63939 zcmeFZ2UL?=w=NtvTiF(1TMz{S8$}dRX@Wvjlqw)4v{0-dp-Lw}f+8TGpcDl{lMW#P zgb;cVR0IU11qe-g389A)2>dJT{hf2}J?wkW_{Tr)xc?oOF&qX6Z|1w!oO8|jJkNZC z?rEuT?Bm@Bfj~IaRB!7-AnbJz$Zwbb_znEz=4XM|;J;nYx+=FI_@)z+;G5sAZff3y zK=Q-(Z#>uyzTfMpYU~VwaMrN??W%Rid<=oC8L8dAsqbMv&D4G++!;IWDHdSd{$s+D z$S(Ez&?gxILQGDFrlG8obiJlhrn;KJ6NlbeeAI;H;ERt@D@WUYy_+E0b@(P<5q8$Q z#hI1)WHmYW2FGC1ly6Iv_d&MovnS5}yfv6q?_c<{;pc&trR1_%+JFx&$XO6tJKdEJ zowc!?8Q==6stJv$06(x36w|9*Kxc7(j^@7EInz4Cv*g!H|I zv;X}%iO-4puOGW8+4KMX+SmR&j_vQ)KRHTV|MiyAQ~&qk|2r+q>)DyUG6_>R509s= z(a%Fd_S3&NHHDU1IylTOEwzjNWokXxZr^#B8hmzo`p48ILP$tRp`VtPRt_)tee`z( 
zLH!KOKiA+*j1KzpWA@@Pu=Nv4gMIV!`PHv_hz^FkEbi#|r=Q^GKbL(k?t!y~q=<+} z(X|A<=qIjUo0^+v`zx^VC1qtcHpXq)%PT8{#h^Sduf;}LI|q4rmt5Mx{Sb(6%)K~= z!Aa-RGkNkq?$X##%~r1CVQj?$tj`*op1x3@VKqHFn-~|DdahyV-8sV2EfYUWMJSXg zgz3PyO~qO}+Zv1)G%FTNnJ+Id+umt*(F1AZu-p~7y>D}@j0Vf_3KMbKUMV?Ti7Ww4i4^xDu2X8 z)}r*|_pleAj4!`-`$X$)IJSGY>3I-9k-D$mj;U#+9`B-D_eePH@U^?hJD%cdVph=N zOf5Z}N4TCLN}i%T?Us@ls6Xnq&zYLQn|1XX)Un6XbvufZ-rmx@6E=Buz+~ShcRX3R z+b;BixcTTi8+Jx2!#{^lf+{ z2utq#j6hBQo_)VeKny0m8Y-{8dNs|cp1ffC;LCem$L=E)^w2I_11nW~1)u&`50Tgg z-QEG|Q|9@j@svx2FH5gVe(AS#YE96jm+5qiChP^_o%f`+ww5^~jG|*Ly2TO-f-iD% zaV?(vg*s(J;(Pm=$@GCYIM0%{Dmw_1&jMk5KUR#I$CPirgR@^kgM}%)rY3N*UyZkr z{;I2-vf(iMs?pLFIZigj!g#l$yoDUU3lzbwRyb3+`@Fh>FBTv~Xx;4xoT*dqmdFJx_Ze)ydQrliSBmW=Psr5EC7M+ZcW0ue=+Fp?p7Nv>OY8SoLXwa z+0>-?5J)5k*TzC|8TVV4nRRtRTutrwHCtvlx*uR25j8)x5QiHL##0qAu@CIc3N>mJ z+i`zz#s3b0biM4i9=QLdw>0xIA{_TUM#-KzG4b7CK5nSPW$}#CvZLK4=$saASuuGf zo&v0+tWCbQNNX64WCI>mqLjwC#MwXw1(Ha=@{Lbmv2zK_Z9^amGE}}t;SL#!+Vn+FL z7eiXf6=!5H9g`p6h9{piK#65#J@T!O4|^5s9g7VmG{bW(0(?P|xG;5b{7Hi8@cNx?7dZ)Txcg5mcV7$}nOI2U~ z+T;dY#+#fs24@If#g9L~#y@yU!?(r1hC!{kLys1gT29)ZE71lkSCuT?r+(t-_3^XK z1|w3t)Asq=^t=s7VcsI6WKR-W$4lc1nY?e8=Nd7xcJ>)=_faL3H`%l+(n*tSuP4YU z1EEt$JvK-LIAD`>XsQuRCi8hqp35726E#6q;A`1mtMm>|Qbi$EpLwGSU$uuy9Zx{c z91I}$yBhU5%XG9T5=$+a_WH_?b19Y$I1mEG6olWR%aSrz(M6HNk!0odL%XN`GAOI? 
z-M%IqKE|Hh8jW^$C(;GZ=DMX^k@cO#N96w{6lIceTP-ny!V-IJ_~5FX~`7wy4_)3l;}o zMES_aCV0U(-&Jlcsc<=HL?@|7;xWygEnHn`KKxSI@}{WZ;)l$>%)9nxKC%7C+g`a~ zMV{$NUi7M)vGs&>vdwxz=&jGYuGFg@o;+xlHjg*zGuciGrDJ46*8vZ^3g@4?)_F!( z{x)UyPDq!&iiWXS0ZLZv%qK&;f>8rdub4iV%k-UgtBpC&hGD)@gPMO_v zHW~99yw#e&S%1Bvzpu{>E0E-tvS49a=}LXyI1WCI#NRQTZfUw4-q*bV!L#=-LZ+g= zBvsH~;afcnbL>HKuq==_bEZyKiUobO@%yTCrCg!&x$*kw`>GP1jVM_dOH7IhBy5{l@EkzPDSWz=;JnD_DXTENrgN!s!qK zUsN-W#F)a$BtCMrVY=yuVPc5=u7L- zLd~vOc6q`kbO_6yZ?~0Z?XvY+DgWKN;MigU^Ez1DBl9cl=p@f4a==dm!wY?eLMx`$ zerdH4;(7$WJN6Za;R03kDldT>6%T}(#;Fd8m&D(yki;zD7vKNfY z(#B=srZvs())Bsy{hVO&JtnD&9&=SwvC_;dWc?C4IZPL1GQ6yG#4*$Xh><*N-lnIq>8N!Rxbf=Xo}h6je{!atDUhvE@U*# z8z6AvMAuy{srM>5)oNZc9ka?$+Qzy;Ddl_q*V2;In>!jq-)` zRn;50cJ10U^EbuF8!*?4x302>4k`IlYaJ`g_NKD!T>di{4} zoB`p?L|a$>wr`FCZ@9z7#f8Ib@_ud*{roWFQsHuo)IMt4_?A(8#Imhe-wz6#b;I8l zCCOaK^;!7X@g&XI4C{F1<|HWE8b?y0M=26lr`_2Or9LZ8j~uDBad^=Fi#ATiDBrw8 z>S^CjsxkFdk04dtEH6Zt%`y(Ik4+Yj1WT3SFQ39~rrZ_~Q%rwhGE zUiJ4980EPu>Ul`PWX?BgVn%-3i=_Dp3EABx z{mL8QP<=u$NpqL?A3(#_?RKef^gG!rNbaj2^S}>;ij`!=^u7fab?jtO+Lcyrby8ls z%q8(3VHuy185t+i50<|^bb%c3aWvNcYz`%h_DZ!hMw2>_?pXvHzF+$?IT~zIOtB49 zAR!kQ-%CpP<`o5OUE1~^{Z@F;f@9(GQRNT^aUy=tW3Vt`Gm&7gmebRhZ^ZumkCvR+ zt!*&gzUT14#z)Rj!%zAV>4EsA?UVZwF(KgBzIQnMjnd-3jPVE6X{^A)Q|GJJ&FQU; z7f5zpasQn`Vs*F=Sg>jTvSc=ltSFOVKoft>TVB$OETQnd+_u9qvM22zhuj{BDM$JE zu68UFmDus5f|XrXoUMI3Bp&LC#;KXFY#VZxh(e+4ouQpavSiiHcmPMv;ZV{5C40Bn z6mVurLDqj0Ru4P}&#>#A86$J!Gq~R8{AGzMlIs(w;$3$mSl&|SOic-Uahk|^f0LED zK79M2yYPFJ<_MXaU67xDRI+pXv{NH}i&jS?MH8d7(rn(kSkUkB1&n1~)^z86-Z&Lm zD5LNeK?*E;!n98W`Sj=1zv74T-8-kZ^hHIwgO&QWXS@;>C@M*$y71g9OKJbL*O|)q z9}>wdD7kL2WsHObKN=;wySsCr%s%cv5IVh`Oll zU;L^bTrNsw1x$s`Ib%dytATZJ2RK~r`%;Vwt#20?9*cqBx$N(WlRd(GQ6rUL_}b33 z(WUdg^R@WOEt9f6F=vUDEjsrmpSQ~kJZ)gu@%mz4u3Yy;PUcco>wZr6m$+i7*zJ(T z>1;;b=1na+-00AN(f1 zjgRp|&bw*;H5QEBfs-7nq%?I!k5^scAELb1i(oV}*khZNq*k`NDa(|B^j*d3+o#Ai z-sH23WjP?WRQ|w~xYF1$^pfhlR%`|J?V#G6@hG{?2llBwonGbx3XB^;aH75O>(N5a z#+i7ypn&=Ee$F<&x8Z5_{pHrcV;)^_qulqL`}G{gb|G~K-k3D70B1X 
zub7g0M8Kv;dVo$){F~F2=3%_3dZBT=+&G#0Ku>Qo;bL0+pS!1o&M@={TyrOcl5iDz zn6t!$ne+Lj-CbYai2FVnRNzX|VT>Hsnft(nORvWzCMNa-$C3X~Y~(qw)#z|P_Rr4H zPo<#BRsi>ZHR(+RV(@3XH}s5T77ia_X;Dzn3iYhb;$D_y;Z{Ybv}&(pt5f z623H(GH`#9;w5;_dykefEuK=<%XpQNlw&U=^X$?7`^I3UpB84Xf7Pnv%1IC-UbMDZ zths@xaleByFjF~oKeC7%vaEGfmNs~b8p$zAwt?DsuFidgb*ga%pxB(1RajWc{S-t> zuXm~f=qjGu^FqfXNc%eN6h%nSt#6rnWw}%2eU_3Bgjw6nw(l7utDM;V#I@ii_&ZQX z$q^>?y9Wjw)Hk)F;o79AbDD|2XELpe$?}Rq{H@_zsUKl&0tRu+ZmY!ZyRPl1uPKBU z)(VuC<+;istlHR4+mOC?^C~&8v%nkQH`yGkk{?s_L9Kn}B9_fjVJ?CF=jp~<21f+W_3HmTeEK^84#XQYc#1jT`AV2j>!pwe>cho8X20Pf1dKw;Kir68gFZY}7gd z)FSTxspowyy50V)m>l5 zYPz%U^J!auu@TKj`xDb#$MvSUZJ$lg1%z7hN>#3M2e|r8t+ZrD_0-cfP(tk5gn}eNt}#L6JEQj;2y6oE5 z2Qm2_xYi^)=Ton$KT+=GM9Exw>7v;hf63rwhC**{_d~4XkrI1R;zdoad~0bnDcf8H z$o5wDBLhP}lbxs)vuDI`cB+Po>fYvFk6@WajfC?jo>Z+WtK8H~b8|Q5_oY4u3F~yQ zv=_UQ_0OV~KW}T&7;3%`3FI$G`Sx4$`5kX=a;OTzEUMqT%f!MzL_FR=^*1#&>DS!N z+06zTE(qjUvdn@6k(oP{f`u;L5~G!+`Z69oh$ng zVtf_(yYv14hn0(?o10tN;5?N8@F1IrZ1^8HmktSWndo>JMlWlUQcU<>bLv+H5?ot& z^!+r)$#-N0{Av1U0j5Kf5y~2-Af{X_E8Lzv(_zM0@}5Bii&oERH)8J5 zRc}hy)y6X5dQxTt84EtU5bt250wGz9H>xP&FbQfL zc^w7MJyN7sTjj^jWgAnX?*1Xx&J{Ez0C4?z@lA{gX>~7f`ICpfRZg3uuB%GFx_xS>3s+jM7A_jjxfNw zS+HP9W=Epyi^Ggqc&{TJ{5?k2fg=9J*}G)1zSwL<)1DR0U{igMiqvVl&(LVb+gPvE zPu%S#4b>6z+}E%^Kc%qH&>Y?>I?MG<%0a1dR8&EBMw^Ru(cd&b>M?!OY z;+EvluGb*|R30%kHNAaCQ)a$R3A_!0k1i;XEc5rU2P@h;u={I3Z>7&QXwuZi#&zZ$ z1I;ziD%o*@W|8I0;LGv@ZZ;fR2fy4cLCR3TPqPi@Js&RKA_UUMm;FUhKk4{tE`L65`?E0k9a+VE}ZP1Y3rDBL19;ib_UC zhK!)E&FlY*qO;ZzeEok!E;N=qQMD5lnZwY{;Sc7>NfzyL|J0F6u~#rro15bAhlhrS z5;xn>^&EeIVEh3)wbE&UwtLWf{f8(KU-pi)va*>#6KDOA&XIa^Q&Xd-Aws|Q0O;<% z@z_zPW>Mm1EqbH2C<460cen11b*u%mJ{kT)K=vf~Nl40{O)pB0N9)+fJ3Iun(*2er zKf~b`f7;&wzdWjPDsH#4BRf_u_8?R{?MG#@em+>x)EvE%!^slRW^#quNK#rAy(ZIc8M zpMC+;B)3mQ$i$CZ9lw{*cjqf=7)b+jg`a!|rdj!JN5;<>|Lna^)cp^A-sd5}FYngB z0H!r>A6<|n$^SEMltFg_Ux0Jj`rMj1+wQk4xV+0&>@W~cxsO3t$L*kGs zZ_U{u_c^nvPQg2^F|+jQwQB|Zr7{mu%geN$qE;UBpV%(_*XqnCWgc&s8-v6!i`9ul z*l1M2H5pf5==JrGyj!^5^xq&qf8gtPTzq=TRRskFd3j1tPtR@2(#D3jl~t|VMAtA` 
zq#0*JodFDv<%9)BN@U8I1`ZEe3i!x@DI4jYG11}9r2gsBRqjQ~u=_2sjs8x-?)FMW zGat|4HHy8$5S$<4d)f`AwWAX@-=b%}ub-Hl>_8A${LnxZ+k^DRZG>!@B2j*c*vkMJ zTl-Z;A@$@{|x5|T5Dr2r$~xiAtW-Zlo($yg(%Xk@=crd95E z-#Xl)mn+MWX{X2jM+}nr0OX0Mk2;5U^l!!mP$8ncy=|#x8P5_GeGuHkr-%ulOd-f9 z5Q)T$u{zLrq7}m?NTY%tPq}&?+OW72n5hW31*ED!)Mq(flSZZG3h09y8LsnU-H43a z<$qYaYEaF$TpB-!Cbgd-3Iw^W_xGDiUGX89@bmK<(O+pw6c8S+r@7?`-E}VJ^MZ=#~AQ28CTm8@|nQEKyuQwl1IA< z#v0d#@0nm*vYH7@+&G(BjV@tSr!Uh#%Z61`l&ev}l2dVL3B9eYt)`~t(t;rxpc|U&_uqiF!l&NLqtzJKYQ8}2?e1(=y;Uikd8Agf?k8N9+B8f+L&;d=qe}8IdvS7BFV$N7~#mM_}k$g73MXZ?1l^qrk z%3T<$@e@QXL=Y>B2WHizq3cthZNdiTwafQX+xJh7E+xxQUD{Q*?)*+z{>@4b6W?c$92R2>o_oiojIC(;o=?H-pkh|BH*1((7UK_l4-|dQHgC1`p_6NcO-I3@MGKUveR2@ z&j$>JN<5(G4E7-vuQSjaib}Nsf{GH1+>XW{ZB_%!UxQ)Uag(B1bFvBLJmGoZ&a2Yx zTQaZ`9eWb@+l#R$4xHcWAcOnd0=N8>!#ayOXt4G6MASmxki}&)=S;?+2sA6*dzCgU zIx84#(2;em(5euf9CiCn?zOE9j>Q*Gi+nN%n*eR$x=()*m32S{m z{xRERFmOQ8zHyU>Q>HUpT`}(7x!$7?Q7gm79n!(C(h;)6#f#n#eE`=!6Wgp6JZ6o& z7}y*{b-+;WQ+gQ)!l#l>WZ2V6=kZY7kX%J6#R4?++O-8dV`M@iGn=#T4Dy9k1+b`; zHw9Pl4M&t>>W@vfWfbM5$iQ8D7Tanb8I|MNzR{fqt#1isZI}>J=!6af1rbGm6UA3$ zbq%TQx^@9R+`4~Qe;ggUZ3Z4XBO!M+dDyGMEvk=yG*8|kfwj9 zfCzFgwn^YDg$*f=nZ1HZ>0&W+b)Qszrg~%8XzbVoj;)+<@)74C=;}zu1AnH#oPuf8 zTE|%LI^zt+C8kKep5H1qTMU|EfLZ%}sK!B17N_3kP)r9)RpA}1ntRC7z!R*C!Qe{@ zJd9I`(+s_};Ka8%v5>S*vh5P*uCR{`MUPHv(Uh%a5rz?f_m(e1>{N|)o+b)R(`Y_h z%oPLL?a8l{j0m1|&GRw6NZd1YLwne5MS1y8S^R5pu5IHS+@$=NZyz3`QkIZ)_+z8Q zyTFgsl~+`h%I&g;-S4zzoQ$10JoGL-S5r#v>eW1<&c60xOY_`gZ|XbIHUq!4A%R7z zA5m9VKX;obsS4UO%D7S$SI2ay5N~N|+v0k@My{GQXW9M@a!L*P=D=~Z$1=|Y7;dtvK1F0w)(D_8*j;;O?abjnXqgnNg2=^0tHzjJo zEl0>~Sfl*}REQXW@a$o*Ok^UiBt%vBA|s>7WRBy* zvxzk+t%cbj;A*bN4ALG5aAVq&&JqO_{1Vu+*rJ1XHbqW4?K#Zdsv);K)zZrFGSPgY z^&BYxRC*Xnjyvn%FkMvHsPR;%kV<+>r6;Z=ekY^t$m2N=H3`@bF~(Y3e0;o1*7Fiq z(pE|iv*T9*k!0%gKyQ6P`;Y^Z-auP-p^vf;Tn| z>2gx|MQX%mj8%$8wB}b~Y0)~|`gq5P`yUY7$p7gQU%3jFIvM;RW9vKlt3Rvppt8sva)#%SM#!ji#8KKUG zooRY+QJ`iFju%ZQ?>z<5fg1J-pG0zaL)XOpNn4XE zM(w*d{q{v|yz+o%Xd*z*TlJ!=Lw1L9G+IxO*T~hqXd_k=0iebQ1^90LTB}QN=VDMy 
zvqTrQl>Ywyx;ix)o31>~>Ud=Z@I<6X!{+AZ@`yUWRB=jX z5DLSD9h%t0vQ+{tk$hnmM|GT-Ox7*{kL`bGIIu(Ya$if+`hEuxN!Z9_b$7$o#M@*C zyxR=gWjn7k7Rf0~^MivA#u5tJQ11szT(^QjxBHOWb%Bo$rw*scbDBx;xP=y^!E zmOd<@zdeGQAN&lK7<7iHu>^rlP!*?Zdad3jZRbLc>3BF-_SrJ@*X&5|haU|OEUvqh zYj>0AUqS4i{E_uAg4d-Hr;;+c>po`ZB>5xj97j&+H8`qKSQ^L<@{Rd)!>$1@Nq`gmJAoe14&mo*Zw53gXANf( z7yB7ggbiNB^=U~AhnjcblFi>M$uUQD3{w;6DZO(fhtxtDFCe?!|RgW|$ zW;G@#uFtS@p^uEALE|i+u{xeda&oJ+v^#2P0HRJEbeIu{86W057^KRrdq*cfNwoKV z@yV??zjB~>fn|J~0PLo!R zZv7c)ZH6;I71dYumTE7k*WufMc7!3T>ebswskQ<2T%^$lN=ypd$u^A1M|7MPiEEFG z5oN)Ll>QcVBiTOQdx>FlG3J`L{-v@cU2uAYG@+9CylvBV=?0{DPX0O%Yjym>T4It}ccl zc(Sjrk9N7Mm%4P|y@_HiZaGz{Q+jmtgGG^C9N3IDCCkEF(4jKjPt;_e&5^ppSW!z2 zyfp119aUPP8RU_8x$|S%6Wq|Jr-dG+5gMy!RNl-!lbnSb?PK5&7qt6ky+ao?q7>hm zUHOjOpUhq$B~*;|_LPYlETMxQP>^g;>E)mxzog+$!akVBRd0aAbMt{cv#sO9;3=q> z{C!l530xGRPlujx-ar z9a0f&0Pu&MpLG7nyI!H1xy9uCACwb`YaLqqCm+!&&aCLA5hg7Owdf1)X5IqkWMKcz zI=o(u?Tj=oqlhCGH>npPnjt8T-`#NlTb8rz{{{W^W_#+yvo={eHY{qeCA70TG?ugHbg zhbgVB8;k;c%J>fE>8iJ=|A5$BNLT6}a25qAdfXdFI3r{&sO>1e&HUdXx2LmXb}j=; zLH_`to%n==1RTGh;QD;~ryiO6*X#2E#B)MG0Fi7!4HTo2Owix)@6WyxLiWGP^fW2@ zL?T&K+*WIqMD8K-KH@gJecY;|v`9ld$c=@$2vg9!<{Gs563e|dZJbNSu1wnD`Pk0} z@6uqfMRu5l&HR*5btSIPb&|*-g(lb*9?tHdpg&zBbBW}>bXCS&hCE2h}WJsZL@&hiIe80nPLl?M-ClK4lpe&;hJvPMHUU}b^ZlN6=M7e}s zUQwX{l`2!SJKBm}NL6s6j(-r;Y)f^#k>BjQqS1$NXji0v#VWtBVWVv|PFg}EE9+)v z@NT!l%rer8mNn9>JJZ5a)T}=!KkAYMN?Cj2+NY(~3c+0?7@v0PAT)~d$RWGDN@3-~ zZ1wn?k+qhAM0{momgJutX1U@l98%!nVIZU;YXrM#bXujZ2L3V}1&6Oe=QoKb;P|S? 
zvB15pD2g|rp6})?o;geam|AUmQ$XyBU&Et1pEZEX?45Lg=EsJM{lJmB zHl`!XMX5O5{+pA=gF8Z1Bod24M*5TPf#m$OmmeIm%%A*0|Oq}M$xYhFdPM) zFbLmrW{01EF*q9O0%?BAF0<6I!hwx|!10$y_3tc75OVMvw($|5UH$_qRS2=Ulo>E&U5H90JD6r*Zv6{^1wB{Cy|`L z`KA@@x7v}OVTxUYLRb9Wg(X5c1wzIW-&6uFcCP!1NiWn~8PpUw`?CJcshg^Ol>jRO zA25sB$mpP$dwcv`zbTq|+m9TG22x-=+^J-7YQrwlRbHQ=Lr+T^cU@lrbO6+2^-Lms>nkw~#|gp9mU-`dLgrS2 z)(R>U!=5YgVJrGJdSx^EY$I-<(r2ADK7#mF!ph(z?W6@zrG7uQ>&DlM$vfL*a|-$P zJ3#bd_vJS9THM{-07sFNTm}%TC_rh$(bT!TeWqT5-is3~ZUVTNQX<9$*^~sRFFij#Rgh#KM5c>fFK1lh9ORK6!30cfLi)A9Uiws0rnQ2=>KMn zyt1-l*CA1!M!{a0~JP0s(YzICtjBcNyA=CSN_i<79#E8xfA?)&5zQMX4zYCaesg z2gL9Z9~4mvsjSw&3Gi(oF0%<)EMOnA4|0RA?ASk1Zb~Ker_qbANESftoSRlb6a0Gk zcx!t~${BbAWAPCE6U>xrrE*g=wHg30tZfG8VP3Vip#=r32&L=mPGlQv_4xqSoADWB zyPovWH@(Q)*9KW_g$#~1Jq@OhC}{E?8P^o9pNSi$Z&aT3V}kyN}gsv+40@C;IS^ zms@Ldd)m*j$S~lBEA`#+bE2M6Jc0rnfcPw@e`TVyv)8%}xy-?~pocV#GR##aGI$8a z87ayPiPM<`P{B}%6;qlt>{CI6KV`2s;P>jfcV?yr$HM@vm#|Ht-MM&sX1#ePBd3Y`o>G_Z(Y)h0m+0?{nfTk8b?m& z1PZ_e;nbs)b|8qRhP9TrtQ|qDotzPdI>#^o;;9+FJIIO3$TeXVKKe^4@;PPnP6NXI z!dGNtT%H)i!9$1Hd9-XACovQF6{eo>bLcPwQKad53umsNo@21vl3 zVhd&-1)6POH8@n5dcO#F2;GnELbXFw*2#LfvFq~khuXN?yt-9e~dWazL+4j(~#P?_eHJjj) zFM{3+ZF$uwu2lhi{RYCBg?wq2-i7mzhDvI}q7wD6jo!b_1APs~AZ=0>WPgtZbHzKU zhI=KkAXa^_j<;xl+&LA9C!sw{IM6PhKbaOX)9-`LWz}l|u9o4JQpiWZ*5TNW#hXd* zl$mVa!MVVxxs92*B`-aswbe1N6xx7wJj zdhUnmU>#J$($G-yV;8j4m!9;a1z*WXS!pLAzPB-T*9*g}O`PXX-Xd4-PN#z$^Y#|g zan7%R5b#!!9BVg4_L*x;A@5p6Ai5l*WMc&b7RT+&*T9|N{3<0k=cXm9vrTHl)*T?3 z?TW?Un(sxWbE$flZh(lDA!tRua$tUWvig7xa&YIapcCl+>y9BxwI#@*92P)Q;?Q7~ z=wqo@_DZ>OToUJBLcgV$VQ~eCjO;~z1f{uG8UfGMlmC_c4S{IaW7>~tYirkBPKjDV zz4|$}da#=ic4Bf$`onuWUP(p4NN@a@#xlqgc$T zkG7(lg@-06Hn_{+FHf<=R9YUA;KAHIbEuh*+R7Sfx)96tgkV?UP(tJSod zdd;1VVd~ijKyUVk2{s?t+u2k;-;(%JuHzaZm(TJZNFA3XO$MMFB5l_wRhKZx0@qb2 z#P-FGSn+efwmI z-m>gew#~*p!wqyE8Iu6zNT4QfRYtarAg7|NVqaVxZmY}Zw~zd=b=&t&n~SgSE->s? 
zZbaXmed(#c&}=!+hkdAbcr+|@mv3V})?tTI&(X|@%HjtYofp_JQ!e#uU>0G8rY{Mw z>pR_^7|QEj=lrE6u5NRFLD1*xt~V-WBUq=-#T+2pwR>AGSql`@N|IA zje}c`3E|5Q7m4;9r9VVI(Je59*WyJo5A~Rv=#@TqyD9FW|Fult>T744hc4BjiZ@?;3Qr+AKcpC-n;W2^MmVP7*$d&&Mxjo4`oDHm~6&nrTN~xWFc>!T* zHt zPc@$wY_OS$@T=C1Rwt9!zPOw0LvOXs=4bBTVCNQFJ2;Qh!n6Zbs_*Wdz57x|x>VlO zjP053zda&p>+5-KVx0GJG>V%`3ozx(6IHh-6u-g82$?wbrqWDIG- zdvriEH=t;bOwH-9f(FE2rwfSw7@%f-HdF(>cjt~>TO{S+sA6C5xu_fhzc)&okMHP8 z+Q_p-o!|PA{%32CDSt-nNdfMMy#?%8KR{jdNoF(WSL+957C1)*v{!-~j`S zgm^$8eD<46biQ(QODWc$>r&l%U)YvZj-y99DhYIuM|gpNA*p_@7kLLxD`*uH3bGzJ zIH(lLsvd>23um;U1)7L^;F$wsG3BzmOyN7T^xtuOTap-{?U(78j9jEoQo=7xy(*y( z4?hC>iLs8OO7_Iel=b($|%tx=EGu5Qmpm3SFG5{tV?JC8~4c5En)ul2D$H?%NfUUPFk2}{yNRT@09MOYLi69TyeDS!?a;n0jBcOXWsum^tN*3aewRuN3$ z=A!)yYx{D<%ipVgPziDckL556OvB~Rb{=WvQfN+T%lSGP`n%eMP2oA#TmBff&p~+W1glRxO3lL zhJ#axP}bu{GGpYuRQ^RFWBo0*)UvWNg*_Or_=L~84dcZf2`&?*>E#;>4Kjgkv2X|E z_tyr+;2|WE3WhuJaNw~g>;Ib*Z}jKCbgvwMO7J%1G|o@D(xh-j*{%K8oS-lhjJ<)^ zM(v?VPB}R_wZJNk|1tHJT}FN9xT1YnOio6ZvJc+h!uO!M+*f;Z$^j#P<9j=m3nL{B zJ!Hxvj&zVrpa@=baTAT{WdI?rJHR;?hJP|EVJ!W$L5nc2qM-U9ao@)2ZPL*!nMx4qiV_{A?pj`n%=-AAT0MmjA z7TqSyBYZby{LQP_j3?*8qe?n7BF`$MSUF-Z8?@Ysd`8CFl(mOH%>Yz}O)@iMkkBrW z2>*~LL%Wd(>l{!#n0wro+t>sqjlM@G9L+U9J>5ew`+(}Xv1r(!eC?wNj%l^0#FhWY z0}JcU?)I71N7}+a_P>14BJp0q;cT;re}i5f{Fe$E`}7eDxiRsWTdT;dceo{d(|b4t znl;vvEd?hO-2)orl*tDQV#p}()BSC6sP62i90q;c+*Bl(uER`P8Dtezxc%LlT-mWT z0UAG05icqgVA?HDRn*iBTyHE<0R@_rW<>R_uiyy}(pl$eZzvg+9v!NVc1cWd>0Vr?$6~QCU*7(xqoo=V+}@53GYnybMgIm~(AqwF`LZjqaW)vGU|m1UD77 zxd~7K&G_d}Cy;iY^Mg0(e_}4Boh6#<7Q#yv&wPdcPGX6S3rc1F=B1b{LL|3+Y(LPm zq!ega{zIi2Q%J}LlML{1qYvJ^E30eGZw9 zc9)D^viz-VISmv>^PtN3pSdAyB(r2xqL{=Sd*X)FStX#&@kwTpo?Y$T_ZCr( zo)=0aSn$OuCQZaZPZw0ZJeb7zaGoa>u=;D?|7HjcSrCI%D^wh1^;?zE&ZH1f0|MP} zu)uGC2`VFz&`wUeX)f1VbWLPNs4L{KeaEAF53t(gM|Z^#CBjp#4HkF!p6*}aFEwuo zVKCE4(qE_Kb%zNl8?Pa^XL5lq>E>z}fm7eH+X5Kap|J^`}@0AXRf1zj9yKpaiZ26R^j zA?qbaE=${!8`sC0$WZt+6(velP-lO`63$qY);PGD17kW?xFM!OZXN%?h4u-XiHLj% z#1k#w=byCtfcAA+4Lq!4U$Igv!UhT?iL=UBtB(f;1rKfl1Gc6*AIt9vO4zHndOjWE 
z=-<7glHeo}6?Ny#7dx@{&0KiFg;aVKRL9EQy))gu6VzmD`C!h$7Zk7(XI~6#PVMpx zJq6}ij=QDTi}L>t(XjfbWDj#gp=Q|61lE3j)Ur{)B7*iPeiHT}Z%j^Rb=Y;<68gvPsHZ;}a}M0#AshGq>Dt^pOywu5yL2zKgQJ{2N&ydm z!X<+d>yQF~-$g`#Nrwq5l*<2mJ}rUtBk0gn8S4oE7M!bJTP6;FxY1_E`Kts})a##X z1lM{Q5|uu-t`NtH?$Mo(Y7`w1e5?6&*FAhnHfql@%2sO6TH77C^`&;~Og9nV^erp&^3 z{i3Y@!l*c0_skUots*0*v>x;q`D+kJWB&R7%xi&@*!3BZrma5&fbUT~y1KnV^wCnE_2mc=?P+M$p4DwGZW917L=y5w@E;Jw{~f>sRNVja zNyDT@<~pf=V2ciAsK_g$;i%aal;mjh=5!}%)xUAi>!I8&Mkkax=@2RpsVe`Edn7#X zj$Xm|&UF}~!6mkO%DA|hhO#I39obrTu+sv}xct9{FFOSk77>J z1eKCqYw-j5gh?*W>e1H5I}`Qk3j<7d8OgjOn~;Q_z3QJ~PchZjrpK42VGsTuw`71B zFoyxBn!u1v+Q6op9cgP_=3vNTCz{!V`Vo0&oQ%DBS$-2eyHd=KNeE4wm|$Q@pR)1Z z&E}q+h)0XU%|L&=gsCxVj2;ilXB-L_WXmLNO*bE%JGeShLXTOQcKHv(r4H@IvL_WC zJ7V7WIey&Gd3P?P2}iYD8;5Q+j;p13r@l;^W4}~FkHr5b?|NP;ez2?z*^Fpj)pn$B zcDSm@K6IABCd8i&Ac_5J2GwHxNwwU=$oALTZzo;+gj$0nUZ2el4&{hH+))B+$tqF* zc|o@2SFV&;ub-wCd(`Z~C=~I8PP4fkupPdp^mM%igQl0n9}`q4jnvf+0T0)h?bOD= z6P=rp6iK*4_P>vzPBk89?)j1^>T&=3t;sLk^7&$=9@YwE$jhgnIgm9 zsWRct?L>VyHznrm<(@GvWZ`mZeXZH9>3A%x*&Z4ok?MQ9;>5#4QEOlG|C+MhR(V&* z$@9E?G-0u+fw_A7swkcPX!bt164Y|f;l_;=$^mqm&Z&c!N(}o7QnQmWnF7fP6%>5O zx1xohuQU~h%?|Fw#z4V_m~-+S&2EruK<7ka_QbwRfLs4JH!v@IvgYt%d7n4y9>;AH zwVn*Vy|>03Xf}&_@5MH>=n|1ydMWDXtQe3T4|c%DzyO%1FjPk9i2s8{DQrR?>VU;9 zS#`w~7`VN@MwXTq>0JFTb$x1UPSEzD!=?Z#ftd6(?S_m?a{ZjcWK#sGHS_Hs(Q(ML zjG`~3trdd~we|wT9s;GyK#{$aIboM@+$g>(g$O&wmf7PwXY2t}-peNQycZealB6lR zgXxQ@o5N|eM4?Ojww{HA2BU8N+L#$mb&y#+LVdme9JX>B`@9jj@d1y8+WM;E; z)IiO71|C!$5>(rF-2uh zW6fA5*~vNv!z{l`-S_?J`~Cd|zsKYE56rx;?Of-a>zwDSIoAutXc-%0clM1$ZFJH6CtUtQ2A>a>p-{3a>S{C`c; zqt9pd-anbq)<|5-dw-p3SYM)TLXr8gRB`T>mo+4t1e@~q-4=+v#nlcr4M>_X$tYwY z%+WMAgEz!lnznnU*x!5m@4y5iB5+Fzf8hDAw(iHjawLeBx3TBe5F=xcrShbPlp5dC z#)__={@xbuzwT6LEHKoTRaYfk-?`i|oAa5(q<@afW~jhzx9duHYAE5}PlVfEtt!sv zS5wdbpVQ5`g20T)pJbfg#@g=o3}j=v5tZ^@_(fCwpSgheKN{ z>?HK2cno{QBk?nHgO~+#R~K>AA`!VZ1rOrlH?{ns7GSih<`{_f4pkdy#Rd?rLU_E) 
zC?6uS-u|D{`xnal_~hRE!_Dn?Au0*7w4*75r>l{!AxM)`)k18gB~oEZ@m({TjtkAdq%}W96FugrQDRPL6yk0UVXmvdw(OGJ@i8xzmZQj*|8kN38;S@mb?LA)E z6Zk^|{H zMFC}OucW=^+V6tx4N^cknF-rDiu6U)J3sj>@91l82%1h`n?$a%>b8gGR^H4sK4xc} zI?<`s#k*e2dr-@fm0tQaY!=d#ZvOs%g&&l2n-x89dXo0f&a4LWF0TW*0Ec z-ySd?3^PKM-c>>~N4v4o+fne#du}Yr!`G@V(CG5mPJC86bUs4YewXZ<`!EHzqEbZy zChSw#%Uyjp@kNyd^ro(|kW~ps3F45S!c>z$QI&&D_0i7lGk5qhOQh1VVzlb!d%IDUh3WVNE0|OUy6pBT8>ALY<|(?; zoKgR^^#06^Y#%onl)3pqC44DVM+M1{fPciDLNS+^>wV-&g*6vOFW(ROj6>+U{rJrO zjqPK9T!pb0Yj^P@3pT}V00i#IayP_}Fq@kkD-R)BrjkQ#8f|l`d$3u z)hd`43IExWTwDe@dYD!x)p8;@4L@OQt~@{RGPyBx0KbWx|8UIl`y~6!)B(anpR`jDuXD_?2`+<0t}iA>T=L*Uv)%g40JxA%IWoJ5z-~{ zxrMOHueOdHaz2NkbyJ9djbI^_)&FQ?Rl&q;3hEZ* z7H6@Bw$z%67p;gHrad%pRNn7ZIT7RQJV?3ZRv+8XkO} z9RXGT)pDfxk^q$To-BKcgc(wESv(v3% z4zoeH(e=KfA~erjkkuZ}RB0^{1^f)&8n$&t0gv&FETFZKPgy32Z4pRLNyfSEOr-Y7 zw;{Vm2txDbl{V?G({oCSx30q+#d0jm9mjB$({=3aMK@@F7h`+;nuHz4u!k3%v=Kk- zF#vhceG4%{s@I z!v0+x(7d8JEzD>lF--C&=RD)~QmG#5{^mW$iymPi3d|W7vy3NenwfUXm~K>4(V8Yv z&P%Tb)RzG3MG*IvaPmv=^k}rrU60SVESF_mJ}jJ2V6^=>w?4hc z@^}-8Fr7@WwOX3vKt1b!nOgt4h<58vWfP@)t}f5R#~-Z=*C!k0zb@F!+o}MNCYzHZ z9Ge273-8yYtz|!F{nghywqfE2p1yru%^e;=jRbfWFy=+8;)Kt60_c=yTWCjDn-%46 z5nCb`URaIN0yG7L{PisO>SfW#YR?;U)eZ3B@7{6?Qd7nt*Xqiv>`ObWPwY7=eI=ST zW0E#M*K)hd4!yp4v0X;jXDQ7v+q5TD?)XP+>KUi2l?WIOgNf63HA*WOG{4iFMAWNG zol7g|p^#)4JzBy1Qj+<~-#z3TT(0?bZ8<~p>|sMV`y5O8>db{U$xO!%-#?9@79#k$ z>cYcd&9xFXzPz=CcY=Kw6OmE{wh8<78p;D!v}^R20R6<=1glWhr?cEf0gS=|Cq?;_ zj+YF1@J2>FNe*E~o=TfB8Mh{Qad*_WFUefxeCCwX-3ynF9j3``IR|-Wo1_Q%I3Rkv zUB*YvpSd5~=g+!zg+d~xJR7aDP=r%2=?d^D@5@2&-Nt@pIHGL~qzevIUcsgFdj&^M zMLTS4J$(m39`@2Un%Amryjz?}k-lH>H}Z>~Owa+kfZ#Hr(>EkjLi4|atA$8^FkGpj zkF5+Xlf(KQ$=(I=^B>23Dc-2c(o|pSBO)4RxWC(Dl#fP`)FBxSC(X1h2y5n#t$Gi# z5H!@M!)G|Ub9ysh(MQ`~e&|D@_VyZB@Cvdi>%3C1-PZm(+YqtU*p)gck@Iz{$=S>Q z>2^?jLrM_fgXUF-BGl}~RBM_rT@#-n9hBlqT#o6p{waSgqpy<9 zLM56Ng?Xu(Ow-R-?sLRaoLrRLo{l#!{gs$U*EKn8g18Lb1-n0*9F1CEDUH*TB}aYNaRy zK{uD=1FJ2kZ>#DaU6{B8uI+j?aaZ+L&TtfA?aqX55gqFKHaEh0Z~cvg)-%>T0VjH&wbDb%-jZ}UHKSOz 
z@^cumTIIdC(j0h(p}VozCT3FVPLus~{cE{I$jK+7x~@qS*!R6avIodzZMU7NR{8ca zt^0kE=0xePREOf12bZ9WF7l^ju8MSJ!PD&4Q-@!ljLM{gNMEKBw^4-Rdb?_0^XXwdd*u&iWnV{+NO-9a_rJ+Evd0oAx zE*?gK;YQN_(voh)YAZ34Gq&zJ;xyg{=BXYP8iuGviT)fcs}fRHR0{2CpKe%f!5)})y4Pe;|IU5xnq!yPEX;NM$itih4)po#Qk#m zR<*Q~K(qLGzs;GBVgtoUlt<{VS)Z2j-0!;Ol6Tb3x2AuJT2a2U` z4^Ajv49efE&^E$gsz?|}T8#pIYqrVfZYDF{-MQ;ox&>UX%X{Qo*{e3*w5@c$Vqgh= z(XfoInF28NrAS7Q>rVjCQdNAL?xRY;qO$EKo){_^ULz-4-LTG!k z9$DF${vn=>`@GyWRGnKsIuWUwZb!TxGQPFL@8@K?^4J;k`#PtpMyv0Fy|^aAjCi)0 zTK)9^vc^}yCJl+_C}+nyRqVAqz~MYu)MTqtqLRRTv~WK`KV5sjdm z)Vffd<)jDPZ=!!+`cC&TCA^VhisDdppdBv8=N}=yiZ?kz+v>!b%9M>gRg~Np60k7(=qP5T^HYd@eV*68@gedB1k= zXDqF9K1aFVvvMDD${kr)=swSQEMR$XJl?wO%<;G7$d$&X%`7IFbP?U;n-ixg6KbM~ z7=7+x(uNl_kM&+DwUT09WlVPaKno(Ba27PaR+wium!kKu;EU``io7LOSMW z2)OHB8W%WAeX-D+m@2(CsVKUFx?~|c&FgJ~Cr6RcCH7bGsUp=1 z4C2>XOf!!ln)h3LBD$YThZ)#Y(1sLHeB?u$@rhG92WWZ=nRYKk~`50@y4%-a+?;j_=Kz`jY_ z7AA~D?6~_4|6jaP{ z*UU~fYHeKhr~iWk_op4&ut;406kMH5L{NuonP1Tp2XKecCg zXAZ|BC;PpAs;e?eN9GTwN!kt-pl- z(){k%NjLnL-j@LXe`$EDJHVK^aP~rhl$)V|>HjiALQ$$^Va=+MR*gh+X&(U|Fpm6c zkNJ+A>Hbey2+SM0!1L>B0D~7f92lFp^z!ahoQ3Vj2d9hk&0)^70~K_xaUHl!+@*>( z-jaC6q&aDcIC{w=zsW!|h2)ssakhFi$G*pBhQ7TLmZ;Ey7H02uQo|+_9hjkf2980m zVZCMU+R6g`c?OMLrJtG&x+^mLAOGjRhH$-okk-R*eOd5LAJM4lLB^N;oqDZ3Iu03w zSq2uRZAekzqHXXr&_h0PPq9H<*&I718T~x zrqhs&byt(I10u*re%uzCxFbLIx>~j^l@s|j4qIKfjUTS%-59$&AhlpUoTM79wa5;! 
zKAd#8iImutnfUXY&h*7k_svl~j#nU-aIg}u*5{=JM_Z`z>re3ezOq5gW2?;5vyR92 zExdN_@@|v2%*|XN9zYI`X-nMwBLd`Bh;NF!Lfd=e!|ltT)U7+#Llz8L$%@&IQ=pT; zb`?yxWx~V9a;R%@=~frtc~ZZy(cH4ECyZBOT$JOb(LUM(@k@haVdcUVrf*WgPS%Et z>asJKs6?zdg;(Zr?N}wcNrx(h!PsUE?q~U3udlH{_bG^SepXC{{Y(-X30A)Q@o4 zSZj^Si#P}{Keyy-h3 zM+*Zv?gJwK5d1DG9}`XHym`ufUWDfLns)p!_-ls=-X0oUj5lhMBS?9&10{QpJNPW(btKE5j;#s zGgtVga$YCt`SKA6{W_+=zOMVnL_fLlG~gJkkvqi)%SF<2Dx87HOnf(`FF<%N`=H{U zXRuoVALP$*W3Qmf*QKkiLPKY2OQEaUP7grkBHV7YVsfc15?)BL@Qn>LS6_0u)eCv$_d z@EfNTpj~o;UTM-Ep=+sTBMyiXRL&HxLC>#i{c=?DG8ywXeQkbFr~c}{QO&_=$%!Vz zF@=g?;TV1qLsh+RGD+`d0*%^1O@Ob~($H!u>j(#pHXReeqTThfjxL{Ms<~*kFOmV~3pV&;% z3OE&W4VSBJ;|gJo&DE)0#KZ^obOfv~G3lg|8@Xln>2g)KO!ac%hKg_gnA(?kxFC}C zN0*TTLHnYNhDhnei8{itzim7)2=zIV^}|n2%cD3Uq$`i{8XcG1F&s6|{TNpb_|@~9 zE}t;LDfdy7K`wT14fbZfgiMwfOeShsXBN>0vt-?T2deMCFv}&KQs`%cini(rM2=Qb zFOuz-`cuQg{i`S;8cSxX7Hq1h9tE=hhD*K`XqoyQm4qGe`KfS1+S>4xWqT99hxi6U zqA1x&L{dSxZ}pn)eE)2ag^EW}+rT z$)0Dc)WmCU+vhBjmUT}pI-5xe`(m`Ur+U$l4}2duMsMGZzZFOA7`6cpDYE7+5CL#u+|3vU90CKkdQ;YB2dS@|;{juKvE; zRVNaDu5J|g6Se2YpV+FqF(SW?I}MG;S9Z2wr6#UVsY?itR+y_D6D&YCUEWfM>Pj4J zhPmomR!?KMk0yTEONA4DZh3P_W7aNLI|pbgT?;Yi!B)NYpdX5irv^lU>Anh?HDiYM zt@r0)pI*v`f_aZsmVRxoSl|9%K=PQTT(RgFM$R^!+8j;$dMrtane3c4xKA0YAEnCA zS(5}4rXCH8{1K6;`>1FFb-jv2{e;jTkiKhL*8FBCShv)Dr6RxPG}6;**s!$5MHF+j z40kMQTK)c>A@D|L%onqmL$ihpIB10Rga)VR0g;G{DktvrTMzdw@A$!Cs^_%0uAr=o z7B1uKu}jxL95TH=Hn96slr<;yXBpV!r1ONwak25@DmJpvm(K}c%9l(_oVvV~nZbKt zppf{?A1%17=ihRs^p1)0W;yf>IdO%VImG@l9O*Tbt3eYLF%Xz~x1xzL z^g4W9<-82aYM|N!?Dl5P?{7uLP%J6$+SlBX0rD9q4oJT41~$%LuH#iVu>v^-Zb%`- zSb@EK+;-7_IcHYt(Ho3AhK<5FtWU(yVItTY^{UQP5Q!}IE)6}deuHc->dg$I4K+|2 z(7$HgRX{3m@7VMwRV&p92ZWFE<1K_J{R_FdXoBkCdjRqBK}_kQT~kcL=mZNrIQPxVq_dLj<)^Kxq??<<2Y)U;;}n^!1h zT>MLlW`2m*g!-K|{o6&DzL1kWST|-7w2SnM8rCIqJ6<1j4R*W*KYN*Xmp21c6=FjIW4bte>t-C_` zjm+iyyY2uFaA3~qKusxo4duhH7ir~S)DwfT=^?+Jvfccl*-o|v{nmGmIpip)1WKik z|n*u1{rFNZJ1ODlVay$tZY%r}E>uBQt!Gx74X1 zPpWKvGY_MND~E-T=R82A=O6Hx*~n89slDTR6S&&Q8cKnEz_R?QIULpU4$Ara=k?3c 
z^!-1yg+CMeZPE1(I!Q#26xAABW=T_+u84FLUTVFLH~155bsye41V2=qF<6~Xarr69 z8?rI?BgNo+>vxkz{zAw=kv-X@4_wVwPe7bVPb0?t@veW0w(XNB@chuX#H>6y371#U zDNYMtRwM_x;0_MXc=r zxCat&(wh8z9s1ML#cjM?`oi@SXJl0Kp$ngjgxg8k3Q({nbbj}{ZuPJ2$1ZsZp(x8z2Q;CTN@F@RJsD6kMG^D^? zl7!Oq+PdSGL#ub_iMaI;E8MG>WX||G?C(>wQg0QKC!OXsAY%!^gqNe*bMRx#h+_ogOv({1eN7f=w;N{Q5et z)%9GsnHZyu8 zqquoZT2IhsamC(t!tkQ|Up|{>t+JUAPj9a65)D4#@Bi?!(%t;$G5=j)-TddhP z238)j3~D>``JV++5U;F&i^-B(wEtb9$+uw-6CbNf>^@T`-Mom2S;VJhVochd|>X<`s161OYLXF%5^&(w0a)lW{0hYY#9@I zxlWEDj3)+3@i8&$uf0u7tj&`0I^vAe-4H#BA_~#zX%jwg?6iE`B^Ipl^)N>wrf}x3 zB&&A2z6xZV>$=N;bS&Ikd`a(*i==*5FrR$OOJ+`4*5vq zOmKl{hm|8NTo098+;4v!4%X$um2UL@d>vagEedyKw$@-WiT*=c-PfQ3$#*Et~P^`W=jF zV6;U${)&ZeoIk1ROR;h(@6^&?o>pYh;=H3l+-GMYLK)eWnZ%O~G};faG@EGYQapE| zL-l=~oT6^bgwL1IV8SpcE5cW05T~`bPZ9fY&U#DRnSG~+;xrD<1{e?VbmpLKaLHLU zvzr(4O~56c48j6pmOd8RnjOFHvXFGSr$Ql`e%xrS+k|rI+1_hibC;fd@zo!yR39tR z=p@&ny=~}w^wooo6RcC8eKNi^eY?<_(VM^CoA*xEqQ~Mw8%-nq72r>)s)=duq>+si zy1g{U-?lfis}Bi4vlljsRUfIH|7nK5St}r(=DL=N4LTKJCoU;6G!*iO(Ho`J8K4eM zbREO76&KD}E_DU_M^T@`^{>5coU5IE`q><$AaK)tjr{_LP-D)u@NgYshTBzks>GZ% z5N@r^NgF*$MCVO)UU5%&yb&LF7$E`jU$EIr&)g2x3VmkW$UowhyR;koVJUgxj!4kv zVRcEOmde}Jhc|mZZygPJQaPh`d0|<{%_jlaG06Ai$L)XD2GiIKlWG^zXE&>cGg(}#M`$rVu{jUsuzyV#;o`5{s$mop~ zGn^ccUJyem&>Xxj+cJYh_GofKp&y)><0ir*+$u7D~z}cjPEbKUeKgNQzfeR~E3n zS+`#7;6s%)Qa|8+%z3JV&B@a#0_7!}H}_=bkU@gX-?Zu~$;#434j6!K*==RK!qe&T zc4A{$8|FNg5skZYQucpyOu-Pwcp&lx8N7E9;=F;H(Z~xaSQ>OjH*Lqh1E1_XhWP z;rh~!99L=`OAMWH(#1(*===&Lg?@AAP>vYr45`vy)`QyT_x39?Nq>DgygHdNONLN) zu1#|fiHu$7+(!QQx&tKUnCcy(#83^is3&JRN8x5=Ib)M>gN^A*{8q`=c#04Gs>mql zFvCL3JR;KA01Hr@Z-q#Wq{OkQS@VA+Cw@~b>6@_;p8L6`NtW7O-09Hcc{kLTnjqLge~o3t z6?(?>K)*~yY?4J3(QLcW@_FX%9(2;on^V@%Uwfp$M=3Eghin`F)tL54)=(HHw2F5$xzf~8Vjn#^m^pBSXEFj{}R#0LXUn*A-r@X1Zes=AZ^IH zx<&P8w4RA-fXRT-H1caO%n!KFF2mCJsxjv31@* zbUQ|3V_U;Wk7j}n_viVn3?~jhG8)hpruGsY-0E=!83MJt&VyC1&b{Flvd|%e_!5_{ zy8M~gWJ7V;34Rk&o8IEiG%_~o!0UNyzrg7bd4*=^i|)$uGzq9}KktZnq-r`3MCo;U z+CZUwQ{wF|sC9yjZmh6uv87*4d?vOb`EXuzcLn3pMHv{#Jd;X(6+mo^AEe_2X-V>y 
z=tiQaU-}qu;q|rI&Sl(b6mn2Lvsry-KBfR}(bF*rzy{yqGY#PPzvwx|f4Rn^`Y^r5 zOaGCTds+$9VR^kU1Dn51vpy7~UL-L$2zfAgj>O_>gI#m)`vYr*USe|Eqyv?;_ zHg;)UJ?!B^kC&YGQif-X8mw2*y4snwX|3a7yl!J?b&=|Ki`ay#?a33(=?)KNPo`br z_Il_LLHY)26l6T6mX|RLZA-aDp6CVFHnZ(aiB@cTUce4;?KQkh&ei2bw)5-=f%A`7 zxsCDCb4|@ZHs#g5$V&35Cp7Z~o&(WBg=;*ckM8eHoSAH_mZpEa4y3Vc&&OG8Eq|_@ zIQVo>!>s{^)pV~Z_czvV`%B#^(6kD~k7;0vNTD z^Dc(^O9inIfV)F0r<$Q)Qk_E|*`V61f+>Kf&Q+y^^1^`P%yS!kIriypIHTT*u_ji$ zd>iM|VBtZaKvxk({t?QM$?A!xWGyF{a z>?v<7nLSF#>iP2UU+Xs%zV02OG?a5_EliFog;dYbr@G>R1DTx4Tx<4X2LQICg0&8s^TWtnz-j z;yx6t@9(|VWGddPrD^|&`tH2vS%>+?no8og(Xg;D5Mpgvm6MI8(nw)c&fQz;8MzUl ziK;^0^GWv){e-m*F$|};5rMr9Z9h%EICSZE`L~HebqPkt>suiQN^8b}OE_xwGSJr< zczN&XA;$u-<@a8HR4&!-V`_1L;uueHq5ix&2T()47xmbXfO@?5cM)+3P^74?W1Ptt zLn>@9+3aR{!zLPp3%+DbHI!)_}I~cZUdorLH&Cf*X9Re|8OKqz`JF`v;uAK1fm)F zj)R3IVY_a^-uGDLvkU)>QodTqU);W{OMi7b_F!XQo?#| z=pV7=P|kkrw*UX7Y}AxnLqV;@BIz@6 zeY?&6kU-+zU*iSRn}b<#u(1CP9q1BWx|3wV%pg&==X7LSwad0o2J9TsZI9dH~}4QLI*R{Fh4CJ~p;H`fqo-#d+TT1YJtrhSKR? z9EA45^y1U={aA~t_rF@?(*?$_Y%dQrO9rlW6|J@n3IaN#a3)>dG#hZXcdO}0GQ7~ zC5srL1Xh+qzGF|>CeP2<~UuYcG4u~fQ$IT#<>M0OWPaKWXEb4T2hFRQ3j)mPb7a7my2S+XOiyy_fiDy4?7 zWI!+g{Wi=qb1?lcKxT{LIr87OxVk2LXGaRQ&#Rm;T)TK2w3z8*?{F?2&eA05)>pAQ z>?j7IXU24|o!D5BVqxih6ZQMAw@wmL#QY*}irS~_s~GCtg)XbxC~eth090RPx#_8Y0eV54U>^@u`Dte(@T|TbfY+d8xFq~G zvTE%(*Q*`XrTzY>%~o9?6E(IXmGg08=L5w5{Qyi~(U#v1%oCvF;>$9k(|;}ExEnhP zUuj$kp%laJ?gi8&{kP)B7%=0VFt%8)`ou3GD_*hxEkYPz02*g60Z@jRjUgb`Y&RDl zJkdH~k_(W^xQC#J`Oq2{iv7pT-&a^za6j8JehJS#YXyHd(D?{E6T-(mJk;9&PIcoU z-`Sw_#{@dSr3GJRq+lcg9RMIV#foF( z(Vxs-5>TZr%@Z8DyLMDAs*e6Tlz*oJ5<8(t<5mbBh+J0HdcDcqp12a;(567+O8_i3 z9n&!Yv3mn%mb*;s)U}SXUmAKE<=~|mkO)oTTAZ{5>rl~ozWc-ZC!aBVfhJ(uH(Ny@ zk@q0ReI5u%M&@#_Qx9na)|1T2%#1@LsgljLLJo_M_w)9)>thw$OTqMgcG6PJ&E$FL zBx2rHs##eCnmtE#l@h_1HMrWSll93vw0F#mI+h(?6$Fr-+UlCBwKQ1&i?}k70P_n_rU43@Y?Hsp+ICQX}w4u zSBefQ;BgXNE@XSeS$H~_``-$jmtv0wMj{&#%)3Vm-R_@5`EQ;>KLnaymp=~}h}Yt3 z{K$D&&j*HH;y~jELn$x}9p-iY@b>qHW-l0%DA-iY2{qS 
zJaE+u7E~Yo0OW>R-yYl(&bTJ7&8@e-ee5of5QlfYZPKM#=U7@B)&&5eX;%qnr~CP_ z@_PRw>Glf)h&MYM#XbtUtc0Ev4m+H1s`l95D z(JWVf+A4s6<-9|QnZxoIzhBI2zTJhqNa^zeemO?Q+~y7?(H^iUXn)Ba}LPMfcrUEre);`jWMkhRq- z_tF*ixSHHLDfmOT3lRdPx&a)gjw!^lA}^Te6lv8Wd4y+Ad7n#&`FP;M^DBYLEG%c< zU|fK~`W*rXov)=>n~eg67FwtDBp? zFV2ekE_MW02ETqMC~mR!;gFy9JLNeOGm(!!BK7dUm9m5btCK;^o!0R}fu{~1@&Ndb zE&*M@-fUyONP;RHgBZcYY2db^*6mJee5_;+YgJByj#aT-y;4{3AEGjfx*|qBrZ_2l z9+^lBM$U7F=WnWJ4aPnhG6%gyEu@{wv zSbOnKm=qo8$iAEIrte@b^<14CtZvjjc2ect07@3s;}EY`c~FbX4ck_#1b5+Qd3)q1 zBpn!%->MvvkuPZ}j2nl$cC_GoBN+hy>rH{F47$5nl854*w@U9ht7Fgu>{w`FzwydT zSh2|wk$#?eXQcWZft^C?STEvKIa8d9p%|_C9M|^NR-P9Dm_4AqQ2Q+zEddyoM-;6%5oi`5SxBLL`cWiea`(4w1Cx7_GEyv~~K zy<_~uu&TKRxK1BuiUcYZ))jH@itss3WR z)$%D!8!$B3|H7UwH^)lhcslP|7muGI>aZEA@~U+|*Vm3DqWt}f6R49q8Bs#_Mw`yUXxS8ls@?e08C2xvst)z=6P4voRZ*iZNLWAtGYVz z^<`GVlm;Zj@n1gEAsE^R(H-#)#@AM4Q8+xkXRTA))hki)i%cKsa-jn*wN7wEO9rTa z(OpBIP-yAUKeWHhM|gtY!vB!|>Of}0O6L)$j64nX>-32+#6In67|Yjm%U(dvnEnY0 z5>Vdm-V8jglxabM&sW@lS$xraU3;yp&^GV}_AMb#B_HQldtI|WxDk;)C#Gi-Z7yL& zcxjySwPRlKg%9Nbwa+nseX0jem?#)z3|TW&$(0Z&j!$Rd;A zNU5+rt)JfH*yg#-lLuGilXU)%3X@yr0t*bwYf=n z-HyQHad8ZX{<@_GXD^Q<$kG|$N69&|2q!S_p}#}eW5bk)gyt>?b!9xuN~py^k^LiR zphlOyllxv8|J_;$$fU7Xr5hhRw6^>0y*~|&54)Nxk24U^q=WX=Av9q!^Ll-J%bT=Q zEtMY+e!Pfykl>&O?OypH2Aq?QEKZSPdj4VhX~aVErOcZT1DEm@tSnTXcZOw#1YS^N z!naqjoqI7|D!l{c{GQA|$aXm{nayAD*sCCoz_oVNCLpY1b7wpYo&tsPzELP^?1rt&#H@I9PE`XD%RZ zEoY25VKP5Q`YKIS`xlj5&)j%EO&F@fRR-=zeN#HK#jv&Gj@rCI1J1V- zT)qwKT$QKfyT(#DyNy)4%5R@d zh-xQ=R~E8itPCGz4t|@-1vfJCt7iUB@7`-b0*2hKdeq6VdFttQ$4#cO6IRYot$D03 z?~?pqiL<(Uy>eOPl^oQ^3bixJ8$ByfZ83`(dq?LtSH@FMjS*9KlqN3O_wdCuaX_f$ za*U|YzeU%i-Pn@F)-f`@I0KzCP$vk~p9v{s_}q(*NI7Y}~_`4t;$d1~il;k`7weP4)9w4D~AL zueliRTaNS0|K{VnL%@=!d+E;(d^)CoGh_#T^hg&rpmHM6EwJ_q03%oP6ZKicDmS~N zrCOa;k3o7At%Z_k%Avo++m;KNV^7OiUI`!I0`bT6g-)!ANL$&^u;u3SZ|9V!g@xjg zY~7H(&8bz%H~<7NS=3^All}O74(Vv;b76kvHp4^BLGNxO^xwW-4v| zIz1kf7}V+C2IxvFn-aX9Cyx5ll@X8@@>>axM|i~^6z0Y}u5xQRhGJ&Glk#BaKOQ9C 
z53LUKGXjV?2XU7~x$jzLaGf*Vo{d2x_^or^oCoZgEz|_owD8eFiMMVRn~%S#e7Qgqxh+wWYXFv1Ji&rAJi}i;VA>N!?8W>j42btV$9LyT z^921~`(Y}=)OZqDIlxkym!CjSY^SL(GpP>N06@9)G0%_Xd)uGG4jQTPXg2m71hjau zTkFUguN79)G>t+r&x@(rF3$iW@nmGCtCMtRso-RK9N|1+EF)oPXrKarmC(Ar|%t!a><-oZoy z1n3Bf`CF7ueJ@BZco@fnp{r6&m{o?MBh0h{)N&N+yjeE7xnZ@U?aPI}NA#KacT?wcge4?JHi0h>BC+FHkADA>q@l@H7ql1aA)JhTZ97& zbxtvR$n|VJG3G4jo_P5(gDJ_JXWFGi+Nr#uJapWQv~QW%g0rnwAJNTJDlbd{azMOX z9&9bxlx5dNPri!%)<8n^>Q5c+tD!T_zPD9)nJ<$o#h~wi{Ii^MI?y=T+~Ydu!3g1I z0ft6}3o)W^{Ut9|Yqt%{+tr^2JAw8u|Cn+K1gQXXM&$^==l|qnV+1ORZY=>u{cc5q zQ_e93;n&35KpO6`u^Soe@8I>%JMjCwd*(F~@KaME>M> zjyh&u&Z1&>L?pQ>L8vL5Uo}2Yu27J(`8*&H1S351>t0X=HqM7LTF-)9#TkQ{nVuhx z`o*TJgzIZ9AH?2p3L6znAWj4Q!1ptMj_(*?iBepl78G0Hi)!bnOZo3Z5y)1n*B_03 zoU#Ci4k|@-9IEUA=2v!P>Vdyzt`>lvViv%VAaaP|(l?MIuADL+VUQjQS`^R(N^od& zl_PW^++7&v0~K+01*eKE&)SeU?4Zu(DZI8l+8RrWyObXZxRdS&uDNCGtZ_nOKS$l3 zsKvaKN~^X&CiusX(-Zwo^2Ta1m~V>OPWfFb7)x0FcZRYjxO;IkF&*DH!*$$uFpUE#ik1cDq=u@-AaW^sl!$fk_nm z#o00Tl>E-(?HhQg%8w0wFT9caKP1q)GpMldJu8fLHF@cO5v5`^A5JSLjf0P(nCxNKa(XV}tVNoSVBBFuHSphPC2vY*K zkHW~Oj+ep$*;m~*AH{=6B&I0-r^;B%ebA|21 z08WATY?h`D1%Ou%d81HFpcrPZ%dPBOd)!dbNC_Y<2Vd_MNxg^HH4p39w)2UEYMBEG z)M=l|p~3MUzBO`{RtTTU`?E_vstkYy2Zn1Et{B4s+=a?MM&`BLvwtrro*vdySDopt zRi$_Q1{SB;e;kc|Bg(yVieFFmr!%+UsI^(G1=jJFNCgcsKqvon_X0 zK%(d_SylklH8g*hoT=bsITrVa5g$Fvbcmw^DOmH*&0i-Y229{VeaZZetUHlQ;kAKb ze-+D;N8JbAySh%Ccf6I^f5?6QO?FGbW`aFU-ePiQrUml#{lTbEA>8ts}Y zDtv=WIO%6aEB5Kf>KqUr9-YTZr|8{SG%-E$YI_LsX$|G)NBc6+)qn%zr|yg&NC699 zF75W8HxO%ky<-7F3|_!W#olse7wV2|IRNzdsnrip_pO_Z$eZQpT}fDd7XR)IeYh4c zI;UWKep2%T!ojHRxxo8K=H=`+*?HbSDq5S!W#cMVcH6YlblxhG>aHoo#pc|t(#~F& zwO@aRP7e)f9M^KP-&a1 zE)d}7B4rGzixi)8j7;qD&!dozQUL4ap6bupcW>?yRegwsb|nzL>phPRALZi=VZX*2 z-$90DqA~))97K^33^;z18Mq>>a9@;1KD0n`7=A-PD^?=tf@_}+H$*jX^DE?{A`naO zsxIEyBthEJ^nzDMQDM03vOyBdx&S49(R=XghQk5s<)Zt~y)t+Ag2<%87U>xd9aB3; zXW%^5=NCAxN$#j-l>Zlx_~f6Q-Q2t^{0*cFuVGKW&hx$L0cp{q z>hrJ@T#5|Si(}Ec`qGs}-@%bchO6RrOWm~O%5cRyW|V)yfmkadi|Mgnd*ziVISYZZIhlC4hYLRI_VcP0TWNGzGRu0 
zU!m3tL@a?$N&p;{DEw+-1KYOqh7l=8Sf=W+YqC()e{ZcD!b$}mIMB>r>g9yzWSZ_~ zIqj_u60cbm@xxo4%vJni8thGJhUEZF4ORkvsViBCw3}gasL|0bH+1w_GIP$ z?KfiQb};bR#`4MZ*}Wf<;fC$bO9x~uT5sk#xIpKzdM4(0tUnfhU1>qnu~vZi>|xfE z%l@}jl2w^M8m{*QmfRVGzulvEK9)77Sv1rgyj;XfZ=VB^zHjj=kfH&?jz=-2AY9z_ z9a^UKzkKY-|HIyUMm4#0VSp$qHbA9V5V#hU4uS*$0Tn5Nfbfx1=HL8rEtkUkzWbcB&o0k?9=@ZX zHr4Ua;IjqIz1a2%tYD}$LWvI&brUYYGWwBv4`XV)ld(~6`E`9 zfKfbIo$07K2>+PYTdEA?^b6??Um?toy$js znl)dMc;n3F!<>s@LT>|Kqb~9_$*e?tAJVC+Oi7?Bv8|y&lmy2QE%CMZxr2 zMG*b+1z=+!F)pm0W_}Z9eEqc_3H+<%so+7_!WpHr-4~5TQPh)w0*rj=AloG-O^uzO zUqr8rhS3OWyOQn`5FMgCPqJ7+jHXzXfw#Njp-rmb2KtucZRs zfS)!|Vj8QUk6yT&P$fhCa!2cc9b4E%g*<^Ygq?yHY_BUQgjT_Ev4*+Q^dtguCSm|-{mzZ2$w{jE& zjOeMsywN!WVsx%N13drnip$MU@WFSXEJ?r}?A>4u^!g|?VJTPW>5A^7MF>Dt=3v9T zO8hO@u@{qo?aywpQnRRSj@Rv2O%B1x&8H`6hD z-yKwQMJoz8jt9=j9^zIsa*1ANT2}+Vv@36*S8#-yo)(ol;dO=XU46kEEu)J89x~s% zer6f)CfV<5J6#XviF&VRmB7?>*b@j{mezWZ%yq7US;L8;$bHffFa6EwUIu0*Qqy_( zim0e#;ZmqGZdVrNcO%eqyvloeOt5uTYQl(7GXhBKPEgG}U8=|8_4ugykA4l*`R^eW z(PM$WM2EF)c0a1*WVg9Io=lc%3GTtCIap zD-5~;9hm_w#Q8qITZKuUzP>jrs*6^JU))>$>*!p=9@j7G5zXw9Zi!C5`0zMNpp~ zBzSOZ`EN|Syi_`#yw_v%dZo@zT>1QT&s%N>#LtGzt&9P_e3&rV7l`f{dfFI5RR~XX1ap7$C&2azi#BK-}Y=0AziA2Klohx<_uj? 
z$gg9tCoi|NLIq8B44N>5q4ab6?aEU$sig#?+L<5esS(0x_-^Bdncx{7vy~@!maybC z&D-SPDq>%*5hr$-`E|=A^Y_z<9;|Antz{!Z^JnAUQ;j8sGdg)K;ntTclGh)@#!cH+ zG3sSN>(cjt>gIf-u7dGQYxs$?AR(3aWaiO*b3fVgu)OcRzr{2Eezp1lxPf(F9@RL7 zC)fCXdsb~w!<6wuf#SZBbUnYj zZ-9XIfG_)(`PdNc@~`IrcMIzv)7gPjvQpPJ4z%9UdW5d-DP)tsW3^*oG)+q~9AHmK z7iUi?24D-v4_~2E!Fl}C&k4=p6J*w`rG>G1bM<3^^FYBWcDd1cwAwxo%Df!EfTk)7 zYj99>--EK2_1i-TfkWj->Av3m9RY&L!+qR+Y%{R5?`ymrH!Ho#U)kr;v8>1;{#~XQ z{jle$OFcb3ps5d+|BL1Ii1@7Xo8y{+q@p+xH>T?wS3&lyyV#xl^A6@$qx;8k-t2Ny zBfRYMAIzTfvIk5Ud7VFI9qK-o@qEPvOh^_7f(D{SUD>_HfqI z^bm$Ev&j=f4f+gp{Rt)`|In1jF;MTS&jb7feY7|ARSvZ)-`vF|JVMi;4kQUW+?zEt zaV8*|*dSa1VjUor$pUa2TNXGVVt?hJ@8iegEO@cL05bap@8C!MTC-VMS>;p4D@8JG zh?V8#EmIEBXc{O)_VGFp@{G8)|9P;<8?`tO^faKS*dxX++UKS__Gv+dtW_ivJo`VLK^GI9^>nynu!PK3Q^ds z!+q@EcLgl~FU8z9JYcU6JPT`^*;uFN?l7FjB2SGx0y>iK^~?%rnt7kLY4rJ>d7^Py z-{`*H*ELxEpw|I}R_*WF58qKZWb{xV*c}#&NQwsY!Zg9{eR6eQJu<37=CBu8PvHSr z=+lMUhl_vjhQzy)54>9Z%T=||LY+L9!UaTjWiObic%14PJ15mTGPFH}%U+{NnOG}uEGF3Oj1Y}BQd6>e^BO+wm+n!%DG zNi)8uN9Pp&OS|u{hX56gcEHE|m*N3Bq~Ig9KYe;je}D_~82dm(=6-O_EISOICs#q4 zs0*OU76KSRhpPu&+=c%i14u__uKi!=$^TCzl0p{6?S9dqwb$HzysP zlM&mi^glqb#~yw_bP?#@(V#=sJF7nf!zo63n-!S=`33m7`GNK~9UX`DFD~Vu(lY?p zqu1DEGcepfQsspx__j1u0mBAvKv7BIXH)%rXiVj80FMMT`oJxd_f_K4&}Y}voBx8- zyP)z%p1?bb)n)jMMBEXnM|*2|W=~E)x|69ugF9zSPp9iRM6&1v;5ipL{}t#+wkz2s z9KD*I_kBBGeg6-e-!C;dO#>UzR2HRM*j*bDE-D{*M7S%KtK4jDXUXJYe%C%4RL}Lu zucL)5q#gT&A>cD$zc6tx|3OAc+7Fx0*aDh*N0}WmSCDxW2v5lLk`K^1g0d9_msEpZ z)D8j%Xcgt!7Xj_)@iD7XbXhvPe7?&^|MWq$ZJlZNnD^-pc&~w7QLy@8LHxF$(r*sx zpWqeE{0Mr`l_z~H=KP?Z4Ag~3akwsb=+~~}eHYam5!|<~$x^H~J8;D1Wis63)`{u;X@84gu|2Q45)td4@hO|^-wxRrtN_lNQ)6*9V zVT&b>n6m9{0|5qJ*8FAxTd&*;OMs7hY-dih_E01gBS+>8rIhECo*c{s6$fuZO+oXp zXFHHD*y1CUWv@Mv%OhnDrVD)pT+z~OA{jXwR=+LoJ&o1W(C|Iy1&3q!*s1`ERAhEy zVnT47TSUPbEwE%ut@c^EpU0c2kt263CtDM--=H6AHq0E$vj@{*IP7J#TX$t2UFVON z;RpU8seZiUX!X6ZHECVIPc_*Wecy8u)YTRX4+5TB+4-kTdvbNHCSdI=2|r=m(P{U) zOctCNib$Tcw}aR2g0Jn$uq{_l694U-`UNU#fQXEM-*_NEq$Wte(^QCpt~h+)l9G`r 
z)ywDP=7t_D$OaO!Xw((H#xwr2ll13=-RPdXu|NCIZm=0P|1nk(rPg-1j!zfJIoRqH zo(JmEQO;I(FdeIzSIX1@A@ej4atPG_rD_f6B5gh3HQHyV9~B>|iGbLL%p^h$;5qlS z18McUSXRC86MNM10z?_jMF5lS#=Loib-H zT4!ecv(y@ps;?7U`c6ZTBJfxV)l4orS3kpnh(EHxqsO^hk5eCxUG3e$#U00_^&W%} z_-PA9yyW{jzN!-fhDRqbE+7Q5=*jYv_Z-Pxm)!7CCe7f$s#$a_A7Mw*EXw z1&%!eV21_9H!sfuEby1`*veYpnKg&-Y??v->rE(kK1IS~P@28%tEVOI4Cy*`v$r8^ z(+baJXSd|4m8RUN^O|clas|KY?3c_+!pW$gVCoWRX=YnEgj16=aYI8r-$5=!R%l9a z!w-Tp9Axq8i$2(9W0yhzklMHI7Yy-?Qi6@+`Qg(J76bvLLlz}_4+eD_3m--<0HN38 z$D7?FjyWcfWqOYmxxdwOJs}&RUi{(8!o@uRa=NP72guwaFHZV+vIG@l<7-+r z&CtOv8>^S9xtB6FA86H)t2BY)OhFVBm4PNb8cq_gwv~Vys^W`t7*j8FB!$fulk@X! zVaxe>Bc+3{iatOdI38>8jw=wW&guU>!~mj91<-g<4s99>P!x$sAx=!(*%%l$D!-Kk z78*1V#oI0cwIJZ+{E$v>HK4j0K7=2k9Oa{@P3QLLykLue%jZtZJ##~KQhtty=jKHe z)PU2$eRPiBPNza6)|H#ZN)>kI+`;RC-(oJ(V{|u{?u^_H4Qu!Za!v zj4|wQ^(R71d8>u#T4NNC9C`HmH;>u?YwN3!9O1w^2AACvk0M{GW!#J_K6uWK%6mB^ z@*Gl8q~F8GuQ441bMSIV?+D4rXwDiK=3uXXVG`nLN%>2+HD)D@-=?&(A_Jn`wi$BH z=+r`ZUnz(5yOuF){-Ko*pho6@qY@bdD1Cf_`y*4HuSwyS@NQCb&l;)vh3d&}mqu>41CpYnrThJ@N7Nb;@()y7`kB z&C+wwWjIgkjh6cn>MpJV;!(wF^?^YEX^~xk^G~jooy6>WKRT4JQpwI+1X#XG?1`&>JUdw8>o-QB7{;ui>BVQdobyNK?7g7wKOa+h*>$01WW$3X>&4 zkyv#zw_#!dvbtG@Omnm zhPN?BkZHqbA8A{cNcA>I+)L1l8tWN z+QpT9VrS0lj(IJ9k5~EutZDY|&F&D8f=IW8HMNlavbXeXT;>|@uOx4x^wAcw-q@J@ zNn89#=};45dl^tC9|BU%sqg-tgNlcm*+yE*ovx-`3>*(1J{&*|+Sx$odn-LKFF@nk z`$(QjzJA_)#E!vcYE~K0#3VUUfaB1!B6`x~(zwRbf9vEqNK$D-4-WUgG2BD-;(GV} z3YXYsFat=b!S|hq6weGQ<*D!%H1{e=%EwCyXgyD6%31Xpy1LwFj{I(uTgAMF%&}_? 
zgh++P-xbg*`)vGwU7&3DXN=EYjDLG49DAX#>tn*u`IPM@v#P$I*b|ETo@3Jn6i(iO zJ@_0a6fE0qcxOZRNJRDy`Hd#NmmS(~Eyt@Z-hEgVw$+uSioe2~PEZyySYv$PlWJ^M zw(Stk!F@Eh*ExUY?d~0cG8E%RTTjSqo4LcfW%CuhMY3wl(i{8`LrUB@l}Yhk#g*y`zR#X zy|N^d+dVs7I&2Z;_6d$#NJZLlwni{Uka_cYU&Wxf@D@)oBVFAx0;Z0sk}6X+hbJ3 z3l1s{9DKVjSq#mV?s8Kw6jyA|x!N`a)=8&Ac=%M_6ee@9xiFXDIWoqqQp_nf(l@0oRo*y63XvtiOEc1u6!yK%5S~DL-Ej$cLZadZjxPexh zUxF;3R(9|XX`#g&1LpyhwD5XG^0UzpO@)f(ij9pY2_9{kVa-`2FE1~cgEDYN<4pwP zdn+l5^}N2r`U%IDn;CNIR_IA<92t*vNb8nL#x732pE3{s$dsmV0e0RNDwvZBOhL>) zoY}5aU20>d6I5fn`cEF%Anq+H4p@73->!>5JT^u>FY&PT4fM{k{&2cju`GUrc>WRV zY?%njvc$h-b*sQV>mq?Z-cF;W?tP@wy5O!fib>UaidJ(j8!3Qzrx4u3}GjJcMZJMsE3Zwd!8_@Z8Pt6{v|3~wG_ zNOWxX`kvfpab6#+mB@w^7F)A=HD2hzkk-?&-FpEk&z?GuaA)vvRJ9#@s?v zd75DFe@>t+IDxa#xn(W7zUL|~GXsTM6eK6~<7xK!b6&)`;>#n8W_cdHvvZ)wUsuA9 z;k|)Mearzpypihu%+X~xK24=h9(qr9$eS*FsMJD_A2Kf{FI`2bDpv?wu(LSgN_X15FxdtH+I!$95&#e^>d=%a;lv!>?FD)(M9LpF{4P;^- ziTCv96Y2b(@sK9G!alZxnwayei19hemktc!ck51k{Lw~~iOF<-2K02d`JOM0S?vG% z+A#Z{Y>BJs%eQyj_IUz@>Pc^?c4mqOMC|Rbv;jBg5oIx+OX9xaE|_pYGV6CD`$45B z!#NtqA+~UWBj>BXihe(Hq+Un4^U@bt4zOjS5_z3n(6F;^ueN`}@#{c<9#JI>IG>+B zw00jc^w7^DGAu+ux1qYx*-8E^kX_-8v6#*=2-1iqJIi>>nG^X>48*VMJJbtF0J5qc zhd@dr>o3xanfNmACSM8yod#^qy*Mcj&Np;(Yb| z-fzpMxGNv1om4SQcf3B)BT0+o9PS&FZ+HngI^tq3RC>sVRY@E^PCi)^XsOM?W?E1k zfg}ehB;awGbvN==>uo&9$AgDp4y6~ApE7q9I4%WJ^nK8wu~RVU3vg-q=jYG4Z^CbTJ;V-#$52+SWPd4CMcx z&k^9xW8pk=T9wZW+?L8d3YdDpGQf?xBQ5PTTeKoc;XbHd)7Khjn!b*YtgF+T+8}%{v8$X`6S{fE9Yq?wfwiI`N&atCU zK5?kJNX)b5%n<$hkY!toU+V^#kMK(czdk=MOAb(Wc^xYt)mq!ik1)v`x}Ic6wuQuN zmlRKOrPBLsF)+hf9RMP=2EHC1T#nij?pp~B9o6*k`z%@t0DD-#!Z$Wlx&WfHMfTCp z2EU^-T7swe76h9%V6G4d+v0Gz;z|}!x8mtgp`E0JWD94s4(XaLd@8cO&hju|dK^@} z*t^upbMwibevzJ->8DeBHL7I|?vl%n;@c==(bh~(MQ2W$D&T((PXKj_*%!6Sxu1kZL;FqnZ^niuj5_r3X}1kMaQ6F9 zH^5!YqullWQgu*45I6-qJ=Q3THOc2Z8CI4J(sz~TadTS6HRrF(n>2Kkcad@mM5bmd z?s@c<(uiVvbpTk#hydao&tof{0zdxT)t^BDjH%M2^2?VSKup|rprfM$whRQkd#13( z7f^ITpVVWeK}90JMq$YhwX6MEiIQ2i8rw|feQxKPh~em1>Co^JoWcUxZFYx$s!j|W 
z+)nrjnaf8E*G}&a7KiHa9kDTnpXq7fpRQ+j-Z)fA*gu7BQPb%N19U`}%BOhG0Vhxd zDC0Ha0xx~^qvgZJm^nG-g`SLJG2X;;*J@jb;T+Z z`q4jbpgHTYEkLw^7#g$@Kx+-UUILoC_r1|N&fc%VM21r$_KHQhd()BF6yI#EI|iGs z7|G_grMccauo;0^?uygF=f=F1`Q^kqFUqa1M>O2t^0Jp6DSi)SaE1N%GPauWc!Y92 zwc0u6zIkN#e%T@M_~QLt{VTex1=j-^ARsV_76QZ{ZCBVfTDBY)JbqV_L6`D@*2GX|Iqr_L zY&Jkx(whR+UYKz-ydFDTW(H`hy|oge$HWv9Vw2%Q0?jA4H^gJzg)22Qvr!_17kBj5 z1KI?PA-=CZr4n>|*s5N=1SV?Jgyr)OBN{lhse7qf zd3~&ur1H~r>)Apk=--h4!rzf|=kkA2#d-8@s!K=o<;=ZMuQzB4IfP`9ijBt{S=;v> z`#z;|imNq9Fm3Y+UF(bgimNO@MU&hxg@|1z@*cNgO&~gbm}zpM4Qa|n#=|wNRGdd< zpP2bRec%?CvjP}(k(OEww3EGYLT+Pn5Rjc{9DC5;5sgty&*INo9lyI9Yb)uDd;AKBP-$c;lGFKZEjhAnx2b=Im%=tFQX-N`(=qG^y`?V{a0 z8#3Gb-Vb|kK7AFLpQ71Nk$G3Si)I`ndBWm*y?d;$K-%talQ4DHsD(V38asMtrd@vE zuFHFJSV*#{X;sSJkx~i2VZzd)!KmfzJO7tQkw9F<4SrOJ`ZAE|0xVzEU(hpMhih!b zkS)PGeq|4 z>jTkWKLXm&m5ih$&O;cNcxJqE<%_lOr+8Lq>x*!gquYSexe$J*yQ;bZBqJQ;L?vjG=qpM!Nv{x2A& zu#CL?wnC9L2>hw-wmpq&v=InqwlePgM>lDG_^q0GpCwNyMT&4ohueb1w@jDP z84_`PnkJzCXPM&K+Odef_E!a$q8O5!xizT63h58M1{gG}q~+g>H{XZZ9#+dzk#HGK z7V79>ug0MPm!mX_yR3-1b>)1w$D^D^ANtAa8h|AMQtB z!`IYy1PUh3B%n@c-OwBG7n;Td4|p)jHQJ#q`I-=Fto+u_%>09|GF6!9Y-;~gV)HOd zV-T0amwmxo-Vgx!;&|z; z{EH_EPB!h2J%V+ip#!B_$<8|#=dI1<^GS2HUS7OIO~b^|LUqT?`NrI`8EnF9h57d+ zh3#hwda96eD@_}xi38Ky$uSNNkt=3x`a7OsO!c`Rn8iRnOy3X8b!pqpT`00@mlZ1uqvKrsBo-4MM221QoUdHYv z0yaf}ZJC^$1jPgroD=qvDx`y;(lFdwOusHv*+QbovATV@j=jxZy za4o=}9EaChhSml?fDfdf?PN2C#$w3$QKdEI0S*UDEmIz-%Gu_9$o~vZs)e&jA&27S zk47R^ePJ`YWuN1zuaz6<;v|oq{KuvFg>tm=8XtDVUlVqDLUaF7_f3JCw3#X8@NN7( zaS^>@sX<=U;gHQn$LxJtm(#ZHhQ8x%pZ_>_HgLDTLWg7)S|P2>l5pyt|D1QvxiSwe zH`kx^dd)QB_W=${u47`dc!P{R7KAc&$@>h6O^Nd}#!t@sC#WRLcjerjmEM!QZo0Io zj#w(OgUw$7VRb-uRxXHxrI&UT%aSvP-VZWUtu{fzAg7*?{v(~6%*uuD+Isz{57wPb zC`_0)N|+R*?)(+4Av=UZ*ZE*Eo4kPI{5wL24>#)A@+U&nHXfGWu723#m~muyZ*Onx z^5n|s$k0&fJ$%;3Ii@yBPa?Em8`9u?vP~b6ORG2c8%Y5ol|UBb?>2+CocE@tE-%t} zIv#T$o8&&67QJ0lsqPhP$O#dAHIHid;IxZc=eCYLUS`*XXcjjL&Qi)e+gF@k$diw%}(tXI_s!p_#jdWL^$8#TV7X;5!~q|z+x zwPDOmb@Z|uST(=G$-W8@$qss5-en5LrBZx|tAeS3nuIz!%1LU5CGK&BSFyT%c30(? 
z=5|fg=i|Kf;)%=jWcLe~&$_DJiZ4vEkD4O)?H~RUop9Lo0{tP~9X5@}M^9h-YoGAt zeK}JXvvmEQzzcV*R_(YVWfa_E>qb@>=*QnExZP`CTwbks=*%ded%r&1O1G?ku)OSB z50_od(miGQiRBE znBLZ{k8RxSZRRa2QM>oN5qQ;w%l;?|Rw`)>H+sc~JahVrZrLX$pNdjBy1p?=5$kfk zel+`haCtctzI`I+h0_bkrOAW*j(4No=}JxXyI7(5_lM1*Yc)x|CGx_KMs1Z(Y-h`y znx6RBwEWPI!z9}BeI|^O{l2}X_<-|a>49jU91`wiZta;VypVeUZ<&Haehb9!f3BdMgcyS zQOCW`ECB_BtJYf|5FwDiIIe`TmUz&Gmag5tRbF!eYx6=+Gm_E9X+gJRMDdxoZAA_& zPBgBVhx9$Mpdt}nV^W1_x>d2G;qTZ`Vf8)B@4})=Jo%Qm?3pi?3i?V)&JJOM2Iu$` zPQO}in!uqpf8Z}E(u}G%^5t??)MuBp%x-l1c{ylolB_z?BpL7JncY;ve9Ak7sZjrh zg>Bz;c_Dmv`PYQ4&HZcvBcpDU|(oUa&EW82*m!alZj(z&A=o2t&fNn7dQzD{YeOQ zkC8s;w>A;za%r!h|4I|neAS8u(MY#Xr(oA< z&Q`TEnW=0h3Oy}>Z@-=X!{MC}U0KKL!~1Z%uMHCp>dU^B-%f_d9;Z6{|2*&{tDZsp z+~s$7FQ02l&pUu^#!i20IX3ova(e4jg=C`AP^GxbIqphXIAPCdETgc-j?tvXQw6a6 zxgmvQHnXQk2IL<~S~qt7G_su8EcwaYlE`JM)&S#u)yI^1cVJ~vB`P$iNSdj`w9s|1 z-NDlN`0!ZW=>(a9o?{<0GL^I!m&+P$5L4mW2KlM2uVR)mHwSSgBS%YPgnXSlTyPN` zCJrg3|Lir_J2>xB8mZR^D%$^%E_@CnGr7__Jq>6r{AuFj!~kZwrz_}fGILBb&>xgo z3x|^@cNlTZGxZ^-%dHC;_tG7NsJ8?Y1+C1$NW^Nl`)8q)+8oI)S0bq56GRh=UAtqTl~x zS`$qd*Lgp6mnR!Km`$ae;@iz)=Zdqwy;It6{CVv0UIlXeg{P`a0cUH;RYsfZT6)rA zLS?ZKg@8Q46TR%7mxMhG5?l<=l@eu>uq6pax-I@FcS*{SJhwfgeO{UDV4RXEOWE1M zxZXBYQr{t{2aksoL6~#N88*`HpnA_s=<}6!?3nDUaw8Kz+rrU?V;`8zw*-5*0iL(r zX~G$1(ghjneN?vC8dI$KSA(l%Cjz%mB7OVdb)R=J{I1m6RD8Q z&$o;dMlZ#p(ybM`8$St%E8EaLf5&)oe+hxH7;-EhH5%l?WMktQ=o2z(dWgS)-A2`R6cu;?YuEZmMa(#V$p)U{nqi`kEnN5di#+`(LtW`Lht^SZWle$gWyuJ5| zx$hFZFp_w+A|Yyo`2&j^ond0=sr^5Wb3B591H;2;GwQ!%vE5$PX#T?tE=k=IqkG=7 zRwe0DSDSFSyycMX3q8r_PMmo}ycyL2MjlKg_3ikinW4QsX7pB#ht2dwz!u5qvv84n zY_!^-HjJKXFx$(74n=R(DEQIYZ&%eC9;IWqO?&m+$xk7_ZGT=t*`oQ(T6ab4KDxv= zLoSCvA$WKa)UK`n_IhCw#If5+zP52RPo}>%SazNaHred#Yzmx01Q<&4K|)ER-s(Wp z*jFom)@4m+%~iaMbc$wGVJcNDX8-eg+^q|LLB~!w0w*dUBojxBCgI|IsKg5rbaWTd z^W>$J@J*Y~QDckPPyO6$J&y+<|CnsQ@-u)rKXqVwI)gfnZNl7GDD>UFJ~BfG zsimS&!LRkNKE=t=oqrz6_GhKe(LERHl==x>J}(;0G3#~X>~}2gr(n^-q4x+&Yhaye zs0-RUbZ__KTCR9^3Y9!tKE>?6x?HtF0X8iOR=oy=gR=#WAL87=n;}g(wF<;a^OTtz 
z1KqpQlL~*lGCf_=R-dx(?2(3Ve-;Mc;#}=drTFCy)Q_?4fz2RdT3rNrmu276omO z^DKWXa{Yo@lbUlig*clsw7>B51;z&Z?w?IT^ED>`D%BLnd-;!{0m!e2|IbgAU986= z`w=YplOG5+_>Vh3rvuye(G7$<51)!axV%9cnfC$sp<&#CKo|9LTd8bhRK;9 z2^UH6Cx>H({nz6uMfU)=-TyK)b@E$F2v`*3P;;5Ta79aN>W-ARKa49d( zOIt`yaydWwYc4ZQdTKocA&E z2*0kg=`b=_)t|=-pJhon0FgxntD6i>wqzMi<$CF%bMZTKW1CC~stG1`gcz6-*ci-_`rvtkpOS+iEGg|#chI)T)Hw|hZ2=>~bZZ)6Un;Cd|mmntLDwir` zf|$2bwQ~RAv!|Y0ZRn=4;8h{Da)0SY>?LYJR+e5ha(C%!3UPIB(12TF2X?uHU^-Z| zxS?X!h^>E}`zKyw1T3_Azdd!LcR8iBkovy;)U5Kj+)kXY>>}Teyl{PI6Ol^V?n-`{ zIif4I_F%nnCx!NXT&jDAISMm+`J;odJP9|Al^G==-I(OmBYiZ#_Gpo4Wi)N?o^as`1N(hz{&={~_ zU~6V7gL+2Z?wSGS-p~`U;+I5bfbrFynA)#2s5D|_snInB58>{4B%Eyt;objw*htDk z2lWMVwy;Liu?pNBc+-$Qa9X_l;;C0^f;GY=v}8;vd5A#~&htVO9obcSBh_CxFch;| zTl`>qyt1lH0xUU<8oD7fxY)OH^F8FAb;JX0fi8<}NsBp8ea69`Fn0D{kNL96K}XIY zwkMSv9T2ARb}wf|rTF0M`KgDq<2dc&xw*a{Fg0tu!`-Bf zEWP^FE5cM(iz`biTXTBWwkn?k7E2oq%Sw47IxlJ~WilY|)&GFXBXHa&BvLis_-Zi} z#qTaUeTa0j8x!)e8hJ{#tj|O})}`#`h?OLnyvFPFf1c23d55vay%H;V#%yW|T&B!s zqG|^I>7}-u==Au8l($J{l|x35VVE|t;iwjK9_5ZjXB#1v+{*+a}E%|y8hhkcP zsuJ7W7BEaQm5G#f);r70=j+8OB7KEn&fi6zEGnC-&c7A1A)jGAxy|*ce0ig4RP5Bk z(yuQp6oU4JGN0&c9m7R9p^?ssT{BC^dHlMHOL*uxgPdj}#L7xZE!aK6q)baAJ}W$q zuxzQrFeh7&k+sJK_*LP1rj5&UwO)fie7>i)aY(!35+|+6Nm?T9Xl~}6P;U5RnUfG* z#zw5<_d&rTz!puH-~218`RQKj!F{xq`LVB!t{ZVw&Q7Fc=N2hD$~Gy;OA(I0e`D#3 z5ah2GEV38uOMAUt86~#x5X?b&d1!yT&D->?<I>bp>{JM>29gQ*StP1s}5 z@-3+uNmJm}S#18*^J`p(QWL5iVJkTz+`Y#-d-7MtkXC{$_IWZbF~kCCh3KNtapSj&y=8v|vK=)=*U6 zZg+y|TYbDicU9@ft1a$jck+gqxu5lO8B!;MAWdty-S{eiCn{fo*AY zXrzaHg2zCzv3^(y$F`gW(ijt|gnFE6z(1B$z&JdsT1speFM0M+U7)uZCAk{HYK(#3 zjouL>&3kS4yjW7jo-G0Iw0uXoLu@H6830wKjaUof5w&YD>Ghl1++3)Nfqni5p0360 z;riimVtvIAj{aZ%?fo!_Ya=Vm8Rx1*ya;T^Uj$;Q~QM1p_x2F zq$)|UWu_eov1!=*&A%{b(Wo?e++k) zPj0tL*dB}3%%6BbPHS*=xVn^N7U0>yznhKC5=TaOu@N>lZ}G&Nuao<>Db4N)3p-~j z1QgU0mt3A5#w(OH;T{+VG?zK~xk+-Z93qK|LkpKc&&b0vZy+K6QYbvrFzzcmh!eqK zm60d6m}T-gDg{~n5E#UV&Kb89Mj-=BEJZsdm8#ztEUPfJbu%E>mlRXF8kMhm*EKYq*QZF_Dy 
z;5q%2d;Df6KkggfIEG!+-8wtg#J>uI-}GzwabMKr^jZ0;?w5Rb7Dmzpq4FH)x}CDt ze$4*+?XSDf>0YtI8%dVu)M1x?h?f!uofn@pDaujK&$`H#J=us+H{WCX?l6HZ@fdS` zZ0jc;+Mk;pg&)zcnhoC~)8BBekLioLjzCN){=kUl4)mhWy5JsDkaqu%-<2i!<#PN; zQ_3A~xsCO1o9soI(p7T2r^Pa4w&K3uIgl7Fw_E!?a-00=wRn#iA$H-zW=Oc1J8msD zDsPdHwZA9r`~A+bhMBd5(|n}EITf+HXCw=x#3`lO3cj&%iHsQ%+4mc-rn zbaIHdhb|{rQiQ^lc3#!nzVt8EE0e9k+DfAtzBhb<*xUPaT=9Bab}0G?@L=9cUbGKb zf@w-K;C~Dzfisw5uFk|m@QgARkIpwT0LQz!dX)pqhW1Nu2$sQ^S&TgsUA7}lWW4|V(k(0TNYc*i=~qm3g-1Ty8Ebk+Ub6P9 zTwir^YaC4qB8l{ev$i>fgno=dkm3r<&&wTN4L5!2dMbtRVeHWN?WV!b^k7X}-`VXP zyL>{w<50R}l1lmV=|Z8Pl^;(^Jg8O%_|rCc@FkRuu1I>kOh-GT4@k*qRw9XMUmto= zN&xPe5OXr62+gyc4;OZn#W^piXI12#U-ZYTBlCqI_Zu9((Za#74Ih&dN9B%xj{^0} zmwUbrzT)kkcQj6D7dwU4;>5Ro((N~17hL9nju|nHg~m2xJ*JMOEatgSf_I_5i3sV3KxWKd(mlnEvc}8+DP<8 zE~>lQ3(^DVd&G($u(1{XZ3^6~&S68HO?DT=A!s&A* zHs%KGvu6(+HWn+u+@Un#A(SB2Szsr-oebd0kf6zqGUh}?{}11wScpe0c9+YibcbE3 zM#@1UieGLCu zuNfvwGgikie`9>~-Iw5SHU~|0=KN#0J)*0D`>ofh=iw8G)Oti}xY-lnX|F=5wB)Nr zct;%J6A97FSp9jqhOPP1aq8zr(=-7>-sY{5=v@D1P%N1gKQ8O2aT&VZ*6S~6$`9@D z60Y`&k(Wb;8(Obbx#dOSg|4(c+3ONgFGk&}ikT6Vy5aB7mSB7dTDKGAIVqEXQ1uz) znDnM7Iq(N2S$xbi#%410kilhh5bBxec~uDdzqh+;`EuIonoIua^{$S_Bc7lv;kVZ& z@{T!h#=ktt;66zlCi_aAQv!SqEs&~S9M4eKG!fsKyl-udndU+>@wwdlXmCLhCP2Y z_y;ZoGl2WDh0q=zzbBsIWxF?sI{o4?<_1;gqgl8qWj)>C6zn-ghJR`6k|*H%2A432%}fzug(b=_ss_T1VwiZjP0e^2zIMtBBi?{66)xoq+KG z=}(yp&_dK9PmlQu9$lu+&^^cAeR9>C>Qd`ksnVrmC!{9Pb+%7qWbDTTd zsy{6G-|xz|P9rHwl>S912Ptn{wV>CrExyX!lXq5CJp0{Ud!vPGa;0EjZ9hh+PWF>` z&!kQzA2~Z_XyCc0@tleBYR-X)8$#r$-FYKAz49@6roYC}qbQno0XXMzkT|Z{+d8tO z4LT><=u6#_+~PhNdY4#JU)9Z_8e@x>^B<9VGI2WI)s<|z=?tG3lMJ$HpK#-BiZEGi zFiNn{@oJjziC(W5sf{6wR0*JEpcK?eD3)J|7+Zrb&!-;toNbMOe_Hpv`RL!T%bilf z?6NSg(Sl^C{5LFLnfl0KPDc2P(ax=`v%0U>y^_eb-%!o_N<5nGq#S9Q`L?6p7s0K` zmL`buw!PukV!S095fuKuJ1wtnM}2c6oeAVh&?c;HdA?IN2T4>zhD1vfc_-z=b}j~a?Q<`-Pk>x1q%AYW=`RDmom@*q-S zg%5XI*f;A0=3CCYhp(1}hDwa07-gq@px6Z^V#+)h^+Rg^gpka4`3K_#oRa^3JCSj_ zE5mv&ECpZQX=}^7EiH|Y_<(^PXY!*8a96_@x?N$^2g^}O{})45z7sZ-@=H7`gzq5^4@e1_7i&JZ30vKPM2XYm!zy6Kkl)< 
z%bFMKd~@5lqsr-4tjCGosz+_S$M93gzVvgi=W=RmB*0KRd=^4(`P; z&6*I0AzuMI8TgF!J5&&L$i}Vrmi-|%q&<7BgCyaEe{w-KH0FB_E}~Z_M{=sqX70tJ zyoTr5r^5$5DEUxNaFTgECwKU?wmwN;(bxKGXsp1_^5~cZx!fA90Vl=21bd+LL_PO`}w>HMtHCz0=v1pRsR&tL1TK_l3q=dVC?tiua zoQ+;lv&)QEf09jjaUnHn+S>oGew4chxxRjx_TsLM?z`!s7u7Nk126x)v_7)__l|Al zxpQyyUbcFyw|e2~dmWGet;(q9)_;9YD1B|#lGMxf?PX;}r*C~)yei5J*pJX%-R}Jv zSStUumBOBpP5|5MJD2+D=f2{xW5{|HZcc5j0sumV6FPMCVaZ6P6vWS?j`)#dv(r2db)w&~Uz6aP@n`}ZFwU>(Bgvx)w$-rnz?zb)X<{JvK?dsbdn zn=7{*xXi#x?UrD4&W+FE_m7|M4F>d^-L4n8S!dGP)4&$S)93%UPv4<@!Q|Gt zxho%4$Sr(k@p`v;lYZ3UmOEL?-u&Rtbr3t<_hm9~>T9F@sY|y>-%er=-?3S_ecjEx zJCa<9!BJYTLaaBJUd)M{|NZFu#q%TPiQljOcJ#!r-N2E`o29=ZFBS%PXPW)zk&29V zpFJgZ_gvAn-)>%6-@~o%oV)d|RTC%M0;ec~gMo7%a}8e4iBUgke(rrn`sN4kGI^<^L46EhQ=w@ThT7;;f8>k)9aNdM;>_s=gL zq*t%{@0Ss&lTmk?>+X)$yfxuAds~+AUe0@U!r%H$l-6zEy*jTobidwP`1N%|cJ|-Z zd*xPJPqMD=j&j_b`E1LYvan#T-LEJ3A66^9ZCt!Ht96UhRFRJ9i&N9h7~8OqsU)Af zR%$f!oNno|tl;IBZszP-6qddRcrg>ho5}U&;WMXoGXKUA^=8r6cW&Rm>5$G~8P!{N{12d6(|Qb8pw%*gReYJiO=I+mBC! z>zS4I{x8ZGHo-orA(Ol9oqq9b`PSb{{=J!94RWQEuX1fZij87Q@Dd@etSbBDDoI?aDgYafOi)j zSau^}9b^*zKoWQqA9%YZ!|bHyoxp_;pt}vb6+z1|fHzt)IYbyZD-~0t!W18>I0TjETz?*UMnt6+` z0C48+>_vfdY{k488qxLt{`};x|KV)y0WxG+hQ?~fwY<3-Ss~_lXFI=Z+f;gSVI^?y z_Rl;upc*rm!{NM1oY%H{to;UDf-wth-my~Wcj@Qn-85Loo0|$+SpWuG`oX>iu^X(w z3l6}-R}?{4Yl6W7K2R2cfYKA!fGZ)uV1+X{`GVLEvY^}n0k0rI1L6dlJ)2J{r~bcoV$0K-QxKON`s!RelF{r5}E*EWq@1& literal 0 HcmV?d00001 diff --git a/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..9c483b986a62a9b87d09527e889b6413df712d64 GIT binary patch literal 66241 zcmeEu1yG#Zw&ssR0s#`-g9i=nmIMm{0tB}NcXw+b0fIIXAh<&a7Tn!}yE~1$Ytu~U zoOADe=e$?%-8VBeQ&V%hYu8^izwEWvx4ymiT5JFJ)A!2&&TA=IDF6Wh0WgApf%`e& zC4i2KiiV1Uj)sPYfr0)26AugX;X_Q~$4{{F$Vn(E$Vte^sA$<3sHj~ZSGP%CXRm0Cj`T2u;VEZ5 z`Maq69d#e(`r(F*BJ;(k+MzF>%@!rOjf|(g2wVD$hftk)b#bY3JXo72YYmJP78#|^ 
zYD|QcT|2njI40vZ8X*BrWaBrer4V);^9z!BjCASTMU+_Uuf1a|uP!6W>yPJbxVkL) z#K)+QraB&a3ll_HH82xJeR*v3L5RT0m39rrGhpfHrZ9uD+1}TeWcx(co;XOcl=J%= zx$r&~WNX#An&f~kMxv(#`TE5vnPitTk0ez&ZqFLP*mz7| zx{`*ir*{=NIOzFxO{Q>4!3A-DKH%LfG-KX&b6d}7;3rk z!a!Z-LfEFoE6-QTjcW9&;4Qd|f<*fs!0HxFcd~vxd;Q{7Z?&kljZs)6UrOzEm!04+ zYG-z;u;zYlV~}|C!kUCg1FC8?EiK~9Ql@o@LEd8f)&d`rDK_V%A)J@?WK4A~xfga{ zTTvpV!sB`b1RTf6Yre;&;d*g>Q?#}=^``4w?+7NUj9BRZrr|4B%<1b)y9{7Cic%b> z1qln3xHXMwOzamPP-;g@Mkq@#8GJT9MLS8y8hELBEIAZ@$Sv|%o`T5QDx;Rw)0VkB z-jt}f2JmVMDl!JOc@|rcP((};%z9VlI~O*^IBpgUJm6#rNZ>7t#*m2cOe=bB?%^6L z%CPg~uIzLVoH^6?WJXUGh$Ua6ivDDSY*XL`g_$Fw@qz` zvW%UpQR?*oV7*20bds_&zKM7$v(&Q2`tb#G5>6j!SHC)Dp{yFRM{>v(scM_B~njFJ~4hm>_uRE^% zA=0{(oxZ#aBzxZ3-+FGjo9kmDu6p^PsJ?t`4XT-%Sa0TcFmT2&EnWHxk*${Tbs(5B zGyy$p9%B{ykInU-KDh@>hG+w^nhbhuX<6#aBgiFMTQ_HGWD_xe3fSr_Uj%%+o5AT2hUe>v_y&vp|frZcu~9 z>^A^#t)hijVx#S75mlB4ZVavs(81OXsb5mG^?#)@V&)J)%dt{iC64b(6pgW=itd7S zd1f&pmogg8RcsPH9WtCP7goe`rF<+*iutnyFLH-ztNp28Fal~VJC?L#u9w;vl-Y&W zc$9^0bksy_mzCD6U+1Qn82k1M!h|{O>9Jx+w6%15sX4){VDGWEoKKl7-DDS&4l)&W zG&*q)gug&AE%-c1aW9+#i+7|!_FE7=jR5ZvwfDBB@4R}HpRa=b?2iWGRbBR03bLR! 
z7+o!Fpm|J7=F0#naxK{X8LbiKbE*h5s?Ctnf*oed)grWR#M!;AQOyBkhmD2yxq|0L zu`#6n(GAIV=mUW5oX%BM-@beyJy@hpTR30jYcyvW`%_J#zy`=uT)gibdTa< z3Uq1BMb@(&f0iQRaWOIg(e_u9X2%ghs}Hqr<*APr%Vzp$`Y^2w#8IE?6U{re47&07 z9bI-e;e%UYWTTWi6(m73@2hVG4D0(_bQ=XXLx8hHroa-40@g@fl8w>DT{pUeWpfo< z0U|Y()Z~qFqpD+fULU=@EZ`f;G72F?LB~FIV;sn@wME%Z6R_RgExw)97itI-f!BqxJ>pjW2!6>6BSFx z1yYeaIKaz%rM@|Mk?w>i)Lg7W@{3E72JAYP?74mhwXe|7;~f>VMf{0#Ip^@ava-Zg zp2){JQ4aw-i7x4H#iB#2_;*8Xcj|i4le6TGZ-QYXR;mp@mQj^a_x6m9KMeUasLMi{ zuNhM5?tvewQ?L+=lY8JZ@2Qy{23V&uJaJ5I#Bi($WkQB|a0b@Oz-4OVClAJT4Z(?|k_A?XLY|fO9gfcHv?-`r|ENz*_HJZoNL>4?i5}OjvrNSEmj+~=#|=(PoSzbNfPls_W-8i z1HJFD(jY-$YXgC~8V3>k=_#RX$=vxXb%mqJatNzh?*=X==ho7P&$49ST6}9l-AR3) z`U#N5O!|r_zB}#?a12higI@goq-f zSHzzvN#X(jH=2s(2uEYAgB_=RqO(1WZ*1>eUZu??{^G}iw(R#BHxu!l>B{7=kF9G+ zS8vg@Hzn@@5f#`Cp8E)B=-oZg6yR90%#PAkpVoFm4OTHGg2hropX#C90~wAl?tvEm z<~CnJTS4~gowUp0tie#2N8a5TbHY9FiMgfujLl5{3XSke=^j8=_XBO|3^yM=Vn0fM zl=R1mjHj9RKx+Oy;Qe3&wDRDH{qA8RAq=4th6ifHU)H}N`;1bPxRL#{YWEzm)bT zK>Q17EC2GFf050$FQ{iDSL4gJf;6eQx_?43%q@Oj2de+t1J?J_p_1^WPuRdZ*vdqTqK7{ zc}7$q43Fpxg$nDXlHky)N!_e0a>3sunfGpq(|<7 ziegslxQ$)spP#5p--k3a+c=g+@RK>Xr$36W1>=E!wl3cTw-1_O(K>32G5K?rS|X|W zX0p}w0VZI(*4cI%(et2H7qkvhY``B>so#7jsPh(0f(pwMhlYvx7S+tsVLidnY}&~B;GUM2AuqLY^A}kbDoW?gTVIo^ zU*=5kc(Y4Z5t5|rS0=w&=03oCvc)w%Ud1;ln3JhHt<&lZm)%7}M=-5JeNc?BLjTG3 zY);thMge_^&4)Rx=_(B+Qif-hZfF@p_Gd-fnqYaCfTw5j&hPZu&sHiJ{^ZoqWFCn_g-Omv$xZfvO08eETlt$w9&1DS zVERCBdij`y2~`aTzNg6R5-Q0M{In0p4`s$Hh$<~Dg)TG?v0@vmqnn>QStgr;*H9 z<7j*xLpH%5&#e`#)#TJ}AI@7$xm>aemJwT-oLqi^#9gC~jm2#gN#x9GZ_O48R)N3s zG+&&J&XE!sJ#OPTd7bop*^ml!n9a{Hx_4JcQ!V$PPJNswpy&d3h0out_|eik>&Ta?9eP{FKyss@?M zfJD|VbGoG{o&V*VN!i+oL^nMwwpFtQOb*7UBj`JM%@#nT7}$z6w}ES0Fi}Hu;V#1a zW#8wvXd}!>vZnbTo3hZGvymn2`4WX1>L68QZO?r4HDfCRJ95BY~Wi^_8)l1l~V{D}J zT}*7LI*eCH_j+x*sP?}3%l6^i!h&{>z{W_3{n z&?JJCyQfZiWxOBp)pSJ&gM`=c4;Y)rSu%X|qM^~)iJERLTx#gc%^qMu*IHLc#r#1U zQY=k&^A;_=6QV2NpMAkMS&ax|6*Z#|M{1=F{lXeZW8UlgE>**GqR;k;`IH{rxD%bG z+D02)oF=(=m_J@*o^T{N#8qt)5bt-$Z?xhMKYbWxWQ8k{eAb$d{buA`$QH+p 
zw=SpQaZcmBo|Pr1@0{WbafWZ3B_6dFvK60F>rps+2U&&WtIK>1NZr(kx--AE+@Xk$ z!Gu@t0lT-zeCKekdz4L~w;&?1n%purV!DA02Om zRX?7isyESH>5W#c^VS++UvikvO+AbPiGW>Zvg@)&EXuBLL5!p^1sRAdGZk6mU6 z^8m{Rd-GcER@%jbrh8x{ZenbT!}}hf<9qchs&@$st16C@*2pp^TX{#oK)n1G>09A0 ztE1vLn|#INpA2&i(%jfjIM04QLY1b8+LeUW8s^S-TicB#ti7b}#;S-RPxn>)nTJH{ ze4f7%UgF>mv0Qp76=K9$c+|~=`e291MmZ1JG}2DuLU?Q^frgJQ}I|=EoP&{kB*WIJ#5qxDNyIqFQ-uME)09ZLt;oE0%Elr<*3ixfWA z`d*G?DIgTGoE(+fXxUI~*hzWv+)WJDR&ozy$KxTCZ98AoaBvIx^Fr3iD5MpWLeh+< zzHSbNen2xjiIC1!H~_t-go(l9RQAnZzjIzCN%)%vT3aeaXiOJmEl@E@BjcuGG0;}C z;ccqO(BC~&mT+XPm!^#yl-+bf&Zqdgpolx4|C#m&iv?heOW|#QYJW3{2NN8&K9}=U z8@F1-7+>8G`eH0i+15h#jpnLODer1mqA7Y)Fg&G!L$*wSzE+a+CNg^s^NgWLx+-u= zP)|QZY2(6XF^zrO#^zJ~0=-F)IokUZFj|$54d}IeeA8PBFA5WPTUrRE{L*BB*40PJ z##%W~d95z@B_YF*72+f$27v4gwc`7jqG6)bCYpJHN2GP#>AUL@_xD(R>&a<4h0k2c{x! zlFEGY6>;Eme64(HfRYNd3-x@8^hx8RERxL9k#o)1zT~R9Hm-`0wnB(w!r-Sk4_)2e z*LIErr|T3o6}WHo(iy4ac9X0{6XE*rlX#OsqQuD`&7}!|lL%vHyvn1l4q#te;+r&4 z-Guv(ykdMslbVYhVs?kV>$;KLjA~|ld|MYefbPHr3w_22X__R*cE0%54KC4Qez-Qw zXHF8BMIK%`HQRMUA@45*w(V_{S&QkiPXzWJ=t94y;`*I#&`r zjMf(=JBX$Nwx~AFBj+8R9PH56o@oj3$NS_9?1GcPQ^&<}3dfI%T=@dIbL&y>ii=}Z zBBg4hSgUEs3?DZ3ONZIH6`%Mrf2!|KNPLVgBu=r>ESbvzL-vS%L`+@Oh!NMmPqDhJTK7jQN!Q0|Ox48RSoHSA zolDM)pl~tIeUaW?II2+pj4BIjII0d~a5uN%4n7gW?3n)1dMdGdFtl&e;@XsEEM;l* zM}umHo}~wJ6z#Eh9U37)G_CRLpABJN-6d~sr(`AHWM(HC4Kmr=d%vN?+U2?mdRfsN zA!F!eRr|Taq3l^Zc4PNFpjX+ZYa5x6G5Ze)6ibS&#vUJo54IDJ=bNGy9yMQ1?I>2d zRC2G)mM`E+$#;z|nB4;}*Vi*YRg&oGZI-L)dRLV;N zY8amDKe$@-Y0Pnwv30`dN53?*8MsMMf;cFCb5D&-U#dChjndQ1lb=HBeF78-LbcHR zdUmf&;VZbSEy{NNay89NU*<3M3l#K0W=L))V1_198!@f7@t$k+W-gH~*oH;&0-*Tu=`Ri3^}lkvfzHFe1pGl*YhM-6Z)^&< z-y#dycpZ67Hx%!#r!1u&ZohfgFujuJTiZ4Ilfi6mbZ!Pw8lr5&cS9 zb7y{Q<8|`us=nW^Qh_VUU+>xq^N8?gD^4=P*#exN);f;DyrCWLH(v8Tl?EZe$A32( z+5Fj#zll%v%(aR0ckv@+58;kE8A-Re&BZMwBZT(FO{DzP@kTY|H}M&(r|;quSfcx{MU z>8Tr^3`#@b{yqK{<^5k+^O;<8%#*g~qF#U1K+p&HAQvIAA5)(;npAe#+nt7%P=Dyo zKy=o1&>Rj&0M#?+f1`qr^}ngWsDC(8V&*oh_3{@0I`I%Rat^-$ssCOBSMyDpJ?ERQ 
zXZc&{4_4TDo3@u(d{+MO02;c(xC~QY{=wA`w3KhK=2a6J5y`7Xec-lCJGeG=Rn)n; zRl#TXs=^XR8RU(^06f-gHhwW$ub#Dhc?*&EiGlFlFHGuI>se9?va_cYtS5sNTtTl> z;V^l8*-JHqmXJ!?!{0sTdY<#l_3JS=6~G7EI`M9TUsNgyh(F%-`ZVxp^ik_N*~d8s zSFs#E)+~Ckh9_*W9zRhZUbVYO7zQ$Gd#q z@4h?l-h3x4y_&+%HYsK?!S;e+mGP}jw8@$3A+*slN|&@l3`a+Cm7P!e=vn|Q8+^>u zvS(t`ewVf;Wmy)8J19C?upZ{H~aNMbBRx^{A&(7Uwf7 zn7G|2n6wxm+8#G4nt^hTQdc*_h94H_=~FK0F6BqR5J5n;-Vd+YJ}a(kSF9M_%PcRg z3R#?e`YD8`1>fYm+n}Q3LpSNhyUm?@Kr-`uT)_62EI!DGatnr+Ja0}gEa|XYC7Z1t zRhk+ae=U7lob(4{Q|-eCAP@NPdXm3YY%%d&EnY-ZKy-iJoGz2us`2FW>GCO8I~;j8 zHn55uwoB>Yj9Fk&IIHJ3zbjPI#9TR_b`BP$-YUpj(v;x4u-?WN$$Fh3+~@P0R?ovR zwL8LXH^aS%dxSnmm{ShVI5&Cqd70&dTi9!x;CtYG!QmaTwXO(YZ)d?< zvij{Xa{^S!YEYGu9my11s{iHaE#MgTe!WSr(c++zFDR$Bx}jkrr@Y3}s^wURn`NXF z%~$l2%6>>8c5Fn`lh35Nq`!MchY^YH{>mS0H`#dv*5%_1?UnbS~@U_b-_?k$V@*eo0eh-Ywg#cUsb`EVSk!qS~ zO{fs9NmO1xo57+;8xtX~2fBiV-gUtY^0HS`*W0qUcTuM@dUo`XMx9CLoMA&_`)5~O z+scazhH>h94}H<~^XSvE-6&s=je2^f2+7x1V<(wLPhe&!syc#`&@y%-D#u4OZ;A9S z1XB$|obJ+us?SSGwlk6}SMCUU*V$OPlZY7J>_;;@sp*7~;cyKnj%J38&{heUDm%I+ zHRt!}xseDBU~7!c-bGu^U5>LJsIezbs`zJPLZ`b$I5Egfk|_O+%gTq-ExD~?<(GIwfkPqsiADPUy`|w zAcr)W-^iB`U}H}hiz3PWy|pcY5GmGc%Uw&!kBB_c^e=IQY7kPA$y~dHrdV!xvz=L) z+i$$;z9gEY6?Qw74&-t_!&K_k*;t8~AuJ0ZUNbt@%OKy+{xQJq`nKV1Ic;$nVrkjk za(5~D>1cMKSus_F8?EezsF&HFTVub3`J?Zkr#}*fz(c^i^|t0EbK29ETn%GVPOB%m zSChOCHgp@G0^#{@TzQK&Y`??RrA^hcAreJWXUxsZQfEz5DZjoO`wiJmbXwUal3Nal)}Kvy|l9`Q))V?3iYF{46J zmx@qypW3rR(*s$>wR_;@gVeS08xhK_d!TsJQW8(CYef?8EyDkihEI0dZfQ$<`8fM+ zS5B(*`|s(_w5k4y^7T~Yw4r#4G|AM6{oX)HsS`6cBH&XhzqAZ z^=kXP7zRQ){z0~>P;%%L`=z)dbb)~2?E-d#=WCFG?P-KYUwD?lC279qe zoK8^^DHqE{^=)F}hLlD!_m2sYm5nA~{@{j%@`)~ddA!#P{$@z#o zJ;b?5R#TkD9zmMiKFyZW~tQm z5xXjt?#AK5mDFlFY_eO}wfPn7nC7CSnp1MsX0O)YBt#|g4+_Rajwsze-I$R9YR@+&emV%c%YuKy^StX_4lC|rg%=j< zOOs(dQj}=hdAj50a=Vst+iVAhHJ=o%3avvrqM(J2JtwD85lu7S86j-v*f_g&_rUZT z>O$t|elY<;p=@5{2`Z;pA1U$kb{mYAZmNyY`k}}A7PD$#XGi40I$_sG7qjitIsKh8 zx)YGFcz}t#0il^+fauIxT5`0~k7)d?PiTa=_u*Fp`Mfzh{_KX~vlOXOX1RT~-i=u@ 
zce^2*h~NlR1R^|ywmH!p&u?aee4lx$1H{S?7|bhf$B61;=~RK<3_0tU(WsoVwC~lb z+cq_YNpx-GN2@YCUTsn~QkunVKq5SjcXi3T08Rl`PX%GPZJnxh_8N`^l0dhPV@t`)+Tn`Oxvj(E zvjXA+NLSB#wN#Y-7^alu+oq$G!2DI-2_7Qr5-Llc%3He5x}UX=bi@XtQtiWhxPc+W zr3DKgtkSyrlww-?cbe0VoGfAaPwxSDd~K0aU3^~tQN~IB5oK-!)pze1$pO3_fH5-! z-pah>3v>Neb{k(UH<*8pza6E(^D#Ziq0cPte8!l^Fx;Yel`s0VhkfzK@Y~EAp1uf0 zD6;*Yr1M4E@h0}Ie<_qEOEhTZ+xs;71DzqK?t<6z$#?ve0ajX|?qsop2O+ZG77ot2 zvIw%=ZO&nh7~eACEB4=?90D*ibkx!MPz~EST?iRDFJPw4M>^!4-f?M-D@QA<8GLyc z6FT#i0()|8vVzt!qY5??-Z)E9z;px>Z4S4c!vA&;bh1Le@<^O)*Yyxg=>_xOg0Ie? z11^<#N!82?#7Z&MEYk4d&;O!&T#^^m1V%t7VA3X%@&$IkMnT61RAe2?d!6L4V1=I9?% zvP%ChJSKn@C63#(Cso{gIbV<6QQ> z3#D1CrjMM1KTqKU?>8pt3E3#Kv}CgcXK& z-h7$ZY-z)uuM9rk$yq1uff0aVZ!C1(V%j~BAKJ2I8Cyj$s`}F-r;`JcJ(RKba>tw} zN^G%tbEJ2#H6P5(_v2(=|Xiv4JuJ=@iu^=1AZxEwQrReWr_ z(Hyg}$_~yZGj|n%gUtF3xAKq=B>N8t-{Wx`Q?ywA6TM2!m z0S2F=M`@UQDDNYroO0Kmw_Kaw14kbfy$7pT<&|$m48l8bDX$((Y2*34dq#2L9YW}s zT-xp6ZGP_4#lBj9F0g2*#(v54{Ot8T;8_oS!z$L=L_T_C?G*aHJ?m$aA$Dh(rShR- zMN+Ox$|w@p&;4?LrT;BCL69f7d}u6 zyokN-F9SM0gRkfqJcn{Ceyev62<>s0f)=|*!fqk>QH6@dva3rE=(F;+FD|m)#q>ol zp0B%z8K)7eJofAIOxqPFulN!CI;Fp`k=SEQI9P0RL%#>qmIsf&oY39_X$c@Y_>KQx z{~h8ilhz?BR6f#iaG|9f+LtNbod}vJQD}sHn$rHh^L(2`buz=w9>2kOZiyVYWB_;l zfIiHVx6TyJ@eo`6@cpOFCc_Wgx@K`9m!87!o}+KC%O5mBW5trr^p6yL=Y%kM`m-sr z{O*{X--h5z0Yr##A>;FZWAVQo2d2*4obMj($|nJDzN*0>k5rqHLI#iYN_q1$zp^{j z#ePiEt`u=CA#KJT^mf!Gx1(UL2O(bt3o9a{YRhfG2HYQ1EsO$htk0Lh>7te4(m(I= z6q=`^hE#8Pzn{(AfLerZm{Mc&B?`9HW&=w5`EY~um3FXe1SkV^id5uvNplnH;Hki` zaK8Uvr26BuFiKqK671dQW_2t}Mt3cV=52@r(c(&xcKo|jX@kwaP1IMr9aWCzlWGKn zmvL7nP4zK-?^Iea3&OjpPQ-4`_obh9%AuFKhJ+sM=XpCpZA|wK2YNl7O)U~(QJK=s z@5c98Q8|%E0~odh)7omqu3k217M`wR6Dwj$7Zrnve6ekxLp!|TWgH+ZFNPf1P5Kz} ziBT#DFv4Lr zXNmlwER7zoM~5Qm$}@Y85sHl-GiuHOEdnPaE|#7P(6iTzp@-B?>!Nw2StXr90|Z?w zB~IuKKXFU#@tSOzL}C03yAkKEdz4R-ipy8l;RQ*vFQL2jnGN|SSUr^r4`Bq)fm3ii+CZemKBm2=u#UO$X0E6QhRgQFs`^^&9J5t%2y733r; zq{s}sAmN4)-2;*Q^hJ)4B@_)yz9#mutl>*(pBU!CdRYx$;c`$Js)Nwkp+3A|z|>1# 
zvA%eV6Vz-0xmLG}>?9SUH7nd?Q59$LzBBLe#b5xO>FXU?ZZvvBUVl-R`>qV~eAO1u zR5?ia^C~yZ<*+Rp)D#0=W2U(61mRRHXkoF*2y=OQbF41*n7S(M%uq4jMCqbj2D3w7 zy3@z5h~&K8ubt2_Vf2o<9e{avO$9-D3|Dqt3089Iz8s|PsMN_Oji+tvahGGLDm>4_ z-|V{vsfoy}n6E^qGH5yl?Xvatxvtlqc6V$GtC|+2A0tp$cX+P4r7;#nKPdSS0hy;b zpY}owF1F!Gnz4e=J!XvCDj7)o8jH5=4|;IHsv}q9U$VTepEJ!wUq7O$ z=~6c1ai9s$^>;u2k2356p@-Mc>KN|)OPbG&E1k~&GFIh8mUVR!diuUDwvX9P1f221 zEemz;0V~Pe9Rl3BK5lkyd93cl=h=v)>≠9$*ibtb+O0Z@DRX)1mcQ7%A-o>rC`^ z0$(eefL;UDYM&|49ZY ztE9v$h$ErKx($d*^=yFsTKls)EwCc9BgpuT-iz|DkH{6chOq` z05S74fcB%Hx{fV2|Fa}4!PJFH;FBU^&wAR01ON7H3A}XCVUZY?LGHl^~H6)Pi>*Gz|taqcxg5q#%~o;%dsr-$Z)zuu zT|Z2-*`GOG5=(WwQVCBshFFgo14x$xAHhngAQE^uzL;edi^e924PC6s2bO$%rR*I- zg9G-ul^FE89NJ8tfQSCE%cM67lXB+M0pZDRiY$I^|6aOBNb;xW{evJjH|TF^hM89* z-wD$m$P%TrZjNulV}lydW8Q675BH+#TtQQO9y|n1D+7%Mq@lnrN1>SRj6vy@*H)QE zlB}Xn_&W};R~?5SYK-aVK7EwQqX+T!JMbs1C##@*dBH_-eWc*H*#r{aDC9yxE(IQK zlvdmW1ZJ}->-&X-QNb*0Cu@tXFDX?8){JnklpSPe%5*Xl z9_QTB`WQBCmo0z1o$MCT2tJ<_xzd}$J{^vXUAdMkU+6v zOJt8vJbXW|GQ)&1J(WlH!;?%{8W?|C&8f2C7N(17c02X$2Bs@Ev>^dk>_m)HeLN9` zrQyjS7FAiW>Nxk6Pi?}``VeJ+MrI`+Gf$X2NrwATZ5M-``?)|*-d&uS*x&-&p(0P! 
zS@)aOTzF1|HB63_%Kd11(A2^0OyFk@uM%fwTpGsM6MpFR=5YbuTVzUO@PIj=Ty1&O zMwJ-LvWjHb3mS^+iLPQ7edd;aSyW@fbDphcoBR67ZZ^Ti~_Fawd(WwOm7FPQs)L;TXQyq9>@ zI9lI_B0p>Ut@4t2+YbBd#C;}4HjNV(u68yJPqFne=;vP`D?5m{+xi`3l3!j&WaPC& z1X6mSkAA@!zjRp5k|>e;#p#M;n8Nbuj$_Q#7_5&n)#T4ge8?EbD}K82nSrZT*n@0M zIf?Nr6)RuQzg42q?@IJTb}x$XMzwp^Z1DM&;gaa6W17h6^(GHIYg%lz%1UaJ#0XF} z`7Ab^UX6zGxBr~W?eoVzOhWpFQ+yFT1noTzcRju9i-U#Vt`iFo+KamWe4En?pdD(H zd4w&)c;IL0Xm;?e6c|^=*s7OOz{NK2_wG=LQZ!E(#(fWzb#ErP%GF$)YTK?S|4L$h zFvJPZS`sDlmra37LH*^=WMlzPJEG0doRVM4!NIUt1qVY5^jY)2K>{2=zdZ|~JgmXG z&MF=7x&C3l!!-nv`3o8S%HdW|XL6ul4psbN=3kN*&~Ljk0@ysI-QNbW6eR(50wqQ} z65F(56US@E?iCrUOtXoHLjkmZ7!O_){fk&q>HlHAuT>l@Hh!D!hH#@phBs-Begm}e z6Y6K?M+tDh0X>1Uqw9>}X`=4vfrxbCG+bMW8Fls5rIm3z2X4#N!LGv5&sCLnWOHL| zexg2P{A2=0V)E}uv~Qw5Ht5c0^IJ)Bch+^6dcbnHHivn){gq-d#nZmaysyZz8E4&7 z5v!t3Kgf@nw|GZQ+i&3mTBO)4Orv$rOJt>pz3s-VY;^yV?4-1CL($!a%_cPD%sZTg zz9wKmY<(Zb4wJN8?9lDAj(81OaoA1ta&U`Li$2smC6KBQLHE^hpaJq2xrWa#wsq-0 zu3&MsPqNyY6;iywET_`YdchTT{Xq&<(PAmHb_ZfP5mX*?=}8+66?DgvWR=1C3*VWe zB#Zxal|O|8GVN>=+*YWhlY<`~CiN7~1|NTCgFNpzF&?7@w)HNwUz+-;G8JVh*p-9d zm6zk@veuK)NgY~{@DSHZBjslqqGEh%9+~4$vPo5o?-qN9E$`{VV#0KbtS=VlF8AtE zzq4ejI%A1a{>_$?9NZQGn{Ata3@o+U?!q2%BeVof1_T2)TAu;_|LHB^L zFu&NciqgJU+B`Ne2BQ@_qRGEy@d2gO=`N;PT5m@rbaKE)p_zY-LXwU`Flx{cI~I1A zM@}Jo`1;5}yb=hr-E=*MCF$Ua**P)=7K2tVwD6lTbX3WirX*#sKsjDvbPsFdZah-E zMZhJcdu58nyS7+?)LOBdkbVU=mh0vo@ycp*pn-UZN%lbyMKW0DIDBPSXkTBiG^uO! 
z&F=cgoNmSrt{*!xunU0yJ;qQK^Tdw9g0)^~wSGdQhh1M9tu+&P-#ZpYaag}~_IhWX zRF|T+w_i)EV_5vr3rQMA$~P2DhJU8t0z7qzHQu>;s=v~3?p+`|bKjEa!d&@gljHfI z{XX0uUQrr>`=u9n-vVMD(~xn~w5~`vc|ga!AZT1-+rJGYzRaY5;#aY~((Aro-0^Y1 z%15&kUd#e_J0`)vH0-bqSB4`2$WiuRN%Pm_f$|r7RLj+m4OlmFjPd9Gg*1Pa7Sxsn z`y-BdrEqY8hv06f51(#Sz^5B6{&0R4{mB120V<(~d2Y`BF%y6@ju!84z4y9ngb(iU z0SgVgc&n-|zC)LQc)jtJp!MFTR!?r$G|r`~D8FtIu83qi=HkG4^I0?h36rFruxm^1 zy@NF;l=*1cbNU}8e=bEGMq{x2;7U<5n-UjrmyFDr^trz@up#m!3tfh6?LheSyug*i zLTz^3qq8Y-ir$m1I&pzs(kwya$NFbvul`3OXsC50%sldRwb9wpWuT~pm3>ib_Lo(a zHB6?Ysq!gELWESFh#jierh%C5W!diAjlKi2IzBnp>E`hxt!#mHu^oZzqN~d+MI=2` z;hKqWV&WM5wmjZ0(W?3@&UDh1P9viK6u?vC4YJJB9`Z``uAlu(|6CSuTt7h2;4HS| zBy#F*w3_H#AS85lfd_av|Eb{RUv{UcEB?DlXC6GH^!NODw#kE>uf9MJL#2B2I4gJk zGj?`U8CD3fsUy5>OXpUdX2jAyhh%wibl{q<;0cF}+5@a~^F5M&nN-?RxW#I>@JfSb^&4#I| z6eYqr%37jR0=8FD4Ju+#Z@1C6V)eGhB!wDr%tM2;mIu3AMc!e8 zFsr4Vvzy&h(1ts#{%BVY{Hi{#w`)^r`~|$Aa=+;qA|ehEXx$B26&6^=a>4Q$sno~5 zp2WO0)uelaB5bo#-mTx(xEM0%_SzvjZjvJv^~+jLW#gmwP@$uQ-7Lu~83_Dc9J!Q% zi?||A&QOL((B8y^lGZH=sJ96MDj0>SUeZL1VKz{KHlBcU`kSGu_If;5j-0x4k}Kg@ z{fL~SeHUTt!7FzAUBD1zU$0a0)zSDyk);RmyVj9pcWc{M5$r3Ksr5Nq4%>ADN#Z|b zlUz*IdaGo~o(!Mf1Be&Jqan<%?bbp9OG>}&_0@S0%sb0Nd#ViC(cw4j?V zYCM;Exe;d&?K2_fXxU?MYnMP%pS!)YU67U9v9{>-bTW_tIG=aEv|RBv?5b9>@ZqL3 z9UG~h-BJ?k6(KQ%%I`yK@gaKXM`FQjP}4k6N7=;)t4*|KULrx56uzHuqyWTE|J_t= z;%NK2Sn| zOWURw>f%rmxP7<9`N?Nqg;I33E<)ZMp}!^8p>D0wz7#K}kT#XDwW>dn&%$(79bY1O z57-nz=k}(oBKD&y9*nUbde`GnSwIPZFHy`2nbly%3~CY_LK&Ym?xTP)sZn9Je&oTE!ftBloqU2HjZJY2`ETD(h zcarQ0`^&U8jUWI3t<3R3li%7iPwija>AMU%K{0#%d6(aNO)THV)>3~CD3~{kFERc=+5|XNi@)8D7HAtMuat9zXlcZa4u|BSd%A+O}{c6Ft1!xCcbr z%aM?fa`05!9te@VNc10r@5=a({r^AWgx`MKgc@hb7G8O%R{m_N`rv5R{w+#?=;lO> zWQ#HXSMH78UL`bi$N0)s6RbbX%ENesnSa|!*#-%u{K}KBqX)hR@RlM_-Cx~k8?C{7Arrj>{CNj#&dDCU(_lM$zJvf7%%Kcr7YYpZgd;ZF!!NgnJ^B1b~pgtn8@S&Bc z>nS;qr)}=EqNmdemEXg9PwZWtA^*K%c8c9jk`U{L@#gW2dmxS%a=>_Vq*#IvfAx#K zGZg$11z%qH-DK3!q`xu3+1e+*r|`wTclA)hmkj+RH;Vv&@Ruvo!w74mQfJ zR`Bi^{Jl5-#arh7iy&Rz?}75#c(C&1W1-LwS*2j_hOV7~ABOR1T^4Om&cJn 
z^s!&uhTK%6Y)*YmJkHYV(8?SWWqb|zRz1xDKBKVj&v>JGxkI%J^Map_@K&aQzSFn_U%&EZHUQLaSskc=UPlfS68zq<% zrg*GP<&FtL$w2R|*+J6xk;`{C6)u?E}k#S(w)@12kLh5k27O6Jf(}Uk81sX6BO9+VyLJyPd>NP zwuCyVDt5lIFz_i}IMKsk1XX0C68L)biJPnDD35>*l=A|Gjpajmw9uyWNx$|dhi9rn zH4YDwO_8F+Nf#+6AX8A0yq#kyL%E-iZCN8;Kju_Ff(H%yEk&tEc=QR}l!p{2_SgV^3rx_U{e~d_2_U6(Jn%jex@a@07cAu;7CmPB%m*4+pq~eHU z4lweC{Se{URZew6vkbYtl9IeBnChpyNui~8%s2=+m*KmK@$aNbCVzzD-~sx2 z*OwCBtYY>5m!+IB8~IK?}DqVcG#_f3!8 z)GH19$tOX*F{h?;%3W6_4FVU@BGXw=A%mlh8%O+6bhX0mQS`~p$3Y2sx>&4RzIKy4 z*WW>81AcvkO|_;QiBRE@=~&+a+U-f^V6`_==GdMd@Lc39W4@2$vUnhsB0+H*JQ(Cz zt`jD!jF1n0y^=sGg#=ci%~ec9`!^VV>OnLxqtp30nfFdd7$~Ydv4Ly}iYACYNm8GT zw85mu@PE2w-~DkM91h2YuCa6Kst|XYEXz8tYEB8I>A4p#uMzUZm=HLF5wzJ+EAivr;;W7l7%i< z%W9YrPTRL&{M7LCjt!DP{}{sqX7hdsb;LS=kAP+3j&pYq#cNSE7TEg*C1_q z+b!|(a$Kw^_Vm=+eG-9|B+wk~{yd`*ZsBLgUNGs{bNfnXco>_XSSC$XD|S!+h&w#z=MI!?QCfkvzXkw$AfGppI|@*0-80C?j+1Hjw-KFvWS zVg}cTzvEL9@_NqdZ7)9wi=-zRTy2xP!5jd|p4`BgO?_&00Z%a|bXAQ>29Ep@QW;>x zC@0CkA<9p9g>ZKrEglK%o8eaLXAZ^wh9^GpLh;n{i|W+-l+rWig9ej+%+rw`1)PUT5@cs4y^} z#?5s~x#%rF2!GqPo)(I<&dbHSF;uy+L%r4zCll8)i5F{R_;T#c{`-aKz7=fbcJM_% z`%)dB5XVdZ&8`X7gw5*8ZaS^aM+SyvfeCsUJYT;l^O`(kJ`7IcQ32`0H^3j}GUYVL z*f_x+Md-c$6Pp<;0`l*m5Razspi^|n4M}A|;%^k2B%;K}4SvLQK(YS+{i|wF;0~$# z&~U4~P+T3c2J6Rb^F$Jj9%vHNxQuQ9sn1f#+ZXP6Y396}l==i8O8Eq>+FE`4H&L$0 zX?DJGz9tI69eENWGw-Mnf3B~g@!;I&`iXsd2$`0FxrUoRe9DupE<)+Mj!a_gzD5Iu z*^GucAq{d3S(=VgA2;~WlYhNI1O5ELKY4|H?&ptxz5MalzrQMUsMALR2jcmh!+xKP zCIq4bD(N2Zbd=Lktv6sBMNLL#BtF<*u7N#PJK5Iyk)ujOCW-dBu5@uWtR)_8#NG=r z$e2(e^8V)R6^=nY<8wk5KR&3ZI2ztDCC)h%J7|(=doq`@)nK&5dL@SW7IncSGFfYX zPsBq;K&ZKdQjo6G%SRG&hjpjTmX#n}N}YhXq)=H-x#G3i9+Pl))6aR9fAIiSn2rm0 z_WI)$dzEj^fn&IDs^&fawsk)jmGAHjr`D2^*%{S_Bb9?~H|o=V$F3mWK(Jfj@c@CK znGNp47lQJDn9^HQSOORAkcjP(YS04b!A@dje~{aDzI7dX=}3u_%GP@ndpzCF^Q(mh z9|<$wHY+x(k>&M+2Vs_aNJVA3=mwrEU4MaH87RNXBUCS4ai3fvLBT0kY%d+j-({N>Fx@_ zQc`;T`pxkB;a96FK2Jsb{CsFg51Wv%`^~G?;7_*XGWVlC73fDAlzt-+!s}cP>v~ENI<0*-Iv4%D&ZZ*$rFMptyj0nZV7Oekw8Z$OUcJf7!$`77K0uipPIbo?ikXdCN4w8b#3_W$0W{f1}uFWQ(()XyV# 
zRSfV+j;5PlV2IljJ!Vx0yD5}z*gdpMQ<|;~={#%<9Kf8`mVGKvrWi>QOXe$!ikmGj z%}TV-XG!oV;#f@_C`wSEk(dxlY>PfT-FAdDN)X=Wta>r=I~sony$(#k`kI|F1ZC>q&+e%I64(9Mf+6)oJqAJX&gU9+uRf*t4pmnAj?IerEGsVfh}0A{ zO0vf;S>rl5E9D2FX|(s~z)_^DRPM}5b4i%peql-wV%qJnyHVX~mh>!Cu4ux)o)auo zuZx&SUEsRs0;V_-lk<%2Pjc=7jzbf5N{!YLFV?BjQ+%`9_>`~|3fWKag?7J#9(6g; zd<0ldtp8{s`^#d7_a7DHs5To|&Z=vuu2;9M#s`>pu_L91vV;KEyNx?*M3=$#dum*k z#R)2Jj5c10$5>>P?$H~?g&|kia>Ua&A()b!a*o+hr=K$9n$Q+yFPxj0zkgA{i{SKRINeB|`_uEOOd;#D$TKVT-tnh>##T?X?<}Au-A7zdL6{}9pN>~RbplV3sWIkL8Cv`O;pyR5;78`24ynS2 zuercA%SQdy4ux$y&=978j`+y2FTqMWVISBav$~*rc<}L(h^hR&gSqU++Hh3yJK*q5 znDR6|YYXRn@aN0KR0G{m{H*cUd0Ih$F4|z+u>cUa!DJnLgf)8_(fTl9fc zen)}97ET2^l6~ROHezp~CmvMrU)|Y#*ZvhV{XcQc`xl?JRA((tL^9zN(u^y#jyl2? zxGr_OIpO}qonTYypR+DC!tkXDLHCIa>k)se>?clD!B=KPBgUGw7F zT+!a|-_}7E*rG2`;_BFn<;Wh?W2FC)N9RrzG@; zD>HRo@#A(@B+UEC4kT7?Iqb152ao_p=v{-s8AFp7jJ@($&q1}GHtm9-2mWDOst=)` z7th-Y-=?*blzhz#R#R6IoENce`jXpp<+jAFp{g_3yi#V^GS$ZA*G@MIcSN2!|=$V-GTHN05mJIr#_<5DR4VWeQ}JeFv+T0kMN9MhARb z*^dU5sovGlZpmL3r{n>HuzK(ZHUO|;hOwm}#fkIC-zrbTpUSo(=sXRaZ~)X)WXFfg zO_URO=8niNogIthx(X)eblx(JPw|%9;xPGnkGz)&V(Sp(y@lLpB~G#qVTF?aD%1h7 z!zo>2_yHxr#Q5N^&RGn*uE&9nMQ2RuGUC)uLo?q&^2}}H{yV}^P58A}6~M8;UA?cs zC4F7`PYL3`e(paci=U@%K2tA9;0ncOv%#iKt74#dn*C}sj(wS4y;h4VzmR>j)P%yu zsmgkcJ>fXCq)%JNKu$G2q)za#>h^%hQJ$$>bgeK&bemBH`xR7X4#}IXP+3t&nhw*@ z|6rI@sR3s+?*>!illZWXpmAhZ+8rTHB^FQ1d;GiS?XP!~(bLdFO3_nbpa7in{L@;| zdjg-CvaOArG)k11c2-sz%PT6LBDJu|TSan#WOcqC7a(VP$TfVDjccag7z@>)NnpCk zKX5;wd5|bY)(vlo>?*FfRzTlooA~NWalXh_6*r+9Fqm!VdB z;=md{&&eP4?E0NI7xR)td0mj8<)NX}MG>P3**hND-Md0MHCR-^R_y}bI9(b0L<#7| zPn%Duuh$-Uuc%=X&BI4LjhJ^%n^#ENWM_`b&7sN6n3&DeC#TU=x&xJ}#FL)cV7IyP z{WfcGTEXn+&&x=8;BS2)*a1{sJA)@~g0fxW%)W7#72I}|>^i7xT_u$7&iPx&mVIXG zyjEEd7y+a^izAz%4i-}roZ^|=B}=dLH0=kpZm7z=k@AxSiF}BCneK=9# z6l@7)8D71$PrDMWmxL5a)UZy3cb;8+OG06kC2&qO%$(4hL{V7(S1Bg)fI%CS?eWueNaygbh5oiB)Sj8m6kq90$*k-!Y^tp6=GM zWtMTTYz~bv%j^s451;OSf~C|iqEC5W&$WT&5XN4BofECdfbU`lDY=~kqMEBhI z0=*Aj+dxVH&F;fLaLg9C(0?DGb>lAHM-7Da+FZT*CMjrKfJ+d6b_Qb=<_w~` 
z=T0fkOo3mt*g7lnwWthpHaZph()e^jmBz(kXD1fmR10WGU-F&`d4DzkWB9;8JZ^j~ zTbJI;yqMe5c6&7_>2TW8n4}Tyd$%E={(mDqlsfzrEtC! z$4By**!GZ26}#&iTNvLT$H=yY&egoUk&1k$i5@i+cb^ z9nrito;W)KP@<};!ghdre=l<&!;kR{pn0!KVL^(-h5ix&z^qupBEYd;0UnqYF-(SV zTV|_2X7YL+U|5t2^q!=mP;?ou@QN0WuXe0R94#O9Lf%K2T_1PvY@7^K2!mE@Ei(JZ zm@iiot46FpUA@zU;6d7e>H+}J_eq!6DL)g4BBE`};13}Z6 z6woQ1QU&ZUAD-*C+Nt9@8MyXXip|DXsiuS-WvZGxG;%RGjnPggTypm|T}99iX+R|o zuL5^{JqE)1bgy33@US`;;BNFAGyq{i3xtI}E<+5kDRn|Gs(KI0riWNpB&~D^@tHj* z7Y&gHe63!~gu2mp5xlz>ZlNp?XGw%(IkVE96|*a}s|ua>GTa(^G3bm zN{vFc>#U#owAUBkVkR1XQ4eJ~N`vJ^eH$A|uq0aNcTh(W55|WqAXyIxC((IT!T>udV z#ro#a=+T$B0LcM*P7wx_5^9G}3o~o?>(mtd%nV7yFlW23x+BS7O-(dhmQ-ycipD?qcd>QS%R$7;Fr2J($0Z^F**VQJE!Fbk3fA~!B&szbo39}dXcxJy4N<#V#Z z6M0#MSdm#ubh}EmBNGK$s1;7?9ah}rNC z-5dTaivGQ?+q577{i)DrO=RPOXIn?Mr+q}fV@@FycD4lIRX8DlSN+U-3|zOT6tjOv zogd76YZ@D~)x$bCiX#`W*_xoK-jiG)uAPQ@lZwx8FRE4)|Y|FSm0&al|a0|p*Q}r+LI?UI8!hDHJqL4N$r2h_g z88;akRFH3(Gc`a^cNEUHli%q>hx-S(~Ard$90~yLGQ_wx#2I zc>P^Mbt)hH%1jt}4f+SPGy6+4#yj3@{{h>72RrvgnTDH3TBR1DKbJaSxslVcBKPP1 zmPry1e4o?JHVn8|VDNk5T!6R*9ddh_lIs^E%z?|+q^cz(S^=z()&W-J0a!}nzg8_~ zZpvR_7yZ|;3vaTb2#B5#M|yEzSoa2OEO|GUvZ^v1ff&fntUnj+A5ahfse7S5eib;q z%XP`nAa4L>5_UO1hyRuz+(rKYj_l_O5esUp{|*X`OM!f&1X>R9s{k@Ya81^X(ly_p zt_OczokhlMk}sksJ%L_B-%)2Op5t0bhw@g61MTrwJ@3VKp;V&w-bCYf2csf10!74b z)KaQr(E+GF%N1X0Yb%Sr@2Qkna0urzD}0b{4gNM$3HXOPLRAb*ZSu*4F`qS=nU{#X z=6xRUjYySdp%dHrU_!;0HrcXpQlM2p3%md%OtC;>dFL?F*lT*PMSYCM0^Z!n9K zXy<-A=8y*6Rw!0UE5<7!y5B-p1y=+$EV5bHVc4{-WS$l4u{5QQZ+Q;h_I!4}`e`q1 z$AUwgV)8aUfLpY9$Yc@)*T(&&$%+DJJY1ryd;Tac3T`f$#mjWAX%c5YcA91GJ#uV9 zDZo5J0l&P+V#)|$7DSW&1uX=R*d~wTHIpAO8-Zt~VNhXYhxQcQ23@w+ow+8-3-B0b z?bCwGKlfhjZar5zrUv5zmH`=wAACZFBv=ez=`NSLJ?+513{>OySK#irJ;5Hhn%2 zVPU`)FwEMy4?8oxTo*QRI7HIX-z93GYbLkQNbHQewZ?CFOwkm>&C zJ6ZT{nLjt#zsX7eiR;Gj;QzsvK??LsE;LScl++>;fcHbil~@&gB>W!P*N?;Xcdd_Q z`zfUiLRcx<+QO|(q#v3Il#S#E?G`+y!@kO_1kW!6wuqJa&u%FoSTpMn55F056l`+= zPTDJJIu{=S-{TFWl{aT$E+Hy z^R7&(s)((3YA2c#G}qh9^suiv8u)Urk;94^!MFy1 
zg7&fZ>RG~DnUKQW=s=OK&g&>V$JOdfptI{e&<>ye=e(`a2qn{*_J{~~U-CSdU z@`NbtB@H zN$d@8z$aoX#4{-f;4rJf5li^#cn?+SCb1d6rYcmP%|RlU#c?D{cDgM4%=$aX-Rza_ zhXPU3aR-;o@1V{?(j((saDu+2iGTS)QJMKZo=y*uj=%L%nR1HnSm(TDn7kPvOr;3@ zS5N%^?&nsRbB8k45?6OMAnQGb>$}fcFov2*LthZ19lc=H?QG}e>M&gU&58KyJ#au0 zM4OoJQKl19H3bSnGj$bLm+S+HD$cc7>q!KeN#%Fr6)B5@es3LC;pxbTmV3C9#fL&WHlOo%nU)8)XCEGPl`{-=ReB>-5`mac*T{xa za)dFIvB!jDiia;Ra0m#iLr1q=aaLO80m9|;qY*xf zN6kZE+$TGFnitDE@Z7XzR*vE|OU$GB)C;BT{gH<`eJ&Q6?kI2EcRz2G`0IO$+HbFrfyb~XCtBNm10y&?=RtWrY4+@k9HKzgwq8-G z)JPW@p3}#$lRM?L9I0;?zJpK|4d`_A@~E(1oX4HV9k6i+IHG+AS$T$E_8k`Ty}v!v zI}1LEsKK$8;&WK}@%5IM4S}z$DNK__ObUpr`3c{9DOgm?d`-r)nfDkfb>>?{(Rj2m z_aU0GM9@}cDwFngqA%h}fp$ydxN_~Vkvd7WMV?66j^`tu;ts07R*$KW6EY?*rS9v} zGcM6b1nz$MenYrargKrDo3}e681;lB*K&-GLB8KXHinO`T!@)3Fu!&~K3_p-_-sSk zL->mTt#a!996HePo#ZbnrA1z5vlKM$ivph?=eh83QMMT%k>{SUSt!PgMNt*|PEbc> zUesww0s_~5;6H!XF#JDw4bvK5bxtDg};ygqcp%eAL}^(e&683Jg+At1G?vyeil#y+pZ81xFTXE@8M>T1UF*-9L+p zU??BeQy^f4e;nM5z}&q2IK9=&O$8G60l^|LV&j$wQ*5D5Qi&wU)`aviiK$e~DSi@N z6C&%fB~_2MpLiV(Uq;Yk1IDajp~X`*p0kSkUD!mzLw7(yAG)x2L2Ha~+t%_dUK)~+ zM^gckz2hGaTh(>H9TInT2e!c94W=UC80!O}DWEJCJ~gC2FGhW|(J+a$*t0VXhMJ(2 zFhr4qj$HZRjh>1Nb-S?-dsjK+_blwZ78uhDpf!0}1C?anSVRAuVuI9DrdV@ycu?vu ziD>Z~icofV+KBXth*tYaT1Or>pBhoT=bI$`HVvL_&6WlVJo41^Ta{ntE)LvR+PJLq zni)|}^A^;s6tKQCW-bI;wgkwk)&Fg~h~Mlcs%Qz6#Rw9N3Y7}X7#+MC;m=bxh;NYA z_6y46NDN*{;Q3R^)6qBPS2(Yw1+W9n@`_70rn@0(b;hah;V z)}Faw@GWXMea=`VcdCKvq>1y!LM?#Oay6h$*!e(g?{_*~_C!tiA)!#sNP##}nO8c* z{GK`2MmvodKgSo=9_V@YEse?)YHUY~xx=9&Zj3-T>YxLGC(hoM-??7KfZ2lWXyK=8 zQKszJTdQ@vYmbczM8M#)v|zefNKRI#6qB~k^wxB_#pn!RjVp!k_l-04aHDR7YN%J; zuO+6P4GH3tnvRW<8#O*wV>Y_lcxV2OLBi4r#1W?2|0!MF-a{s+b@G^>u6ASW;n3aN zt4F&Ki5~2J)9a$SLk{zA_LbEqrS4Z#+YJhX&S6n4)%Yu?sX%K2+Ru_Mb?ww95>)!w zJGTeBMg_oe7%y-fwn+c95Nu#DNLs8Gb+QViivl#4Ov)=qQS`H3)mMZq{YXX9Av6ZnFG?xMuI|KYRk2_>ymyDSlXgGOI z^fT;xRK{ao1{6FU>mO+hiE5%H58$7R_|)B2$+}uY`^4U|r*+cVHmaRZF+*w5{xq}z 
z4z%!5e$!r4pyJM#yi&-s54Ys(9zuG#lheF+!@dyeMIsH(RP2CKm->u9mblT8!=CZNMZ^<`c4^=coF*PAEFsVHbC7yDlsuCnx_DEbK8YSHqQJmNyoWMy{(b({# z%CMBVa)8)J?ofQ{4{)z zZq6Te-4x8~7d#I84lygWVg4>G5M|)8o40(cPX@4vcVnTAv~)4Iv$&8(sBKxb7vi@1 zWz`JhIDYf1rT}cWVP@The^@ocy*)ot}6dv z)@&h?e-OgR!GqfoZYkjm^jj!}F|(wzfAg!RxDVU`WWa2==a<3B`jn$_%^fmh>YApYi=>G}_PK+0*N2R&qpzbvolnaiPXIp-9+)qWi zJ5m7%tZ6NC)krTATz(Iuxv_*Fe&2jw;3^#i$i@-v2=fh1CHNGO9gbG*#a&m4IbB?V zmLPAU>Xkoo!W~+Qk|DUTz=$cpyWIYlT5o zwKLIPdbe-=z|L^|`EGfl{3LhGZ48mlhFUJ^YU^K&{g+M(Xq&w_g_iL3+oW=l=HCzP zs7+Z`Kkw4T!sRu+VC&jYJ4f7JdG%J~4&Y(=Y=}FsfmpBLlkMlC_M#LoJ`m+cH_(oF zP7~mte$O6dS@W|v=+YF($G;)VJb97aUSWRe6m}u)OGT3fwf>!Nki^l>^R4!$=h~ww zfr=zml>&+(toA~LR3Tv@;}ufsPU+`)MnQUrzO*FFIF7qL?}pNb^j8>mmgg<@f~*n^ zqobQu{UD99xmu$(;S#4Xf1x(IeIT~Ok9I!1A>R_29Zqxtyf@2m2fl_ouXfNh_(r&I=PDNfP)-c^$SH{Dp_3iNS1(B zKjR0m(^FiU+E!o*0R4&WA8Br#2ar22el=8(1ZrN2MCSh1R0C*^(JuqciXAX3w?AhE z$m72^)d&ea>)|PB#M8H+3xzdefYJW2Mhwuu^v)Rp+R&)LbKX4!+7L9tP#t!*^jlL6 znyEsjpajfF-XAxhYi3{tM*EkmU%Z1Yw|^kJbd*zS>(?ES8G+RD8Gu$>JXn9@jy8Pm zBa?vGNKOl0sC`llTcq)=b~y=u26;ald^E|J^Q^aG=?>poCfm0CT43`S$4A|ext<3{ zJFSIi{ERx=>Cqo|!g%hc-y~etD9tnhYJAleBJ<3z^ZSMzGb&=k_fc-%42{$aBd^NN z1Xgyf79Eq7SB!HCK1m5At@`YSbRSd^M7AMg82LnfjyJ0*&yP*J+(tnPHwe{&*Hw=10ND;a$RdpxsU6-_5iq@qx2^C(?ch_@+-Yz3B0Ed z*k6uHRoar7E^?UW$F}XS%uQ*JkOjVv9W2z?9(s-OAX;^YYut68LnjlxOb*qv+IB_$ z|CjQ|Kb1Ow@vQ&IchF+O`A5AR)W{x1DSfN;mxP<&ZUhJuY;6J-^U;iM@oI|HUsBcI z>u2t)%^ondzkH@N>Ss|sW2mQ0-F`4**ihIwky6A-mXGh$0%(2O8VS^z6YqBd74&8r z?d6+Gq$II44AQ=#o4|7kVBkPmp6p_lzA?^nW>!2q`7)CKSPq+UF+L%3Uh|q~T!Uxd zWT&lOYN+>b0`O@+^I{~~ANMNVc}^c^&l4Q97<{;Ppl#G@gX(ANN_VlKPuC+4|0OMBK(4;diNv(J=I zgg05JL!#V_@+%X5K%K{h5^ufq5$MZ0@R3a@w+<8N3fMW0`YGQ(E2(bu(#EVgH*roujC&<(q}wMWrzf zCZzd0h&>(PI;znvk5O0j4dwb=n=DmB%v+OIZ+^HicrI^+q;)EskxkO8jN7BcI?5dG zDziPH(V>WxeTPc54{7PX&2dPzH@}gmWaBa8chI>Yrw-X=!`&QIB_^OgA(AR-&^qf_ zj9^O4*Wi8NjBRc=FVH7}^|&AGUO49a%t(p55565bn1mHj*=wDjHKNxUqaQW zWbpiQ-&RfAe&^gV0-Z-+Y zK!mFFEbB6m>$RFZH2d4;Sgwth4SFl}wN$yqwhL-e{*pAqaQ5*1qOc0}84+tKzJEOP 
z`41R^-`Rl-B03^B-g%GNs<^+|As)lUA$}@VQF;)035KFXP%3liO7mj9AhIS-hw=~Y zdcIi_D~W!Sto@WH|I4F~maK%rHGC4UY>PNU%JPWdWI-T!H$=n5YhhB6;7hNDPo~J) z)Y^%os|n+rpi~m5s)bseRwXT_XULa88i3u-}sW}zWOV6{;T&?(FX$8{sgE%|Hs_#pbgSCz!LH|dj&Uk z$A43={#QBT|KvSu59)cQ>vOe@wUy$rkr`gVJnpjXT7Wyk)~45fODuMe7xPr-hFdgP zT_t2>XXVr4>kjApQ$sf7HZCv6B`dhAEB#psVm84^z{z1a*95Ecpcl8LIJD0h(6C35 zrvwKlh4R3MiW!4|v_I~e}@Wyb-nJjr=3G#Fkr@X>`GBW19@+9Vpw1b5=IpRCjeE}JdZN?V1 z^35Ry$r6(tSHHfBL6`N_1btfD(3-ukhkK9gZ3m=AOK-rtx}IX>{P6`mWK#`VeJ&iy z=T@ToJB1;cDxTE0#qz5{sJSeV6-B@OhGxSiZ|F;2K-41`c5RefRq5S7sS@Xd!Ky92 zyKN7M)Vt-1m{vEn(|dO(M>ky5YG~ik*&7C%LFG?h*9sK!F6@x*k;;&yeV+gLLuf)7 z;qE;vdZ=Yk%jimW&~Q_+GG22GOm8{o*MCOg6FqTfpb8&xMm08#ll)qs zJ6QIt%gN%ZiYpQ92}v}{P05h-uOPNwVlsF!z+{HIHH$vH?58@U0V(i;rhGZtDl3JY8k zHc8NZHa4oT-S%on^C*VOH>!hm?!M$X{}vL`b2m?MFk+Zfv!|)+(ugpF-Dku>pgTeX zny+Pb#wkmUGBve1G+pXu_x@{bt&hS;0}QAco#Y zul7SMn-U&p%{yZ}PgZq)hS^&E4>-MGf)T*|#;28;!YO$oJ=cukg#*9ZV6F+aB?0r4 z|MfTQ_7CvG$+1+?FA|Y41a&3X*DbLHJBiK^6j*+1_Y~DAZ^sUCI~Gx?jHOsAQyb)- z^(S}(eFFziFdCF+U>E61y`OfpRqb_bmLrHUyFi3YV4}}<_l5jYCdoTkE z@@L8h@7L$PYpr53!kIcz=uEf15_C*+yg}4K^cWNi0cM(kBm!wj6pG^5~*qOo=l~UCjRqt$YMqC$Q0RJlj9~ zLz|Yt4*vDZ_;R@h%gE%D`Yado$*_iilhZ+yZvl^*S`43NHzx895xF2j00q7Ffj}TJ zRHAga6glPuUvyjAS7a)iRa+s;G!_4ggyB}H7|uBFkmm6A_>*=vozbZpLMwHV(3}hT z8A#9dfHqqC$IP8gDzONpv}W5PTBaCVBqAe`|08TT)xb zuc@^PJ!8(MG}?MGzu*_QJ6TOT5e^mQUyEd)8!o)!RKYT`<_izyO+_?$KW}@5f}QHky`wAwn;YoOH323}gJpx-zWFz) zdccn&fnE(KkoK@E;f`B~Ny8o>M-0H^?X8K-%1ay9)eBlm$w@$7>W|-V6G!`jkpV4MID~4u71wMdL*DFlZ_K19oP#@KuZ7b zy%qj0J=s|QVVze%b9MWewqp9GD6f8WwIU(XWa`k)dM|$5K(S#z>fG$sqZ)FHKzj~U zK#k44rmQU(H(Gt&+;E6mbtu~2u|s`I%VE1mg)(f%I@5r?6F6EUXUs%JEqyPsc^5Z2 zq8!t!1@sLO8~=hZK{?00)S42RJncV%TUZGFlO0%(eG$r5;$yjxR5Ce8j711cpbzvZ z0|fdeD;)F=bW`JSV6(y=K%no#*K!l(W89Z^X`_7FD1rT z+PFKz2A;ZP022KHl#0w@@XZI9M1M;7g$+wd=(m=~F&ddYfJA>vSPD&D{QlajJ9_mE zv_B>K?-H%JfiH7>i3xu^uqBEy^ts!m+)}9j`lUd{4<0Tz&Ov~kCq&N5jDjAROXiOuo)@tD)qgy3Ve!#l32P;3wKQ(a9hnfHi{k>*= z0;nCJ(0}UidCoBIK8VVzMQ1$l_J=}WrJDsF04Vg2ns+vi 
z2O5-NqH^l=C&h6ST_jqcMT0VXd}YiiknS$Gm`VSfN*Gy*N-V(y-I3lQTA-`c$i&XL ziPa#_oYJ3(-)q)&a+&_2b*%_5)!(rKCEcNevW&0XDyCx`A_m_SXk$IbMDe_IuB$uZ zc{ZlD=6x*oW(7FM&t+S4HVqY&i=Nliy`h|&NIxuhDR-GFX+VZ%&a`m=;lqC=TPev< zsme0$e=8d+VZoaH;cL%B_8t+pCM|=9rF<1=UU%GG%sKX{JjW!i$?92yS`Mx!G5oi{ zX)U`0-1~2L5oONL?v!&H6i(ag3lZu@isUQbN||UGH@O-hZ#5!>2nP8FnV$!E^5(qVQ_1 z$fDhFdXVst=9OuMIkydH1MBuGvE^da?1|Bt{AIW!_!HHP(QDevc-kn_%Bw>>UB@Nv zulNr!Tj&iaELdaWBx_n!RX>LI&EXdFoRW-3P_Wf2ax2`}a8b%8zo9IDt`2ywIbGD} za6=cZVrkt^)E;eeKgRW{AIs1=?bIWBK@}1l5Qyb?%o#}a)v!bbmbd-?mA9D}PaRez zN_$GbgL*h=27wgG$rNWCP0{mfmu(Xg&o8J~wpRB`W2tNWjmrVo>1~)ntlj$w+Ud;=x@%l%w2%ypjBA^Cv@4WmO0q|ze?FS}T z;dt(vS3(MV=BCixXC;RS%X;750W~`}NJL$%!l?iHKHBhS00{QmEiky$?^4YE9m#$O z&})-eS;FvVkdY5>z#40UCh7D!Ko}zcVGJ$o`Jwt>7FzQG4+ELhwD;cOb#l*d>Ew8 zaitxs()|Dca{R`5Eiizs*GD`UR83_ANcKI-d20PFC$udCExLi_hBtR$6R zwY}{Elm%!ivxdoczX`5{#~m` zI0gW%(be0bhlQ2tpVF^$clj}|spjrOzTyyG0XW%SQnm-T{$*VuDDB)8x$wG{C$94% z2z#Zd-RfLH2bhG5zoCgXu$t)oxtjQj2LL?D8N!$2SfdD+&Vz^WjfvEyWBCdBX0NpM zk*VA2nyH8rGd()-E#}aHJ&^W6J5bd^Fc}b7f^>_Lbsb}|j%`miJuYpMPU!}$Nhd^P zeJ2-?2-*;`*n!h*+jszu2s0qc{fRh0WKTrA!lR(#Yp7j;EO5ARuN>U-$l8eBV#rKD z`VRV7crJe$vq(*-2!F*bU7irMPvr(|rMi|~<&3t6F8Cg9FNXoEa*$b#Z_CIvaIYlszy(O+^!hGaHI}5H4XCGiY{2e4bUHo;2aelE6 z=yle3m=wT#L)S3CRI#U_cMh1b3|_5Gxa?i~)1HejS1$?gkCiyA?;u71%Gi&D`vm}g z#ucKXrSjonP~0<_9aDNxWZI9yrm7ocCSj%C4WuQ)mGQhs=(#8qxNpl-3kqJ75W1=9 zCnJT4-?+z5@{FYONviZN@lMtkNuq%6P;aEt&u0(biY=<(@@YYjMd-;0+BP5yNHvu} z|CvvIOjULA*q)RS&_iqaSi5g|p+mS;tx_o+1h%2RwGZ&D8avpsaezgg{Le+*Rj#5a zNX^xFM$+%R;MRC`>D*)?cwv|NWlh-C8Rnyoi;q%UfWFhZPk{gGQPkbUxaT(S6HGkl zrJzxqPx?7d3krc<5yM}1MF#Fa_XDB)Se~7i&Xvi7nMR6cs6rW4@|jN-P(QJ#qmN@s zs_DJ=PZb~`epu7A(ZBT74RZhZKClG@DygGQHhCoCwt!6-rN-ZvO&9==zp|x_AeRn$ zh2aU*mz9XDpj{bXmHdi4VCDVr13?kAerzI( zYkaw?>6A9}>WO3T!=iHfa!-p;MgCt83nS#bYeR%G2MF zgxlabLs(7b86JcLw>qJ(7Rfl*#BZjW&lPT(Dr)d%E4DZowKJ)#44R^wk6SY7S?fvh*= zm|z-oIHr$~P4Cg!oTTzRa;0Pufgsaf^_fHRvCi}6+U8Gy1E1I<*JQxL`Z<6fI93k$ zGkjJyX8Q1AR*dGYsjm*{B)OldN`xGUp49FGYg2U?(654zu6a+jA 
z4Dt9BN04X9P5(`P0I(3YH1qzru&i|AgSbb++3^87XSKD?ec5=C*@F7S-uI{(vLCpS zGhnkuNV%&nKQ#Supn;=2=(f!9?9(-u{=(F=yY(9{gZB~_#ntt9dm-r#F*RnG?2lcy zxkgQWtkqqv_ePHQkCVMKa>o_I#^@pOwKX*LaV0JFxMlk%sW}`H4s3H)c4V}#FT*z0 zsSJIo0_mmX1*nxv@>!%)YXIjBnb-dlD*pG+VjT!P?@1{-!%i0*;@sx@U(~&2P+Z%# z2HHq)NeB=e5;VBG1%fn$Ai*2AV2wM$gQkJd1W0gqcS~>&?jAI_yY079DLA$#4;5p_G~#51Mwc$w|-jTDKchj)&b;hB=Lye2K2JKE8tdH|i;MsNi+ z+lq0fy|ex$p;@ONcB_=9qrJ^U=h0@4ESdjso|=9FF|m;RgOueWk2PVY+_x6i*s zigxQ`x--4V%1|P87osg9Sn@Xw4K;-va*WK;AU}YQB_2q}5dUxg{u5%ce>x2Qs{`Wy zyp14oSq$GokFfMwFVF5FP?zqoE@{8(u`;r_?Z^!h~AKyT)yis}ZJt#gOqDa*rvd75{agwl5 z@IBG$GUmdOxo+x)rHEHlRFCdHsjTqlXIa+G!?z?*^+eD;%LuqA5=^RrdHH<0gl?CS z^z{%?P#&;`r%n%&N}gR8lzLCQx*@}LUKj72o!xm^y^=e)i%bg@E3fQ>aM3TaKgX{b z3mZwVV>Gf-g$^hzddVrvGNh+AGy;XN;ahtqL1>-dqXs|(0WMGJ=}qgzOHn%`KjjUKt1`PYurwlHp zZlBCEw#imOprRv8ZoWupwe%D@d7^+*NSxoSq2y`mKHc;b{(P(^=PVYtQ*!9)txK_Y zMNEM{TInU!W|tk&DVy@i#6H+Uki6|eb~9ppc$7MfzsB7@-`1$V(wOFyej!C$S~!oo zzg^8azc&LqfBK`IV#fWyx)1q(sqH^aeSk%))_(p}N&7ioL>ek5^+DRD2RoO!#cu}# z8FW=s|%$EquvNLFpe7EGZ`m}Y&>IoD<7Hq^%4U)LL`^33MK%rM1rCW9zFpRuzHrM$gD{F{N#2Jx zLjv3v2Xb2z)amnC3OPmFp-_0G6sc5(Rc85s+lm7VWYA{Tii09OXM)HP1`255 zi_;Jk8UMW2HrHe5<7mQ4bA6QQa6c>3W!6{Gfr(yovhY-l@Iqw-%^4Zaq9W06pEHO- zrai##Dqb{MCR#PVaV1q8oGoK$M%EelDOq~%NQlGCMP#asCdH%cPdB9?r<1?Q^#Cr8 z{xS#TFWaYSG}GP5cAu*Nb?b|L+LL*PZ#b$+^YlC~-GVx&-}J1~XX86vC29!WhAuae z-P}D%suUrpw{VHPzKk2u62djSMCx}LZjn<~v-~3K>g7eaf3!Eus{7!(M`YJ$uTuei zh4Qk9ZoDTE=F^E09U+E;r7PQdao=pQv~_2vFPc>Az4SFd2bw`bCoye^UYl49+b6!> zhMCm~HBFyQ=$iCFi-ZNOME7ah>Yr22V<%GPQ6_n!q!K zG_3cKKnTtFmC6tb^Z!N@{wKK#fA{R4qZIxohv6?#^j{$EFX|<%HEIE*%x)3_4|H*W zD-mMUTRfm-aAKr3r-)@;StayS_1m2#&VaQm50%@9*{|-RgM{bET-L;H$Pv8U}{M`1(Cq=4HiQE zimP)q;^qIf{mC`6sHTwr&mV#`yH7KyC(^FTYd)5HC9`peW6YvR3mS|{g{KE2PTZ&5 z_YGJFqTpe__O7(Te1gjgU8+Zo1BP{);{{(FJZ=nW9KP{H+55N+L|K-Ln0KG%ZyTEB zc2}3aePg3c)YA20$eM-t0m%ej-ubg2{i50F zWiMx4B%hJsRwE6PEHxG1jFbbVP)`73@YT}xUj^h5fa8Lju)Qs7&HUAf%L=F2;ZYwE z>Lfmb@z1A%cVZVT`_9}NJDP#6aqKG)@`e?s{}2Y~&%)rXdYk^*@?$582S5ld?;|#N 
z^yJv5snljOqDc;v@bhiO%*+$K;ZQ@gi@9_TfEGS=YKAjCK~q+-9s?MYI3ec`k>z^# zK-VjqUIVHtk@u*t`}|QwL%!SCZvBw9WkzU41>vO`aV>QSBL?c32PT`{|j?6(RfC2ob9}g}_fRX{aFsj>M%g zXyRkO>QV1VhN(btiRq-&_|@jxn6i!KEtVqo1(Wna=Vpz%&C7e`!%^F6 zGp+qm%0-3@pJuW-{E}+{B+m|Txjz8Ns?V_JIVLx?V8G-R=q=7zR7W~{+LPUQZDrr18HtvQy%Z6PQOvlI_7T?KKF{9!P6W|(h#iU) z57_q>i}UE4(VZc=lW}Od=J$Ld3D8o__^`g~zbQ7dia%>up67)Cxuh(&p9CK8%zriJ z4479v(N`A{2{(5WYICK)8>6kN4Yb8xzD${sZOj9i@%3~QuTtD@mlCSy?`%WS%!rup zn?%YfmTHB-^i6~N=bD=8 z(Ohsp;Cwtu)zS`fAsj||fB(BdAJU{~RozQ7uwI9Vt5S1w+rl``%JqcwjTF%HYQ@j6pv%8n9fFXY#iE z3ae7<5s{@PF53zaOdc7js<+fD4k4YS-XZPizd~_ zu!ZM(7F*XSiZ+!9yuW=;D`bYl3Qd|S<^c?kftNSDf_Eq>kxuD{X8dMAp8e?RD$v!r z`~9V+6_ea41{Ft*CFN~v-(41e$mpLG3y?-gr2(=Kis~QI7z&_zJkW9sNTc$%G3xIS znmv9rZFtl&2iS~3pl`<2$ApT28Dl_z7<6~NAo)Y>q4@lSrN9y78)Qnpi2y^h;X<ixK*q4^Jq!~jMEi){oj zHMacyfPw1#A&?dEhrfclBk&`vlf@A2rQD@4@<;-yBp{CzX2YMZL<#xwrvaffuEN%Soz7Ob_ znel@JO&6s1Uo@LVe=1VHmiOSJkF^-rDAFPuh}J(nM{QZ9n)6R-xcwmws1^G2f$<-X z1UQ2K%?;5urF_49;NG-2#pl$epT(`IRR|@SX?<{5_uLrucYK`YuXhf|;Lo&5cntxQ zE$2GT(l`;fbLbxaez4T!`+_gs&$ku4!{i`_X}6Uo#vFB`|8$bH=m6k5@9$8ujgI)z z|9mQyP(5wFqY;vcEbyyOp`T*DDPh_vz4WYs9(8hUhy4ReD!>W$>vtZY2h~`)TRk3<{Av?Ykl-&CR;1$X$jVA7g`}a~4@&E2T zeu%>#$qF6)9O2o~?N3CK{6Hl8;{=1AL7n*D5Lt4D$?@PFFffxnrZw2>N&M{Iv46Dy zI3BmY7YXSmK+rK>!Tv2VR994*D_O)UbdsBeTJ<74y(QPeMZ>N-!Siv18^Ku9C);C# zh{j~Eeiy=OmulXq8Er_1EtF(e)<`@zV+f%+6~|?;UGPKMWMFg-|vCI3eDAol&8>P$P=U?Lin zODKBt$OP%M!%7BMy<#ov5&dd0-N@WlUMRN;SH3K)+pa|uFK2P2F`@H|;2}=d61LQd zB9`SDZ?fhw4{7RSrx&o8mpRn1fM%M>P&akVg8he@nvl<3S0oGr(c*ETbbCk7QIpQ?kWsW)#OmnnlQ!FSy@@z+Z9meZQ_Nl!&Tk|FKL z7N&?TMr%WubT?4+EvDTBBbv0GL*NvsJ|VO;rZv$yNQDeV(g z?)qA7!*|H6*M0i+LTo7JZ!_)K^_sOfhbxu|twNeZZAHlq#3Yo)BE}O3(`eeQko3&k{ zSp2xJHRIFPmw*{v+}z?UOUq^p6;8Gkgheh0-0_mF8a(YH*eK&_WEi23U3+FS;F7!| zvd#Z=?2hhT&FLk%X6RCBtQ%iH>%RJD!o{<*FX~Jd@d`oVX4Uatg)Rpr2dP_L;4Q_v zf)C2N=ro0q6pkNO*XUhZfl~#6YKt-BkpM%Ni>mQcAt~J? 
za)R~?C;|yDwOjjw;ED z=hoSGc`BuD0P{BM;#|*aoN;Y>W1r4NPew{gJ@WpNU)A;iRi(8 zhY#WA)nrPM4%o*)|DpI}-FAw@Z#Xh*vvanJta%J_ui)<>bUTY_g?3fzC1|`&Ie90} z1Z!O@YKbX_yQ%tox30+yO$j8ScdV9hpRT4L>*l+Y`c7Yl>NpLJ%XR*BORHe7QkRi zl}C*1#9ei);a9THyh}G)B5dlJu1QC(LiA>S$0LvmueNI_T30{n8&k<8`b9;1e4U88 zdp2+bnn9VQpzVg?XtDfC4_tFm*@U;C@sZxvkgaZZzieBh{wy;4`|eCpc(Gt;@dL3> z#V-5HCH_0Jh7&9CFa4k^?v5vnXt!2z<%)sT{MUTJdT)FNAp&%Rx99O2P+{YxZGzdH z3s2=xFP6&rlsSABQ|87=k#L1JdmpG%w!p!VdkPu`IiF%?|kzmtm9#ou$8kY8p-Xu1Zj8BqyNKTvrG( zGdDhLbE8hOV%wDii&xlgF%^QRam;IU4O&^}=5NcVW!#_n%pUqgzt#|HBV(P!dPm6K zDS`OyaKMVSuWf;1XfJ#`_EXNZ&Wl;$?qyXo`ip1hCj|Xg>~7r~=L-3H=2oxQqL}Sw z!A}i5K_2nVl?VHwG>sz<+exa{DoI_mgc-PLY4ngde=QZ3Z;mZWUJKCH`q(Jyo$nFm<4bL8`jx=2QT+; zl&J(YwuV+#_V)J3m}E7~BxZVi?>VCP)t|P))XT@CLcZqr*N(g?^}>^1-EI2%8v)$U zTCM-ZsoVebl{GVc6xBrbD>A%0t`tyyKLo$ulj_|9r?bd;Vr=FHbEdm~<$=IZE+ z48C(3CCGtGKtB2Qtr~ujp5q@xo=o*=tvCq2MHD+S)4yY^?`0w>%~H|neIQ>B@ow^( zo)w-Y=X2H8e=2r-6QT5m=OE-4&_{9Lk~!4pB47Dnv6cGHVDlG{PbQFgT9BWfF*OR* zh<=0vf+EN|52hz?(D+{2{du34g2T{^GnBdlr;zPITT zPKsB)Qy*3FdZv1LB>Z;IIz<9SM5LfTy%PR5J?_bMY|~^+45(3_0xPe zB#F9_VuSJ#Ys0!Ij4o;>?6>FL0?Va^-VZFUY;vqzSmQ6LqdA|sWf;CNSh3-B?4FO( zi<06&D~db4^(G;a;p*rV&d;bRIXya_bR>U6zcjv?y4e(-Pr0d55_Cd)UC){*8k=`V zkRv53dnT|z*RWu_Z+dT-JLbAx_x%+l=R(~jX(|>tp&O&KQKfsR84E}kb^8) z$f?*rOn`cdIX8#lY`oDhm|3H6#=(B?!ckMM{+S0rAnud5xs6w)B z1w?o!ADXS?l1fi(An00L9R^w#TM`)&`#!4s3#f$_*bR_o+&%24zVqT)yukz74(m5J zQgMv$9?nzWy*d58?ZdPq~c&2Y@yJzi;*s#^*{I)9UhXeWfUsp8Y>p6N4~ z4fM}nzUht+LK5^LK5#G2;fRwr#lc1i;k+kI_P_@n{f6-thO%7&=XK1JU@h0cYT1GeUA%DLDY1rc-vR=rBDJ@+ zf>N^RQatl@GGV*9I`86;y;`3{2;;W0`;dUTfYjx4S-@;40L19XaXvRTkr#^Gk(Vcv zJ_lf+sdGCg-B$o;bBeq?{=IEW;O_B;ODR~SUGS??XVxlQfn-w+-vLY~3=**_$R}Bd zjb8}JwNQ~ArroWK-Pz()6^x5zH*uE8Jhs10~sbWKGz7yGRd8#A&`#1!mfU@_6%~W;!j4F3mG* zt`QcVGneoICJ*|1~Oujml@ z9Ub)^3`ierL*hy1Xd5I8L+7cBr{ZBh|%??B_fOy!u}&GVr>V>V>LuerqU%grZW zAU$Pl3%T+~qb@@MB$2TR&X@*6$-l%ZD$a@IJ$FYs&6pVU>ENINfSq$T$psxHT`F{rC@PDY%e>Md_#NZIHOcykXYg4*S=cMKeGoOUSXue0M&=aSHpicb- 
z-rEILBunPETBSL4W+knSyD+FNh$y@I#oia3{J`?R;9KS={I9HjgTM3W@UJQXEIQ&8 z9Dg$n83D@_!Opfoq8Q27(%!TcxCqH+%oxlcEfSehdn=?D*wsh;jWq=s1CiN3zW9eJ z{JSsylM*_Ur99x~HwmQ;){^;DEnSqMi@57QMcJ_W?V;Zkr5s?`{#K*^+OYkb1^JV) z`y0GRF4F~b^+rh+Hnh&tstp&1N;g~c{m}E3p-AoMKY6OD(UZ*4eP{U(QAt&pQ4ZfC zYo9B3`CUlku4ruhv}-@j?H>yDt7-jLa|`|_^VKR!241-p&@LwAe9 zZRHhgpN({+QX{4BH>m@S9=#7ceNN z{Gs4U|E}P=x=IOh;Hql{&F)rtD~qDHZcA^&NgrOgOCEFoYT|!5r$1cQUm^M56#B2# z;^{y8HM!sXn(h37-LtMViYoel9hh_3-0#ywU@mz~>E|H!_ipuH(Dyw~l(9Ue%R?_UR`C0@ z!RTHZfSIlI@VZZ`xrO+9w zI%9tI_1s_HOU^00@Foir51bUJi9Jop>L4p1!lbWN-w1uBFtW3!P@s!$0(X`&Q-!f% z5IunFx>Fs-msl)?vMdasj$y?^VcAwzwlpZO28ISxKC-$Ik)t?fQw6uCQG_FcB@r<_0@2pZd(x-zfK>C%gV zrT*WRsr&nIO#1a!aas;$$f*;y31?hWfC939Hc1qvh~SXujqy z5@9lUN zMOM%}i%EG43!%2!N+bR(6C*`wGiquw<0hT~^9k1@%eeCef}lb9u|160ZC<|6D9f9e zcwvXO*vD?z$@Wu7BTn;jqZ*?IW6Wj`wFeSb>^P#-3&bQxSWd_ZCtp;t4nvBOMEN-A zX(RZ1Xn@N#315uwHww$d+_9x)bSAXSs z0Xw4d`tw6%Q-Q8~Jy(9>L;w?D_$g851kH$cD-pK^A+hmi84FuF>jz}!IgGgT>Me8A z&8={c&kx%@yqnHGRB{14V|Z??kIge-QBg4lIJm|-*b^x}^dW^yO}vK}^<7T!>e&a8 z^eQlWX4Ek8WRw-kjPm~7hep55*emrl<8TC2LowDR zxaI->ce8Cs^?2j6w5jQS3-_G4cX=iT>59pSz6K}vbSt*sGsTZNHC*!a$&<_JUnGm~ zvW6CgNYggcbTA-+R;Y&}y;I4P1D36HJ>(Ni{nsb1Mr~(Uij6qxZBUVs#$V!KLnm;9 zo#Y#cS)E0+mxaQOF4&TX@-#El2Bua}lQQU|duUR``Eo&pySADE62tpEPe-&=Ca9UD zF!o%?OGR!KDLymykDnHWgv`%LfnuNGn@MBV4N4JjDLAlyDJ!(#o5_xaQjcYfy?)zi zh+vZzBol7rA{>DraC(A|AhFox5tNcAebqLrJFQ@1OB*Ry4R&j^0HKU+Rhd878A6q|Zg-s9|uomb@iJ zGP;~I5wTat_ikH0Ky+COI&3|1yKsqZ)ey>}CnFexVNUJULGn<&d7DOB!9q`?x2U5; z*l8aSdVScw=K&-znox;}3^JzLi=hfqN#M~WbFuOGX51Gj*7 z#q0&mP2T%m_61w_z6_F;lyj?H5A1Gm@{_^E$-Cp)XBN+RgFT&Xp2Tn6dm%9x`DF<| z(GeDX+Tq~MKDtyL%Dt<1{|l*;=59&RIeO)$efZaQxa7n0CZs)tk|iH(yW+We%uO6SXkKPd3q>bf&>~88TJ1oz|(X1puGs$wBnM`#^RasFN zSlCn^xK=eLTX@*JcNluVl2>6)+T_Lx1K{#1>+VMOA$m z^Ok0I3;V&fXb$u}S~L=)4DlXrF3;SB&vQYR%muGLU430S8`eF!-D4UmoJSV`eG`SR zvj8_ji4=kQwsLuSZBAVvtaU-=hsMb`)5} zm%L@>xfR;k_vF3YXMQeMwwQP65)Io!10g3*iKa&cF9nTrk6LBwc}2HA8W_=;0;5Ah z5_xI-z#D-T(PIXFlT=6}<1u5L?|Y@6q540yAdp~CBT`~V&3fj=e4we*4@FB}=Kqut 
zl4xsyj+sbiawtxm%y5XWM>@1VhNb6OP$-vBo$Eq{MD6sP7`bdi#44fcu>DFj&+GLv z(?^@#Z%2~)784Gayd$EnGSqLHjn@PQ>{h7M)q+EdPj6XlcDS#d#t6AYi_*#&NZ!6c z{ABFFDLd&a8b8 zKA~x-9Mw?5wI}2198hJpv3n^ln(Ej*peVO|>V7hB4DPtGuwNogD(&n6ft@Tq+?(nZ zWc9f6^k+cIIY^0kJV7|pr9{i~v_1Q&%8cx{hUZ3n?O)lSDUlNM55Ce3cMzlxKFxC$ zg8Tw{Fh4UeW7(*VZk-@$L;7OGr&7Z=!>?z{Z95l~$^3|z>R~fQWz~0}VsY3D3YNOz zQ_|<9LHtUaS3OF1&*4n0XIShR0y@V5lR@7QNh2w?IJ@Ix-dB>?o`sakqlU>e*0mW) zV>G5AvEt{^O26c@pD~Z$jSOMNo)73&HZjix8=Eii>E-t}-hLQv`dTu@?bC z8p6qm2Ti7jmHht@8=A>D}R z&{H<)3YYpXNmv|Up0iJ8ZwdrR1*D$xXed?2BqBjw$;!H*nrR9|nI3 z!msLa+tz6uP8)@@P>8(jJaW7@2kn_HeNW&MlIWd$f`A$-tmcSP#u{bpX_4)fJh~f4 z5_n@8{hEr|Dys3Eo}UZ{yuA4K#wdOZGOe#Yr)7mQY-8f@=^CnIN1QP~?qI*g1RsmX z_StDus=e0xslfg88SZqtyc!ce zhsi_OkyZ$Dbn`caPFYlWDSR{EC-KYt_;I!gR_s!1A8~5Q6_@nvQ6rrp?~?dbLOMi> zcFNh#Xp1W3UGW)n&Lf&{iisE)rA6&H6?y0~mOZD1!37qNhNUtU$llpzZ*}{f?O`mL z&={anot$(j#|J2ujTfIb^z}bxsr13i)#NeHdE5B#@xb&bxuOb+wCA<2>lJIo>yk~S zEp?oJTh?!UKoN8Fd$Pj*XjEZCg|`o710+TaOKaR|Q~_rBU^eP9=7EJ(xZ6tnP%2{7 z$0vlwz9`_V60xyH)zORmYiZuT>@D85-jP$x#6}=~sjQm_iNATvO-v4C_2zjUU6;DN z=d-2jz6z`RW-bj;;sP8Tg0LS;a^eMwZF1)pu%Bl}-Kx}=VQ^vJu{%^s&GlsM)oQ-X@q$v&dX zS|+u1AL)i^$~j&?_QXm?&2bbb8B^A}+RI{%OkK28nh}|cE#)piWJh^{wL*UbFsGD+ zz5$#~VX7f|^3k3l^fk(EI{2mRy-18yM6gIOF+ah(Ot*E8B3<)js`yT938oMJ^i`}w zy-RgqPX*9b(a00a$bYxP_ip+Oe64h&m$L+?#03 zrgB>Z^J{rR>(GlhAhYJaX(PgU`hy|frC9G9W4Il{rW0`0Nzh?+$K6m}JDxV2R&Jjm zxxV#qQO;P3#^ymPSuAzp2{97fEumACp?cUETS#%Na(aD~wGGTO!!e5<{sHDUz~%4g z=eHQY<}5G36K*!ncGsEwIIyEL=FKC*_)JSzW@}Dw(?Lmi4zh_e=mZz{B1Pelwt)wB znM|u#Dc;+A5K9aS0=VAxc4 z?Abo;+5#yaFTz(ho$>=Woy6COx99Jy%|;)+8oMc?SNvFR;aY)xes78Zuh8@5_>5M$ zPjGZtN6@HB{p5(7R8bR*g{oJV^C2{E29C0At8iHm4wxswDu085y}Yqh zMn1QB+R;k*MMUZ^Aa^oEAhOmiQr_qO28C|mnEf1c)V#K0QqMXy>Wh_pIJ;M*$GY&m zq}CyCu=6hRzkG;Mk->Nb0nd8 zJ;BJV<)atT_{SJsnBB@;zJrzI1<9Ot{GvgXK9$Ts)R9lc-8k;-nQ1-_jDKI39e zT+#9VZQiSh@MYe~9qZLG>%CNYt84zc7zx6sPmtQYmN8B@+*465YFc9{1c%<9X&5o( z_yrc-Hr%cn_QFMvD3((X27^rEM;7*9CFobzhe?!cE9UnwGpnvp*<8Lq1)Y*hX&vws 
z0_0gU``9u45z!(;v7IyPVk|=h6o%+2G4iXv_KOqsGr*z;C?S{n9z_w@%Hre6f*ip4 zyxQ4u#gMPeY`Z1YI$e)zK8Vhp(NHiT3AS}EP>xj*?LhZKV~w6;)4U87n6245earmaPKxE!*t1_k=7?& zZ%YF0%;M3RIcW@KJu?`EPX$FY(LQl9W2>;hG|cajM&^pnS<<~f;_cu-cx2H_0b zN5i_NItz9_y#=s<5?AX!y{+E7m3o+jgg+R-3peO96R>m~2z2!!NLLUtu$+=lWn8hcJBS+)+5z6Sn-`y$bz<{Vo;D%1B{Z4-bn)9g-NRgPG|6G$y3aaNU z-+gc3%ui)-?K~qEY|YIVvx_||oQlH72OosJCR*qs_f#u-LfYU((n)%WToX0h8C@%Y zwIptw;uv2ieotwt-FWOk5Ybb$W+o=$6I%S;*}}vV zm*8Y2-;@{8T6**&#itM^q0fxwamuBMx-*x-=MG6(avbEJG+_~+P<+w$Dk;g#&k0j+ z)5&(?iK)%k0&mBCO$<+Vb#j7lkXQmw*W-Ld%g=IOOq>IRI_<^8p$y%{?9)BnL_8Te zr1Kh`XM&j?*mv;5**?PT$;T4%%~Sc$L=CawL@uaf7<=Z3q!9C+U89UndcI`7rjDfsi4&jQ=e1h(_~pY)X?%p3yo*7p5w_LJ`q&jGA@5-t z^${@nmpzTI&d79Lw=zX$7kw?&U)V-hzLX|PCnDSl+wkSQ5@e z2A^Kq8?2k)z-%~Z2a)@?}dAFqGRPlZ}?-Fl9p0Fvd8sirAy*S}W<*<}yWqF(! zPdwUQi+66nfTq*eh$*{U4}(@&cQ3k8iLNF!#D6GuGxKmgU z9GqEK=ifU4DaeBpJyAIVQyVJF(7-(~aux8zG%$xq+35 zT0%{Ukl`AO5sgxLn|Dv*+S=OqJcMiFs@G5w3TMlitt!LeqOvBYNb8{c^YzQ0Ssd_9 zR#&3MiLa!C4bz^{@S-zjMt*LbUi}#DE7@Tw zo6&m=h%B%5D~8%J1$72R+6aOcCJ1A-*g0j@c2Hl9=H%u>PS_|mIG-1q1-KC1H;!L; zwqw`*ZNx1FJjR+;I(}jX+lsdmlELbFFh~j$HUSR0`H5`WaWZ(4QrkQ|R|Kj`b|9ah zpffiX)b43<3D5eeBM|$|qy_D9)jD^9h`mWJ?4cIeou3LIx;^B1m{51Z9KoXY*EFWc zDNJmVogz%bQH0#8%7MzhB6;SiC~@K~@(jj94|__)4kP?>HMfI1JWLm|d&*Rf7A915 zA9#RC$O??9l2tgVI^1yihJRDIvKVOg$hVU{KG5T>ilH<^~qo>gYpBvwi z?!3>#SO_K~Il^?S>}P6=)@!q?=Yu(EzS9fMnwgBqU}>_yUGC|@jy?~<27&Y4Jg(a+ zbS)l{%#T=*WC%iZ=cDZryT6O7wnQlHt(mBkB~sYy`DMBpv4|e>5Wjv6WOd2fzVKnz z;A9o^=yp7|vvW**s6DY$bLG`o(To}&`Z!E#VAR)6CYQ&|7ehh<1_zyTS9QDHAx@BZ zjrSKUSn$S4=t_?xKo52eYjefer5W2DNBNh1xrOzYHiZ>ZtCBw7oN1tKbK|u=R{rd+ zJ4MZuJl=WX`Lbc+bT#1B7Yb(>=d7im?g?bgJd@1&!-Mtbnuu9`)I(WN8GG|GC)s}L3! 
zdsYtLcsC+Dv<4v-fJJ`9lg3sbcj_l-G|Y!wIRrzs2=dfMcvY2szdMeH;~tunp;+sQ_fF=`>d7OjgE1zG7@Wrk!p|JQ!y!c4zh+P z9(H)VTKGJsj8+WYEyj>NM0#(uTc;=_`?&nscDzr}8xF&QjV5SLFKigt9ZmW;kjeLjG3t%f-uf*GI+D=keX=++9xmbT&`@-j+aThWGAh(TCsj03geatz$*M>BS5PF*1NbUIAV%h35 z>GLt_FGDj!y6x0){e(997BKb!BnNT&@N8eMHyjzoO?i6N8J^7PLW-sOo329z3dHku z=pT$dA``6cE7B*ff5v=S7WrbdXeZs>Bm%A7@>sx?$|HKBFi_3RAeC@Sd|Ms}=+q!y z8O8%=D&)I&we@fnXROI~-!EV0bI2t3t4MC6O8x|T)Cf)6@>qy?2W+Nz`Km`kSMK+% zg@#btN9JrMSM_XosCB*L5ilLIDcWQC@^vCUyz-E!Bv6Ioe}8UT9)vXyYlb9lb*0P2h6 zi++&$h*L|pW((%P`Kzw`p~cxqU!$J!Bm2$L{ZKuE^#$6h$u}a)0;e&PtP$ZJ61^2v z@tEr0K!=Cd6)+nWv^mFQ&Zy12yo*S!ipH^hC!mN^m&5VYOI95w2${R*Lm_h1dDBiu zxJQ7lZ4eLLA!VRJU6VgGjlV4GeZvd~SohTFtICwLR4hfT3r+~ci}h;xh+!X!*;zit z!7(>u@^Qs$-VWnigs*{y6&44i^?eJ86dr@>bEW)LClHQ59PLT9KR;DCbr=s?NC1NP zH;@np+!rIOV5%=~yuW zkmWd1TJw2*rC3l9@*g2BM0u{{<;-sG6qT7#2S}9poPi= z3rKf7DFQ;P25+eIXPPIy3ChE?C`2Rrr8x_svn={0G-CQvVyup`(Rg`97Aq@eO4*W zy-23od@TI$el060-tl2SYuff4ibnG`EH&(MqE8UBQqK?vY2?y8*br|yqidXCkv{5E zAEU1xXMw(vU^6V&#Tl4??%4SjBun)LqoVpI7m4Cii9xnIhx>x-hh!U$CiD5o7en_v zMMSLk*yyzOX!%qw3(I@8UacXaiM_QSDFQSV`J2f}vB}nzMtG%yt^5cNzWbXz7Tr?m z)?RS~v^DpoU{o>z0%`H$SB25AsF73oXJQ362( zDTP+N@wD-Cy2M1?%M6C0d_CO3=}#hZE2LhzHH6~fYZlo?SbfJ;de>dn80M+9u&q2!XLfs4U$b}4chxHYIs%_f zxmlzmDW*8ZhGl1Wx#lewUmV^&VtSfyt~AjXL&#;R5!~XMfgr>#nw7E9+;m?K+eR6v zaUmwE1Rnp}b5|9?E`oAA2^%wN+7K#)4Mp+`UDubN$8I1jgF)1kGkg-H1qzldO|L@{ zKdxyJA*~~o*?kmFJU|#&eAme4BNk%)Mlu`H;r-b3QM^uvnvFy-TCXDM$2h<}#grrr>YBW@^Z-H2~)cdZ5fJYO%q1B@-g-XiQM9T(^+^hENt^o`C zp==UpzR8<<*yrG3EK-U9?BY;C7$EJi^>s`kEgKtCS%Xl_9_nfWd@Q)4p~@s?-bL`M z+UnS?6`6|YHboa^vxudHT|8>pC!F^>hZ-X1>1nVIM|^3=o%wIHmcRzS9j_f-`dE|h z9S3?=$>0&c!kk-ry5zodL$>p(u<4$sXh(M7x;;`atL3Z|Ri^p!>~U#05AxE# zcM6Fr9R+1%D}(vawh8e<-M3@wLJvzSARm|9w6qI577sVNk#mR1!d zW-e`m5_3=@QItc~P&Gx25wk-%af(J;ZO^%%bAQ~m?w`BvpZn`u`&-|Sz1M#C-tYT8 z&wd{CyQRiUrhX&Zd;23sZ3G^)f%s?t33d@p4|_xh8zj_;@>HJ5kSf6$loop?u`hBrA0v(*c~ z7MLc6hRqa#UgVgscsQ{aFEDh@*hqJ;Xo(fNgDJ(iRbLZ~_py#|L^^)q6)bo9d9Et; 
zSk)&GzJiH0_YD$hsRLdsqgT_TYN~)faRbSl-*$D49f@LiK1GC4pR%C=duM_22Yqzs z&E&uDl}EA!g|)?vf9y|=#L8G>T8r&cdsLTAOO^c+3-H&vAnQiXOP@zfR0Gx2BH^#p zIYDZA8$-Fei4S-OrK7og8kd`idE5YhSm(#Zx1w|G)nQd;&r4Eu$b7!*nfjsc_9TM# zuOng4Bp3faTKUfPkF#D^WKXq~x7}jycH)DTf6PC<+s)F)ji68FUH;gW9(fk=VftVU z!^8w0e5}Q`yfG^X=l@QHQv@YU_zi}mB=ZYy)OqL<%Wj^SY}RVFR7tvIKCrsQ?e1RY zI$u>veKa7wggB-h8g17XTZU5z z|0W5ltzelwq5LrX)6VEzxhJ#v;>}30!k-I};!5jt8uP0})zTz!@-q5rx_k^6D5=RC?V7y$4cNYl7~F_C@}^kG{E9E?1cQ z@R|k2px&1zXUXFY@Z9$fqKfZ@G5-})` zkR1vGsNW)ecP+Bf>YuP*IF)p|VCbNcO|pFgq$cP~0%&-0R~~4naH+cx`2P>6HN$ zkP(#bCO0_925`~9B2s5*g^av%WbOIn!I0a^8pBfA`cM?hqDy?cd$p*W-j}DAH~nx& z_V;3TOm>fvYlz+ya@NNKa{pMF0NB|7CvAGWgBXfB&h>}%w3wS4ujHNDYW<`E@xH^8wWbD!GBmmcuFP!m8HL|Bq`zotDHaM?kacIeFyveIwH)kU!C5vDvZG}c0J#?gKPx6>)9~r3)Ud3b z^DAmmaACgHh7@`Ur|@i}>TL)XYwnK|X`yJo;j$G%t~%DF8UhQ-j!P43E-hg7@~vdz zd8haqC`jZKNaRUUi)-Q4LALf`W?|?m-BHW5x{?#u7v`jt^f=2pc z=|(n~yiCU6DX%G$93B#J)G3H(=aj0%v&_2qgoKXA5Ku&FvF%4zeCrdBuCY(dxX32@ zP_Tad4=wR{8--5o5tYuoWvPG;^N1BVAgy~R<=%J*4hasP&2mYGJXllaZs&vDlk!Q^1>GpBRah{t4Gv;2U)RcHD_J^5nvz zsaM^y6~gBeWGrluqkAjG{v92bu8hn`(RVO;GOudPvg(cd^lO39v@g4x-s-+PaR z?Ew{%`xqq!Uu$DJc}I=SdGnQ~FC+vp5bseBO$LFKH%)95w}6ijvkEpZ0@f_?p)Pqz z=CVmZWk{T)z|*a8Mh+=B97WYbfoSPyPZ)3XNw?Cd%`7Kg6z?~vC!{L>?N|d;lK=Ht z5^v7sBg=3?33hYoH7bhkDmwP*9i>7?=bVit;s`JXb4|I@#54dx7$S$6eJyKDG%f2P z~*1EJv>V{Z8NRntWfo*4^8a4F>9h; z;#4~(&ZGPnQai%V05oJ@2Zr-2gdZs4w&R^pGEoJ<C-77}z@ zF`*w)sl=Fi4luo(#n8t5bojs^0SJ~iTJCuK1&0CGu&aRg1Pv60|LzO`I tdYbiwVk@0u^g4>Gz=J})Lf051xT2&9XcohN{t%eUTmPc~3HYzse*$mggmwS` literal 0 HcmV?d00001 diff --git a/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..6c8132565a20ad31f6748b89ae2f20b7d362e7ee GIT binary patch literal 67652 zcmeFZby!thyYRc{?rsneq#J1gX_1mHr5ouEK|$#TLAtxUySrr3E!|ybqCW5Ye$U?5 z-p{+geVy~idGMOoV(M_rapxH0H}1I}zCSF1(51zt#6b`c5Fi8K59nbQBnpCug@uEK 
zfro>GLqLE>M8-xzMnXa+z{EnsCLtsvB_SjxCZ}SeA%DU^NlZ-3{fvQym4kzWjD|DuFf5kB zJGdt{*lZudQ{gE^N?UOhM-Qpk^=y3*5OMMF2?(j5($GGmLQ+av z>5a0As+zinzJZ~Uv5BdfoxOvjle3Gf@5fJm{-3`DL_|hK$Hc~cjZaI@$jr*l$;~S( zuc)l5uBokSYwzgn>h9_78ylaPoSObV172QPU0dJS{IRutbbNApc7Abrb^Rz81PJPv zSpSghPjX=ZazR2vLqWqm$^`-G1YA%U&@dFNu$aOMaPMrep0Is@#})}sEp0`hWLG@I z(X$;z#HHd`rapQU?WbgaO)#JTk!1f6?4NRhLC8=LfbpO(KtiC4YuYq##J{h12!xK5Ac&D^2OZhF@Z4=xPx|_i zhSWmS1q#hT(P!Yo1n1F-WRal7&)5*vX_x00N=HMr*;1CE(SkyQB(U0KP}!BOiD?@zGClCmN2mn&tgO^AlpRE=5i;@^XycH4ts%(bLl9WHcR}OTVEZ zx41RQg6Y?ekb85V-hgjY=Umo~hAR!9W?CQI8Om0S$^z2J?vseV4s+?es&Njh+5kn#_f7^$SA!tS zAEXphWSy~11k-7y<~5d>=ky8a{X|^@s>lUWD$uUq0)7xFm&qClP-JzUhHB|af(-G9l^7G4VNw5ap|)vp-tB5%{5QB z>yeSud&ALO9~pAl$9)Uy&CS%HoTiTQYHk|=eH_``ai9R&VhKlU6%BL&9@U5UkV@YqvGFJ>9Nip7SQAI{F(w z&HgtSW#OJ~oXRCR^u_IKzg*L&nYfkN8U^Bu$|1A8nCw>b1W@aGHE{GF20`pAaKV*m zaN8RAmF0dLLu-T7_v`vkL`mBQ;>Zo?S$W_xE#y`RV!Pr6Bdy3Iy6(H&(rKYfY4v9- zHt{Kj^}*%*a@bB}nEXV@J;m5z+jKwLDL#CGfURXlk+92hR~f&hccd~LV_+H^GZNZi zq%s-Mx-KF>yNQCBG`*)dk_(8ilxQzC#Rc>B9cjvVmPt`hbxZx9QDS3gNmT!Mi`k)CUc1H|E=C^H~g{BZ+(>8os@U z9|T#?YF$?K@5vTC4Hc-<E58bWje=)D8ja`JAoNMKkT}2j3)Hj~pnotD=Z5$I_5zOl@S;)OtC?AyDWD zP=b@a-9?wNl1HfF*_S?ybmkr^5`NF@kDhR_(c!hj(b$G%80eEmPi$)hSA}P3n{k)9 zI98Q%lrIH#dFDNb25R~Vq6hG#Og#!qo=4~CT05MQGTm_Rij zF=cWEFKg?*sa6BK(LXDqREMBbM=ybcCN^*lSlzv+(4D(wo^;q^v`n4m=t;fE#NGp_ ztZM&8l1Hbei+7V>pI{h&&=$`%I@wR~iD-d&=(&gB&dUHW_-G_zha zIR-|A;3>lxawItUo#Wc2CE|^Wm7`6+r<1C(p~}HUq`DMnG51PR<8LF0kc@+1hs?tQ zl}nF=IyJvk5I7wib{uAI=e}_|N$;AXOsw~(oF)mjt9z+oIsD1~0L5I8ZPKFX9eO)c z@Hzg+@V+2;q|epiHIVpN;k|$bCeGxDot?xds7jh95a&8lGbNWIbZIfd8`8qQ5}$<7 zr$Dp$pbD?4Mmwjgl^<@@7V7*Ur8f646DgBT8#XeM>5T@-`$Bwpi*2YD+Se#DC1q6Z zqn>tb=scOkW>@-o%@Z4f6INe`XCfBSj4BNj4MDal*{X-&@S90~8dEQGl*x+t ze1AU!)WSgZo*sVE7eb#v4o&vDCzC=4sS=5Jw`X!UKU8_|ArH08Yj2s;XwSG5Lbnti zKy5fny4QFd%(qa1x3o}MCut9$q}&IP2O|HE=FL2>MUmR%OOXbo2aw3drw5R?jn^l# z!%xn5_n!uVzXPp2fDl1j4ZoX)@-km-Ctr*d8Q-H9Lq32;YE!+Ih;^4GZYgwN9zdyf zA`hUJ=gn>2yw<;4rVj<)yJp{=(tmvbeWY(`K4mh|y@bQN1O^7L`oZgm)=2XqD)V6q 
zYQpb^wErC7hSxIUA@dzlJl;J-=RLO9-;(^7HZgFfX~|e}wi6k?bLqUt;l8DR0Koz_ z%1`%N!uDEzK?3*NOB6Q69zX(0_t!YXfO|9_THhhoao;@rJ5yI{-X4tBj$1l?s@O+pc6JoLG0hZR9RO-ukiWF9zgF@A3$R=0b4sU zM+T-`waf5pV)nfG>uLsMr!oPc9|qv*62Ddm{GjV2rDYEoMK?Yr7sv$XfHbbr5}Tea zt;7T0TXn8%Cp+_1wZgP`34_2JLOj}_*XX4@7N~5lQxm2l>1Ij_XU#w25~@WDCeAFG z#jEfLqxUK^UM_JA;n9p3=jJk47dY^}sVlAZ*_glI4XZH}`dpkSNMq_rf7n%MY~xTc z?1a>!$k%UfW0oh?Pw{4z|Jkso!1Iwy<>dq19hKON^Z3;*$33>XL4gRZt7E&`Sg%($ z5xhrRI;Y2U3M~byRl)T;mV4$J4Gz4>F#{S(?fww5>48+$g@z7*wha z&$ujqHN|PEFZ<=GpCz4au=T`bS8T6ZZILn&y1x>idIH8EK$|$j+`ea)Z_K1D7WJe!19ObkC&Q!+$z=!eyBF~JW3(%1UA|=xuz5F|am9;4Ewkb;{kY!i#^y zRda&7&&e>eRo!od67d2=ytWd|*P&}wE15k~X(>RZ*vmObi|)-nG_Ci|%bC&US>{f) zt6)Z*fooKi3U^F3uZ3%X3)UQ6)z%6R;w(N0_NbSNls?;hM-H>qMjtjf7p$c`=@3)P{eqbBiL_dx583q^|dj$?xW+*I?E6@MEwB${m3h)EsblV zLi4=AoK9$~!guzT)F&W_Z%+hOp^yAYY!+(0(6$SW)77^Kv;8^Q^SX)OYweY>iLn>K zeckkELtE&zI2q?um2RvUtqetb1AiYdE|OEIWl^z}JKop52zs6ky##OUDEPdZp* zP;aE?&6qZWW!AtVxr8$Kew5L+32pmxg5pr*7lR6R2T#ia z3~|eFI{NSivpYk9tBslw=2$Wsc07OBWh?$5%V<-Z?Gs)v|NcmY6?6r z;3L7)I=rvXW^5_18#gZZ;P1{>>m_VacA~6dxAK9l<1Dg`cYw2BGNk z?m(5XK{i|4IaYoo+CVb_K1SWZ0g??K`V-HZcaIzt(Wm#l>1KFI#77p6xZto#7~>M|SHB&?nA5(rN-Z_@0!u=08^%%X zOMN&JcHzx7Vg)d`_odFKWTMZ(+OVXb)U?+hK{%w_HC zku}9JtjSvPFn}%qIiU>A)bRmzyoKh2>w|mcD$h_T%7v~u#pg&u z^nEuBf;vur=wsI93(d^Rf;Wt%FMUW{3c|5~NzYl{O4q2<3K*l=zwOaqgiFd<3jlQl zNFnOuLyy1&1pv<^E7Npovwp(NVGn4dTEpzSdLA!h0Rr%UiLgTF1icLhT*LBf)yLgs zHsAGY*FR)LE1h)-eH`kNpQSHjQfba#r}JezX|Wr6#p54`6U4)p2DWNzAM- z5NBb|oat;aWW3<5Z01AO6-hFiT-~9mtW5*8yx0euq zI_`q_>A4GL{pZdNoh`u7SPx!c8Muyy?8e$}m@xe(IPT1Ta}z$bW?K z?E^@tCvfoozO_D!_wHF*+n~P5)D(JXa33Z9^WXjf!~qvqP)^nqCf2?WU~HwPUdxS& zX5+vC3qKEE8o^Ttu0N8;=K;Mbl8X=N*@x>)?*UU?=(K?weA1^QDK zFcoYyUU$(o^zWaqxBk~~LHlzSfC)}n3^b95W4RPQ@%Tsfs0G~q^I!orM2{}RTf%K)ZDLP#nzlW+p_ZSDd2OwXRvpm%cUf4v; zqIAIoOG1O38v7TuJ3(VXhBPMmC7ia8)+r)ASyBDK$RvXzj+Btza>IZ^&yBwvZ8N@5 z6Z)A9eN7P8q=E!aXa|B3>2loC9&r^5g%sG&)~+{K)-4p1w6a;W#UN*FTETCUh)VaZJB5ga|gMbL|Cx>P!zJBmlp zRU7zYt%YKpihQjSK**Pgv1FPDQy$_rCs-JxvEu4sVwfX_1hNoLFnbBJR^=E 
z`gRoJm)pMV!r-tuQLu015o1KFxN&R>5lKVzf#zAyYUvvCi3(H2ojND2gC=t+=PVe> z{BxGko9BGK&p=5CR`6@2X(d{>RYpi)Td?*62=e|*E{Uc@b)(5`b{P86CZD#H3c*1S zr)Q2yo#~jpo7>aKLFZQHr>wQ#WAsDjB2{+X~814xRAKD$W2lkAwqS?Ip4_yLp=iw!X%bn7K{zv{)RG=cU2A{+IpSsYL% zaFtlr{4LZ*2t-V#?`GD=SAc@j?XlepVX<9 zsnw6*J>gl97TW0N=lwDqHi$;0JLrWi_BLxzLJ3{R0D6;<48)R3Ck3EZB# z-!zGTbhab*&OIJ>k5Ya@zqG3EMr}c~w?uMu?@Be7IKp{|{RK%U&Kh_)uu?|qR)KBj zI*OanV^z%1q)tyD9fH^*O{$?d3RgeEKnvBgJ{~wLmE~IPs2b$u0A{kP+A(isx?-kx zU{t|(8=v?$53cpRc^frFN~Wf$8#R5`jZJ;F)lgB{5Q3w^-aqG8&L<+$=ISD%r^Q4v zc|&C*&Vr4&a(aNiJ#FW#FKu384c^}33p?le@CBiq?3C0b%|)l`~Ajj-z$f?ONG#(EXDP}$RDrP(t(}w z#%VC^rSI}WvwhDiaKp*GIQ-_ib(p8=^|ax#w%mTzU)?(dQTD8Q<}C_5fCe_Q z$&6`r^UBW_Z-=@9Xp4LcXww2f|LYhXbo7k9-$a^vJ_;+}@w{)opvd693 zS1`M#qdF)@Xse7JvGOyQBA4A3SoYO=#Ozgaef8ZS{UV-Qp7JGT0_U5qk>PBOZ4C<) zlf1D-IV%lCB63jr8jbG+;`t()-m2gAph{ZZ!d9`i5XwMffwh6#hvx*GgJusPbbk8O zi0Zoe;`ryssn(n)Ta|1Qt7@yuVG|(-SeKEiQ&Kl0U)?{W-IePke<+HSJWTdKMHedP z2$V4R3h751r3eDkrmqX#U?fmdH^Y^Mo z(a!x28ieSsisi@8o;&337F+HMRRm=k1V_ah)0|P=JDex7v|~wAv!;fNLq!un?dW7_ zCM&h@xbe~r4=%pmTRfEfGG;fQWS4%+QPVVPq1oX{c?3Nkhc#G%m#G)Dp zF1^%U>W8Z3pm#~mHIFeZ^JtHtFHUQy`l z4=wF`iaoC?RWfzDWq&6<$z*AW@4R?Q9wK~!V;iul{4{eYsqs46%Aa$&8N%7FEf!PD zl-D_AMYv*+vb;q1h18M(wheDzuB^@c>F^ z%+D^;ySbOP`U1pZ{=prArM3VF$oPNx!WlMOplBLv5YANRiVD^DvSk!o1bZ9W5j9^o zW>aD$trpsIc3pPg7kEX+cSW~KJLVKm(2+}%&`@*RW z$v1m35}{2x_%dP+X{4;|Ev`YR6)MkC2@SDh<|MzY2WlZ#Lb}NV2y6$QWt<$^35OT6 zZN>3asvYWj{+!s>rNG0X@=jAL1Zfz_4C@Ei3XcksTcGY~ZVAyveWq?W+F_6%B~=)R z+xqR)X(-kP!qw3 z@z;$6vGI%;afbc#92u-$s?|u<rtW}EADotvwZnXYzT`hFr zQ1R@-YYfSbTS*i$9@@Jz-?=t10$+NGmDS1w%0BHv80Rwxn8%5wgo8#teOi%|Rw2b0 zqCW9$e&b-n74kg91~-2K`F3P@fkCU`6%^cQd2PW}wejUfJy2D|I<9C^@-v=7 zIs@X#gHQWs9`rqv4lxne^ig${byd}BpDMmwNT)J}_x9-7;)gsNOa&%SkGrhg;&xsw zu#81MjQ5u}HC^ZZrFsGo3ccsu5@H~`Yt+aU6b8-NMRP~)7|3Wnh;jnPA3d=%p|4 zF0g2JX5E!=Kh33xr~gcugn+4LAiwp5#n6zF+=thP7Amyn(6MB&XclaKK6OqwaKJH| z_`D&vi8I)zHrD#(w)|PH^(~yGL!PtMHd40_2_Y+zh>}Zfur!C+V0dsbVby!LX>CA1 z8pa>7WRHq+S16pvkiNHxS?ITTCpNu9Npiwg8qp`&0g}ki`A7UFdrM0$P-r~!u<0`o 
zo>L%4;102<9oqD1(M;3UwJJ{b1E`~aMZvSwl-Eq+G(z;-189*m+EOvpQD0Xr=fc!0 zdC76ITgR&-eNUle0#E|vRePjs`US=4Nm`#7kUGCs_eOG zap&L`pUT*v`HW}?_T=kAVDXPI_9*N)PwLUbfz^{xwj zZ}zRZ8z`-V^XkwBnXc%J*5%o!TMoHoY_E;S^4Sb={Y9d6+89O;8p!&*Z18f8? zok>+!4xLAeAwT)xdo`WvWpOJy;%$#dpQVa`3( zWDMtNg|BFu9y}^jLaCJ0{!`4+7E$1$loA#v~h4SyY;9=xA(0hOKkJ!fF;p7?SHl|+F)*8%leUgj@a}7 z8jYD8pJoN-Icn~g51))d>=r&bPu3wDI0A8+XaSjifMid zs*oICW%qJoi2E-#FGzqg$nI0!eeM&Sp?UKr`ha8Sy<3vamI$_P?(-<~4O)IK_*Kep zDzgh3fj#(hFL6ez5?7NXq?7kz7WBK%KUb$v$2#||7MU61o;yCVv7MRSzfP2sF z%dL4v4$NMzv4R}}tB;m787+z3melzTD|+mn{FT8T$_BFWIOjO_*yP;qM=j-q=J{*2 zlX>Cf)+umNwl!05{uY|&1^aAYy;Y1v3Vu%x@fo{Uza8>+_zqhbL|Sq$3tw6`ek>)yiHJO48m~RdW8!j@41s!FiRs1<~Ic zWIco`&bKE$-VUyiugbN~(&@^$65oEl2D+H77s_PZNK7$fZO_)3F|AhU45b4*hqu{u zO;Uyk&_V@Pc}QQ#kJwwIn_DA#8PB~QH@(PWb)b{ZgNPQ10sB)S8$?L96k2~8Ds$$j zn@h;F6gmAeB;+bMN;B9^e6~7OwOM+&ex4LFjT^7s8C?qVB!YI~d8h>>OQqTTX+4sI zj8S>DkGQdNs2M#gRVn^bgcQ|vJZR-~=SBt%@p6i<{&T7`^BG*Yb_*SGtf=pH0sPsZ zv0B^M4j9!xX9w@vq0U(cfe?(;RXtBvHD(J>{^8qd5r9J69I`GwFO)O`>lxWRM9AtH zO2x;rc$UJ$1BMz$Dr8l4Bh@CTOByR~2*E_=pCeu(6W3Toh`V&LUXDOF!8F%R+!Et*)UKiP!zpCrmJ$>sG}>0Rbi1pmf~zMc>^D=1`*Mz==%O_{Q&-OmZ|Jsj=iW2?iPraJE%l}b>{`vs65iF zq83t|RFGlU(ea(fGnBiyAW5s#@2{a;e4($6ylJfKJfpkZc}4Av^L9{rn;UKYr#P-u zucw*=9P&Nb+clzAR#@3)xnt_PWLVn1c}V6okTY^D!kbPtn|#E#oXjNZ?iuVI&Ze4@ z)A-_e-#pNzbM~g5{yurbURg=0R8~&9zBF=QOJo#lD?Dg?@GicqjB|p5j8CnH#bz4& zpmO%=ULqMU;cjr%>DB4AO0c$id_{aY4^YadE3oNXFlI$lF>g+nyiEHZZarWt-~VuZ zz3A&IGsf1U$!E>iy)ZLht`=azv;02}!$4yX<(0s4k|}TI!8WxQo^u}Et8MEX3!rLwncA4{j`5(z9fo|Rk zPXAQLhJ6Ig^4Bo;N48x?Ai909<$Yy{SDc>&wOt#s~ zzklc-GF!7!-n`VDgUoL*EFZ^)5WvyW)!@3-YIC%`v;jPgmit_f6A&5W3q+7FbO7)m z(erxi=c1dP8%co37hAHLH(U}9bsy98?GRw`1lV8T(&p`X8Rdjb%YSs=(dp1qfhzkMh&Ol~nEr!xODiY2)VFgg+!bjz-ze z%j8k$PhNF0LEK}Bk6LPfwArJ~%P+7LlskXrT?19M!cnn_v) zBJ1);2pK99M|QrUc-AXUxA!5Nzt`HD>e!=aq}~HG=-%mal+cB>BkN}E9;WHvrvV?; zg?E!pNwf3qTCed{d;YC(XIG=mo&R#gcUb{LB!MZR&3rHT2axdtsKAUU*=GSMSyMgY zcm;pX?*{R8y_`Fw_%2!TNWisLk&7ctufF29_#d?q$4%3o8cZAPO0jKfqqjVT&Ib|| zlMPZUGU56zn0<9t- 
zS$%RFD#*-zlkhXjvI1|of-^L8q@QxLguOM+GXI>`iKDfsL6=UH^8zqfXu{U(Kcy9U z=m}*TFC%?6q#Q>mpM|$Af?qSat5Q<9S$xU+Q2wwPN==~s+gPHX2|eS_S>@4t z1ZT`hu=X&l-y<^pw3!Lsl)op_J3+PnT4v&Z6@cfnj?+9C?? z2|oHl3D5$*0|-njVwDPUzsw|Poyqs-tn%oi_lrh)Y(P?EAjxcK4Z1dpS^)crJ&Fvt z+X>V-X$8da;-BaXrPj24Ff(w-Aefq`t|3xqxoae~`9>Z%Way7@J#;?joNjeOx$~Jf zy+GFzqqFiecz;x#+OrKrr(VFO{h-;trSn$V+}1_k6UP`CoOkDrim=oD+%z=AzQ;+= zn#iXP?B^AcoDKXICJPtTnW)QRmgr$YXyJaNKCk66jrVYh4EOZ=2kfKXaM@m+%gb(x zaa~R}eD}jR<^^7^IA6OnGaH#S`?rYDfOGo2I2zmo=)3X*=rm%b&h~(b&d8~Vwk1Y@5Ro;Ai+k3@)sc`P}080KUL=Dukmx_T@Z-AF2h6j=( z9zdVS{@wq;JhOQ3X>c2ROXo@p+5E>1?Z0%XmgY_VFSj&*w4v+7hS1$T0FDFMJZ0{& zJrW;4n;UYyUTp;mFL3UM9zZ82G;7Vv5`U#Vv0NyHd5(QB-jLJ%Xmc=?Qs|la3G0&M zO1+qv7-9+x6iAg)==Rk;9k_c+;Av~7;FzT2>itz~QCnGdK_+wToW6{DpN`{A^U@rP z0By@@TEN8j)dp$Uud8yX<{&+u#EPro9GfU$`%z|I+HzWC-aUopu{67i`|`X+tD`-~ zfUo8jqA>C49jEM*p*z;973^53DsdZi38`MEi7wJTDT$`$>>pY!r|D2*0rX05fBixW z`d|Be?DuDPkiS0uxc#4hHr%X`brM-a%?ZCliJi4xm*Yr{)a&|6j``m9H5!g@+GVlU zWySHCg$e<+jZJVL{nu2E${uW|}rO*0B@cr( zT!aLe$)-B%447SSN+5dKnBPwnkLZsdE{)#`EeTuZpxAXd8LLevgfsYFr~n(njzm}i z*E=?IH{rwc*yTbV3;X<+J{bKN{XF+4+=}LkWx0XD&+F8b4KBP)fp>_Gb-7x_uX}zQ zaf#vgFQEVJ&-H@|vM+T{UkDx1{n`wVNmHRqtwEnSCNq0F?spS(TTJu(hutyHXqMT@ z_UN{qM7E}!HFPef$o`7TThI`t{H0_f z@R#2CQSq$LL7^9JZXQBL=PRy|V1#@yMn0|lNAl85G(tgi_c%u5xEejjby z(OpPB&Ib53mTtsVh1gsbe{82N{H^keUftkEv7ulo4d(0~YbT)>V*1-n;xppS;$rQ? 
z`sK9+xbph=>y>B42VDLb1{r3xEzqkpG)V&VBPNCPn`(96t@C;_QG2Pg#eRIu?G$_q z*r1o=-~Dz2p1^~vXLU4pKE=(ahL!ebe?A>a0diFeWO|JA+wKH9iKK+@$P>}s+bUq# zuO7shKg@HrB=t`AnIQW{3nJhrzh?CE=sA=6XTDtTD-x4>o}cJV9^@M?rf=d{=g@G` zn?2i@rJ@48GoQn>-S;Mp#v@!u;2i!60M_#FkT5;v zlYPSWGCp&cYt8|w-bN_pcBi;VNa+FeCAMRabhyCPA)`8k&&oR7ZvQ!Fw6)*-DQky0 zs?N2?rq1Wmv@{ToIEdsEBeu#fF0JJh0=&vaJ#u!odLwwIqH-D_7ySUr-qa2@xJ-C; z|NU?%{Vg6N14@Zk6_4%&;!uHB${1TA9-SrQpcy?@>vi&Z^T&$*wi(BdRQ`9BSDI(A zRoko7RPozd2AMe269E1+{J(zP(eN@eO>swN|&OcPNU~U_X^>|dh~Up{E+!MOw}ua?S1Z}rAM5F`{hnX40})2MOtt5j47!l1e9OjLEtU>fFb= zUT2=LXJwqlDTv{m`2Dqm0An+a{pj?pJqh{h>QxHuuGLeW?89O~A*_8w*{tuaPg8TG zC^Y%vmg&kUesrRuE^m#jRvPk-_*Q&f{M66-Jf~JEBrt!}b$~8+ zHi+oPU1SrJ-NkVtHc0Yzo2EJU`iymVGmrS4>}nM|MB?sDU669*_T{yxm)T`>5k{g4 zy{0xNH~-fwkt*A4fg8nWmFNlG<$I&3vTNJ;pnNgiYId)WmX}E%$GRe@ET7~zIX-*N z7*a#Bb=L0udRTtk;RD{6hWvd}egaAiZzOS4K~=cOoo*VOxykWlwt5vF8)pZR zY%0Tn!n&Dj7td-Sofd5TwH#SXhSO=vd1{jZjxd9LJ|EMyI5c4T^ppvn9JI6=K4;6W z?O@Bh-s|5Pp0?vy$HVpFD;4~p(9j|i{u0bKUt;&jlyVamAutm+7mrLGg2m(QGbx)e z`3Fv*ls z1O}Z`0@Qju)_eSySCM9C1QqY*N2>5ibhF|ZeQ%*rCi>j}P_ooyU5IC7^^WN36+o1d zDw+WVLJA;|MhJ%bE>#jcr6pH|4_m-?;7@c&`WI62m4py(FnmNx$xvfKXwo3>KXVoU zRA3-@PNK!U9{GvVRDLc5Q{X*kEH@au06O*-vFIo=DGqzo!=$yR?&HUswdYr!mHNNB zE3)35`iYpzaDsf{NPyihFheH`G6R)0ut)V-t{!w2XwpEBkadeQW$eHv7>nqUlZ|r_ z-H5$(tOII*$F$xxMgQ*2f6sBj#(xjPC2*2ED6|dO1`ikkpy|bX%>jJag10{W( zY97H;fAAwFRHaxB%y%osa*oxEah{^7)ubhf1HCjKLTeH(Dpzcf<&&vze9tx8cAV^y zlDws+i?$m--s*_L%&z5E8>o*^3Nyg1xDd0B zB9qNbnkoz(ADNZ&BeVJ?m_`E5e}`t@d?$^13zYQ&Xk)Y@&;j9P*9Ims^6B9JNnJ}n zY>C5Vf^0mI;~Cjij!=D$p9Gq>+aN|hfEd<@N(AFJceLL&m$QC_nc#zHwPtxs25doc zA5TWDqFu%q6(@Js1geV%GH#M!9G{a$M>ArD8tIF^T%G||hq@mCh6Mvr3zWdhTa$5* zPknGR(;m?ht7DM~UZsQ6x_)!{I7F4KPOs{jD;;Zq-CbDDKc2sEG+R`negOzs4Gf-h zKR_cY1>rLBNuwY@O%;4cb_mB^x~PJ>#hDb%gJi7@R$EEQ(h0wP_D^($dny1u z0YFJLk0|M1-Jj#p{pDqtj>1D~u@|dn;({LWE1aqIBgD)H1`ikm`WMLjNB{u;(71nq zOjg_=%+;SHF#5Ng0L=7MO#rxW!`>eXsXD=YjN}%&`YufY);}D#^DN!EbAZN}8Ssti zF#u>%sssAK55w~`>reTLnGr)PgW 
zj_glK2RK|4q943@BdFuJ|C3?=w$pEp{m-QH=To^qWCT#v6U;|cm6-YyRV6+mWBr?# zI{F}Gr=V}AEx9F+kRb(e<$BaN+yMdMq6YxlCO`3^RVw@p;vscUYic~LcbU^4l^@f76xEp({X3rejbQs=T+Hy)!v*Bjcfiy>vW7q3)6?s6k< z7VT1z8yQ+PBxzns>%8sgYbWw* zk~6#zk{+(P{IXn=aa_2c*_d$e0)8=ub^5$}N4>5@v%*0XA|RRZcoljuHZlD_w2sb>QLaaA0(@huGj5UzCRh&j#i&aA{n(kbUp39M%k?lC3(8!36 zXto*IoXvNy%Yd^Fv+i{_Oz)DPKY(y;LJK0j?5!R^P?@}mVYTLVcKFe6N(q)Arjx$n z^E>2md(w{oQh{}3t!=baZy+?NChDSv8 z(1E6l#+~D{=3oa|k$13PQY4WM%gq$wv2Q*Osd+q0mzp~B*i=8V6($VowanCGNpI4+ z{eB&=|8wfjJ6F9-@bX`KcXnWWi)a&?Sag4-vq%-9DSHSk=rc9rQYvD^g^rHw$fLj3 ze=8*?%hVu5{dE59wAvuND0wh^L2F>h1+BhF!ot4O`seE6O>Qeal|F;f6UB|FS?k8<29Kqx?;|w0M0)NWBzcPaQ!2B7-jdpYL+SQRu`XzVpJp1?-kiIR6 zB$-laYmgrgVE&l=i>wQ=DY)gmA3*P8$sP}3fRvE_7+5~)PC-EqJ%6Nx$1_(}g2xpL z>e$Z}%kQPmKTpqohrU$?Cx%f`x%%c+ z*<3!Gw_08fq6C#6@HIQBd8@A`Hz(64HbP%lB`qK*Qhl25C}`0i(909?VfRLna>(B9 z%@xg?@%U1-Fruj?#?F@7I!7Q0eJJx@g8U4}XQTS8bf~u0vgiTy!RzHjNMZWyLWQSN z54`h9Vo^pu^2HiqkMkMhJIb0WLO%S9*#WBppge}#z@u3RA?!UGXK~tAw$$j}ZoC}l zDs)3!*QTC&2%oL2{`X61Sk14@I%iK~_E)q-J?NOGqQ6QizlG})Lf!aA!>&m!+o+T|CE}t46VuQ3EVAWTE@hawQGXOoZQyIQ$97LfEDa`0!+zo zhT8b(?GqSN$vTXOH{AyrYFx1%ru^sfM#B8`wJ9Plb3*lz(^YOqI*r+8F}#gwz&VJ5 zmWWpX*O0*HZM4#ybRAB8huS1EEW}Y+>+jWK*ecyG{u77-(R(YF*!@YP3+u}n(dRvDs8?`Z{VNKH*KYxE8qaxqeGWW=?zgUh&E;lureNk&DPD~a)SQQ$jwRNtwKs`uVD z7dLbzRZ=sHUEikykNHRC865QyCIXz_xZ(#pn=GwP=eP%uVTl(q@jcGO>@dk-g(?Qg zXy1pCYacc`Mv*D{J+AFBTWF=WqgsIv(1c4(c+NH3j{YKb#~o7&wAF?*7vUjjI;)}W zTWUPC%H=n@sRJ9S>NMxEE1}{5;Ihm3vz7jxdwSoard+22Gx4ncyVKClRiBduyFORTL%*Jwj#71691{vG+kaW^m zGT}(R7Jsm}XSqjtyq6MyeZOmh?J1I(mX}C7K1SP(6H**#Iu*TmHvY&dMT7n;v;EZ) zwzUN9NtNY;+gXgPtkiQ#i#J#G?u%4SyQ_S)UA(HATFB=yCyRr*gUfrUvYeGz<`mBC z?+1*IgFdd3xo-k*dza>cD}(5(3iTURRSCqA1%w9~RZ7PTDtZWskR%z?j20BJ;gQ}V zFF7rFDjQX*Zd(-pKkU6_Kvi4!#*3n&qJ)a1gdj)@(kUt}(p^f!7LaZ>p>zlcD7EPh z=?3Wr>E1|pcWv(k^qiyTJ@MZA{=eN1xCOWDHP@PR%(b5J8_zI&3`EKY|MXaW^6$Sq zL}LLC?-$ntP;~;(QpzkmVc)q7f5|Y5TS#?YOzi``X8p)mtp+@o3m^qO=A0jy3`J7{ z2OL+#b)Ub1MnO_z6D53t!|s|-GV7E2_}7($INu%KD-_>|YwzvJIA@K2Jl-q5UuRKJ 
zXt~?hWCGzbtDdALdxa}LK_yHotf>bq7GAo^;a;%CRPpV1QLCO_Eu*#{l5HU9?F zfPYkR2TJ#tZ_K23+9$M+<6;J1iNJCud9p%$D+m@9_YsZj<+Dfjdm-(OglW1uMB9$~hsw*?$F>7mhEO-!qm2E5C2DFB z!2hL>8Qius4GpUor;`vk#}n71g}EnBnQ2Wgg8(Oj+e)ct)x~k)oatTP4-r~^5f+O> zMSx1juiH`TEj^Cfsit@xIHEjqHz$QF3jZC)WCb;De+n*Hv{_HBtk1;zJO0+OMXR6o zW<>#zZYV9@J3&-sb;a$O?fEj@kxt-mkb!{JmyY`id_l`q)|g;e5!pJ17#oDvhH-G0 zsN+EnR`NLUs8OE-H%$u_R8^%h8;~V1Eesjls(BICS6b@U+zcs>pkWZu(xLD<7naYK zZ{9}A)}+cA7qYf{^vvFFLhRW_2txIZV7Fe2|KTFmfIQ{e!sz7MdwH{K;!vhgQ@ao& z^r?#$7PlNP94_vD9RG&o26J#c2D36l%;iL=2Vd8g?ViKrzN~B=Lb7t2UPq}%7FvF> zT@UEda%7&p4Mm|dq`zAvj`3lL1yoL5EjqX}g17BOP4%8c5-wDoi5ICzDQ4hzkBdLu zT8_V9{TZ)*cKg6KL~C-mO>$Sf=-9Sj)6nALa*m2p8^M=G#z29pVE61>`@c*rhmJQ{ z=AxmMYrf6HA)E=_I_!m^^T#z?#~0}x@eL}8_DJ-*hvDk}M?@VxOUaqWPuJfjCYYNq z&Z&#_cDv;keJv~@^r4Q=Aia0b9CD@JU>(Ft*<1gd;Q6_5150fK(JfwRvr4DoLMGMP z^#BQpQA{E0T^ZCSMa_Cvo+D03*Kc}K#;lr{LQnmbCSh@h?T+cIq(hPP=-KALN=sd- zq`SqdYFGuZdPhW`?4DvS4w<9VX-TQPG4 z6ihX)JtjJwx@MYr{tH{}An zJq;E~Jej6cyM)H7W&Ey2u^&IIQ(UL;3>ORIrWbvq2C*XDWtoiEJY1cN+fMi8{fv!4 zy->&z-ZY^}5^T(O4`ULaWmk%g%Y5FeI(}x1mKU}?cA%T_rsm~1I>e`w_pODaah_fI zH>A)MsyU7^uK}?MMc3hhtVmHa1vK>#Eyx22YtFa)b63+iBq~+*Xb?iiq*1i9yUG2|4G3S0wHT~y8 z94vQW^uTS#l`EF`c|&MMkSt0eBB;MVY$Q{o;qjOJkMx-J3&aQ~F=*-((jF2? 
zRrz;gMRZxp6-Pde3=Z0+-815}v+T{FrHv_c8QN`;_2*?>iF*|h)Y0Z2ja94wC-mzY z;(pxK-VE~YO2Mu40NSPMP<)3#i$?fbQ`i45QJ!d{jJ&b}eu+0=zVkXpnL&7BS7c1&s&eH56 zC<}Y<_(}|4xDx}5(^*PkrT*#^!Fs^BTaJF&WeR@f3!lnGq3ksz#q#`(Y)I2j6Re3S z*>0U%km$f_y~1@PK3gaz@ac=@&!9>wwz%V`{E-YVdI3~(L0xBf2Yne*nZ0xvCI7*3 z*08j2l0eR}D6B2Z%bptC1k860LJfqoNpkfR(Fb+wXY~~b+2+`fl99_})4~_|2g5%I zAfy5QlaxU}Ok39A53r<1K%EM1Pyq;<>~D(O=I?@MW8w_)Z-S>2MQk*HDI@t?$?C5F z;ot6keIU_g zAlbkx*D_bt`i-{PE$S}Wuu_GaTGI2Ymk+#+iieOj;vq#QcVIE^{VL&_XMb-%sySy9zaW zC}CdW8B(@%ij+A#RFd}gkx?%9)#t0eN(-+8m*^K|2V5b>Qq*K2z^?j-F3JJYHtqCu z1DT4)GW*?&$rFzAXRPiwc=h(e4Nu=zG929bu{B zUS%Y4d$|(%A-l3+6hbcix1S8)l#x>2M-{P2XcGjzjfQ--j6wU~}st4SDPgK_=I7vPBqEh&GcsBEQz7dFCXY8ro zclPuGY)YRpd#$IYFkYPmt+TM)w!tBk{PRwN@LbtqPo?r>Y>5LTofp z1Ovoci5o$B$Nsw_n44Vek@o{;{FzSw4&EjlY8QS8IF6-VIVTHe9Gv$;iaOzE@d41t z$!-tQIu-8$C5_hzB-UD|dOjC^@JT1gb_5B46UD)x?Y0s#j>AbyMjuNu6u)srJ)>tW zD|1g3(UYn6GW!%Mx2Y!rNgxZL^i_(SLb^+>hQt6geusMH(m5vlpMu%Ra83?v9UR!) z6>T_U(PEhRkMzO!V0IFhknId`Z;jOAJ-T{*5-c#=FZ70@9NvJd2TJ??qeYkqQp&`2 zgW;Nz>njAI%e{OW*puJtJIo@B{Hm#^2~+)3hA zEur(CE#1d7(oLaCjeRS7HTnh81Ehr1zZI`%LQc=^nKYL4LmC4>w!jY{`;x8B35DKv z92?82`WOo_PxoL<1cc49NpujLu~Ut4iQXcK@ynmB$v4rbYf{8G<5KHLHCNU97Ld(2 zEkF1*0!HI#eB}Swm&%+tO-PqM5&5r?w%R8);q8A=w33o=%(P62RS)wexl6mSwi4qXW7E#ImM5Q1x0bCo%AYe~-*?xaN)^EF`tv4nM#ukBs_=6H?4-N_aQh4- zC2vUBKO3pep1vS4#N)P zb+sPt=H_|dd6F{xJ?NZ({$Rk5Z(3YN-@Rh8=|DLKrZ*nQM$Oy@6=01GDqdjKX6+n4 z#5}2?da(?`J;>n!l^sC)f5>&AIsQ_!k$j&UFZd~0O%byP#9*NHmaglZCEMBsrvj%k?O0l1onXf;mwdJOlU@^lr470G4 z(PCKE#(B|d%~`~L2mMwQX|J(){U>E{S~(jN z8=AQ(8nUr*Fw5UZ&ol*b6aKs3ZwZV3%iw-fGQ1jX-W5s)xakyR* zogy|+h&1NHHt0oA{eEHSrn<0(d5bS|l8v{@$Oo~~?2&Q5%cLe0*)xfTOyR=^OJV|H zV#~*t%v`+JGx8&L`r0_TSkVg6Uz?G(RMFI0G!px3aF}NpG^|Z=n$OckZNOI`gJBbj z8Gafxk5ovu&OJJ!*2pBfN(ZY3500rcf2GH?_D=N$Yj_K|w3e=HQj`U_JiWxQala)f zs9wgTFZ=U)@ca2nnJ*T%Un?l#>(k;oEoL7z@yhbPRa<|vW3s1&7~0ufGJ#j!iygV8 zqwulQNr%`z+BO2+k>4~7>YmhZ?`zZU&%9TjyFm`z;R1*HZlKbb8luo2wmKozIGF*z zeZ9djW?eC#NB7Jkle!}LZ=DmB?93}ym9*eyY2Vgl=QBIsc{T>Yka 
zesP|P#+4X(AI+p@0k*XF5*J=qVdY)9fJr7P5F^DTks57|gy!ZKEdLv#pCkp zWLNvMxO=}LP2}$RLt?W!;^!_J(^r<>tNMl{k9&XyU5!z-rea8u+UB{uNYCFrv!(Xx zBPZ>3WA~v82fXgS?Fk{h+w9k;7eCF~nPLy?O%fJof+2WO<83FX#ixo|9Xle6G*_Kw z7;nivxpqo@#5(=_8m}Ysc%@2KK$rqp#2bIaj+|N*XEj>>h@aifwv{QZx_)tVB^G{X z4c_0wfC#b_(9BhrUVys*9uEU@bFWaPFnjGi#j@PAtHHR}UAhO&I2LMxS==bL^ITW7 zA&L4o6lmi^4t&rqt_KB?eK;Ji^c0tz-)=Lr6O>X37_QtW9I9$S5!Rqe>)2NpXQvwt zb=Vq{u0asyt}VL{F1rd-M{bS?x3nG)`a(c%oZnmkY&7c&y^)H=ZKI#gKy=D?r)huS zQSrZiHQil62VPLbMBlxj=2p1IA@qY!bzZwS-?(wplC7P+-gcsR_$Oj?>s*Jmop^7$ zPVn}9nkX9t#JgxEa&*A_+%tpoel=WnQ(+~mnHdSiLLdq`8gnbjKV7Wv&WnmPrIzw~ zi%L!Y^rf~|Ya6Oq__N%PMr6WtbaZ4zp)D7Pb=j+DBhdRZG<=C9!nV!lm)^~{BsV$S zYYGqw7S@#+>?iTsp1h)JXw#Pwr!Sf|+LD;vvl<^91-XeYk~%fzyC4|cvFnN41#U^D zvVFDUu(bCg-MlsL1ufW(V>B=?ie57iIKM0z#i5nPI;oToi=AJaA9|&KJ#di8Ntgze ze9iAsIWCucCURG9gyKiTL4@Z}(-LxV4 zs*f#Ua23;U`;UzH=$l#yw(Hq4yX!)3%dgFQ zT0k8%8P#L*LK>YZ!W!rOWZhUwgBtjvdTZu)!l(?s46vk4?BN16m zx%p9M7ndR*$u>_PbonU`*F!cV2wHKN@A=*n!i@0#H>3bpu-JC6pvPoInNh|5KWPLK zDB{0rQvdDuD(YU&Yu}JO%vZ6VUM_6R?Rf(_zhlSxY|yfcX(NaF@;W2l zK&ckyt5rImna?C)kpVqM63Bx8Xv}qq!rT4{_S>U}HmQxsT#y1BP{R`M{#LzXt4(SR z+dTmsu-g~4z=MRv@X~qwG?AlsjuGge5Bg|lkK|xCNwSG}{Ks=BV23%Qu%e1XL0%QV z?K#=Geo)j_Q|~?$oIo$0Q?cv_$wrlb4N@EJf7nVt**}>GSH0E%AuzT{6|cQ5v9aPH zzAFz;l#-O!$3Jlx0uGGD6^NRTG?^r zHwF|piNzrOf%*Dj~3(0j0=`UEO4D0`Sqt0+C!G;e2Q8+ZOZ?o42w zp1=0CwZSNEBGU9=Ea{z=7g9}U7?}WXB$D}-w5IP}UsB)4u}G;XFDQFXEwulGCHAk$ zXYfD&3tOVU(ke|H$-g0aS%G2$&+h5`@Y_m$9Iq&*%(l1@E~U?=v}C#E#a!jPGR+SI zK4g;Khr{34xw`JjVLRtj&US%1j*g>e<{x=lSfHMQBgQdz4kvQdrj|PqI;bPblU+(j z2cTjNZovdXx;o&Z_k+DnyX)t1()k zTN%T}{2JrRNZ+=l_^Gmv7OmX|@#+$h`FMC$bWXE)sk_UTArF~d z`0ssDsSgaq@{b`+P{>rTS88j^FT-OAY&g`F;!(jDXP{UYazMhJU0;@Dzh75i{!*_w zU$2_iuWw;2Q2Ar7b&F(wBA&e4tB>@b>y~y!Hxvop@gX*~2rLM!X7MWs*YXox$RbKd z7ane!*s(GSL8EsN7RthFs!<*V=t-?>D$2fvH&LlMSUB@xJ>HuF zJm0b)_{fkFNui$4D_JsQt}9Oh`Z2$LU3Y;85BAefTo+!p@W@zUp6XnJUte!3VC-rt za=TDOjKs1tOVO2+vG$>Vk2h=YlDqp#e7V=bqInjCc9=O$=rTEXT4M*>XeHr5hF9|! 
zy1PZ$R_;oOeo4wp(9xqLLuRo=IN?w3*ci(JAv*NMU9jIHGOqV)?0 zB%3|_CTP`NJouQ)+AsE}mXUes^UZ9Zmrk+(;eR}jqdjCJwavjKsUTuqK%Cay7=|2i z0XH+WE+D<+f#4U~?R>^-!y{$-c&{BT8DAsKf2Nq3?<>Ae!Pg5^nK1J5Bs~Ma+zMY^ zJKA9}9CqpgfveGejGm7OKc4PCHvL{1%;*^QPhFoXnD3`YcU26K_DD{>Y1F?<_# z`C=6XYETtD^P#Z$br=?`Dgff>H|xrgLI$%AR7!wL(bas>$=vN5UGin-nuQ*no4!O|B(ttx2O z;&0B$?=U2$kSMMYo)6UMGjX~hU}U?2B7(H!yB)g(Bxool;F`MoLVxvIu|FfED}RUu zHosd*Xi1x4=>M>i_zq8E06U5HSOEV4Lj<1h)ZkBL@;`uBbdV&d14#lKpA)sutowY} zPo?tTdFnIj7(e+j%i|NFID>Q95&;bYlql@$nOO3J8vI3?a{Z}f{s)|WoH^Y7;23B% zwu$Qjdq37S(f<~~KEalmV^0vasT^ZmoSpBvb3$G=;;{FP|_gSP!U;r#oj0`~Fm z#Pc+qTPHmKKj`Nt*!oqZOhCN;rG)YbzA z3WP?CgjYV;Px-yMKe^}%FD-v4*liBcG)^2C9(wv*qy|SqJf%TeT}+z651&raJ~kq( zv>Q^IS(;!JrFu<TnFm*Ad<7g`K6r=%IzGqJ?UX3D`ZahZo}tjx-ircg z{6Os;`%UcyN-3!pHSQ%ej*|%U?3yK@FqGB0w;=f5X=!iHUw~^~LnV5IlG1pz7`5hG z6tHq6mM(U=C+h(VHcq}fFO6eE|B;zQE$=w{I&YMtG5WmJ1j7-lJn~+7FSJ8iwH+lv zlL#&v9y<)m2^zP6=TDB*4)tVc-Bh9EvG#^4uFBXD6)BkW>Hu`Uz6D%j{ulz!YBuB= z!v0pK{}I~%QL6tzXJ6Xs;UEFt6@Rc#3H+*3{Y;aN6(j)Xe{l9~$3IyOom2oYYn({Q zFF^Kyedh;K3lbH<1FDZ7J4_JRS1Miq*&zt)_u?D`GyrT2-;m7yAS(Z%4(0nSqz1Wa z8O;-AS@=|02E{K*C&e#xqU7rV{szBSVVo5A{1w6deUX83rJvJ@zxt~D*z8b-X`jS8 zoMg86za#PoUsgdGMGNj}Uh78{hRWEWM=kJ&dGnR^!vis}*TG+()EoJBviiWPf16^5 z0w=CGso-?Hbj)Ku4vQ`QJ)!V>0R1_&2ll#ffiBc!5jx}(o05b-oQ&VYR+2-`aH}2y zgx%;6)k)Gjra#J~z~p91aRZk$jPYf$hv&^fH#8hw_$T9ED)Ku= z$g&_bmKz>AT6fv68eJavI$42kfD=`J$7gp3L5^s1;NQMn(W)I$CX|RWa{gL0^325| zuA|~M@$^wOmXySKODpo;%r`m09I?4n0@!b_U2dW)duQFo%GYUe{mwHpgWH7++4R)3 zab!7-P~V8gL0x6fbhk06mX4N0Sl&S*bdslqSR}Ov3EGW+Tpb#k znSfJDiwp95lE3yy>J+#{DY8Jx-4xj)Wfv(Bz};3CSKc+?iTsYIjv;`-xhEL-oe>7d zD3*}qv54<*Q*I3$)qGGJd93i&et^iSBVS)GdjL*!S^^mT%LDuA-97jj^85ACxX~f~ zb9tWIcK5fj0ymm(2{H=Y4aJowQMkgD>tZL5l`v0#3)qHRlS<`PsG4$qSdDkXASu8m zeoIG2bAI?J{1K^-d6xJ(Qi<{P1(zl=yIlSI&ZMS39iNuZxm{*&APrDczNKU_*G0<& z#o8;_l(q8+TtF!=k`rn=FZT}Pf}Ma+$7>Y!Xmrik>@unY*FDP-12>&g!H~O~<`-@%Z5y!@I_ZEc8`LB^ zZfuHnz5L5Y1kp3XD+f1Kuj?9w-44zW%$5A)GNpMglIP`?QQR7H2WUjt9)V zZ=PR#e8l?*oqBt!uQ26K?=f#yPk>mHbxs8g#}|Mjo5O%l=8UK*t0vN_zocNO`xq 
zt@DCpG4e=&oIU#Fy>*MoZs8kzX2x{a44_WO-A7dic3;x?ngo+4JzN9bH}ghcd(Y>9&?@@-z5-*x z?#*>YY%`_wnAWB(wL|unq4oRT#{KeU zx^%;(a!Au= zz+x?%sP+N|0avkudoe6@*v@}iwKpD*l(-Y9EVa$XO4g2_6>78BbsC#!tQd@rFkdrMdp`NbbUUpui*+s3H@cgPH*t93Kh*1<*~Dy)QFsamk+D)jDr26)sMo%e|@CVUgjJ zYLvdzRK^i{33a$>89uxmD#rC%=)qjLtibp{WU0q;NdW}ACKPJuEfK{Y1`#NS<_fhc zFupX)+f_aMyzCbt*UM&qPr8umBxq~@tC9bQw3hL0p?8Z>7=eN^e5nBOy}7GQn9jrb3B zzGrTxTzh3>D`%#IXNc2JR*S&f^A9Dw-@ziAjFXaL?0Z1~6Uql9)rVG>g*#4P3qL$S zPyFt3`F9Qh5`RRM|H>R754eB+tuerFUZ#Ix3UG2D|7-|wMkfAbwfK`6z>m4~w?+Wb zs924Vp^Gc96jqNTmS}a&W4ex5HTo^+6NI?LKN-79vr$qrQe0%#k@~`XNRO#sz0&4E zyc~i(KH<1f)0a=}*}t`#twwZn&va(gm~Yt3c;?)l2|pq3!pWG7Nc_4Xf!w49(@>K~ z(=J>$i{yO|42gF;#FaeS#zO26T94zz1MLtx%YSbR+cWSld||15_k7b;hV<6-N)&+~ zQAIw*AgvQuEUW_YnR0QMcqcOj)c<1su(f<;QTLh$GmQS$UccnJVXfzmHD6AbrVi5n zlhtOFLqRs4k65G5KAIFBu$t=*>VxWzSxt=EgBF~ssDcG+2zT8CMV}`SoJS@tgi8)h zO}t8-z-G6#{^a4B66w>s7dK@s#7sLxyYH1-ZMwvm`_X@l-;v-x+{vU0GBq-@EmwT) z8YJnr^m%aWIsD< z>s~yz&0wm}UsJY`v!st-Jru&iCIXd<4+@0KiL0fk@>*@$X0Xm`?wLhn-%xkR zm3j}oGu^^|{D9iB<#89&h=cdDJSYDNKP;5ZdV{9&{A*rD8rOfbi1k}X%VnudX&U#% zo7eEg=7?M~BKD8r=Y?~lC(U}Byk$1e=SqLB>G9rB%5X>im<1nd!s@rJ$mETS%Y-Ul zx7|hH5aWK7^JIPjdN8=OOp}pL+XwH0vyrHI;=Lsd78etv5bTQ1&k2E33spR=b-l37 z&fQ@q|Ev28mc-lgd82lb*Cu5NrDI1!ssz$ZB|pze!EsaFn`iZFoC9&WSi*+Y0j z8I`PapF6M!aLTQ4mFKx1k2fEqPkJrBh{8|40kM%%h|pGJxt7lDk*pI~KWp)R*_dwK z?QNrB4eqqK8nt)D7ejM4x_l_=qGw_{l~&jwthE4|x|s(vu8tdX4?B7`oFcC7>Bu_; zO8hHg%XF(NZ>?zb8b2?{9pN{xnozbFor3X8Ij(RY+J$X#5)tm{92byhTw^q+H2L7c z17o>JWgTWF|138;NFr~9oR$3}MOP8|?mU+x>rLusb|D zyudkk~PMWn7^<4LDIZ#S6?VX-RAu-Xyz=wioXe&Xgy;B)`ZKo1W| zsJy>a{%u`JW%|1U9|r1BD~>Wp_MO&!N6>6#%nr-?9`1ODhK=<#2&%|?K20$tk|jmmVtXE}3<@r4_X3N1fWW5?kK2oJm|{_hHHh^I@_aRO%5D^1=i5WfgnNiaU4- z(;ItH8`V4GEo}sw4c9z_B8vi8FWy0#my{i)ws@GnADhvCupde9ey^9j%hjMyLa~{^ zBgWl{^z(2)kZZrsdL2ZL#o0=i##uJ`qU95b8JXKbx<$RWBY#kR9T3rb6uWLybIsgT&$sJASl`lyFMP4=r0p= zEV#{4)m_n)pfoDRsN6s*##Oa;`NgA0q-1ZaI7@saP^^9dPC-|Vx6Gc*0P>kHN$*TI zQ~8`hML&w(nZ5kZL}evAOL4)Gb}bJV}a4s_go$UUzq9E`3v?tbe=xQDCm-H31})o*?HHehbOOg&tq#-h|!o 
zLT$6~^;xfF3sk)$it%FsxS_gCgkrJVqcL3{;Cimp@B~4E55{K4UU_B zkU6uZ3S0*&8OrAQ(Td$}by2+vht?GF$b$M2@rwz0h1fS}Ue_j!4m(9#FxTt#ZCW=d zieswRHzf=+;AyA%S3qZT(Wk#5^)(qz`A@bP<_+MwyQh;8L4u!O?1>A&;T^s64e3~* z)MHX8PGM5^sRWj+01|RE)&{H=ItnUvkq4nmGRxnP=(Q@>ZXG~9x0lM`erJMi zovDLDteX^@qj$B%7+vemlrcY1_o9-B2-VU?67$8@W;l62-E_G)n%)JL42~JaO1Cb$ z6*vO)lgWtmMR}aNwi6P<;`N+I|DJIirUD2l10*bE<9BNBwmDZUwb^wV@Zt$r{B~7fjg~E~>-^ z38yhIf!-3z^zJ;k-K#Ik3bHMtP;Lmo?IIqq6X+Bg=Q%u-R#DgA>F3#~#qqD*O3RZ; zeeIR=RN(#H^uX-KmbF=a;cF$Y3WsEpBKzcN%8t?WVFMM6-IbFywQ~J)9Q?_{UqS09 zNK&vG2P3y_p3c_wY?k55T1tlpMQ=*IkhHQQ@yF>XS6UZ$rmc_scRlu6}Hf^ORMMoSX%1P#c6Vtf@Y| zR~W(`$4$s`4Sj-21zluncF!tiKhc3}blwcMudn&K>43n$ITlpqCx*7Ul|xdVf-zO) zSFYwy!7>axnAZhYIJgspjwBEBK}R6qv>k`sRkXS%YEjPV3#ktEa!ud4Mu)xV+(;z- z%zeVqnJ3@yMIfUf6+_^}YYhbzbGZ@D$*kuv=jHirOYA;<+R<`z&SxiF1{Eq~Sh;Q# zVs>qD@s_juymVh-af#KysihFGO88&B{>c`)`<>;xA`}9XuHx`t(dPLB@+@74WL4pG zWg+9|vsW}^(mbjuao?@6xVo^Xh2FNUX}CIU@TI;5Ls&_a`*66_=fN=SIlqCVM08-z z(cskxDe$MbNOx?Pc_YnBD{~J`JJ?50cCuq(^6(TDeYwu~u=qX`^4GrR&2c<+soQ28 z`T0kVh**>ab^EBE-J zTF#aUe$7Cw{iKRN%Gk7;Sg0qpp==Q_(MTdh5p|_Qsoni^&OEA-J!?u`f)Gte^Q7Si z^Y@fGXlX@;4(+q-q%%`^OUcn!Ie3DNW9Is7RTEcphaP8>Xoe79(}GF&^s)X^m)!2o zKNpPu8&&*Yy+>9|^jS_HjaE)&q3~D7)S9FsZvUb*b18jh9r7SK-OKlSk1jkO!Oa&( zL%VHU&J8>&Acu|Ic6R1Wh8jT=x*m1Mtwt5Mx@PLq(JmjFPa>2a@P6h*J{C&abJ>_| zs2~oiww381^w;~m$eIl+gU-4TN&7($23WJoSgpndg5=iXF1*EdX#Yrh^HwU#J>kO) zXM`85^!+ez1HqFemnXa>syX2@ei!>jjHs|A0Ju=dZ)jk2+nWD8r zk?^!+|EqI3w~B+U)K%$4^~5C+>&x4_jk8%53`$MfOtuzpCf4I5uLNDVg0AA+g9fkY zWpxwu%rKw49TaUNk9~b+)PSQbTuGi=m~T_FJ8Mx*X0bkQ+Pf0E!A(F%jV?e}YTu=}0c1S6T>>QeW2*Nwi)qb4cjq&Fx?sUiznbJR{>2fK;XsqOu&J1Y)bM8j*xd?R!u$@1%==G~Y*P803d z%;>E2snL=%Lc{(6PUkaikpy{VvxIt>99ZQreIM7u7hBppI35g}_2V8~)pm1@pEqOb z!!yjtOD`3z!;lhG(X#vs+CV1dO9zz*^7by2G{+~H`vUp7#P7?G+`xZ&&AQ!#z_HGf z={2%I_4Z{?jCw*!#GJjvv&z?1p6Jz_fzvB2R9!)c6(~Jtin_K~(|`LDb008ki)G#< z?2>;(M6g^16w?&m@e6^&k*u{tqgk=c4If+)=lMbEWJ!OAsCVA-n3vQa(X?I%JF>I( ztKUNNw$YtNq_pu{WLb9z< zXGj*;Z=i8Ofgj(8 
zY*?^akLJE^xg~7v1B7Ucd;zxG)l;C32As{u_-qU{?}{UPk{WZj;}G~gRF%~giPXk_ zuHHYQ`PkRGz8ki^DM$irQrxn?Aw4!q+s`0c`>+67V3042h4riRLz;MJj(m?J#b=?W zVB=^j6I;iZ>SiiH<}2-9N!))CGOI6P-PB-@K0^ifU%#vRpZW57F(wBgo-KF?6K1+Oh$PA*w$Ys&7{iuHRB$9f0o0K&u-%`;QG1 zQ?L(rUV#R+1=A@QBWgydHy($z*U%8|g)nJ> zXZ?=KF8=LEDq4Q*la=|lST@}KRf{bxgPGIo=b5@1wR8>{n#g7kdxO=}sZAlrwjbmV zdZczNq128^GL971kD#pzu!OwIBQc62n$@eYQU_`W+IUba1xe0zUS9g@rGFup%pmK& zYUu!)@|_%;ltpFPm5S9p@@D4Bj}BvN&z~S}tGlC*1};~GWFfE!fZKfD{TDX&X**XR zg1%Vk(XeMx!GnFgp7z`*o?BT&B4V&cRTpN47)|X9~pzmP2aX06^ zrE2=+dNh>D?enD^_bHQ=5+`e@yuxIyz;Y^J{dX41MWw5Ik1EE@cItBu<-_2XiiO5a zqumu|stFq4YN?M`9-z&CL)ycroK);7RJLzT-6&Pf`p^N)7#ewIn;zm5($I?$v4puPsV@TXWGuB&-6;(SabC#=*n5F0s}ZdCT(C zahiO&F?n^T4Wp19Y)8OUd>kJBwc{!bZBWe2fpuuRbZ0YbrQ;s#Fzz^bj<=D}jW9y* z$s#lG8V$B`7z;d7i)=Jn4aVcaU)%P7yCuX_df_fxy_&#bSMkD<9q{n&TNa%VQ{zUh z*)4Hx^jNQj%dJkl^M9{d@tj)nqogRq)o882*GQR!Rmu&7ck$ArBT>ZOd6SWPe{@&8 zyn+JUEflgiF_p}M4B=9UOskg<)+1-7g#@9U^sIXZ2Lx!auk^PExI;bex{xw92YHCz zByQ2ADLQIU+O@r(p8Um^IyvIjTz7n!%jj+qNE2m8|9{A$|JD7%zs;_Gx?Gm zF8WrBUM>)#qfk&(H(`IwccI`BX;yQ7S3k)_Imdd|L`7ka=x4`vj|vUaS}7Bf;WFn< zG&c5qeqH(T5{3*%Wrbxx(wn+y{Yg<*N^QLp++ ziYAB=nR#GhnyUz1tK3(0laWIv5?(CWbjBFgNJl=_Rr<({tZa_u9{Q2kr{peJpR z6L95BjkU^ZW6hw0OlV~^}(gAQxKO=FU{ zXl8>kxY4%hXRWkl+aVR4)Tl+n@sVTZPlBs zK4&#?_ho4dMe-jhVBcn&Tu+$;2ErS6weWTJ&r$5sADDEE3mL~^USSc>i_mPeM<}!@ zo^)Q4aWrTsP;K)8kYeqzPQW-r+%>F5$+KQ8*n^hjKMX?Ihg{f0dy(hLd;jL#=b+U~}idpi3E8rZxr z0!LaVjdXXE&UQw*UoOz}WzbkaKkbU(!xr<$w%nRJ(a>|~xx z+Thw?zZ!fb3}*1@cs*w%KW%GnocOinVeF4k>iKTLE4G=_4&}D;b$JL(Az|G zyf>9QYveMQMQFbvsoGu1E9A)fWpOZn-s_$S+cVk`<>sk|ZwTFDt_oWxp?rt0_hgfy z>P~s_>}}Q~{&k8Gzn1D9u_JNs@CXN36m?MKe7n+hyM)E_2t_v|bqP~4`s1&U z*a=6=7yDa2S+mI(u*5si4ejS^@3QE!i@+B+VD5>$80urz9kRi6%!1_iU%uH-(MH#j zD#rI<=iZ-VgnVh+ddwW8wWsmuJPKI4UNaSUtmzRL{%H8Px{50^@7uUM&_)N4E_S>? 
zQO{6cSNFhw`pJ))?dVu)y)2v!g);*9SMA&pvl&dVHLZOv5&Shzl}VYbl75rpK@WwJ zWYWbnME>!!j+-hY)-DIU*YtCiJR3RQ&()YqPI5Bz+`iM;V5~H^6xQ5aMY~KKLTzU* zBZI;AdF}y2kjxSN`1Tqibh<3cVrzL;b%gL)4q@2=+Ssw}a25w9^j4wnbE>>o$x>Et zpxC{Y-7Pb65U|dh#ZT9HLvjO=FYmJC{tRu-%AKRc;r&T0W~`WcIqXiHA+exwp$j-f zydjMGVpC6a z%ohy&#+v$4ofap-6=h_eBoH5f#eEmuV2)GJVd(OF1MgeVP>tH5fCK}qFX6NG)m_m~ z6qdalYKFwO5l(Or@!}+On%Lb$A<^0An879r1u@ZW=X9BCJgP+#Cyc|z0zuIo3}WI= z&dT;ixbd^?dP;AHn*u90?CBnUARu(O1U!JiQj%xQrXt0Hp+{(-P^{xjt|vL9^0LPc z(~bA+i}`opq<}~9>K#&CaGgA03tAp6 zBz!W<`%IT`8yt<(Xgwj*=~6d6Tk7xBo<4JZHAYN^0Mg=f_M?-AaSCHk@7{O?)BnEK zt-zv{1GD@lSg!u^mh%z!G2bJvY&N~iyfM)$$!joC@q+5w4a^#nS`HQ?vqJz^K?V& z)`Ure_`qj_IkwTZaV?@6^s@n0w;;@<8*5;CvWUB!?Ps9-1lVF|z!qbB8q~3vPe~QPj&}x>a?`f~^h}jSuu1r{g9jINQy@TdvOR z+`BB&wz*YH2jyWq(su;&s&hHi)I}J~D_epWJttdoL9GVkE12H*OCW87q4#}5-u*t& z33y5~l;$fj%@vcpw!i2XW(Qn+-_2s*0v+e4Ah4+CFl)@L9-1~@lA0K6^VL>KibNGy z+!jRgk2(M8mk#4BB9ZaPephg#uW|onzNpLK<`~@CnJJvJFXw6g;j_SagQePnUdbX1 zlxyO+^IhRdb7AF?J24jI*@h(LoDugtr7}q`$Ll=#JcM?6Cb6V5i>$byyr3uK&SOq! 
zZVUCWyKty$WVjm|A1R7YRKAzgbUt|cWbcCAOW$4XLd3%fd&qmXgy@VOA4fl|qpNaK zyz?$#No00Co%i4x8eKI=-LAI3JUTodS9W%P?pOD&`z9V8%hwp(RH7| z?;KChxxqi({*{mW>KGae+Ze?N8PjIo%PSUYx=-fY92mcu_Z2g$KG{0bc^%#LGcjyg z9`uNO9(wCs&;Ryj+(c>CAAQGr&JimQG0^9ZhJlzEh6~O&MDvW`!QBWGY)re=$^%#6 zO<=3NyO%aeU5R3?!JfD5aFX*o)dp94FyF>p?} znn#afAShWFIJ2@wsW4SUis@Q4ghUWf51IS4SC)n98i~vQJjqcNe_m%hXJ0+r+h>}3 z$zyU)uOz~o*E@gs*z}Q=&BY^c3!cubJp|Uke93cI+%T#kOEbI+^9Z9JgUHM;F;h~V zR~c6=XF4KpIcNV~(pQ7n5Kz%B&%m!LC{fU5KEIkphB;ha8d({=%YD&|2djIw|+)x?4 zJvL)2%R4Y{ckf7;-mVEspEkO$c>2W7^|u1pe-g$1i=X{l>j0l1nWFNKdH2f0xCIPF z_}>dyHeFAqsd8SrdI9O6FvHSZMubjPD1x9shu)n~@^;LtUB;0Hi1n8mnzrKRw^DpY zdpa*+219W;v8hzeH9j`7F8|bH4$R(jjeCVc0C`CsdICWUH;bvS(5_2p8+pfE8@)#K zO8i3uiZL#?HO@l=j^~BsaS4kB4D(H?%E~Qs{sWQ`B8n+*XJInbBak4kftAgL> z3&0t}o5I$aGs0PPM)yjKt*b6;1+-v9Z{}&8i>fX@)~zWa51#0_s-}Oy|2CngB;G|h zy6~yhr9M+WVyz9}&G^4J>NqW1#=@$rWeM3(Q4z_0f)`?C<`iv}7-ZCV1;VS;&oqXv z-vLb>Z7?)~)y{_ScyRR!sIYbjgWI5ev|?LFyMnUU?s=8b0B$6Avxa(6fxzC^5fImNVqou+v;zXdOQG;MEur~|E%>Ku6}Y^|BAo-){%D~v7Y42R;*y( z#`XMaPXIEj?msGZI*haDPk`b$=C|TF4UJQk;%(%h=EXpLIq={jg0Ga(W)k%Lhu`gg z()HE_E(NIiRT@Z?7k*5|_}OVhDuspqY{>6Y{_IYEQkE;0B}OaXrJ{xXS3_O^JoR@r zwAA6>9$F0O{HG!<8{_$eRgAuMV{jTs{IysGRRZ#Zf3MV)T!`F&`uk@){#$bbf+c}N z9uoj_>wcaNsJ|U1y>jkWuZ>p@J~Z;Ps$+b2V^|(A?5PprsXcN2_0h68;{}I#qAFW0 z)W*$@m@^BfK{Tnwy46U~abG`2FLszf>$we7xb#eZn#y5QLwYnv0^2QcLOOiDvRlYO zHzNOpql4e6XO`(GYmy}r%_67U)LA%zf@NP{=+qvgGB(JJWXvphyQ}c`2GW!=Jac-s!&D>>UPiQ;wj1&JwcR1{-7zF=;ZbKJ=9eA;WtyMB zzzGA?Y2xH|c=l#W&m_BM3WmXz>JnwRNQHG2G=wz~In?Z7P>xQqCP zKI|be078-e7YOD3TaWYUzpOs{!4Z`OI7*uFEli`96#aEgs&)5- zgm}(3NIm_Ng{-@z30WkOb4+?li^QyAaLU5pRV?fKLo*-o zBIHwxe7CgM;_>C&-x8T~56brPBqXvnm7+Gg?GGT)5T2#3wwl@H_l|XEajb?7EtJLz zWjTL&tP&Ypv;HLbGFboogPIBRz=^;rV@o)b&~@Q#)r9x{;@>S@m z+?z9d#EwTjK)ApA0ixf>Q^TdFRYL&5`92V}p~Cn*_IUpWrS%T<{sw>QAd?3JEn|}* z;elKA3719Hp1{1xsQc55Yn)NFmPLZtEz4Bv!@6IM`qfX=;8RGAZXKfc?oUhz$#Eclm_=s)A|-%x|oZXaUc zN}~Ua!((Xwz=DC}5XBcW@1D!W9Mb%TP{`lv{=!7Rz#;crI_v=x)_|AOb?}ljFlpoW zc%)1+6USvAvl@c%8z`#yAQ4asL>dXsM 
z_Y?+D#neut$nF#%9Q`E*|2%|%HWT(nE@p-FH;9(5>AmIko9pi1G4Ma1yVB(M{KI!g z_u2YG|2J-hTIU5ftmN-($sQxX-5|ISx{m*bgZ~K|uwztu{zQp>f}#KuR?W!k(R)aR zSJpa!vYd9CFaJFR{|xT{M#NC2b^HekF4YoZq(LW4!uuOWgbO5zpBT}-{H%-nwOnnE z`#Nq{OGu_EsxZM%ELAvwi1Sw-d95bJAWV_-#Nql^0{wdk1|(m9LSuhHML#Y{pc`2( z^CEf!z>Yre-A_g?s4=W5q{cE00DAx+6+pRo4oBVe-}GG^%>y8QY7R%FJFwhhKyvPs z`vs=TMR>^r2*ra9X`ca}UUmnx`{dv9NO|*nDFFOO{T&(u;OxIduU`_?9yrv%Lw^Dm z&Ib-K*q=`={a;T_5f0J8~LiWrc8!)SE*P zWSZ=^t`Tx-#I5Ez$5JLXRY?*dS@UJ<&=)qbu&k8H8iCMVe#p+7-Q|>X~Bu z<}KB*9{*ZIXtUYLj}xzobLv92vp>cwur30s$EISf2KAS zCqj$UpqYYzbz#qU<8)Et(FMaexqL6k=3cw>q$W`=eOk7nc9t~jk=+(I;n@oLibm`e zODl}(^NH6xdvhM=5kNmU{_%onZ(c!bY7?K}tA`h5M8fbT!pIWEazqC#v8ijPX5J1bxg4}V)4vAaWtZ$lPMiwZ2_EhXbsg=Bk zi+oIBHF6m^W9?HgntSJ7I41n<_AGMk%kq_$U3t7hbp?M=`4*?Gw6RME?H(zQBHQ$i zhlBdG?1|5NTrwS{NU3V-t(x)c*p>>?_2WV9;8u5IZu8s;3{md^Yw96b(^F?;oGm11`;hAQc~foawKBc z2Srix-shdEf?*zaJ*fkD)slFlgn5NvH#G~@LT2}9t83u=;>$1po4@_P{oOy0c22W- z{Qy1XkpL#xsa@WO)s(<|`Q{bSA=VeR!fD3imL?oJI&PHgFy|o8yEcc`)*X&ywe8BO z2iXmhh2j_Q7PkhNV5`p*C}<69DYPdt(w{RJammwZd1qr#za&=6>@+psjMGRo*97U! 
zY$RiX4v^Cd&y-YwGwxN<>SBNfBjJAr1{&{c-w`utKes9=WjxfOCuD}8{SO30y(PRzd%wZ9sroS`<_QFMphhN3Sq&Qe8ilGtPzfLxAp^YP1AtL~ z1F;h&6Z+K|Ydl$BCKp>tqSZMFcXP8P>;TzjqJ>4f@oqShPLAi&S1LvvDG2X)xKj}0ySLEBKDX3HEN-AI zw!UJOw$sBIuxQ7QuH4#5nLF1L8GgR_u;8%m`{Masq(ny3g+dsNSfSpkaNvG3;XX?Z zBFwY%){gu`N%^^jBk+3Crz{ZO`o2X(XO`&sqi`CE(i;s<~XmceP=6fzN0-rx_=eCb`JaS_X7*ADUH zk8F@4{URjYYrBGYmmXcXA)yBAh>3Es?^(DFmWodP(%RZ{<;%y0`FYn=7us0 z3)=B=Tf#r4FL*0KefbbQVsuvNOiBtg!(O`iH2Ms75i^#u#z8Z|caxDBXQ>2c{7bRl zvyBRLs`IRhPI=4Hs z1ID87DSj&7`T4ktspM+Zldp6yVHd1!Vsxh!NfApizx?Q&;_Bo$JR&*ws4s6B`EHU` zQV%#U;%DvsHP};PSr+TQKk`b!;nI^~4Ipz^Q+$s#`kwwS@_3i!)~)Ye-N5&AMR|(R zH3hfp)PQ!?iFZcBsZVFSR>JHASv0&W9jq_{77#IcD1voQxLd1FF#h#U!ANNEmW~c% zVz`Fz+bKpu>$R<|*W1bEPMt=EYs04F@AD9jqw&BEcw}YQIfrClpZ7lJ`)>v`PZr|r zq?Sl)ap1lU+c_2hV!L%sp=T$_T7vTAoOF_H#S1pOCMFrzai^q`PluJdXsS12?=OpM zyiGL>vv+S)h2>L?tjsAcUxg@o`xj3?x$vP_^eJi{Wdt(dFd>~pkS!kuD#cz@PD&jn3*keBHhPS)XW!i2 z8x3sak|N)?5Y$?I$Vc5ch>|$PIioZ>&+}R3gKdZNwBJk)XZE3lVBMi4?lB!4T25P4 zSFy5W$>4g&3K=j-H??fo-&&G-6UzY*zCniUH$JRG5i)(Bo)*7cO@xspz3bEr4WSZq zeYU14#z$}C3JU_?gw3`pQH33MZ3-R&>cq%2)$W0K?CD>MfP;pm;@W(v~IAbe620!&Q{<=TPfDT=@W zipBEDuZkip_1R14J;$vVOC6g9n1O8x^FY9J6hP%g_fWiA^|oh^+3du^TX1{4`|<=z?0+o zcIx|A`#(TFKtH!uSeQ|s!f)@gAv?;l|8?RpKglTPXm4PX=H8T1nDk1<=L@;W?Jdm@ z5S}~*mLRRj76#V$iyxqMJs~{d?$p!Ly2wLP)4Tild&D20FQ;U(D`%H;)dv?mO6F5m zbl9TT9}jyJ#YpTO+J+d4M!YF@oae}rd2TpaT%mp(FYb*vTmh~H3oc*pd!2uV`U~Q%#_p!idb1%YkYgRsos9tbp?gu`thj;BgV$MFp!>D{ z`P0iv*o5GdEJ&^bS;U7dY5N^@p14sNWshSH%^DGO$L44E?MI`^s~OkKkIyAOP(cT1 z@A1hRHC!aghXiKEh=QJB+p0KwTmw`c!SjWzM5zRdh4Es7UHHgdmF(;auL0Ni9beT6 zH7(MTtfj;EAOz4Wb^Yx0j>4t+=CdFRUMcU+e!*JrFF!y_tvgs``|6>@a$14>F}ciU z0vtRtV+2x=6@NG4!BodbB2;jTs^{~tfNFCkNmOu=e>Oxl=3~k{&_pF5y*j>6HvSxK z<~?$GCvPfHkrL%jd+?Efc~+rnVpYoYwfQxG z_y7M}Vg9>{^*^XU|D(R~pS|TqleygmlXwTgpp6SB{#=sa@|zmBS=ITMX@{cn?5&Pp z^{jB=zWB6zWcdu|Gy?NVWk28ZNZ-*UO5fdBe7*t$AFVr$kBXA^+|e{h-(Bzj{+Z*? 
zKf$_ooEABTxBwQmpPNNZFP)c$sgCvDoouqnVZNhi06mnPk9xEuK~I6wpSlgv1t9JV zjfrdz0~xsO>GBT{NpU!itQ?Nm_*veDxpM!UfRi&JBldHDANtR~F8J4*_;ta*+@l`A zhUtf#;D`sVK@6vqwydCqX49gLuWFEwu6YqxoRP->68O7sGJvwS9B_k(@jiFfHCz~U zTxg5Tn;XVf!W#=|2w!7xp)2=~ajw*EKwJO3xy*Z(aWA0XkxulK0T06Tk;;uvO5@og6v3%(K!z|G;)3X&iP z;KN;tZ`*%`r}S{yK5!z}SOM)u^h;Ne`?Q8CS2Gh=GqU1X7BoY4 z-kfQ6!Z^=~UgmBGfdY6M${(PC9H4ppW0C*5pibj(fe0hYBgjByhPw67`}*50{Jbyw zXitfeRo7@ur;lz)#&74iMwn-q{BB$1;dsgX&VYp$1QvSyFAFXHLhkvcY`q&1i9$UO zE$DWm3+yWL?%h3}w=W-1fjLHAf1CFC&6a>!EAK9UE}YOG3)kRIivXM^`p0r+My@>z z=IDot7d5uegVjb$LZn(PDHe}y#o7*}i=CXr6~AV9LY{>?7K9=}z&*amKQE^1~=D4 zFQ!Pn2v*LD9P$O;C}8(kcMTS)zd6+Jwha5j-}G1s6@?x!{aWn5S-|KYHsbz=jX0hu zBGWEob8^+mItOdvG*o7pN;NF>XTgh}o{BQo3$IW83$uSV7H#D{uFlRHJvV1zNX~0W zE!BBXu6Q?S@XMnAX63(m!GEn>2`u!dYy5pV^H#4y(igh*QJfMKIi47gmV81|Rtv^H zM|>px8UuI}(ql2w7148i#F8EfM-am>A`>^1J2dE(B9h3jF!E<;`Bz^3do=pmt)r+jFcv{L+%c{4Q2}3MJ{Tkr%}`iC-2E}J>!b6S%xHmboBhMEjV^i_kOvp>v#A7DuTatS{`7c_DE_uwFn z+$n^)wbN)`4=zT z;}fPbv+a8nlnQ$x++x{n)#{Ji^?Z3s4pRw&fFn2nN zPB~1|SgK$xm%y4Ag_(Mf!+mFLe!6*2I(aqCKXMf?`2v!=v=>iDE%26rYO{^P_I+j~ z!f_Zycj<4j?N`kG)1Q6|uYZrZ_A!46u_3>Q*gPU|3_0l6^%}&8ZlC?(yE?>6afwR- zdFIM8qfZa3;tuxMRrzDV-cGLQwB-K2e5qFooRQamF7BVP?a!t7BewnBCNf3-^l|?G z?&A#&>D(V~uoEF(?N`2r@$Ds3H zl>^G~?4OBuX6meJ>WobMMYL21i&a*wsZws6(5(L>P5U#1=_dR8GO^&SdkL?>~~ zKPr%!(X*`4gRGheDX9LIN3^C9l9v8+5%>-7|1Gip&G?3yJ=IJ^BF>t{7@dYI+Hx1g zsegcs^o#|X9!Lo`!LaLlY}~&x0n&Aq5qn&*MouyBZyQ{ZUI=Rl}BxTPoB&jz%?gQ_Ie#s})$7Q7k zH=K&CbLo$FJl1JW#>yk&oF{rYHMJyzo|}@MD#EIJ%*v0xjraP06&-G)JDmmkkRh|~ zT&9#8r?byXA|02tJnEOW`Z5*6rj8ebw0bSTOixQpxDNF777P76b7IpAy`uNyMB9n# zF!RvErNfE!vK`PFeCbqKLqyJqoTM&Lt!K3VTTzpLik{G;H4f;Yy{T`i3tf6Z$r)WI zi5P_hSqEmAn!!!;u+7a62qw%EbH1aGdkyMqB~AP`??bVN+!UTH;OvbB!B1H&gK>jQA;QvZP{ z(ML|D{??e}LVSdXnKX0(AM(BEh26xcN5e+mr|U9RwQi+UfvWc>p)f*LMGZodlV~Z6 zrqOc`CYKih%Hx~Bd0bR=devhY%79)azm*_5{zm(L@nrBC4Ik;<*?u+3!n z<^^4E7ki1?zV3M;NQ_aARK4ficZQA$#zl+AL1*IL2ycc6+6{utlq3zW;tf7XjaF+> z!VvQ2W@Oz+G3{m4cweV&lmLfo)*`W6BJpCI6QwR20()kQH$FiTI$cNuK#%baP5 
z`$U@x@&+d8kem#iCn7r@MCc&+)|E5A@Rs|MoM?PbAca#R$s`ql?K+t7VT$i<58?{? zBg}ii{*WpQn73bLQW_`QyW(x%Q=gP%;tiEG=w3KmWCo(gmsJjUaoCr4`}*U4AL#(c zw6^xm)lXSf+dML>;f|iu+r4C47&^j*d}|*KoO6dK>O1en8(7|TIklK_cK?uef>+7e zz+JhC5ibh=(jLT$$&OX4V2io@{DcYCbg3wbH`u(Hu4HHBfF^ZvM5N-6B-YmURfj8J zbrJul>!_MZ+jV>LYmHZW`h(Zus$}KeSOOX}F@4}vQJy?d0dY$)CNz6VzsY{c%8-Ne z5%VsggD-Lm%2|G?6Pge=??!Wd=pq>US}N8*$eCA-=t$1qwx+yzcP{%PL8!VV*Cu}u z(YTd+7?-)XTt*CH@s#j;ZOIS%7R|5{(kR_&M->`j?ohoLF*79#%8 z^VFy)0gW`xB|w=8|FR|;mFb9D>$TnJ$FKl>hfK4}2ES$>%$E&MtIu+n-t?O}JS}R) z7)0Hz|LBX_+lEX;SSnRkUOUct>p=qWcWj8eh_`taS@{6Ifzr!pvsJ~h-uK!5{)4^{ zUW7e4kXZg({5km~hdh1A=Dd#7SpOBxLnH-+7+qyg@x<=?Zarr-0VWp3+UPZm=&IUB z8S5DId6e*2ysHXCyoJ$vZyT~(%Janw=7Sm6yeePCoyb6u1drz1jC zt(KM?OB*@UVWOWX_aDOxK{Eowg(a))ffzoX@PuSlHUfIG!X6%OTud8pc@P{QWuVva z!6jb+S_d#B^dF}kXBZ=KD}ylr=`Ylu^=oOqBPx^$u%osBB=B5b{G zH}Qxea^#Mq;KRObxPwZ>#q!JEab?exV7rsu6D%0azbz@wi>ln=P+uyzW8<3og~X z3hFy5(pIE~rUM!@Y<)Np1^sTwP<pbORgRg{7=IQncC``8xl&akGKOLznJNJD7*s#A*YckaKe>J;^`>DJgm64Jal}j zbfs|w6}QQaKr*@)eQUlPo=R)9cRU$=p0B@ce0xkd?mYRJ4P;~eK9i5p9ZNrUX!dH$ ziblPq{N!m@)5-IZ()p@ei03D;=N2Ey7AW%LHucvqKP`Oyr*+d+P#2S2QD%uZUPj_( z-8R&=$VnvF(Ne4&BjLH_`DZNXviSfnA<^Uvxs8X}nw#*%Uf5oHIS2kLGNZRO3Yk4f zh&vS2+6pizHt!_Y_zs3_x(sMP90@n-5GKE0OzV&Wd|k2jw%laZn_Ei;hn4y8nk_Crqa^VJkY2W` zF|CmMdR~7bXxEc&{e88PxnbUs-t+arQ|8BWGKXLB2z^iR$tIAx74&l&`8OODuq+hv zwU@3U?&tPzSvj52ER$R|tT-U|vj*{I*U~hf$(#TvUH5CaQ@Hf=2zxARzUme*RkXgkcT4p!2^9a~!5Xlm?uEj5R!f)9N5=wr`{yJ7E*Ml z!yTVD`?T0!2OO7JK?^<10u25gb|C2Pb7b=yN%MT&tH3vbOiWECrpN=fJ90I-mNFzE zO3?XS>f9WVz#*DJ#R*1oE&gj`*8W$CMC)vQxB*1!D!qb{L)8q_9dl1cEwgcSNebul z<+>Kz^fDkaps3I7X&}%ULVjPdJBR_N^PVcQqXBO!tt=`32|Y}i9Pg7a8qAr6GG@m8 zm_d7UQ&uZ+iH&jZT@R5}dJ-wThw)`8^SGx%fyJCByO^nsMezfUrCljjs)=<<20XBF z#UVA3kU%JCwEARnC_Hajx3tEAI9)F_ z<#eK%H<6dh%iCwMu1rl7?uU#QHrP7v=Us;3VZAzhf-n>^2mQDL?B{u{2p~oHalJ9x zIS7#wEzQ;x}~-76P@!-9ZZUZ((35A8YNb~IhO?}bO3(u{{4zk2DnvRe*Of%!pau2?! 
zeP5d1G!ad01*&}xl_VFVAzSQy5Gm#h0@=fjjRJy08f+z3acj$3k3IF{V8w8FyZW~Mhp5S;XFsr7l4lDeofP))Yq}#Kuz~t?r@cQohXR^JeMtq68;$Jm;cQB99_%_Z+qeYRkM~{gkcy^7y$GQ1~?;~HZr7~$qeUChtrR^06Iq&G! z11IEQsRgJH4hLS;)LOhf(OD|sFkEL^NW%!b>|Lur)KDBfFGPz5e^5KNh^>V_dB?D8 zjf6L8iyC$;)+636t)D7>yj9|aCfn?z7<44W$!^{UkGBY|9gLs6NJ+y)ripe1?eAnH zF@~TkFor$mRQ}Sr0^&~usXOT+y+R{Jhvkknt?lfswZn4_n)|lkE~gg*kV^@h?uu24nOr0KTA4 zi~rudY8(fgni{rDdeEvbcUfO++YW|1W=d{Hj@Hv67QxW^FY!2>6>KzoZ(c+8)pye1 z3*KtS=0I}?y}VdMm$2h3aSlKT(U4Y%v`)b<+D2HOXm(N^bIRmVUGe45)q#5zO^|lHq{sz91bpF ztWH9!w)k*n!aazD62-91NNDs>N{pJftyC=SVWM@CO}1usc6mbP%hlY3O^0X&#*LWi z`a(xFmHe6Tm*zo7*gz>3*mCmuja{RSu4yyKbf27I%#k6swmHs-EppJu-F;Kv`PtQ$n`US;b*&4NONR2Uhc)(YXB0$0ZrpL^ zQuE%{@GG}Z*P;=m(sw-Kd%AE2!c-_NLp;MvT?;4Q))-&$5_Sw$sz|XeCKr7{go|BW zJ~T!c5Rov<+XZ5uR~%`w$X?Q{Aj(7-XsD`x-@_;ofj|wCF2A@a(Gb6K<@~PxO}^!R zysqqN2~+9%q-oe)&!;TOD3it@1n4xZ0be$MZ;5^EoagxaRIHs9Zuwh|7k!&w0;MLD zi)8PvD5KZ{4)q2>Vy5PJ;mO=(NZ1m71Rn90SnA+tuf9PANp8qvFbj17?Vjd};sY;f z<%%)OtmX-3qnesXlB}o|^klemurIRvfVrT^dNHOMRb9}{XarG)O(G2$RzVpjjA^Wt zONf0*k7vN*7A@~9)B4BNr^ds*JvCy~LJ=4aaT*~7u0WdPZ01lZYcd48cWU4l6Qi_~ zHGaA)YP;}NO%A)*fm0AY4k|S)GT7xw{Q90ce8Z#8#>zNMlJ+$_1zR+@?|iEnOMZN9 z+)5qSH1-(1av%Ww1w@kU%WnHX2WShn&kcyowS^99YSG-D_^Fkc)K{caSe~V( zjn$Kyj27xdI9fFAU)OXevAVS!vyphN$0 zg+GbVi+%aV%8B(pGvkSz6uf#}U)_A*mU#jW~^ z>*`?u(;>l-A3K%Hb$l_qkXwiRl|^4l(oO@0+BnjrX5)}!V1d1J`GQHp-J|4|UAJsC zwh|w5^Cn}`xbeODt$7S;$28ZFPx<|V|jII|U z*-0!ZN_1>+nJrPCvSHWb-+^MZ_4Iy`rZF<~hhYlYiULeU0$0bYR51^fuWu?-^xpWJ zC5Kng^&f%42Vl!nb;DTZwTUI4@k(;@@KWVNHCfIIckZU6iI9-`>;-!vApcfqE#fxU zh~~n6qDB~FRk*l@oI1n2`5T)v$B@H*t8KJ9#2UU~q5YB>m}x?_3-srsB&0pm=7u5# zXLsbzLjLKlMkbdWnE`PFrQ&SK=SJQVjYO^a*mFiAPM>?I2@QUD}#*a9&lGM33R?P ztsh@uqLNa)0KFZ=X_Ts<=KsEl(R*{F5xq}N7}GxOGFp@Gff-mR5LEAcD`5+v>|h*i z5r4uf5QHLhEGLdv6P|4r)8)oVOb}M}&P{H;Osbih5xn3tO}(-~8U-Od)Gy5$x(SLj zb#SuTloiAt05c1(KSCuG4)SF`vq2Nfl{4`XJmk*BOPpI!AXj3x1uYE3#Pkr+6gvVG zXOu5_*$MtYe;wL>Vj64&{1}gSEtKIiFe-OY(=PDo9#S&^;%!{gJvGq^IhqSxNy{@@ zON^IOGCV}-aDIT23Hg}X#pV2dXz%g&r5MAt0Wxh(d9ryn09=}3v9I2o*{ 
zFIMGuqf&^%iFe%SZDp^Kc<5;;#~iv_+V{BSE_GHCr9+>`kT$V}m8rK4VUfq~Xmdx} zr0E_W0hzSLY3Kd**J|^NWIc3!$T_5M%1JQ+ZLa4gOs2aadGQx?7 zDU@aD5hj^3+|`1@IB#zEN1z;M;>Q5~JSuu`~vPRBB@SW#aad@%MLLK43z z__(bWE&{Sv-Gq^+YhV_Sy`bk+ZF#g=9dt(eJ%^JLdHA_|K6OLluG+8DePkd)cwhWb z`iM#kp^IQ2b8|lbH4iuHP(_)UE*WI%91PG){iPeCG{|Hd63=PYroE7~xW1&rDoG?( z%&J4-qSo%^5RP1v-F7>L^Jb`oO+BTE(?p3YaummthEdL9Dzrm-LPrt2U6?M;t}n{c z!KP}rqE9hKlvu%1A zwSgU-Bq|~*N6cQ$M?~(Wb;ItZ`PmbRYHONxvfX~2==Qaypk%PN4ps|Xp0tKsddY}*tb=_K)uMv^ z)U_04{A$WJuwn1qC(Z!wr=7PTo3vf#Kta|1>MBF_b*Oq6HT&ALsC494e8aqm{_e&g z{galELJNKgpP7d2!Pvthm_7J0O6L{whEZgvY1lb=a!E+GuIp!|L@%kWxwX;a#Ho8C zDpq|-dhVUt2iGo7m=VJ4=3{j3DWbQfP6S*XmzwWWxDuKw;t z!*n-3x0{_8cjEfueDbt&RidH*G-kD8Ep#G^S5?euK4sg(cCkbwh4KVkMRB4au-4$4xv0C!J&EaZ9#!W*fAJ zB-ne!wE*SkwX11cg&0s`Ef_qnjf+^LtJ)IQy?7T5tvT2LNeLmtH=1QcphVLe$*hZ2 zDTF@Tso^?qCV})S+1h` zwi$<*M+3}8h^XtyE78yxKlh^OB1r5`&A~8`pIDVCJ&1*xDEkIFEj=%zw^fB|?XO~8 zNmmUlA5!2g&YE){eba#;wn!v>iZ@NT>Azkv%_C#CC_c{a&U9LpWB1Z}u?&H)nvXZ9 znFvFF%qr6bdUNFiX-w)QfKg3wUY*U-)XK^fV|Vbx1<&;gx%K_^wEWT!Ng zBjzX7z&6C1YAE4EWgqneHH3Wd86M1ZTu>onvCY)y!DMl#0z#h^tt}5c7UlU?f$PAF zhABY-Fg}E|mQ4=ti|q@kzA$Tw0n3x1`3Q^Xnn}+2&-l9p4d^(td*cHeZ|Cg%GO5Ul zWM$Z@@LsVle)DxpL&K-(tl>9yC;J*mHBoA1$Q?HqnLs=b!3RA!bt#3jPnWmrd2t}f zms#Ssx=+R^yOm_-PsA1m%LQ#C2qEV%D2Ian9c%L6ea{ea^}(tT*v-~?o5)nw*f{>w zBCvLju*S%7AuU2n771bg^NRzsG%UX3?sUB6_mnlG)rZomW%U)%(y2Wum#Qc$6mjfR zwx>Z07el-ymkO17A_}#YkT(g2qW+MD#u|Z2@|>av2)rQEVkQutipoSWrx(A+?e07E zk&>qc=$ex6uIgmn*fqdj>NN;{?XN$?c>HpjH} z!8}5YFD)GBKH=FQKN@XTqRgsj=(k?I`GxY0#A-lzC8XFjXF#$}$JGDgbf^1`g$IcP zN7k!#54aIizlCpME7JsP`#y)wfLS}rYL%2!qv2Yy-!DA4<;=T%7fB;vG*4}P-x$?=+;z24(?fh3-?amK^Dz=$nD1v^?^mdKT` zSANjA{Z<4@VsDDF$)lEpA%`MH1>dIJrmc*b4U3mV9UdDt`@j9Ap z76b>B#KoLf#muG5%8=Qk1(|(t(OwpOK*-L{cBned9H> z-`LozFB>kdmVnO*CFUT09?ILZ$cqqOPeTK{@g+i+MZ|6o4DC)Ad&6jp#77ftex^!L z8?}3(vZTwV`AVER3Y8KrrgdKweWIE{h^k#@es_w*wRK?|2H-515U?+3BSyD@Cb-F= zsB+M@c0^0U$clLjUx8UlWS^L)jm8E99Z7b|;yHG`l{PCfPJJ<1#QeG*{}SqX+?#9f 
z7LuAk=uRFS(Dmlo2RDMqs`=SBxAG0A@hypSiy5^+UnKgeHE9k$~OqhpxLx#248GI)fOn7#rjyD$a_<#b~C(b^Yw}KJXa=Ssyj&^+u(Z>(D66 zVTRjzy@W!Zx$Q7maSwkKGBmit=(;h4x}$EDlXE z&`lNk6_Ka(NXn ztO(%k=Op3mmxqa#H){D4PN@}xtGAkdw7AzVTf-i;8L?A*L~6$&;U-9cpjxrgx(pFikUtl zT)5ZWL$YkP;O4r9Z>DY*J9cZYy zST*G|_LZsiewPX_VXixIODt8>MHnsNPU!p#<&cV@LlRHSPv}p@`d6a(zJ|SNa32^u zLnC|kO|jXJ=qnd$(|l5h{I++0{Xqc;ZB7F3aLifXwbB56=byY$0Q_& zrFr%wo;@6%a*UlS!uS}k!F}9_#dGkrJWGB@Cl9%W)oTZVz2D|a*3xl**0N}0l=;we zmZ@ZQneyX0DLE_B(QNS*W#}BDKTxY=!YN@tFZfxhFdeQ|)r2lBfkO#Dg6-1U3;cXR zXZf_CqEZ}LFXSLDzN*&lYo;tS?y!_Lc~(|?FJ7S9MHu8{!PVDsb)ME7H>{~UZmqxN zTjq?`f7Q2{pcNtZy2nn}QOKf3j&X5WLAcl=SNDAm#_M+UCWzlejkne8uA0v(>hbYp zL>A|!)|Yl?C2Ev8v0evqXZ&HlZiiv*Wnd^29ijDeI7(<(M20U%b@l|aWB%&s5mnI1 ztPu{6up-#{6|2-rRuq<}2A4U%t8^DZO|ARjM54mQs}|@vNx+kkDY-Cla#6}s@=U)$ zwU#lG)xoO7`xt?NR{>upi^;0f+fK>J|kb4!?{5aWfr~U##h*h%p{n4HU(J=2zGBG6g=EZVkD=t_Ds|5HXgw z;mC4+05+3{wn13R7Lg~N&^==2cn8zsK2xiUt8ZTB{DCin)ZXyT^^=yU9^2{M3Hf^oF6CG5>typ6=eP%7m_WXyQ8PC_F-EK^)s>*7r zm>8XFMs;7nCCa>hXl%W4DY#v%^{@c+5QKn$kd~bWUnrNSHNT`YC+CFj!b8&ZDWTPz z`+QDd9iT>XwR%d8k68@TAI#`2%2rsIVI!@& z=M6C2`IxPl0}pp2hGZ>2hGP@hcE5P!9B`TpvnU^9De+XxHJ=OUc$;W>BEwv?75T-8 z(kj!!aqeNQwQG?cK?lLjW~KDHhfBOlB{Uj10vN`?C=W-QBUGgN%9zLf$f`K4qB?YH z3O5KP3T)S1r>fKn0%#vQ84-kWv{0jPDLlnja0hf}YI^c>I*atDQ>b5XKjxWr@TUNOdM;ee0jpSYHmlHY2B=FdVNU*-??R z!FtP{UEJK<@HxqnqP`*ei~~O?cCu7`@%5Xb;}@F?R0WYZErSx+azKEY{tB<~(`BnoR){PJd=7I3U=3JSG4*?k9QY)yLQu zn_k?SQ?_8w8J8WO)U*W)Hp!&uU8!@FIAQZ>_JSWr_hdi3y%TtOivE@(ACIff{j-2A zj4NX9!hJ#9xhjbp=S@A65oQ>7_~Bjdz*y~|OZwM*-K?b(jzoN&=vF@WV!wg90$;P} ztBJRwT?@Vo-aazfO3S9J!C32ghWZXk3)xK>Ow%SlyesYaxqeg0>1bm;^BKw9JNTMk z7*%{Y>OjxAx7q$;_2qE%sAP;ekx=CO+;wE~DbJ zo~!+fQ$I>oSH5?fGz+++_tv=+PtIgYHRkpF46*L%bF=*O+S&D@d-vLR`R~7KZ=IVu z*?-4;zxk)S1G~%=BOCrRRJMgQe{aiRF1a05aZFo#_F26{pIv5M)Jv#t@Hwen+`A-! 
zcS-S_=Qd$~w{8(DyBHnq1dOJZK8q__tNI!kF9h8_$h&q=*|(2t0*)*c6iCp5R2EEo zzl00l@aJ6v+ylF#;?!}oySF9!_^co8*t%!ekD^f1?Ax<}hj4f}ME^WFDch3&!>OuG zhQHTd5#I7meb1`tw_>L)%l&sdZ*zC$w2G;}*d-4qFFaOceBr@Y5k|LT%BwfMD(BUX zxZ^5)FtI#mRYHR2z2Xze6A!+OHUS>Lm++(A@!#!=eV5J`CbQ%{_ve4Q_RNl-etpo z?(rt|3_eQKdz_N=+Q1^W;5%@r`Ke9f!{2Y69{XQ0VbV3-u{5IfpPO>Sr$?^#uf;D` z8?q~;?XtDIxTfTXZq&2vX`M`4K5u(I;d`3h^4q}mq1-0GJ-VjA;nu|yLnpRLMx+S7 z$Y#6lwrBQ{{|vw@>$iEVc~o6%$H7*%SRnFl$E&gxw=5*y&*Rcy2A*yMY#YoMda-(a zMcHRx-lr#((j?jCV!BTm{4!tzp7Z~;VV6|xhJBy@-F&J)|EbKnKfE_AfLHGTkLM=b zEAOkW$V>k8yv;JeAa~uuHpdedwg(xmOo5)V7}oB$Y5Tq%``WHaynE-=`7DI-375g{ z{@}(-Kx@wehhve?VifewWw~@zXTz~iPTp>37EG}?=J@ugH{^;d#$&JViEg_5Hubtm zXY!@5(Z;IhZ`}aSzm!|~uT z`qB39mAHp;l49T0h&&;MLvOg`pXGU*%N7EA%r{@&lAF53=KQjk+otte-qmapb$YJy zx$(TpztTETNbm0Fds!az%Gf_^llKZI-FLDllP)$kEVeHx@?&H3s{(dpQ+|Y9@;=Wa zD<8i}Tz|RGwU)ciJM%>Co-`LIJoWl~-tyV54B%iCu(q6}F(KE*l`*7>!3r4po@Ay& H_Ww5lk)ho7 literal 0 HcmV?d00001 diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..ba060047c --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,26 @@ +.. MSCCL++ documentation master file, created by + sphinx-quickstart on Tue Sep 5 13:03:46 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to MSCCL++'s documentation! +=================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + +Docs +==== + +.. doxygennamespace:: mscclpp + :members: diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..32bb24529 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/performance-ndmv4.md b/docs/performance-ndmv4.md index 28e38b0e7..4187b3b0a 100644 --- a/docs/performance-ndmv4.md +++ b/docs/performance-ndmv4.md @@ -1,50 +1,3 @@ # NDmv4 Performance -All results from NDmv4. NCCL version 2.17.1+cuda11.8, reported in-place numbers. - -nccl-tests command example: -```bash -mpirun --bind-to numa -hostfile /mnt/hostfile --tag-output --allow-run-as-root -map-by ppr:8:node --bind-to numa -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x PATH -x LD_PRELOAD=/mnt/nccl/build/lib/libnccl.so -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/mnt/ndv4-topo.xml -x NCCL_DEBUG=WARN ./build/all_gather_perf -b 1K -e 1K -g 1 -c 1 -w 10 -n 10 -G 1 -``` - -mscclpp-tests command example: -```bash -mpirun -allow-run-as-root -map-by ppr:8:node -hostfile /mnt/hostfile ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1K -w 10 -n 10 -G 10 -k 0 -``` - -**NOTE:** NCCL AllGather leverages Ring algorithm instead of all-pairs alike algorithm, which greatly reduces inter-node transmission, causing significant higher performance. 
MSCCL++ should do something similar in the future - -### 1 node, 8 gpus/node -**Latency (us)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1K | 12.53 | **16.96** | 9.34 | **7.76** / 21.06 / 28.50 | 157.91 / 143.21 / 447.0 | 326.4 | - -**BusBW (GB/s)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:----------------------------:|:-----------------:| -| 1G | 253.59 | **231.45** | 254.69 | 217.05 / 216.98 / 217.15 | 125.06 / **255.64** / 124.89 | 22.55 | - -### 2 nodes, 1 gpu/node -**Latency (us)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:--------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1K | 16.08 | **21.27** | 29.84 | 14.67 / 29.12 / 35.43 | 15.32 / **13.84** / 26.08 | - | - -**BusBW (GB/s)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1G | 15.84 | **18.65** | 15.48 | 13.94 / 13.83 / 14.10 | **23.30** / 23.29 / 21.60 | - | - -### 2 nodes, 8 gpus/node -**Latency (us)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | 
-|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1K | 33.74 | **35.85** | 49.75 | **22.55** / 39.33 / 56.93 | 159.14 / 230.52 / 462.7 | - | - -**BusBW (GB/s)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1G | 177.05 | **183.82** | 37.80 | 40.17 / 40.18 / 40.23 | 44.19 / 9.31 / **209.33** | - | -| 4G | 186.01 | **188.18** | 37.81 | - / - / - | 44.60 / - / **234.08** | - | - +TBU diff --git a/docs/quickstart.md b/docs/quickstart.md index 9ccf1b6f9..19b32d6ec 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -8,8 +8,9 @@ * ND_H100_v5 * [NC_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nc-a100-v4-series) (TBD) * Non-Azure Systems - * NVIDIA A100 GPUs + CUDA >= 11.1.1 - * NVIDIA H100 GPUs + CUDA >= 12.0.0 + * NVIDIA A100 GPUs + CUDA >= 11.8 + * NVIDIA H100 GPUs + CUDA >= 12.0 + * AMD support is underway. * OS: tested over Ubuntu 18.04 and 20.04 * Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional) * Others @@ -54,6 +55,8 @@ Our base image installs all prerequisites for MSCCL++. $ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 ``` +See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp). + ## Unit Tests `unit_tests` require one GPU on the system. It only tests operation of basic components. 
@@ -76,37 +79,53 @@ To run `mp_unit_tests` with more than two nodes, you need to specify the `-ip_po $ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.0.0.5:50000 ``` -## mscclpp-test +## Performance Benchmark + +### Python Benchmark -mscclpp-test is a set of performance benchmarks for MSCCL++. It requires MPI to be installed on the system, and the path should be provided via `MPI_HOME` environment variable to the CMake build system. +[Install the MSCCL++ Python package](https://github.com/microsoft/mscclpp/blob/chhwang/docs/docs/quickstart.md#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system. + +```bash +# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version. +$ python3 -m pip install -r ./python/requirements_cu12.txt +$ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py +``` + +### C++ Benchmark (mscclpp-test) + +*NOTE: mscclpp-test will be retired soon and will be maintained only as an example of C++ implementation. If you want to get the latest performance numbers, please use the Python benchmark instead.* + +mscclpp-test is a set of C++ performance benchmarks. It requires MPI on the system, and the path should be provided via `MPI_HOME` environment variable to the CMake build system. ```bash $ MPI_HOME=/path/to/mpi cmake -DCMAKE_BUILD_TYPE=Release .. -$ make -j sendrecv_test_perf allgather_test_perf allreduce_test_perf alltoall_test_perf +$ make -j allgather_test_perf allreduce_test_perf ``` -For example, the following command runs the AllReduce benchmark with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between. +For example, the following command runs the `allreduce5` algorithm with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between. 
You can try different algorithms by changing the `-k 5` option to another value (e.g., `-k 3` runs `allreduce3`). Check all algorithms from the code: [allreduce_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allreduce_test.cu) and [allgather_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu). ```bash -$ mpirun -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 4 +$ mpirun --bind-to-numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5 ``` +*NOTE: a few algorithms set a condition on the total data size, such as to be a multiple of 3. If the condition is unmet, the command will throw a regarding error.* + Check the help message for more details. ```bash $ ./test/mscclpp-test/allreduce_test_perf --help -USAGE: allreduce_test_perf - [-b,--minbytes ] - [-e,--maxbytes ] - [-i,--stepbytes ] - [-f,--stepfactor ] - [-n,--iters ] - [-w,--warmup_iters ] - [-c,--check <0/1>] - [-T,--timeout