From 08f60b15031b8c1eb5bb331005f9cb3755386b72 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 11 Dec 2023 22:34:36 -0500 Subject: [PATCH 01/61] Fix HIP build for AMD (#1243) * fix * update hip docker * undo legion update until pr is merged on gitlab --- CMakeLists.txt | 11 +++++++---- cmake/hip.cmake | 4 ++-- config/config.inc | 7 +++++-- docker/flexflow-environment/Dockerfile | 5 +---- inference/incr_decoding/CMakeLists.txt | 1 + inference/spec_infer/CMakeLists.txt | 1 + 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b201cf99dc..90cab126e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,7 @@ endif() set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake) set(FLEXFLOW_ROOT ${CMAKE_CURRENT_LIST_DIR}) set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC -UNDEBUG") +set(CMAKE_HIP_FLAGS "-std=c++17 ${CMAKE_HIP_FLAGS} -fPIC -UNDEBUG") # set std 17 #set(CMAKE_CXX_STANDARD 17) @@ -51,6 +52,7 @@ endif() # do not disable assertions even if in release mode set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") +set(CMAKE_HIP_FLAGS_RELEASE "${CMAKE_HIP_FLAGS_RELEASE} -UNDEBUG") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") set(LIBEXT ".so") @@ -157,6 +159,7 @@ endif() # HIP if (FF_GPU_BACKEND STREQUAL "hip_rocm" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + enable_language(HIP) include(hip) endif() @@ -299,7 +302,10 @@ if(NOT BUILD_LEGION_ONLY) LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cpp) - if(BUILD_SHARED_LIBS) + set_source_files_properties(${FLEXFLOW_GPU_SRC} PROPERTIES LANGUAGE HIP) + set_source_files_properties(${FLEXFLOW_SRC} PROPERTIES LANGUAGE HIP) + + if(BUILD_SHARED_LIBS) add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) else() add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC}) @@ -474,9 +480,6 @@ if(NOT BUILD_LEGION_ONLY) endif() if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) - if (FF_GPU_BACKEND STREQUAL "hip_rocm") - SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") - endif() # Ensure Rust is installed execute_process(COMMAND rustc --version RESULT_VARIABLE RUST_COMMAND_RESULT diff --git a/cmake/hip.cmake b/cmake/hip.cmake index abcc82b03a..25f2e05e19 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -2,11 +2,11 @@ if (NOT FF_HIP_ARCH STREQUAL "") if (FF_HIP_ARCH STREQUAL "all") set(FF_HIP_ARCH "gfx900,gfx902,gfx904,gfx906,gfx908,gfx909,gfx90a,gfx90c,gfx940,gfx1010,gfx1011,gfx1012,gfx1013,gfx1030,gfx1031,gfx1032,gfx1033,gfx1034,gfx1035,gfx1036,gfx1100,gfx1101,gfx1102,gfx1103") endif() - string(REPLACE "," " " HIP_ARCH_LIST "${FF_HIP_ARCH}") + string(REPLACE "," "," HIP_ARCH_LIST "${FF_HIP_ARCH}") endif() message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}") if(FF_GPU_BACKEND STREQUAL "hip_rocm") - set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE STRING "Path to the clang compiler by ROCM" FORCE) + #set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE STRING "Path to the clang compiler by ROCM" FORCE) set(GPU_TARGETS "${FF_HIP_ARCH}" CACHE STRING "The GPU TARGETs") endif() diff --git a/config/config.inc b/config/config.inc index e5c9c69acf..1121c114c4 100644 --- a/config/config.inc +++ b/config/config.inc @@ -190,6 +190,8 @@ if [ -n "$ROCM_PATH" ]; then SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}" fi +ADD_ROCM_TO_PATH="" + # set GPU backend if [ -n "$FF_GPU_BACKEND" ]; then SET_FF_GPU_BACKEND="-DFF_GPU_BACKEND=${FF_GPU_BACKEND}" @@ -222,7 +224,8 @@ if [ -n "$FF_GPU_BACKEND" ]; then chmod +x 
"$(pwd)/nvidia_hipcc" SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc" else - SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc" + ADD_ROCM_TO_PATH="PATH=${PATH}:${ROCM_PATH}/bin" + #SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc" fi fi fi @@ -232,7 +235,7 @@ CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} -CMAKE_COMMAND="${SET_CC_FLAGS} ${SET_NVCC_FLAGS} ${SET_LD_FLAGS} ${SET_CUDA_LIB_PATH} cmake ${CMAKE_FLAGS} $* ${SRC_LOCATION}" +CMAKE_COMMAND="${SET_CC_FLAGS} ${SET_NVCC_FLAGS} ${SET_LD_FLAGS} ${SET_CUDA_LIB_PATH} ${ADD_ROCM_TO_PATH} cmake ${CMAKE_FLAGS} $* ${SRC_LOCATION}" echo $CMAKE_COMMAND eval $CMAKE_COMMAND } diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 0e9a3cda82..edbf9a7e52 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -74,11 +74,8 @@ RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ] rm ./${AMD_GPU_SCRIPT_NAME}; \ amdgpu-install -y --usecase=hip,rocm --no-dkms; \ apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs; \ - # Install protobuf v3.20.x manually + # Install protobuf dependencies apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev autoconf automake libtool make; \ - git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git; cd protobuf/ ; git submodule update --init --recursive; \ - ./autogen.sh; ./configure; cores_available=$(nproc --all); n_build_cores=$(( cores_available -1 )); \ - if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi; make -j $n_build_cores; make install; ldconfig; cd .. ; \ else \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Skipping installing HIP dependencies"; \ fi diff --git a/inference/incr_decoding/CMakeLists.txt b/inference/incr_decoding/CMakeLists.txt index e415835a79..53b7cf0c2f 100644 --- a/inference/incr_decoding/CMakeLists.txt +++ b/inference/incr_decoding/CMakeLists.txt @@ -20,6 +20,7 @@ if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) endif() elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) hip_add_executable(${project_target} ${CPU_SRC}) if (FF_HIP_ARCH STREQUAL "") message(FATAL_ERROR "FF_HIP_ARCH is empty!") diff --git a/inference/spec_infer/CMakeLists.txt b/inference/spec_infer/CMakeLists.txt index 26d5bd1894..c877a3530b 100644 --- a/inference/spec_infer/CMakeLists.txt +++ b/inference/spec_infer/CMakeLists.txt @@ -19,6 +19,7 @@ if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) endif() elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) hip_add_executable(${project_target} ${CPU_SRC}) if (FF_HIP_ARCH STREQUAL "") message(FATAL_ERROR "FF_HIP_ARCH is empty!") From 3cf49a6d89b9ce60efde018fc99565390ee37eb7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 12 Dec 2023 03:55:52 -0500 Subject: [PATCH 02/61] [Documentation] - Annotate attention kernel with shapes of tensors (#1244) * add attention shape annotations * linting * fix --- src/ops/inc_multihead_self_attention.cu | 443 ++++++++++--------- src/ops/spec_inc_multihead_self_attention.cu | 2 +- 2 files changed, 247 insertions(+), 198 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ce30b5dfda..7da9aa389c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -504,7 +504,6 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) @@ -518,43 +517,52 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, compute_type = CUBLAS_COMPUTE_32F_FAST_16F; } #endif - // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_q_heads - // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_q_heads - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - int lda = k, ldb = k, ldc = m_; - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // apply rotary emmmbedding for q - // and k step1 change the k, v to complex tensor + + // Step 1: Compute QKV projections + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_q = m->qProjSize * m->num_q_heads; + int m_k = m->kProjSize * 
m->num_q_heads; + int m_v = m->vProjSize * m->num_q_heads; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_tokens(); + int k = m->qSize; + int m_ = m_q * QKV_WEIGHT_NUM; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: QKV weights + // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] + // matrix B: input + // matrix B's layout: [qSize (hidden_dim), num_new_tokens] + // matrix C: devQKVProjArray + // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + weight_ptr, + cublas_data_type, + lda, + input_ptr, + cublas_data_type, + ldb, + &beta, + output_ptr, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - // apply bias for q, k, v + + // Step 2: apply bias for QKV, or scale the query if (*m->qkv_bias) { apply_proj_bias_qkv<<scaling_factor, m->hidden_size); } + + // Step 3: apply rotary embedding if needed if (*m->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; @@ -638,38 +648,47 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t compute_type = cublas_data_type; #endif // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - // int num_tokens = bc->num_active_tokens(); - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = num_tokens; - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
<DT const *>(m->attn_heads);
-  DT *C = static_cast<DT *>
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = num_tokens; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: attn heads + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
<DT const *>(m->attn_heads);
+      // matrix B: output
+      // matrix B's layout: [oProjSize, num_new_tokens]
+      DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Add final output bias if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * num_tokens; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -945,54 +964,69 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - // a flag of using this scaling alpha - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize));
+    // Step 1: compute query-key product QK.T/sqrt(d_k)
+    {
+      // Scale by sqrt(d_k) as per the original attention paper
+      DT alpha = 1.0f, beta = 0.0f;
+      if (*m->qk_prod_scaling) {
+        alpha = static_cast<DT>
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
<DT const *>(m->devQKVProjArray) +
+          tokens_previous_requests * m->qProjSize * m->num_q_heads *
+              QKV_WEIGHT_NUM;
+      // matrix B: key cache
+      // matrix B's layout: [kProjSize * num_heads, total_tokens]
+      // To get B, skip over K entries from previous requests (all heads +
+      // padding)
+      DT const *B = static_cast
<DT const *>(m->keyCache) + i * kt_req_block_size;
+      // matrix C: qk_prods
+      // matrix C's layout: [num_new_tokens, total_tokens, num_heads]
+      // To get C, skip over QK.T products from previous requests
+      DT *C = static_cast<DT *>
(m->qk_prods); + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
<DT const *>(m->devQKVProjArray) +
-                     tokens_previous_requests * m->qProjSize * m->num_q_heads *
-                         QKV_WEIGHT_NUM;
-    // To get B, skip over K entries from previous requests (all heads +
-    // padding)
-    DT const *B = static_cast
<DT const *>(m->keyCache) + i * kt_req_block_size;
-    // To get C, skip over QK^T products from previous requests
+    // Step 2: Add alibi position bias to qk production
+    // matrix C: qk_prods
+    // matrix C's layout: [num_new_tokens, total_tokens, num_heads]
+    // To get C, skip over QK.T products from previous requests
     DT *C = static_cast<DT *>
(m->qk_prods); - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; apply_position_bias_qkprd<< 0) { @@ -1022,87 +1056,102 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, static_cast
(-INFINITY)); } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size;
-    // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous
-    // requests (all heads)
-    B = C_softmax;
-    // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous
-    // requests
-
-    // store the result attn heads, also skip the genration tokens
-    C = static_cast<DT *>
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + { + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size;
+      // matrix B: qk_prods_softmax
+      // matrix B's layout: [num_new_tokens, total_tokens, num_heads]
+      // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous
+      // requests (all heads)
+      DT *B = static_cast
<DT *>(m->qk_prods_softmax);
+      ;
+      // matrix C: attn heads
+      // matrix C's layout: [vProjSize, num_heads, num_new_tokens]
+      // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous
+      // requests
+      // store the result attn heads, also skip the genration tokens
+      DT *C = static_cast<DT *>
(m->attn_heads) + + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } tokens_previous_requests += num_new_tokens; } assert(tokens_previous_requests == num_tokens); @@ -1255,7 +1304,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(kSize == vSize); qProjSize = _qProjSize; kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK^T matmul + assert(qProjSize == kProjSize); // required for attention QK.T matmul vProjSize = _vProjSize; oProjSize = _oProjSize; size_t size_of_dt = data_type_size(attn->data_type); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 6dad1c6de9..562dee4d93 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -492,7 +492,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - + compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } From 7e7f955f7a4a1f5de9f78d7e964f8e4d0baabb72 Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Sun, 24 Dec 2023 10:14:12 -0500 Subject: [PATCH 03/61] Fix link issue (#1247) --- src/ops/inc_multihead_self_attention.cu | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 7da9aa389c..695f4b13b9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1515,4 +1515,24 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( GenericTensorAccessorR const weight, DataType data_type, cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + cudaStream_t stream); }; // namespace FlexFlow From ed5a2e07fdc9285612f167c150f8d138e51895f7 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 25 Dec 2023 12:17:48 -0500 Subject: [PATCH 04/61] init --- include/flexflow/batch_config.h | 12 + include/flexflow/config.h | 9 + include/flexflow/ffconst.h | 1 + include/flexflow/model.h | 45 + include/flexflow/operator_params.h | 2 + .../specinfer_inc_multihead_self_attention.h | 150 +++ ...nfer_inc_multihead_self_attention_params.h | 33 + include/flexflow/request_manager.h | 14 +- inference/file_loader.cc | 3 +- inference/models/llama.cc | 5 +- inference/spec_infer/spec_infer.cc | 3 + src/ops/inc_multihead_self_attention.cpp | 19 + src/ops/inc_multihead_self_attention.cu | 61 +- .../specinfer_inc_multihead_self_attention.cc | 883 +++++++++++++++++ 
.../specinfer_inc_multihead_self_attention.cu | 890 ++++++++++++++++++ src/ops/tree_inc_multihead_self_attention.cu | 24 +- src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 71 +- src/runtime/inference_manager.cc | 13 +- src/runtime/model.cc | 149 ++- src/runtime/model.cpp | 48 + src/runtime/model.cu | 28 +- src/runtime/request_manager.cc | 250 +++-- src/runtime/request_manager.cpp | 16 + src/runtime/request_manager.cu | 50 + 25 files changed, 2589 insertions(+), 192 deletions(-) create mode 100644 include/flexflow/ops/specinfer_inc_multihead_self_attention.h create mode 100644 include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h create mode 100644 src/ops/specinfer_inc_multihead_self_attention.cc create mode 100644 src/ops/specinfer_inc_multihead_self_attention.cu diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index e2903c4d11..c33c3558cc 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -129,6 +129,9 @@ class BeamSearchBatchConfig : public BatchConfig { inline static int const MAX_BEAM_WIDTH = 1; inline static int const MAX_BEAM_DEPTH = 8; + // maximum tree branches for a request + inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 9; + int model_id; struct BeamSearchPerRequestInfo { @@ -139,14 +142,23 @@ class BeamSearchBatchConfig : public BatchConfig { BatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; int parent_id[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int sub_request_num; }; struct BeamSearchPerTokenInfo { int sub_request_index; }; + struct SpecInferTopology { + int real_token_pos[MAX_SPECULATIVE_TREE_BRANCHES][MAX_NUM_TOKENS]; + int allocated_tokens; + }; + + BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; + SpecInferTopology topology_mask[MAX_NUM_REQUESTS]; + // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? 
int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index c2af6d707c..321d14961b 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -16,6 +16,7 @@ #ifndef _FLEXFLOW_CONFIG_H_ #define _FLEXFLOW_CONFIG_H_ #include "ffconst.h" +#include "flexflow/batch_config.h" #include "legion.h" #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -75,6 +76,14 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; + void *batch_config_metadata; + + // request info + token info + topolopgy mask info + size_t batch_config_metadata_size = + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::topology_mask) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e624..ef0003b08e 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -171,6 +171,7 @@ enum OperatorType { OP_INC_MULTIHEAD_SELF_ATTENTION, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // Parallel Ops OP_REPARTITION, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d8402ba622..3602cb108b 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -172,6 +172,8 @@ enum TaskIDs { SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, FUSEDOP_FWD_TASK_ID, @@ -324,6 +326,7 @@ class Linear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; +class SpecInferIncMultiHeadSelfAttention; class Pool2D; class Reduce; class Reshape; @@ -743,6 +746,25 @@ class FFModel { bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); + +Tensor specinfer_inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); Tensor inc_multiquery_self_attention(const Tensor input, int embed_dim, int num_q_heads, @@ -799,6 +821,26 @@ class FFModel { bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); + + Tensor specinfer_inc_multiquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = false, + bool add_bias_kv = false, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + bool apply_rotary_embedding = false, + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); // ======================================== // Inference APIs // 
======================================== @@ -1200,6 +1242,9 @@ class FFModel { std::unordered_map< std::pair, TreeIncMultiHeadSelfAttention *>, + std::unordered_map< + std::pair, + SpecInferIncMultiHeadSelfAttention *>, std::unordered_map, Reduce *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 5b187839ef..cee2ae95a4 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -37,6 +37,7 @@ #include "flexflow/ops/topk_params.h" #include "flexflow/ops/transpose_params.h" #include "flexflow/ops/tree_inc_multihead_self_attention_params.h" +#include "flexflow/ops/specinfer_inc_multihead_self_attention_params.h" #include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" @@ -72,6 +73,7 @@ using OperatorParameters = mp::variant +#include + +namespace FlexFlow { + +class SpecInferIncMultiHeadSelfAttentionMeta; + +class SpecInferIncMultiHeadSelfAttention : public Op { +public: + using Params = SpecInferIncMultiHeadSelfAttentionParams; + using Input = ParallelTensor; + + SpecInferIncMultiHeadSelfAttention(FFModel &model, + LayerID const &layer_guid, + const ParallelTensor _input, + int _embed_dim, + int _num_q_heads, + int _num_kv_heads, + int _kdim, + int _vdim, + float _dropout, + bool _qkv_bias, + bool _final_bias, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, + bool _position_bias, + bool allocate_weights, + char const *name); + SpecInferIncMultiHeadSelfAttention(FFModel &model, + const ParallelTensor _input, + const ParallelTensor _weight, + int _embed_dim, + int _num_q_heads, + int _num_kv_heads, + int _kdim, + int _vdim, + float _dropout, + bool _qkv_bias, + bool _final_bias, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, + bool _position_bias, + bool allocate_weights, + char const *name); + SpecInferIncMultiHeadSelfAttention(FFModel &model, + SpecInferIncMultiHeadSelfAttention const &other, + const ParallelTensor input, + bool allocate_weights); + SpecInferIncMultiHeadSelfAttention(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const *name = nullptr); + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override { + assert(0); + } + bool get_int_parameter(PMParameter, int *) const override; + + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + bool measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const override; + + static void + 
inference_kernel_wrapper(SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias); + Params get_params() const; + +public: + int num_q_heads, num_kv_heads, tensor_parallelism_degree; + float dropout, scaling_factor; + bool qkv_bias; + bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, + qk_prod_scaling, position_bias; + int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int qoSeqLength, kvSeqLength; +}; + +class SpecInferIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { +public: + SpecInferIncMultiHeadSelfAttentionMeta(FFHandler handler, + SpecInferIncMultiHeadSelfAttention const *attn, + GenericTensorAccessorR const &weight, + MemoryAllocator &gpu_mem_allocator, + int num_samples, + int _num_q_heads, + int _num_kv_heads); + ~SpecInferIncMultiHeadSelfAttentionMeta(void); + +public: + Realm::RegionInstance beam_search_reserve_inst; + BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; + BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_H diff --git a/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h b/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h new file mode 100644 index 0000000000..b57b06a7f7 --- /dev/null +++ b/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H +#define _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +struct SpecInferIncMultiHeadSelfAttentionParams { + LayerID layer_guid; + int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; + float dropout, scaling_factor; + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling, position_bias; + + bool is_valid(ParallelTensorShape const &) const; +}; + +bool operator==(SpecInferIncMultiHeadSelfAttentionParams const &, + SpecInferIncMultiHeadSelfAttentionParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t + operator()(FlexFlow::SpecInferIncMultiHeadSelfAttentionParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index baf6844801..e67888d2d6 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -38,7 +38,8 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, - ParallelTensor const input); + ParallelTensor const input, + FFHandler *handlers); void load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset); @@ -72,9 +73,10 @@ struct Request { struct BeamTree { struct treeLayer { BeamSearchBatchConfig::TokenId - tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - float 
probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + int nodes_num_this_layer = 0; }; treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; }; @@ -100,6 +102,7 @@ class RequestManager { void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); + void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, @@ -148,6 +151,7 @@ class RequestManager { void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); void update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamSearchBatchConfig const &old_bc, BeamTree &tree, int request_index); @@ -210,6 +214,7 @@ class RequestManager { int max_requests_per_batch; int max_tokens_per_batch; int max_sequence_length; + std::vector spec_infer_tree_width; // private fields std::unique_ptr tokenizer_; bool verbose; @@ -243,7 +248,8 @@ class RequestManager { private: struct ProfileInfo { - int decoding_steps; + int llm_decoding_steps; + int ssm_decoding_steps; double start_time, finish_time; }; std::unordered_map profiling_requests; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 7c6870d439..3f70ddf488 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -726,7 +726,8 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION) { if (weight_filename.find("self_attention") != std::string::npos) { load_attention_weights_multi_query( data, weight_filename, weights_folder, hidden_dim, num_heads); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index b8fe70526d..f62df1b1d7 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -90,7 +90,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( + mha = ff.specinfer_inc_multihead_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, @@ -246,7 +246,8 @@ void LLAMA::create_llama_model(FFModel &ff, if (mode == BEAM_SEARCH_MODE) { Tensor softmax = ff.softmax(dense, -1); // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); - output = ff.argmax(softmax, /*beam_Search*/ true); + // output = ff.argmax(softmax, /*beam_Search*/ true); + output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); } else { // Tensor softmax = ff.softmax(dense, -1); if (generation_config.do_sample) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8b0eb926d9..e2594ba87f 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -302,6 +302,9 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); + //first decoding step: 3 results + rm->push_spec_infer_tree_width(1); + // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); if (model_metadata.llm_model_type == ModelType::LLAMA) { diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp 
index d60386f927..a59740f4a3 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -1098,4 +1098,23 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + cudaStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + cudaStream_t stream); + }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 695f4b13b9..4c184acb3c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -826,17 +826,17 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, } // todo Xinhao copy how many requests if requests are not continous? - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); + // cudaMemcpyAsync(m->token_infos, + // &(bc->tokensInfo), + // bc->num_active_tokens() * + // sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, + // stream); + // cudaMemcpyAsync(m->request_infos, + // &(bc->requestsInfo), + // bc->max_requests_per_batch() * + // sizeof(BatchConfig::PerRequestInfo), + // cudaMemcpyHostToDevice, + // stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, @@ -1375,14 +1375,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( break; } case BEAM_SEARCH_MODE: { + // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; break; } default: @@ -1400,10 +1401,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + - complex_size * sizeof(cuFloatComplex) + - requestinfo_size * - sizeof(BatchConfig::PerRequestInfo); // more components will + complex_size * sizeof(cuFloatComplex); // more components will // be added here later if (offload) { // assert that we have enough reserved work space left @@ -1447,10 +1445,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); + token_infos = + static_cast(handler.batch_config_metadata); + request_infos = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo)); + if (offload) { - token_infos = - gpu_mem_allocator.allocate_reserved( - 
tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_reserved( + // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); @@ -1464,13 +1467,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(cuFloatComplex); - request_infos = - gpu_mem_allocator.allocate_reserved( - requestinfo_size); + // request_infos = + // gpu_mem_allocator.allocate_reserved( + // requestinfo_size); } else { - token_infos = - gpu_mem_allocator.allocate_instance( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_instance( + // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -1479,9 +1482,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); - request_infos = - gpu_mem_allocator.allocate_instance( - requestinfo_size); + // request_infos = + // gpu_mem_allocator.allocate_instance( + // requestinfo_size); } // allocate more size for quantization data diff --git a/src/ops/specinfer_inc_multihead_self_attention.cc b/src/ops/specinfer_inc_multihead_self_attention.cc new file mode 100644 index 0000000000..42074f39e4 --- /dev/null +++ b/src/ops/specinfer_inc_multihead_self_attention.cc @@ -0,0 +1,883 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; +using PCG::Node; + +bool SpecInferIncMultiHeadSelfAttentionParams::is_valid( + ParallelTensorShape const &input) const { + bool is_valid = input.is_valid(); + return is_valid; +} + +Tensor FFModel::specinfer_inc_multihead_self_attention( + Tensor const input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { + return specinfer_inc_multiquery_self_attention(input, + embed_dim, + num_heads, + num_heads, + kdim, + vdim, + dropout, + qkv_bias, + final_bias, + add_zero_attn, + data_type, + kernel_initializer, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + name); +} + +Tensor FFModel::specinfer_inc_multiquery_self_attention( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool qkv_bias, + bool final_bias, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + bool apply_rotary_embedding, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *li = nullptr; + int weight_num = (qkv_bias || final_bias) ? 2 : 1; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); + li = new Layer(this, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + casted_input); + } else { + li = new Layer(this, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, + data_type, + name, + 1 /*inputs*/, + weight_num /*weights*/, + 1 /*outputs*/, + input); + } + { + int numdims = input->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = input->dims[i]; + } + dims[0] = embed_dim; + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, data_type, li, 0, true /*create_grad*/); + } + // Compute weight size + int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, + oProjSize = embed_dim; + int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; + int qParas = qProjSize * qSize; + int kParas = kProjSize * kSize; + int vParas = vProjSize * vSize; + int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); + int weight_size = qParas * num_q_heads + kParas * num_q_heads + + vParas * num_q_heads + oParas * num_q_heads; + { + int dims[1] = {weight_size}; + li->weights[0] = create_weight_legion_ordering(1, + dims, + data_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + if (qkv_bias || final_bias) { + // q, k, v, o + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + + (final_bias ? oProjSize : 0)}; + li->weights[1] = create_weight_legion_ordering(1, + dims, + data_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + li->data_type = data_type; + li->add_int_property("embed_dim", embed_dim); + li->add_int_property("num_q_heads", num_q_heads); + li->add_int_property("num_kv_heads", num_kv_heads); + li->add_int_property("kdim", kdim); + li->add_int_property("vdim", vdim); + li->add_int_property("qkv_bias", qkv_bias); + li->add_int_property("final_bias", final_bias); + li->add_int_property("add_zero_attn", add_zero_attn); + li->add_float_property("dropout", dropout); + li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("scaling_query", scaling_query); + li->add_float_property("scaling_factor", scaling_factor); + li->add_int_property("qk_prod_scaling", qk_prod_scaling); + li->add_int_property("position_bias", position_bias); + layers.push_back(li); + return li->outputs[0]; +} + +Op *SpecInferIncMultiHeadSelfAttention::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + + std::cout << "spec create operator: " << layer->name << "\n"; + long long value; + layer->get_int_property("embed_dim", value); + int embed_dim = value; + layer->get_int_property("num_q_heads", value); + int num_q_heads = value; + layer->get_int_property("num_kv_heads", value); + int num_kv_heads = value; + layer->get_int_property("kdim", value); + int kdim = value; + layer->get_int_property("vdim", value); + int vdim = value; + float dropout; + layer->get_float_property("dropout", dropout); + layer->get_int_property("qkv_bias", value); + bool qkv_bias = (bool)value; + layer->get_int_property("final_bias", value); + bool final_bias = (bool)value; + layer->get_int_property("add_zero_attn", value); + bool add_zero_attn = (bool)value; + layer->get_int_property("apply_rotary_embedding", value); + bool apply_rotary_embedding = (bool)value; + layer->get_int_property("scaling_query", value); + bool scaling_query = (bool)value; + float scaling_factor; + layer->get_float_property("scaling_factor", scaling_factor); + layer->get_int_property("qk_prod_scaling", value); + bool qk_prod_scaling = (bool)value; + layer->get_int_property("position_bias", value); + bool position_bias = (bool)value; + + return new SpecInferIncMultiHeadSelfAttention(model, + layer->layer_guid, + inputs[0], + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + qkv_bias, + final_bias, + add_zero_attn, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + false /*allocate_weights*/, + layer->name); +} + +SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( + FFModel &model, + LayerID const &_layer_guid, + ParallelTensor const _input, + int _embed_dim, + int _num_q_heads, + int _num_kv_heads, + int _kdim, + int _vdim, + float _dropout, + bool _qkv_bias, + bool _final_bias, + bool _add_zero_attn, + bool _apply_rotary_embedding, + 
bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, + bool _position_bias, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, + _input->data_type, + name, + 1 /*inputs*/, + (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 1 /*outputs*/, + _input), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { + // overwrite layer_guid + layer_guid = _layer_guid; + + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + ParallelDim dims[2]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_q_heads * (qParas + oParas) + + this->num_q_heads * (kParas + vParas); + dims[1].is_replica_dim = false; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); + weights[0] = model.create_parallel_weight<2>(dims, + this->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + if (qkv_bias || final_bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, this->data_type, this); + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* // Check correctness */ + /* assert(check_output_input_weight_parallel_dims()); */ +} + +SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( + FFModel &model, + ParallelTensor const _input, + ParallelTensor const _weight, + int _embed_dim, + int _num_q_heads, + int _num_kv_heads, + int _kdim, + int _vdim, + float _dropout, + bool _qkv_bias, + bool _final_bias, + bool _add_zero_attn, + bool _apply_rotary_embedding, + bool _scaling_query, + float _scaling_factor, + bool _qk_prod_scaling, + bool _position_bias, + bool allocate_weights, + char const *name) + // Initializer* _bias_initializer) + : Op(model, + OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, + _input->data_type, + name, + 1 /*inputs*/, + (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 1 /*outputs*/, + _input, + _weight), + num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), + qkv_bias(_qkv_bias), final_bias(_final_bias), + add_zero_attn(_add_zero_attn), + apply_rotary_embedding(_apply_rotary_embedding), + qSize(_input->dims[0].size), kSize(_input->dims[0].size), + vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), + vProjSize(_vdim), oProjSize(_embed_dim), + qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), + scaling_query(_scaling_query), scaling_factor(_scaling_factor), + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) +// bias_initializer(_bias_initializer) +{ + numOutputs = 1; + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + dims[0].size = _embed_dim; + // Currently require no parallelism along this dim + assert(dims[0].degree == 1); + if (allocate_weights) { + // Create weight tensor + int num_dims = inputs[0]->num_dims; + // Compute weight size + int qParas = this->qProjSize * this->qSize; + int kParas = this->kProjSize * this->kSize; + int vParas = this->vProjSize * this->vSize; + int oParas = + this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); + ParallelDim dims[2]; + dims[0] = inputs[0]->dims[num_dims - 2]; + dims[0].size = dims[0].degree; + dims[1] = inputs[0]->dims[num_dims - 1]; + dims[1].size = this->num_q_heads * (qParas + oParas) + + this->num_q_heads * (kParas + vParas); + dims[1].is_replica_dim = false; + // dims[2].size = qParas + kParas + vParas + oParas; + int seed = std::rand(); + Initializer *initializer = new GlorotUniform(seed); + weights[0] = model.create_parallel_weight<2>(dims, + this->data_type, + NULL /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + if (qkv_bias || final_bias) { + ParallelTensorShape bias_shape = _input->get_shape(); + int qkv_bias_size = + qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; + bias_shape.dims[0].size = + (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); + bias_shape.dims[1].size = bias_shape.dims[2].size = 1; + weights[1] = + model.create_parallel_weight_legion_ordering(bias_shape.num_dims, + bias_shape.dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + initializer, + CHOSEN_SYNC_TYPE); + } + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( + _input->num_dims, dims, this->data_type, this); + + /* for (int i = 0; i < numdim; i++) { */ + /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ + /* } */ + /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ + /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ + // Check correctness + /* assert(check_output_input_weight_parallel_dims()); */ +} + +SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( + FFModel &model, + SpecInferIncMultiHeadSelfAttention const &other, + ParallelTensor const input, + bool allocate_weights) + : SpecInferIncMultiHeadSelfAttention(model, + other.layer_guid, + input, + other.oProjSize, + other.num_q_heads, + other.num_kv_heads, + other.qProjSize, + other.vProjSize, + other.dropout, + other.qkv_bias, + other.final_bias, + other.add_zero_attn, + other.apply_rotary_embedding, + other.scaling_query, + other.scaling_factor, + other.qk_prod_scaling, + other.position_bias, + allocate_weights, + other.name) {} + +SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( + FFModel &model, + SpecInferIncMultiHeadSelfAttentionParams const ¶ms, + ParallelTensor const &input, + bool allocate_weights, + char const *name) + : SpecInferIncMultiHeadSelfAttention(model, + params.layer_guid, + input, + params.embed_dim, + params.num_q_heads, + params.num_kv_heads, + params.kdim, + params.vdim, + params.dropout, + params.qkv_bias, + params.final_bias, + params.add_zero_attn, + params.apply_rotary_embedding, + params.scaling_query, + params.scaling_factor, + params.qk_prod_scaling, + params.position_bias, + allocate_weights, + name) {} + +void SpecInferIncMultiHeadSelfAttention::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = batch_outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher( + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(SpecInferIncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +void SpecInferIncMultiHeadSelfAttention::init(FFModel const &ff) { + assert(check_output_input_weight_same_parallel_is()); + parallel_is = outputs[0]->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher( + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(SpecInferIncMultiHeadSelfAttention)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(2, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +/* + regions[0](I): input + regions[1](I): weight + regions[2](O): output +*/ +OpMeta *SpecInferIncMultiHeadSelfAttention::init_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + SpecInferIncMultiHeadSelfAttention const *attn = + (SpecInferIncMultiHeadSelfAttention *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + + int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; + assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); + int num_q_heads = attn->num_q_heads; + int num_kv_heads = attn->num_kv_heads; + 
assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); + + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + MemoryAllocator gpu_mem_allocator(gpu_mem); + // We don't do offloading for SSMs (small speculative models) + SpecInferIncMultiHeadSelfAttentionMeta *m = + new SpecInferIncMultiHeadSelfAttentionMeta(handle, + attn, + weight, + gpu_mem_allocator, + num_samples, + num_q_heads, + num_kv_heads); + // assert that we didn't over allocate memory + assert(gpu_mem_allocator.instance_allocated_size == + gpu_mem_allocator.instance_total_size); + m->profiling = attn->profiling; + m->inference_debugging = attn->inference_debugging; + std::strcpy(m->op_name, attn->name); + m->layer_guid = attn->layer_guid; + assert(weight.domain.get_volume() * data_type_size(weight.data_type) == + m->weightSize); + return m; +} + +void SpecInferIncMultiHeadSelfAttention::forward(FFModel const &ff) { + // SpecInferIncMultiHeadSelfAttention doesn't support forward + assert(false); +} + +FutureMap SpecInferIncMultiHeadSelfAttention::inference( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + IndexLauncher launcher(SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + + if (qkv_bias || final_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(idx++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void SpecInferIncMultiHeadSelfAttention::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + if (bc.num_tokens == 0) { + return; + } + + SpecInferIncMultiHeadSelfAttentionMeta *m = + *((SpecInferIncMultiHeadSelfAttentionMeta **)task->local_args); + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 + : regions.size() == 3)); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 4); + } + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 2); + assert(output_domain.get_dim() == 4); + + assert(task->index_point.get_dim() == 1); + SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, &bc, task->index_point.point_data[0], input, weight, output, biases); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + weights_accessors.push_back(weight); + if (*m->qkv_bias || *m->final_bias) { + weights_accessors.push_back(biases); + } + SpecInferIncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, &bc, {input}, weights_accessors, {output}); + } +} + +void SpecInferIncMultiHeadSelfAttention::backward(FFModel const &ff) { + // SpecInferIncMultiHeadSelfAttention does not support backward + assert(false); +} + +bool SpecInferIncMultiHeadSelfAttention::get_int_parameter(PMParameter para, + int *value) const { + switch (para) { + case PM_NUM_HEADS: + *value = num_q_heads; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +Op *SpecInferIncMultiHeadSelfAttention::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + SpecInferIncMultiHeadSelfAttentionParams params = get_params(); + return new SpecInferIncMultiHeadSelfAttention( + ff, params, inputs[0], true, this->name); +} + +bool SpecInferIncMultiHeadSelfAttention::measure_operator_cost( + Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(SpecInferIncMultiHeadSelfAttentionParams const &lhs, + SpecInferIncMultiHeadSelfAttentionParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && + lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && + lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && + lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && + lhs.add_zero_attn == rhs.add_zero_attn && + lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.scaling_query == rhs.scaling_query && + lhs.scaling_factor == rhs.scaling_factor && + lhs.qk_prod_scaling == rhs.qk_prod_scaling && + lhs.position_bias == rhs.position_bias; +} + +SpecInferIncMultiHeadSelfAttentionParams + SpecInferIncMultiHeadSelfAttention::get_params() const { + 
SpecInferIncMultiHeadSelfAttentionParams params; + params.layer_guid = this->layer_guid; + params.embed_dim = this->oProjSize; + params.num_q_heads = this->num_q_heads; + params.num_kv_heads = this->num_kv_heads; + params.kdim = this->kProjSize; + params.vdim = this->vProjSize; + params.dropout = this->dropout; + params.qkv_bias = this->qkv_bias; + params.final_bias = this->final_bias; + params.add_zero_attn = this->add_zero_attn; + params.apply_rotary_embedding = this->apply_rotary_embedding; + params.scaling_query = this->scaling_query; + params.scaling_factor = this->scaling_factor; + params.qk_prod_scaling = this->qk_prod_scaling; + params.position_bias = this->position_bias; + + return params; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::SpecInferIncMultiHeadSelfAttentionParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.embed_dim); + hash_combine(key, params.num_q_heads); + hash_combine(key, params.num_kv_heads); + hash_combine(key, params.kdim); + hash_combine(key, params.vdim); + hash_combine(key, params.dropout); + hash_combine(key, params.qkv_bias); + hash_combine(key, params.final_bias); + hash_combine(key, params.add_zero_attn); + hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.scaling_query); + hash_combine(key, params.scaling_factor); + hash_combine(key, params.qk_prod_scaling); + hash_combine(key, params.position_bias); + return key; +} +}; // namespace std diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu new file mode 100644 index 0000000000..0bdf07a9d7 --- /dev/null +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -0,0 +1,890 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +#define WARP_SIZE 32 + +// declare Legion names +using Legion::coord_t; +using Legion::Memory; +using namespace Kernels::IncMultiHeadAttention; + +namespace Kernels { +namespace SpecInferIncMultiHeadAttention { + +template +__global__ void compute_specinfer_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, + BeamSearchBatchConfig::SpecInferTopology *topology_mask, + int max_tree_branches) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + BeamSearchBatchConfig::SpecInferTopology topology = + topology_mask[request_idx]; + + int const first_step = 0; + + int const tlength = request_infos[request_idx].first_token_depth_in_request + + request_infos[request_idx].num_tokens_in_batch; + // int const qlength = request_infos[request_idx].num_tokens_in_batch; + int const tree_branch_num = beam_request_infos[request_idx].sub_request_num; + + // will decode qlength tokens in this thread block + // int const qlength = tree_branch_num; + + int first_token_idx = 0; + for (int r = 0; r < request_idx; r++) { + first_token_idx += request_infos[request_idx].num_tokens_in_batch; + } + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + + request_idx * max_seq_length * hidden_size * max_tree_branches + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int sub_req_idx = 0; sub_req_idx < tree_branch_num; sub_req_idx += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * sub_req_idx) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + // find the real position of the cache; + // depth: 0, 1, 2, 3, 4, 4, 5, 5 ,5, 5, + int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + k[ii] = *reinterpret_cast( + k_cache_batch + real_cache_idx * hidden_size + + head_idx * per_head_size + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); + } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 
0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("softmax %.10f\n", qk_smem[0]); + // } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + + request_idx * max_seq_length * hidden_size * max_tree_branches + vi; + // DT const *v_cache_batch = + // value_cache + + // (beam_request_idx * max_beam_width + beam_sub_request_idx) * + // max_seq_length * hidden_size + + // vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + V_vec v = *reinterpret_cast( + v_cache_batch + real_cache_idx * hidden_size + + head_idx * per_head_size); + float logit = qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. 
+ if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float( + *reinterpret_cast(output_ptr + request_idx * hidden_size + + head_idx * per_head_size + vi), + out); + } + } +} + +template +__global__ void specinfer_store_kv_cache( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo *tokenInfos, + BatchConfig::PerRequestInfo *requestInfo, + BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask, + int qProjSize, + int kProjSize, + int vProjSize, + int num_tokens, + int max_seq_len, + int max_tree_branches, + bool is_root, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { + int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int offset = i % hidden_size; + + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + + // above no need to be changed + // int const req_id = id_map[token_idx].request_index; + // int const tok_id = id_map[token_idx].token_position; + // int const sub_req_id = id_map[token_idx].sub_request_index; + // int const parent_id = id_map[token_idx].parent_id; + // int const beam_depth = id_map[token_idx].beam_depth; + // int const beam_width = id_map[token_idx].beam_width; + + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; + // int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; + // int const beam_depth = beamRequestInfos[req_id].current_depth; + // int const beam_width = beamRequestInfos[req_id].beam_size; + int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; + + kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + (allocated_tokens + sub_req_id) * hidden_size + offset] = kVal; + vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + (allocated_tokens + sub_req_id) * hidden_size + offset] = vVal; + } +} + +template +void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_active_tokens(); + int curr_depth = bc->beamRequestsInfo[0].current_depth; + // printf("curr depth: %d\n", curr_depth); + // assert(curr_depth < 3); + if (num_tokens > 0) { + int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; + specinfer_store_kv_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->request_infos, + m->beam_token_infos, + m->beam_request_infos, + m->beam_topology_mask, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + BatchConfig::max_sequence_length(), + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, + /*root*/ curr_depth == 0, + m->hidden_size); + } +} + +#define LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
<DT>(m->qProjSize, \ + BatchConfig::max_sequence_length(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_specinfer_attention_kernel_generation_kernel<DT, THDS_PER_BLOCK, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE> \ + <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>( \ + static_cast<DT *>
<DT *>(m->devQKVProjArray), \ + static_cast<DT *>
(m->keyCache), \ + static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->beam_request_infos, \ + m->beam_topology_mask, \ + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES) + +template +void compute_specinfer_attention_kernel_generation( + SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // one block == one head per request + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} + +template +__global__ void spec_fill_entries_above_diagonal(DT *matrix, + size_t new_tokens, + size_t total_tokens_in_request, + size_t num_q_heads, + DT value) { + CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { + // size_t head_idx = i / (new_tokens * total_tokens_in_request); + size_t src_idx = (i / new_tokens) % total_tokens_in_request; + size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; + // Casual Mask + if (src_idx > dst_idx) { + matrix[i] = value; + } + } +} + +template +void compute_attention_kernel_prompt( + SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + cudaStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } +#endif + // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int tokens_prev_requests_squares = 0; + // int qkv_block_size = + // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; + int q_block_size = m->qProjSize; + + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (tokens_previous_requests < bc->num_generation_tokens) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + + // all requests in prompt 
phase should only have one sub requests; + assert(bc->sub_requests[i] == 1); + // int num_new_tokens = bc->num_processing_tokens[i]; + // int total_tokens = bc->token_last_available_idx[i] + 1; + + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + + if (num_new_tokens <= 0) { + continue; + } + + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize)); + } + // To get A, skip over Q entries from previous requests (same head) + DT const *A = static_cast<DT *>
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast
<DT *>(m->keyCache) + + (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * kt_req_block_size; + + // if (i == 0 && sub_req_id == 0 && + // bc->beam_slots.at(0).current_depth == 1) { + // int offset = (float *)B - m->keyCache; + // printf("key cache offset %d\n", kt_req_block_size); + // } + // To get C, skip over QK^T products from previous requests + DT *C = static_cast<DT *>
(m->qk_prods) + + m->num_q_heads * tokens_prev_requests_squares; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // add alibi position bias to qk production + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + if (num_new_tokens > 1) { + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; + spec_fill_entries_above_diagonal<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + static_cast
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax) + + m->num_q_heads * tokens_prev_requests_squares; + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = m->vProjSize; + n = num_new_tokens; + k = total_tokens; + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + + // padding) + A = static_cast
<DT *>(m->valueCache) + + (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + C = static_cast<DT *>
(m->attn_heads) + + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; + } + + // assert(tokens_previous_requests == num_tokens); +} + +template +void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, + bc, + shard_id, + input_ptr, + weight_ptr, + static_cast
<DT *>(m->devQKVProjArray), + bias_ptr, + stream); + // phase 2: Update key/val cache + update_kv_cache_kernel<DT>
(m, bc, stream); + if (bc->num_generation_tokens > 0) { + compute_specinfer_attention_kernel_generation<DT>
( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + } + // phase 3: Compute attention score + // 3 kernels for pahse 3: matmul1 - softmax - matmal2 + if (bc->num_tokens > bc->num_generation_tokens) { + compute_attention_kernel_prompt( + m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + } + + // compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); + + compute_o_prod_bias( + m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); +} + +} // namespace SpecInferIncMultiHeadAttention +} // namespace Kernels + +/*static*/ +void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( + SpecInferIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input.data_type == weight.data_type); + assert(input.data_type == output.data_type); + if (use_bias) { + assert(input.data_type == bias.data_type); + } + + if (input.data_type == DT_HALF) { + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::SpecInferIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); + } else if (input.data_type == DT_FLOAT) { + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::SpecInferIncMultiHeadAttention::inference_kernel( + m, + bc, + shard_id, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("SpecInferIncMultiHeadSelfAttention forward time = %.2fms\n", + elapsed); + // print_tensor<3, float>(acc_query.ptr, acc_query.rect, + // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, + // acc_output.rect, "[Attention:forward:output]"); + } +} + +SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( + FFHandler handler, + SpecInferIncMultiHeadSelfAttention const *attn, + GenericTensorAccessorR const &weight, + MemoryAllocator &gpu_mem_allocator, + int num_samples, + int _num_q_heads, + int _num_kv_heads) + : IncMultiHeadSelfAttentionMeta(handler, + BEAM_SEARCH_MODE, + attn, + attn->qSize, + attn->kSize, + attn->vSize, + attn->qProjSize, + attn->kProjSize, + attn->vProjSize, + attn->oProjSize, + attn->apply_rotary_embedding, + attn->qkv_bias, + attn->scaling_query, + attn->qk_prod_scaling, + attn->position_bias, + attn->final_bias, + attn->scaling_factor, + weight, + gpu_mem_allocator, + num_samples, + attn->num_q_heads, + attn->num_kv_heads, + _num_q_heads, + _num_kv_heads, + DT_NONE, + false) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + checkCUDNN(cudnnSetStream(handler.dnn, stream)); + + // allocate memory for the seqArray and reserve space + { + // int max_tokens_per_batch = 
BatchConfig::max_tokens_per_batch(); + // size_t beam_tokeninfo_size = + // max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + // size_t requestinfo_size = + // BeamSearchBatchConfig::max_requests_per_batch(); size_t + // beam_requestinfo_size = + // BeamSearchBatchConfig::max_requests_per_batch(); + // size_t total_size = + // beam_tokeninfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + + // beam_requestinfo_size * + // sizeof(BeamSearchBatchConfig:: + // BeamSearchPerRequestInfo); // more components will + // // be added here later + + // We always directly allocate memory for small speculative models + // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + // total_size); + beam_topology_mask = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + + beam_token_infos = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::topology_mask)); + + beam_request_infos = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::topology_mask) + + sizeof(BeamSearchBatchConfig::beamTokenInfo)); + // beam_token_infos = + // gpu_mem_allocator + // .allocate_instance( + // beam_tokeninfo_size); + // offset += beam_tokeninfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); + // beam_request_infos = + // gpu_mem_allocator + // .allocate_instance( + // beam_requestinfo_size); + // offset += beam_requestinfo_size * + // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); + // assert(offset == total_size); + // assert(gpu_mem_allocator.instance_total_size == + // gpu_mem_allocator.instance_allocated_size); + } + + cudaStreamSynchronize(stream); +} + +SpecInferIncMultiHeadSelfAttentionMeta::~SpecInferIncMultiHeadSelfAttentionMeta( + void) { + if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { + beam_search_reserve_inst.destroy(); + } +} + +}; // namespace FlexFlow diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index bc7d1017b7..1da56e383a 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -834,18 +834,18 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
(m->bias_ptr); } - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); + // cudaMemcpyAsync(m->token_infos, + // &(bc->tokensInfo), + // bc->num_active_tokens() * + // sizeof(TreeVerifyBatchConfig::PerTokenInfo), + // cudaMemcpyHostToDevice, + // stream); + // cudaMemcpyAsync(m->request_infos, + // &(bc->requestsInfo), + // bc->max_requests_per_batch() * + // sizeof(BatchConfig::PerRequestInfo), + // cudaMemcpyHostToDevice, + // stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..904bfbcaff 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -154,6 +154,8 @@ std::string get_operator_type_name(OperatorType type) { return "SpecIncMultiHeadSelfAttention"; case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: return "TreeIncMultiHeadSelfAttention"; + case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: + return "SpecInferPgraoIncMultiHeadSelfAttention"; case OP_INPUT: return "Input"; case OP_WEIGHT: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 6d33dd9f27..46f7cc0f29 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -51,6 +51,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" +#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" @@ -69,7 +70,7 @@ using FlexFlow::MachineView; LegionRuntime::Logger::Category log_graph("graph"); LegionRuntime::Logger::Category log_simplify("graph_simplify"); -const Node Node::INVALID_NODE = Node(); +Node const Node::INVALID_NODE = Node(); Node::Node(void) : guid(0), ptr(NULL) {} @@ -2384,6 +2385,28 @@ GraphOptimalViewSerialized sez.serialize(attn->tensor_parallelism_degree); break; } + case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { + SpecInferIncMultiHeadSelfAttention *attn = + (SpecInferIncMultiHeadSelfAttention *)op; + sez.serialize(attn->layer_guid.id); + sez.serialize(attn->layer_guid.transformer_layer_id); + sez.serialize(attn->layer_guid.model_id); + sez.serialize(attn->oProjSize); + sez.serialize(attn->num_q_heads); + sez.serialize(attn->qProjSize); + sez.serialize(attn->vProjSize); + sez.serialize(attn->dropout); + sez.serialize(attn->qkv_bias); + sez.serialize(attn->final_bias); + sez.serialize(attn->add_zero_attn); + sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->scaling_query); + sez.serialize(attn->scaling_factor); + sez.serialize(attn->qk_prod_scaling); + sez.serialize(attn->position_bias); + sez.serialize(attn->num_kv_heads); + break; + } case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); @@ -2914,6 +2937,52 @@ void FFModel::deserialize_graph_optimal_view( params); break; } + case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { + assert(num_inputs == 1); + int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; + float dropout, scaling_factor; + bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, + scaling_query, qk_prod_scaling, position_bias; + size_t id, transformer_layer_id, deserialized_model_id; + 
dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + dez.deserialize(embed_dim); + dez.deserialize(num_q_heads); + dez.deserialize(k_dim); + dez.deserialize(v_dim); + dez.deserialize(dropout); + dez.deserialize(qkv_bias); + dez.deserialize(final_bias); + dez.deserialize(add_zero_attn); + dez.deserialize(apply_rotary_embedding); + dez.deserialize(scaling_query); + dez.deserialize(scaling_factor); + dez.deserialize(qk_prod_scaling); + dez.deserialize(position_bias); + dez.deserialize(num_kv_heads); + + SpecInferIncMultiHeadSelfAttentionParams params; + params.embed_dim = embed_dim; + params.num_q_heads = num_q_heads; + params.kdim = k_dim; + params.vdim = v_dim; + params.dropout = dropout; + params.qkv_bias = qkv_bias; + params.final_bias = final_bias; + params.add_zero_attn = add_zero_attn; + params.layer_guid = layer_guid; + params.apply_rotary_embedding = apply_rotary_embedding; + params.scaling_query = scaling_query; + params.scaling_factor = scaling_factor; + params.qk_prod_scaling = qk_prod_scaling; + params.position_bias = position_bias; + params.num_kv_heads = num_kv_heads; + node = get_or_create_node(inputs[0], + params); + break; + } case OP_TOPK: { node = TopK::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index eb045e8159..fb978adfff 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -318,7 +318,7 @@ FutureMap InferenceManager::inference(FFModel *model, found_input_operator = true; assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_input_tokens_from_batch_config(bc, pt); + load_input_tokens_from_batch_config(bc, pt, model->handlers); } } @@ -348,11 +348,20 @@ FutureMap InferenceManager::inference(FFModel *model, }; void InferenceManager::load_input_tokens_from_batch_config( - BatchConfigFuture const &bc, ParallelTensor const input) { + BatchConfigFuture const &bc, ParallelTensor const input, FFHandler *handlers) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; + Rect<1> task_rect(Point<1>(0), + Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); + IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + MachineView view = input->machine_view; + for (PointInRectIterator<1> it(task_rect); it(); it++) { + FFHandler handle = handlers[view.get_device_id(*it)]; + argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); + } + IndexLauncher launcher(RM_LOAD_TOKENS_TASK_ID, input->parallel_is, TaskArgument(nullptr, 0), diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 92f0cff472..8bda9016c3 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -59,6 +59,7 @@ #include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" +#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" @@ -93,10 +94,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - const ParallelTensor input1, - const ParallelTensor input2, - const ParallelTensor input3, - const ParallelTensor input4) + ParallelTensor const input1, + ParallelTensor const input2, + 
ParallelTensor const input3, + ParallelTensor const input4) : Op(model, otype, dtype, @@ -116,10 +117,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - const ParallelTensor _input1, - const ParallelTensor _input2, - const ParallelTensor _input3, - const ParallelTensor _input4) + ParallelTensor const _input1, + ParallelTensor const _input2, + ParallelTensor const _input3, + ParallelTensor const _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling), @@ -1024,9 +1025,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(const ParallelTensor output, +int Op::get_output_to_input_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor input) { + ParallelTensor const input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1059,9 +1060,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output, return -1; } -int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, +int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor weight) { + ParallelTensor const weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1658,7 +1659,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1691,7 +1692,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1741,7 +1742,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], +ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1822,7 +1823,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], +ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1853,7 +1854,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1873,7 +1874,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -2087,7 +2088,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - const Tensor tensor, ParallelTensor ¶llel_tensor) const { + Tensor const tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -2124,7 +2125,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int 
num_dims, - const ParallelDim dims[], + ParallelDim const dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2147,7 +2148,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2180,7 +2181,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -2204,7 +2205,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -2241,7 +2242,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(const ParallelTensor tensor, +void FFModel::create_disjoint_partition(ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2289,7 +2290,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2671,7 +2672,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { +IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -3038,6 +3039,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { + Op *op = SpecInferIncMultiHeadSelfAttention::create_operator_from_layer( + *this, layer, inputs); + operators.push_back(op); + return op; + } case OP_BATCHMATMUL: { Op *op = BatchMatmul::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -3227,7 +3234,7 @@ Op *FFModel::create_operator_from_layer( } void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; + std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; @@ -3973,38 +3980,38 @@ void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - const static int epochs = 1; + static int const epochs = 1; // const static int iterations = 1; - const static int batchSize = 64; - const static bool profiling = false; - const static bool inference_debugging = false; + static int const batchSize = 64; + static bool const profiling = false; + static bool const inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB - const static int numNodes = 1; - const static int workersPerNode = 0; - const static int cpusPerNode = 0; - const static size_t searchBudget = -1; - const static size_t simulatorWorkSpaceSize = + static size_t const workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB + static int const 
numNodes = 1; + static int const workersPerNode = 0; + static int const cpusPerNode = 0; + static size_t const searchBudget = -1; + static size_t const simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GB constexpr static float searchAlpha = 1.2f; - const static bool searchOverlapBackwardUpdate = false; - const static size_t offloadReserveSpaceSize = + static bool const searchOverlapBackwardUpdate = false; + static size_t const offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB - const static bool cpuOffload = false; - const static bool onlyDataParallel = true; - const static bool enableSampleParallel = true; - const static bool enableParameterParallel = false; - const static bool enableAttributeParallel = false; - const static bool enableInplaceOptimizations = false; - const static bool allowTensorOpMathConversion = false; - const static int machine_model_version = 0; - const static int simulator_segment_size = 16777216; // 16 MB - const static int simulator_max_num_segments = 1; - const static int base_optimize_threshold = 10; - const static bool enable_control_replication = true; + static bool const cpuOffload = false; + static bool const onlyDataParallel = true; + static bool const enableSampleParallel = true; + static bool const enableParameterParallel = false; + static bool const enableAttributeParallel = false; + static bool const enableInplaceOptimizations = false; + static bool const allowTensorOpMathConversion = false; + static int const machine_model_version = 0; + static int const simulator_segment_size = 16777216; // 16 MB + static int const simulator_max_num_segments = 1; + static int const base_optimize_threshold = 10; + static bool const enable_control_replication = true; // The default python data loader type is 2 to enable control replication - const static int python_data_loader_type = 2; + static int const python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -6209,6 +6216,44 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } + { + TaskVariantRegistrar registrar( + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, + "SpecInferIncMultiHeadSelfAttention Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + OpMeta *, + SpecInferIncMultiHeadSelfAttention::init_task>( + registrar, "SpecInferIncMultiHeadSelfAttention Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant< + OpMeta *, + SpecInferIncMultiHeadSelfAttention::init_task>(registrar); + } + } + { + TaskVariantRegistrar registrar( + SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + "SpecInferIncMultiHeadSelfAttention Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + SpecInferIncMultiHeadSelfAttention::inference_task>( + registrar, "SpecInferIncMultiHeadSelfAttention Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant< + SpecInferIncMultiHeadSelfAttention::inference_task>(registrar); + } + } // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 6c482426eb..b51ab83091 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp 
@@ -131,6 +131,54 @@ FFHandler .wait(); handle.workSpace = workspaceInst.pointer_untyped(0, sizeof(char)); } + if (handle.offload_reserve_space_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.offload_reserve_space = + workspaceInst.pointer_untyped(0, sizeof(char)); + }else { + handle.offload_reserve_space = nullptr; + } + if (handle.batch_config_metadata_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.batch_config_metadata = + workspaceInst.pointer_untyped(0, sizeof(char)); + }else { + handle.batch_config_metadata = nullptr; + } // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 17401a0f14..523b3c76f3 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -148,9 +148,35 @@ FFHandler .wait(); handle.offload_reserve_space = workspaceInst.pointer_untyped(0, sizeof(char)); - } else { + }else { handle.offload_reserve_space = nullptr; } + if (handle.batch_config_metadata_size > 0) { + printf("allocate instance for metadata %d\n", handle.batch_config_metadata_size); + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.batch_config_metadata = + workspaceInst.pointer_untyped(0, sizeof(char)); + }else { + handle.batch_config_metadata = nullptr; + } + // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7c37f3391e..e1b591c320 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -106,6 +106,11 @@ int RequestManager::get_max_sequence_length() { return max_sequence_length; } +void RequestManager::push_spec_infer_tree_width(int tree_width) { + assert(tree_width <= BeamSearchBatchConfig::MAX_BEAM_WIDTH); + spec_infer_tree_width.emplace_back(tree_width); +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int 
eos_token_id, @@ -176,7 +181,7 @@ size_t RequestManager::get_num_ssms() { RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, int max_sequence_length) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); // Add a new request Request request; @@ -232,7 +237,7 @@ RequestManager::RequestGuid RequestManager::RequestGuid RequestManager::register_new_request(std::string const &prompt, int max_sequence_length) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; @@ -290,7 +295,7 @@ RequestManager::RequestGuid } bool RequestManager::is_request_completed(RequestGuid const &guid) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); assert(all_requests.find(guid) != all_requests.end()); Request const &request = all_requests[guid]; // return request.tokens.size() >= request.max_sequence_length; @@ -299,7 +304,7 @@ bool RequestManager::is_request_completed(RequestGuid const &guid) { GenerationResult RequestManager::get_generation_result(RequestGuid const &guid) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); assert(request_generation_results.find(guid) != request_generation_results.end()); return request_generation_results[guid]; @@ -337,7 +342,7 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -406,13 +411,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); @@ -420,8 +426,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + outputFile << "num decoding steps: " + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -469,7 +475,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } // Update profiling profiling_requests[new_bc.requestsInfo[i].request_guid] - .decoding_steps++; + .llm_decoding_steps++; } } } @@ -494,7 +500,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const 
&old_bc, new_bc.request_completed[i] = false; // add profile_info for the new request ProfileInfo profile_info; - profile_info.decoding_steps = 1; + profile_info.llm_decoding_steps = 1; profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -553,7 +559,7 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, InferenceResult const &result, int model_id) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); if (verbose) { std::cout << "\n############### prepare_next_batch_init ###############\n"; } @@ -664,16 +670,18 @@ BeamSearchBatchConfig // Log profiling info ProfileInfo profile_info = profiling_requests[request.guid]; profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.ssm_decoding_steps = 0; total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { @@ -682,8 +690,8 @@ BeamSearchBatchConfig outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + outputFile << "num decoding steps: " + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -726,8 +734,14 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; + + profiling_requests[request.guid].ssm_decoding_steps = 0; + + int ssm_decoding_steps = 0; new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? 
spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { @@ -735,6 +749,8 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].probs[j] = 1; } + new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.sub_requests[i] = 1; // Token Info @@ -746,6 +762,8 @@ BeamSearchBatchConfig new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; + new_bc.topology_mask[i].real_token_pos[0][token.second] = + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request; // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; @@ -786,14 +804,20 @@ BeamSearchBatchConfig // TODO: Beam Request Info, missing from VerifyTreeBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = 0; for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } + new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.sub_requests[i] = 1; // Token Info @@ -829,12 +853,17 @@ BeamSearchBatchConfig // add profile_info for the new request ProfileInfo profile_info; - profile_info.decoding_steps = 0; + profile_info.llm_decoding_steps = 0; + profile_info.ssm_decoding_steps = 0; profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; // init the beam search metadata per request + int ssm_decoding_steps = profile_info.ssm_decoding_steps; + new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? 
spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].max_depth = std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, @@ -846,6 +875,7 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + new_bc.beamRequestsInfo[i].sub_request_num = 1; new_bc.sub_requests[i] = 1; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -855,6 +885,7 @@ BeamSearchBatchConfig assert(depth < new_request.tokens.size()); new_bc.tokensInfo[new_bc.num_tokens].token_id = new_request.tokens[depth]; + new_bc.topology_mask[i].real_token_pos[0][depth] = depth; // beam search meta data, indicate which sub request this token // belongs to, init to 0; @@ -937,7 +968,7 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); if (verbose) { std::cout << "\n############### prepare_next_batch_beam ###############\n"; } @@ -1005,25 +1036,38 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; - + profiling_requests[request.guid].ssm_decoding_steps += 1; // update the beam search metadata // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH // entries? - new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; - // update the parentid, accumalated_probs, depth, and token_ids + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; + new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; + new_bc.sub_requests[i] = + old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].sub_request_num = + old_bc.beamRequestsInfo[i].sub_request_num * + new_bc.beamRequestsInfo[i].beam_size; + + assert(new_bc.beamRequestsInfo[i].sub_request_num <= + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + if (request.status == Request::RUNNING) { new_bc.beamRequestsInfo[i].current_depth = old_bc.beamRequestsInfo[i].current_depth + 1; new_bc.request_running[i] = true; // do the slot exchange to minimize the cache exchange in kernel. - update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + update_beam_metadata( + new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); } else { assert(false && "Request should not be pending in beam search phase"); } @@ -1059,7 +1103,7 @@ BeamSearchBatchConfig // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { + for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1103,13 +1147,24 @@ BeamSearchBatchConfig // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH // entries? 
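The lines below rederive the per-step beam width from the spec_infer_tree_width schedule that push_spec_infer_tree_width() fills, falling back to a width of 1 once the schedule is exhausted. As a reference, the selection behaves like this minimal sketch; the function name is illustrative, and it assumes spec_infer_tree_width is a std::vector<int>, as push_spec_infer_tree_width() suggests.

// Width of the speculative tree at a given SSM decoding step: take the
// configured schedule entry if one exists, otherwise fall back to 1.
int tree_width_for_step(std::vector<int> const &spec_infer_tree_width,
                        int ssm_decoding_steps) {
  if (ssm_decoding_steps < (int)spec_infer_tree_width.size()) {
    return spec_infer_tree_width[ssm_decoding_steps];
  }
  return 1;
}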
- new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; - // update the parentid, accumalated_probs, depth, and token_ids new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; + new_bc.sub_requests[i] = + old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].sub_request_num = + old_bc.beamRequestsInfo[i].sub_request_num * + new_bc.beamRequestsInfo[i].beam_size; + assert(new_bc.beamRequestsInfo[i].sub_request_num <= + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + + // update the parentid, accumalated_probs, depth, and token_ids if (request.status == Request::PENDING) { // if the request is pending, we need to update the beam search @@ -1152,7 +1207,7 @@ BeamSearchBatchConfig // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { + for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1209,7 +1264,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task( TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { - const std::lock_guard lock(request_queue_mutex); + std::lock_guard const lock(request_queue_mutex); std::cout << "\n############### prepare_next_batch_verify ###############\n"; @@ -1238,7 +1293,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( Request &request = all_requests[guid]; // Profiling - profiling_requests[request.guid].decoding_steps += 1; + profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { new_bc.request_running[i] = true; @@ -1478,16 +1533,19 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; + + // int leaf_node_num = old_bc.sub_requests[index]; + int leaf_node_num = old_bc.beamRequestsInfo[i].sub_request_num; int depth = old_bc.beamRequestsInfo[index].current_depth; // Each token yields (beam_width) results - int beam_width = old_bc.beamRequestsInfo[index].beam_size; + // int beam_width = old_bc.beamRequestsInfo[index].beam_size; // Count tokens sent to model in this request to find the final token's // index result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - beam_width; + leaf_node_num; if (verbose) { std::cout << "i = " << i << ", result index = " << result_index @@ -1514,7 +1572,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } } - for (int beam_id = 0; beam_id < beam_width; beam_id++) { + for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .tokens[beam_id] = result.token_ids[result_index]; @@ -1546,6 +1604,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // for updating the beam search metadata in requests in incremental phase void 
RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamSearchBatchConfig const &old_bc, BeamTree &tree, int request_index) { @@ -1556,6 +1615,9 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1; int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; + // int leaf_node_num = old_bc.sub_requests[request_index]; + int leaf_node_num = old_bc.beamRequestsInfo[request_index].sub_request_num; + if (new_bc.beamRequestsInfo[request_index].current_depth == 1) { // TODO: check if this is correct // for (int j = 0; j < beam_size; j++) { @@ -1568,49 +1630,61 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, // Do nothing // assert(false); } else { - std::set parents; - std::set childs; - // cache stealing - for (int j = 0; j < beam_size; j++) { - int parent_id = tree.treeLayers[depth].parent_ids[j]; - if (childs.find(parent_id) == childs.end()) { - // copy beam slot - new_bc.beamRequestsInfo[request_index].parent_id[parent_id] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[parent_id] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[parent_id] = - tree.treeLayers[depth].tokens[j]; - parents.emplace(j); - childs.emplace(parent_id); - } - } - if (parents.size() < beam_size) { - for (int j = 0; j < beam_size; j++) { - if (parents.find(j) == parents.end()) { - // this slot has not been assigned - // find the smallest not assigned child and put in - if (verbose) { - std::cout << "request_index" << request_index - << ", miss slot: " << j << "\n"; - } - for (int k = 0; k < beam_size; k++) { - if (childs.find(k) == childs.end()) { - // parent -> j to child k; - new_bc.beamRequestsInfo[request_index].parent_id[k] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[k] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[k] = - tree.treeLayers[depth].tokens[j]; - parents.emplace(j); - childs.emplace(k); - break; - } - } - } - } + for (int j = 0; j < leaf_node_num; j++) { + new_bc.beamRequestsInfo[request_index].parent_id[j] = + tree.treeLayers[depth].parent_ids[j]; + new_bc.beamRequestsInfo[request_index].probs[j] = + tree.treeLayers[depth].probs[j]; + new_bc.beamRequestsInfo[request_index].tokens[j] = + tree.treeLayers[depth].tokens[j]; + + // new_bc.topology_mask[request_index].real_token_pos[j] = } + assert(false); + + // std::set parents; + // std::set childs; + // // cache stealing + // for (int j = 0; j < beam_size; j++) { + // int parent_id = tree.treeLayers[depth].parent_ids[j]; + // if (childs.find(parent_id) == childs.end()) { + // // copy beam slot + // new_bc.beamRequestsInfo[request_index].parent_id[parent_id] = + // tree.treeLayers[depth].parent_ids[j]; + // new_bc.beamRequestsInfo[request_index].probs[parent_id] = + // tree.treeLayers[depth].probs[j]; + // new_bc.beamRequestsInfo[request_index].tokens[parent_id] = + // tree.treeLayers[depth].tokens[j]; + // parents.emplace(j); + // childs.emplace(parent_id); + // } + // } + // if (parents.size() < beam_size) { + // for (int j = 0; j < beam_size; j++) { + // if (parents.find(j) == parents.end()) { + // // this slot has not been assigned + // // find the smallest not assigned child and put in + // if (verbose) { + // std::cout << "request_index" << request_index + // << ", miss slot: " << j << "\n"; + // } + // for (int k = 0; k < beam_size; k++) { + // if 
(childs.find(k) == childs.end()) { + // // parent -> j to child k; + // new_bc.beamRequestsInfo[request_index].parent_id[k] = + // tree.treeLayers[depth].parent_ids[j]; + // new_bc.beamRequestsInfo[request_index].probs[k] = + // tree.treeLayers[depth].probs[j]; + // new_bc.beamRequestsInfo[request_index].tokens[k] = + // tree.treeLayers[depth].tokens[j]; + // parents.emplace(j); + // childs.emplace(k); + // break; + // } + // } + // } + // } + // } } if (verbose) { std::cout << "-----------after parent id exchange-----------" << std::endl; diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index 1e756606f8..9635b3bc1e 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -56,6 +56,22 @@ void RequestManager::load_tokens_task( sizeof(TokenId) * batch_config->num_tokens, hipMemcpyHostToDevice, stream)); + + // copy meta data to workSpace + FFHandler handle = *((FFHandler const *)task->local_args); + cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + batch_config->num_active_tokens() * + sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo), + &(batch_config->requestsInfo), + batch_config->max_requests_per_batch() * + sizeof(BatchConfig::PerRequestInfo), + cudaMemcpyHostToDevice, + stream); } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index cd3e03fff6..f4500d152d 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -30,6 +30,7 @@ void RequestManager::load_tokens_task( // BatchConfig const batch_config = *((BatchConfig *)task->args); BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; // Extreme long prompts are not supported, only load up to @@ -55,6 +56,55 @@ void RequestManager::load_tokens_task( sizeof(TokenId) * batch_config->num_tokens, cudaMemcpyHostToDevice, stream)); + + // copy meta data to workSpace + FFHandler handle = *((FFHandler const *)task->local_args); + cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + batch_config->num_active_tokens() * + sizeof(BatchConfig::PerTokenInfo), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo), + &(batch_config->requestsInfo), + batch_config->max_requests_per_batch() * + sizeof(BatchConfig::PerRequestInfo), + cudaMemcpyHostToDevice, + stream); + + + // load speculative metadata + if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + BeamSearchBatchConfig const *beam_batch_config = + static_cast(batch_config); + + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo), + &(beam_batch_config->topology_mask), + sizeof(BeamSearchBatchConfig::topology_mask), + cudaMemcpyHostToDevice, + stream); + + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::topology_mask), + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + 
sizeof(BeamSearchBatchConfig::topology_mask) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream); + } } void RequestManager::load_positions_task( From d3a57cb22b080741d9677d82701f035ccd33f8da Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 26 Dec 2023 03:09:33 -0500 Subject: [PATCH 05/61] fix speculative --- include/flexflow/batch_config.h | 4 +- inference/models/llama.cc | 1 + inference/spec_infer/spec_infer.cc | 4 +- src/ops/beam_topk.cc | 11 ++- src/ops/beam_topk.cu | 61 ++++++------ .../specinfer_inc_multihead_self_attention.cu | 91 +++++++++++------- src/runtime/inference_manager.cc | 1 + src/runtime/request_manager.cc | 93 +++++++++++++++---- src/runtime/request_manager.cu | 10 +- 9 files changed, 185 insertions(+), 91 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index c33c3558cc..dd947bbd85 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -126,11 +126,11 @@ class BeamSearchBatchConfig : public BatchConfig { size_t beam_width; size_t target_iterations; - inline static int const MAX_BEAM_WIDTH = 1; + inline static int const MAX_BEAM_WIDTH = 3; inline static int const MAX_BEAM_DEPTH = 8; // maximum tree branches for a request - inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 9; + inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3; int model_id; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index f62df1b1d7..4f76e9e0fa 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -248,6 +248,7 @@ void LLAMA::create_llama_model(FFModel &ff, // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); // output = ff.argmax(softmax, /*beam_Search*/ true); output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + // output = ff.top_k(softmax, ) } else { // Tensor softmax = ff.softmax(dense, -1); if (generation_config.do_sample) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index e2594ba87f..2ccdfd388d 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -303,7 +303,7 @@ void FlexFlow::top_level_task(Task const *task, rm->register_output_filepath(file_paths.output_file_path); //first decoding step: 3 results - rm->push_spec_infer_tree_width(1); + rm->push_spec_infer_tree_width(3); // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); @@ -404,7 +404,7 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(text); // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + tree_model.generate(prompts, 15 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 2883428254..3f636c2c98 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -366,14 +366,18 @@ BeamInferenceResult GenericTensorAccessorW value = helperGetGenericTensorAccessorWO( DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime); + DT_INT32, regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - + + printf("----------1-----------\n"); int *index_ptr = index.get_int32_ptr(); + 
printf("----------2-----------\n"); float *value_ptr = value.get_float_ptr(); + printf("----------3-----------\n"); int *parent_ptr = parent.get_int32_ptr(); + printf("----------4-----------\n"); // embedding size: eg. 4096 int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; @@ -398,6 +402,9 @@ BeamInferenceResult download_tensor( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); + print_tensor(index_ptr, 32, "indexxxxxxx"); + printf("max beam width %d\n", m->max_beam_width); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 72ab7862a6..515bba4bc0 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -379,9 +379,9 @@ template __global__ void mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { using T_ACC = T; - const int64_t i = blockIdx.x; + int64_t const i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; + int64_t const index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rstd[i]); } } @@ -556,8 +556,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int beam_size = bc->beamRequestsInfo[i].beam_size; // initial request - log_beam_topk.debug() << "sub_requests: " << i << ", " << sub_requests[i] - << "\n"; + std::cout << "sub_requests: " << i << ", " << sub_requests[i] << "\n"; assert(sub_requests[i] > 0); // process sub requests for (int j = 0; j < sub_requests[i]; j++) { @@ -565,12 +564,12 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, // beam_slots[i].parent_id[j]; acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = bc->beamRequestsInfo[i].probs[j]; - log_beam_topk.debug() - << "probbbb req: " << i - << ", sub req probability : " << bc->beamRequestsInfo[i].probs[j] - << ", sub request id " << j << ", parent id " - << bc->beamRequestsInfo[i].parent_id[j] << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j << "\n"; + std::cout << "probbbb req: " << i << ", sub req probability : " + << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << j + << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] + << ", data inddd" + << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j + << "\n"; } // process tokens @@ -584,6 +583,8 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); max_beam_width = std::max(max_beam_width, beam_size); + + std::cout << "max beam width: " << max_beam_width << "\n"; req_index += 1; block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; } @@ -613,26 +614,34 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, assert(num_shards >= (size_t)max_heap_size); num_shards = max_heap_size; - checkCUDA(cudaMemcpy(m->parent_ids, - parent_ids, - sizeof(int) * max_total_requests, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->acc_probs, - acc_probs, - sizeof(DT) * max_total_requests, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->block_start_index, - beam_block_start_index.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->request_id, + checkCUDA(cudaMemcpyAsync(m->parent_ids, + parent_ids, + sizeof(int) * max_total_requests, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->acc_probs, + acc_probs, + sizeof(DT) * max_total_requests, + cudaMemcpyHostToDevice, + stream)); + // trick, set acc_probs to 0; + 
checkCUDA( + cudaMemsetAsync(m->acc_probs, 1.0, batch_size * sizeof(DT), stream)); + checkCUDA(cudaMemcpyAsync(m->block_start_index, + beam_block_start_index.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->request_id, request_id.data(), sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->tokens_per_request, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->tokens_per_request, tokens_per_request.data(), sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); + cudaMemcpyHostToDevice, + stream)); // int depth = // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; beam_topk_forward_kernel<<>>( diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 0bdf07a9d7..9d6f70d5ba 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -133,6 +133,13 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( q_ptr + (hidden_size * QKV_WEIGHT_NUM * sub_req_idx) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); } + + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][0]); + printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][1]); + printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][2]); + printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][10]); + } __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; @@ -317,26 +324,38 @@ __global__ void specinfer_store_kv_cache( DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; - // above no need to be changed - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - // int const sub_req_id = id_map[token_idx].sub_request_index; - // int const parent_id = id_map[token_idx].parent_id; - // int const beam_depth = id_map[token_idx].beam_depth; - // int const beam_width = id_map[token_idx].beam_width; - int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const first_token_in_req = requestInfo[req_id].first_token_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - // int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; - // int const beam_depth = beamRequestInfos[req_id].current_depth; - // int const beam_width = beamRequestInfos[req_id].beam_size; int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; + int const beam_size = beamRequestInfos[req_id].sub_request_num; + + int real_idx = tok_id - first_token_in_req + allocated_tokens; + + if (i == 0) { + printf("ffasdasds%d, %d, %d, %d, %d, %d\n", + beamTokenInfos[0].sub_request_index, + allocated_tokens, + sub_req_id, + tok_id, + first_token_in_req, + real_idx); + } + // }else if(i == hidden_size * 2){ + // printf("ffasdasdskkkk%d, %d, %d\n", allocated_tokens, tok_id, + // sub_req_id); + // } + + + kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (allocated_tokens + sub_req_id) * hidden_size + offset] = kVal; + (real_idx) * hidden_size + + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (allocated_tokens + sub_req_id) * hidden_size + offset] = vVal; + 
(real_idx) * hidden_size + + offset] = vVal; } } @@ -350,6 +369,9 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; + printf("tokenInfo %d, %d\n", + bc->beamTokenInfo[0].sub_request_index, + num_tokens); specinfer_store_kv_cache<<max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - continue; - } + } + // else if (tokens_previous_requests < bc->num_generation_tokens) { + // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + // continue; + // } // all requests in prompt phase should only have one sub requests; assert(bc->sub_requests[i] == 1); @@ -523,6 +546,9 @@ void compute_attention_kernel_prompt( m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; // To get B, skip over K entries from previous requests (all heads + // padding) + + print_tensor((float*)A, 32, "A"); + std::cout << "meta: " << num_new_tokens << ", " << total_tokens << "\n"; DT const *B = static_cast
(m->keyCache) + (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * kt_req_block_size; @@ -557,6 +583,7 @@ void compute_attention_kernel_prompt( m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + print_tensor((float*)C, 32, "C"); // add alibi position bias to qk production // add alibi position bias to qk production if (*m->position_bias) { @@ -641,6 +668,8 @@ void compute_attention_kernel_prompt( B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests + + print_tensor((float*)C_softmax, 32, "C_softmax"); C = static_cast
<DT *>(m->attn_heads) + (tokens_previous_requests + bc->num_generation_tokens) * m->num_q_heads * m->vProjSize; @@ -695,6 +724,8 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, stream); // phase 2: Update key/val cache update_kv_cache_kernel<DT>
(m, bc, stream); + std::cout << "specinfer kernel token num: " << bc->num_generation_tokens + << ", " << bc->num_tokens << "\n"; if (bc->num_generation_tokens > 0) { compute_specinfer_attention_kernel_generation
<DT>( m, bc, static_cast<DT *>
(m->attn_heads), stream); @@ -705,6 +736,8 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } + // compute_attention_kernel_prompt( + // m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); @@ -783,6 +816,12 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } + + // if(bc->num_tokens == 1){ + // print_tensor(input.get_float_ptr(), 32, "specinc input"); + // print_tensor(output.get_float_ptr(), 32, "specinc output"); + // assert(false); + // } } SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( @@ -825,24 +864,6 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - // int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - // size_t beam_tokeninfo_size = - // max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - // size_t requestinfo_size = - // BeamSearchBatchConfig::max_requests_per_batch(); size_t - // beam_requestinfo_size = - // BeamSearchBatchConfig::max_requests_per_batch(); - // size_t total_size = - // beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + - // beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig:: - // BeamSearchPerRequestInfo); // more components will - // // be added here later - - // We always directly allocate memory for small speculative models - // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - // total_size); beam_topology_mask = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index fb978adfff..52fd64c606 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -257,6 +257,7 @@ void InferenceManager::init_operators_inference(FFModel *model) { ((ParallelOp *)op) ->create_input_partition_inference(*model, inputs, outputs); } + printf("init op %s\n", op->name); op->init_inference(*model, inputs, outputs); } } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e1b591c320..845a580c13 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -714,7 +714,8 @@ BeamSearchBatchConfig dfs_tree_inputs.erase(request.guid); } else { // Request not finished, pass verified_tokens to next iteration - + + std::cout << "parse to next iteration: " << "\n"; new_bc.request_completed[i] = false; new_bc.request_running[i] = true; @@ -752,6 +753,12 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].sub_request_num = 1; new_bc.sub_requests[i] = 1; + new_bc.topology_mask[i].allocated_tokens = request.tokens.size(); + + //assign new kv cache position + for(int j = 0; j < request.tokens.size(); j++){ + new_bc.topology_mask[i].real_token_pos[0][j] = j; + } // Token Info for (int j = 0; j < verified_tokens.size(); j++) { @@ -768,6 +775,8 @@ BeamSearchBatchConfig // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; + std::cout << "num_gen ++ " << "\n"; + num_generation_tokens++; // Add verified token to request's token list request.tokens.push_back(token.first); @@ -776,6 +785,8 @@ 
BeamSearchBatchConfig break; } } + + std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token @@ -817,6 +828,7 @@ BeamSearchBatchConfig } new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.topology_mask[i].allocated_tokens = 0; new_bc.sub_requests[i] = 1; @@ -875,7 +887,11 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + new_bc.beamRequestsInfo[i].sub_request_num = 1; + printf("sub request num == 1, %d \n", + new_bc.beamRequestsInfo[i].beam_size); + new_bc.sub_requests[i] = 1; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -892,6 +908,7 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; } + new_bc.topology_mask[i].allocated_tokens = 0; // if (new_bc.requestsInfo[i].num_tokens_in_batch < // new_request.initial_len) { @@ -927,6 +944,8 @@ BeamSearchBatchConfig } new_bc.num_generation_tokens = num_generation_tokens; + std::cout << "prepare next batch init gen tokens: " << new_bc.num_generation_tokens << "\n"; + if (verbose) { std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" << std::endl; @@ -969,10 +988,10 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result) { std::lock_guard const lock(request_queue_mutex); - if (verbose) { + if (true) { std::cout << "\n############### prepare_next_batch_beam ###############\n"; } - if (verbose) { + if (true) { std::cout << "print all results" << "\n"; for (int i = 0; i < 40; i++) { @@ -980,6 +999,8 @@ BeamSearchBatchConfig } std::cout << "Current Beam Depth: " << old_bc.beamRequestsInfo[0].current_depth << "\n"; + std::cout << "Current sub request num: " + << old_bc.beamRequestsInfo[0].sub_request_num << "\n"; } // Step 1: Store result to the beam tree struct store_beam_metadata(old_bc, result); @@ -1049,6 +1070,7 @@ BeamSearchBatchConfig spec_infer_tree_width.size() > ssm_decoding_steps ? spec_infer_tree_width[ssm_decoding_steps] : 1; + new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; @@ -1154,13 +1176,16 @@ BeamSearchBatchConfig spec_infer_tree_width.size() > ssm_decoding_steps ? 
spec_infer_tree_width[ssm_decoding_steps] : 1; + printf("beam size: %d, %d\n", + new_bc.beamRequestsInfo[i].beam_size, + ssm_decoding_steps); new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; new_bc.sub_requests[i] = old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].sub_request_num = - old_bc.beamRequestsInfo[i].sub_request_num * - new_bc.beamRequestsInfo[i].beam_size; + old_bc.beamRequestsInfo[i].sub_request_num; + assert(new_bc.beamRequestsInfo[i].sub_request_num <= BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); @@ -1230,6 +1255,16 @@ BeamSearchBatchConfig old_bc.print(); new_bc.print(); } + + if (true) { + std::cout << "print all resultsBBB" + << "\n"; + for (int i = 0; i < 40; i++) { + std::cout << result.token_ids[i] << ", "; + } + std::cout << "Current Beam DepthBBB: " + << old_bc.beamRequestsInfo[0].current_depth << "\n"; + } return new_bc; } @@ -1296,6 +1331,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { + std::cout << "prepare next batch running: pending\n" + << "\n"; new_bc.request_running[i] = true; std::cout << "[Verify] Request " << request.guid << " is running" << std::endl; @@ -1401,6 +1438,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else if (request.status == Request::PENDING) { + std::cout << "prepare next batch verify: pending\n" + << "\n"; new_bc.request_running[i] = false; if (verbose) { std::cout << "[Verify] Request " << request.guid @@ -1450,6 +1489,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << std::endl; if (request.llm_cache_size < request.initial_len) { + std::cout << "Initialization (prompt) phase: " + << new_bc.requestsInfo[i].num_tokens_in_batch << ", " + << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1457,7 +1499,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.tokens[request.llm_cache_size + j]; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; - + std::cout << "load prompt tokens: " << j << ": " << new_bc.tokensInfo[new_bc.num_tokens].token_id << "\n"; new_bc.num_tokens++; } @@ -1483,6 +1525,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else { // launch the request into running phase after loading all prompt if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { + std::cout << "Initialization running phase: " + << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; request.status = Request::RUNNING; new_bc.request_running[i] = true; @@ -1521,7 +1565,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, auto start_depth = old_bc.tokensInfo[0].abs_depth_in_request; int result_index = 0; - if (verbose) { + if (true) { std::cout << "Store total of " << old_bc.num_tokens << " tokens in the current batch.\n"; } @@ -1535,7 +1579,8 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, int beam_size = old_bc.beamRequestsInfo[index].beam_size; // int leaf_node_num = old_bc.sub_requests[index]; - int leaf_node_num = old_bc.beamRequestsInfo[i].sub_request_num; + int leaf_node_num = + old_bc.beamRequestsInfo[index].sub_request_num * beam_size; int depth = 
old_bc.beamRequestsInfo[index].current_depth; // Each token yields (beam_width) results @@ -1545,18 +1590,26 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // index result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - leaf_node_num; + beam_size; - if (verbose) { + // result_index += old_bc.topology_mask[index].allocated_tokens; + + if (true) { std::cout << "i = " << i << ", result index = " << result_index - << ", value: " << result.token_ids[result_index] << "\n"; + << ", value: " << result.token_ids[result_index] + << ", leaf node num: " << leaf_node_num << ", depth" << depth + << ", beam size: " << beam_size << "\n"; } Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; + if (old_bc.requestsInfo[index].num_tokens_in_batch == 0) { + continue; + } + if (depth == 1) { // store the last input into the tree; - if (verbose) { + if (true) { std::cout << "try to store the input" << "\n"; } @@ -1566,7 +1619,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1; request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1; - if (verbose) { + if (true) { std::cout << "Store the previous last token to the tree root: " << request.tokens.back() << "\n"; } @@ -1583,7 +1636,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, .treeLayers[depth] .parent_ids[beam_id] = result.parent_id[result_index]; - if (verbose) { + if (true) { std::cout << "tree value: " << depth << "token: " << request.beam_trees.at(old_bc.model_id) .treeLayers[depth] @@ -1592,7 +1645,6 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } result_index += 1; } - // update the guid and start_depth for current request if (i < old_bc.num_tokens) { guid = old_bc.requestsInfo[index].request_guid; @@ -1600,6 +1652,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } } } + + if (old_bc.num_tokens != 10) { + assert(false); + } } // for updating the beam search metadata in requests in incremental phase @@ -1638,7 +1694,6 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, new_bc.beamRequestsInfo[request_index].tokens[j] = tree.treeLayers[depth].tokens[j]; - // new_bc.topology_mask[request_index].real_token_pos[j] = } assert(false); @@ -1784,7 +1839,7 @@ std::vector> // depth) pairs for (auto const &pair : inputSerializedTree) { oss << " " << pair.second << ":" << pair.first; - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + log_req_mgr.print("(%d, %d)", pair.first, pair.second); } log_req_mgr.print("Input tree:%s", oss.str().c_str()); } @@ -1793,7 +1848,7 @@ std::vector> // outputSerializedTree is an array of (token id, depth + 1) pairs std::ostringstream oss; for (auto const &pair : outputSerializedTree) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + log_req_mgr.print("(%d, %d)", pair.first, pair.second); oss << " " << pair.second << ":" << pair.first; } log_req_mgr.print("Output tree:%s", oss.str().c_str()); @@ -1847,7 +1902,7 @@ std::vector> // log_req_mgr.print("========Verified============"); std::ostringstream oss; for (auto const &pair : verifiedTree) { - // log_req_mgr.print("(%d, %d)", pair.first, pair.second); + log_req_mgr.print("(%d, %d)", pair.first, pair.second); oss << " " << pair.second << ":" << pair.first; } log_req_mgr.print("Verified:%s", oss.str().c_str()); diff --git 
a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index f4500d152d..b76c5c326e 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -91,17 +91,17 @@ void RequestManager::load_tokens_task( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::topology_mask), - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::topology_mask) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), cudaMemcpyHostToDevice, stream); } From 617a29fdda4e79d0d9c7bbcc1455ed447c42988f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 26 Dec 2023 13:43:49 -0500 Subject: [PATCH 06/61] fix speculative --- .../specinfer_inc_multihead_self_attention.cu | 42 ++++--- src/runtime/request_manager.cc | 107 +++++++++++++----- 2 files changed, 109 insertions(+), 40 deletions(-) diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 9d6f70d5ba..63cd90f44f 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -134,11 +134,20 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( ii * THREADS_PER_KEY * K_VEC_SIZE); } - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][0]); - printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][1]); - printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][2]); - printf("cacheposssss %d, %d\n", tree_branch_num, topology.real_token_pos[0][10]); + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 0) { + printf("cacheposssssA %d, %d\n", tree_branch_num, topology.real_token_pos[0][0]); + printf("cacheposssssB %d, %d\n", tree_branch_num, topology.real_token_pos[0][1]); + printf("cacheposssssC %d, %d\n", tree_branch_num, topology.real_token_pos[0][2]); + printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][11]); + printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][12]); + printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][13]); + }else if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 1) { + printf("cacheposssssE %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][0]); + printf("cacheposssssF %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][1]); + printf("cacheposssssG %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][2]); + printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][11]); + printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][12]); + printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][13]); } __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { @@ -289,7 +298,7 @@ __global__ void 
compute_specinfer_attention_kernel_generation_kernel( // Output the final values. if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { convert_from_float( - *reinterpret_cast(output_ptr + request_idx * hidden_size + + *reinterpret_cast(output_ptr + (request_idx + sub_req_idx) * hidden_size + head_idx * per_head_size + vi), out); } @@ -332,7 +341,7 @@ __global__ void specinfer_store_kv_cache( int const beam_size = beamRequestInfos[req_id].sub_request_num; - int real_idx = tok_id - first_token_in_req + allocated_tokens; + int real_idx = tok_id - first_token_in_req + allocated_tokens + sub_req_id; if (i == 0) { printf("ffasdasds%d, %d, %d, %d, %d, %d\n", @@ -343,10 +352,15 @@ __global__ void specinfer_store_kv_cache( first_token_in_req, real_idx); } - // }else if(i == hidden_size * 2){ - // printf("ffasdasdskkkk%d, %d, %d\n", allocated_tokens, tok_id, - // sub_req_id); - // } + else if(i == hidden_size * 2){ + printf("hshddhdhdsdaww%d, %d, %d, %d, %d, %d\n", + beamTokenInfos[0].sub_request_index, + allocated_tokens, + sub_req_id, + tok_id, + first_token_in_req, + real_idx); + } @@ -547,7 +561,7 @@ void compute_attention_kernel_prompt( // To get B, skip over K entries from previous requests (all heads + // padding) - print_tensor((float*)A, 32, "A"); + // print_tensor((float*)A, 32, "A"); std::cout << "meta: " << num_new_tokens << ", " << total_tokens << "\n"; DT const *B = static_cast
(m->keyCache) + (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * kt_req_block_size; @@ -583,7 +597,7 @@ void compute_attention_kernel_prompt( m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - print_tensor((float*)C, 32, "C"); + // print_tensor((float*)C, 32, "C"); // add alibi position bias to qk production // add alibi position bias to qk production if (*m->position_bias) { @@ -669,7 +683,7 @@ void compute_attention_kernel_prompt( // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests - print_tensor((float*)C_softmax, 32, "C_softmax"); + // print_tensor((float*)C_softmax, 32, "C_softmax"); C = static_cast
(m->attn_heads) + (tokens_previous_requests + bc->num_generation_tokens) * m->num_q_heads * m->vProjSize; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 845a580c13..775280e2cf 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -714,8 +714,9 @@ BeamSearchBatchConfig dfs_tree_inputs.erase(request.guid); } else { // Request not finished, pass verified_tokens to next iteration - - std::cout << "parse to next iteration: " << "\n"; + + std::cout << "parse to next iteration: " + << "\n"; new_bc.request_completed[i] = false; new_bc.request_running[i] = true; @@ -755,8 +756,8 @@ BeamSearchBatchConfig new_bc.sub_requests[i] = 1; new_bc.topology_mask[i].allocated_tokens = request.tokens.size(); - //assign new kv cache position - for(int j = 0; j < request.tokens.size(); j++){ + // assign new kv cache position + for (int j = 0; j < request.tokens.size(); j++) { new_bc.topology_mask[i].real_token_pos[0][j] = j; } @@ -775,7 +776,8 @@ BeamSearchBatchConfig // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; - std::cout << "num_gen ++ " << "\n"; + std::cout << "num_gen ++ " + << "\n"; num_generation_tokens++; // Add verified token to request's token list @@ -785,7 +787,6 @@ BeamSearchBatchConfig break; } } - std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically @@ -944,7 +945,8 @@ BeamSearchBatchConfig } new_bc.num_generation_tokens = num_generation_tokens; - std::cout << "prepare next batch init gen tokens: " << new_bc.num_generation_tokens << "\n"; + std::cout << "prepare next batch init gen tokens: " + << new_bc.num_generation_tokens << "\n"; if (verbose) { std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" @@ -1078,7 +1080,14 @@ BeamSearchBatchConfig old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; new_bc.beamRequestsInfo[i].sub_request_num = old_bc.beamRequestsInfo[i].sub_request_num * - new_bc.beamRequestsInfo[i].beam_size; + old_bc.beamRequestsInfo[i].beam_size; + + std::cout << "oldbc : " << old_bc.beamRequestsInfo[i].sub_request_num + << ", " << old_bc.beamRequestsInfo[i].beam_size << "\n"; + + // if (old_bc.beamRequestsInfo[i].current_depth == 3) { + // assert(false); + // } assert(new_bc.beamRequestsInfo[i].sub_request_num <= BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); @@ -1090,6 +1099,10 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. 
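A minimal host-side sketch of the growth rule this hunk fixes (new sub_request_num = old sub_request_num * old beam_size) and the cap that the assert above enforces; the slot-exchange call continues right below. The constant and beam width here are illustrative stand-ins, not the real FlexFlow values:

#include <cassert>
#include <cstdio>

int main() {
  int const kMaxSpeculativeTreeBranches = 8; // stand-in for MAX_SPECULATIVE_TREE_BRANCHES
  int const beam_size = 2;
  int sub_request_num = 1; // depth 0: the single verified root token
  for (int depth = 1; depth <= 3; depth++) {
    sub_request_num *= beam_size; // every current leaf expands into beam_size children
    assert(sub_request_num <= kMaxSpeculativeTreeBranches);
    std::printf("depth %d: %d sub-requests\n", depth, sub_request_num);
  }
  return 0;
}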
update_beam_metadata( new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); + + new_bc.topology_mask[i].allocated_tokens = + old_bc.topology_mask[i].allocated_tokens + + old_bc.beamRequestsInfo[i].sub_request_num; } else { assert(false && "Request should not be pending in beam search phase"); } @@ -1101,6 +1114,7 @@ BeamSearchBatchConfig request.tokens.size()) { // Incremental phase if (request.status == Request::RUNNING) { + // todo check it new_bc.requestsInfo[i].num_tokens_in_batch = 1; } else { assert(false && "Request should be done"); @@ -1122,7 +1136,31 @@ BeamSearchBatchConfig << std::endl; } + // for (int j = 0; j < request.tokens.size(); j++) { + // new_bc.topology_mask[i].real_token_pos[0][j] = j; + // } + // register more tokens due to the beam width + std::cout << "register more tokens: " + << new_bc.beamRequestsInfo[i].sub_request_num << ", " + << new_bc.requestsInfo[i].num_tokens_in_batch << ", " + << new_bc.topology_mask[i].allocated_tokens << "\n"; + + // copy meta data and replicate + int replicate_num = new_bc.beamRequestsInfo[i].sub_request_num / + old_bc.beamRequestsInfo[i].sub_request_num; + + for (int j = 0; j < old_bc.beamRequestsInfo[i].sub_request_num; j++) { + int old_idx = j; + for (int k = 0; k < replicate_num; k++) { + int new_idx = j * replicate_num + k; + std::cout << "copy from " << old_idx << "to: " << new_idx << "\n"; + memcpy(new_bc.topology_mask[i].real_token_pos[new_idx], + old_bc.topology_mask[i].real_token_pos[old_idx], + sizeof(int) * BatchConfig::MAX_NUM_TOKENS); + } + } + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { @@ -1135,6 +1173,15 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; + + // width first + new_bc.topology_mask[i].real_token_pos[k][depth] = + new_bc.topology_mask[i].allocated_tokens + num_generation_tokens; + + std::cout << "topology: sub request: " << k << ", " + << ", " << depth << ", " + << new_bc.topology_mask[i].real_token_pos[k][depth] << "\n"; + num_generation_tokens++; } } } @@ -1331,6 +1378,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { + std::cout << "prepare next batch running: pending\n" << "\n"; new_bc.request_running[i] = true; @@ -1415,11 +1463,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_depth_in_request = request.tokens.size() - 1; - + + std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() << "\n"; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); - if (verbose) { + if (true) { std::cout << "[" << j << "] Token: " << token.first << ", Depth:" << token.second << std::endl; } @@ -1436,6 +1485,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( break; } } + assert(false); } else if (request.status == Request::PENDING) { std::cout << "prepare next batch verify: pending\n" @@ -1499,7 +1549,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.tokens[request.llm_cache_size + j]; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; - std::cout << "load prompt tokens: " << j << ": " << new_bc.tokensInfo[new_bc.num_tokens].token_id << "\n"; 
+ std::cout << "load prompt tokens: " << j << ": " + << new_bc.tokensInfo[new_bc.num_tokens].token_id << "\n"; new_bc.num_tokens++; } @@ -1625,7 +1676,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } } + std::cout << "leaffffff: " << leaf_node_num << "\n"; + for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { + request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .tokens[beam_id] = result.token_ids[result_index]; @@ -1635,14 +1689,19 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .parent_ids[beam_id] = result.parent_id[result_index]; - - if (true) { - std::cout << "tree value: " << depth << "token: " - << request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .tokens[beam_id] - << "result tokens: " << result.token_ids[result_index]; - } + std::cout << "??????? beam id: " << beam_id << ", token: " + << request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .tokens[beam_id] + << "\n"; + + // if (true) { + // std::cout << "tree value: " << depth << "token: " + // << request.beam_trees.at(old_bc.model_id) + // .treeLayers[depth] + // .tokens[beam_id] + // << "result tokens: " << result.token_ids[result_index]; + // } result_index += 1; } // update the guid and start_depth for current request @@ -1652,10 +1711,6 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } } } - - if (old_bc.num_tokens != 10) { - assert(false); - } } // for updating the beam search metadata in requests in incremental phase @@ -1672,7 +1727,7 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; // int leaf_node_num = old_bc.sub_requests[request_index]; - int leaf_node_num = old_bc.beamRequestsInfo[request_index].sub_request_num; + int leaf_node_num = new_bc.beamRequestsInfo[request_index].sub_request_num; if (new_bc.beamRequestsInfo[request_index].current_depth == 1) { // TODO: check if this is correct @@ -1693,9 +1748,9 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, tree.treeLayers[depth].probs[j]; new_bc.beamRequestsInfo[request_index].tokens[j] = tree.treeLayers[depth].tokens[j]; - + std::cout << "token: " << j << ": " + << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; } - assert(false); // std::set parents; // std::set childs; From b5f9d5d2d5eea50951a466d339bdc47910e69e07 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 28 Dec 2023 01:57:39 -0500 Subject: [PATCH 07/61] bitmap+tree verify --- include/flexflow/batch_config.h | 20 +- include/flexflow/config.h | 3 +- .../inc_multihead_self_attention_utils.cuh | 2 +- .../specinfer_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + include/flexflow/request_manager.h | 10 + src/ops/argmax.cc | 2 + src/ops/inc_multihead_self_attention.cu | 8 +- src/ops/kernels/embedding_kernels.cu | 1 + .../specinfer_inc_multihead_self_attention.cu | 202 ++++++++---- src/ops/tree_inc_multihead_self_attention.cu | 197 ++++++++---- src/runtime/request_manager.cc | 291 ++++++++++++++---- src/runtime/request_manager.cu | 12 + 13 files changed, 562 insertions(+), 188 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index dd947bbd85..db5d4a8e48 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -56,6 +56,7 @@ class BatchConfig { // across workers static int const 
MAX_NUM_REQUESTS = 64; static int const MAX_NUM_TOKENS = 1024; + static int const MAX_SPEC_TREE_TOKEN_NUM = 64; // Set by update int num_tokens; @@ -75,6 +76,24 @@ class BatchConfig { int request_index; TokenId token_id; }; + + struct BitMask { + unsigned long long mask[MAX_SPEC_TREE_TOKEN_NUM] = {0}; + + // how many tokens before the tree, every sub requests need this part of + // cache + int non_tree_cache_size; + + // current tree size + int tree_size; + + int this_layer_size; + + // input length-> prompt/root + int prompt_size; + }; + + BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; @@ -154,7 +173,6 @@ class BeamSearchBatchConfig : public BatchConfig { int allocated_tokens; }; - BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; SpecInferTopology topology_mask[MAX_NUM_REQUESTS]; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 321d14961b..fe261dfb48 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -83,7 +83,8 @@ struct FFHandler { sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::topology_mask) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo); + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + + sizeof(BatchConfig::causalMask); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index c128c1a126..0c065b6b0e 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -456,7 +456,7 @@ inline size_t smem_size_in_bytes(int hidden_size_per_head, int threads_per_block) { // The amount of shared memory needed to store the Q*K^T values in float. - size_t qk_sz = div_up(max_sequence_length + 1, 4) * 16; + size_t qk_sz = div_up(1000 + 1, 4) * 16; size_t logits_sz = qk_sz; // The total size needed during softmax. 
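A minimal host-side sketch of the encoding the new BatchConfig::BitMask assumes: row k describes one token in the speculation tree, and bit q of that row is set when query token q may attend to it, so MAX_SPEC_TREE_TOKEN_NUM = 64 keeps a whole row inside one unsigned long long. The prompt fill below mirrors what initBitMask does later in this patch; the sizes and token indices are illustrative:

#include <cstdio>

int main() {
  unsigned long long mask[64] = {0};
  int const prompt_size = 4;
  // prompt tokens are strictly causal: query j sees keys 0..j
  for (int i = 0; i < prompt_size; i++) {     // key/cache row
    for (int j = i; j < prompt_size; j++) {   // queries at or after it
      mask[i] |= (1ULL << j);                 // 1ULL keeps shifts beyond bit 31 well defined
    }
  }
  // the same visibility test the attention kernels perform
  int const query = 2, key = 3;
  bool const visible = mask[key] & (1ULL << query);
  std::printf("query %d %s key %d\n", query, visible ? "sees" : "cannot see", key);
  return 0;
}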
diff --git a/include/flexflow/ops/specinfer_inc_multihead_self_attention.h b/include/flexflow/ops/specinfer_inc_multihead_self_attention.h index 6e5dc73b5c..eb1b2882c3 100644 --- a/include/flexflow/ops/specinfer_inc_multihead_self_attention.h +++ b/include/flexflow/ops/specinfer_inc_multihead_self_attention.h @@ -143,6 +143,7 @@ class SpecInferIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionM BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask; + BatchConfig::BitMask *causalMask; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 6e2da19ce9..d160da4a72 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -147,6 +147,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int num_active_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; + BatchConfig::BitMask *causalMask; }; }; // namespace FlexFlow diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index e67888d2d6..dc1939c74b 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -110,6 +110,16 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void initBitMask(BatchConfig::BitMask &bitmask, int initLength); + void appendBitMask(BatchConfig::BitMask &bitmask, + int newNodes, + int preBeamSize, + int old_sub_num, + BeamTree const tree, + int currentDepth); + void updateBitMask(BatchConfig::BitMask &bitmask, + int initLength, + int non_tree_size); FFModel *get_model(int model_id); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index f336c843e8..0344c707fc 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -398,6 +398,8 @@ InferenceResult ArgMax::save_inference_tensors_to_file( m, shard_id, bc, {}, {}, {input, indices}); } + + print_tensor(indices.get_int32_ptr(), 32, "tree attn output"); download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 4c184acb3c..a05dbbf919 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1364,8 +1364,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { - case INC_DECODING_MODE: - case TREE_VERIFY_MODE: { + case INC_DECODING_MODE: { key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); @@ -1374,7 +1373,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case BEAM_SEARCH_MODE: + case TREE_VERIFY_MODE: { // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * @@ -1402,7 +1402,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( 2 * qk_prod_size + attn_heads_size) * size_of_dt + complex_size * sizeof(cuFloatComplex); // more components will - // be added here later + // be added here later 
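Earlier in this patch, config.h grows batch_config_metadata_size by sizeof(BatchConfig::causalMask), and the request_manager.cu copies rely on every section sitting at the accumulated size of the sections before it. A small sketch of that running-offset packing; the struct contents are simplified stand-ins, not the real FlexFlow types:

#include <cstddef>
#include <cstdio>

struct TokensInfo   { int data[4]; };
struct RequestsInfo { int data[8]; };
struct CausalMask   { unsigned long long rows[64]; };

int main() {
  size_t const off_tokens   = 0;
  size_t const off_requests = off_tokens + sizeof(TokensInfo);
  size_t const off_mask     = off_requests + sizeof(RequestsInfo);
  size_t const total        = off_mask + sizeof(CausalMask);
  // host-side copies and device-side consumers must agree on these offsets,
  // which is why the copy order fixed in request_manager.cu matters
  std::printf("tokensInfo @ %zu, requestsInfo @ %zu, causalMask @ %zu, total %zu bytes\n",
              off_tokens, off_requests, off_mask, total);
  return 0;
}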
if (offload) { // assert that we have enough reserved work space left size_t totalSharedSize = diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 22d8161ff1..91f5d60e85 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,6 +118,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } + print_tensor(input.get_int32_ptr(), 32, "embeddinginput"); } /*static*/ diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 63cd90f44f..e8ac1d980c 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -51,6 +51,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, BeamSearchBatchConfig::SpecInferTopology *topology_mask, + BatchConfig::BitMask *causalMask, int max_tree_branches) { // q, k @@ -75,11 +76,18 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( BeamSearchBatchConfig::SpecInferTopology topology = topology_mask[request_idx]; + BatchConfig::BitMask bitmask = causalMask[request_idx]; int const first_step = 0; int const tlength = request_infos[request_idx].first_token_depth_in_request + request_infos[request_idx].num_tokens_in_batch; + + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); + } + + int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; // int const qlength = request_infos[request_idx].num_tokens_in_batch; int const tree_branch_num = beam_request_infos[request_idx].sub_request_num; @@ -88,7 +96,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[request_idx].num_tokens_in_batch; + // first_token_idx += request_infos[request_idx].num_tokens_in_batch; + first_token_idx += bitmask.this_layer_size; } // shared memory objects @@ -124,7 +133,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( request_idx * max_seq_length * hidden_size * max_tree_branches + ki; int ti_end = - div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; for (int sub_req_idx = 0; sub_req_idx < tree_branch_num; sub_req_idx += 1) { #pragma unroll @@ -134,21 +143,25 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( ii * THREADS_PER_KEY * K_VEC_SIZE); } - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 0) { - printf("cacheposssssA %d, %d\n", tree_branch_num, topology.real_token_pos[0][0]); - printf("cacheposssssB %d, %d\n", tree_branch_num, topology.real_token_pos[0][1]); - printf("cacheposssssC %d, %d\n", tree_branch_num, topology.real_token_pos[0][2]); - printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][11]); - printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][12]); - printf("cacheposssssD %d, %d\n", tree_branch_num, topology.real_token_pos[0][13]); - }else if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 1) { - printf("cacheposssssE %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][0]); - 
printf("cacheposssssF %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][1]); - printf("cacheposssssG %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][2]); - printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][11]); - printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][12]); - printf("cacheposssssH %d, %d\n", tree_branch_num, topology.real_token_pos[sub_req_idx][13]); - } + int const query_token = bitmask.tree_size - tree_branch_num + sub_req_idx; + + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 0) { + // printf("fuckmasksss %d, %d, %d, %d, %d\n", + // bitmask.prompt_size, + // bitmask.non_tree_cache_size, + // tree_branch_num, + // bitmask.tree_size, + // tlength); + // printf("cacheposssssB %d, %d\n", tree_branch_num, + // topology.real_token_pos[0][1]); + // printf("cacheposssssC %d, %d\n", tree_branch_num, + // topology.real_token_pos[0][2]); + // printf("cacheposssssD %d, %d\n", tree_branch_num, + // topology.real_token_pos[0][11]); printf("cacheposssssD %d, %d\n", + // tree_branch_num, topology.real_token_pos[0][12]); + // printf("cacheposssssD %d, %d\n", tree_branch_num, + // topology.real_token_pos[0][13]); + } __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; @@ -156,22 +169,33 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < tlength) { + if (ti < totalCacheSize) { // find the real position of the cache; // depth: 0, 1, 2, 3, 4, 4, 5, 5 ,5, 5, - int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + // int const real_cache_idx = + // topology.real_token_pos[sub_req_idx][ti]; k[ii] = *reinterpret_cast( - k_cache_batch + real_cache_idx * hidden_size + - head_idx * per_head_size + jj); + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); } } float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { // todo add alobi here - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); + // bool const mask = ti_circ >= totalCacheSize; + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + + if (blockIdx.y == 0 && blockIdx.x == 0 && mask && sub_req_idx == 0) { + // printf("specinfer mask: ti:%d, %d, %d, %d, %lld\n", + // ti, + // totalCacheSize, + // ti - bitmask.non_tree_cache_size, + // query_token, + // bitmask.mask[ti - bitmask.non_tree_cache_size]); + // assert(false); } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; @@ -208,10 +232,14 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti - first_step] - qk_max); + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); exp_sum += logit; - qk_smem[ti - first_step] = logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; } // Compute the sum. 
@@ -219,7 +247,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { qk_smem[ti - first_step] *= inv_sum; } @@ -254,14 +283,17 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // vi; if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { // Load the values from the cache. int const ti_circ = ti % max_seq_length; - int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; V_vec v = *reinterpret_cast( - v_cache_batch + real_cache_idx * hidden_size + - head_idx * per_head_size); - float logit = qk_smem[ti - first_step]; + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; out = FlexFlow::fma(logit, cast_to_float(v), out); } } @@ -298,7 +330,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // Output the final values. if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { convert_from_float( - *reinterpret_cast(output_ptr + (request_idx + sub_req_idx) * hidden_size + + *reinterpret_cast(output_ptr + + (request_idx + sub_req_idx) * hidden_size + head_idx * per_head_size + vi), out); } @@ -315,6 +348,7 @@ __global__ void specinfer_store_kv_cache( BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask, + BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, int vProjSize, @@ -335,41 +369,57 @@ __global__ void specinfer_store_kv_cache( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const first_token_in_req = requestInfo[req_id].first_token_depth_in_request; + int const first_token_in_req = + requestInfo[req_id].first_token_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; + int const total_token = requestInfo[req_id].num_tokens_in_batch; + + BatchConfig::BitMask bitmask = causalMask[req_id]; + + int const sub_request_num = beamRequestInfos[req_id].sub_request_num; - int const beam_size = beamRequestInfos[req_id].sub_request_num; + int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; + + // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - + // tree_branch_num + sub_req_id + tok_id; + // bitmask.tree_size - tree_branch_num + sub_req_id; + + // if prompt token -> token id + // if tree token: + int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - + bitmask.this_layer_size + token_idx; int real_idx = tok_id - first_token_in_req + allocated_tokens + sub_req_id; - if (i == 0) { - printf("ffasdasds%d, %d, %d, %d, %d, %d\n", - beamTokenInfos[0].sub_request_index, - allocated_tokens, - sub_req_id, - tok_id, - first_token_in_req, - real_idx); - } - else if(i == hidden_size * 2){ - printf("hshddhdhdsdaww%d, %d, %d, %d, %d, %d\n", - 
beamTokenInfos[0].sub_request_index, - allocated_tokens, - sub_req_id, - tok_id, - first_token_in_req, - real_idx); - } - - + // if (i == 0) { + // printf("ffasdasds%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", + // beamTokenInfos[0].sub_request_index, + // allocated_tokens, + // sub_req_id, + // tok_id, + // first_token_in_req, + // real_idx, + // cache_idx, + // bitmask.non_tree_cache_size, + // bitmask.tree_size, + // sub_request_num, + // token_idx ); + // } else if (i == hidden_size * 2) { + // printf("hshddhdhdsdaww%d, %d, %d, %d, %d, %d, %d\n", + // beamTokenInfos[0].sub_request_index, + // allocated_tokens, + // sub_req_id, + // tok_id, + // first_token_in_req, + // real_idx, + // cache_idx); + // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (real_idx) * hidden_size + - offset] = kVal; + (cache_idx)*hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (real_idx) * hidden_size + - offset] = vVal; + (cache_idx)*hidden_size + offset] = vVal; } } @@ -398,6 +448,7 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->beam_token_infos, m->beam_request_infos, m->beam_topology_mask, + m->causalMask, m->qProjSize, m->kProjSize, m->vProjSize, @@ -433,6 +484,7 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->request_infos, \ m->beam_request_infos, \ m->beam_topology_mask, \ + m->causalMask, \ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES) template @@ -520,7 +572,7 @@ void compute_attention_kernel_prompt( for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; - } + } // else if (tokens_previous_requests < bc->num_generation_tokens) { // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; // continue; @@ -728,6 +780,16 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, cudaStream_t stream) { // phase 1: Implement kernel to compute KQV for input tokens + + cudaMemcpyAsync(m->causalMask, + &(bc->causalMask), + bc->num_active_requests() * sizeof(BatchConfig::BitMask), + cudaMemcpyHostToDevice, + stream); + std::cout << "kernel bit mask: " << bc->causalMask[0].prompt_size << ", " + << bc->causalMask[0].non_tree_cache_size << ", " + << bc->causalMask[0].mask[0] << ", " << sizeof(BatchConfig::BitMask) + << "\n"; compute_qkv_kernel(m, bc, shard_id, @@ -830,6 +892,7 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } + // print_tensor(output.get_float_ptr(), 32, "specinc output"); // if(bc->num_tokens == 1){ // print_tensor(input.get_float_ptr(), 32, "specinc input"); @@ -878,6 +941,11 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { + size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; + size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); + gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + total_size); + beam_topology_mask = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + @@ -895,6 +963,16 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::topology_mask) + sizeof(BeamSearchBatchConfig::beamTokenInfo)); + // causalMask = + // static_cast( + // 
handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + // sizeof(BatchConfig::requestsInfo) + + // sizeof(BeamSearchBatchConfig::topology_mask) + + // sizeof(BeamSearchBatchConfig::beamTokenInfo)) + + // sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + causalMask = gpu_mem_allocator.allocate_instance( + causal_mask_size); // beam_token_infos = // gpu_mem_allocator // .allocate_instance( diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 1da56e383a..a3e3adcc30 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -53,6 +53,8 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::PerRequestInfo *request_infos, int num_heads, int num_requests, + int max_tree_branches, + BatchConfig::BitMask *causalMask, int qk_smem_sz) { // q, k @@ -81,6 +83,17 @@ __global__ void compute_attention_kernel_fused_kernel( request_infos[request_idx].num_tokens_in_batch; int const qlength = request_infos[request_idx].num_tokens_in_batch; + BatchConfig::BitMask bitmask = causalMask[request_idx]; + + // bitmask.mask[1] = 3; + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("tree attn fused kernel %d, %d, %d, %lld\n", + tlength, + qlength, + bitmask.non_tree_cache_size, + bitmask.mask[1]); + } + int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { first_token_idx += request_infos[request_idx].num_tokens_in_batch; @@ -115,7 +128,8 @@ __global__ void compute_attention_kernel_fused_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + request_idx * max_seq_length * hidden_size + ki; + key_cache + + request_idx * max_tree_branches * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -127,10 +141,12 @@ __global__ void compute_attention_kernel_fused_kernel( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); } + __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; int const ti_circ = ti % max_seq_length; + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; if (ti < tlength) { @@ -142,22 +158,35 @@ __global__ void compute_attention_kernel_fused_kernel( float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + + if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { + printf("tree attn mask for first token %d, %lld, %d, %d\n", + ti, + bitmask.mask[ti - bitmask.non_tree_cache_size], + bitmask.non_tree_cache_size, + qi); } - int pos = ti * qlength + qi; - if (((pos / qlength) % tlength) > (pos % qlength + tlength - qlength)) { - qk = -FLT_MAX; - } qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[pos] = mask ? 0.f : qk; + if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { + printf("tree attn mask for second token %d, %lld, %d, %d, %.10f\n", + ti, + bitmask.mask[ti - bitmask.non_tree_cache_size], + bitmask.non_tree_cache_size, + qi, + qk); + } + qk_smem[ti - first_step] = mask ? 
0.0f : qk; } } + __syncthreads(); +#pragma unroll for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); } @@ -176,66 +205,97 @@ __global__ void compute_attention_kernel_fused_kernel( // The warps finalize the reduction. qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; - +#pragma unroll for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); } // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && tidx == 0) { + printf("tree attn first token qk_max %f\n", + qk_max); + } - float exp_sum = 0.f; + float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti * qlength + qi] - qk_max); + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); exp_sum += logit; - qk_smem[ti * qlength + qi] = logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; } // Compute the sum. exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + printf("expsum %.10f\n", exp_sum); + } + // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti * qlength + qi] *= inv_sum; + qk_smem[ti - first_step] *= inv_sum; } __syncthreads(); - } + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + printf("softmax %.10f\n", qk_smem[0]); + } - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; - Out_sum out; - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + request_idx * max_seq_length * hidden_size + vi; + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - for (int qi = 0; qi < qlength; qi++) { + Out_sum out; zero(out); - __syncthreads(); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + + request_idx * max_seq_length * hidden_size * max_tree_branches + vi; + // DT const *v_cache_batch = + // value_cache + + // (beam_request_idx * max_beam_width + beam_sub_request_idx) * + // max_seq_length * hidden_size + + // vi; + if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { // Load the values from the cache. 
int const ti_circ = ti % max_seq_length; - + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; V_vec v = *reinterpret_cast( v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - float logit = qk_smem[ti * qlength + qi]; + + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; out = FlexFlow::fma(logit, cast_to_float(v), out); + } } - // Make sure we can start writing to shared memory. + // // Make sure we can start writing to shared memory. __syncthreads(); + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + printf("valueX %.10f\n", out.x); + } + // Run the final reduction amongst the different groups computing different // partial outputs. if (Dh == Dh_MAX || vi < Dh) { @@ -268,6 +328,11 @@ __global__ void compute_attention_kernel_fused_kernel( output_ptr + (first_token_idx + qi) * hidden_size + head_idx * per_head_size + vi), out); + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + out.x, out.y, out.z, out.w, vi, (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi); + } } } } @@ -380,7 +445,9 @@ __global__ void update_tree_branch_kv_cache_fused( int vProjSize, int num_new_tokens, int max_seq_len, - int hidden_size) { + int hidden_size, + int max_tree_branches, + int first_token_depth) { CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) { int token_idx = i / hidden_size; @@ -393,10 +460,10 @@ __global__ void update_tree_branch_kv_cache_fused( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + (token_idx + first_token_depth) * hidden_size + offset] = kVal; + vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + (token_idx + first_token_depth) * hidden_size + offset] = vVal; } } @@ -473,7 +540,6 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, } std::cout << "num_new_tokens: " << num_new_tokens << "\n"; - assert(false); int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); @@ -728,22 +794,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, THDS_PER_KEY, \ THDS_PER_VALUE> \ <<>>( \ - static_cast
<DT *>(m->devQKVProjArray), \ - static_cast
<DT *>(m->keyCache), \ - static_cast
<DT *>(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length(), \ - BatchConfig::max_tokens_per_batch(), \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos, \ - m->num_q_heads, \ - bc->num_active_requests(), \ + static_cast
<DT *>(m->devQKVProjArray), static_cast
<DT *>(m->keyCache), static_cast<DT *>
(m->valueCache), output_ptr, scale, BatchConfig::max_sequence_length(), BatchConfig::max_tokens_per_batch(), m->qProjSize, m->hidden_size, m->request_infos, m->num_q_heads, bc->num_active_requests(), BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, m->causalMask, \ smem_sz[0]) template -void compute_attention_kernel_fused(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, DT *output_ptr, cudaStream_t stream) { @@ -752,6 +807,12 @@ void compute_attention_kernel_fused(IncMultiHeadSelfAttentionMeta const *m, // update K-V cache int num_new_tokens = bc->num_active_tokens(); int parallelism = m->hidden_size * num_new_tokens; + printf("update KV cache %d, idx: %d\n", + num_new_tokens, + bc->requestsInfo[0].first_token_depth_in_request); + for (int i = 0; i < num_new_tokens; i++) { + printf("abs depth:%d\n", bc->tokensInfo[i].abs_depth_in_request); + } update_tree_branch_kv_cache_fused<<vProjSize, num_new_tokens, BatchConfig::max_sequence_length(), - m->hidden_size); + m->hidden_size, + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, + bc->requestsInfo[0].first_token_depth_in_request); dim3 grid(m->num_q_heads, bc->num_active_requests()); int const per_head_size = m->qProjSize; @@ -816,12 +879,19 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache + std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << "\n"; + cudaMemcpyAsync(m->committed_token_infos, &(bc->committed_tokens), bc->num_tokens_to_commit * sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(m->causalMask, + &(bc->causalMask), + bc->num_active_requests() * sizeof(BatchConfig::BitMask), + cudaMemcpyHostToDevice, + stream); commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -948,6 +1018,20 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventDestroy(t_start); cudaEventDestroy(t_end); } + + // print_tensor(output.get_float_ptr(), 32, "tree attn kernel"); + + // save_tensor( + // input.get_float_ptr(), + // 768 * bc->num_active_tokens(), + // "/home/xinhaoc/FlexFlow/inference/output/Newtreeinput.txt"); + // save_tensor( + // output.get_float_ptr(), + // 768 * bc->num_active_tokens(), + // "/home/xinhaoc/FlexFlow/inference/output/Newtreeoutput.txt"); + // std::cout << "new tokens: " << bc->num_active_tokens() << "\n"; + + // assert(bc->num_tokens_to_commit == 0); } TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( @@ -993,8 +1077,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( { int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); size_t committed_tokeninfo_size = max_tokens_per_batch; + size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; + size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); + sizeof(TreeVerifyBatchConfig::CommittedTokensInfo) + + causal_mask_size * sizeof(BatchConfig::BitMask); if (offload) { // assert that we have enough reserved work space left assert(gpu_mem_allocator.reserved_total_size - @@ -1004,6 +1091,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( gpu_mem_allocator .allocate_reserved( committed_tokeninfo_size); + causalMask = gpu_mem_allocator.allocate_instance( + causal_mask_size); } else { gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, total_size); @@ -1011,6 +1100,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( gpu_mem_allocator .allocate_instance( committed_tokeninfo_size); + causalMask = gpu_mem_allocator.allocate_instance( + causal_mask_size); } } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 775280e2cf..8a7cea1cc3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -16,6 +16,7 @@ #include "flexflow/request_manager.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" +#include #include #include #include @@ -735,6 +736,11 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length - new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); + // std::cout << "max depth: " << new_max_depth << ", " + // << new_bc.requestsInfo[i].first_token_depth_in_request << + // ", " + // << verified_tokens.size() << "\n"; + // assert(false); new_bc.beamRequestsInfo[i].current_depth = 1; profiling_requests[request.guid].ssm_decoding_steps = 0; @@ -761,6 +767,10 @@ BeamSearchBatchConfig new_bc.topology_mask[i].real_token_pos[0][j] = j; } + updateBitMask(new_bc.causalMask[i], + verified_tokens.size(), + request.tokens.size()); + // Token Info for (int j = 0; j < verified_tokens.size(); j++) { auto token = verified_tokens.at(j); @@ -910,6 +920,11 @@ BeamSearchBatchConfig new_bc.num_tokens++; } new_bc.topology_mask[i].allocated_tokens = 0; + new_bc.causalMask[i].non_tree_cache_size = 0; + new_bc.causalMask[i].tree_size = + new_bc.requestsInfo[i].num_tokens_in_batch; + initBitMask(new_bc.causalMask[i], + new_bc.requestsInfo[i].num_tokens_in_batch); // if (new_bc.requestsInfo[i].num_tokens_in_batch < // new_request.initial_len) { @@ -1161,6 +1176,27 @@ BeamSearchBatchConfig } } + memcpy(&new_bc.causalMask[i], + 
&old_bc.causalMask[i], + sizeof(BatchConfig::BitMask)); + // sub_request_num -> nodes of input next iteration + // beam_size replicate num + + std::cout << "print beam tree: " + << old_bc.beamRequestsInfo[i].current_depth << "\n"; + BeamTree tree = request.beam_trees[old_bc.model_id]; + for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { + std::cout << "layer: " << k << "\n"; + std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer + << "\n"; + } + appendBitMask(new_bc.causalMask[i], + new_bc.beamRequestsInfo[i].sub_request_num, + old_bc.beamRequestsInfo[i].beam_size, + old_bc.beamRequestsInfo[i].sub_request_num, + tree, + old_bc.beamRequestsInfo[i].current_depth); + // assert(false); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { @@ -1248,6 +1284,10 @@ BeamSearchBatchConfig assert(false && "Request should be pending"); } + memcpy(&new_bc.causalMask[i], + &old_bc.causalMask[i], + sizeof(BatchConfig::BitMask)); + if (new_bc.requestsInfo[i].first_token_depth_in_request >= request.tokens.size()) { // request is done @@ -1260,6 +1300,13 @@ BeamSearchBatchConfig (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; + BeamTree tree = request.beam_trees[old_bc.model_id]; + appendBitMask(new_bc.causalMask[i], + new_bc.beamRequestsInfo[i].sub_request_num, + old_bc.beamRequestsInfo[i].beam_size, + old_bc.beamRequestsInfo[i].sub_request_num, + tree, + old_bc.beamRequestsInfo[i].current_depth); } if (verbose) { @@ -1378,7 +1425,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { - + std::cout << "prepare next batch running: pending\n" << "\n"; new_bc.request_running[i] = true; @@ -1398,7 +1445,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector> dfs_tree_inputs = merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - if (verbose) { + if (true) { std::cout << "Request Tokens Size: " << request.tokens.size() << std::endl; for (int k = 0; k < request.tokens.size(); k++) { @@ -1414,6 +1461,13 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + + // copy bitmask to verify batchconfig + memcpy(&(new_bc.causalMask[i]), + &(old_batches.at(0).causalMask[i]), + sizeof(BatchConfig::BitMask)); + // std::cout << "bitmask: " << new_bc.causalMask[i].mask[0] << "\n"; + // assert(false); // TODO: Check this new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; @@ -1429,7 +1483,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( i; new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; - if (verbose) { + if (true) { std::cout << new_bc.num_tokens_to_commit << "- committed_token.token_depth: " << committed_token.first @@ -1441,7 +1495,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } } - if (verbose) { + if (true) { std::cout << "new_bc.num_tokens_to_commit: " << new_bc.num_tokens_to_commit << std::endl; } @@ -1463,8 +1517,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( 
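Because BitMask is a fixed-size, plain-old-data member of BatchConfig, carrying the mask from the beam-search batch into the verify batch is a per-request memcpy, as the hunks above do. A simplified stand-in (struct layout and sizes are illustrative) showing that hand-off:

#include <cstdio>
#include <cstring>

struct BitMask {
  unsigned long long mask[64];
  int non_tree_cache_size;
  int tree_size;
  int this_layer_size;
  int prompt_size;
};

struct BeamBatch   { BitMask causalMask[8]; };
struct VerifyBatch { BitMask causalMask[8]; };

int main() {
  BeamBatch beam = {};
  beam.causalMask[0].tree_size = 5;
  beam.causalMask[0].mask[0] = 0x1fULL;

  VerifyBatch verify = {};
  int const num_active_requests = 1;
  for (int i = 0; i < num_active_requests; i++) {
    std::memcpy(&verify.causalMask[i], &beam.causalMask[i], sizeof(BitMask));
  }
  std::printf("verify sees tree_size = %d, row0 = 0x%llx\n",
              verify.causalMask[0].tree_size, verify.causalMask[0].mask[0]);
  return 0;
}

Keeping the mask a POD inside BatchConfig also means it can ride along in the same bulk host-to-device copy as the rest of the per-step metadata.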
new_bc.requestsInfo[i].first_token_depth_in_request = request.tokens.size() - 1; - - std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() << "\n"; + + std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() + << "\n"; + // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); @@ -1485,7 +1541,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( break; } } - assert(false); } else if (request.status == Request::PENDING) { std::cout << "prepare next batch verify: pending\n" @@ -1518,6 +1573,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } + memcpy(&(new_bc.causalMask[i]), + &(old_batches.at(0).causalMask[i]), + sizeof(BatchConfig::BitMask)); + // std::cout << "bitmask: " << new_bc.causalMask[i].mask[0] << "\n"; + // assert(false); + // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = request.llm_cache_size; @@ -1643,8 +1704,6 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_size; - // result_index += old_bc.topology_mask[index].allocated_tokens; - if (true) { std::cout << "i = " << i << ", result index = " << result_index << ", value: " << result.token_ids[result_index] @@ -1669,6 +1728,9 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.tokens.back(); request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1; request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1; + request.beam_trees.at(old_bc.model_id) + .treeLayers[0] + .nodes_num_this_layer = 1; if (true) { std::cout << "Store the previous last token to the tree root: " @@ -1677,7 +1739,9 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } std::cout << "leaffffff: " << leaf_node_num << "\n"; - + request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .nodes_num_this_layer = leaf_node_num; for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { request.beam_trees.at(old_bc.model_id) @@ -1751,50 +1815,6 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, std::cout << "token: " << j << ": " << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; } - - // std::set parents; - // std::set childs; - // // cache stealing - // for (int j = 0; j < beam_size; j++) { - // int parent_id = tree.treeLayers[depth].parent_ids[j]; - // if (childs.find(parent_id) == childs.end()) { - // // copy beam slot - // new_bc.beamRequestsInfo[request_index].parent_id[parent_id] = - // tree.treeLayers[depth].parent_ids[j]; - // new_bc.beamRequestsInfo[request_index].probs[parent_id] = - // tree.treeLayers[depth].probs[j]; - // new_bc.beamRequestsInfo[request_index].tokens[parent_id] = - // tree.treeLayers[depth].tokens[j]; - // parents.emplace(j); - // childs.emplace(parent_id); - // } - // } - // if (parents.size() < beam_size) { - // for (int j = 0; j < beam_size; j++) { - // if (parents.find(j) == parents.end()) { - // // this slot has not been assigned - // // find the smallest not assigned child and put in - // if (verbose) { - // std::cout << "request_index" << request_index - // << ", miss slot: " << j << "\n"; - // } - // for (int k = 0; k < beam_size; k++) { - // if (childs.find(k) == childs.end()) { - // // parent -> j to child k; - // new_bc.beamRequestsInfo[request_index].parent_id[k] = - // 
tree.treeLayers[depth].parent_ids[j]; - // new_bc.beamRequestsInfo[request_index].probs[k] = - // tree.treeLayers[depth].probs[j]; - // new_bc.beamRequestsInfo[request_index].tokens[k] = - // tree.treeLayers[depth].tokens[j]; - // parents.emplace(j); - // childs.emplace(k); - // break; - // } - // } - // } - // } - // } } if (verbose) { std::cout << "-----------after parent id exchange-----------" << std::endl; @@ -1809,6 +1829,128 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, } } +// bit mask related function + +// prompt phase, init task +void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, + int initLength) { + assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + + bitmask.prompt_size = initLength; + bitmask.this_layer_size = initLength; + bitmask.tree_size = initLength; + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = i; j < bitmask.prompt_size; j++) { + bitmask.mask[i] |= (1 << j); + } + } + std::cout << "see bit mask" << bitmask.prompt_size << "\n"; + std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; + std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; + std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; +} + +// prepare next init +void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, + int initLength, + int non_tree_size) { + // assert(initLength == 1); + // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + bitmask.non_tree_cache_size = non_tree_size; + bitmask.tree_size = initLength; + bitmask.this_layer_size = initLength; + std::cout << "non_tree_size: " << non_tree_size << "\n"; + bitmask.prompt_size = initLength; + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = i; j < bitmask.prompt_size; j++) { + bitmask.mask[i] |= (1 << j); + } + } + + std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; + std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) + << "\n"; + std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[1]) + << "\n"; + std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[2]) + << "\n"; +} + +// prepare next beam, append layers to the tree +void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, + int newNodes, + int preBeamSize, + int old_sub_num, + BeamTree const tree, + int currentDepth) { + int pre_tree_size = bitmask.tree_size; + bitmask.tree_size += newNodes; + bitmask.this_layer_size = newNodes; + assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + // preBeamSize: replicate num + + // add relationship with input/prompt + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = pre_tree_size; j < bitmask.tree_size; j++) { + bitmask.mask[i] |= (1 << j); + std::cout << "see bit mask append: " << i << ", to" << j + << std::bitset<64>(bitmask.mask[i]) << "\n"; + } + } + + std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " + << pre_tree_size << ", " << bitmask.prompt_size << ", " + << preBeamSize << "\n"; + + // int num_groups = newNodes / preBeamSize; + // int group_size = newNodes / num_groups; + // add relations to branch + // requests in same groups share same relations, except the last token. 
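  // Worked example of the rows this function produces (illustrative sizes, not
  // from a real run): prompt_size = 2, then an append of 2 layer-1 nodes, then
  // an append of 4 layer-2 nodes (each layer-1 node expanding into 2 children):
  //
  //   row 0 (prompt t0): bits 0..7      every later token may attend to it
  //   row 1 (prompt t1): bits 1..7
  //   row 2 (layer-1 n0): bits 2, 4, 5  itself plus its group of new descendants
  //   row 3 (layer-1 n1): bits 3, 6, 7
  //   rows 4..7 (layer-2): own bit only, until a later append links children
  //
  // Prompt rows pick up every newly appended token; each middle-layer row picks
  // up the contiguous group of new tokens descending from it; the freshly
  // appended layer starts out self-visible only.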
+ + // set middle layers + // skip the root prompt/tokens + int token_idx = bitmask.prompt_size; + int new_nodes_start_idx = pre_tree_size; + std::cout << "new nodes start " << new_nodes_start_idx << "\n"; + for (int i = 1; i < currentDepth; i++) { + new_nodes_start_idx = pre_tree_size; + int nodes_this_layer = tree.treeLayers[i].nodes_num_this_layer; + std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer + << "group size: " << newNodes / nodes_this_layer << "\n"; + for (int j = 0; j < nodes_this_layer; j++) { + int group_size = newNodes / nodes_this_layer; + for (int k = 0; k < group_size; k++) { + bitmask.mask[token_idx] |= (1 << new_nodes_start_idx); + new_nodes_start_idx += 1; + } + token_idx += 1; + } + } + + std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " + << new_nodes_start_idx << ", " << newNodes + << "current depth: " << currentDepth << "\n"; + std::cout << "new nodes end " << new_nodes_start_idx << "\n"; + + std::cout << "tree size: " << bitmask.tree_size << "\n"; + assert(token_idx == pre_tree_size); + assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); + + // assert(currentDepth <= 2); + // set last layer, all tokens are only relevant to it self; + for (int i = token_idx; i < bitmask.tree_size; i++) { + bitmask.mask[i] |= (1 << i); + std::cout << "set rel: " << i << "to: " << i << "\n"; + } +} + bool PreOrder( BeamTree const &tree, int max_depth, @@ -1979,7 +2121,7 @@ std::vector> RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, int first_token_depth_in_request) { - if (verbose) { + if (true) { std::cout << "[Traverse Beam Tree] request_index: " << request_index << "\n"; std::cout << "[Traverse Beam Tree] max_depth: " @@ -1988,6 +2130,8 @@ std::vector> << old_bc.beamRequestsInfo[request_index].current_depth << "\n"; std::cout << "[Traverse Beam Tree] beam_width: " << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; + std::cout << "[Traverse Beam Tree] start index: " + << first_token_depth_in_request << "\n"; } auto guid = old_bc.requestsInfo[request_index].request_guid; @@ -1995,27 +2139,39 @@ std::vector> // std::cout << "request.beam_trees.size(): " << request.beam_trees.size() // << std::endl; BeamTree tree = request.beam_trees.at(old_bc.model_id); - // std::cout << "\n\n"; + std::cout << "print beam tree: " + << "\n"; + std::vector> serializedTree; + for (int i = 0; i <= old_bc.beamRequestsInfo[request_index].max_depth; i++) { + std::cout << "tree layer: " << i + << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer + << "\n"; + // push tokens into tree + for (int j = 0; j < tree.treeLayers[i].nodes_num_this_layer; j++) { + std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; + serializedTree.push_back(std::make_pair(tree.treeLayers[i].tokens[j], i)); + } + } // token, index // todo make this one global for different stages - std::vector> serializedTree; - PreOrder(tree, - old_bc.beamRequestsInfo[request_index].max_depth, - 0, - old_bc.beamRequestsInfo[request_index].beam_size, - 0, - serializedTree, - verbose); + + // PreOrder(tree, + // old_bc.beamRequestsInfo[request_index].max_depth, + // 0, + // old_bc.beamRequestsInfo[request_index].beam_size, + // 0, + // serializedTree, + // verbose); // print it - if (verbose) { + if (true) { std::cout << "Print serialized tree: size:" << request_index << serializedTree.size() << "\n"; } for (int k = 0; k < serializedTree.size(); k++) { serializedTree.at(k).second += first_token_depth_in_request; - if 
(verbose) { + if (true) { std::cout << "token id: " << serializedTree.at(k).first << ", depth: " << serializedTree.at(k).second << "\n"; } @@ -2041,6 +2197,9 @@ std::vector> input_trees, int root_depth, RequestGuid guid) { + assert(input_trees.size() == 1 && "currently using one ssm"); + return input_trees.at(0); + std::vector> merged_tree; std::unordered_map> childrens; diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index b76c5c326e..4d7e2c8806 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -104,6 +104,18 @@ void RequestManager::load_tokens_task( sizeof(BeamSearchBatchConfig::beamRequestsInfo), cudaMemcpyHostToDevice, stream); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // sizeof(BatchConfig::tokensInfo) + + // sizeof(BatchConfig::requestsInfo) + + // sizeof(BeamSearchBatchConfig::topology_mask) + + // sizeof(BeamSearchBatchConfig::beamTokenInfo) + + // sizeof(BeamSearchBatchConfig::beamRequestsInfo), + // &(beam_batch_config->causalMask), + // sizeof(BatchConfig::causalMask), + // cudaMemcpyHostToDevice, + // stream); + // std::cout << "copy calsual mask info: " << beam_batch_config->causalMask[0].prompt_size << "\n"; } } From 945268f1a56e804b62b731c136bf8358c47b765f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 28 Dec 2023 11:19:16 -0500 Subject: [PATCH 08/61] fix. --- inference/spec_infer/spec_infer.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cu | 78 ++++++++++---------- src/runtime/request_manager.cc | 11 ++- 3 files changed, 50 insertions(+), 41 deletions(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 2ccdfd388d..e4fa71a1d5 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -404,7 +404,7 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(text); // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 15 /*max_sequence_length*/); + tree_model.generate(prompts, 23 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index a3e3adcc30..3d5ccf9431 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -162,24 +162,24 @@ __global__ void compute_attention_kernel_fused_kernel( (ti >= bitmask.non_tree_cache_size && (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { - printf("tree attn mask for first token %d, %lld, %d, %d\n", - ti, - bitmask.mask[ti - bitmask.non_tree_cache_size], - bitmask.non_tree_cache_size, - qi); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { + // printf("tree attn mask for first token %d, %lld, %d, %d\n", + // ti, + // bitmask.mask[ti - bitmask.non_tree_cache_size], + // bitmask.non_tree_cache_size, + // qi); + // } qk_max = mask ? 
qk_max : fmaxf(qk_max, qk); - if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { - printf("tree attn mask for second token %d, %lld, %d, %d, %.10f\n", - ti, - bitmask.mask[ti - bitmask.non_tree_cache_size], - bitmask.non_tree_cache_size, - qi, - qk); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { + // printf("tree attn mask for second token %d, %lld, %d, %d, %.10f\n", + // ti, + // bitmask.mask[ti - bitmask.non_tree_cache_size], + // bitmask.non_tree_cache_size, + // qi, + // qk); + // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } } @@ -213,10 +213,10 @@ __global__ void compute_attention_kernel_fused_kernel( // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && tidx == 0) { - printf("tree attn first token qk_max %f\n", - qk_max); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", + // qk_max); + // } float exp_sum = 0.f; @@ -232,9 +232,9 @@ __global__ void compute_attention_kernel_fused_kernel( // Compute the sum. exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - printf("expsum %.10f\n", exp_sum); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("expsum %.10f\n", exp_sum); + // } // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); @@ -243,9 +243,9 @@ __global__ void compute_attention_kernel_fused_kernel( } __syncthreads(); - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - printf("softmax %.10f\n", qk_smem[0]); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("softmax %.10f\n", qk_smem[0]); + // } // value projection constexpr int V_VEC_SIZE = 16 / sizeof(DT); @@ -292,9 +292,9 @@ __global__ void compute_attention_kernel_fused_kernel( // // Make sure we can start writing to shared memory. __syncthreads(); - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - printf("valueX %.10f\n", out.x); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("valueX %.10f\n", out.x); + // } // Run the final reduction amongst the different groups computing different // partial outputs. 
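  // Illustrative sketch, assuming the qk_max / exp_sum reductions shown above
  // (the per-element exponentiation lives in code elided from this hunk): the
  // masked, max-subtracted softmax computed here is roughly
  //
  //   float m = -FLT_MAX;
  //   for (int ti = 0; ti < n; ti++) if (!masked(ti)) m = fmaxf(m, qk[ti]);
  //   float s = 0.f;
  //   for (int ti = 0; ti < n; ti++) {
  //     qk[ti] = masked(ti) ? 0.f : expf(qk[ti] - m);
  //     s += qk[ti];
  //   }
  //   float inv = 1.f / (s + 1e-6f);   // cf. __fdividef(1.f, exp_sum + 1.e-6)
  //   for (int ti = 0; ti < n; ti++) qk[ti] *= inv;
  //
  // with m and s reduced across the thread block (qk_max and block_sum above).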
@@ -328,11 +328,11 @@ __global__ void compute_attention_kernel_fused_kernel( output_ptr + (first_token_idx + qi) * hidden_size + head_idx * per_head_size + vi), out); - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", - out.x, out.y, out.z, out.w, vi, (first_token_idx + qi) * hidden_size + - head_idx * per_head_size + vi); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, out.y, out.z, out.w, vi, (first_token_idx + qi) * hidden_size + + // head_idx * per_head_size + vi); + // } } } } @@ -807,12 +807,12 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, // update K-V cache int num_new_tokens = bc->num_active_tokens(); int parallelism = m->hidden_size * num_new_tokens; - printf("update KV cache %d, idx: %d\n", - num_new_tokens, - bc->requestsInfo[0].first_token_depth_in_request); - for (int i = 0; i < num_new_tokens; i++) { - printf("abs depth:%d\n", bc->tokensInfo[i].abs_depth_in_request); - } + // printf("update KV cache %d, idx: %d\n", + // num_new_tokens, + // bc->requestsInfo[0].first_token_depth_in_request); + // for (int i = 0; i < num_new_tokens; i++) { + // printf("abs depth:%d\n", bc->tokensInfo[i].abs_depth_in_request); + // } update_tree_branch_kv_cache_fused<<> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); + log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); @@ -1426,7 +1429,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( if (request.status == Request::RUNNING) { - std::cout << "prepare next batch running: pending\n" + std::cout << "prepare next batch running:\n" << "\n"; new_bc.request_running[i] = true; std::cout << "[Verify] Request " << request.guid << " is running" @@ -1663,6 +1666,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } + std::cout << "check dfs tree input size: " << dfs_tree_inputs[1000000].size() + << "\n"; + return new_bc; } @@ -2198,6 +2204,7 @@ std::vector> int root_depth, RequestGuid guid) { assert(input_trees.size() == 1 && "currently using one ssm"); + dfs_tree_inputs[guid] = input_trees.at(0); return input_trees.at(0); std::vector> merged_tree; @@ -2249,6 +2256,8 @@ std::vector> } dfs_tree_inputs[guid] = merged_tree; + // std::cout << "assign dfr tree: " << guid << ", " << merged_tree.size() << ", " + // << dfs_tree_inputs[guid].size() << "\n"; return merged_tree; } From ce95127aecaf553679539310574b48417609efa2 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 29 Dec 2023 03:41:26 -0500 Subject: [PATCH 09/61] fix --- inference/spec_infer/spec_infer.cc | 4 +- src/ops/kernels/embedding_kernels.cu | 2 +- .../specinfer_inc_multihead_self_attention.cu | 76 ++++--- src/ops/tree_inc_multihead_self_attention.cu | 114 ++++++---- src/runtime/request_manager.cc | 198 +++++++++++------- 5 files changed, 246 insertions(+), 148 deletions(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index e4fa71a1d5..9af3e12e5a 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -402,9 +402,9 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); + // tree_model.generate(text, 128 /*max_sequence_length*/); } - 
tree_model.generate(prompts, 23 /*max_sequence_length*/); + tree_model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 91f5d60e85..0cde42de56 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,7 +118,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } - print_tensor(input.get_int32_ptr(), 32, "embeddinginput"); + // print_tensor(input.get_int32_ptr(), 32, "embeddinginput"); } /*static*/ diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index e8ac1d980c..f2ea63d904 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -83,9 +83,9 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int const tlength = request_infos[request_idx].first_token_depth_in_request + request_infos[request_idx].num_tokens_in_batch; - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); + // } int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; // int const qlength = request_infos[request_idx].num_tokens_in_batch; @@ -181,6 +181,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( } float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + // if (blockIdx.y == 0 && blockIdx.x == 0) { + // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, sub_req_idx); + // } + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { // todo add alobi here // bool const mask = ti_circ >= totalCacheSize; @@ -188,15 +192,15 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << query_token)))); - if (blockIdx.y == 0 && blockIdx.x == 0 && mask && sub_req_idx == 0) { - // printf("specinfer mask: ti:%d, %d, %d, %d, %lld\n", - // ti, - // totalCacheSize, - // ti - bitmask.non_tree_cache_size, - // query_token, - // bitmask.mask[ti - bitmask.non_tree_cache_size]); - // assert(false); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && sub_req_idx == 0) { + // printf("specinfer mask: ti:%d, %d, %d, %d, %lld\n", + // ti, + // totalCacheSize, + // bitmask.non_tree_cache_size, + // query_token, + // bitmask.mask[ti - bitmask.non_tree_cache_size]); + // // assert(false); + // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; } @@ -231,6 +235,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn first token qk_max %.10f\n", qk_max); + // } + float exp_sum = 0.f; for (int ti = first_step + tidx; ti < totalCacheSize; ti += THREADS_PER_BLOCK) { @@ -245,6 +253,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // Compute the sum. 
exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn exp_sum %.10f\n", exp_sum); + // } + // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); for (int ti = first_step + tidx; ti < totalCacheSize; @@ -301,6 +313,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // // Make sure we can start writing to shared memory. __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("valueX %.10f\n", out.x); + // } + // Run the final reduction amongst the different groups computing different // partial outputs. if (Dh == Dh_MAX || vi < Dh) { @@ -357,8 +373,8 @@ __global__ void specinfer_store_kv_cache( int max_tree_branches, bool is_root, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / (hidden_size); int offset = i % hidden_size; size_t val_idx = @@ -416,6 +432,16 @@ __global__ void specinfer_store_kv_cache( // cache_idx); // } + // if (i % hidden_size == 0) { + // printf("update cache: %d, %d, %d, %d, %d, %d\n", + // cache_idx, + // num_tokens, + // bitmask.non_tree_cache_size, + // bitmask.tree_size, + // bitmask.this_layer_size, + // token_idx); + // } + kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + @@ -433,9 +459,9 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - printf("tokenInfo %d, %d\n", - bc->beamTokenInfo[0].sub_request_index, - num_tokens); + // printf("tokenInfo %d, %d\n", + // bc->beamTokenInfo[0].sub_request_index, + // num_tokens); specinfer_store_kv_cache<<num_active_requests() * sizeof(BatchConfig::BitMask), cudaMemcpyHostToDevice, stream); - std::cout << "kernel bit mask: " << bc->causalMask[0].prompt_size << ", " - << bc->causalMask[0].non_tree_cache_size << ", " - << bc->causalMask[0].mask[0] << ", " << sizeof(BatchConfig::BitMask) - << "\n"; + // std::cout << "kernel bit mask: " << bc->causalMask[0].prompt_size << ", " + // << bc->causalMask[0].non_tree_cache_size << ", " + // << bc->causalMask[0].mask[0] << ", " << + // sizeof(BatchConfig::BitMask) + // << "\n"; compute_qkv_kernel(m, bc, shard_id, @@ -800,8 +827,8 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, stream); // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); - std::cout << "specinfer kernel token num: " << bc->num_generation_tokens - << ", " << bc->num_tokens << "\n"; + // std::cout << "specinfer kernel token num: " << bc->num_generation_tokens + // << ", " << bc->num_tokens << "\n"; if (bc->num_generation_tokens > 0) { compute_specinfer_attention_kernel_generation
<DT>( m, bc, static_cast<DT *>
(m->attn_heads), stream); @@ -809,6 +836,7 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 if (bc->num_tokens > bc->num_generation_tokens) { + // printf("spec inc prompt decoding\n"); compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } @@ -892,7 +920,7 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } - // print_tensor(output.get_float_ptr(), 32, "specinc output"); + // print_tensor(output.get_float_ptr(), 32, "specinc output"); // if(bc->num_tokens == 1){ // print_tensor(input.get_float_ptr(), 32, "specinc input"); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 3d5ccf9431..180a165451 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -86,13 +86,13 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::BitMask bitmask = causalMask[request_idx]; // bitmask.mask[1] = 3; - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("tree attn fused kernel %d, %d, %d, %lld\n", - tlength, - qlength, - bitmask.non_tree_cache_size, - bitmask.mask[1]); - } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("tree attn fused kernel %d, %d, %d, %lld\n", + // tlength, + // qlength, + // bitmask.non_tree_cache_size, + // bitmask.mask[3]); + // } int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { @@ -161,7 +161,7 @@ __global__ void compute_attention_kernel_fused_kernel( bool const mask = (ti >= bitmask.non_tree_cache_size && (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { // printf("tree attn mask for first token %d, %lld, %d, %d\n", // ti, @@ -169,16 +169,22 @@ __global__ void compute_attention_kernel_fused_kernel( // bitmask.non_tree_cache_size, // qi); // } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 3 && mask) { + // printf("tree attn mask for third token %d, %lld, %d, %d\n", + // ti, + // bitmask.mask[ti - bitmask.non_tree_cache_size], + // bitmask.non_tree_cache_size, + // qi); + // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { - // printf("tree attn mask for second token %d, %lld, %d, %d, %.10f\n", + // printf("tree attn qkqkqkqk %d %.10f, %.10f, %.10f\n", // ti, - // bitmask.mask[ti - bitmask.non_tree_cache_size], - // bitmask.non_tree_cache_size, - // qi, - // qk); + // qk, + // q_vecs[ki_o][0].x, + // k[0].x); // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } @@ -212,12 +218,10 @@ __global__ void compute_attention_kernel_fused_kernel( // Broadcast to all the threads in the warp. 
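  // __shfl_sync with a full lane mask (uint32_t(-1)) and source lane 0 copies
  // lane 0's qk_max to every lane of the warp, so each thread sees the same
  // reduced maximum before the exp_sum pass below.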
qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && tidx == 0) { - // printf("tree attn first token qk_max %f\n", - // qk_max); - // } + // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", qk_max); + // } float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { @@ -244,7 +248,7 @@ __global__ void compute_attention_kernel_fused_kernel( __syncthreads(); // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - // printf("softmax %.10f\n", qk_smem[0]); + // printf("softmax %.10f\n", qk_smem[1]); // } // value projection @@ -280,12 +284,13 @@ __global__ void compute_attention_kernel_fused_kernel( V_vec v = *reinterpret_cast( v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - float logit = mask ? 0.0f : qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - + if (ti < tlength) { + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } } } @@ -328,11 +333,16 @@ __global__ void compute_attention_kernel_fused_kernel( output_ptr + (first_token_idx + qi) * hidden_size + head_idx * per_head_size + vi), out); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", - // out.x, out.y, out.z, out.w, vi, (first_token_idx + qi) * hidden_size + - // head_idx * per_head_size + vi); - // } + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, + // out.y, + // out.z, + // out.w, + // vi, + // (first_token_idx + qi) * hidden_size + head_idx * per_head_size + + // vi); + // } } } } @@ -349,11 +359,12 @@ __global__ void commit_tokens_kernel( int num_tokens_to_commit, int num_active_tokens_in_last_batch, int max_seq_len, - int hidden_size) { + int hidden_size, + int max_tree_branches) { - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - int token_pos = i / (hidden_size * KV_WEIGHT_NUM); + int token_pos = i / (hidden_size); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); @@ -367,10 +378,23 @@ __global__ void commit_tokens_kernel( int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + // if(i == 0){ + // printf("commit token: %d %d %f\n", token_idx_in_last_batch, tok_id, + // kVal); + // } + // if(i == hidden_size){ + // printf("commit token 1: %d %d %f\n", token_idx_in_last_batch, tok_id, + // kVal); + // } + // if(i == 2 * hidden_size){ + // printf("commit token 2: %d %d %f\n", token_idx_in_last_batch, tok_id, + // kVal); + // } + + kCache_ptr[req_id * max_tree_branches * (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = kVal; + vCache_ptr[req_id * 
max_tree_branches * (hidden_size * max_seq_len) + + tok_id * hidden_size + offset] = vVal; } } @@ -395,7 +419,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch BatchConfig::max_sequence_length(), - m->hidden_size); + m->hidden_size, + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); } } @@ -413,9 +438,9 @@ __global__ void update_tree_branch_kv_cache( int total_tokens_in_batch, int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int token_idx = i / (hidden_size); int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch @@ -460,6 +485,11 @@ __global__ void update_tree_branch_kv_cache_fused( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + // if(i % hidden_size == 0){ + // printf("update token id: %d, %d\n", token_idx, token_idx + + // first_token_depth); + // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + (token_idx + first_token_depth) * hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + @@ -879,7 +909,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache - std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << "\n"; + // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << + // "\n"; cudaMemcpyAsync(m->committed_token_infos, &(bc->committed_tokens), @@ -925,6 +956,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); + // print_tensor((float *)m->devQKVProjArray, 32, "qkvtenor"); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e7b08f653d..d5c2b7392d 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -609,6 +609,8 @@ BeamSearchBatchConfig committed_tokens[guid].emplace_back(abs_depth, result_index); } else if (abs_depth >= root_abs_depth) { tree_outputs.emplace_back(token_id, abs_depth + 1); + std::cout << "committred tokens push: " << abs_depth + << " ,result index: " << result_index << "\n"; committed_tokens[guid].emplace_back(abs_depth, result_index); if (verbose) { @@ -789,9 +791,9 @@ BeamSearchBatchConfig // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; - std::cout << "num_gen ++ " - << "\n"; - num_generation_tokens++; + // std::cout << "num_gen ++ " + // << "\n"; + // num_generation_tokens++; // Add verified token to request's token list request.tokens.push_back(token.first); @@ -923,9 +925,7 @@ BeamSearchBatchConfig new_bc.num_tokens++; } new_bc.topology_mask[i].allocated_tokens = 0; - new_bc.causalMask[i].non_tree_cache_size = 0; - new_bc.causalMask[i].tree_size = - new_bc.requestsInfo[i].num_tokens_in_batch; + initBitMask(new_bc.causalMask[i], new_bc.requestsInfo[i].num_tokens_in_batch); @@ -1185,14 +1185,14 @@ BeamSearchBatchConfig // sub_request_num -> nodes of input next iteration // beam_size replicate num - std::cout << "print beam tree: " - << old_bc.beamRequestsInfo[i].current_depth << "\n"; + // std::cout << "print beam tree: " + // << old_bc.beamRequestsInfo[i].current_depth << "\n"; BeamTree tree = request.beam_trees[old_bc.model_id]; - for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { - std::cout << "layer: " << k << "\n"; - std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer - << "\n"; - } + // for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { + // std::cout << "layer: " << k << "\n"; + // std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer + // << "\n"; + // } appendBitMask(new_bc.causalMask[i], new_bc.beamRequestsInfo[i].sub_request_num, old_bc.beamRequestsInfo[i].beam_size, @@ -1217,9 +1217,10 @@ BeamSearchBatchConfig new_bc.topology_mask[i].real_token_pos[k][depth] = new_bc.topology_mask[i].allocated_tokens + num_generation_tokens; - std::cout << "topology: sub request: " << k << ", " - << ", " << depth << ", " - << new_bc.topology_mask[i].real_token_pos[k][depth] << "\n"; + // std::cout << "topology: sub request: " << k << ", " + // << ", " << depth << ", " + // << new_bc.topology_mask[i].real_token_pos[k][depth] << + // "\n"; num_generation_tokens++; } } @@ -1354,13 +1355,13 @@ BeamSearchBatchConfig } if (true) { - std::cout << "print all resultsBBB" - << "\n"; - for (int i = 0; i < 40; i++) { - std::cout << result.token_ids[i] << ", "; - } - std::cout << "Current Beam DepthBBB: " - << old_bc.beamRequestsInfo[0].current_depth << "\n"; + // std::cout << "print all resultsBBB" + // << "\n"; + // for (int i = 0; i < 40; i++) { + // std::cout << result.token_ids[i] << ", "; + // } + // std::cout << "Current Beam DepthBBB: " + // << old_bc.beamRequestsInfo[0].current_depth << "\n"; } return new_bc; } @@ -1449,11 +1450,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); if 
(true) { - std::cout << "Request Tokens Size: " << request.tokens.size() - << std::endl; - for (int k = 0; k < request.tokens.size(); k++) { - std::cout << k << ": " << request.tokens[k] << std::endl; - } + // std::cout << "Request Tokens Size: " << request.tokens.size() + // << std::endl; + // for (int k = 0; k < request.tokens.size(); k++) { + // std::cout << k << ": " << request.tokens[k] << std::endl; + // } } // Normal Request Info @@ -1475,27 +1476,42 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; + std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " + << new_bc.causalMask[i].tree_size << ", " + << new_bc.causalMask[i].non_tree_cache_size << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) + << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[1]) + << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[2]) + << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[3]) + << "\n"; + std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[4]) + << "\n"; + // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { - for (int j = 0; j < dfs_tree_inputs.size(); j++) { - if (j < committed_tokens.at(guid).size()) { - auto committed_token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (true) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; - } - new_bc.num_tokens_to_commit++; - request.llm_cache_size++; + for (int j = 0; j < committed_tokens.at(guid).size(); j++) { + // if (j < committed_tokens.at(guid).size()) { + + auto committed_token = committed_tokens.at(guid).at(j); + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + committed_token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + committed_token.first; + if (true) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second + << std::endl; } + new_bc.num_tokens_to_commit++; + request.llm_cache_size++; + // } } } if (true) { @@ -1759,11 +1775,11 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .parent_ids[beam_id] = result.parent_id[result_index]; - std::cout << "??????? beam id: " << beam_id << ", token: " - << request.beam_trees.at(old_bc.model_id) - .treeLayers[depth] - .tokens[beam_id] - << "\n"; + // std::cout << "??????? beam id: " << beam_id << ", token: " + // << request.beam_trees.at(old_bc.model_id) + // .treeLayers[depth] + // .tokens[beam_id] + // << "\n"; // if (true) { // std::cout << "tree value: " << depth << "token: " @@ -1844,19 +1860,20 @@ void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, "do not support tree size > 64"); // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 + bitmask.non_tree_cache_size = 0; + bitmask.tree_size = initLength; bitmask.prompt_size = initLength; bitmask.this_layer_size = initLength; - bitmask.tree_size = initLength; for (int i = 0; i < bitmask.prompt_size; i++) { for (int j = i; j < bitmask.prompt_size; j++) { bitmask.mask[i] |= (1 << j); } } - std::cout << "see bit mask" << bitmask.prompt_size << "\n"; - std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; - std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; - std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; + // std::cout << "see bit mask" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; } // prepare next init @@ -1868,11 +1885,16 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, // 0000000..1000 assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && "do not support tree size > 64"); - bitmask.non_tree_cache_size = non_tree_size; - bitmask.tree_size = initLength; + assert(initLength >= 1 && "verified token num should >= 1"); + + std::cout << "non tree size: " << non_tree_size << ", " + << bitmask.non_tree_cache_size << "\n"; + + bitmask.non_tree_cache_size = non_tree_size + initLength - 1; + bitmask.tree_size = 1; bitmask.this_layer_size = initLength; std::cout << "non_tree_size: " << non_tree_size << "\n"; - bitmask.prompt_size = initLength; + bitmask.prompt_size = 1; for (int i = 0; i < bitmask.prompt_size; i++) { for (int j = i; j < bitmask.prompt_size; j++) { bitmask.mask[i] |= (1 << j); @@ -1906,14 +1928,14 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, for (int i = 0; i < bitmask.prompt_size; i++) { for (int j = pre_tree_size; j < bitmask.tree_size; j++) { bitmask.mask[i] |= (1 << j); - std::cout << "see bit mask append: " << i << ", to" << j - << std::bitset<64>(bitmask.mask[i]) << "\n"; + // std::cout << "see bit mask append: " << i << ", to" << j + // << std::bitset<64>(bitmask.mask[i]) << "\n"; } } - std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " - << pre_tree_size << ", " << bitmask.prompt_size << ", " - << preBeamSize << "\n"; + // std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " + // << pre_tree_size << ", " << bitmask.prompt_size << ", " + // << preBeamSize << "\n"; // int num_groups = newNodes / preBeamSize; // int group_size = newNodes / num_groups; @@ -1924,12 +1946,12 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, // skip the root prompt/tokens int token_idx = bitmask.prompt_size; int new_nodes_start_idx = pre_tree_size; - std::cout << "new nodes start " << new_nodes_start_idx << "\n"; + // std::cout << "new nodes start " << new_nodes_start_idx << "\n"; for (int i = 1; i < currentDepth; i++) { new_nodes_start_idx = pre_tree_size; int nodes_this_layer = tree.treeLayers[i].nodes_num_this_layer; - std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer - << "group size: " << newNodes / nodes_this_layer << "\n"; + // std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer + // << "group size: " << newNodes / nodes_this_layer << "\n"; for (int j = 0; j < nodes_this_layer; j++) { int group_size = newNodes / nodes_this_layer; for (int k = 0; k < group_size; k++) { @@ -1940,12 +1962,12 
@@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, } } - std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " - << new_nodes_start_idx << ", " << newNodes - << "current depth: " << currentDepth << "\n"; - std::cout << "new nodes end " << new_nodes_start_idx << "\n"; + // std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " + // << new_nodes_start_idx << ", " << newNodes + // << "current depth: " << currentDepth << "\n"; + // std::cout << "new nodes end " << new_nodes_start_idx << "\n"; - std::cout << "tree size: " << bitmask.tree_size << "\n"; + // std::cout << "tree size: " << bitmask.tree_size << "\n"; assert(token_idx == pre_tree_size); assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); @@ -1953,8 +1975,23 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, // set last layer, all tokens are only relevant to it self; for (int i = token_idx; i < bitmask.tree_size; i++) { bitmask.mask[i] |= (1 << i); - std::cout << "set rel: " << i << "to: " << i << "\n"; + // std::cout << "set rel: " << i << "to: " << i << "\n"; } + + // if(bitmask.non_tree_cache_size == 19 && bitmask.tree_size > 2){ + // assert(false); + // } + + std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; + std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; + std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) + << "\n"; + std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[1]) + << "\n"; + std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[2]) + << "\n"; + std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[3]) + << "\n"; } bool PreOrder( @@ -2146,16 +2183,16 @@ std::vector> // << std::endl; BeamTree tree = request.beam_trees.at(old_bc.model_id); - std::cout << "print beam tree: " - << "\n"; + // std::cout << "print beam tree: " + // << "\n"; std::vector> serializedTree; for (int i = 0; i <= old_bc.beamRequestsInfo[request_index].max_depth; i++) { - std::cout << "tree layer: " << i - << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer - << "\n"; + // std::cout << "tree layer: " << i + // << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer + // << "\n"; // push tokens into tree for (int j = 0; j < tree.treeLayers[i].nodes_num_this_layer; j++) { - std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; + // std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; serializedTree.push_back(std::make_pair(tree.treeLayers[i].tokens[j], i)); } } @@ -2256,7 +2293,8 @@ std::vector> } dfs_tree_inputs[guid] = merged_tree; - // std::cout << "assign dfr tree: " << guid << ", " << merged_tree.size() << ", " + // std::cout << "assign dfr tree: " << guid << ", " << merged_tree.size() << + // ", " // << dfs_tree_inputs[guid].size() << "\n"; return merged_tree; From 3ed25d681127d742770776b8d07d9771e0e19f79 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 29 Dec 2023 16:10:16 -0500 Subject: [PATCH 10/61] multi batch --- src/ops/beam_topk.cc | 3 +- src/ops/beam_topk.cu | 3 +- .../specinfer_inc_multihead_self_attention.cu | 66 +++++++------------ .../tree attn kernel, 0----> -0.029753357172 | 1 + src/ops/tree_inc_multihead_self_attention.cu | 45 +++++++++---- src/runtime/request_manager.cc | 37 ++++++++--- 6 files changed, 89 insertions(+), 66 deletions(-) create mode 100644 src/ops/tree attn kernel, 0----> -0.029753357172 diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 3f636c2c98..20d019eec3 100644 --- 
a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -402,8 +402,7 @@ BeamInferenceResult download_tensor( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); - print_tensor(index_ptr, 32, "indexxxxxxx"); - printf("max beam width %d\n", m->max_beam_width); + // print_tensor(index_ptr, 32, "indexxxxxxx"); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 515bba4bc0..d647fe9ed7 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -626,7 +626,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, stream)); // trick, set acc_probs to 0; checkCUDA( - cudaMemsetAsync(m->acc_probs, 1.0, batch_size * sizeof(DT), stream)); + cudaMemsetAsync(m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); checkCUDA(cudaMemcpyAsync(m->block_start_index, beam_block_start_index.data(), sizeof(int) * beam_num_blocks, @@ -644,6 +644,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, stream)); // int depth = // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; + beam_num_blocks = bc->num_active_tokens(); beam_topk_forward_kernel<<>>( input_ptr, shared_memory_size, diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index f2ea63d904..3fdd1ab554 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -100,6 +100,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( first_token_idx += bitmask.this_layer_size; } + // if (tidx == 0 && head_idx == 0) { + // printf("spec req: %d, %d\n", request_idx, first_token_idx); + // } + // shared memory objects extern __shared__ char smem_[]; @@ -135,17 +139,16 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int ti_end = div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; - for (int sub_req_idx = 0; sub_req_idx < tree_branch_num; sub_req_idx += 1) { + for (int qi = 0; qi < tree_branch_num; qi += 1) { #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + (hidden_size * QKV_WEIGHT_NUM * sub_req_idx) + ki + + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); } - int const query_token = bitmask.tree_size - tree_branch_num + sub_req_idx; - - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && sub_req_idx == 0) { + int const query_token = bitmask.tree_size - tree_branch_num + qi; + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 0) { // printf("fuckmasksss %d, %d, %d, %d, %d\n", // bitmask.prompt_size, // bitmask.non_tree_cache_size, @@ -345,11 +348,10 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // Output the final values. 
if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float( - *reinterpret_cast(output_ptr + - (request_idx + sub_req_idx) * hidden_size + - head_idx * per_head_size + vi), - out); + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); } } } @@ -391,6 +393,9 @@ __global__ void specinfer_store_kv_cache( int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; int const total_token = requestInfo[req_id].num_tokens_in_batch; + int const request_token_offset = + requestInfo[req_id].first_token_offset_in_batch; + BatchConfig::BitMask bitmask = causalMask[req_id]; int const sub_request_num = beamRequestInfos[req_id].sub_request_num; @@ -404,42 +409,18 @@ __global__ void specinfer_store_kv_cache( // if prompt token -> token id // if tree token: int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - - bitmask.this_layer_size + token_idx; + bitmask.this_layer_size + token_idx - + request_token_offset; int real_idx = tok_id - first_token_in_req + allocated_tokens + sub_req_id; - // if (i == 0) { - // printf("ffasdasds%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", - // beamTokenInfos[0].sub_request_index, - // allocated_tokens, - // sub_req_id, - // tok_id, - // first_token_in_req, + // if (i % hidden_size == 0) { + // printf("ffasdasds request %d, real idx %d, cache idx %d token id %d, kval %.10f\n", + // req_id, // real_idx, // cache_idx, - // bitmask.non_tree_cache_size, - // bitmask.tree_size, - // sub_request_num, - // token_idx ); - // } else if (i == hidden_size * 2) { - // printf("hshddhdhdsdaww%d, %d, %d, %d, %d, %d, %d\n", - // beamTokenInfos[0].sub_request_index, - // allocated_tokens, - // sub_req_id, // tok_id, - // first_token_in_req, - // real_idx, - // cache_idx); - // } - - // if (i % hidden_size == 0) { - // printf("update cache: %d, %d, %d, %d, %d, %d\n", - // cache_idx, - // num_tokens, - // bitmask.non_tree_cache_size, - // bitmask.tree_size, - // bitmask.this_layer_size, - // token_idx); + // kVal); // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + @@ -846,6 +827,8 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); + // std::cout << "specinfer num tokens: " << num_tokens; + compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } @@ -920,7 +903,8 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } - // print_tensor(output.get_float_ptr(), 32, "specinc output"); + // save_tensor(output.get_float_ptr(), 768 * 3, "/home/xinhaoc/FlexFlow/inference/output/fk1.txt"); + // save_tensor(output.get_float_ptr() + 768 * 3, 768 * 3, "/home/xinhaoc/FlexFlow/inference/output/fk2.txt"); // if(bc->num_tokens == 1){ // print_tensor(input.get_float_ptr(), 32, "specinc input"); diff --git a/src/ops/tree attn kernel, 0----> -0.029753357172 b/src/ops/tree attn kernel, 0----> -0.029753357172 new file mode 100644 index 0000000000..e4f14ee757 --- /dev/null +++ b/src/ops/tree attn kernel, 0----> -0.029753357172 @@ -0,0 +1 @@ +tree attn kernel, 0----> -0.02975335717201232910 0.01930358447134494781 0.03780741989612579346 0.11878532171249389648 -0.03523746877908706665 0.02421043440699577332 0.03719477355480194092 -0.00304851122200489044 0.02062662504613399506 
0.06683708727359771729 -0.00642335414886474609 -0.00504039414227008820 0.02955199964344501495 0.00648811273276805878 0.00558663159608840942 0.02003456838428974152 -0.04041406139731407166 0.00736814411357045174 -0.04575226455926895142 0.03949077427387237549 0.05742383748292922974 0.04866250604391098022 0.04687267541885375977 -0.00701304525136947632 -0.03712264448404312134 -0.02175992354750633240 -0.03979443758726119995 0.03961737453937530518 -0.07450901716947555542 0.02090370282530784607 -0.03487894684076309204 0.01653470844030380249 \ No newline at end of file diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 180a165451..11169fa36d 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -99,6 +99,10 @@ __global__ void compute_attention_kernel_fused_kernel( first_token_idx += request_infos[request_idx].num_tokens_in_batch; } + // if(tidx == 0 && head_idx == 0){ + // printf("tree req: %d, %d\n", request_idx, first_token_idx); + // } + // shared memory objects extern __shared__ char smem_[]; @@ -140,6 +144,12 @@ __global__ void compute_attention_kernel_fused_kernel( q_vecs[ki_o][ii] = *reinterpret_cast( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + + // if (head_idx == 0 && qi == 1 && tidx == 0) { + // printf("laod q %d, %d %.10f\n", + // request_idx, + // qi,q_vecs[ki_o][ii].x); + // } } __syncthreads(); @@ -162,11 +172,12 @@ __global__ void compute_attention_kernel_fused_kernel( (ti >= bitmask.non_tree_cache_size && (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 0 && mask) { - // printf("tree attn mask for first token %d, %lld, %d, %d\n", + // if (head_idx == 0 && qi == 9 && mask) { + // printf("tree attn mask for first token %d, %lld, %d, %d, %d\n", // ti, // bitmask.mask[ti - bitmask.non_tree_cache_size], // bitmask.non_tree_cache_size, + // request_idx, // qi); // } // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 3 && mask) { @@ -179,11 +190,15 @@ __global__ void compute_attention_kernel_fused_kernel( qk_max = mask ? qk_max : fmaxf(qk_max, qk); - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && !mask) { - // printf("tree attn qkqkqkqk %d %.10f, %.10f, %.10f\n", + // if (head_idx == 0 && qi == 1 && !mask && tidx == 0) { + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n", + // request_idx, // ti, // qk, // q_vecs[ki_o][0].x, + // q_vecs[ki_o][1].x, + // q_vecs[ki_o][2].x, + // q_vecs[ki_o][3].x, // k[0].x); // } qk_smem[ti - first_step] = mask ? 0.0f : qk; @@ -219,7 +234,7 @@ __global__ void compute_attention_kernel_fused_kernel( // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 1 && tidx == 0) { + // if (head_idx == 0 && qi == 9 && tidx == 0) { // printf("tree attn first token qk_max %f\n", qk_max); // } @@ -236,7 +251,7 @@ __global__ void compute_attention_kernel_fused_kernel( // Compute the sum. 
exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // if (head_idx == 0 && tidx == 0 && qi == 9) { // printf("expsum %.10f\n", exp_sum); // } @@ -247,7 +262,7 @@ __global__ void compute_attention_kernel_fused_kernel( } __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // if (head_idx == 0 && tidx == 0 && qi == 9) { // printf("softmax %.10f\n", qk_smem[1]); // } @@ -465,6 +480,7 @@ __global__ void update_tree_branch_kv_cache_fused( DT *kCache_ptr, DT *vCache_ptr, TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo *request_infos, int qProjSize, int kProjSize, int vProjSize, @@ -486,14 +502,15 @@ __global__ void update_tree_branch_kv_cache_fused( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; + // if(i % hidden_size == 0){ - // printf("update token id: %d, %d\n", token_idx, token_idx + - // first_token_depth); + // printf("update token request id: %d, %d, %d value%.10f\n", req_id, token_idx, request_token_offset, kVal); // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (token_idx + first_token_depth) * hidden_size + offset] = kVal; + (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (token_idx + first_token_depth) * hidden_size + offset] = vVal; + (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = vVal; } } @@ -851,6 +868,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, static_cast
<DT *>(m->keyCache), static_cast<DT *>
(m->valueCache), m->token_infos, + m->request_infos, m->qProjSize, m->kProjSize, m->vProjSize, @@ -956,7 +974,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, static_cast<DT *>
(m->devQKVProjArray), bias_ptr, stream); - // print_tensor((float *)m->devQKVProjArray, 32, "qkvtenor"); + // print_tensor((float *)m->devQKVProjArray + 768 * 8 * 3 + 768, 32, "qkvtenor1"); + // print_tensor((float *)m->devQKVProjArray + 768 * 18 * 3 + 768, 32, "qkvtenor2"); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( @@ -1000,6 +1019,8 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } + std::cout << "tree input tokens: " <num_active_tokens() << "\n"; + // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); if (use_bias) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d5c2b7392d..ab062a4610 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -634,6 +634,7 @@ BeamSearchBatchConfig if (request.status == Request::RUNNING) { std::cout << "verify running: " << dfs_tree_inputs.at(guid).size() << ", " << tree_outputs.size() << "\n"; + std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); @@ -812,6 +813,7 @@ BeamSearchBatchConfig } log_req_mgr.print("Output: %s", output.c_str()); } + } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; @@ -1185,8 +1187,8 @@ BeamSearchBatchConfig // sub_request_num -> nodes of input next iteration // beam_size replicate num - // std::cout << "print beam tree: " - // << old_bc.beamRequestsInfo[i].current_depth << "\n"; + std::cout << "print beam tree: " + << old_bc.beamRequestsInfo[i].current_depth << "\n"; BeamTree tree = request.beam_trees[old_bc.model_id]; // for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { // std::cout << "layer: " << k << "\n"; @@ -1224,6 +1226,12 @@ BeamSearchBatchConfig num_generation_tokens++; } } + // if(new_bc.beamRequestsInfo[i].current_depth >= 3 && i > 0){ + // assert(false); + // } + + + } } @@ -1709,6 +1717,8 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != guid) { + std::cout << "i is: " << i << "old guid" << guid << " new guid" << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid <<"\n"; + int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; @@ -1722,16 +1732,21 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // Count tokens sent to model in this request to find the final token's // index + + std::cout << "previous result index: "<< result_index; + result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_size; - - if (true) { - std::cout << "i = " << i << ", result index = " << result_index - << ", value: " << result.token_ids[result_index] - << ", leaf node num: " << leaf_node_num << ", depth" << depth - << ", beam size: " << beam_size << "\n"; - } + + std::cout << "after result index: "<< result_index; + + // if (true) { + // std::cout << "i = " << i << ", result index = " << result_index + // << ", value: " << result.token_ids[result_index] + // << ", leaf node num: " << leaf_node_num << ", depth" << depth + // << ", beam size: " << beam_size << "\n"; + // } Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; @@ -1792,7 +1807,9 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } // update the guid and 
start_depth for current request if (i < old_bc.num_tokens) { - guid = old_bc.requestsInfo[index].request_guid; + int new_req_idx = old_bc.tokensInfo[i].request_index; + guid = old_bc.requestsInfo[new_req_idx].request_guid; + std::cout << "update guid: " << guid << ", request idx: " << index<< "\n"; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } From 5c3ad3592f7b71dc705466fa24cb7c7c1e179deb Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 29 Dec 2023 17:37:28 -0500 Subject: [PATCH 11/61] copy metadata once --- include/flexflow/batch_config.h | 6 -- include/flexflow/config.h | 4 +- .../specinfer_inc_multihead_self_attention.h | 1 - src/ops/inc_multihead_self_attention.cu | 13 --- .../specinfer_inc_multihead_self_attention.cu | 94 ++++--------------- src/ops/tree_inc_multihead_self_attention.cu | 65 ++++++------- src/runtime/request_manager.cc | 46 +-------- src/runtime/request_manager.cu | 74 ++++++++------- 8 files changed, 89 insertions(+), 214 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index db5d4a8e48..c3a75e59a4 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -168,14 +168,8 @@ class BeamSearchBatchConfig : public BatchConfig { int sub_request_index; }; - struct SpecInferTopology { - int real_token_pos[MAX_SPECULATIVE_TREE_BRANCHES][MAX_NUM_TOKENS]; - int allocated_tokens; - }; - BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; - SpecInferTopology topology_mask[MAX_NUM_REQUESTS]; // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index fe261dfb48..1526b9291f 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -81,10 +81,10 @@ struct FFHandler { // request info + token info + topolopgy mask info size_t batch_config_metadata_size = sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask); + sizeof(BatchConfig::causalMask) + + sizeof(TreeVerifyBatchConfig::committed_tokens); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/ops/specinfer_inc_multihead_self_attention.h b/include/flexflow/ops/specinfer_inc_multihead_self_attention.h index eb1b2882c3..b6fed1ae25 100644 --- a/include/flexflow/ops/specinfer_inc_multihead_self_attention.h +++ b/include/flexflow/ops/specinfer_inc_multihead_self_attention.h @@ -142,7 +142,6 @@ class SpecInferIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionM Realm::RegionInstance beam_search_reserve_inst; BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; - BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask; BatchConfig::BitMask *causalMask; }; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a05dbbf919..a084f216e9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -825,19 +825,6 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, bias_ptr = static_cast
(m->bias_ptr); } - // todo Xinhao copy how many requests if requests are not continous? - // cudaMemcpyAsync(m->token_infos, - // &(bc->tokensInfo), - // bc->num_active_tokens() * - // sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, - // stream); - // cudaMemcpyAsync(m->request_infos, - // &(bc->requestsInfo), - // bc->max_requests_per_batch() * - // sizeof(BatchConfig::PerRequestInfo), - // cudaMemcpyHostToDevice, - // stream); - // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 3fdd1ab554..4d4afd28e4 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -50,7 +50,6 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int hidden_size, BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BeamSearchBatchConfig::SpecInferTopology *topology_mask, BatchConfig::BitMask *causalMask, int max_tree_branches) { @@ -74,8 +73,6 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // request idx int const request_idx = blockIdx.y; - BeamSearchBatchConfig::SpecInferTopology topology = - topology_mask[request_idx]; BatchConfig::BitMask bitmask = causalMask[request_idx]; int const first_step = 0; @@ -148,23 +145,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( } int const query_token = bitmask.tree_size - tree_branch_num + qi; - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 0) { - // printf("fuckmasksss %d, %d, %d, %d, %d\n", - // bitmask.prompt_size, - // bitmask.non_tree_cache_size, - // tree_branch_num, - // bitmask.tree_size, - // tlength); - // printf("cacheposssssB %d, %d\n", tree_branch_num, - // topology.real_token_pos[0][1]); - // printf("cacheposssssC %d, %d\n", tree_branch_num, - // topology.real_token_pos[0][2]); - // printf("cacheposssssD %d, %d\n", tree_branch_num, - // topology.real_token_pos[0][11]); printf("cacheposssssD %d, %d\n", - // tree_branch_num, topology.real_token_pos[0][12]); - // printf("cacheposssssD %d, %d\n", tree_branch_num, - // topology.real_token_pos[0][13]); - } + __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; @@ -173,10 +154,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; if (ti < totalCacheSize) { - // find the real position of the cache; - // depth: 0, 1, 2, 3, 4, 4, 5, 5 ,5, 5, - // int const real_cache_idx = - // topology.real_token_pos[sub_req_idx][ti]; + k[ii] = *reinterpret_cast( k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + jj); @@ -291,17 +269,12 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( DT const *v_cache_batch = value_cache + request_idx * max_seq_length * hidden_size * max_tree_branches + vi; - // DT const *v_cache_batch = - // value_cache + - // (beam_request_idx * max_beam_width + beam_sub_request_idx) * - // max_seq_length * hidden_size + - // vi; + if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { // Load the values from the cache. 
int const ti_circ = ti % max_seq_length; - // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; V_vec v = *reinterpret_cast( v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); @@ -365,7 +338,6 @@ __global__ void specinfer_store_kv_cache( BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, - BeamSearchBatchConfig::SpecInferTopology *beam_topology_mask, BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, @@ -390,7 +362,6 @@ __global__ void specinfer_store_kv_cache( int const first_token_in_req = requestInfo[req_id].first_token_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const allocated_tokens = beam_topology_mask[req_id].allocated_tokens; int const total_token = requestInfo[req_id].num_tokens_in_batch; int const request_token_offset = @@ -412,17 +383,6 @@ __global__ void specinfer_store_kv_cache( bitmask.this_layer_size + token_idx - request_token_offset; - int real_idx = tok_id - first_token_in_req + allocated_tokens + sub_req_id; - - // if (i % hidden_size == 0) { - // printf("ffasdasds request %d, real idx %d, cache idx %d token id %d, kval %.10f\n", - // req_id, - // real_idx, - // cache_idx, - // tok_id, - // kVal); - // } - kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + @@ -454,7 +414,6 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->request_infos, m->beam_token_infos, m->beam_request_infos, - m->beam_topology_mask, m->causalMask, m->qProjSize, m->kProjSize, @@ -490,7 +449,6 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->hidden_size, \ m->request_infos, \ m->beam_request_infos, \ - m->beam_topology_mask, \ m->causalMask, \ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES) @@ -788,16 +746,6 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { // phase 1: Implement kernel to compute KQV for input tokens - cudaMemcpyAsync(m->causalMask, - &(bc->causalMask), - bc->num_active_requests() * sizeof(BatchConfig::BitMask), - cudaMemcpyHostToDevice, - stream); - // std::cout << "kernel bit mask: " << bc->causalMask[0].prompt_size << ", " - // << bc->causalMask[0].non_tree_cache_size << ", " - // << bc->causalMask[0].mask[0] << ", " << - // sizeof(BatchConfig::BitMask) - // << "\n"; compute_qkv_kernel(m, bc, shard_id, @@ -953,38 +901,30 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; - size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); + // size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; + // size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); + // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + // total_size); - beam_topology_mask = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); beam_token_infos = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask)); + 
sizeof(BatchConfig::requestsInfo)); beam_request_infos = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask) + + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo)); - // causalMask = - // static_cast( - // handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - // sizeof(BatchConfig::requestsInfo) + - // sizeof(BeamSearchBatchConfig::topology_mask) + - // sizeof(BeamSearchBatchConfig::beamTokenInfo)) + - // sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - causalMask = gpu_mem_allocator.allocate_instance( - causal_mask_size); + causalMask = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + + // causalMask = gpu_mem_allocator.allocate_instance( + // causal_mask_size); // beam_token_infos = // gpu_mem_allocator // .allocate_instance( diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 11169fa36d..ebbfac23ea 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -191,8 +191,8 @@ __global__ void compute_attention_kernel_fused_kernel( qk_max = mask ? qk_max : fmaxf(qk_max, qk); // if (head_idx == 0 && qi == 1 && !mask && tidx == 0) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n", - // request_idx, + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, + // %.10f\n", request_idx, // ti, // qk, // q_vecs[ki_o][0].x, @@ -355,7 +355,8 @@ __global__ void compute_attention_kernel_fused_kernel( // out.z, // out.w, // vi, - // (first_token_idx + qi) * hidden_size + head_idx * per_head_size + + // (first_token_idx + qi) * hidden_size + head_idx * + // per_head_size + // vi); // } } @@ -502,15 +503,21 @@ __global__ void update_tree_branch_kv_cache_fused( int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; + int const request_token_offset = + request_infos[req_id].first_token_offset_in_batch; // if(i % hidden_size == 0){ - // printf("update token request id: %d, %d, %d value%.10f\n", req_id, token_idx, request_token_offset, kVal); + // printf("update token request id: %d, %d, %d value%.10f\n", req_id, + // token_idx, request_token_offset, kVal); // } kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = kVal; + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = kVal; vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = vVal; + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = vVal; } } @@ -974,8 +981,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); - // print_tensor((float *)m->devQKVProjArray + 768 * 8 * 3 + 768, 32, "qkvtenor1"); - // print_tensor((float *)m->devQKVProjArray + 768 * 18 * 3 + 768, 32, "qkvtenor2"); + // print_tensor((float *)m->devQKVProjArray + 768 * 8 * 3 + 768, 32, + // "qkvtenor1"); print_tensor((float *)m->devQKVProjArray + 768 * 18 * + // 3 + 768, 32, "qkvtenor2"); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( @@ -1019,7 +1027,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - std::cout << "tree input tokens: " <num_active_tokens() << "\n"; + std::cout << "tree input tokens: " << bc->num_active_tokens() << "\n"; // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); @@ -1128,34 +1136,15 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; - - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo) + - causal_mask_size * sizeof(BatchConfig::BitMask); - if (offload) { - // assert that we have enough reserved work space left - assert(gpu_mem_allocator.reserved_total_size - - gpu_mem_allocator.reserved_allocated_size >= - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); - causalMask = gpu_mem_allocator.allocate_instance( - causal_mask_size); - } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); - causalMask = gpu_mem_allocator.allocate_instance( - causal_mask_size); - } + + causalMask = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + committed_token_infos = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::causalMask)); } cudaStreamSynchronize(stream); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index ab062a4610..670db1ab0e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -766,12 +766,6 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].sub_request_num = 1; new_bc.sub_requests[i] = 1; - new_bc.topology_mask[i].allocated_tokens = request.tokens.size(); - - // assign new kv cache position - for (int j = 0; j < request.tokens.size(); j++) { - new_bc.topology_mask[i].real_token_pos[0][j] = j; - } updateBitMask(new_bc.causalMask[i], verified_tokens.size(), @@ -786,8 +780,6 @@ BeamSearchBatchConfig new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = token.second; - new_bc.topology_mask[i].real_token_pos[0][token.second] = - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request; // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; @@ -846,7 +838,6 @@ BeamSearchBatchConfig } new_bc.beamRequestsInfo[i].sub_request_num = 1; - new_bc.topology_mask[i].allocated_tokens = 0; new_bc.sub_requests[i] = 1; @@ -919,14 +910,12 @@ BeamSearchBatchConfig assert(depth < new_request.tokens.size()); 
new_bc.tokensInfo[new_bc.num_tokens].token_id = new_request.tokens[depth]; - new_bc.topology_mask[i].real_token_pos[0][depth] = depth; // beam search meta data, indicate which sub request this token // belongs to, init to 0; new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; } - new_bc.topology_mask[i].allocated_tokens = 0; initBitMask(new_bc.causalMask[i], new_bc.requestsInfo[i].num_tokens_in_batch); @@ -1120,9 +1109,6 @@ BeamSearchBatchConfig update_beam_metadata( new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); - new_bc.topology_mask[i].allocated_tokens = - old_bc.topology_mask[i].allocated_tokens + - old_bc.beamRequestsInfo[i].sub_request_num; } else { assert(false && "Request should not be pending in beam search phase"); } @@ -1156,31 +1142,9 @@ BeamSearchBatchConfig << std::endl; } - // for (int j = 0; j < request.tokens.size(); j++) { - // new_bc.topology_mask[i].real_token_pos[0][j] = j; - // } - // register more tokens due to the beam width - std::cout << "register more tokens: " - << new_bc.beamRequestsInfo[i].sub_request_num << ", " - << new_bc.requestsInfo[i].num_tokens_in_batch << ", " - << new_bc.topology_mask[i].allocated_tokens << "\n"; - - // copy meta data and replicate - int replicate_num = new_bc.beamRequestsInfo[i].sub_request_num / - old_bc.beamRequestsInfo[i].sub_request_num; - - for (int j = 0; j < old_bc.beamRequestsInfo[i].sub_request_num; j++) { - int old_idx = j; - for (int k = 0; k < replicate_num; k++) { - int new_idx = j * replicate_num + k; - std::cout << "copy from " << old_idx << "to: " << new_idx << "\n"; - memcpy(new_bc.topology_mask[i].real_token_pos[new_idx], - old_bc.topology_mask[i].real_token_pos[old_idx], - sizeof(int) * BatchConfig::MAX_NUM_TOKENS); - } - } + //copy metadata memcpy(&new_bc.causalMask[i], &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); @@ -1215,14 +1179,6 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; - // width first - new_bc.topology_mask[i].real_token_pos[k][depth] = - new_bc.topology_mask[i].allocated_tokens + num_generation_tokens; - - // std::cout << "topology: sub request: " << k << ", " - // << ", " << depth << ", " - // << new_bc.topology_mask[i].real_token_pos[k][depth] << - // "\n"; num_generation_tokens++; } } diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 4d7e2c8806..e8824feda5 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -59,64 +59,74 @@ void RequestManager::load_tokens_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); + size_t total_copy_size = 0; cudaMemcpyAsync(handle.batch_config_metadata, &(batch_config->tokensInfo), - batch_config->num_active_tokens() * - sizeof(BatchConfig::PerTokenInfo), + sizeof(BatchConfig::tokensInfo), cudaMemcpyHostToDevice, stream); + total_copy_size += sizeof(BatchConfig::tokensInfo); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo), + total_copy_size, &(batch_config->requestsInfo), - batch_config->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), + sizeof(BatchConfig::requestsInfo), cudaMemcpyHostToDevice, stream); + total_copy_size += sizeof(BatchConfig::requestsInfo); - // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - 
sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo), - &(beam_batch_config->topology_mask), - sizeof(BeamSearchBatchConfig::topology_mask), - cudaMemcpyHostToDevice, - stream); - - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask), + total_copy_size, &(beam_batch_config->beamTokenInfo), sizeof(BeamSearchBatchConfig::beamTokenInfo), cudaMemcpyHostToDevice, stream); + + total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::topology_mask) + - sizeof(BeamSearchBatchConfig::beamTokenInfo), + total_copy_size, &(beam_batch_config->beamRequestsInfo), sizeof(BeamSearchBatchConfig::beamRequestsInfo), cudaMemcpyHostToDevice, stream); + total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // sizeof(BatchConfig::tokensInfo) + - // sizeof(BatchConfig::requestsInfo) + - // sizeof(BeamSearchBatchConfig::topology_mask) + - // sizeof(BeamSearchBatchConfig::beamTokenInfo) + - // sizeof(BeamSearchBatchConfig::beamRequestsInfo), - // &(beam_batch_config->causalMask), - // sizeof(BatchConfig::causalMask), - // cudaMemcpyHostToDevice, - // stream); - // std::cout << "copy calsual mask info: " << beam_batch_config->causalMask[0].prompt_size << "\n"; + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream); + + total_copy_size += sizeof(BatchConfig::causalMask); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + TreeVerifyBatchConfig const *tree_batch_config = + static_cast(batch_config); + + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream); + total_copy_size += sizeof(BatchConfig::causalMask); + cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream); + total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); } + + // add a size check + assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( From fae148da9a4b495d26642c1929ebe9f25cdf3b1d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 05:11:38 -0500 Subject: [PATCH 12/61] fix some corner cases --- include/flexflow/model.h | 1 + .../inc_multihead_self_attention_utils.cuh | 4 +- include/flexflow/request_manager.h | 7 + inference/spec_infer/spec_infer.cc | 6 +- src/ops/argmax.cc | 2 +- src/ops/beam_topk.cc | 1 + src/ops/inc_multihead_self_attention.cu | 8 +- src/ops/kernels/embedding_kernels.cu | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 18 +-- .../specinfer_inc_multihead_self_attention.cu | 75 +++++----- src/ops/tree_inc_multihead_self_attention.cu | 94 ++++++------ src/runtime/cuda_helper.cu | 2 +- src/runtime/inference_manager.cc | 61 +++++++- src/runtime/model.cc | 17 +++ src/runtime/request_manager.cc | 141 ++++++++++++++---- src/runtime/request_manager.cu | 87 +++++++++++ 16 files changed, 389 insertions(+), 137 deletions(-) diff --git 
a/include/flexflow/model.h b/include/flexflow/model.h index 3602cb108b..9cdbec64a9 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -242,6 +242,7 @@ enum TaskIDs { // InferenceManager & RequestManager RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, + RM_LOAD_BATCH_CONFIG_TASK_ID, RM_PREPARE_NEXT_BATCH_TASK_ID, RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index 0c065b6b0e..1b21a80dc9 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -456,7 +456,7 @@ inline size_t smem_size_in_bytes(int hidden_size_per_head, int threads_per_block) { // The amount of shared memory needed to store the Q*K^T values in float. - size_t qk_sz = div_up(1000 + 1, 4) * 16; + size_t qk_sz = div_up(2000 + 1, 4) * 16; size_t logits_sz = qk_sz; // The total size needed during softmax. @@ -493,7 +493,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, } // todo fix this - int max_qk_length = max_query_length * max_total_length; + int max_qk_length = max_query_length * max_total_length + 1000; // The amount of shared memory needed to store the Q*K^T values in float. size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index dc1939c74b..8cb45e55b4 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -43,6 +43,8 @@ class InferenceManager { void load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset); + void load_inference_metadata_batch_config(BatchConfigFuture const &bc, + FFHandler *handlers); public: FFConfig ff_config; @@ -195,6 +197,11 @@ class RequestManager { Legion::Context ctx, Legion::Runtime *runtime); + static void + load_batch_config_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static BatchConfig prepare_next_batch_task( Legion::Task const *task, std::vector const ®ions, diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 9af3e12e5a..258b2d78eb 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -266,9 +266,9 @@ void FlexFlow::top_level_task(Task const *task, ModelMeta model_metadata; bool use_full_precision = false; bool verbose = false; - int max_requests_per_batch = 16; - int max_tokens_per_batch = 256; - int max_sequence_length = 1024; + int max_requests_per_batch = 10; + int max_tokens_per_batch = 199; + int max_sequence_length = 200; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 0344c707fc..d195a5af75 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -399,7 +399,7 @@ InferenceResult m, shard_id, bc, {}, {}, {input, indices}); } - print_tensor(indices.get_int32_ptr(), 32, "tree attn output"); + // print_tensor(indices.get_int32_ptr(), 199, "tree attn output"); download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 20d019eec3..5dfaae41ee 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -404,6 +404,7 @@ BeamInferenceResult // print_tensor(index_ptr, 32, "indexxxxxxx"); + if 
(m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a084f216e9..2f16dd71c2 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1365,12 +1365,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); break; } default: diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 0cde42de56..3085fdb6ba 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,7 +118,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } - // print_tensor(input.get_int32_ptr(), 32, "embeddinginput"); + print_tensor(input.get_int32_ptr(), 200, "embeddinginput"); } /*static*/ diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 562dee4d93..29e3d9a48d 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -111,15 +111,15 @@ __global__ void spec_store_kv_cache( // naive cache stealing if (sub_req_id != parent_id) { - if (offset == 0 && tok_id == 0) { - printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " - "%d, tok_id %d\n", - beam_depth, - req_id, - sub_req_id, - parent_id, - tok_id); - } + // if (offset == 0 && tok_id == 0) { + // printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " + // "%d, tok_id %d\n", + // beam_depth, + // req_id, + // sub_req_id, + // parent_id, + // tok_id); + // } for (int depth = 0; depth < beam_depth; depth++) { int steal_token_idx = tok_id - beam_depth + depth; diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index 4d4afd28e4..e84ec3095c 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -50,8 +50,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int hidden_size, BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask, - int max_tree_branches) { + BatchConfig::BitMask *causalMask) { // q, k using Q_vec = typename VEC_K::Type; @@ -83,8 +82,14 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { // printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); // } + int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("specinfer attn fused kernel %d, %d\n", + // totalCacheSize,request_infos[request_idx].num_tokens_in_batch); + // } // int const qlength = 
request_infos[request_idx].num_tokens_in_batch; int const tree_branch_num = beam_request_infos[request_idx].sub_request_num; @@ -94,7 +99,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { // first_token_idx += request_infos[request_idx].num_tokens_in_batch; - first_token_idx += bitmask.this_layer_size; + first_token_idx += causalMask[r].this_layer_size; } // if (tidx == 0 && head_idx == 0) { @@ -130,8 +135,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + - request_idx * max_seq_length * hidden_size * max_tree_branches + ki; + key_cache + request_idx * max_seq_length * hidden_size + ki; int ti_end = div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -267,9 +271,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + - request_idx * max_seq_length * hidden_size * max_tree_branches + vi; - + value_cache + request_idx * max_seq_length * hidden_size + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { @@ -344,7 +346,6 @@ __global__ void specinfer_store_kv_cache( int vProjSize, int num_tokens, int max_seq_len, - int max_tree_branches, bool is_root, int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { @@ -383,10 +384,10 @@ __global__ void specinfer_store_kv_cache( bitmask.this_layer_size + token_idx - request_token_offset; - kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (cache_idx)*hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + - (cache_idx)*hidden_size + offset] = vVal; + kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = vVal; } } @@ -419,8 +420,8 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens, - BatchConfig::max_sequence_length(), - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, + BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, /*root*/ curr_depth == 0, m->hidden_size); } @@ -429,7 +430,8 @@ void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, #define LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_sz = smem_size_in_bytes
(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ THREADS_PER_VALUE, \ THDS_PER_BLOCK); \ compute_specinfer_attention_kernel_generation_kernel(m->valueCache), \ output_ptr, \ scale, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ m->qProjSize, \ m->hidden_size, \ m->request_infos, \ m->beam_request_infos, \ - m->causalMask, \ - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES) + m->causalMask) template void compute_specinfer_attention_kernel_generation( @@ -527,11 +529,13 @@ void compute_attention_kernel_prompt( int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int kt_req_block_size = kt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_req_block_size = vt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -580,8 +584,7 @@ void compute_attention_kernel_prompt( // print_tensor((float*)A, 32, "A"); std::cout << "meta: " << num_new_tokens << ", " << total_tokens << "\n"; - DT const *B = static_cast
<DT *>(m->keyCache) +
-                    (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * kt_req_block_size;
+    DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; // if (i == 0 && sub_req_id == 0 && // bc->beam_slots.at(0).current_depth == 1) { @@ -692,8 +695,7 @@ void compute_attention_kernel_prompt( strideC = m->vProjSize; // To get A, skip over V^T entries from previous requests (all heads + // padding) - A = static_cast
<DT *>(m->valueCache) +
-        (i * bc->MAX_SPECULATIVE_TREE_BRANCHES) * vt_req_block_size;
+    A = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous // requests (all heads) B = C_softmax; @@ -851,8 +853,10 @@ void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); } - // save_tensor(output.get_float_ptr(), 768 * 3, "/home/xinhaoc/FlexFlow/inference/output/fk1.txt"); - // save_tensor(output.get_float_ptr() + 768 * 3, 768 * 3, "/home/xinhaoc/FlexFlow/inference/output/fk2.txt"); + // save_tensor(output.get_float_ptr(), 768 * 3, + // "/home/xinhaoc/FlexFlow/inference/output/fk1.txt"); + // save_tensor(output.get_float_ptr() + 768 * 3, 768 * 3, + // "/home/xinhaoc/FlexFlow/inference/output/fk2.txt"); // if(bc->num_tokens == 1){ // print_tensor(input.get_float_ptr(), 32, "specinc input"); @@ -906,7 +910,6 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, // total_size); - beam_token_infos = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + @@ -915,13 +918,13 @@ SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( beam_request_infos = static_cast( handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) - + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + causalMask = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); // causalMask = gpu_mem_allocator.allocate_instance( // causal_mask_size); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index ebbfac23ea..8641e63e38 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -53,7 +53,6 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::PerRequestInfo *request_infos, int num_heads, int num_requests, - int max_tree_branches, BatchConfig::BitMask *causalMask, int qk_smem_sz) { @@ -86,8 +85,9 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::BitMask bitmask = causalMask[request_idx]; // bitmask.mask[1] = 3; - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("tree attn fused kernel %d, %d, %d, %lld\n", + // if (head_idx == 0 && tidx == 0) { + // printf("tree attn fused kernel req id %d %d, %d, %d, %lld\n", + // request_idx, // tlength, // qlength, // bitmask.non_tree_cache_size, @@ -96,12 +96,12 @@ __global__ void compute_attention_kernel_fused_kernel( int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[request_idx].num_tokens_in_batch; + first_token_idx += request_infos[r].num_tokens_in_batch; } - // if(tidx == 0 && head_idx == 0){ - // printf("tree req: %d, %d\n", request_idx, first_token_idx); - // } + if(tidx == 0 && head_idx == 0){ + printf("tree req: %d, %d\n", request_idx, first_token_idx); + } // shared memory objects extern __shared__ char smem_[]; @@ -132,8 +132,7 @@ __global__ void 
compute_attention_kernel_fused_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + - request_idx * max_tree_branches * max_seq_length * hidden_size + ki; + key_cache + request_idx * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -190,17 +189,14 @@ __global__ void compute_attention_kernel_fused_kernel( qk_max = mask ? qk_max : fmaxf(qk_max, qk); - // if (head_idx == 0 && qi == 1 && !mask && tidx == 0) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, - // %.10f\n", request_idx, - // ti, - // qk, - // q_vecs[ki_o][0].x, - // q_vecs[ki_o][1].x, - // q_vecs[ki_o][2].x, - // q_vecs[ki_o][3].x, - // k[0].x); - // } + if (head_idx == 0 && qi == 0 && !mask) { + printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n ", + request_idx, + ti, + qk, + q_vecs[ki_o][0].x, + k[0].x); + } qk_smem[ti - first_step] = mask ? 0.0f : qk; } } @@ -283,8 +279,7 @@ __global__ void compute_attention_kernel_fused_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + - request_idx * max_seq_length * hidden_size * max_tree_branches + vi; + value_cache + request_idx * max_seq_length * hidden_size + vi; // DT const *v_cache_batch = // value_cache + // (beam_request_idx * max_beam_width + beam_sub_request_idx) * @@ -375,8 +370,7 @@ __global__ void commit_tokens_kernel( int num_tokens_to_commit, int num_active_tokens_in_last_batch, int max_seq_len, - int hidden_size, - int max_tree_branches) { + int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { @@ -407,10 +401,10 @@ __global__ void commit_tokens_kernel( // kVal); // } - kCache_ptr[req_id * max_tree_branches * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[req_id * max_tree_branches * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; } } @@ -434,9 +428,9 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length(), - m->hidden_size, - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, + m->hidden_size); } } @@ -488,7 +482,6 @@ __global__ void update_tree_branch_kv_cache_fused( int num_new_tokens, int max_seq_len, int hidden_size, - int max_tree_branches, int first_token_depth) { CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) { @@ -510,11 +503,11 @@ __global__ void update_tree_branch_kv_cache_fused( // printf("update token request id: %d, %d, %d value%.10f\n", req_id, // token_idx, request_token_offset, kVal); // } - kCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + kCache_ptr[req_id * (hidden_size * max_seq_len) + (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_tree_branches) * (hidden_size * max_seq_len) + + vCache_ptr[req_id * (hidden_size * max_seq_len) + (token_idx + first_token_depth - request_token_offset) * hidden_size + offset] = vVal; @@ -569,10 +562,12 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = 
m->kProjSize; int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; int vt_block_size = m->vProjSize; int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -836,7 +831,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_size_in_bytes_tree
(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ THDS_PER_VALUE, \ THDS_PER_BLOCK, \ bc, \ @@ -848,7 +844,20 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, THDS_PER_KEY, \ THDS_PER_VALUE> \ <<>>( \ - static_cast
<DT *>(m->devQKVProjArray), static_cast<DT *>
(m->keyCache), static_cast<DT *>
(m->valueCache), output_ptr, scale, BatchConfig::max_sequence_length(), BatchConfig::max_tokens_per_batch(), m->qProjSize, m->hidden_size, m->request_infos, m->num_q_heads, bc->num_active_requests(), BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, m->causalMask, \ + static_cast
<DT *>(m->devQKVProjArray), \
+          static_cast<DT *>
(m->keyCache), \
+          static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + BatchConfig::max_tokens_per_batch(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->num_q_heads, \ + bc->num_active_requests(), \ + m->causalMask, \ smem_sz[0]) template @@ -880,9 +889,8 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_new_tokens, - BatchConfig::max_sequence_length(), + BatchConfig::max_sequence_length() + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, m->hidden_size, - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, bc->requestsInfo[0].first_token_depth_in_request); dim3 grid(m->num_q_heads, bc->num_active_requests()); @@ -981,9 +989,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); - // print_tensor((float *)m->devQKVProjArray + 768 * 8 * 3 + 768, 32, - // "qkvtenor1"); print_tensor((float *)m->devQKVProjArray + 768 * 18 * - // 3 + 768, 32, "qkvtenor2"); + + // print_tensor((float *)m->devQKVProjArray, 32, "qkvtenor1"); + // print_tensor((float *)m->devQKVProjArray + 768 * (25 * 7) * 3, 32, "qkvtenor2"); // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index fa6bf55fe5..398ed7f3cd 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -226,7 +226,7 @@ __host__ void print_tensor(T const *ptr, printf("%s, %d---->", prefix, shard_id); for (idx = 0; idx < num_elements; idx++) { printf(" %.20lf", (float)host_ptr[idx]); - if (idx >= 100) { + if (idx >= 200) { break; } } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 52fd64c606..e7f7c5f52d 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -320,6 +320,7 @@ FutureMap InferenceManager::inference(FFModel *model, assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; load_input_tokens_from_batch_config(bc, pt, model->handlers); + load_inference_metadata_batch_config(bc, model->handlers); } } @@ -349,18 +350,32 @@ FutureMap InferenceManager::inference(FFModel *model, }; void InferenceManager::load_input_tokens_from_batch_config( - BatchConfigFuture const &bc, ParallelTensor const input, FFHandler *handlers) { + BatchConfigFuture const &bc, + ParallelTensor const input, + FFHandler *handlers) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); - MachineView view = input->machine_view; - for (PointInRectIterator<1> it(task_rect); it(); it++) { - FFHandler handle = handlers[view.get_device_id(*it)]; - argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); + Domain domain = runtime->get_index_space_domain(ctx, input->parallel_is); + + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + MachineView view = input->machine_view; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + argmap.set_point(*it, \ + TaskArgument(&handlers[view.get_device_id(*it)], \ + sizeof(FFHandler))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); } IndexLauncher launcher(RM_LOAD_TOKENS_TASK_ID, @@ -378,6 +393,36 @@ void InferenceManager::load_input_tokens_from_batch_config( runtime->execute_index_space(ctx, launcher); } +void InferenceManager::load_inference_metadata_batch_config( + BatchConfigFuture const &bc, + FFHandler *handlers) { + Context ctx = ff_config.lg_ctx; + Runtime *runtime = ff_config.lg_hlr; + ArgumentMap argmap; + + Rect<1> task_rect(Point<1>(0), + Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); + IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + + // int rank = 0; + int idx = 0; + for (PointInRectIterator<1> it(task_rect); it(); it++) { + FFHandler handler = handlers[idx++]; + argmap.set_point(*it, TaskArgument(&handler, sizeof(FFHandler))); + } + + IndexLauncher launcher(RM_LOAD_BATCH_CONFIG_TASK_ID, + task_is, + TaskArgument(nullptr, 0), 
+ argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + FFConfig::DataParallelism_GPU); + launcher.add_future(bc); + runtime->execute_index_space(ctx, launcher); +} + void InferenceManager::load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 8bda9016c3..cf72f2d40b 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4344,6 +4344,23 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + // RequestManager load metadata + { + TaskVariantRegistrar registrar(RM_LOAD_BATCH_CONFIG_TASK_ID, + "RequestManager Load meta data"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RequestManager Load metadata Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // RequestManager prepare_next_batch { TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 670db1ab0e..5c3262eb27 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -609,8 +609,8 @@ BeamSearchBatchConfig committed_tokens[guid].emplace_back(abs_depth, result_index); } else if (abs_depth >= root_abs_depth) { tree_outputs.emplace_back(token_id, abs_depth + 1); - std::cout << "committred tokens push: " << abs_depth - << " ,result index: " << result_index << "\n"; + // std::cout << "committred tokens push: " << abs_depth + // << " ,result index: " << result_index << "\n"; committed_tokens[guid].emplace_back(abs_depth, result_index); if (verbose) { @@ -621,12 +621,12 @@ BeamSearchBatchConfig tree_outputs.back().second, token_id); } - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - abs_depth, - old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, - token_id); + // std::cout << "Index within old batch: " << result_index << std::endl; + // printf(" Input: [%d] %d ---> [%d] %d \n", + // abs_depth, + // old_bc.tokensInfo[result_index].token_id, + // tree_outputs.back().second, + // token_id); } result_index++; } @@ -634,13 +634,12 @@ BeamSearchBatchConfig if (request.status == Request::RUNNING) { std::cout << "verify running: " << dfs_tree_inputs.at(guid).size() << ", " << tree_outputs.size() << "\n"; - + std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); - // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { @@ -805,7 +804,12 @@ BeamSearchBatchConfig } log_req_mgr.print("Output: %s", output.c_str()); } - + + if (request.tokens.size() > 19 && i >= 7) { + std::cout << request.tokens.size() << "\n"; + assert(false); + } + } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; @@ -1099,7 +1103,8 @@ BeamSearchBatchConfig // } assert(new_bc.beamRequestsInfo[i].sub_request_num <= - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && + "exceed maximum nodes per layer"); if (request.status == Request::RUNNING) { new_bc.beamRequestsInfo[i].current_depth = @@ -1144,7 +1149,7 @@ 
BeamSearchBatchConfig // register more tokens due to the beam width - //copy metadata + // copy metadata memcpy(&new_bc.causalMask[i], &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); @@ -1185,9 +1190,6 @@ BeamSearchBatchConfig // if(new_bc.beamRequestsInfo[i].current_depth >= 3 && i > 0){ // assert(false); // } - - - } } @@ -1238,7 +1240,8 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].sub_request_num; assert(new_bc.beamRequestsInfo[i].sub_request_num <= - BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES); + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && + "exceed maximum nodes per layer"); // update the parentid, accumalated_probs, depth, and token_ids @@ -1504,6 +1507,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() << "\n"; + bool cutLayer = false; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); @@ -1520,11 +1524,27 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == get_max_tokens_per_batch() - 1) { + if (new_bc.num_tokens == get_max_tokens_per_batch() && + (j != dfs_tree_inputs.size() - 1)) { + cutLayer = true; break; } } + // delete the last incomplete layer + if (cutLayer) { + int total_tokens = new_bc.num_tokens; + for (int j = total_tokens - 1; j >= 1; j--) { + new_bc.num_tokens--; + new_bc.requestsInfo[i].num_tokens_in_batch--; + std::cout << "cut: " << j << "\n"; + if (new_bc.tokensInfo[j].abs_depth_in_request != + new_bc.tokensInfo[j - 1].abs_depth_in_request) { + break; + } + } + } + } else if (request.status == Request::PENDING) { std::cout << "prepare next batch verify: pending\n" << "\n"; @@ -1646,6 +1666,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } + std::cout << "how many tokens in verify? 
" << new_bc.num_tokens << "\n"; + std::cout << "check dfs tree input size: " << dfs_tree_inputs[1000000].size() << "\n"; @@ -1673,7 +1695,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != guid) { - std::cout << "i is: " << i << "old guid" << guid << " new guid" << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid <<"\n"; + std::cout << "i is: " << i << "old guid" << guid << " new guid" + << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] + .request_guid + << "\n"; int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; @@ -1689,18 +1714,19 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // Count tokens sent to model in this request to find the final token's // index - std::cout << "previous result index: "<< result_index; + std::cout << "previous result index: " << result_index; result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_size; - - std::cout << "after result index: "<< result_index; + + std::cout << "after result index: " << result_index; // if (true) { // std::cout << "i = " << i << ", result index = " << result_index // << ", value: " << result.token_ids[result_index] - // << ", leaf node num: " << leaf_node_num << ", depth" << depth + // << ", leaf node num: " << leaf_node_num << ", depth" << + // depth // << ", beam size: " << beam_size << "\n"; // } @@ -1765,7 +1791,8 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, if (i < old_bc.num_tokens) { int new_req_idx = old_bc.tokensInfo[i].request_index; guid = old_bc.requestsInfo[new_req_idx].request_guid; - std::cout << "update guid: " << guid << ", request idx: " << index<< "\n"; + std::cout << "update guid: " << guid << ", request idx: " << index + << "\n"; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } @@ -2082,12 +2109,42 @@ std::vector> // In this case the inputSeriedTree ends with padding 0s assert(inputSerializedTree.size() >= outputSerializedTree.size()); + int *treeLayers = new int[inputSerializedTree.size()]; + int node_num = 1; + int layer_num = 0; + for (int token_id = 0; token_id < inputSerializedTree.size(); token_id++) { + if (token_id == (inputSerializedTree.size() - 1) || + inputSerializedTree.at(token_id + 1).second != + inputSerializedTree.at(token_id).second) { + treeLayers[layer_num] = node_num; + layer_num += 1; + node_num = 1; + } else { + node_num++; + } + } + + // to avoid branch switch when same tokens in input tree. + + bool findFirst = false; + layer_num = -1; + int first_layer_slot = 0; + int first_layer_slot_total = 0; + int processed_whole_layer_tokens = 0; + for (int i = 0; i < outputSerializedTree.size(); i++) { auto input = inputSerializedTree.at(i); auto output = outputSerializedTree.at(i); + if (i == 0 || inputSerializedTree.at(i - 1).second != + inputSerializedTree.at(i).second) { + layer_num += 1; + processed_whole_layer_tokens += i == 0 ? 
0 : treeLayers[layer_num - 1]; + } + if (i == 0) { verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); // > if (input.first == verifiedTree.back().first && input.second == verifiedTree.back().second) { - verifiedTree.push_back(output); - new_committed_tokens.push_back(std::make_pair( - input.second, - committed_tokens.at(guid).at(i).second)); // + if (findFirst) { + // must in this branch. + int layer_slot = i - processed_whole_layer_tokens; + int layer_slot_total = treeLayers[layer_num]; + if ((first_layer_slot == layer_slot)) { + verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( + input.second, committed_tokens.at(guid).at(i).second)); + // at this point, you'll not go other branches + std::cout << "verify tree push back: " << output.first + << ", tree size is: " << verifiedTree.size() + << ", ??: " << input.first << ", " << input.second << "\n"; + + } else { + printf("not correct slot\n"); + } + } else { + verifiedTree.push_back(output); + first_layer_slot = i - processed_whole_layer_tokens; + first_layer_slot_total = treeLayers[layer_num]; + findFirst = true; + new_committed_tokens.push_back(std::make_pair( + input.second, + committed_tokens.at(guid).at(i).second)); // + // at this point, you'll not go other branches + std::cout << "verify tree push back: " << output.first + << ", tree size is: " << verifiedTree.size() + << ", ??: " << input.first << ", " << input.second << "\n"; + } + assert(committed_tokens.at(guid).at(i).first == input.second); } } diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index e8824feda5..bb6b6030aa 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -57,6 +57,92 @@ void RequestManager::load_tokens_task( cudaMemcpyHostToDevice, stream)); + // // copy meta data to workSpace + // FFHandler handle = *((FFHandler const *)task->local_args); + // size_t total_copy_size = 0; + // cudaMemcpyAsync(handle.batch_config_metadata, + // &(batch_config->tokensInfo), + // sizeof(BatchConfig::tokensInfo), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(BatchConfig::tokensInfo); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(batch_config->requestsInfo), + // sizeof(BatchConfig::requestsInfo), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(BatchConfig::requestsInfo); + + // // load speculative metadata + // if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + // BeamSearchBatchConfig const *beam_batch_config = + // static_cast(batch_config); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(beam_batch_config->beamTokenInfo), + // sizeof(BeamSearchBatchConfig::beamTokenInfo), + // cudaMemcpyHostToDevice, + // stream); + + // total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(beam_batch_config->beamRequestsInfo), + // sizeof(BeamSearchBatchConfig::beamRequestsInfo), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(beam_batch_config->causalMask), + // sizeof(BatchConfig::causalMask), + // cudaMemcpyHostToDevice, + // stream); + + // total_copy_size += sizeof(BatchConfig::causalMask); + // } 
else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + // TreeVerifyBatchConfig const *tree_batch_config = + // static_cast(batch_config); + + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(tree_batch_config->causalMask), + // sizeof(BatchConfig::causalMask), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(BatchConfig::causalMask); + // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + // total_copy_size, + // &(tree_batch_config->committed_tokens), + // sizeof(TreeVerifyBatchConfig::committed_tokens), + // cudaMemcpyHostToDevice, + // stream); + // total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + // } + + // // add a size check + // std::cout << "handle.batch_config_metadata_size: " << handle.batch_config_metadata_size << ", "<< total_copy_size << "\n"; + // assert(total_copy_size <= handle.batch_config_metadata_size); +} + +void RequestManager::load_batch_config_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 0); + assert(task->regions.size() == 0); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); size_t total_copy_size = 0; @@ -126,6 +212,7 @@ void RequestManager::load_tokens_task( } // add a size check + std::cout << "hahaha handle.batch_config_metadata_size: " << handle.batch_config_metadata_size << ", "<< total_copy_size << "\n"; assert(total_copy_size <= handle.batch_config_metadata_size); } From 6c442593976ebc7efa6a50087a486ee613616a74 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 30 Dec 2023 13:06:37 -0500 Subject: [PATCH 13/61] Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op --- include/flexflow/config.h | 1 + src/ops/embedding.cc | 18 ++++++------------ src/runtime/model.cc | 31 ++++++++++++++++++++----------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index c2af6d707c..01f318c6d5 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -132,6 +132,7 @@ class FFConfig { size_t workSpaceSize; Legion::Context lg_ctx; Legion::Runtime *lg_hlr; + Legion::IndexSpaceT<1> all_gpu_task_is; // Legion::FieldSpace field_space; bool syntheticInput, profiling, perform_fusion; bool inference_debugging; diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 007e799fe0..76236e65ff 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -155,11 +155,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } else { int num_dims = input->num_dims; @@ -170,11 +167,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = this->out_channels; 
output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } // const int REPLICA = this->output_vocab_size_replica_dim(); @@ -189,13 +183,13 @@ int Embedding::weight_size(ParallelDim weight_dims[MAX_TENSOR_DIM]) { weight_dims[Weight::VOCAB_SIZE].size = this->num_entries; weight_dims[Weight::VOCAB_SIZE].degree = 1; weight_dims[Weight::VOCAB_SIZE].parallel_idx = -1; - for (int i = 2; i < input->num_dims; i++) { + for (int i = 2; i < input->num_dims + 1; i++) { weight_dims[i].size = input->dims[i - 1].degree; weight_dims[i].degree = weight_dims[i].size; weight_dims[i].parallel_idx = input->dims[i - 1].parallel_idx; weight_dims[i].is_replica_dim = true; } - return input->num_dims; + return input->num_dims + 1; } void Embedding::register_output_mappings() { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 92f0cff472..975045cd3b 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1499,10 +1499,8 @@ FFRuntime::FFRuntime(FFConfig &config) { Context ctx = config.lg_ctx; ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(config.workersPerNode * config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); - + Domain domain = runtime->get_index_space_domain(ctx, config.all_gpu_task_is); + Rect<1> task_rect = domain; // int rank = 0; for (PointInRectIterator<1> it(task_rect); it(); it++) { FFInitInfo info; @@ -1518,7 +1516,7 @@ FFRuntime::FFRuntime(FFConfig &config) { // Init CUDA library on each worker IndexLauncher initLauncher(FF_INIT_TASK_ID, - task_is, + config.all_gpu_task_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, @@ -2993,6 +2991,12 @@ Op *FFModel::create_operator_from_layer( dims[num_dims].degree = 1; dims[num_dims].parallel_idx = -1; dims[num_dims].is_replica_dim = true; + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1) { + dims[num_dims].size *= config.tensor_parallelism_degree; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } // create_parallel_tensor adds an NoOp into operators ParallelTensor pt = create_parallel_tensor_legion_ordering(num_dims + 1, @@ -3002,6 +3006,7 @@ Op *FFModel::create_operator_from_layer( 0, true /*gradients*/, tensor->tensor_guid); + assert(pt->get_shape().is_valid()); // assert that this tensor hasn't been mapped before assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; @@ -3260,12 +3265,12 @@ void FFModel::create_operators_from_layers() { if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && l->op_type == OP_EMBEDDING) { assert(op->numOutputs == 1); - Replicate *repl = new Replicate(*this, - op->outputs[0], - op->outputs[0]->num_dims - 1, - config.tensor_parallelism_degree); - operators.push_back(repl); - op = repl; + // Replicate *repl = new Replicate(*this, + // op->outputs[0], + // op->outputs[0]->num_dims - 1, + // config.tensor_parallelism_degree); + // operators.push_back(repl); + // op = repl; } else if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || @@ -4076,6 
+4081,10 @@ FFConfig::FFConfig() { Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; lg_ctx = Runtime::get_context(); + Rect<1> task_rect(Point<1>(0), Point<1>(workersPerNode * numNodes - 1)); + // Create an index space for tasks running on all GPUs + all_gpu_task_is = runtime->create_index_space(lg_ctx, task_rect); + // field_space = runtime->create_field_space(lg_ctx); } From ac112037a8e88193d3377684ae2821d253551c2d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 15:09:19 -0500 Subject: [PATCH 14/61] more fix. --- include/flexflow/batch_config.h | 3 + src/ops/inc_multihead_self_attention.cu | 13 ++-- src/ops/kernels/embedding_kernels.cu | 2 +- .../specinfer_inc_multihead_self_attention.cu | 58 ++++++++--------- src/ops/tree_inc_multihead_self_attention.cu | 42 ++++++------ src/runtime/request_manager.cc | 65 ++++++++++--------- 6 files changed, 98 insertions(+), 85 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index c3a75e59a4..8065e0f038 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -69,6 +69,9 @@ class BatchConfig { int first_token_offset_in_batch; int num_tokens_in_batch; int max_sequence_length; + + //request id in batch config: + int batch_config_request_id; RequestGuid request_guid; }; struct PerTokenInfo { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 2f16dd71c2..3b3879e8e5 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -82,6 +82,9 @@ __global__ void compute_attention_kernel_generation_kernel( // request idx int const request_idx = blockIdx.y; + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + int const beam_request_idx = is_beam ? request_idx / max_beam_width : request_idx; int const beam_sub_request_idx = is_beam ? request_idx % max_beam_width : 0; @@ -89,8 +92,8 @@ __global__ void compute_attention_kernel_generation_kernel( int const first_step = 0; int const tlength = - request_infos[beam_request_idx].first_token_depth_in_request + - request_infos[beam_request_idx].num_tokens_in_batch; + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; // shared memory objects extern __shared__ char smem_[]; @@ -103,7 +106,7 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + beam_request_idx * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = @@ -139,7 +142,7 @@ __global__ void compute_attention_kernel_generation_kernel( DT const *k_cache_batch = key_cache + - (beam_request_idx * max_beam_width + beam_sub_request_idx) * + (batch_config_request_id * max_beam_width + beam_sub_request_idx) * max_seq_length * hidden_size + ki; @@ -245,7 +248,7 @@ __global__ void compute_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. 
DT const *v_cache_batch = value_cache + - (beam_request_idx * max_beam_width + beam_sub_request_idx) * + (batch_config_request_id * max_beam_width + beam_sub_request_idx) * max_seq_length * hidden_size + vi; diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 3085fdb6ba..6947be432e 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,7 +118,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } - print_tensor(input.get_int32_ptr(), 200, "embeddinginput"); + // print_tensor(input.get_int32_ptr(), 200, "embeddinginput"); } /*static*/ diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu index e84ec3095c..8340519ff3 100644 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ b/src/ops/specinfer_inc_multihead_self_attention.cu @@ -69,36 +69,43 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( int const tidx = threadIdx.x; // head id int const head_idx = blockIdx.x; - // request idx + // nth request idx int const request_idx = blockIdx.y; - BatchConfig::BitMask bitmask = causalMask[request_idx]; + // request id in batch config + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + // request_idx = re + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; int const first_step = 0; - int const tlength = request_infos[request_idx].first_token_depth_in_request + - request_infos[request_idx].num_tokens_in_batch; + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("specinfer attn fused kernel %lld\n", bitmask.mask[1]); - // } - + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("specinfer attn fused kernel!!!\n"); + } int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("specinfer attn fused kernel %d, %d\n", - // totalCacheSize,request_infos[request_idx].num_tokens_in_batch); - // } + if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + printf("specinfer attn fused kernel %d, %d\n", + totalCacheSize, + request_infos[batch_config_request_id].num_tokens_in_batch); + } // int const qlength = request_infos[request_idx].num_tokens_in_batch; - int const tree_branch_num = beam_request_infos[request_idx].sub_request_num; + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; // will decode qlength tokens in this thread block // int const qlength = tree_branch_num; int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { - // first_token_idx += request_infos[request_idx].num_tokens_in_batch; first_token_idx += causalMask[r].this_layer_size; } @@ -135,7 +142,7 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + request_idx * max_seq_length * hidden_size + ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -166,10 +173,6 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( } float qk = scale * 
Qk_dot::dot(q_vecs[ki_o], k); - // if (blockIdx.y == 0 && blockIdx.x == 0) { - // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, sub_req_idx); - // } - if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { // todo add alobi here // bool const mask = ti_circ >= totalCacheSize; @@ -177,14 +180,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << query_token)))); - // if (blockIdx.y == 0 && blockIdx.x == 0 && sub_req_idx == 0) { - // printf("specinfer mask: ti:%d, %d, %d, %d, %lld\n", - // ti, - // totalCacheSize, - // bitmask.non_tree_cache_size, - // query_token, - // bitmask.mask[ti - bitmask.non_tree_cache_size]); - // // assert(false); + // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { + // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; @@ -271,7 +268,8 @@ __global__ void compute_specinfer_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + request_idx * max_seq_length * hidden_size + vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { @@ -461,6 +459,7 @@ void compute_specinfer_attention_kernel_generation( DT *output_ptr, cudaStream_t stream) { // one block == one head per request + printf("??? at here: %d\n", bc->num_active_requests()); dim3 grid(m->num_q_heads, bc->num_active_requests()); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; @@ -761,13 +760,14 @@ void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, // std::cout << "specinfer kernel token num: " << bc->num_generation_tokens // << ", " << bc->num_tokens << "\n"; if (bc->num_generation_tokens > 0) { + printf("spec inc generation decoding\n"); compute_specinfer_attention_kernel_generation
<DT>( m, bc, static_cast<DT *>
(m->attn_heads), stream); } // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 if (bc->num_tokens > bc->num_generation_tokens) { - // printf("spec inc prompt decoding\n"); + printf("spec inc prompt decoding\n"); compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 8641e63e38..a4329f52db 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -76,13 +76,16 @@ __global__ void compute_attention_kernel_fused_kernel( // request idx int const request_idx = blockIdx.y; + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + int const first_step = 0; - int const tlength = request_infos[request_idx].first_token_depth_in_request + - request_infos[request_idx].num_tokens_in_batch; - int const qlength = request_infos[request_idx].num_tokens_in_batch; + int const tlength = request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = request_infos[batch_config_request_id].num_tokens_in_batch; - BatchConfig::BitMask bitmask = causalMask[request_idx]; + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; // bitmask.mask[1] = 3; // if (head_idx == 0 && tidx == 0) { @@ -132,7 +135,7 @@ __global__ void compute_attention_kernel_fused_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + request_idx * max_seq_length * hidden_size + ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -189,14 +192,14 @@ __global__ void compute_attention_kernel_fused_kernel( qk_max = mask ? qk_max : fmaxf(qk_max, qk); - if (head_idx == 0 && qi == 0 && !mask) { - printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n ", - request_idx, - ti, - qk, - q_vecs[ki_o][0].x, - k[0].x); - } + // if (head_idx == 0 && qi == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n ", + // request_idx, + // ti, + // qk, + // q_vecs[ki_o][0].x, + // k[0].x); + // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } } @@ -279,7 +282,7 @@ __global__ void compute_attention_kernel_fused_kernel( // The base pointer for the value in the cache buffer. 
DT const *v_cache_batch = - value_cache + request_idx * max_seq_length * hidden_size + vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; // DT const *v_cache_batch = // value_cache + // (beam_request_idx * max_beam_width + beam_sub_request_idx) * @@ -481,8 +484,7 @@ __global__ void update_tree_branch_kv_cache_fused( int vProjSize, int num_new_tokens, int max_seq_len, - int hidden_size, - int first_token_depth) { + int hidden_size) { CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) { int token_idx = i / hidden_size; @@ -498,10 +500,11 @@ __global__ void update_tree_branch_kv_cache_fused( int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; + int const first_token_depth = request_infos[req_id].first_token_depth_in_request; // if(i % hidden_size == 0){ - // printf("update token request id: %d, %d, %d value%.10f\n", req_id, - // token_idx, request_token_offset, kVal); + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", req_id, + // token_idx, request_token_offset,(token_idx + first_token_depth - request_token_offset), kVal); // } kCache_ptr[req_id * (hidden_size * max_seq_len) + (token_idx + first_token_depth - request_token_offset) * @@ -890,8 +893,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, BatchConfig::max_sequence_length() + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, - m->hidden_size, - bc->requestsInfo[0].first_token_depth_in_request); + m->hidden_size); dim3 grid(m->num_q_heads, bc->num_active_requests()); int const per_head_size = m->qProjSize; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5c3262eb27..e30a7ee478 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -364,6 +364,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } int num_generation_tokens = 0; + int num_active_req = -1; // Step 2: prepare the next batch for existing requests BatchConfig new_bc; @@ -454,6 +455,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == request.tokens.size()) { // Incremental phase @@ -490,6 +493,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ -499,6 +503,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; new_bc.request_completed[i] = false; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; profile_info.llm_decoding_steps = 1; @@ -574,6 +580,7 @@ BeamSearchBatchConfig int result_index = 0; int num_generation_tokens = 0; + int num_active_req = -1; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { @@ -596,10 +603,11 @@ BeamSearchBatchConfig } else { 
committed_tokens[guid].clear(); } + // iterate through all the tokens that belong to request i int root_abs_depth = request.tokens.size() - 1; - + while (result_index < old_bc.num_tokens && old_bc.tokensInfo[result_index].request_index == i) { int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; @@ -639,7 +647,7 @@ BeamSearchBatchConfig traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", - verified_tokens.size()); + verified_tokens.size()); // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { @@ -723,8 +731,10 @@ BeamSearchBatchConfig std::cout << "parse to next iteration: " << "\n"; + new_bc.request_completed[i] = false; new_bc.request_running[i] = true; + num_active_req++; // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = @@ -735,6 +745,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = @@ -805,14 +816,15 @@ BeamSearchBatchConfig log_req_mgr.print("Output: %s", output.c_str()); } - if (request.tokens.size() > 19 && i >= 7) { - std::cout << request.tokens.size() << "\n"; - assert(false); - } + // if (request.tokens.size() > 19 && i >= 7) { + // std::cout << request.tokens.size() << "\n"; + // assert(false); + // } } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; + num_active_req++; std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", " << "initial_len: " << request.initial_len << std::endl; @@ -826,6 +838,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; @@ -867,6 +880,7 @@ BeamSearchBatchConfig Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; + num_active_req++; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ -875,6 +889,7 @@ BeamSearchBatchConfig (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; @@ -967,6 +982,8 @@ BeamSearchBatchConfig old_bc.print(); new_bc.print(); } + std::cout << "prepare next batch init active tokens: " + << new_bc.num_tokens << "\n"; return new_bc; } @@ -1027,10 +1044,12 @@ BeamSearchBatchConfig int num_generation_tokens = 0; // Add incremental tokens to the batch + int num_active_req = -1; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i] || !old_bc.request_running[i]) { continue; } + num_active_req ++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); 
@@ -1040,29 +1059,6 @@ BeamSearchBatchConfig // assert(processed_tokens < request.tokens.size()); log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - // if (processed_tokens > - // old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() && - // request.status == Request::RUNNING - // // || ir.results[t] == 0 TODO: replace this with - // ) { - // // log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", - // // old_bc.requestsInfo[i].request_guid, - // // old_bc.beamRequestsInfo[i].max_depth); - // // // new_bc.request_completed[i] = true; - // // new_bc.request_completed[i] = false; - // // new_bc.requestsInfo[i].first_token_depth_in_request = - // processed_tokens; - // // new_bc.requestsInfo[i].request_guid = - // // old_bc.requestsInfo[i].request_guid; - // // new_bc.requestsInfo[i].max_sequence_length = - // // old_bc.requestsInfo[i].max_sequence_length; - // // new_bc.beamRequestsInfo[i].current_depth = - // // old_bc.beamRequestsInfo[i].current_depth; - // // new_bc.request_running[i] = false; - // std::cout << "beam search end:" << request.status << i << ", " - // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; - // } - // else { log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; @@ -1073,6 +1069,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; profiling_requests[request.guid].ssm_decoding_steps += 1; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH @@ -1164,6 +1161,7 @@ BeamSearchBatchConfig // std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer // << "\n"; // } + std::cout << "append bit mask: "<< i << "\n"; appendBitMask(new_bc.causalMask[i], new_bc.beamRequestsInfo[i].sub_request_num, old_bc.beamRequestsInfo[i].beam_size, @@ -1198,6 +1196,7 @@ BeamSearchBatchConfig if (old_bc.request_completed[i] || old_bc.request_running[i]) { continue; } + num_active_req++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); @@ -1217,6 +1216,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata // how many sub request in current request @@ -1330,6 +1330,8 @@ BeamSearchBatchConfig // std::cout << "Current Beam DepthBBB: " // << old_bc.beamRequestsInfo[0].current_depth << "\n"; } + std::cout << "prepare next batch beam total tokens: " << new_bc.num_tokens + << "gneration tokens: " << new_bc.num_generation_tokens << "\n"; return new_bc; } @@ -1384,11 +1386,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( max_prompt_load_size -= 1; } } - + int num_active_req = -1; for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { if (old_batches.at(0).request_completed[i]) { continue; } + num_active_req++; size_t guid = old_batches.at(0).requestsInfo[i].request_guid; Request &request = all_requests[guid]; @@ -1432,6 +1435,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = 
old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // copy bitmask to verify batchconfig memcpy(&(new_bc.causalMask[i]), @@ -1590,6 +1594,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; From 7eaffbc480b05d674bbf465c903b2277f6240e0b Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 17:24:08 -0500 Subject: [PATCH 15/61] clean up --- include/flexflow/batch_config.h | 2 +- include/flexflow/ffconst.h | 1 - include/flexflow/model.h | 45 - include/flexflow/operator_params.h | 2 - .../ops/spec_inc_multihead_self_attention.h | 1 + .../specinfer_inc_multihead_self_attention.h | 150 --- ...nfer_inc_multihead_self_attention_params.h | 33 - include/flexflow/request_manager.h | 2 + inference/file_loader.cc | 3 +- inference/models/llama.cc | 2 +- src/ops/argmax.cc | 1 - src/ops/beam_topk.cc | 7 +- src/ops/beam_topk.cu | 39 +- src/ops/inc_multihead_self_attention.cu | 3 +- src/ops/kernels/embedding_kernels.cu | 1 - src/ops/spec_inc_multihead_self_attention.cc | 12 +- src/ops/spec_inc_multihead_self_attention.cu | 1011 +++++++++++------ .../specinfer_inc_multihead_self_attention.cc | 883 -------------- .../specinfer_inc_multihead_self_attention.cu | 958 ---------------- .../tree attn kernel, 0----> -0.029753357172 | 1 - src/ops/tree_inc_multihead_self_attention.cu | 122 +- src/runtime/ffconst_utils.cc | 2 - src/runtime/graph.cc | 71 +- src/runtime/inference_manager.cc | 8 +- src/runtime/model.cc | 149 +-- src/runtime/model.cpp | 4 +- src/runtime/model.cu | 5 +- src/runtime/request_manager.cc | 288 ++--- src/runtime/request_manager.cu | 1 - 29 files changed, 835 insertions(+), 2972 deletions(-) delete mode 100644 include/flexflow/ops/specinfer_inc_multihead_self_attention.h delete mode 100644 include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h delete mode 100644 src/ops/specinfer_inc_multihead_self_attention.cc delete mode 100644 src/ops/specinfer_inc_multihead_self_attention.cu delete mode 100644 src/ops/tree attn kernel, 0----> -0.029753357172 diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 8065e0f038..13904aaa46 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -70,7 +70,7 @@ class BatchConfig { int num_tokens_in_batch; int max_sequence_length; - //request id in batch config: + // request id in batch config: int batch_config_request_id; RequestGuid request_guid; }; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index ef0003b08e..512645e624 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -171,7 +171,6 @@ enum OperatorType { OP_INC_MULTIHEAD_SELF_ATTENTION, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // Parallel Ops OP_REPARTITION, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 9cdbec64a9..16df99ab1a 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -172,8 +172,6 @@ enum TaskIDs { SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, - 
SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, FUSEDOP_FWD_TASK_ID, @@ -327,7 +325,6 @@ class Linear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; -class SpecInferIncMultiHeadSelfAttention; class Pool2D; class Reduce; class Reshape; @@ -747,25 +744,6 @@ class FFModel { bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); - -Tensor specinfer_inc_multihead_self_attention( - const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); Tensor inc_multiquery_self_attention(const Tensor input, int embed_dim, int num_q_heads, @@ -822,26 +800,6 @@ Tensor specinfer_inc_multihead_self_attention( bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); - - Tensor specinfer_inc_multiquery_self_attention( - const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); // ======================================== // Inference APIs // ======================================== @@ -1243,9 +1201,6 @@ Tensor specinfer_inc_multihead_self_attention( std::unordered_map< std::pair, TreeIncMultiHeadSelfAttention *>, - std::unordered_map< - std::pair, - SpecInferIncMultiHeadSelfAttention *>, std::unordered_map, Reduce *>, std::unordered_map, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index cee2ae95a4..5b187839ef 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -37,7 +37,6 @@ #include "flexflow/ops/topk_params.h" #include "flexflow/ops/transpose_params.h" #include "flexflow/ops/tree_inc_multihead_self_attention_params.h" -#include "flexflow/ops/specinfer_inc_multihead_self_attention_params.h" #include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" @@ -73,7 +72,6 @@ using OperatorParameters = mp::variant -#include - -namespace FlexFlow { - -class SpecInferIncMultiHeadSelfAttentionMeta; - -class SpecInferIncMultiHeadSelfAttention : public Op { -public: - using Params = SpecInferIncMultiHeadSelfAttentionParams; - using Input = ParallelTensor; - - SpecInferIncMultiHeadSelfAttention(FFModel &model, - LayerID const &layer_guid, - const ParallelTensor _input, - int _embed_dim, - int _num_q_heads, - int _num_kv_heads, - int _kdim, - int _vdim, - float _dropout, - bool _qkv_bias, - bool _final_bias, - bool _add_zero_attn, - bool _apply_rotary_embedding, - bool _scaling_query, - float _scaling_factor, - bool _qk_prod_scaling, - bool _position_bias, - bool allocate_weights, - char const *name); - SpecInferIncMultiHeadSelfAttention(FFModel &model, - const ParallelTensor _input, - const ParallelTensor 
_weight, - int _embed_dim, - int _num_q_heads, - int _num_kv_heads, - int _kdim, - int _vdim, - float _dropout, - bool _qkv_bias, - bool _final_bias, - bool _add_zero_attn, - bool _apply_rotary_embedding, - bool _scaling_query, - float _scaling_factor, - bool _qk_prod_scaling, - bool _position_bias, - bool allocate_weights, - char const *name); - SpecInferIncMultiHeadSelfAttention(FFModel &model, - SpecInferIncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights); - SpecInferIncMultiHeadSelfAttention(FFModel &model, - Params const ¶ms, - Input const &inputs, - bool allocate_weights = false, - char const *name = nullptr); - static Op * - create_operator_from_layer(FFModel &model, - Layer const *layer, - std::vector const &inputs); - void init(FFModel const &) override; - void init_inference(FFModel const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; - void forward(FFModel const &) override; - void backward(FFModel const &) override; - Legion::FutureMap inference(FFModel const &, - BatchConfigFuture const &, - std::vector const &, - std::vector const &, - MachineView const *mv = nullptr) override; - void print_layer(FFModel const &model) override { - assert(0); - } - bool get_int_parameter(PMParameter, int *) const override; - - static OpMeta *init_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - static void inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - Op *materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const override; - bool measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const override; - - static void - inference_kernel_wrapper(SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); - Params get_params() const; - -public: - int num_q_heads, num_kv_heads, tensor_parallelism_degree; - float dropout, scaling_factor; - bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; - int qoSeqLength, kvSeqLength; -}; - -class SpecInferIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { -public: - SpecInferIncMultiHeadSelfAttentionMeta(FFHandler handler, - SpecInferIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, - MemoryAllocator &gpu_mem_allocator, - int num_samples, - int _num_q_heads, - int _num_kv_heads); - ~SpecInferIncMultiHeadSelfAttentionMeta(void); - -public: - Realm::RegionInstance beam_search_reserve_inst; - BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; - BatchConfig::BitMask *causalMask; -}; - -}; // namespace FlexFlow - -#endif // _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_H diff --git a/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h b/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h deleted file mode 100644 index b57b06a7f7..0000000000 --- a/include/flexflow/ops/specinfer_inc_multihead_self_attention_params.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef 
_FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H -#define _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H - -#include "flexflow/ffconst.h" -#include "flexflow/fftype.h" -#include "flexflow/parallel_tensor.h" - -namespace FlexFlow { - -struct SpecInferIncMultiHeadSelfAttentionParams { - LayerID layer_guid; - int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; - float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; - - bool is_valid(ParallelTensorShape const &) const; -}; - -bool operator==(SpecInferIncMultiHeadSelfAttentionParams const &, - SpecInferIncMultiHeadSelfAttentionParams const &); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash { - size_t - operator()(FlexFlow::SpecInferIncMultiHeadSelfAttentionParams const &) const; -}; -} // namespace std - -#endif // _FLEXFLOW_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 8cb45e55b4..1c4b0b2a2f 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -231,6 +231,8 @@ class RequestManager { int max_requests_per_batch; int max_tokens_per_batch; int max_sequence_length; + + // tree width in each speculative step, if not specified 1 std::vector spec_infer_tree_width; // private fields std::unique_ptr tokenizer_; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index 3f70ddf488..7c6870d439 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -726,8 +726,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION) { + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { if (weight_filename.find("self_attention") != std::string::npos) { load_attention_weights_multi_query( data, weight_filename, weights_folder, hidden_dim, num_heads); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 4f76e9e0fa..10001ee916 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -90,7 +90,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.specinfer_inc_multihead_self_attention( + mha = ff.spec_inc_multihead_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index d195a5af75..c3bb3d493e 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -399,7 +399,6 @@ InferenceResult m, shard_id, bc, {}, {}, {input, indices}); } - // print_tensor(indices.get_int32_ptr(), 199, "tree attn output"); download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 5dfaae41ee..87d357b535 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -370,14 +370,10 @@ BeamInferenceResult Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - - printf("----------1-----------\n"); + int *index_ptr = index.get_int32_ptr(); - printf("----------2-----------\n"); float *value_ptr = value.get_float_ptr(); - printf("----------3-----------\n"); int *parent_ptr = parent.get_int32_ptr(); - printf("----------4-----------\n"); // embedding size: eg. 
4096 int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; @@ -404,7 +400,6 @@ BeamInferenceResult // print_tensor(index_ptr, 32, "indexxxxxxx"); - if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index d647fe9ed7..a958786be3 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -379,9 +379,9 @@ template __global__ void mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) { using T_ACC = T; - int64_t const i = blockIdx.x; + const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - int64_t const index = i * N + j; + const int64_t index = i * N + j; Y[index] = static_cast(X[index]) * static_cast(rstd[i]); } } @@ -556,7 +556,6 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int beam_size = bc->beamRequestsInfo[i].beam_size; // initial request - std::cout << "sub_requests: " << i << ", " << sub_requests[i] << "\n"; assert(sub_requests[i] > 0); // process sub requests for (int j = 0; j < sub_requests[i]; j++) { @@ -564,12 +563,13 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, // beam_slots[i].parent_id[j]; acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = bc->beamRequestsInfo[i].probs[j]; - std::cout << "probbbb req: " << i << ", sub req probability : " - << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << j - << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] - << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j - << "\n"; + // std::cout << "probbbb req: " << i << ", sub req probability : " + // << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << + // j + // << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] + // << ", data inddd" + // << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j + // << "\n"; } // process tokens @@ -584,7 +584,6 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); max_beam_width = std::max(max_beam_width, beam_size); - std::cout << "max beam width: " << max_beam_width << "\n"; req_index += 1; block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; } @@ -625,23 +624,23 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, cudaMemcpyHostToDevice, stream)); // trick, set acc_probs to 0; - checkCUDA( - cudaMemsetAsync(m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); + checkCUDA(cudaMemsetAsync( + m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); checkCUDA(cudaMemcpyAsync(m->block_start_index, beam_block_start_index.data(), sizeof(int) * beam_num_blocks, cudaMemcpyHostToDevice, stream)); checkCUDA(cudaMemcpyAsync(m->request_id, - request_id.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); + request_id.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); checkCUDA(cudaMemcpyAsync(m->tokens_per_request, - tokens_per_request.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice, - stream)); + tokens_per_request.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); // int depth = // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; beam_num_blocks = bc->num_active_tokens(); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 3b3879e8e5..cca0b230c3 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ 
b/src/ops/inc_multihead_self_attention.cu @@ -106,7 +106,8 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + + batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu index 6947be432e..22d8161ff1 100644 --- a/src/ops/kernels/embedding_kernels.cu +++ b/src/ops/kernels/embedding_kernels.cu @@ -118,7 +118,6 @@ void forward_kernel_wrapper(EmbeddingMeta const *m, // print_tensor(output_ptr, output_domain.get_volume(), // "[Embedding:forward:output]"); } - // print_tensor(input.get_int32_ptr(), 200, "embeddinginput"); } /*static*/ diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index eb6fd721e6..5d234df822 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -53,7 +53,7 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid( } Tensor - FFModel::spec_inc_multihead_self_attention(const Tensor input, + FFModel::spec_inc_multihead_self_attention(Tensor const input, int embed_dim, int num_heads, int kdim, @@ -91,7 +91,7 @@ Tensor } Tensor - FFModel::spec_inc_multiquery_self_attention(const Tensor input, + FFModel::spec_inc_multiquery_self_attention(Tensor const input, int embed_dim, int num_q_heads, int num_kv_heads, @@ -257,7 +257,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -358,8 +358,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -465,7 +465,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, SpecIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : SpecIncMultiHeadSelfAttention(model, other.layer_guid, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 29e3d9a48d..b3a87fe244 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -23,16 +23,295 @@ namespace FlexFlow { +#define WARP_SIZE 32 + // declare Legion names using Legion::coord_t; using Legion::Memory; using namespace Kernels::IncMultiHeadAttention; namespace Kernels { -namespace SpecIncMultiHeadAttention { +namespace SpecIncMultiHeadSelfAttention { + +template +__global__ void compute_spec_inc_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo 
*beam_request_infos, + BatchConfig::BitMask *causalMask) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // nth request idx + int const request_idx = blockIdx.y; + + // request id in batch config + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + // request_idx = re + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + + int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn fused kernel %d, %d\n", + // totalCacheSize, + // request_infos[batch_config_request_id].num_tokens_in_batch); + // } + // int const qlength = request_infos[request_idx].num_tokens_in_batch; + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; + + // will decode qlength tokens in this thread block + // int const qlength = tree_branch_num; + + int first_token_idx = 0; + for (int r = 0; r < request_idx; r++) { + first_token_idx += causalMask[r].this_layer_size; + } + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. 
+ constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < tree_branch_num; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + + int const query_token = bitmask.tree_size - tree_branch_num + qi; + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < totalCacheSize) { + + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + // bool const mask = ti_circ >= totalCacheSize; + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { + // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); + // } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn first token qk_max %.10f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. 
+ int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. 
+ if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + } + } +} template -__global__ void spec_store_kv_cache( +__global__ void spec_inc_store_kv_cache( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, @@ -40,16 +319,16 @@ __global__ void spec_store_kv_cache( BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, int vProjSize, int num_tokens, int max_seq_len, - int max_beam_width, bool is_root, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / (hidden_size); int offset = i % hidden_size; size_t val_idx = @@ -58,100 +337,36 @@ __global__ void spec_store_kv_cache( DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; - // above no need to be changed - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - // int const sub_req_id = id_map[token_idx].sub_request_index; - // int const parent_id = id_map[token_idx].parent_id; - // int const beam_depth = id_map[token_idx].beam_depth; - // int const beam_width = id_map[token_idx].beam_width; - int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int const first_token_in_req = + requestInfo[req_id].first_token_depth_in_request; int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; - int const beam_depth = beamRequestInfos[req_id].current_depth; - int const beam_width = beamRequestInfos[req_id].beam_size; - - kCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - - // replica in the root iteration - if (beam_depth == 1) { - for (int i = 1; i < beam_width; i++) { - kCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - } - } + int const total_token = requestInfo[req_id].num_tokens_in_batch; - // if (head_idx == 0 && beam_depth == 0 && token_idx == 8 && k_cache) { - // // printf("token idx %d\n", token_idx); - // printf("data idx: %d, tok_id %d, new_token_cache_idx %d, parent_id %d, - // " - // "sub_req_id %d, num_tokens %d, kProjSize %d, num_kv_heads %d, - // val " - // "%f, beam_width %d\n", - // data_idx, - // tok_id, - // new_token_cache_idx, - // parent_id, - // sub_req_id, - // num_tokens, - // kProjSize, - // num_kv_heads, - // val, - // beam_width); - // } + int const request_token_offset = + requestInfo[req_id].first_token_offset_in_batch; - // naive cache stealing - if (sub_req_id != parent_id) { - // if (offset == 0 && tok_id == 0) { - // printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " - // "%d, tok_id %d\n", - // beam_depth, - // req_id, - // sub_req_id, - // parent_id, - // tok_id); - // } - - for (int depth = 0; depth < beam_depth; depth++) { - 
int steal_token_idx = tok_id - beam_depth + depth; - int steal_from_idx = (req_id * max_beam_width + parent_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - int steal_to_idx = (req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - kCache_ptr[steal_to_idx] = kCache_ptr[steal_from_idx]; - vCache_ptr[steal_to_idx] = vCache_ptr[steal_from_idx]; - - // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ - // printf("cache stealing kernel!, steal_token_idx %d\n", - // steal_token_idx); - // } - } - } + BatchConfig::BitMask bitmask = causalMask[req_id]; - // parallel cache stealing not yet implemented - // logic shld be - // launch spec_store_kv_cache with parallelism * current depth - // from the i here, get depth index - // if depth index not the current one, check if we need to steal - // steal if needed - - // cache stealing theory - // identify which sub request does this token come from - // for initial token, 0 - // for other, may 0,0,1/ 0,1,2/ 1,1,1 to get which cache to be reuse and - // which to be delete copy beam_size bunch of blocks when sub_req_id == - // parent_id : like 0 -> 0, 1->1, 2->2, do nothing, just append the new k/v + int const sub_request_num = beamRequestInfos[req_id].sub_request_num; + + int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; + + // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - + // tree_branch_num + sub_req_id + tok_id; + // bitmask.tree_size - tree_branch_num + sub_req_id; + + // if prompt token -> token id + // if tree token: + int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - + bitmask.this_layer_size + token_idx - + request_token_offset; + + kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = vVal; } } @@ -161,28 +376,79 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { int num_tokens = bc->num_active_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; - // printf("curr depth: %d\n", curr_depth); - // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - spec_store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray),
-                                          static_cast
<DT *>(m->keyCache),
-                                          static_cast
<DT *>(m->valueCache),
-                                          m->token_infos,
-                                          m->request_infos,
-                                          m->beam_token_infos,
-                                          m->beam_request_infos,
-                                          m->qProjSize,
-                                          m->kProjSize,
-                                          m->vProjSize,
-                                          num_tokens,
-                                          BatchConfig::max_sequence_length(),
-                                          BeamSearchBatchConfig::MAX_BEAM_WIDTH,
-                                          /*root*/ curr_depth == 0,
-                                          m->hidden_size);
+    spec_inc_store_kv_cache<<<GET_BLOCKS(parallelism),
+                              min(CUDA_NUM_THREADS, parallelism),
+                              0,
+                              stream>>>(
+        static_cast
<DT *>(m->devQKVProjArray),
+        static_cast
<DT *>(m->keyCache),
+        static_cast
<DT *>(m->valueCache),
+        m->token_infos,
+        m->request_infos,
+        m->beam_token_infos,
+        m->beam_request_infos,
+        m->causalMask,
+        m->qProjSize,
+        m->kProjSize,
+        m->vProjSize,
+        num_tokens,
+        BatchConfig::max_sequence_length() +
+            BatchConfig::MAX_SPEC_TREE_TOKEN_NUM,
+        /*root*/ curr_depth == 0,
+        m->hidden_size);
+  }
+}
+
+#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL(                               \
+    DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream)  \
+  smem_sz = smem_size_in_bytes                                                \
<DT>(m->qProjSize,                                                             \
+                               BatchConfig::max_sequence_length() +           \
+                                   BatchConfig::MAX_SPEC_TREE_TOKEN_NUM,      \
+                               THREADS_PER_VALUE,                             \
+                               THDS_PER_BLOCK);                               \
+  compute_spec_inc_attention_kernel_generation_kernel<DT,                     \
+                                                      Dh,                     \
+                                                      Dh_MAX,                 \
+                                                      THDS_PER_KEY,           \
+                                                      THREADS_PER_VALUE,      \
+                                                      THDS_PER_BLOCK>         \
+      <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(                            \
+          static_cast                                                         \
<DT *>(m->devQKVProjArray),                                                    \
+          static_cast                                                         \
<DT *>(m->keyCache),                                                           \
+          static_cast                                                         \
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->beam_request_infos, \ + m->causalMask) + +template +void compute_spec_inc_attention_kernel_generation( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // one block == one head per request + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); } } @@ -204,13 +470,14 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { +void compute_attention_kernel_prompt( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); @@ -236,199 +503,208 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int kt_req_block_size = kt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_req_block_size = vt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } - for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; + // else if (tokens_previous_requests < bc->num_generation_tokens) { + // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + // continue; + // } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; + // all requests in prompt phase should only have one sub requests; + assert(bc->sub_requests[i] == 1); + // int num_new_tokens = bc->num_processing_tokens[i]; + // int total_tokens = bc->token_last_available_idx[i] + 1; - if (num_new_tokens <= 0) { - continue; - } + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = 
bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize));
-      }
-      // To get A, skip over Q entries from previous requests (same head)
-      DT const *A = static_cast
<DT *>(m->devQKVProjArray) +
-                    bc->requestsInfo[i].first_token_offset_in_batch *
-                        m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM;
-      // To get B, skip over K entries from previous requests (all heads +
-      // padding)
-      DT const *B = static_cast
<DT *>(m->keyCache) +
-                    (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size;
-
-      // if (i == 0 && sub_req_id == 0 &&
-      //     bc->beam_slots.at(0).current_depth == 1) {
-      //   int offset = (float *)B - m->keyCache;
-      //   printf("key cache offset %d\n", kt_req_block_size);
-      // }
-      // To get C, skip over QK^T products from previous requests
-      DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - spec_fill_entries_above_diagonal<<>>( - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
<DT *>(m->valueCache) +
-          (i * bc->MAX_BEAM_WIDTH + sub_req_id) * vt_req_block_size;
-      // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous
-      // requests (all heads)
-      B = C_softmax;
-      // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous
-      // requests
-      C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; + if (num_new_tokens <= 0) { + continue; + } + + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize));
+    }
+    // To get A, skip over Q entries from previous requests (same head)
+    DT const *A = static_cast
<DT *>(m->devQKVProjArray) +
+                  bc->requestsInfo[i].first_token_offset_in_batch *
+                      m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM;
+    // To get B, skip over K entries from previous requests (all heads +
+    // padding)
+
+    // print_tensor((float*)A, 32, "A");
+    DT const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size;
+
+    // if (i == 0 && sub_req_id == 0 &&
+    //     bc->beam_slots.at(0).current_depth == 1) {
+    //   int offset = (float *)B - m->keyCache;
+    //   printf("key cache offset %d\n", kt_req_block_size);
+    // }
+    // To get C, skip over QK^T products from previous requests
+    DT *C = static_cast
(m->qk_prods) + + m->num_q_heads * tokens_prev_requests_squares; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // print_tensor((float*)C, 32, "C"); + // add alibi position bias to qk production + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); } + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + if (num_new_tokens > 1) { + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; + spec_fill_entries_above_diagonal<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + static_cast
<DT>(-INFINITY));
+    }
+    // Compute Softmax(QK^T/sqrt(d_k))
+    // Before modifying the parameters below, make sure to read the following
+    // description of the CUDNN_TENSOR_NCHW tensor layout, from
+    // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t:
+    // This tensor format specifies that the data is laid out in the following
+    // order: batch size, feature maps, rows, columns. The strides are
+    // implicitly defined in such a way that the data are contiguous in memory
+    // with no padding between images, feature maps, rows, and columns; the
+    // columns are the inner dimension and the images are the outermost
+    // dimension.
+    int n_param = m->num_q_heads;
+    int c_param = total_tokens;
+    int h_param = 1;
+    int w_param = num_new_tokens;
+    checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor,
+                                          CUDNN_TENSOR_NCHW,
+                                          cudnn_data_type,
+                                          n_param,
+                                          c_param,
+                                          h_param,
+                                          w_param));
+    float softmax_alpha = 1.0f, softmax_beta = 0.0f;
+    DT *C_softmax = static_cast
<DT *>(m->qk_prods_softmax) +
+                     m->num_q_heads * tokens_prev_requests_squares;
+    // The softmax operation below is executed according to the
+    // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The
+    // softmax operation is computed per spatial location (H,W) per image (N)
+    // across dimension C.
+    checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
+                                   CUDNN_SOFTMAX_ACCURATE,
+                                   CUDNN_SOFTMAX_MODE_CHANNEL,
+                                   &softmax_alpha,
+                                   m->qk_tensor,
+                                   C,
+                                   &softmax_beta,
+                                   m->qk_tensor,
+                                   C_softmax));
+    // Matmul softmax(QK^T/sqrt(d_k)) by V
+    alpha = 1.0f, beta = 0.0f;
+    m_ = m->vProjSize;
+    n = num_new_tokens;
+    k = total_tokens;
+    lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads;
+    strideA = vt_block_size;
+    strideB = num_new_tokens * total_tokens;
+    strideC = m->vProjSize;
+    // To get A, skip over V^T entries from previous requests (all heads +
+    // padding)
+    A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size;
+    // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous
+    // requests (all heads)
+    B = C_softmax;
+    // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous
+    // requests
+
+    // print_tensor((float*)C_softmax, 32, "C_softmax");
+    C = static_cast
(m->attn_heads) + + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; } // assert(tokens_previous_requests == num_tokens); @@ -443,31 +719,8 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { - // here because we need postion info in infernece 1 - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->beam_token_infos, - &(bc->beamTokenInfo), - bc->num_active_tokens() * bc->MAX_BEAM_WIDTH * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->beam_request_infos, - &(bc->beamRequestsInfo), - bc->max_requests_per_batch() * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), - cudaMemcpyHostToDevice, - stream); // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, bc, shard_id, @@ -479,7 +732,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 2: Update key/val cache update_kv_cache_kernel
<DT>(m, bc, stream);
   if (bc->num_generation_tokens > 0) {
-    compute_attention_kernel_generation
<DT>(
+    compute_spec_inc_attention_kernel_generation
<DT>(
+        m, bc, static_cast
(m->attn_heads), stream); } // phase 3: Compute attention score @@ -488,16 +741,14 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } - // compute output production and bias together for all tokens - int num_tokens = - bc->num_active_tokens() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + int num_tokens = bc->num_active_tokens(); compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } -} // namespace SpecIncMultiHeadAttention +} // namespace SpecIncMultiHeadSelfAttention } // namespace Kernels /*static*/ @@ -529,25 +780,27 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); } else if (input.data_type == DT_FLOAT) { float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -559,7 +812,8 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); - printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); + printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", + elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); @@ -606,44 +860,51 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t beam_tokeninfo_size = - max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); - size_t beam_requestinfo_size = - BeamSearchBatchConfig::max_requests_per_batch(); - size_t total_size = - beam_tokeninfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + - beam_requestinfo_size * - sizeof(BeamSearchBatchConfig:: - BeamSearchPerRequestInfo); // more components will - // be added here later - - // We always directly allocate memory for small speculative models - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); + // size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; + // size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); + // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, + // total_size); + beam_token_infos = - gpu_mem_allocator - .allocate_instance( - beam_tokeninfo_size); + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + 
sizeof(BatchConfig::requestsInfo)); + + beam_request_infos = + static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo)); + causalMask = static_cast( + handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + + // causalMask = gpu_mem_allocator.allocate_instance( + // causal_mask_size); + // beam_token_infos = + // gpu_mem_allocator + // .allocate_instance( + // beam_tokeninfo_size); // offset += beam_tokeninfo_size * // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); - beam_request_infos = - gpu_mem_allocator - .allocate_instance( - beam_requestinfo_size); + // beam_request_infos = + // gpu_mem_allocator + // .allocate_instance( + // beam_requestinfo_size); // offset += beam_requestinfo_size * // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); // assert(offset == total_size); - assert(gpu_mem_allocator.instance_total_size == - gpu_mem_allocator.instance_allocated_size); + // assert(gpu_mem_allocator.instance_total_size == + // gpu_mem_allocator.instance_allocated_size); } cudaStreamSynchronize(stream); } -SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { +SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta( + void) { if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { beam_search_reserve_inst.destroy(); } diff --git a/src/ops/specinfer_inc_multihead_self_attention.cc b/src/ops/specinfer_inc_multihead_self_attention.cc deleted file mode 100644 index 42074f39e4..0000000000 --- a/src/ops/specinfer_inc_multihead_self_attention.cc +++ /dev/null @@ -1,883 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" -#include "flexflow/ffconst_utils.h" -#include "flexflow/model.h" -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif -#include "flexflow/utils/hash_utils.h" -#include "legion/legion_utilities.h" - -namespace FlexFlow { - -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::Future; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; - -bool SpecInferIncMultiHeadSelfAttentionParams::is_valid( - ParallelTensorShape const &input) const { - bool is_valid = input.is_valid(); - return is_valid; -} - -Tensor FFModel::specinfer_inc_multihead_self_attention( - Tensor const input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - return specinfer_inc_multiquery_self_attention(input, - embed_dim, - num_heads, - num_heads, - kdim, - vdim, - dropout, - qkv_bias, - final_bias, - add_zero_attn, - data_type, - kernel_initializer, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); -} - -Tensor FFModel::specinfer_inc_multiquery_self_attention( - Tensor const input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - if (data_type == DT_NONE) { - data_type = input->data_type; - } - Layer *li = nullptr; - int weight_num = (qkv_bias || final_bias) ? 2 : 1; - if (data_type != input->data_type) { - Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); - li = new Layer(this, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, - data_type, - name, - 1 /*inputs*/, - weight_num /*weights*/, - 1 /*outputs*/, - casted_input); - } else { - li = new Layer(this, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, - data_type, - name, - 1 /*inputs*/, - weight_num /*weights*/, - 1 /*outputs*/, - input); - } - { - int numdims = input->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = input->dims[i]; - } - dims[0] = embed_dim; - li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, data_type, li, 0, true /*create_grad*/); - } - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? 
vProjSize : vSize); - int weight_size = qParas * num_q_heads + kParas * num_q_heads + - vParas * num_q_heads + oParas * num_q_heads; - { - int dims[1] = {weight_size}; - li->weights[0] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - if (qkv_bias || final_bias) { - // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; - li->weights[1] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - li->data_type = data_type; - li->add_int_property("embed_dim", embed_dim); - li->add_int_property("num_q_heads", num_q_heads); - li->add_int_property("num_kv_heads", num_kv_heads); - li->add_int_property("kdim", kdim); - li->add_int_property("vdim", vdim); - li->add_int_property("qkv_bias", qkv_bias); - li->add_int_property("final_bias", final_bias); - li->add_int_property("add_zero_attn", add_zero_attn); - li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); - li->add_int_property("scaling_query", scaling_query); - li->add_float_property("scaling_factor", scaling_factor); - li->add_int_property("qk_prod_scaling", qk_prod_scaling); - li->add_int_property("position_bias", position_bias); - layers.push_back(li); - return li->outputs[0]; -} - -Op *SpecInferIncMultiHeadSelfAttention::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - - std::cout << "spec create operator: " << layer->name << "\n"; - long long value; - layer->get_int_property("embed_dim", value); - int embed_dim = value; - layer->get_int_property("num_q_heads", value); - int num_q_heads = value; - layer->get_int_property("num_kv_heads", value); - int num_kv_heads = value; - layer->get_int_property("kdim", value); - int kdim = value; - layer->get_int_property("vdim", value); - int vdim = value; - float dropout; - layer->get_float_property("dropout", dropout); - layer->get_int_property("qkv_bias", value); - bool qkv_bias = (bool)value; - layer->get_int_property("final_bias", value); - bool final_bias = (bool)value; - layer->get_int_property("add_zero_attn", value); - bool add_zero_attn = (bool)value; - layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; - layer->get_int_property("scaling_query", value); - bool scaling_query = (bool)value; - float scaling_factor; - layer->get_float_property("scaling_factor", scaling_factor); - layer->get_int_property("qk_prod_scaling", value); - bool qk_prod_scaling = (bool)value; - layer->get_int_property("position_bias", value); - bool position_bias = (bool)value; - - return new SpecInferIncMultiHeadSelfAttention(model, - layer->layer_guid, - inputs[0], - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - qkv_bias, - final_bias, - add_zero_attn, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - false /*allocate_weights*/, - layer->name); -} - -SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( - FFModel &model, - LayerID const &_layer_guid, - ParallelTensor const _input, - int _embed_dim, - int _num_q_heads, - int _num_kv_heads, - int _kdim, - int _vdim, - float _dropout, - bool _qkv_bias, - bool _final_bias, - bool _add_zero_attn, - bool _apply_rotary_embedding, - 
bool _scaling_query, - float _scaling_factor, - bool _qk_prod_scaling, - bool _position_bias, - bool allocate_weights, - char const *name) - // Initializer* _bias_initializer) - : Op(model, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, - _input->data_type, - name, - 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, - 1 /*outputs*/, - _input), - num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), - add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { - // overwrite layer_guid - layer_guid = _layer_guid; - - numOutputs = 1; - int numdim = _input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i] = _input->dims[i]; - } - dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } - - outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* // Check correctness */ - /* assert(check_output_input_weight_parallel_dims()); */ -} - -SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( - FFModel &model, - ParallelTensor const _input, - ParallelTensor const _weight, - int _embed_dim, - int _num_q_heads, - int _num_kv_heads, - int _kdim, - int _vdim, - float _dropout, - bool _qkv_bias, - bool _final_bias, - bool _add_zero_attn, - bool _apply_rotary_embedding, - bool _scaling_query, - float _scaling_factor, - bool _qk_prod_scaling, - bool _position_bias, - bool allocate_weights, - char const *name) - // Initializer* _bias_initializer) - : Op(model, - OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION, - _input->data_type, - name, - 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, - 1 /*outputs*/, - _input, - _weight), - num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), - add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), - qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), - scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) -// bias_initializer(_bias_initializer) -{ - numOutputs = 1; - int numdim = _input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i] = _input->dims[i]; - } - dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = qParas + kParas + vParas + oParas; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } - - outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, dims, this->data_type, this); - - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ - /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ - // Check correctness - /* assert(check_output_input_weight_parallel_dims()); */ -} - -SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( - FFModel &model, - SpecInferIncMultiHeadSelfAttention const &other, - ParallelTensor const input, - bool allocate_weights) - : SpecInferIncMultiHeadSelfAttention(model, - other.layer_guid, - input, - other.oProjSize, - other.num_q_heads, - other.num_kv_heads, - other.qProjSize, - other.vProjSize, - other.dropout, - other.qkv_bias, - other.final_bias, - other.add_zero_attn, - other.apply_rotary_embedding, - other.scaling_query, - other.scaling_factor, - other.qk_prod_scaling, - other.position_bias, - allocate_weights, - other.name) {} - -SpecInferIncMultiHeadSelfAttention::SpecInferIncMultiHeadSelfAttention( - FFModel &model, - SpecInferIncMultiHeadSelfAttentionParams const ¶ms, - ParallelTensor const &input, - bool allocate_weights, - char const *name) - : SpecInferIncMultiHeadSelfAttention(model, - params.layer_guid, - input, - params.embed_dim, - params.num_q_heads, - params.num_kv_heads, - params.kdim, - params.vdim, - params.dropout, - params.qkv_bias, - params.final_bias, - params.add_zero_attn, - params.apply_rotary_embedding, - params.scaling_query, - params.scaling_factor, - params.qk_prod_scaling, - params.position_bias, - allocate_weights, - name) {} - -void SpecInferIncMultiHeadSelfAttention::init_inference( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = batch_outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; - size_t machine_view_hash = view->hash(); - set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher( - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(SpecInferIncMultiHeadSelfAttention)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); -} - -void SpecInferIncMultiHeadSelfAttention::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher( - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(SpecInferIncMultiHeadSelfAttention)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -/* - regions[0](I): input - regions[1](I): weight - regions[2](O): output -*/ -OpMeta *SpecInferIncMultiHeadSelfAttention::init_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - SpecInferIncMultiHeadSelfAttention const *attn = - (SpecInferIncMultiHeadSelfAttention *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - - GenericTensorAccessorR input = - helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, - regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorR weight = - helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, - regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorW output = - helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, - regions[2], - task->regions[2], - FID_DATA, - ctx, - runtime); - - int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; - assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - int num_q_heads = attn->num_q_heads; - int num_kv_heads = attn->num_kv_heads; - 
assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); - MemoryAllocator gpu_mem_allocator(gpu_mem); - // We don't do offloading for SSMs (small speculative models) - SpecInferIncMultiHeadSelfAttentionMeta *m = - new SpecInferIncMultiHeadSelfAttentionMeta(handle, - attn, - weight, - gpu_mem_allocator, - num_samples, - num_q_heads, - num_kv_heads); - // assert that we didn't over allocate memory - assert(gpu_mem_allocator.instance_allocated_size == - gpu_mem_allocator.instance_total_size); - m->profiling = attn->profiling; - m->inference_debugging = attn->inference_debugging; - std::strcpy(m->op_name, attn->name); - m->layer_guid = attn->layer_guid; - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); - return m; -} - -void SpecInferIncMultiHeadSelfAttention::forward(FFModel const &ff) { - // SpecInferIncMultiHeadSelfAttention doesn't support forward - assert(false); -} - -FutureMap SpecInferIncMultiHeadSelfAttention::inference( - FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - int idx = 0; - IndexLauncher launcher(SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(idx++, FID_DATA); - - if (qkv_bias || final_bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(idx++, FID_DATA); - } - return runtime->execute_index_space(ctx, launcher); -} - -/* - regions[0](I): input - regions[3](I): weight - regions[4](O): output -*/ -void SpecInferIncMultiHeadSelfAttention::inference_task( - Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(task->regions.size() == regions.size()); - - BeamSearchBatchConfig const &bc = - Future(task->futures[0]).get_result(); - if (bc.num_tokens == 0) { - return; - } - - SpecInferIncMultiHeadSelfAttentionMeta *m = - *((SpecInferIncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 - : regions.size() == 3)); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } - Domain input_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - - assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); - assert(output_domain.get_dim() == 4); - - assert(task->index_point.get_dim() == 1); - SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); - if (m->inference_debugging) { - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - std::vector weights_accessors; - weights_accessors.push_back(weight); - if (*m->qkv_bias || *m->final_bias) { - weights_accessors.push_back(biases); - } - SpecInferIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); - } -} - -void SpecInferIncMultiHeadSelfAttention::backward(FFModel const &ff) { - // SpecInferIncMultiHeadSelfAttention does not support backward - assert(false); -} - -bool SpecInferIncMultiHeadSelfAttention::get_int_parameter(PMParameter para, - int *value) const { - switch (para) { - case PM_NUM_HEADS: - *value = num_q_heads; - return true; - default: - return Op::get_int_parameter(para, value); - } -} - -Op *SpecInferIncMultiHeadSelfAttention::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - SpecInferIncMultiHeadSelfAttentionParams params = get_params(); - return new SpecInferIncMultiHeadSelfAttention( - ff, params, inputs[0], true, this->name); -} - -bool SpecInferIncMultiHeadSelfAttention::measure_operator_cost( - Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { - return false; -} - -bool operator==(SpecInferIncMultiHeadSelfAttentionParams const &lhs, - SpecInferIncMultiHeadSelfAttentionParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && - lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && - lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && - lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && - lhs.scaling_query == rhs.scaling_query && - lhs.scaling_factor == rhs.scaling_factor && - lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == rhs.position_bias; -} - -SpecInferIncMultiHeadSelfAttentionParams - SpecInferIncMultiHeadSelfAttention::get_params() const { - 
SpecInferIncMultiHeadSelfAttentionParams params; - params.layer_guid = this->layer_guid; - params.embed_dim = this->oProjSize; - params.num_q_heads = this->num_q_heads; - params.num_kv_heads = this->num_kv_heads; - params.kdim = this->kProjSize; - params.vdim = this->vProjSize; - params.dropout = this->dropout; - params.qkv_bias = this->qkv_bias; - params.final_bias = this->final_bias; - params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; - params.scaling_query = this->scaling_query; - params.scaling_factor = this->scaling_factor; - params.qk_prod_scaling = this->qk_prod_scaling; - params.position_bias = this->position_bias; - - return params; -} - -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::SpecInferIncMultiHeadSelfAttentionParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.layer_guid.id); - hash_combine(key, params.embed_dim); - hash_combine(key, params.num_q_heads); - hash_combine(key, params.num_kv_heads); - hash_combine(key, params.kdim); - hash_combine(key, params.vdim); - hash_combine(key, params.dropout); - hash_combine(key, params.qkv_bias); - hash_combine(key, params.final_bias); - hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); - hash_combine(key, params.scaling_query); - hash_combine(key, params.scaling_factor); - hash_combine(key, params.qk_prod_scaling); - hash_combine(key, params.position_bias); - return key; -} -}; // namespace std diff --git a/src/ops/specinfer_inc_multihead_self_attention.cu b/src/ops/specinfer_inc_multihead_self_attention.cu deleted file mode 100644 index 8340519ff3..0000000000 --- a/src/ops/specinfer_inc_multihead_self_attention.cu +++ /dev/null @@ -1,958 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "cuComplex.h" -#endif -#include "flexflow/ffconst_utils.h" -#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" -#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" -#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" -#include "flexflow/utils/cuda_helper.h" - -namespace FlexFlow { - -#define WARP_SIZE 32 - -// declare Legion names -using Legion::coord_t; -using Legion::Memory; -using namespace Kernels::IncMultiHeadAttention; - -namespace Kernels { -namespace SpecInferIncMultiHeadAttention { - -template -__global__ void compute_specinfer_attention_kernel_generation_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const scale, - int const max_seq_length, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // nth request idx - int const request_idx = blockIdx.y; - - // request id in batch config - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - // request_idx = re - - BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; - - int const first_step = 0; - - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; - - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("specinfer attn fused kernel!!!\n"); - } - - int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; - - if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - printf("specinfer attn fused kernel %d, %d\n", - totalCacheSize, - request_infos[batch_config_request_id].num_tokens_in_batch); - } - // int const qlength = request_infos[request_idx].num_tokens_in_batch; - int const tree_branch_num = - beam_request_infos[batch_config_request_id].sub_request_num; - - // will decode qlength tokens in this thread block - // int const qlength = tree_branch_num; - - int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += causalMask[r].this_layer_size; - } - - // if (tidx == 0 && head_idx == 0) { - // printf("spec req: %d, %d\n", request_idx, first_token_idx); - // } - - // shared memory objects - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_); - - float qk_max = -FLT_MAX; - - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - - const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... - int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; - - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // The number of keys per warp. 
- constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; - - int ti_end = - div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; - - for (int qi = 0; qi < tree_branch_num; qi += 1) { -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + - ii * THREADS_PER_KEY * K_VEC_SIZE); - } - - int const query_token = bitmask.tree_size - tree_branch_num + qi; - - __syncthreads(); - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; - - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < totalCacheSize) { - - k[ii] = *reinterpret_cast( - k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + - jj); - } - } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - - if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { - // todo add alobi here - // bool const mask = ti_circ >= totalCacheSize; - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { - // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); - // } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[ti - first_step] = mask ? 0.f : qk; - } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; - - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("spec inc attn first token qk_max %.10f\n", qk_max); - // } - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < totalCacheSize; - ti += THREADS_PER_BLOCK) { - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = mask ? 0.0f : logit; - } - - // Compute the sum. 
- exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("spec inc attn exp_sum %.10f\n", exp_sum); - // } - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < totalCacheSize; - ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } - - __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("softmax %.10f\n", qk_smem[0]); - // } - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. - // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + - vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - - bool const mask = (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & - (1 << query_token)))); - float logit = mask ? 0.0f : qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } - - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("valueX %.10f\n", out.x); - // } - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); - } - } - - // Output the final values. 
- if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float(*reinterpret_cast( - output_ptr + (first_token_idx + qi) * hidden_size + - head_idx * per_head_size + vi), - out); - } - } -} - -template -__global__ void specinfer_store_kv_cache( - DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo *tokenInfos, - BatchConfig::PerRequestInfo *requestInfo, - BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, - BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, - BatchConfig::BitMask *causalMask, - int qProjSize, - int kProjSize, - int vProjSize, - int num_tokens, - int max_seq_len, - bool is_root, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / (hidden_size); - int offset = i % hidden_size; - - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const first_token_in_req = - requestInfo[req_id].first_token_depth_in_request; - int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const total_token = requestInfo[req_id].num_tokens_in_batch; - - int const request_token_offset = - requestInfo[req_id].first_token_offset_in_batch; - - BatchConfig::BitMask bitmask = causalMask[req_id]; - - int const sub_request_num = beamRequestInfos[req_id].sub_request_num; - - int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; - - // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - - // tree_branch_num + sub_req_id + tok_id; - // bitmask.tree_size - tree_branch_num + sub_req_id; - - // if prompt token -> token id - // if tree token: - int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - - bitmask.this_layer_size + token_idx - - request_token_offset; - - kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + - offset] = vVal; - } -} - -template -void update_kv_cache_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); - int curr_depth = bc->beamRequestsInfo[0].current_depth; - // printf("curr depth: %d\n", curr_depth); - // assert(curr_depth < 3); - if (num_tokens > 0) { - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - // printf("tokenInfo %d, %d\n", - // bc->beamTokenInfo[0].sub_request_index, - // num_tokens); - specinfer_store_kv_cache<<>>( - static_cast
<DT *>(m->devQKVProjArray),
-        static_cast<DT *>(m->keyCache),
-        static_cast<DT *>
(m->valueCache), - m->token_infos, - m->request_infos, - m->beam_token_infos, - m->beam_request_infos, - m->causalMask, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, - /*root*/ curr_depth == 0, - m->hidden_size); - } -} - -#define LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_sz = smem_size_in_bytes
<DT>(m->qProjSize,                                    \
-                         BatchConfig::max_sequence_length() +                 \
-                             BatchConfig::MAX_SPEC_TREE_TOKEN_NUM,            \
-                         THREADS_PER_VALUE,                                   \
-                         THDS_PER_BLOCK);                                     \
-  compute_specinfer_attention_kernel_generation_kernel<DT,                    \
-                                                       THDS_PER_BLOCK,        \
-                                                       Dh,                    \
-                                                       Dh_MAX,                \
-                                                       THDS_PER_KEY,          \
-                                                       THREADS_PER_VALUE>     \
-      <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(                            \
-          static_cast<DT *>
(m->devQKVProjArray),                                 \
-          static_cast<DT *>(m->keyCache),                                     \
-          static_cast<DT *>
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length() + \ - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos, \ - m->beam_request_infos, \ - m->causalMask) - -template -void compute_specinfer_attention_kernel_generation( - SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - // one block == one head per request - printf("??? at here: %d\n", bc->num_active_requests()); - dim3 grid(m->num_q_heads, bc->num_active_requests()); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - size_t smem_sz; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_SPECINFER_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); - } else { - assert(false && "a unsupported head size"); - } -} - -template -__global__ void spec_fill_entries_above_diagonal(DT *matrix, - size_t new_tokens, - size_t total_tokens_in_request, - size_t num_q_heads, - DT value) { - CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) { - // size_t head_idx = i / (new_tokens * total_tokens_in_request); - size_t src_idx = (i / new_tokens) % total_tokens_in_request; - size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens; - // Casual Mask - if (src_idx > dst_idx) { - matrix[i] = value; - } - } -} - -template -void compute_attention_kernel_prompt( - SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int tokens_prev_requests_squares = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; - int q_block_size = m->qProjSize; - - int kt_block_size = m->kProjSize; - int kt_req_block_size = kt_block_size * m->num_q_heads * - (BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); - int vt_block_size = m->vProjSize; - int vt_req_block_size = vt_block_size * m->num_q_heads * - (BatchConfig::max_sequence_length() + - BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // else if (tokens_previous_requests < 
bc->num_generation_tokens) { - // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - // continue; - // } - - // all requests in prompt phase should only have one sub requests; - assert(bc->sub_requests[i] == 1); - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; - - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - - if (num_new_tokens <= 0) { - continue; - } - - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
<DT>(1.0f / sqrt(m->kProjSize));
-    }
-    // To get A, skip over Q entries from previous requests (same head)
-    DT const *A = static_cast<DT *>
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); - std::cout << "meta: " << num_new_tokens << ", " << total_tokens << "\n"; - DT const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size;
-
-    // if (i == 0 && sub_req_id == 0 &&
-    //     bc->beam_slots.at(0).current_depth == 1) {
-    //   int offset = (float *)B - m->keyCache;
-    //   printf("key cache offset %d\n", kt_req_block_size);
-    // }
-    // To get C, skip over QK^T products from previous requests
-    DT *C = static_cast<DT *>
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // print_tensor((float*)C, 32, "C"); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - spec_fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size;
-    // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous
-    // requests (all heads)
-    B = C_softmax;
-    // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous
-    // requests
-
-    // print_tensor((float*)C_softmax, 32, "C_softmax");
-    C = static_cast<DT *>
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; - } - - // assert(tokens_previous_requests == num_tokens); -} - -template -void inference_kernel(SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - // phase 1: Implement kernel to compute KQV for input tokens - - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
-  // phase 2: Update key/val cache
-  update_kv_cache_kernel<DT>(m, bc, stream);
-  // std::cout << "specinfer kernel token num: " << bc->num_generation_tokens
-  //           << ", " << bc->num_tokens << "\n";
-  if (bc->num_generation_tokens > 0) {
-    printf("spec inc generation decoding\n");
-    compute_specinfer_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>
(m->attn_heads), stream); - } - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - if (bc->num_tokens > bc->num_generation_tokens) { - printf("spec inc prompt decoding\n"); - compute_attention_kernel_prompt( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); - } - // compute_attention_kernel_prompt( - // m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); - - // compute output production and bias together for all tokens - int num_tokens = bc->num_active_tokens(); - - // std::cout << "specinfer num tokens: " << num_tokens; - - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); -} - -} // namespace SpecInferIncMultiHeadAttention -} // namespace Kernels - -/*static*/ -void SpecInferIncMultiHeadSelfAttention::inference_kernel_wrapper( - SpecInferIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; - - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } - - assert(input.data_type == weight.data_type); - assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } - - if (input.data_type == DT_HALF) { - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecInferIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); - } else if (input.data_type == DT_FLOAT) { - float const *bias_ptr = - use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecInferIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); - } else { - assert(false && "Unspported data type"); - } - - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("SpecInferIncMultiHeadSelfAttention forward time = %.2fms\n", - elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); - } - // save_tensor(output.get_float_ptr(), 768 * 3, - // "/home/xinhaoc/FlexFlow/inference/output/fk1.txt"); - // save_tensor(output.get_float_ptr() + 768 * 3, 768 * 3, - // "/home/xinhaoc/FlexFlow/inference/output/fk2.txt"); - - // if(bc->num_tokens == 1){ - // print_tensor(input.get_float_ptr(), 32, "specinc input"); - // print_tensor(output.get_float_ptr(), 32, "specinc output"); - // assert(false); - // } -} - -SpecInferIncMultiHeadSelfAttentionMeta::SpecInferIncMultiHeadSelfAttentionMeta( - FFHandler handler, - SpecInferIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, - MemoryAllocator &gpu_mem_allocator, - int num_samples, - int _num_q_heads, - int _num_kv_heads) - : IncMultiHeadSelfAttentionMeta(handler, - BEAM_SEARCH_MODE, - attn, - attn->qSize, - attn->kSize, - attn->vSize, - attn->qProjSize, - attn->kProjSize, - attn->vProjSize, - attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, - attn->scaling_query, - attn->qk_prod_scaling, - attn->position_bias, - attn->final_bias, - attn->scaling_factor, - weight, - gpu_mem_allocator, - num_samples, - attn->num_q_heads, - attn->num_kv_heads, - _num_q_heads, - _num_kv_heads, - DT_NONE, - false) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkCUDNN(cudnnSetStream(handler.dnn, stream)); - - // allocate memory for the seqArray and reserve space - { - // size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; - // size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); - // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - // total_size); - - beam_token_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - - beam_request_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - // causalMask = gpu_mem_allocator.allocate_instance( - // causal_mask_size); - // beam_token_infos = - // gpu_mem_allocator - // .allocate_instance( - // beam_tokeninfo_size); - // offset += beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); - // beam_request_infos = - // gpu_mem_allocator - // .allocate_instance( - // beam_requestinfo_size); - // offset += beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); - // assert(offset == total_size); - // assert(gpu_mem_allocator.instance_total_size == - 
// gpu_mem_allocator.instance_allocated_size); - } - - cudaStreamSynchronize(stream); -} - -SpecInferIncMultiHeadSelfAttentionMeta::~SpecInferIncMultiHeadSelfAttentionMeta( - void) { - if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { - beam_search_reserve_inst.destroy(); - } -} - -}; // namespace FlexFlow diff --git a/src/ops/tree attn kernel, 0----> -0.029753357172 b/src/ops/tree attn kernel, 0----> -0.029753357172 deleted file mode 100644 index e4f14ee757..0000000000 --- a/src/ops/tree attn kernel, 0----> -0.029753357172 +++ /dev/null @@ -1 +0,0 @@ -tree attn kernel, 0----> -0.02975335717201232910 0.01930358447134494781 0.03780741989612579346 0.11878532171249389648 -0.03523746877908706665 0.02421043440699577332 0.03719477355480194092 -0.00304851122200489044 0.02062662504613399506 0.06683708727359771729 -0.00642335414886474609 -0.00504039414227008820 0.02955199964344501495 0.00648811273276805878 0.00558663159608840942 0.02003456838428974152 -0.04041406139731407166 0.00736814411357045174 -0.04575226455926895142 0.03949077427387237549 0.05742383748292922974 0.04866250604391098022 0.04687267541885375977 -0.00701304525136947632 -0.03712264448404312134 -0.02175992354750633240 -0.03979443758726119995 0.03961737453937530518 -0.07450901716947555542 0.02090370282530784607 -0.03487894684076309204 0.01653470844030380249 \ No newline at end of file diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index a4329f52db..5c6527baf9 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -81,30 +81,22 @@ __global__ void compute_attention_kernel_fused_kernel( int const first_step = 0; - int const tlength = request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; - int const qlength = request_infos[batch_config_request_id].num_tokens_in_batch; + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = + request_infos[batch_config_request_id].num_tokens_in_batch; BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; - // bitmask.mask[1] = 3; - // if (head_idx == 0 && tidx == 0) { - // printf("tree attn fused kernel req id %d %d, %d, %d, %lld\n", - // request_idx, - // tlength, - // qlength, - // bitmask.non_tree_cache_size, - // bitmask.mask[3]); - // } - int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { first_token_idx += request_infos[r].num_tokens_in_batch; } - if(tidx == 0 && head_idx == 0){ - printf("tree req: %d, %d\n", request_idx, first_token_idx); - } + // if(tidx == 0 && head_idx == 0){ + // printf("tree req: %d, %d\n", request_idx, first_token_idx); + // } // shared memory objects extern __shared__ char smem_[]; @@ -174,26 +166,11 @@ __global__ void compute_attention_kernel_fused_kernel( (ti >= bitmask.non_tree_cache_size && (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - // if (head_idx == 0 && qi == 9 && mask) { - // printf("tree attn mask for first token %d, %lld, %d, %d, %d\n", - // ti, - // bitmask.mask[ti - bitmask.non_tree_cache_size], - // bitmask.non_tree_cache_size, - // request_idx, - // qi); - // } - // if (blockIdx.y == 0 && blockIdx.x == 0 && qi == 3 && mask) { - // printf("tree attn mask for third token %d, %lld, %d, %d\n", - // ti, - // bitmask.mask[ti - bitmask.non_tree_cache_size], - // bitmask.non_tree_cache_size, - 
// qi); - // } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); // if (head_idx == 0 && qi == 0 && !mask) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n ", + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n + // ", // request_idx, // ti, // qk, @@ -250,10 +227,6 @@ __global__ void compute_attention_kernel_fused_kernel( // Compute the sum. exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - // if (head_idx == 0 && tidx == 0 && qi == 9) { - // printf("expsum %.10f\n", exp_sum); - // } - // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { @@ -261,9 +234,6 @@ __global__ void compute_attention_kernel_fused_kernel( } __syncthreads(); - // if (head_idx == 0 && tidx == 0 && qi == 9) { - // printf("softmax %.10f\n", qk_smem[1]); - // } // value projection constexpr int V_VEC_SIZE = 16 / sizeof(DT); @@ -282,12 +252,8 @@ __global__ void compute_attention_kernel_fused_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; - // DT const *v_cache_batch = - // value_cache + - // (beam_request_idx * max_beam_width + beam_sub_request_idx) * - // max_seq_length * hidden_size + - // vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { @@ -310,10 +276,6 @@ __global__ void compute_attention_kernel_fused_kernel( // // Make sure we can start writing to shared memory. __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { - // printf("valueX %.10f\n", out.x); - // } - // Run the final reduction amongst the different groups computing different // partial outputs. 
if (Dh == Dh_MAX || vi < Dh) { @@ -391,19 +353,6 @@ __global__ void commit_tokens_kernel( int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; - // if(i == 0){ - // printf("commit token: %d %d %f\n", token_idx_in_last_batch, tok_id, - // kVal); - // } - // if(i == hidden_size){ - // printf("commit token 1: %d %d %f\n", token_idx_in_last_batch, tok_id, - // kVal); - // } - // if(i == 2 * hidden_size){ - // printf("commit token 2: %d %d %f\n", token_idx_in_last_batch, tok_id, - // kVal); - // } - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + offset] = kVal; vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + @@ -500,11 +449,13 @@ __global__ void update_tree_branch_kv_cache_fused( int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; - int const first_token_depth = request_infos[req_id].first_token_depth_in_request; + int const first_token_depth = + request_infos[req_id].first_token_depth_in_request; // if(i % hidden_size == 0){ - // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", req_id, - // token_idx, request_token_offset,(token_idx + first_token_depth - request_token_offset), kVal); + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", + // req_id, token_idx, request_token_offset,(token_idx + first_token_depth + // - request_token_offset), kVal); // } kCache_ptr[req_id * (hidden_size * max_seq_len) + (token_idx + first_token_depth - request_token_offset) * @@ -591,8 +542,6 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, num_new_tokens++; } - std::cout << "num_new_tokens: " << num_new_tokens << "\n"; - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { @@ -873,12 +822,6 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, // update K-V cache int num_new_tokens = bc->num_active_tokens(); int parallelism = m->hidden_size * num_new_tokens; - // printf("update KV cache %d, idx: %d\n", - // num_new_tokens, - // bc->requestsInfo[0].first_token_depth_in_request); - // for (int i = 0; i < num_new_tokens; i++) { - // printf("abs depth:%d\n", bc->tokensInfo[i].abs_depth_in_request); - // } update_tree_branch_kv_cache_fused<<bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
(m->bias_ptr); } - // cudaMemcpyAsync(m->token_infos, - // &(bc->tokensInfo), - // bc->num_active_tokens() * - // sizeof(TreeVerifyBatchConfig::PerTokenInfo), - // cudaMemcpyHostToDevice, - // stream); - // cudaMemcpyAsync(m->request_infos, - // &(bc->requestsInfo), - // bc->max_requests_per_batch() * - // sizeof(BatchConfig::PerRequestInfo), - // cudaMemcpyHostToDevice, - // stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -992,9 +923,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, bias_ptr, stream); - // print_tensor((float *)m->devQKVProjArray, 32, "qkvtenor1"); - // print_tensor((float *)m->devQKVProjArray + 768 * (25 * 7) * 3, 32, "qkvtenor2"); - // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( // m, bc, stream); @@ -1037,8 +965,6 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - std::cout << "tree input tokens: " << bc->num_active_tokens() << "\n"; - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); if (use_bias) { @@ -1089,20 +1015,6 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventDestroy(t_start); cudaEventDestroy(t_end); } - - // print_tensor(output.get_float_ptr(), 32, "tree attn kernel"); - - // save_tensor( - // input.get_float_ptr(), - // 768 * bc->num_active_tokens(), - // "/home/xinhaoc/FlexFlow/inference/output/Newtreeinput.txt"); - // save_tensor( - // output.get_float_ptr(), - // 768 * bc->num_active_tokens(), - // "/home/xinhaoc/FlexFlow/inference/output/Newtreeoutput.txt"); - // std::cout << "new tokens: " << bc->num_active_tokens() << "\n"; - - // assert(bc->num_tokens_to_commit == 0); } TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 904bfbcaff..c7b6e1257a 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -154,8 +154,6 @@ std::string get_operator_type_name(OperatorType type) { return "SpecIncMultiHeadSelfAttention"; case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: return "TreeIncMultiHeadSelfAttention"; - case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: - return "SpecInferPgraoIncMultiHeadSelfAttention"; case OP_INPUT: return "Input"; case OP_WEIGHT: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 46f7cc0f29..6d33dd9f27 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -51,7 +51,6 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" -#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" @@ -70,7 +69,7 @@ using FlexFlow::MachineView; LegionRuntime::Logger::Category log_graph("graph"); LegionRuntime::Logger::Category log_simplify("graph_simplify"); -Node const Node::INVALID_NODE = Node(); +const Node Node::INVALID_NODE = Node(); Node::Node(void) : guid(0), ptr(NULL) {} @@ -2385,28 +2384,6 @@ GraphOptimalViewSerialized sez.serialize(attn->tensor_parallelism_degree); break; } - case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { - SpecInferIncMultiHeadSelfAttention *attn = - (SpecInferIncMultiHeadSelfAttention *)op; - sez.serialize(attn->layer_guid.id); - sez.serialize(attn->layer_guid.transformer_layer_id); - sez.serialize(attn->layer_guid.model_id); - 
sez.serialize(attn->oProjSize); - sez.serialize(attn->num_q_heads); - sez.serialize(attn->qProjSize); - sez.serialize(attn->vProjSize); - sez.serialize(attn->dropout); - sez.serialize(attn->qkv_bias); - sez.serialize(attn->final_bias); - sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); - sez.serialize(attn->scaling_query); - sez.serialize(attn->scaling_factor); - sez.serialize(attn->qk_prod_scaling); - sez.serialize(attn->position_bias); - sez.serialize(attn->num_kv_heads); - break; - } case OP_SOFTMAX: { Softmax *softmax = (Softmax *)op; sez.serialize(softmax->dim); @@ -2937,52 +2914,6 @@ void FFModel::deserialize_graph_optimal_view( params); break; } - case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { - assert(num_inputs == 1); - int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; - float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; - size_t id, transformer_layer_id, deserialized_model_id; - dez.deserialize(id); - dez.deserialize(transformer_layer_id); - dez.deserialize(deserialized_model_id); - LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); - dez.deserialize(embed_dim); - dez.deserialize(num_q_heads); - dez.deserialize(k_dim); - dez.deserialize(v_dim); - dez.deserialize(dropout); - dez.deserialize(qkv_bias); - dez.deserialize(final_bias); - dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); - dez.deserialize(scaling_query); - dez.deserialize(scaling_factor); - dez.deserialize(qk_prod_scaling); - dez.deserialize(position_bias); - dez.deserialize(num_kv_heads); - - SpecInferIncMultiHeadSelfAttentionParams params; - params.embed_dim = embed_dim; - params.num_q_heads = num_q_heads; - params.kdim = k_dim; - params.vdim = v_dim; - params.dropout = dropout; - params.qkv_bias = qkv_bias; - params.final_bias = final_bias; - params.add_zero_attn = add_zero_attn; - params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; - params.scaling_query = scaling_query; - params.scaling_factor = scaling_factor; - params.qk_prod_scaling = qk_prod_scaling; - params.position_bias = position_bias; - params.num_kv_heads = num_kv_heads; - node = get_or_create_node(inputs[0], - params); - break; - } case OP_TOPK: { node = TopK::deserialize(*this, dez, inputs, num_inputs); break; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index e7f7c5f52d..52a1efc2ab 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -257,7 +257,6 @@ void InferenceManager::init_operators_inference(FFModel *model) { ((ParallelOp *)op) ->create_input_partition_inference(*model, inputs, outputs); } - printf("init op %s\n", op->name); op->init_inference(*model, inputs, outputs); } } @@ -394,14 +393,13 @@ void InferenceManager::load_input_tokens_from_batch_config( } void InferenceManager::load_inference_metadata_batch_config( - BatchConfigFuture const &bc, - FFHandler *handlers) { + BatchConfigFuture const &bc, FFHandler *handlers) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); + Rect<1> task_rect( + Point<1>(0), Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); // int rank = 0; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 
cf72f2d40b..c3ee73d78c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -59,7 +59,6 @@ #include "flexflow/ops/sigmoid_silu_multi.h" #include "flexflow/ops/softmax.h" #include "flexflow/ops/spec_inc_multihead_self_attention.h" -#include "flexflow/ops/specinfer_inc_multihead_self_attention.h" #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" @@ -94,10 +93,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - ParallelTensor const input1, - ParallelTensor const input2, - ParallelTensor const input3, - ParallelTensor const input4) + const ParallelTensor input1, + const ParallelTensor input2, + const ParallelTensor input3, + const ParallelTensor input4) : Op(model, otype, dtype, @@ -117,10 +116,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - ParallelTensor const _input1, - ParallelTensor const _input2, - ParallelTensor const _input3, - ParallelTensor const _input4) + const ParallelTensor _input1, + const ParallelTensor _input2, + const ParallelTensor _input3, + const ParallelTensor _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling), @@ -1025,9 +1024,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(ParallelTensor const output, +int Op::get_output_to_input_dim_mapping(const ParallelTensor output, int output_dim, - ParallelTensor const input) { + const ParallelTensor input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1060,9 +1059,9 @@ int Op::get_output_to_input_dim_mapping(ParallelTensor const output, return -1; } -int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, +int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, int output_dim, - ParallelTensor const weight) { + const ParallelTensor weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -1659,7 +1658,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - ParallelDim const dims[], + const ParallelDim dims[], DataType data_type, Op const *op, int idx, @@ -1692,7 +1691,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - ParallelDim const dims[], + const ParallelDim dims[], DataType data_type, Op const *op, int idx, @@ -1742,7 +1741,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], +ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1823,7 +1822,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], +ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1854,7 +1853,7 @@ ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - ParallelDim const dims[], + const ParallelDim dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1874,7 +1873,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, 
ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - ParallelDim const dims[], + const ParallelDim dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -2088,7 +2087,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - Tensor const tensor, ParallelTensor ¶llel_tensor) const { + const Tensor tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -2125,7 +2124,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int num_dims, - ParallelDim const dims[], + const ParallelDim dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2148,7 +2147,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - ParallelDim const dims[], + const ParallelDim dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -2181,7 +2180,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - ParallelDim const dims[], + const ParallelDim dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -2205,7 +2204,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - ParallelDim const dims[], + const ParallelDim dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -2242,7 +2241,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(ParallelTensor const tensor, +void FFModel::create_disjoint_partition(const ParallelTensor tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2290,7 +2289,7 @@ void FFModel::create_disjoint_partition(ParallelTensor const tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - ParallelTensor const tensor, + const ParallelTensor tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2672,7 +2671,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { +IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -3039,12 +3038,6 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } - case OP_SPECINFER_INC_MULTIHEAD_SELF_ATTENTION: { - Op *op = SpecInferIncMultiHeadSelfAttention::create_operator_from_layer( - *this, layer, inputs); - operators.push_back(op); - return op; - } case OP_BATCHMATMUL: { Op *op = BatchMatmul::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); @@ -3234,7 +3227,7 @@ Op *FFModel::create_operator_from_layer( } void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; + std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; @@ -3980,38 +3973,38 @@ void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - static int const epochs = 1; + const static int epochs = 1; // const static int iterations = 1; - static int 
const batchSize = 64; - static bool const profiling = false; - static bool const inference_debugging = false; + const static int batchSize = 64; + const static bool profiling = false; + const static bool inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - static size_t const workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB - static int const numNodes = 1; - static int const workersPerNode = 0; - static int const cpusPerNode = 0; - static size_t const searchBudget = -1; - static size_t const simulatorWorkSpaceSize = + const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB + const static int numNodes = 1; + const static int workersPerNode = 0; + const static int cpusPerNode = 0; + const static size_t searchBudget = -1; + const static size_t simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2 GB constexpr static float searchAlpha = 1.2f; - static bool const searchOverlapBackwardUpdate = false; - static size_t const offloadReserveSpaceSize = + const static bool searchOverlapBackwardUpdate = false; + const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB - static bool const cpuOffload = false; - static bool const onlyDataParallel = true; - static bool const enableSampleParallel = true; - static bool const enableParameterParallel = false; - static bool const enableAttributeParallel = false; - static bool const enableInplaceOptimizations = false; - static bool const allowTensorOpMathConversion = false; - static int const machine_model_version = 0; - static int const simulator_segment_size = 16777216; // 16 MB - static int const simulator_max_num_segments = 1; - static int const base_optimize_threshold = 10; - static bool const enable_control_replication = true; + const static bool cpuOffload = false; + const static bool onlyDataParallel = true; + const static bool enableSampleParallel = true; + const static bool enableParameterParallel = false; + const static bool enableAttributeParallel = false; + const static bool enableInplaceOptimizations = false; + const static bool allowTensorOpMathConversion = false; + const static int machine_model_version = 0; + const static int simulator_segment_size = 16777216; // 16 MB + const static int simulator_max_num_segments = 1; + const static int base_optimize_threshold = 10; + const static bool enable_control_replication = true; // The default python data loader type is 2 to enable control replication - static int const python_data_loader_type = 2; + const static int python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -6233,44 +6226,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } - { - TaskVariantRegistrar registrar( - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, - "SpecInferIncMultiHeadSelfAttention Init"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant< - OpMeta *, - SpecInferIncMultiHeadSelfAttention::init_task>( - registrar, "SpecInferIncMultiHeadSelfAttention Init Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant< - OpMeta *, - SpecInferIncMultiHeadSelfAttention::init_task>(registrar); - } - } - { - TaskVariantRegistrar registrar( - SPECINFER_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, - "SpecInferIncMultiHeadSelfAttention Inference"); - 
registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant< - SpecInferIncMultiHeadSelfAttention::inference_task>( - registrar, "SpecInferIncMultiHeadSelfAttention Inference Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant< - SpecInferIncMultiHeadSelfAttention::inference_task>(registrar); - } - } // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index b51ab83091..5499a280a8 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -152,7 +152,7 @@ FFHandler .wait(); handle.offload_reserve_space = workspaceInst.pointer_untyped(0, sizeof(char)); - }else { + } else { handle.offload_reserve_space = nullptr; } if (handle.batch_config_metadata_size > 0) { @@ -176,7 +176,7 @@ FFHandler .wait(); handle.batch_config_metadata = workspaceInst.pointer_untyped(0, sizeof(char)); - }else { + } else { handle.batch_config_metadata = nullptr; } // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize)); diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 523b3c76f3..c885b29db2 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -148,11 +148,10 @@ FFHandler .wait(); handle.offload_reserve_space = workspaceInst.pointer_untyped(0, sizeof(char)); - }else { + } else { handle.offload_reserve_space = nullptr; } if (handle.batch_config_metadata_size > 0) { - printf("allocate instance for metadata %d\n", handle.batch_config_metadata_size); // allocate memory for offload reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::GPU_FB_MEM) @@ -173,7 +172,7 @@ FFHandler .wait(); handle.batch_config_metadata = workspaceInst.pointer_untyped(0, sizeof(char)); - }else { + } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index e30a7ee478..89d4ddaed4 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -182,7 +182,7 @@ size_t RequestManager::get_num_ssms() { RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, int max_sequence_length) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; @@ -238,7 +238,7 @@ RequestManager::RequestGuid RequestManager::RequestGuid RequestManager::register_new_request(std::string const &prompt, int max_sequence_length) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; @@ -296,7 +296,7 @@ RequestManager::RequestGuid } bool RequestManager::is_request_completed(RequestGuid const &guid) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); assert(all_requests.find(guid) != all_requests.end()); Request const &request = all_requests[guid]; // return request.tokens.size() >= request.max_sequence_length; @@ -305,7 +305,7 @@ bool RequestManager::is_request_completed(RequestGuid const &guid) { GenerationResult RequestManager::get_generation_result(RequestGuid const &guid) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); assert(request_generation_results.find(guid) != request_generation_results.end()); return 
request_generation_results[guid]; @@ -343,7 +343,7 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -456,7 +456,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == request.tokens.size()) { // Incremental phase @@ -504,7 +504,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_request.max_sequence_length; new_bc.request_completed[i] = false; num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; profile_info.llm_decoding_steps = 1; @@ -566,7 +566,7 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, InferenceResult const &result, int model_id) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); if (verbose) { std::cout << "\n############### prepare_next_batch_init ###############\n"; } @@ -603,11 +603,10 @@ BeamSearchBatchConfig } else { committed_tokens[guid].clear(); } - // iterate through all the tokens that belong to request i int root_abs_depth = request.tokens.size() - 1; - + while (result_index < old_bc.num_tokens && old_bc.tokensInfo[result_index].request_index == i) { int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request; @@ -640,14 +639,12 @@ BeamSearchBatchConfig } if (request.status == Request::RUNNING) { - std::cout << "verify running: " << dfs_tree_inputs.at(guid).size() << ", " - << tree_outputs.size() << "\n"; std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); log_req_mgr.print("Number of Verified Tokens = %zu", - verified_tokens.size()); + verified_tokens.size()); // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { @@ -729,9 +726,6 @@ BeamSearchBatchConfig } else { // Request not finished, pass verified_tokens to next iteration - std::cout << "parse to next iteration: " - << "\n"; - new_bc.request_completed[i] = false; new_bc.request_running[i] = true; num_active_req++; @@ -745,18 +739,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); - // std::cout << "max depth: " << new_max_depth << ", " - // << new_bc.requestsInfo[i].first_token_depth_in_request << - // ", " - // << verified_tokens.size() << "\n"; - // assert(false); 
new_bc.beamRequestsInfo[i].current_depth = 1; profiling_requests[request.guid].ssm_decoding_steps = 0; @@ -794,9 +783,6 @@ BeamSearchBatchConfig // Beam Token Info new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0; new_bc.num_tokens++; - // std::cout << "num_gen ++ " - // << "\n"; - // num_generation_tokens++; // Add verified token to request's token list request.tokens.push_back(token.first); @@ -816,11 +802,6 @@ BeamSearchBatchConfig log_req_mgr.print("Output: %s", output.c_str()); } - // if (request.tokens.size() > 19 && i >= 7) { - // std::cout << request.tokens.size() << "\n"; - // assert(false); - // } - } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; @@ -838,7 +819,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; @@ -889,7 +870,7 @@ BeamSearchBatchConfig (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; @@ -973,17 +954,12 @@ BeamSearchBatchConfig } new_bc.num_generation_tokens = num_generation_tokens; - std::cout << "prepare next batch init gen tokens: " - << new_bc.num_generation_tokens << "\n"; - if (verbose) { std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:" << std::endl; old_bc.print(); new_bc.print(); } - std::cout << "prepare next batch init active tokens: " - << new_bc.num_tokens << "\n"; return new_bc; } @@ -1019,11 +995,11 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task( BeamSearchBatchConfig RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result) { - std::lock_guard const lock(request_queue_mutex); - if (true) { + const std::lock_guard lock(request_queue_mutex); + if (verbose) { std::cout << "\n############### prepare_next_batch_beam ###############\n"; } - if (true) { + if (verbose) { std::cout << "print all results" << "\n"; for (int i = 0; i < 40; i++) { @@ -1049,7 +1025,7 @@ BeamSearchBatchConfig if (old_bc.request_completed[i] || !old_bc.request_running[i]) { continue; } - num_active_req ++; + num_active_req++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); @@ -1092,13 +1068,6 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].sub_request_num * old_bc.beamRequestsInfo[i].beam_size; - std::cout << "oldbc : " << old_bc.beamRequestsInfo[i].sub_request_num - << ", " << old_bc.beamRequestsInfo[i].beam_size << "\n"; - - // if (old_bc.beamRequestsInfo[i].current_depth == 3) { - // assert(false); - // } - assert(new_bc.beamRequestsInfo[i].sub_request_num <= BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && "exceed maximum nodes per layer"); @@ -1122,7 +1091,7 @@ BeamSearchBatchConfig request.tokens.size()) { // Incremental phase if (request.status == Request::RUNNING) { - // todo check it + // todo this is replaced by this_layer_size, but should 
check it new_bc.requestsInfo[i].num_tokens_in_batch = 1; } else { assert(false && "Request should be done"); @@ -1150,18 +1119,7 @@ BeamSearchBatchConfig memcpy(&new_bc.causalMask[i], &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); - // sub_request_num -> nodes of input next iteration - // beam_size replicate num - - std::cout << "print beam tree: " - << old_bc.beamRequestsInfo[i].current_depth << "\n"; BeamTree tree = request.beam_trees[old_bc.model_id]; - // for (int k = 0; k <= old_bc.beamRequestsInfo[i].current_depth; k++) { - // std::cout << "layer: " << k << "\n"; - // std::cout << "nodes: " << tree.treeLayers[k].nodes_num_this_layer - // << "\n"; - // } - std::cout << "append bit mask: "<< i << "\n"; appendBitMask(new_bc.causalMask[i], new_bc.beamRequestsInfo[i].sub_request_num, old_bc.beamRequestsInfo[i].beam_size, @@ -1185,9 +1143,6 @@ BeamSearchBatchConfig num_generation_tokens++; } } - // if(new_bc.beamRequestsInfo[i].current_depth >= 3 && i > 0){ - // assert(false); - // } } } @@ -1320,18 +1275,6 @@ BeamSearchBatchConfig old_bc.print(); new_bc.print(); } - - if (true) { - // std::cout << "print all resultsBBB" - // << "\n"; - // for (int i = 0; i < 40; i++) { - // std::cout << result.token_ids[i] << ", "; - // } - // std::cout << "Current Beam DepthBBB: " - // << old_bc.beamRequestsInfo[0].current_depth << "\n"; - } - std::cout << "prepare next batch beam total tokens: " << new_bc.num_tokens - << "gneration tokens: " << new_bc.num_generation_tokens << "\n"; return new_bc; } @@ -1366,7 +1309,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task( TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { - std::lock_guard const lock(request_queue_mutex); + const std::lock_guard lock(request_queue_mutex); std::cout << "\n############### prepare_next_batch_verify ###############\n"; @@ -1399,12 +1342,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { - - std::cout << "prepare next batch running:\n" - << "\n"; new_bc.request_running[i] = true; - std::cout << "[Verify] Request " << request.guid << " is running" - << std::endl; // Get the dfs tree std::vector>> @@ -1419,12 +1357,12 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector> dfs_tree_inputs = merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid); - if (true) { - // std::cout << "Request Tokens Size: " << request.tokens.size() - // << std::endl; - // for (int k = 0; k < request.tokens.size(); k++) { - // std::cout << k << ": " << request.tokens[k] << std::endl; - // } + if (verbose) { + std::cout << "Request Tokens Size: " << request.tokens.size() + << std::endl; + for (int k = 0; k < request.tokens.size(); k++) { + std::cout << k << ": " << request.tokens[k] << std::endl; + } } // Normal Request Info @@ -1435,31 +1373,21 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // copy bitmask to verify batchconfig memcpy(&(new_bc.causalMask[i]), &(old_batches.at(0).causalMask[i]), sizeof(BatchConfig::BitMask)); - // std::cout << "bitmask: " << new_bc.causalMask[i].mask[0] << "\n"; - // assert(false); // TODO: Check this 
new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; - std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " - << new_bc.causalMask[i].tree_size << ", " - << new_bc.causalMask[i].non_tree_cache_size << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) - << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[1]) - << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[2]) - << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[3]) - << "\n"; - std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[4]) - << "\n"; + // std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " + // << new_bc.causalMask[i].tree_size << ", " + // << new_bc.causalMask[i].non_tree_cache_size << "\n"; + // std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) + // << "\n"; // Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { @@ -1473,7 +1401,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( i; new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.first; - if (true) { + if (verbose) { std::cout << new_bc.num_tokens_to_commit << "- committed_token.token_depth: " << committed_token.first @@ -1485,7 +1413,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( // } } } - if (true) { + if (verbose) { std::cout << "new_bc.num_tokens_to_commit: " << new_bc.num_tokens_to_commit << std::endl; } @@ -1508,14 +1436,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_depth_in_request = request.tokens.size() - 1; - std::cout << "prepare next batch verify: " << dfs_tree_inputs.size() - << "\n"; - bool cutLayer = false; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); - if (true) { + if (verbose) { std::cout << "[" << j << "] Token: " << token.first << ", Depth:" << token.second << std::endl; } @@ -1541,7 +1466,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( for (int j = total_tokens - 1; j >= 1; j--) { new_bc.num_tokens--; new_bc.requestsInfo[i].num_tokens_in_batch--; - std::cout << "cut: " << j << "\n"; + // std::cout << "cut: " << j << "\n"; if (new_bc.tokensInfo[j].abs_depth_in_request != new_bc.tokensInfo[j - 1].abs_depth_in_request) { break; @@ -1550,8 +1475,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else if (request.status == Request::PENDING) { - std::cout << "prepare next batch verify: pending\n" - << "\n"; new_bc.request_running[i] = false; if (verbose) { std::cout << "[Verify] Request " << request.guid @@ -1583,8 +1506,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( memcpy(&(new_bc.causalMask[i]), &(old_batches.at(0).causalMask[i]), sizeof(BatchConfig::BitMask)); - // std::cout << "bitmask: " << new_bc.causalMask[i].mask[0] << "\n"; - // assert(false); // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = @@ -1594,7 +1515,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; @@ -1608,9 
+1529,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << std::endl; if (request.llm_cache_size < request.initial_len) { - std::cout << "Initialization (prompt) phase: " - << new_bc.requestsInfo[i].num_tokens_in_batch << ", " - << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; + // std::cout << "Initialization (prompt) phase: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << ", " + // << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1618,8 +1539,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.tokens[request.llm_cache_size + j]; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; - std::cout << "load prompt tokens: " << j << ": " - << new_bc.tokensInfo[new_bc.num_tokens].token_id << "\n"; new_bc.num_tokens++; } @@ -1645,8 +1564,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else { // launch the request into running phase after loading all prompt if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { - std::cout << "Initialization running phase: " - << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; + // std::cout << "Initialization running phase: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; request.status = Request::RUNNING; new_bc.request_running[i] = true; @@ -1671,11 +1590,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } - std::cout << "how many tokens in verify? " << new_bc.num_tokens << "\n"; - - std::cout << "check dfs tree input size: " << dfs_tree_inputs[1000000].size() - << "\n"; - return new_bc; } @@ -1690,7 +1604,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, auto start_depth = old_bc.tokensInfo[0].abs_depth_in_request; int result_index = 0; - if (true) { + if (verbose) { std::cout << "Store total of " << old_bc.num_tokens << " tokens in the current batch.\n"; } @@ -1700,10 +1614,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != guid) { - std::cout << "i is: " << i << "old guid" << guid << " new guid" - << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] - .request_guid - << "\n"; + // std::cout << "i is: " << i << "old guid" << guid << " new guid" + // << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] + // .request_guid + // << "\n"; int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; @@ -1718,22 +1632,16 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // Count tokens sent to model in this request to find the final token's // index - - std::cout << "previous result index: " << result_index; - result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * beam_size; - std::cout << "after result index: " << result_index; - - // if (true) { - // std::cout << "i = " << i << ", result index = " << result_index - // << ", value: " << result.token_ids[result_index] - // << ", leaf node num: " << leaf_node_num << ", depth" << - // depth - // << ", beam size: " << beam_size << "\n"; - // } + if (verbose) { + std::cout << "i = " << i << ", result index = " << result_index + << ", value: " << result.token_ids[result_index] + << ", leaf node num: " << leaf_node_num << ", depth" << 
depth + << ", beam size: " << beam_size << "\n"; + } Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; @@ -1743,7 +1651,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, if (depth == 1) { // store the last input into the tree; - if (true) { + if (verbose) { std::cout << "try to store the input" << "\n"; } @@ -1756,13 +1664,11 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, .treeLayers[0] .nodes_num_this_layer = 1; - if (true) { + if (verbose) { std::cout << "Store the previous last token to the tree root: " << request.tokens.back() << "\n"; } } - - std::cout << "leaffffff: " << leaf_node_num << "\n"; request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .nodes_num_this_layer = leaf_node_num; @@ -1777,27 +1683,20 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .parent_ids[beam_id] = result.parent_id[result_index]; - // std::cout << "??????? beam id: " << beam_id << ", token: " - // << request.beam_trees.at(old_bc.model_id) - // .treeLayers[depth] - // .tokens[beam_id] - // << "\n"; - - // if (true) { - // std::cout << "tree value: " << depth << "token: " - // << request.beam_trees.at(old_bc.model_id) - // .treeLayers[depth] - // .tokens[beam_id] - // << "result tokens: " << result.token_ids[result_index]; - // } + + if (verbose) { + std::cout << "tree value: " << depth << "token: " + << request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .tokens[beam_id] + << "result tokens: " << result.token_ids[result_index]; + } result_index += 1; } // update the guid and start_depth for current request if (i < old_bc.num_tokens) { int new_req_idx = old_bc.tokensInfo[i].request_index; guid = old_bc.requestsInfo[new_req_idx].request_guid; - std::cout << "update guid: " << guid << ", request idx: " << index - << "\n"; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } @@ -1839,8 +1738,8 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, tree.treeLayers[depth].probs[j]; new_bc.beamRequestsInfo[request_index].tokens[j] = tree.treeLayers[depth].tokens[j]; - std::cout << "token: " << j << ": " - << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; + // std::cout << "token: " << j << ": " + // << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; } } if (verbose) { @@ -1892,13 +1791,13 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, "do not support tree size > 64"); assert(initLength >= 1 && "verified token num should >= 1"); - std::cout << "non tree size: " << non_tree_size << ", " - << bitmask.non_tree_cache_size << "\n"; + // std::cout << "non tree size: " << non_tree_size << ", " + // << bitmask.non_tree_cache_size << "\n"; bitmask.non_tree_cache_size = non_tree_size + initLength - 1; bitmask.tree_size = 1; bitmask.this_layer_size = initLength; - std::cout << "non_tree_size: " << non_tree_size << "\n"; + // std::cout << "non_tree_size: " << non_tree_size << "\n"; bitmask.prompt_size = 1; for (int i = 0; i < bitmask.prompt_size; i++) { for (int j = i; j < bitmask.prompt_size; j++) { @@ -1906,13 +1805,9 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, } } - std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; - std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) - << "\n"; - std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[1]) - << "\n"; - std::cout << "see bit mask update" << 
std::bitset<64>(bitmask.mask[2]) - << "\n"; + // std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) + // << "\n"; } // prepare next beam, append layers to the tree @@ -1987,16 +1882,10 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, // assert(false); // } - std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; - std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; - std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) - << "\n"; - std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[1]) - << "\n"; - std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[2]) - << "\n"; - std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[3]) - << "\n"; + // std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; + // std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) + // << "\n"; } bool PreOrder( @@ -2084,7 +1973,7 @@ std::vector> // depth) pairs for (auto const &pair : inputSerializedTree) { oss << " " << pair.second << ":" << pair.first; - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); } log_req_mgr.print("Input tree:%s", oss.str().c_str()); } @@ -2093,7 +1982,7 @@ std::vector> // outputSerializedTree is an array of (token id, depth + 1) pairs std::ostringstream oss; for (auto const &pair : outputSerializedTree) { - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); oss << " " << pair.second << ":" << pair.first; } log_req_mgr.print("Output tree:%s", oss.str().c_str()); @@ -2130,6 +2019,7 @@ std::vector> } // to avoid branch switch when same tokens in input tree. 
+ // todo, only checked for N->1->1->1 cases bool findFirst = false; layer_num = -1; @@ -2173,9 +2063,10 @@ std::vector> new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); // at this point, you'll not go other branches - std::cout << "verify tree push back: " << output.first - << ", tree size is: " << verifiedTree.size() - << ", ??: " << input.first << ", " << input.second << "\n"; + // std::cout << "verify tree push back: " << output.first + // << ", tree size is: " << verifiedTree.size() + // << ", ??: " << input.first << ", " << input.second << + // "\n"; } else { printf("not correct slot\n"); @@ -2190,9 +2081,9 @@ std::vector> committed_tokens.at(guid).at(i).second)); // // at this point, you'll not go other branches - std::cout << "verify tree push back: " << output.first - << ", tree size is: " << verifiedTree.size() - << ", ??: " << input.first << ", " << input.second << "\n"; + // std::cout << "verify tree push back: " << output.first + // << ", tree size is: " << verifiedTree.size() + // << ", ??: " << input.first << ", " << input.second << "\n"; } assert(committed_tokens.at(guid).at(i).first == input.second); @@ -2203,7 +2094,7 @@ std::vector> // log_req_mgr.print("========Verified============"); std::ostringstream oss; for (auto const &pair : verifiedTree) { - log_req_mgr.print("(%d, %d)", pair.first, pair.second); + // log_req_mgr.print("(%d, %d)", pair.first, pair.second); oss << " " << pair.second << ":" << pair.first; } log_req_mgr.print("Verified:%s", oss.str().c_str()); @@ -2225,7 +2116,7 @@ std::vector> RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, int first_token_depth_in_request) { - if (true) { + if (verbose) { std::cout << "[Traverse Beam Tree] request_index: " << request_index << "\n"; std::cout << "[Traverse Beam Tree] max_depth: " @@ -2269,13 +2160,13 @@ std::vector> // verbose); // print it - if (true) { + if (verbose) { std::cout << "Print serialized tree: size:" << request_index << serializedTree.size() << "\n"; } for (int k = 0; k < serializedTree.size(); k++) { serializedTree.at(k).second += first_token_depth_in_request; - if (true) { + if (verbose) { std::cout << "token id: " << serializedTree.at(k).first << ", depth: " << serializedTree.at(k).second << "\n"; } @@ -2354,9 +2245,6 @@ std::vector> } dfs_tree_inputs[guid] = merged_tree; - // std::cout << "assign dfr tree: " << guid << ", " << merged_tree.size() << - // ", " - // << dfs_tree_inputs[guid].size() << "\n"; return merged_tree; } diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index bb6b6030aa..bb20fb263f 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -212,7 +212,6 @@ void RequestManager::load_batch_config_task( } // add a size check - std::cout << "hahaha handle.batch_config_metadata_size: " << handle.batch_config_metadata_size << ", "<< total_copy_size << "\n"; assert(total_copy_size <= handle.batch_config_metadata_size); } From b621f2a9f62f24a8112df7af3850dc3bdb494dc7 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 17:25:28 -0500 Subject: [PATCH 16/61] . 
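This commit bumps the default speculative-inference limits in inference/spec_infer/spec_infer.cc (16 requests per batch, 256 tokens per batch, 1024-token max sequence length) and trims the debug tensor print in cuda_helper.cu to 100 elements. A minimal sketch of how these defaults are expected to reach the RequestManager inside top_level_task is below; get_request_manager() and set_max_requests_per_batch() are assumed here, while set_max_tokens_per_batch() and set_max_sequence_length() are declared in include/flexflow/request_manager.h in this series.

    #include "flexflow/request_manager.h"
    using namespace FlexFlow;

    // Sketch only, not part of this patch: wiring the bumped defaults into
    // the RequestManager. The singleton accessor and the requests-per-batch
    // setter are assumptions based on the surrounding code.
    static void configure_request_manager_defaults() {
      int max_requests_per_batch = 16; // was 10
      int max_tokens_per_batch = 256;  // was 199
      int max_sequence_length = 1024;  // was 200
      RequestManager *rm = RequestManager::get_request_manager();
      rm->set_max_requests_per_batch(max_requests_per_batch);
      rm->set_max_tokens_per_batch(max_tokens_per_batch);
      rm->set_max_sequence_length(max_sequence_length);
    }
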
--- inference/spec_infer/spec_infer.cc | 10 +++++----- src/runtime/cuda_helper.cu | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 258b2d78eb..b369a13c1d 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -266,9 +266,9 @@ void FlexFlow::top_level_task(Task const *task, ModelMeta model_metadata; bool use_full_precision = false; bool verbose = false; - int max_requests_per_batch = 10; - int max_tokens_per_batch = 199; - int max_sequence_length = 200; + int max_requests_per_batch = 16; + int max_tokens_per_batch = 256; + int max_sequence_length = 1024; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -302,7 +302,7 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); - //first decoding step: 3 results + // first decoding step: 3 results rm->push_spec_infer_tree_width(3); // Create LLM model @@ -402,7 +402,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); + // tree_model.generate(text, 128 /*max_sequence_length*/); } tree_model.generate(prompts, 128 /*max_sequence_length*/); } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 398ed7f3cd..fa6bf55fe5 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -226,7 +226,7 @@ __host__ void print_tensor(T const *ptr, printf("%s, %d---->", prefix, shard_id); for (idx = 0; idx < num_elements; idx++) { printf(" %.20lf", (float)host_ptr[idx]); - if (idx >= 200) { + if (idx >= 100) { break; } } From 8a0b007bfe20b50302ad201c01c7ac1dfb30a25a Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 18:49:19 -0500 Subject: [PATCH 17/61] load batchconfig --- src/ops/inc_multihead_self_attention.cpp | 4 ++-- src/runtime/inference_manager.cc | 9 ++++----- src/runtime/model.cpp | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index a59740f4a3..00cc4d8868 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -1106,7 +1106,7 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( float const *weight_ptr, float const *bias_ptr, int num_tokens, - cudaStream_t stream); + hipStream_t stream); template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -1115,6 +1115,6 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( half const *weight_ptr, half const *bias_ptr, int num_tokens, - cudaStream_t stream); + hipStream_t stream); }; // namespace FlexFlow diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 52a1efc2ab..8af0ed8978 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -398,11 +398,10 @@ void InferenceManager::load_inference_metadata_batch_config( Runtime *runtime = ff_config.lg_hlr; ArgumentMap argmap; - Rect<1> task_rect( - Point<1>(0), Point<1>(ff_config.workersPerNode * ff_config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + Domain domain = + runtime->get_index_space_domain(ctx, 
ff_config.all_gpu_task_is); + Rect<1> task_rect = domain; - // int rank = 0; int idx = 0; for (PointInRectIterator<1> it(task_rect); it(); it++) { FFHandler handler = handlers[idx++]; @@ -410,7 +409,7 @@ void InferenceManager::load_inference_metadata_batch_config( } IndexLauncher launcher(RM_LOAD_BATCH_CONFIG_TASK_ID, - task_is, + ff_config.all_gpu_task_is, TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 5499a280a8..ad2b781567 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -152,7 +152,7 @@ FFHandler .wait(); handle.offload_reserve_space = workspaceInst.pointer_untyped(0, sizeof(char)); - } else { + } else { handle.offload_reserve_space = nullptr; } if (handle.batch_config_metadata_size > 0) { From 17a718f95523ed3892d0324ed493ef6043607b13 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 19:18:22 -0500 Subject: [PATCH 18/61] clean --- .../inc_multihead_self_attention_utils.cuh | 4 +- src/ops/argmax.cc | 1 - src/ops/beam_topk.cc | 2 - src/ops/inc_multihead_self_attention.cu | 7 +- src/ops/spec_inc_multihead_self_attention.cu | 111 ++++++------------ src/ops/tree_inc_multihead_self_attention.cu | 13 +- 6 files changed, 49 insertions(+), 89 deletions(-) diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index 1b21a80dc9..c128c1a126 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -456,7 +456,7 @@ inline size_t smem_size_in_bytes(int hidden_size_per_head, int threads_per_block) { // The amount of shared memory needed to store the Q*K^T values in float. - size_t qk_sz = div_up(2000 + 1, 4) * 16; + size_t qk_sz = div_up(max_sequence_length + 1, 4) * 16; size_t logits_sz = qk_sz; // The total size needed during softmax. @@ -493,7 +493,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, } // todo fix this - int max_qk_length = max_query_length * max_total_length + 1000; + int max_qk_length = max_query_length * max_total_length; // The amount of shared memory needed to store the Q*K^T values in float. 
size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index c3bb3d493e..dc7e4ea3b3 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -352,7 +352,6 @@ BeamInferenceResult GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - BeamInferenceResult ir; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 87d357b535..18d0ec1587 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -398,8 +398,6 @@ BeamInferenceResult download_tensor( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); - // print_tensor(index_ptr, 32, "indexxxxxxx"); - if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index cca0b230c3..da70e23f87 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1381,7 +1381,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(false && "Unkown inference mode"); } size_t requestinfo_size = BatchConfig::max_requests_per_batch(); - size_t tokeninfo_size = max_tokens_per_batch; + // size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; @@ -1438,8 +1438,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( token_infos = static_cast(handler.batch_config_metadata); - request_infos = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo)); + request_infos = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo)); if (offload) { // token_infos = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index b3a87fe244..88dd3f92e4 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -82,29 +82,20 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( int const first_step = 0; - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; + // int const tlength = + // request_infos[batch_config_request_id].first_token_depth_in_request + + // request_infos[batch_config_request_id].num_tokens_in_batch; int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("spec inc attn fused kernel %d, %d\n", - // totalCacheSize, - // request_infos[batch_config_request_id].num_tokens_in_batch); - // } - // int const qlength = request_infos[request_idx].num_tokens_in_batch; - int const tree_branch_num = - beam_request_infos[batch_config_request_id].sub_request_num; - - // will decode qlength tokens in this thread block - // int const qlength = tree_branch_num; - int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { first_token_idx += causalMask[r].this_layer_size; } + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; + // shared memory objects extern __shared__ char smem_[]; @@ -338,20 +329,14 @@ 
__global__ void spec_inc_store_kv_cache( DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const first_token_in_req = - requestInfo[req_id].first_token_depth_in_request; - int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const total_token = requestInfo[req_id].num_tokens_in_batch; + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; int const request_token_offset = requestInfo[req_id].first_token_offset_in_batch; BatchConfig::BitMask bitmask = causalMask[req_id]; - int const sub_request_num = beamRequestInfos[req_id].sub_request_num; - - int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; + // int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - // tree_branch_num + sub_req_id + tok_id; @@ -379,9 +364,9 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; spec_inc_store_kv_cache<<>>( + min(CUDA_NUM_THREADS, parallelism), + 0, + stream>>>( static_cast
(m->devQKVProjArray), static_cast<DT *>(m->keyCache), static_cast<DT *>
(m->valueCache), @@ -401,19 +386,19 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, } } -#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ +#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_sz = smem_size_in_bytes
(m->qProjSize, \ BatchConfig::max_sequence_length() + \ BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ THREADS_PER_VALUE, \ THDS_PER_BLOCK); \ - compute_spec_inc_attention_kernel_generation_kernel \ + compute_spec_inc_attention_kernel_generation_kernel \ <<>>( \ static_cast
(m->devQKVProjArray), \ static_cast<DT *>
(m->keyCache), \ @@ -470,14 +455,13 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel_prompt( - SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { +void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *bias_ptr, + DT const *weight_ptr, + cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); @@ -812,8 +796,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); - printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", - elapsed); + printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); // print_tensor<3, float>(acc_query.ptr, acc_query.rect, // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, // acc_output.rect, "[Attention:forward:output]"); @@ -860,51 +843,29 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - // size_t causal_mask_size = BatchConfig::MAX_NUM_REQUESTS; - // size_t total_size = causal_mask_size * sizeof(BatchConfig::BitMask); - // gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - // total_size); - beam_token_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); beam_request_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + + causalMask = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - // causalMask = gpu_mem_allocator.allocate_instance( - // causal_mask_size); - // beam_token_infos = - // gpu_mem_allocator - // .allocate_instance( - // beam_tokeninfo_size); - // offset += beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); - // beam_request_infos = - // gpu_mem_allocator - // .allocate_instance( - // beam_requestinfo_size); - // offset += beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); - // assert(offset == total_size); - // assert(gpu_mem_allocator.instance_total_size == - // gpu_mem_allocator.instance_allocated_size); } cudaStreamSynchronize(stream); } -SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta( - void) { +SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) { if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) { beam_search_reserve_inst.destroy(); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu 
b/src/ops/tree_inc_multihead_self_attention.cu index 5c6527baf9..b4af80976f 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -445,7 +445,7 @@ __global__ void update_tree_branch_kv_cache_fused( DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; int const request_token_offset = request_infos[req_id].first_token_offset_in_batch; @@ -1059,12 +1059,13 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - causalMask = static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); + causalMask = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); committed_token_infos = - static_cast( - handler.batch_config_metadata + sizeof(BatchConfig::tokensInfo) + + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BatchConfig::causalMask)); } From c8a107b1b75e5c90a9c7329ab2618b940a4b260f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 19:19:45 -0500 Subject: [PATCH 19/61] hip --- src/ops/inc_multihead_self_attention.cpp | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 00cc4d8868..d60386f927 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -1098,23 +1098,4 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int num_tokens, - hipStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - half *output_ptr, - half const *weight_ptr, - half const *bias_ptr, - int num_tokens, - hipStream_t stream); - }; // namespace FlexFlow From 42e1b5d92cf3e93e3f56d3d18d3fb68803b6caaf Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sat, 30 Dec 2023 20:42:49 -0500 Subject: [PATCH 20/61] hip --- src/runtime/request_manager.cpp | 95 +++++++++++++++++--- src/runtime/request_manager.cu | 154 +++++++++----------------------- 2 files changed, 123 insertions(+), 126 deletions(-) diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index 9635b3bc1e..fadbf80d6d 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -56,22 +56,91 @@ void RequestManager::load_tokens_task( sizeof(TokenId) * batch_config->num_tokens, hipMemcpyHostToDevice, stream)); +} + +void RequestManager::load_batch_config_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 0); + assert(task->regions.size() == 0); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); 
// copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - cudaMemcpyAsync(handle.batch_config_metadata, - &(batch_config->tokensInfo), - batch_config->num_active_tokens() * - sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo), - &(batch_config->requestsInfo), - batch_config->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); + size_t total_copy_size = 0; + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + sizeof(BatchConfig::tokensInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::tokensInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->requestsInfo), + sizeof(BatchConfig::requestsInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::requestsInfo); + + // load speculative metadata + if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + BeamSearchBatchConfig const *beam_batch_config = + static_cast(batch_config); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + hipMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + hipMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::causalMask); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + TreeVerifyBatchConfig const *tree_batch_config = + static_cast(batch_config); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + } + + // add a size check + assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index bb20fb263f..51c52c3026 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -56,78 +56,6 @@ void RequestManager::load_tokens_task( sizeof(TokenId) * batch_config->num_tokens, cudaMemcpyHostToDevice, stream)); - - // // copy meta data to workSpace - // FFHandler handle = *((FFHandler const *)task->local_args); - // size_t total_copy_size = 0; - // cudaMemcpyAsync(handle.batch_config_metadata, - // &(batch_config->tokensInfo), - // sizeof(BatchConfig::tokensInfo), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += sizeof(BatchConfig::tokensInfo); - - // 
cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(batch_config->requestsInfo), - // sizeof(BatchConfig::requestsInfo), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += sizeof(BatchConfig::requestsInfo); - - // // load speculative metadata - // if (batch_config->get_mode() == BEAM_SEARCH_MODE) { - // BeamSearchBatchConfig const *beam_batch_config = - // static_cast(batch_config); - - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(beam_batch_config->beamTokenInfo), - // sizeof(BeamSearchBatchConfig::beamTokenInfo), - // cudaMemcpyHostToDevice, - // stream); - - // total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(beam_batch_config->beamRequestsInfo), - // sizeof(BeamSearchBatchConfig::beamRequestsInfo), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(beam_batch_config->causalMask), - // sizeof(BatchConfig::causalMask), - // cudaMemcpyHostToDevice, - // stream); - - // total_copy_size += sizeof(BatchConfig::causalMask); - // } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { - // TreeVerifyBatchConfig const *tree_batch_config = - // static_cast(batch_config); - - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(tree_batch_config->causalMask), - // sizeof(BatchConfig::causalMask), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += sizeof(BatchConfig::causalMask); - // cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - // total_copy_size, - // &(tree_batch_config->committed_tokens), - // sizeof(TreeVerifyBatchConfig::committed_tokens), - // cudaMemcpyHostToDevice, - // stream); - // total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - // } - - // // add a size check - // std::cout << "handle.batch_config_metadata_size: " << handle.batch_config_metadata_size << ", "<< total_copy_size << "\n"; - // assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_batch_config_task( @@ -146,19 +74,19 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); size_t total_copy_size = 0; - cudaMemcpyAsync(handle.batch_config_metadata, - &(batch_config->tokensInfo), - sizeof(BatchConfig::tokensInfo), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + sizeof(BatchConfig::tokensInfo), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BatchConfig::tokensInfo); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(batch_config->requestsInfo), - sizeof(BatchConfig::requestsInfo), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->requestsInfo), + sizeof(BatchConfig::requestsInfo), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata @@ -166,48 +94,48 @@ void RequestManager::load_batch_config_task( BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - 
&(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BatchConfig::causalMask); } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(BatchConfig::causalMask); - cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); } From 4957b7c7d4c73a6fca94ea40f140319b50b49e9a Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Sat, 30 Dec 2023 23:24:37 -0500 Subject: [PATCH 21/61] Specinfer - new kernel (#1252) * init * fix speculative * fix speculative * bitmap+tree verify * fix. * fix * multi batch * copy metadata once * fix some corner cases * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * more fix. * clean up * . 
* load batchconfig * clean * hip * hip --------- Co-authored-by: Zhihao Jia --- include/flexflow/batch_config.h | 29 +- include/flexflow/config.h | 11 + include/flexflow/model.h | 1 + .../ops/spec_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + include/flexflow/request_manager.h | 33 +- inference/models/llama.cc | 4 +- inference/spec_infer/spec_infer.cc | 3 + src/ops/argmax.cc | 2 +- src/ops/beam_topk.cc | 2 +- src/ops/beam_topk.cu | 65 +- src/ops/embedding.cc | 18 +- src/ops/inc_multihead_self_attention.cu | 81 +- src/ops/spec_inc_multihead_self_attention.cc | 12 +- src/ops/spec_inc_multihead_self_attention.cu | 964 +++++++++++------- src/ops/tree_inc_multihead_self_attention.cu | 232 +++-- src/runtime/inference_manager.cc | 56 +- src/runtime/model.cc | 48 +- src/runtime/model.cpp | 48 + src/runtime/model.cu | 25 + src/runtime/request_manager.cc | 639 +++++++++--- src/runtime/request_manager.cpp | 85 ++ src/runtime/request_manager.cu | 86 ++ 23 files changed, 1727 insertions(+), 719 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index e2903c4d11..13904aaa46 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -56,6 +56,7 @@ class BatchConfig { // across workers static int const MAX_NUM_REQUESTS = 64; static int const MAX_NUM_TOKENS = 1024; + static int const MAX_SPEC_TREE_TOKEN_NUM = 64; // Set by update int num_tokens; @@ -68,6 +69,9 @@ class BatchConfig { int first_token_offset_in_batch; int num_tokens_in_batch; int max_sequence_length; + + // request id in batch config: + int batch_config_request_id; RequestGuid request_guid; }; struct PerTokenInfo { @@ -75,6 +79,24 @@ class BatchConfig { int request_index; TokenId token_id; }; + + struct BitMask { + unsigned long long mask[MAX_SPEC_TREE_TOKEN_NUM] = {0}; + + // how many tokens before the tree, every sub requests need this part of + // cache + int non_tree_cache_size; + + // current tree size + int tree_size; + + int this_layer_size; + + // input length-> prompt/root + int prompt_size; + }; + + BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; @@ -126,9 +148,12 @@ class BeamSearchBatchConfig : public BatchConfig { size_t beam_width; size_t target_iterations; - inline static int const MAX_BEAM_WIDTH = 1; + inline static int const MAX_BEAM_WIDTH = 3; inline static int const MAX_BEAM_DEPTH = 8; + // maximum tree branches for a request + inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3; + int model_id; struct BeamSearchPerRequestInfo { @@ -139,6 +164,7 @@ class BeamSearchBatchConfig : public BatchConfig { BatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; int parent_id[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int sub_request_num; }; struct BeamSearchPerTokenInfo { @@ -147,6 +173,7 @@ class BeamSearchBatchConfig : public BatchConfig { BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; + // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? 
int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index c2af6d707c..e1480264cc 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -16,6 +16,7 @@ #ifndef _FLEXFLOW_CONFIG_H_ #define _FLEXFLOW_CONFIG_H_ #include "ffconst.h" +#include "flexflow/batch_config.h" #include "legion.h" #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -75,6 +76,15 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; + void *batch_config_metadata; + + // request info + token info + topolopgy mask info + size_t batch_config_metadata_size = + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + + sizeof(BatchConfig::causalMask) + + sizeof(TreeVerifyBatchConfig::committed_tokens); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; @@ -132,6 +142,7 @@ class FFConfig { size_t workSpaceSize; Legion::Context lg_ctx; Legion::Runtime *lg_hlr; + Legion::IndexSpaceT<1> all_gpu_task_is; // Legion::FieldSpace field_space; bool syntheticInput, profiling, perform_fusion; bool inference_debugging; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d8402ba622..16df99ab1a 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -240,6 +240,7 @@ enum TaskIDs { // InferenceManager & RequestManager RM_LOAD_TOKENS_TASK_ID, RM_LOAD_POSITION_TASK_ID, + RM_LOAD_BATCH_CONFIG_TASK_ID, RM_PREPARE_NEXT_BATCH_TASK_ID, RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index 56bb2bd80d..a306f7985a 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -142,6 +142,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { Realm::RegionInstance beam_search_reserve_inst; BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; + BatchConfig::BitMask *causalMask; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 6e2da19ce9..d160da4a72 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -147,6 +147,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int num_active_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; + BatchConfig::BitMask *causalMask; }; }; // namespace FlexFlow diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index baf6844801..1c4b0b2a2f 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -38,10 +38,13 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, - ParallelTensor const input); + ParallelTensor const input, + FFHandler *handlers); void load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset); + void load_inference_metadata_batch_config(BatchConfigFuture 
const &bc, + FFHandler *handlers); public: FFConfig ff_config; @@ -72,9 +75,10 @@ struct Request { struct BeamTree { struct treeLayer { BeamSearchBatchConfig::TokenId - tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + int nodes_num_this_layer = 0; }; treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1]; }; @@ -100,6 +104,7 @@ class RequestManager { void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); + void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, @@ -107,6 +112,16 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void initBitMask(BatchConfig::BitMask &bitmask, int initLength); + void appendBitMask(BatchConfig::BitMask &bitmask, + int newNodes, + int preBeamSize, + int old_sub_num, + BeamTree const tree, + int currentDepth); + void updateBitMask(BatchConfig::BitMask &bitmask, + int initLength, + int non_tree_size); FFModel *get_model(int model_id); @@ -148,6 +163,7 @@ class RequestManager { void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); void update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamSearchBatchConfig const &old_bc, BeamTree &tree, int request_index); @@ -181,6 +197,11 @@ class RequestManager { Legion::Context ctx, Legion::Runtime *runtime); + static void + load_batch_config_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static BatchConfig prepare_next_batch_task( Legion::Task const *task, std::vector const ®ions, @@ -210,6 +231,9 @@ class RequestManager { int max_requests_per_batch; int max_tokens_per_batch; int max_sequence_length; + + // tree width in each speculative step, if not specified 1 + std::vector spec_infer_tree_width; // private fields std::unique_ptr tokenizer_; bool verbose; @@ -243,7 +267,8 @@ class RequestManager { private: struct ProfileInfo { - int decoding_steps; + int llm_decoding_steps; + int ssm_decoding_steps; double start_time, finish_time; }; std::unordered_map profiling_requests; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index b8fe70526d..10001ee916 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -246,7 +246,9 @@ void LLAMA::create_llama_model(FFModel &ff, if (mode == BEAM_SEARCH_MODE) { Tensor softmax = ff.softmax(dense, -1); // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); - output = ff.argmax(softmax, /*beam_Search*/ true); + // output = ff.argmax(softmax, /*beam_Search*/ true); + output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + // output = ff.top_k(softmax, ) } else { // Tensor softmax = ff.softmax(dense, -1); if (generation_config.do_sample) { diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8b0eb926d9..b369a13c1d 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -302,6 +302,9 @@ void FlexFlow::top_level_task(Task const *task, model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); + // first 
decoding step: 3 results + rm->push_spec_infer_tree_width(3); + // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); if (model_metadata.llm_model_type == ModelType::LLAMA) { diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index f336c843e8..dc7e4ea3b3 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -352,7 +352,6 @@ BeamInferenceResult GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - BeamInferenceResult ir; download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); @@ -398,6 +397,7 @@ InferenceResult ArgMax::save_inference_tensors_to_file( m, shard_id, bc, {}, {}, {input, indices}); } + download_tensor( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 2883428254..18d0ec1587 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -366,7 +366,7 @@ BeamInferenceResult GenericTensorAccessorW value = helperGetGenericTensorAccessorWO( DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime); + DT_INT32, regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 72ab7862a6..a958786be3 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -556,8 +556,6 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, int beam_size = bc->beamRequestsInfo[i].beam_size; // initial request - log_beam_topk.debug() << "sub_requests: " << i << ", " << sub_requests[i] - << "\n"; assert(sub_requests[i] > 0); // process sub requests for (int j = 0; j < sub_requests[i]; j++) { @@ -565,12 +563,13 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, // beam_slots[i].parent_id[j]; acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = bc->beamRequestsInfo[i].probs[j]; - log_beam_topk.debug() - << "probbbb req: " << i - << ", sub req probability : " << bc->beamRequestsInfo[i].probs[j] - << ", sub request id " << j << ", parent id " - << bc->beamRequestsInfo[i].parent_id[j] << ", data inddd" - << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j << "\n"; + // std::cout << "probbbb req: " << i << ", sub req probability : " + // << bc->beamRequestsInfo[i].probs[j] << ", sub request id " << + // j + // << ", parent id " << bc->beamRequestsInfo[i].parent_id[j] + // << ", data inddd" + // << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j + // << "\n"; } // process tokens @@ -584,6 +583,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]); max_beam_width = std::max(max_beam_width, beam_size); + req_index += 1; block_start_index += (sub_requests[i] - 1) * num_new_tokens * length; } @@ -613,28 +613,37 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m, assert(num_shards >= (size_t)max_heap_size); num_shards = max_heap_size; - checkCUDA(cudaMemcpy(m->parent_ids, - parent_ids, - sizeof(int) * max_total_requests, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->acc_probs, - acc_probs, - sizeof(DT) * max_total_requests, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->block_start_index, - beam_block_start_index.data(), - sizeof(int) * 
beam_num_blocks, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->request_id, - request_id.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); - checkCUDA(cudaMemcpy(m->tokens_per_request, - tokens_per_request.data(), - sizeof(int) * beam_num_blocks, - cudaMemcpyHostToDevice)); + checkCUDA(cudaMemcpyAsync(m->parent_ids, + parent_ids, + sizeof(int) * max_total_requests, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->acc_probs, + acc_probs, + sizeof(DT) * max_total_requests, + cudaMemcpyHostToDevice, + stream)); + // trick, set acc_probs to 0; + checkCUDA(cudaMemsetAsync( + m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream)); + checkCUDA(cudaMemcpyAsync(m->block_start_index, + beam_block_start_index.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->request_id, + request_id.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(m->tokens_per_request, + tokens_per_request.data(), + sizeof(int) * beam_num_blocks, + cudaMemcpyHostToDevice, + stream)); // int depth = // bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth; + beam_num_blocks = bc->num_active_tokens(); beam_topk_forward_kernel<<>>( input_ptr, shared_memory_size, diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 007e799fe0..76236e65ff 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -155,11 +155,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } else { int num_dims = input->num_dims; @@ -170,11 +167,8 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; - // Currently do not support parallelizing over the replica dim - output_dims[num_dims - 1].size = 1; - output_dims[num_dims - 1].degree = 1; - output_dims[num_dims - 1].parallel_idx = -1; - output_dims[num_dims - 1].is_replica_dim = true; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } // const int REPLICA = this->output_vocab_size_replica_dim(); @@ -189,13 +183,13 @@ int Embedding::weight_size(ParallelDim weight_dims[MAX_TENSOR_DIM]) { weight_dims[Weight::VOCAB_SIZE].size = this->num_entries; weight_dims[Weight::VOCAB_SIZE].degree = 1; weight_dims[Weight::VOCAB_SIZE].parallel_idx = -1; - for (int i = 2; i < input->num_dims; i++) { + for (int i = 2; i < input->num_dims + 1; i++) { weight_dims[i].size = input->dims[i - 1].degree; weight_dims[i].degree = weight_dims[i].size; weight_dims[i].parallel_idx = input->dims[i - 1].parallel_idx; weight_dims[i].is_replica_dim = true; } - return input->num_dims; + return input->num_dims + 1; } void Embedding::register_output_mappings() { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 695f4b13b9..da70e23f87 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ 
b/src/ops/inc_multihead_self_attention.cu @@ -82,6 +82,9 @@ __global__ void compute_attention_kernel_generation_kernel( // request idx int const request_idx = blockIdx.y; + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + int const beam_request_idx = is_beam ? request_idx / max_beam_width : request_idx; int const beam_sub_request_idx = is_beam ? request_idx % max_beam_width : 0; @@ -89,8 +92,8 @@ __global__ void compute_attention_kernel_generation_kernel( int const first_step = 0; int const tlength = - request_infos[beam_request_idx].first_token_depth_in_request + - request_infos[beam_request_idx].num_tokens_in_batch; + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; // shared memory objects extern __shared__ char smem_[]; @@ -103,7 +106,8 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + beam_request_idx * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + + batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = @@ -139,7 +143,7 @@ __global__ void compute_attention_kernel_generation_kernel( DT const *k_cache_batch = key_cache + - (beam_request_idx * max_beam_width + beam_sub_request_idx) * + (batch_config_request_id * max_beam_width + beam_sub_request_idx) * max_seq_length * hidden_size + ki; @@ -245,7 +249,7 @@ __global__ void compute_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = value_cache + - (beam_request_idx * max_beam_width + beam_sub_request_idx) * + (batch_config_request_id * max_beam_width + beam_sub_request_idx) * max_seq_length * hidden_size + vi; @@ -825,19 +829,6 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, bias_ptr = static_cast
(m->bias_ptr); } - // todo Xinhao copy how many requests if requests are not continous? - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); - // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -1364,8 +1355,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { - case INC_DECODING_MODE: - case TREE_VERIFY_MODE: { + case INC_DECODING_MODE: { key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); @@ -1374,22 +1364,24 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case BEAM_SEARCH_MODE: + case TREE_VERIFY_MODE: { + // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); break; } default: assert(false && "Unkown inference mode"); } size_t requestinfo_size = BatchConfig::max_requests_per_batch(); - size_t tokeninfo_size = max_tokens_per_batch; + // size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; @@ -1400,11 +1392,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + - complex_size * sizeof(cuFloatComplex) + - requestinfo_size * - sizeof(BatchConfig::PerRequestInfo); // more components will - // be added here later + complex_size * sizeof(cuFloatComplex); // more components will + // be added here later if (offload) { // assert that we have enough reserved work space left size_t totalSharedSize = @@ -1447,10 +1436,16 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); + token_infos = + static_cast(handler.batch_config_metadata); + request_infos = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo)); + if (offload) { - token_infos = - gpu_mem_allocator.allocate_reserved( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_reserved( + // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); @@ -1464,13 +1459,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(cuFloatComplex); - request_infos = - 
gpu_mem_allocator.allocate_reserved( - requestinfo_size); + // request_infos = + // gpu_mem_allocator.allocate_reserved( + // requestinfo_size); } else { - token_infos = - gpu_mem_allocator.allocate_instance( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_instance( + // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -1479,9 +1474,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); - request_infos = - gpu_mem_allocator.allocate_instance( - requestinfo_size); + // request_infos = + // gpu_mem_allocator.allocate_instance( + // requestinfo_size); } // allocate more size for quantization data diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index eb6fd721e6..5d234df822 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -53,7 +53,7 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid( } Tensor - FFModel::spec_inc_multihead_self_attention(const Tensor input, + FFModel::spec_inc_multihead_self_attention(Tensor const input, int embed_dim, int num_heads, int kdim, @@ -91,7 +91,7 @@ Tensor } Tensor - FFModel::spec_inc_multiquery_self_attention(const Tensor input, + FFModel::spec_inc_multiquery_self_attention(Tensor const input, int embed_dim, int num_q_heads, int num_kv_heads, @@ -257,7 +257,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, LayerID const &_layer_guid, - const ParallelTensor _input, + ParallelTensor const _input, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -358,8 +358,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, - const ParallelTensor _input, - const ParallelTensor _weight, + ParallelTensor const _input, + ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, @@ -465,7 +465,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, SpecIncMultiHeadSelfAttention const &other, - const ParallelTensor input, + ParallelTensor const input, bool allocate_weights) : SpecIncMultiHeadSelfAttention(model, other.layer_guid, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 562dee4d93..88dd3f92e4 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -23,16 +23,286 @@ namespace FlexFlow { +#define WARP_SIZE 32 + // declare Legion names using Legion::coord_t; using Legion::Memory; using namespace Kernels::IncMultiHeadAttention; namespace Kernels { -namespace SpecIncMultiHeadAttention { +namespace SpecIncMultiHeadSelfAttention { + +template +__global__ void compute_spec_inc_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, + BatchConfig::BitMask *causalMask) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; 
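+ // Register-vector types: Q_vec/K_vec hold the vectorized query/key loads, V_vec the value loads, and Out_sum accumulates the weighted values in fp32.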
+ using V_vec = typename VEC_V<DT>
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // nth request idx + int const request_idx = blockIdx.y; + + // request id in batch config + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + // request_idx = re + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int const first_step = 0; + + // int const tlength = + // request_infos[batch_config_request_id].first_token_depth_in_request + + // request_infos[batch_config_request_id].num_tokens_in_batch; + + int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + + int first_token_idx = 0; + for (int r = 0; r < request_idx; r++) { + first_token_idx += causalMask[r].this_layer_size; + } + + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. 
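+ // Each key is processed cooperatively by THREADS_PER_KEY lanes, so one warp covers WARP_SIZE / THREADS_PER_KEY keys per pass.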
+ constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < tree_branch_num; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + + int const query_token = bitmask.tree_size - tree_branch_num + qi; + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < totalCacheSize) { + + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + // bool const mask = ti_circ >= totalCacheSize; + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { + // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); + // } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn first token qk_max %.10f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. 
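+ // vo picks the group of cached timesteps this thread accumulates over (stride V_PER_ITER); vi is its V_VEC_SIZE-wide slice of the per-head hidden dimension.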
+ int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. 
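+ // After the tree reduction above, the vo == 0 group holds the complete weighted sum and writes it back to output_ptr.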
+ if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + } + } +} template -__global__ void spec_store_kv_cache( +__global__ void spec_inc_store_kv_cache( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, @@ -40,16 +310,16 @@ __global__ void spec_store_kv_cache( BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, int vProjSize, int num_tokens, int max_seq_len, - int max_beam_width, bool is_root, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / (hidden_size); int offset = i % hidden_size; size_t val_idx = @@ -58,100 +328,30 @@ __global__ void spec_store_kv_cache( DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; - // above no need to be changed - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - // int const sub_req_id = id_map[token_idx].sub_request_index; - // int const parent_id = id_map[token_idx].parent_id; - // int const beam_depth = id_map[token_idx].beam_depth; - // int const beam_width = id_map[token_idx].beam_width; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; - int const beam_depth = beamRequestInfos[req_id].current_depth; - int const beam_width = beamRequestInfos[req_id].beam_size; - - kCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - - // replica in the root iteration - if (beam_depth == 1) { - for (int i = 1; i < beam_width; i++) { - kCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - } - } + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - // if (head_idx == 0 && beam_depth == 0 && token_idx == 8 && k_cache) { - // // printf("token idx %d\n", token_idx); - // printf("data idx: %d, tok_id %d, new_token_cache_idx %d, parent_id %d, - // " - // "sub_req_id %d, num_tokens %d, kProjSize %d, num_kv_heads %d, - // val " - // "%f, beam_width %d\n", - // data_idx, - // tok_id, - // new_token_cache_idx, - // parent_id, - // sub_req_id, - // num_tokens, - // kProjSize, - // num_kv_heads, - // val, - // beam_width); - // } + int const request_token_offset = + requestInfo[req_id].first_token_offset_in_batch; - // naive cache stealing - if (sub_req_id != parent_id) { - if (offset == 0 && tok_id == 0) { - printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " - "%d, tok_id %d\n", - beam_depth, - req_id, - sub_req_id, - parent_id, - tok_id); - } + BatchConfig::BitMask bitmask = causalMask[req_id]; - for (int depth = 0; depth < beam_depth; depth++) { - int steal_token_idx = tok_id - beam_depth + depth; - 
int steal_from_idx = (req_id * max_beam_width + parent_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - int steal_to_idx = (req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - kCache_ptr[steal_to_idx] = kCache_ptr[steal_from_idx]; - vCache_ptr[steal_to_idx] = vCache_ptr[steal_from_idx]; - - // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ - // printf("cache stealing kernel!, steal_token_idx %d\n", - // steal_token_idx); - // } - } - } + // int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; + + // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - + // tree_branch_num + sub_req_id + tok_id; + // bitmask.tree_size - tree_branch_num + sub_req_id; + + // if prompt token -> token id + // if tree token: + int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - + bitmask.this_layer_size + token_idx - + request_token_offset; - // parallel cache stealing not yet implemented - // logic shld be - // launch spec_store_kv_cache with parallelism * current depth - // from the i here, get depth index - // if depth index not the current one, check if we need to steal - // steal if needed - - // cache stealing theory - // identify which sub request does this token come from - // for initial token, 0 - // for other, may 0,0,1/ 0,1,2/ 1,1,1 to get which cache to be reuse and - // which to be delete copy beam_size bunch of blocks when sub_req_id == - // parent_id : like 0 -> 0, 1->1, 2->2, do nothing, just append the new k/v + kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = vVal; } } @@ -161,28 +361,79 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { int num_tokens = bc->num_active_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; - // printf("curr depth: %d\n", curr_depth); - // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - spec_store_kv_cache<<>>(static_cast
(m->devQKVProjArray), - static_cast
(m->keyCache), - static_cast
(m->valueCache), - m->token_infos, - m->request_infos, - m->beam_token_infos, - m->beam_request_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_tokens, - BatchConfig::max_sequence_length(), - BeamSearchBatchConfig::MAX_BEAM_WIDTH, - /*root*/ curr_depth == 0, - m->hidden_size); + spec_inc_store_kv_cache<<>>( + static_cast
(m->devQKVProjArray), + static_cast
(m->keyCache), + static_cast
(m->valueCache), + m->token_infos, + m->request_infos, + m->beam_token_infos, + m->beam_request_infos, + m->causalMask, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_tokens, + BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, + /*root*/ curr_depth == 0, + m->hidden_size); + } +} + +#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_spec_inc_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
(m->devQKVProjArray), \ + static_cast
(m->keyCache), \ + static_cast
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->beam_request_infos, \ + m->causalMask) + +template +void compute_spec_inc_attention_kernel_generation( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + // one block == one head per request + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); } } @@ -236,199 +487,208 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int kt_req_block_size = kt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_req_block_size = vt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } - for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; + // else if (tokens_previous_requests < bc->num_generation_tokens) { + // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + // continue; + // } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; + // all requests in prompt phase should only have one sub requests; + assert(bc->sub_requests[i] == 1); + // int num_new_tokens = bc->num_processing_tokens[i]; + // int total_tokens = bc->token_last_available_idx[i] + 1; - if (num_new_tokens <= 0) { - continue; - } + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size; - - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - spec_fill_entries_above_diagonal<<>>( - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; + if (num_new_tokens <= 0) { + continue; + } + + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // To get A, skip over Q entries from previous requests (same head) + DT const *A = static_cast
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // To get B, skip over K entries from previous requests (all heads + + // padding) + + // print_tensor((float*)A, 32, "A"); + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + + // if (i == 0 && sub_req_id == 0 && + // bc->beam_slots.at(0).current_depth == 1) { + // int offset = (float *)B - m->keyCache; + // printf("key cache offset %d\n", kt_req_block_size); + // } + // To get C, skip over QK^T products from previous requests + DT *C = static_cast
(m->qk_prods) + + m->num_q_heads * tokens_prev_requests_squares; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // print_tensor((float*)C, 32, "C"); + // add alibi position bias to qk production + // add alibi position bias to qk production + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); } + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + if (num_new_tokens > 1) { + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; + spec_fill_entries_above_diagonal<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + static_cast
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax) + + m->num_q_heads * tokens_prev_requests_squares; + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = m->vProjSize; + n = num_new_tokens; + k = total_tokens; + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + + // padding) + A = static_cast
(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + + // print_tensor((float*)C_softmax, 32, "C_softmax"); + C = static_cast
(m->attn_heads) + + (tokens_previous_requests + bc->num_generation_tokens) * + m->num_q_heads * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; } // assert(tokens_previous_requests == num_tokens); @@ -443,31 +703,8 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { - // here because we need postion info in infernece 1 - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->beam_token_infos, - &(bc->beamTokenInfo), - bc->num_active_tokens() * bc->MAX_BEAM_WIDTH * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->beam_request_infos, - &(bc->beamRequestsInfo), - bc->max_requests_per_batch() * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), - cudaMemcpyHostToDevice, - stream); // phase 1: Implement kernel to compute KQV for input tokens + compute_qkv_kernel(m, bc, shard_id, @@ -479,7 +716,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); if (bc->num_generation_tokens > 0) { - compute_attention_kernel_generation
( + compute_spec_inc_attention_kernel_generation
( m, bc, static_cast
(m->attn_heads), stream); } // phase 3: Compute attention score @@ -488,16 +725,14 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, compute_attention_kernel_prompt( m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } - // compute output production and bias together for all tokens - int num_tokens = - bc->num_active_tokens() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; + int num_tokens = bc->num_active_tokens(); compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } -} // namespace SpecIncMultiHeadAttention +} // namespace SpecIncMultiHeadSelfAttention } // namespace Kernels /*static*/ @@ -529,25 +764,27 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( if (input.data_type == DT_HALF) { half const *bias_ptr = use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + bias_ptr, + stream); } else if (input.data_type == DT_FLOAT) { float const *bias_ptr = use_bias ? bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, + bc, + shard_id, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + bias_ptr, + stream); } else { assert(false && "Unspported data type"); } @@ -606,38 +843,23 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t beam_tokeninfo_size = - max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); - size_t beam_requestinfo_size = - BeamSearchBatchConfig::max_requests_per_batch(); - size_t total_size = - beam_tokeninfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + - beam_requestinfo_size * - sizeof(BeamSearchBatchConfig:: - BeamSearchPerRequestInfo); // more components will - // be added here later - - // We always directly allocate memory for small speculative models - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); beam_token_infos = - gpu_mem_allocator - .allocate_instance( - beam_tokeninfo_size); - // offset += beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo)); + beam_request_infos = - gpu_mem_allocator - .allocate_instance( - beam_requestinfo_size); - // offset += beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); - // assert(offset == total_size); - assert(gpu_mem_allocator.instance_total_size == - gpu_mem_allocator.instance_allocated_size); + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo)); + causalMask = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + 
sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); } cudaStreamSynchronize(stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index bc7d1017b7..b4af80976f 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -53,6 +53,7 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::PerRequestInfo *request_infos, int num_heads, int num_requests, + BatchConfig::BitMask *causalMask, int qk_smem_sz) { // q, k @@ -75,17 +76,28 @@ __global__ void compute_attention_kernel_fused_kernel( // request idx int const request_idx = blockIdx.y; + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + int const first_step = 0; - int const tlength = request_infos[request_idx].first_token_depth_in_request + - request_infos[request_idx].num_tokens_in_batch; - int const qlength = request_infos[request_idx].num_tokens_in_batch; + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = + request_infos[batch_config_request_id].num_tokens_in_batch; + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; int first_token_idx = 0; for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[request_idx].num_tokens_in_batch; + first_token_idx += request_infos[r].num_tokens_in_batch; } + // if(tidx == 0 && head_idx == 0){ + // printf("tree req: %d, %d\n", request_idx, first_token_idx); + // } + // shared memory objects extern __shared__ char smem_[]; @@ -115,7 +127,7 @@ __global__ void compute_attention_kernel_fused_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + request_idx * max_seq_length * hidden_size + ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -126,11 +138,19 @@ __global__ void compute_attention_kernel_fused_kernel( q_vecs[ki_o][ii] = *reinterpret_cast( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + + // if (head_idx == 0 && qi == 1 && tidx == 0) { + // printf("laod q %d, %d %.10f\n", + // request_idx, + // qi,q_vecs[ki_o][ii].x); + // } } + __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { K_vec k[K_VECS_PER_THREAD]; int const ti_circ = ti % max_seq_length; + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; if (ti < tlength) { @@ -142,22 +162,28 @@ __global__ void compute_attention_kernel_fused_kernel( float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); - } + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); - int pos = ti * qlength + qi; - if (((pos / qlength) % tlength) > (pos % qlength + tlength - qlength)) { - qk = -FLT_MAX; - } qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[pos] = mask ? 
0.f : qk; + // if (head_idx == 0 && qi == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n + // ", + // request_idx, + // ti, + // qk, + // q_vecs[ki_o][0].x, + // k[0].x); + // } + qk_smem[ti - first_step] = mask ? 0.0f : qk; } } + __syncthreads(); +#pragma unroll for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); } @@ -176,7 +202,7 @@ __global__ void compute_attention_kernel_fused_kernel( // The warps finalize the reduction. qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; - +#pragma unroll for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); } @@ -184,12 +210,18 @@ __global__ void compute_attention_kernel_fused_kernel( // Broadcast to all the threads in the warp. qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - float exp_sum = 0.f; + // if (head_idx == 0 && qi == 9 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", qk_max); + // } + float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti * qlength + qi] - qk_max); + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); exp_sum += logit; - qk_smem[ti * qlength + qi] = logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; } // Compute the sum. @@ -197,43 +229,51 @@ __global__ void compute_attention_kernel_fused_kernel( // softmax float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti * qlength + qi] *= inv_sum; + qk_smem[ti - first_step] *= inv_sum; } __syncthreads(); - } - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; - Out_sum out; - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + request_idx * max_seq_length * hidden_size + vi; + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - for (int qi = 0; qi < qlength; qi++) { + Out_sum out; zero(out); - __syncthreads(); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { // Load the values from the cache. 
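+ // ti is wrapped modulo max_seq_length here, mirroring the circular indexing used for the key cache earlier in this kernel.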
int const ti_circ = ti % max_seq_length; - + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; V_vec v = *reinterpret_cast( v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - float logit = qk_smem[ti * qlength + qi]; - out = FlexFlow::fma(logit, cast_to_float(v), out); + + if (ti < tlength) { + bool const mask = + (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } } } - // Make sure we can start writing to shared memory. + // // Make sure we can start writing to shared memory. __syncthreads(); // Run the final reduction amongst the different groups computing different @@ -268,6 +308,17 @@ __global__ void compute_attention_kernel_fused_kernel( output_ptr + (first_token_idx + qi) * hidden_size + head_idx * per_head_size + vi), out); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, + // out.y, + // out.z, + // out.w, + // vi, + // (first_token_idx + qi) * hidden_size + head_idx * + // per_head_size + + // vi); + // } } } } @@ -286,9 +337,9 @@ __global__ void commit_tokens_kernel( int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - int token_pos = i / (hidden_size * KV_WEIGHT_NUM); + int token_pos = i / (hidden_size); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); @@ -329,7 +380,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_tokens_to_commit, m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length(), + BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, m->hidden_size); } } @@ -348,9 +400,9 @@ __global__ void update_tree_branch_kv_cache( int total_tokens_in_batch, int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + int token_idx = i / (hidden_size); int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch @@ -375,6 +427,7 @@ __global__ void update_tree_branch_kv_cache_fused( DT *kCache_ptr, DT *vCache_ptr, TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo *request_infos, int qProjSize, int kProjSize, int vProjSize, @@ -392,10 +445,25 @@ __global__ void update_tree_branch_kv_cache_fused( DT vVal = devQKVProjArray[val_idx + hidden_size]; int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + int const request_token_offset = + request_infos[req_id].first_token_offset_in_batch; + int const first_token_depth = + request_infos[req_id].first_token_depth_in_request; + + // if(i % hidden_size == 0){ + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", + // req_id, token_idx, request_token_offset,(token_idx + first_token_depth + // - request_token_offset), kVal); + // } + kCache_ptr[req_id 
* (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + vCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + offset] = vVal; } } @@ -448,10 +516,12 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; int vt_block_size = m->vProjSize; int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -472,9 +542,6 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, num_new_tokens++; } - std::cout << "num_new_tokens: " << num_new_tokens << "\n"; - assert(false); - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); { @@ -716,7 +783,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_size_in_bytes_tree
(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ THDS_PER_VALUE, \ THDS_PER_BLOCK, \ bc, \ @@ -733,17 +801,19 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, static_cast
(m->valueCache), \ output_ptr, \ scale, \ - BatchConfig::max_sequence_length(), \ + BatchConfig::max_sequence_length() + \ + BatchConfig::BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, \ BatchConfig::max_tokens_per_batch(), \ m->qProjSize, \ m->hidden_size, \ m->request_infos, \ m->num_q_heads, \ bc->num_active_requests(), \ + m->causalMask, \ smem_sz[0]) template -void compute_attention_kernel_fused(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, DT *output_ptr, cudaStream_t stream) { @@ -760,11 +830,12 @@ void compute_attention_kernel_fused(IncMultiHeadSelfAttentionMeta const *m, static_cast
(m->keyCache), static_cast
(m->valueCache), m->token_infos, + m->request_infos, m->qProjSize, m->kProjSize, m->vProjSize, num_new_tokens, - BatchConfig::max_sequence_length(), + BatchConfig::max_sequence_length() + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM, m->hidden_size); dim3 grid(m->num_q_heads, bc->num_active_requests()); @@ -816,12 +887,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // Note that m->num_active_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache + // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << + // "\n"; + cudaMemcpyAsync(m->committed_token_infos, &(bc->committed_tokens), bc->num_tokens_to_commit * sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(m->causalMask, + &(bc->causalMask), + bc->num_active_requests() * sizeof(BatchConfig::BitMask), + cudaMemcpyHostToDevice, + stream); commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -834,18 +913,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); bias_ptr = static_cast
(m->bias_ptr); } - cudaMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - cudaMemcpyHostToDevice, - stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -991,27 +1058,16 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); - if (offload) { - // assert that we have enough reserved work space left - assert(gpu_mem_allocator.reserved_total_size - - gpu_mem_allocator.reserved_allocated_size >= - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); - } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); - } + + causalMask = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + committed_token_infos = + reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::causalMask)); } cudaStreamSynchronize(stream); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index eb045e8159..8af0ed8978 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -318,7 +318,8 @@ FutureMap InferenceManager::inference(FFModel *model, found_input_operator = true; assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_input_tokens_from_batch_config(bc, pt); + load_input_tokens_from_batch_config(bc, pt, model->handlers); + load_inference_metadata_batch_config(bc, model->handlers); } } @@ -348,11 +349,34 @@ FutureMap InferenceManager::inference(FFModel *model, }; void InferenceManager::load_input_tokens_from_batch_config( - BatchConfigFuture const &bc, ParallelTensor const input) { + BatchConfigFuture const &bc, + ParallelTensor const input, + FFHandler *handlers) { Context ctx = ff_config.lg_ctx; Runtime *runtime = ff_config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; + Domain domain = runtime->get_index_space_domain(ctx, input->parallel_is); + + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + MachineView view = input->machine_view; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + argmap.set_point(*it, \ + TaskArgument(&handlers[view.get_device_id(*it)], \ + sizeof(FFHandler))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } + IndexLauncher launcher(RM_LOAD_TOKENS_TASK_ID, input->parallel_is, TaskArgument(nullptr, 0), @@ -368,6 +392,34 @@ void InferenceManager::load_input_tokens_from_batch_config( runtime->execute_index_space(ctx, launcher); } +void InferenceManager::load_inference_metadata_batch_config( + BatchConfigFuture const &bc, FFHandler *handlers) { + Context ctx = ff_config.lg_ctx; 
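  // Descriptive note (comments only, illustrative rather than part of the patch):
  // this launcher mirrors load_input_tokens_from_batch_config but carries no
  // region requirements; it attaches the BatchConfig future and hands every GPU
  // point its own FFHandler through the ArgumentMap. The launched task
  // (RM_LOAD_BATCH_CONFIG_TASK_ID -> RequestManager::load_batch_config_task)
  // unpacks the future with BatchConfig::from_future and copies tokensInfo,
  // requestsInfo and, in the speculative modes, the beam / causal-mask /
  // committed-token structs into that device's handle.batch_config_metadata
  // buffer, in the same order assumed by the pointer offsets set up in
  // TreeIncMultiHeadSelfAttentionMeta.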
+ Runtime *runtime = ff_config.lg_hlr; + ArgumentMap argmap; + + Domain domain = + runtime->get_index_space_domain(ctx, ff_config.all_gpu_task_is); + Rect<1> task_rect = domain; + + int idx = 0; + for (PointInRectIterator<1> it(task_rect); it(); it++) { + FFHandler handler = handlers[idx++]; + argmap.set_point(*it, TaskArgument(&handler, sizeof(FFHandler))); + } + + IndexLauncher launcher(RM_LOAD_BATCH_CONFIG_TASK_ID, + ff_config.all_gpu_task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + FFConfig::DataParallelism_GPU); + launcher.add_future(bc); + runtime->execute_index_space(ctx, launcher); +} + void InferenceManager::load_positions(BatchConfigFuture const &bc, ParallelTensor position_input, int offset) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 92f0cff472..37605c44a4 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1499,10 +1499,8 @@ FFRuntime::FFRuntime(FFConfig &config) { Context ctx = config.lg_ctx; ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(config.workersPerNode * config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); - + Domain domain = runtime->get_index_space_domain(ctx, config.all_gpu_task_is); + Rect<1> task_rect = domain; // int rank = 0; for (PointInRectIterator<1> it(task_rect); it(); it++) { FFInitInfo info; @@ -1518,7 +1516,7 @@ FFRuntime::FFRuntime(FFConfig &config) { // Init CUDA library on each worker IndexLauncher initLauncher(FF_INIT_TASK_ID, - task_is, + config.all_gpu_task_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, @@ -2993,6 +2991,12 @@ Op *FFModel::create_operator_from_layer( dims[num_dims].degree = 1; dims[num_dims].parallel_idx = -1; dims[num_dims].is_replica_dim = true; + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1) { + dims[num_dims].size *= config.tensor_parallelism_degree; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } // create_parallel_tensor adds an NoOp into operators ParallelTensor pt = create_parallel_tensor_legion_ordering(num_dims + 1, @@ -3002,6 +3006,7 @@ Op *FFModel::create_operator_from_layer( 0, true /*gradients*/, tensor->tensor_guid); + assert(pt->get_shape().is_valid()); // assert that this tensor hasn't been mapped before assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; @@ -3260,12 +3265,12 @@ void FFModel::create_operators_from_layers() { if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && l->op_type == OP_EMBEDDING) { assert(op->numOutputs == 1); - Replicate *repl = new Replicate(*this, - op->outputs[0], - op->outputs[0]->num_dims - 1, - config.tensor_parallelism_degree); - operators.push_back(repl); - op = repl; + // Replicate *repl = new Replicate(*this, + // op->outputs[0], + // op->outputs[0]->num_dims - 1, + // config.tensor_parallelism_degree); + // operators.push_back(repl); + // op = repl; } else if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || @@ -4076,6 +4081,10 @@ FFConfig::FFConfig() { Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; lg_ctx = Runtime::get_context(); + Rect<1> task_rect(Point<1>(0), Point<1>(workersPerNode * numNodes - 1)); + // Create an index space for tasks running on all GPUs + all_gpu_task_is = runtime->create_index_space(lg_ctx, task_rect); + // field_space = 
runtime->create_field_space(lg_ctx); } @@ -4337,6 +4346,23 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + // RequestManager load metadata + { + TaskVariantRegistrar registrar(RM_LOAD_BATCH_CONFIG_TASK_ID, + "RequestManager Load meta data"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RequestManager Load metadata Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // RequestManager prepare_next_batch { TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID, diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 6c482426eb..ad2b781567 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -131,6 +131,54 @@ FFHandler .wait(); handle.workSpace = workspaceInst.pointer_untyped(0, sizeof(char)); } + if (handle.offload_reserve_space_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.offload_reserve_space = + workspaceInst.pointer_untyped(0, sizeof(char)); + } else { + handle.offload_reserve_space = nullptr; + } + if (handle.batch_config_metadata_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.batch_config_metadata = + workspaceInst.pointer_untyped(0, sizeof(char)); + } else { + handle.batch_config_metadata = nullptr; + } // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 17401a0f14..c885b29db2 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -151,6 +151,31 @@ FFHandler } else { handle.offload_reserve_space = nullptr; } + if (handle.batch_config_metadata_size > 0) { + // allocate memory for offload reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + handle.batch_config_metadata = + workspaceInst.pointer_untyped(0, sizeof(char)); + } else 
{ + handle.batch_config_metadata = nullptr; + } + // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7c37f3391e..89d4ddaed4 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -16,6 +16,7 @@ #include "flexflow/request_manager.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" +#include #include #include #include @@ -106,6 +107,11 @@ int RequestManager::get_max_sequence_length() { return max_sequence_length; } +void RequestManager::push_spec_infer_tree_width(int tree_width) { + assert(tree_width <= BeamSearchBatchConfig::MAX_BEAM_WIDTH); + spec_infer_tree_width.emplace_back(tree_width); +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int eos_token_id, @@ -358,6 +364,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } int num_generation_tokens = 0; + int num_active_req = -1; // Step 2: prepare the next batch for existing requests BatchConfig new_bc; @@ -406,13 +413,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); @@ -420,8 +428,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + outputFile << "num decoding steps: " + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -447,6 +455,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == request.tokens.size()) { // Incremental phase @@ -469,7 +479,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } // Update profiling profiling_requests[new_bc.requestsInfo[i].request_guid] - .decoding_steps++; + .llm_decoding_steps++; } } } @@ -483,6 +493,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ 
-492,9 +503,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; new_bc.request_completed[i] = false; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; - profile_info.decoding_steps = 1; + profile_info.llm_decoding_steps = 1; profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -567,6 +580,7 @@ BeamSearchBatchConfig int result_index = 0; int num_generation_tokens = 0; + int num_active_req = -1; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { @@ -602,6 +616,8 @@ BeamSearchBatchConfig committed_tokens[guid].emplace_back(abs_depth, result_index); } else if (abs_depth >= root_abs_depth) { tree_outputs.emplace_back(token_id, abs_depth + 1); + // std::cout << "committred tokens push: " << abs_depth + // << " ,result index: " << result_index << "\n"; committed_tokens[guid].emplace_back(abs_depth, result_index); if (verbose) { @@ -612,22 +628,23 @@ BeamSearchBatchConfig tree_outputs.back().second, token_id); } - std::cout << "Index within old batch: " << result_index << std::endl; - printf(" Input: [%d] %d ---> [%d] %d \n", - abs_depth, - old_bc.tokensInfo[result_index].token_id, - tree_outputs.back().second, - token_id); + // std::cout << "Index within old batch: " << result_index << std::endl; + // printf(" Input: [%d] %d ---> [%d] %d \n", + // abs_depth, + // old_bc.tokensInfo[result_index].token_id, + // tree_outputs.back().second, + // token_id); } result_index++; } if (request.status == Request::RUNNING) { + std::vector> verified_tokens = traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs); + log_req_mgr.print("Number of Verified Tokens = %zu", verified_tokens.size()); - // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= request.max_sequence_length) { @@ -664,16 +681,18 @@ BeamSearchBatchConfig // Log profiling info ProfileInfo profile_info = profiling_requests[request.guid]; profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.ssm_decoding_steps = 0; total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { @@ -682,8 +701,8 @@ BeamSearchBatchConfig outputFile << "end-to-end latency: " << std::fixed << std::setprecision(3) << total_request_run_time << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; + outputFile << "num decoding steps: " + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << 
request.tokens[i]; @@ -709,6 +728,7 @@ BeamSearchBatchConfig new_bc.request_completed[i] = false; new_bc.request_running[i] = true; + num_active_req++; // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = @@ -719,6 +739,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = @@ -726,8 +747,14 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; + + profiling_requests[request.guid].ssm_decoding_steps = 0; + + int ssm_decoding_steps = 0; new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { @@ -735,8 +762,14 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].probs[j] = 1; } + new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.sub_requests[i] = 1; + updateBitMask(new_bc.causalMask[i], + verified_tokens.size(), + request.tokens.size()); + // Token Info for (int j = 0; j < verified_tokens.size(); j++) { auto token = verified_tokens.at(j); @@ -758,6 +791,7 @@ BeamSearchBatchConfig break; } } + std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token @@ -767,9 +801,11 @@ BeamSearchBatchConfig } log_req_mgr.print("Output: %s", output.c_str()); } + } else if (request.status == Request::PENDING) { new_bc.request_completed[i] = false; new_bc.request_running[i] = false; + num_active_req++; std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", " << "initial_len: " << request.initial_len << std::endl; @@ -783,17 +819,24 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? 
spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].max_depth = 0; for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } + new_bc.beamRequestsInfo[i].sub_request_num = 1; + new_bc.sub_requests[i] = 1; // Token Info @@ -818,6 +861,7 @@ BeamSearchBatchConfig Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; + num_active_req++; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; @@ -826,15 +870,21 @@ BeamSearchBatchConfig (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request ProfileInfo profile_info; - profile_info.decoding_steps = 0; + profile_info.llm_decoding_steps = 0; + profile_info.ssm_decoding_steps = 0; profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; // init the beam search metadata per request + int ssm_decoding_steps = profile_info.ssm_decoding_steps; + new_bc.beamRequestsInfo[i].beam_size = - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].max_depth = std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, @@ -846,6 +896,11 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + + new_bc.beamRequestsInfo[i].sub_request_num = 1; + printf("sub request num == 1, %d \n", + new_bc.beamRequestsInfo[i].beam_size); + new_bc.sub_requests[i] = 1; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -862,6 +917,9 @@ BeamSearchBatchConfig new_bc.num_tokens++; } + initBitMask(new_bc.causalMask[i], + new_bc.requestsInfo[i].num_tokens_in_batch); + // if (new_bc.requestsInfo[i].num_tokens_in_batch < // new_request.initial_len) { // all_requests[new_request.guid].status = Request::PENDING; @@ -949,6 +1007,8 @@ BeamSearchBatchConfig } std::cout << "Current Beam Depth: " << old_bc.beamRequestsInfo[0].current_depth << "\n"; + std::cout << "Current sub request num: " + << old_bc.beamRequestsInfo[0].sub_request_num << "\n"; } // Step 1: Store result to the beam tree struct store_beam_metadata(old_bc, result); @@ -960,10 +1020,12 @@ BeamSearchBatchConfig int num_generation_tokens = 0; // Add incremental tokens to the batch + int num_active_req = -1; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i] || !old_bc.request_running[i]) { continue; } + num_active_req++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); @@ -973,29 +1035,6 @@ BeamSearchBatchConfig // assert(processed_tokens < request.tokens.size()); log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n"; - // if (processed_tokens > - // old_bc.beamRequestsInfo[i].max_depth + request.tokens.size() && - // request.status == Request::RUNNING - // // || ir.results[t] == 0 TODO: replace this with - // ) { - // // log_req_mgr.print("[Done] guid(%zu) with spec_tree_depth(%d)", - // // 
old_bc.requestsInfo[i].request_guid, - // // old_bc.beamRequestsInfo[i].max_depth); - // // // new_bc.request_completed[i] = true; - // // new_bc.request_completed[i] = false; - // // new_bc.requestsInfo[i].first_token_depth_in_request = - // processed_tokens; - // // new_bc.requestsInfo[i].request_guid = - // // old_bc.requestsInfo[i].request_guid; - // // new_bc.requestsInfo[i].max_sequence_length = - // // old_bc.requestsInfo[i].max_sequence_length; - // // new_bc.beamRequestsInfo[i].current_depth = - // // old_bc.beamRequestsInfo[i].current_depth; - // // new_bc.request_running[i] = false; - // std::cout << "beam search end:" << request.status << i << ", " - // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; - // } - // else { log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; @@ -1005,25 +1044,42 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; - + profiling_requests[request.guid].ssm_decoding_steps += 1; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH // entries? - new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; - // update the parentid, accumalated_probs, depth, and token_ids + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; + new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + spec_infer_tree_width.size() > ssm_decoding_steps + ? spec_infer_tree_width[ssm_decoding_steps] + : 1; + new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; + new_bc.sub_requests[i] = + old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].sub_request_num = + old_bc.beamRequestsInfo[i].sub_request_num * + old_bc.beamRequestsInfo[i].beam_size; + + assert(new_bc.beamRequestsInfo[i].sub_request_num <= + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && + "exceed maximum nodes per layer"); + if (request.status == Request::RUNNING) { new_bc.beamRequestsInfo[i].current_depth = old_bc.beamRequestsInfo[i].current_depth + 1; new_bc.request_running[i] = true; // do the slot exchange to minimize the cache exchange in kernel. 
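// A minimal sketch of the speculative tree-width schedule used above
// (illustrative, not from the patch itself; function names are for exposition
// only): spec_infer_tree_width holds one branching factor per SSM decoding
// step, steps past the end of the list fall back to a width of 1, and the node
// count of the next tree layer is the previous layer's node count times the
// previous step's width, bounded by MAX_SPECULATIVE_TREE_BRANCHES.
#include <cassert>
#include <vector>
static int pick_tree_width(std::vector<int> const &spec_infer_tree_width,
                           int ssm_decoding_steps) {
  return (int)spec_infer_tree_width.size() > ssm_decoding_steps
             ? spec_infer_tree_width[ssm_decoding_steps]
             : 1;
}
static int grow_tree_layer(int prev_nodes, int prev_width, int max_branches) {
  int nodes_this_layer = prev_nodes * prev_width;
  assert(nodes_this_layer <= max_branches && "exceed maximum nodes per layer");
  return nodes_this_layer;
}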
- update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), i); + update_beam_metadata( + new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i); + } else { assert(false && "Request should not be pending in beam search phase"); } @@ -1035,6 +1091,7 @@ BeamSearchBatchConfig request.tokens.size()) { // Incremental phase if (request.status == Request::RUNNING) { + // todo this is replaced by this_layer_size, but should check it new_bc.requestsInfo[i].num_tokens_in_batch = 1; } else { assert(false && "Request should be done"); @@ -1057,9 +1114,22 @@ BeamSearchBatchConfig } // register more tokens due to the beam width + + // copy metadata + memcpy(&new_bc.causalMask[i], + &old_bc.causalMask[i], + sizeof(BatchConfig::BitMask)); + BeamTree tree = request.beam_trees[old_bc.model_id]; + appendBitMask(new_bc.causalMask[i], + new_bc.beamRequestsInfo[i].sub_request_num, + old_bc.beamRequestsInfo[i].beam_size, + old_bc.beamRequestsInfo[i].sub_request_num, + tree, + old_bc.beamRequestsInfo[i].current_depth); + // assert(false); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { + for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1069,6 +1139,8 @@ BeamSearchBatchConfig new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; + + num_generation_tokens++; } } } @@ -1079,6 +1151,7 @@ BeamSearchBatchConfig if (old_bc.request_completed[i] || old_bc.request_running[i]) { continue; } + num_active_req++; // Comment out this assertion since num_tokens_in_batch can be // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); @@ -1098,18 +1171,34 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata // how many sub request in current request // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH // entries? - new_bc.sub_requests[i] = old_bc.beamRequestsInfo[i].beam_size; + int ssm_decoding_steps = + profiling_requests[request.guid].ssm_decoding_steps; - // update the parentid, accumalated_probs, depth, and token_ids new_bc.beamRequestsInfo[i].beam_size = - old_bc.beamRequestsInfo[i].beam_size; + spec_infer_tree_width.size() > ssm_decoding_steps + ? 
spec_infer_tree_width[ssm_decoding_steps] + : 1; + printf("beam size: %d, %d\n", + new_bc.beamRequestsInfo[i].beam_size, + ssm_decoding_steps); new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; + new_bc.sub_requests[i] = + old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.beamRequestsInfo[i].sub_request_num = + old_bc.beamRequestsInfo[i].sub_request_num; + + assert(new_bc.beamRequestsInfo[i].sub_request_num <= + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES && + "exceed maximum nodes per layer"); + + // update the parentid, accumalated_probs, depth, and token_ids if (request.status == Request::PENDING) { // if the request is pending, we need to update the beam search @@ -1121,6 +1210,10 @@ BeamSearchBatchConfig assert(false && "Request should be pending"); } + memcpy(&new_bc.causalMask[i], + &old_bc.causalMask[i], + sizeof(BatchConfig::BitMask)); + if (new_bc.requestsInfo[i].first_token_depth_in_request >= request.tokens.size()) { // request is done @@ -1133,6 +1226,13 @@ BeamSearchBatchConfig (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; + BeamTree tree = request.beam_trees[old_bc.model_id]; + appendBitMask(new_bc.causalMask[i], + new_bc.beamRequestsInfo[i].sub_request_num, + old_bc.beamRequestsInfo[i].beam_size, + old_bc.beamRequestsInfo[i].sub_request_num, + tree, + old_bc.beamRequestsInfo[i].current_depth); } if (verbose) { @@ -1152,7 +1252,7 @@ BeamSearchBatchConfig // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - for (int k = 0; k < new_bc.sub_requests[i]; k++) { + for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1229,21 +1329,20 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( max_prompt_load_size -= 1; } } - + int num_active_req = -1; for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { if (old_batches.at(0).request_completed[i]) { continue; } + num_active_req++; size_t guid = old_batches.at(0).requestsInfo[i].request_guid; Request &request = all_requests[guid]; // Profiling - profiling_requests[request.guid].decoding_steps += 1; + profiling_requests[request.guid].llm_decoding_steps += 1; if (request.status == Request::RUNNING) { new_bc.request_running[i] = true; - std::cout << "[Verify] Request " << request.guid << " is running" - << std::endl; // Get the dfs tree std::vector>> @@ -1274,31 +1373,44 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + + // copy bitmask to verify batchconfig + memcpy(&(new_bc.causalMask[i]), + &(old_batches.at(0).causalMask[i]), + sizeof(BatchConfig::BitMask)); // TODO: Check this new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.request_completed[i] = false; + // std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", " + // << new_bc.causalMask[i].tree_size << ", " + // << new_bc.causalMask[i].non_tree_cache_size << "\n"; + // std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0]) + // << "\n"; + // 
Committed Tokens if (committed_tokens.find(guid) != committed_tokens.end()) { - for (int j = 0; j < dfs_tree_inputs.size(); j++) { - if (j < committed_tokens.at(guid).size()) { - auto committed_token = committed_tokens.at(guid).at(j); - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = - committed_token.second; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = - i; - new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = - committed_token.first; - if (verbose) { - std::cout << new_bc.num_tokens_to_commit - << "- committed_token.token_depth: " - << committed_token.first - << ", token_index: " << committed_token.second - << std::endl; - } - new_bc.num_tokens_to_commit++; - request.llm_cache_size++; + for (int j = 0; j < committed_tokens.at(guid).size(); j++) { + // if (j < committed_tokens.at(guid).size()) { + + auto committed_token = committed_tokens.at(guid).at(j); + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = + committed_token.second; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = + i; + new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = + committed_token.first; + if (verbose) { + std::cout << new_bc.num_tokens_to_commit + << "- committed_token.token_depth: " + << committed_token.first + << ", token_index: " << committed_token.second + << std::endl; } + new_bc.num_tokens_to_commit++; + request.llm_cache_size++; + // } } } if (verbose) { @@ -1324,6 +1436,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_depth_in_request = request.tokens.size() - 1; + bool cutLayer = false; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { auto token = dfs_tree_inputs.at(j); @@ -1340,11 +1453,27 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == get_max_tokens_per_batch() - 1) { + if (new_bc.num_tokens == get_max_tokens_per_batch() && + (j != dfs_tree_inputs.size() - 1)) { + cutLayer = true; break; } } + // delete the last incomplete layer + if (cutLayer) { + int total_tokens = new_bc.num_tokens; + for (int j = total_tokens - 1; j >= 1; j--) { + new_bc.num_tokens--; + new_bc.requestsInfo[i].num_tokens_in_batch--; + // std::cout << "cut: " << j << "\n"; + if (new_bc.tokensInfo[j].abs_depth_in_request != + new_bc.tokensInfo[j - 1].abs_depth_in_request) { + break; + } + } + } + } else if (request.status == Request::PENDING) { new_bc.request_running[i] = false; if (verbose) { @@ -1374,6 +1503,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << new_bc.num_tokens_to_commit << std::endl; } + memcpy(&(new_bc.causalMask[i]), + &(old_batches.at(0).causalMask[i]), + sizeof(BatchConfig::BitMask)); + // Normal Request Info new_bc.requestsInfo[i].first_token_depth_in_request = request.llm_cache_size; @@ -1382,6 +1515,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( old_batches.at(0).requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; @@ -1395,6 +1529,9 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( << std::endl; if (request.llm_cache_size < request.initial_len) { + // std::cout << "Initialization (prompt) phase: " + // << 
new_bc.requestsInfo[i].num_tokens_in_batch << ", " + // << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n"; // Initialization (prompt) phase for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1402,7 +1539,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.tokens[request.llm_cache_size + j]; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.llm_cache_size + j; - new_bc.num_tokens++; } @@ -1428,6 +1564,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } } else { // launch the request into running phase after loading all prompt if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { + // std::cout << "Initialization running phase: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; request.status = Request::RUNNING; new_bc.request_running[i] = true; @@ -1476,26 +1614,41 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid != guid) { + // std::cout << "i is: " << i << "old guid" << guid << " new guid" + // << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index] + // .request_guid + // << "\n"; + int index = old_bc.tokensInfo[i - 1].request_index; int beam_size = old_bc.beamRequestsInfo[index].beam_size; + + // int leaf_node_num = old_bc.sub_requests[index]; + int leaf_node_num = + old_bc.beamRequestsInfo[index].sub_request_num * beam_size; int depth = old_bc.beamRequestsInfo[index].current_depth; // Each token yields (beam_width) results - int beam_width = old_bc.beamRequestsInfo[index].beam_size; + // int beam_width = old_bc.beamRequestsInfo[index].beam_size; // Count tokens sent to model in this request to find the final token's // index result_index += (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) * - beam_width; + beam_size; if (verbose) { std::cout << "i = " << i << ", result index = " << result_index - << ", value: " << result.token_ids[result_index] << "\n"; + << ", value: " << result.token_ids[result_index] + << ", leaf node num: " << leaf_node_num << ", depth" << depth + << ", beam size: " << beam_size << "\n"; } Request &request = all_requests[old_bc.requestsInfo[index].request_guid]; + if (old_bc.requestsInfo[index].num_tokens_in_batch == 0) { + continue; + } + if (depth == 1) { // store the last input into the tree; if (verbose) { @@ -1507,14 +1660,20 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, request.tokens.back(); request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1; request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1; + request.beam_trees.at(old_bc.model_id) + .treeLayers[0] + .nodes_num_this_layer = 1; if (verbose) { std::cout << "Store the previous last token to the tree root: " << request.tokens.back() << "\n"; } } + request.beam_trees.at(old_bc.model_id) + .treeLayers[depth] + .nodes_num_this_layer = leaf_node_num; + for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) { - for (int beam_id = 0; beam_id < beam_width; beam_id++) { request.beam_trees.at(old_bc.model_id) .treeLayers[depth] .tokens[beam_id] = result.token_ids[result_index]; @@ -1534,10 +1693,10 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, } result_index += 1; } - // update the guid and start_depth for current request if (i < old_bc.num_tokens) { - guid = old_bc.requestsInfo[index].request_guid; + int new_req_idx = 
old_bc.tokensInfo[i].request_index; + guid = old_bc.requestsInfo[new_req_idx].request_guid; start_depth = old_bc.tokensInfo[i].abs_depth_in_request; } } @@ -1546,6 +1705,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc, // for updating the beam search metadata in requests in incremental phase void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, + BeamSearchBatchConfig const &old_bc, BeamTree &tree, int request_index) { @@ -1556,6 +1716,9 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1; int beam_size = new_bc.beamRequestsInfo[request_index].beam_size; + // int leaf_node_num = old_bc.sub_requests[request_index]; + int leaf_node_num = new_bc.beamRequestsInfo[request_index].sub_request_num; + if (new_bc.beamRequestsInfo[request_index].current_depth == 1) { // TODO: check if this is correct // for (int j = 0; j < beam_size; j++) { @@ -1568,48 +1731,15 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, // Do nothing // assert(false); } else { - std::set parents; - std::set childs; - // cache stealing - for (int j = 0; j < beam_size; j++) { - int parent_id = tree.treeLayers[depth].parent_ids[j]; - if (childs.find(parent_id) == childs.end()) { - // copy beam slot - new_bc.beamRequestsInfo[request_index].parent_id[parent_id] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[parent_id] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[parent_id] = - tree.treeLayers[depth].tokens[j]; - parents.emplace(j); - childs.emplace(parent_id); - } - } - if (parents.size() < beam_size) { - for (int j = 0; j < beam_size; j++) { - if (parents.find(j) == parents.end()) { - // this slot has not been assigned - // find the smallest not assigned child and put in - if (verbose) { - std::cout << "request_index" << request_index - << ", miss slot: " << j << "\n"; - } - for (int k = 0; k < beam_size; k++) { - if (childs.find(k) == childs.end()) { - // parent -> j to child k; - new_bc.beamRequestsInfo[request_index].parent_id[k] = - tree.treeLayers[depth].parent_ids[j]; - new_bc.beamRequestsInfo[request_index].probs[k] = - tree.treeLayers[depth].probs[j]; - new_bc.beamRequestsInfo[request_index].tokens[k] = - tree.treeLayers[depth].tokens[j]; - parents.emplace(j); - childs.emplace(k); - break; - } - } - } - } + for (int j = 0; j < leaf_node_num; j++) { + new_bc.beamRequestsInfo[request_index].parent_id[j] = + tree.treeLayers[depth].parent_ids[j]; + new_bc.beamRequestsInfo[request_index].probs[j] = + tree.treeLayers[depth].probs[j]; + new_bc.beamRequestsInfo[request_index].tokens[j] = + tree.treeLayers[depth].tokens[j]; + // std::cout << "token: " << j << ": " + // << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n"; } } if (verbose) { @@ -1625,6 +1755,139 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, } } +// bit mask related function + +// prompt phase, init task +void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, + int initLength) { + assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + bitmask.non_tree_cache_size = 0; + bitmask.tree_size = initLength; + + bitmask.prompt_size = initLength; + bitmask.this_layer_size = initLength; + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = i; j < bitmask.prompt_size; j++) { + bitmask.mask[i] |= (1 << j); + } + } + // std::cout << "see bit mask" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; + // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n"; +} + +// prepare next init +void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, + int initLength, + int non_tree_size) { + // assert(initLength == 1); + // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + assert(initLength >= 1 && "verified token num should >= 1"); + + // std::cout << "non tree size: " << non_tree_size << ", " + // << bitmask.non_tree_cache_size << "\n"; + + bitmask.non_tree_cache_size = non_tree_size + initLength - 1; + bitmask.tree_size = 1; + bitmask.this_layer_size = initLength; + // std::cout << "non_tree_size: " << non_tree_size << "\n"; + bitmask.prompt_size = 1; + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = i; j < bitmask.prompt_size; j++) { + bitmask.mask[i] |= (1 << j); + } + } + + // std::cout << "see bit mask update" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0]) + // << "\n"; +} + +// prepare next beam, append layers to the tree +void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, + int newNodes, + int preBeamSize, + int old_sub_num, + BeamTree const tree, + int currentDepth) { + int pre_tree_size = bitmask.tree_size; + bitmask.tree_size += newNodes; + bitmask.this_layer_size = newNodes; + assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && + "do not support tree size > 64"); + // preBeamSize: replicate num + + // add relationship with input/prompt + for (int i = 0; i < bitmask.prompt_size; i++) { + for (int j = pre_tree_size; j < bitmask.tree_size; j++) { + bitmask.mask[i] |= (1 << j); + // std::cout << "see bit mask append: " << i << ", to" << j + // << std::bitset<64>(bitmask.mask[i]) << "\n"; + } + } + + // std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", " + // << pre_tree_size << ", " << bitmask.prompt_size << ", " + // << preBeamSize << "\n"; + + // int num_groups = newNodes / preBeamSize; + // int group_size = newNodes / num_groups; + // add relations to branch + // requests in same groups share same relations, except the last token. 
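// Worked example of the bitmask layout built in initBitMask above
// (illustrative sketch, not from the patch itself). Bit qi of mask[k] means
// "tree query token qi may attend to tree key token k"; the fused attention
// kernel masks a key's contribution out when
// (mask[ti - non_tree_cache_size] & (1 << qi)) is zero.
#include <cassert>
#include <cstdint>
static void sketch_prompt_mask() {
  int const prompt_size = 4;
  uint64_t mask[4] = {0, 0, 0, 0};
  for (int i = 0; i < prompt_size; i++) {
    for (int j = i; j < prompt_size; j++) {
      mask[i] |= (1ULL << j);
    }
  }
  // Matches the comment in initBitMask: t1 ...1111, t2 ...1110, t3 ...1100,
  // t4 ...1000, i.e. the usual causal pattern restricted to in-tree tokens.
  assert(mask[0] == 0xF && mask[1] == 0xE && mask[2] == 0xC && mask[3] == 0x8);
}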
+ + // set middle layers + // skip the root prompt/tokens + int token_idx = bitmask.prompt_size; + int new_nodes_start_idx = pre_tree_size; + // std::cout << "new nodes start " << new_nodes_start_idx << "\n"; + for (int i = 1; i < currentDepth; i++) { + new_nodes_start_idx = pre_tree_size; + int nodes_this_layer = tree.treeLayers[i].nodes_num_this_layer; + // std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer + // << "group size: " << newNodes / nodes_this_layer << "\n"; + for (int j = 0; j < nodes_this_layer; j++) { + int group_size = newNodes / nodes_this_layer; + for (int k = 0; k < group_size; k++) { + bitmask.mask[token_idx] |= (1 << new_nodes_start_idx); + new_nodes_start_idx += 1; + } + token_idx += 1; + } + } + + // std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " + // << new_nodes_start_idx << ", " << newNodes + // << "current depth: " << currentDepth << "\n"; + // std::cout << "new nodes end " << new_nodes_start_idx << "\n"; + + // std::cout << "tree size: " << bitmask.tree_size << "\n"; + assert(token_idx == pre_tree_size); + assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); + + // assert(currentDepth <= 2); + // set last layer, all tokens are only relevant to it self; + for (int i = token_idx; i < bitmask.tree_size; i++) { + bitmask.mask[i] |= (1 << i); + // std::cout << "set rel: " << i << "to: " << i << "\n"; + } + + // if(bitmask.non_tree_cache_size == 19 && bitmask.tree_size > 2){ + // assert(false); + // } + + // std::cout << "see bit mask append" << bitmask.prompt_size << "\n"; + // std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n"; + // std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0]) + // << "\n"; +} + bool PreOrder( BeamTree const &tree, int max_depth, @@ -1740,12 +2003,43 @@ std::vector> // In this case the inputSeriedTree ends with padding 0s assert(inputSerializedTree.size() >= outputSerializedTree.size()); + int *treeLayers = new int[inputSerializedTree.size()]; + int node_num = 1; + int layer_num = 0; + for (int token_id = 0; token_id < inputSerializedTree.size(); token_id++) { + if (token_id == (inputSerializedTree.size() - 1) || + inputSerializedTree.at(token_id + 1).second != + inputSerializedTree.at(token_id).second) { + treeLayers[layer_num] = node_num; + layer_num += 1; + node_num = 1; + } else { + node_num++; + } + } + + // to avoid branch switch when same tokens in input tree. + // todo, only checked for N->1->1->1 cases + + bool findFirst = false; + layer_num = -1; + int first_layer_slot = 0; + int first_layer_slot_total = 0; + int processed_whole_layer_tokens = 0; + for (int i = 0; i < outputSerializedTree.size(); i++) { auto input = inputSerializedTree.at(i); auto output = outputSerializedTree.at(i); + if (i == 0 || inputSerializedTree.at(i - 1).second != + inputSerializedTree.at(i).second) { + layer_num += 1; + processed_whole_layer_tokens += i == 0 ? 0 : treeLayers[layer_num - 1]; + } + if (i == 0) { verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); // > if (input.first == verifiedTree.back().first && input.second == verifiedTree.back().second) { - verifiedTree.push_back(output); - new_committed_tokens.push_back(std::make_pair( - input.second, - committed_tokens.at(guid).at(i).second)); // + if (findFirst) { + // must in this branch. 
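// Descriptive note on the slot check below (comments only, not from the patch
// itself): with treeLayers[l] holding the node count of layer l and
// processed_whole_layer_tokens the number of tokens consumed by earlier
// layers, layer_slot = i - processed_whole_layer_tokens is the token's
// position within its layer. The first verified token fixes first_layer_slot,
// and later matches are accepted only at that same slot, so duplicate tokens
// inside a layer cannot make the verifier hop between branches (per the todo
// above, this is currently only exercised for N->1->1->1 shaped trees).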
+ int layer_slot = i - processed_whole_layer_tokens; + int layer_slot_total = treeLayers[layer_num]; + if ((first_layer_slot == layer_slot)) { + verifiedTree.push_back(output); + new_committed_tokens.push_back(std::make_pair( + input.second, committed_tokens.at(guid).at(i).second)); + // at this point, you'll not go other branches + // std::cout << "verify tree push back: " << output.first + // << ", tree size is: " << verifiedTree.size() + // << ", ??: " << input.first << ", " << input.second << + // "\n"; + + } else { + printf("not correct slot\n"); + } + } else { + verifiedTree.push_back(output); + first_layer_slot = i - processed_whole_layer_tokens; + first_layer_slot_total = treeLayers[layer_num]; + findFirst = true; + new_committed_tokens.push_back(std::make_pair( + input.second, + committed_tokens.at(guid).at(i).second)); // + // at this point, you'll not go other branches + // std::cout << "verify tree push back: " << output.first + // << ", tree size is: " << verifiedTree.size() + // << ", ??: " << input.first << ", " << input.second << "\n"; + } + assert(committed_tokens.at(guid).at(i).first == input.second); } } @@ -1804,6 +2125,8 @@ std::vector> << old_bc.beamRequestsInfo[request_index].current_depth << "\n"; std::cout << "[Traverse Beam Tree] beam_width: " << old_bc.beamRequestsInfo[request_index].beam_size << "\n"; + std::cout << "[Traverse Beam Tree] start index: " + << first_token_depth_in_request << "\n"; } auto guid = old_bc.requestsInfo[request_index].request_guid; @@ -1811,18 +2134,30 @@ std::vector> // std::cout << "request.beam_trees.size(): " << request.beam_trees.size() // << std::endl; BeamTree tree = request.beam_trees.at(old_bc.model_id); - // std::cout << "\n\n"; + // std::cout << "print beam tree: " + // << "\n"; + std::vector> serializedTree; + for (int i = 0; i <= old_bc.beamRequestsInfo[request_index].max_depth; i++) { + // std::cout << "tree layer: " << i + // << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer + // << "\n"; + // push tokens into tree + for (int j = 0; j < tree.treeLayers[i].nodes_num_this_layer; j++) { + // std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n"; + serializedTree.push_back(std::make_pair(tree.treeLayers[i].tokens[j], i)); + } + } // token, index // todo make this one global for different stages - std::vector> serializedTree; - PreOrder(tree, - old_bc.beamRequestsInfo[request_index].max_depth, - 0, - old_bc.beamRequestsInfo[request_index].beam_size, - 0, - serializedTree, - verbose); + + // PreOrder(tree, + // old_bc.beamRequestsInfo[request_index].max_depth, + // 0, + // old_bc.beamRequestsInfo[request_index].beam_size, + // 0, + // serializedTree, + // verbose); // print it if (verbose) { @@ -1857,6 +2192,10 @@ std::vector> input_trees, int root_depth, RequestGuid guid) { + assert(input_trees.size() == 1 && "currently using one ssm"); + dfs_tree_inputs[guid] = input_trees.at(0); + return input_trees.at(0); + std::vector> merged_tree; std::unordered_map> childrens; diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index 1e756606f8..fadbf80d6d 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -58,6 +58,91 @@ void RequestManager::load_tokens_task( stream)); } +void RequestManager::load_batch_config_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 0); + assert(task->regions.size() == 0); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // BatchConfig const 
batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + + // copy meta data to workSpace + FFHandler handle = *((FFHandler const *)task->local_args); + size_t total_copy_size = 0; + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + sizeof(BatchConfig::tokensInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::tokensInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->requestsInfo), + sizeof(BatchConfig::requestsInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::requestsInfo); + + // load speculative metadata + if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + BeamSearchBatchConfig const *beam_batch_config = + static_cast(batch_config); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + hipMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + hipMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::causalMask); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + TreeVerifyBatchConfig const *tree_batch_config = + static_cast(batch_config); + + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + hipMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + } + + // add a size check + assert(total_copy_size <= handle.batch_config_metadata_size); +} + void RequestManager::load_positions_task( Task const *task, std::vector const ®ions, diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index cd3e03fff6..51c52c3026 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -30,6 +30,7 @@ void RequestManager::load_tokens_task( // BatchConfig const batch_config = *((BatchConfig *)task->args); BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); + BatchConfig::TokenId dram_copy[BatchConfig::MAX_NUM_TOKENS]; // Extreme long prompts are not supported, only load up to @@ -57,6 +58,91 @@ void RequestManager::load_tokens_task( stream)); } +void RequestManager::load_batch_config_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 0); + assert(task->regions.size() == 0); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + // BatchConfig const batch_config = *((BatchConfig *)task->args); + BatchConfig const *batch_config = 
BatchConfig::from_future(task->futures[0]); + + // copy meta data to workSpace + FFHandler handle = *((FFHandler const *)task->local_args); + size_t total_copy_size = 0; + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + &(batch_config->tokensInfo), + sizeof(BatchConfig::tokensInfo), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::tokensInfo); + + checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + + total_copy_size, + &(batch_config->requestsInfo), + sizeof(BatchConfig::requestsInfo), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::requestsInfo); + + // load speculative metadata + if (batch_config->get_mode() == BEAM_SEARCH_MODE) { + BeamSearchBatchConfig const *beam_batch_config = + static_cast(batch_config); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::causalMask); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { + TreeVerifyBatchConfig const *tree_batch_config = + static_cast(batch_config); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); + total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + } + + // add a size check + assert(total_copy_size <= handle.batch_config_metadata_size); +} + void RequestManager::load_positions_task( Task const *task, std::vector const ®ions, From 3047c82aab223b7ff2f6b49cc5489bd89d5b07af Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 31 Dec 2023 19:17:30 -0500 Subject: [PATCH 22/61] Reducing memory requirements by reusing logical regions (#1254) * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * Reuse regions for inference to reduce memory requirement * bug fix when reused regions are assigned to different pipeline stages --- include/flexflow/model.h | 11 +- include/flexflow/ops/fused.h | 11 +- src/mapper/mapper.cc | 3 +- src/ops/fused.cc | 56 ++++++- src/ops/fused.cu | 31 ++-- src/ops/inc_multihead_self_attention.cu | 14 ++ src/runtime/inference_manager.cc | 190 +++++++++--------------- src/runtime/model.cc | 116 ++++++++------- 8 files changed, 239 insertions(+), 193 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 16df99ab1a..cda1f91c89 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1034,8 +1034,15 
@@ class FFModel { void get_metrics(); void backward(int seq_length = -1); void update(); - bool apply_fusion(std::vector const &operators, - std::vector &new_operators); + bool apply_fusion( + std::vector const &operators, + std::vector &new_operators, + std::unordered_map> + *parallel_tensor_mapping = nullptr); + bool check_operators_integrity( + std::vector const &old_operators, + std::unordered_map> + *pt_mapping = nullptr); Op *get_final_operator() const; void compile(LossType loss_type, std::vector const &metrics, diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index 87c2201c28..a8326e9ab4 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -23,7 +23,16 @@ class FusedOp : public Op { SOURCE_OUTPUT, }; FusedOp(FFModel &model, Op *op); - bool add_operator(FFModel &model, Op *op); + static bool use_same_regions( + ParallelTensor const source_tensor, + ParallelTensor const target_tensor, + std::unordered_map> + *pt_mapping = nullptr); + bool add_operator( + FFModel &model, + Op *op, + std::unordered_map> + *parallel_tensor_mapping = nullptr); ParallelTensor init_inout(FFModel &model, const ParallelTensor input) { assert(0); return ParallelTensor(); diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index a86a6167a6..a2fb1d89be 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -934,13 +934,14 @@ void FFMapper::map_inline(const MapperContext ctx, &footprint)) { log_ff_mapper.error( "FlexFlow Mapper failed allocation of size %zd bytes" - " for region requirement of inline ammping in task %s (UID %lld)" + " for region requirement of inline mapping in task %s (UID %lld)" " in memory " IDFMT "for processor " IDFMT ".", footprint, inline_op.parent_task->get_task_name(), inline_op.parent_task->get_unique_id(), target_memory.id, inline_op.parent_task->current_proc.id); + printf("target_memory.kind() = %d\n", target_memory.kind()); assert(false); } else { output.chosen_instances.push_back(result); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 1d5db2f461..9ad5c4dc9c 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -115,7 +115,42 @@ FusedOp::FusedOp(FFModel &model, Op *op) } } -bool FusedOp::add_operator(FFModel &model, Op *op) { +bool FusedOp::use_same_regions( + ParallelTensor const source_tensor, + ParallelTensor const target_tensor, + std::unordered_map> + *pt_mapping) { + if (pt_mapping == nullptr) { + return (source_tensor->region == target_tensor->region); + } else { + assert(pt_mapping->find(source_tensor) != pt_mapping->end()); + assert(pt_mapping->find(target_tensor) != pt_mapping->end()); + std::vector const &source_mapped_tensor_vector = + (*pt_mapping)[source_tensor]; + std::vector const &target_mapped_tensor_vector = + (*pt_mapping)[target_tensor]; + assert(source_mapped_tensor_vector.size() == + target_mapped_tensor_vector.size()); + bool same_region = source_mapped_tensor_vector[0]->region == + target_mapped_tensor_vector[0]->region + ? 
true + : false; + // Same that the two vectors use the exact same regions + if (same_region) { + for (size_t i = 0; i < source_mapped_tensor_vector.size(); i++) { + assert(source_mapped_tensor_vector[i]->region == + target_mapped_tensor_vector[i]->region); + } + } + return same_region; + } +} + +bool FusedOp::add_operator( + FFModel &model, + Op *op, + std::unordered_map> + *pt_mapping) { // Context ctx = model.config.lg_ctx; // Runtime* runtime = model.config.lg_hlr; // Currently assume fusion optimization is performed @@ -164,7 +199,7 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { for (int i = 0; i < op->numInputs; i++) { bool found = false; for (int j = 0; j < numInputs; j++) { - if (inputs[j]->region == op->inputs[i]->region) { + if (use_same_regions(inputs[j], op->inputs[i], pt_mapping)) { // This input is one of my inputs assert(!found); assert(inputs[j]->region != LogicalRegion::NO_REGION); @@ -175,7 +210,7 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { } } for (int j = 0; j < numOutputs; j++) { - if ((outputs[j]->region == op->inputs[i]->region) && (!found)) { + if (use_same_regions(outputs[j], op->inputs[i], pt_mapping) && (!found)) { // This input is one of my outputs assert(!found); assert(outputs[j]->region != LogicalRegion::NO_REGION); @@ -201,6 +236,11 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { for (int i = 0; i < op->numWeights; i++) { bool found = false; for (int j = 0; j < numWeights; j++) { + // pt_mapping does not apply to weights + if (pt_mapping != nullptr) { + assert(pt_mapping->find(weights[j]) == pt_mapping->end()); + assert(pt_mapping->find(op->weights[i]) == pt_mapping->end()); + } if (weights[j]->region == op->weights[i]->region) { assert(!found); assert(weights[j]->region != LogicalRegion::NO_REGION); @@ -226,7 +266,7 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { for (int i = 0; i < op->numOutputs; i++) { bool found = false; for (int j = 0; j < numOutputs; j++) { - if (outputs[j]->region == op->outputs[i]->region) { + if (use_same_regions(outputs[j], op->outputs[i], pt_mapping)) { assert(!found); found = true; op_output_source[output_offset + i] = SOURCE_OUTPUT; @@ -347,22 +387,26 @@ void FusedOp::init_inference(FFModel const &ff, Domain domain = runtime->get_index_space_domain(ctx, parallel_is); int ioff = 0, ooff = 0; for (int op = 0; op < numOperators; op++) { - // prepare batch_inputs, batch_outputs for operators[i] + // prepare batch_inputs, batch_outputs for operators[op] std::vector my_batch_inputs; std::vector my_batch_outputs; for (int i = 0; i < op_num_inputs[op]; i++) { int my_off = op_input_idx[i + ioff]; if (op_input_source[i + ioff] == SOURCE_INPUT) { + assert(my_off < batch_inputs.size()); my_batch_inputs.push_back(batch_inputs[my_off]); } else if (op_input_source[i + ioff] == SOURCE_OUTPUT) { + assert(my_off < batch_outputs.size()); my_batch_inputs.push_back(batch_outputs[my_off]); } else { assert(false); } } for (int i = 0; i < op_num_outputs[op]; i++) { + int my_off = op_output_idx[i + ooff]; assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - my_batch_outputs.push_back(batch_outputs[i + ooff]); + assert(my_off < batch_outputs.size()); + my_batch_outputs.push_back(batch_outputs[my_off]); } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index b157453035..c6ba0b04c5 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -173,10 +173,11 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[i] = 
weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[i + ooff]; + // my_od[i] = output_domain[my_off]; + // my_op[i] = output_ptr[my_off]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -619,9 +620,11 @@ __host__ void int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; + assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; + assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { assert(false); @@ -631,13 +634,16 @@ __host__ void assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + assert(my_off < fused->numOutputs); // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -1108,7 +1114,8 @@ __host__ void weight_accessor[fused->op_weight_idx[i + woff]]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); + int my_off = fused->op_output_idx[i + ooff]; + output_accessors_to_save.push_back(output_accessor[my_off]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -1310,13 +1317,13 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; - // my_grad_od[i] = output_grad_domain[fused->op_output_idx[i + ooff]]; - // my_grad_op[i] = output_grad_ptr[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; + int my_off = fused->op_output_idx[i + ooff]; + // my_od[i] = output_domain[my_off]; + // my_op[i] = output_ptr[my_off]; + my_output_accessor[i] = output_accessor[my_off]; + // my_grad_od[i] = output_grad_domain[my_off]; + // my_grad_op[i] = output_grad_ptr[my_off]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index da70e23f87..db64868cb9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu 
@@ -1530,4 +1530,18 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( half const *bias_ptr, int num_tokens, cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 8af0ed8978..cc76da58bb 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -151,7 +151,9 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { std::vector list; bool found_parallel_tensor = false; - if (model->cpu_offload) { + // Always enable memory reuse + // if (model->cpu_offload) { + if (true) { for (auto const &pre_pt : tensor_buffer) { bool used_by_future_operator = false; bool used_by_current_operator = false; @@ -159,6 +161,12 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // Continue if shape mismatches continue; } + // Skip if pre_pt and pt_base are in different pipeline stages + // we compare their pipeline stages using the machine views + // of the first data pipeline + if (pre_pt.second[0]->machine_view != machine_views[0]) { + continue; + } // Check that pt cannot be used as an input to the current operator for (int j = 0; j < op->numInputs; j++) { if (parallel_tensor_list_overlaps(tensor_buffer[op->inputs[j]], @@ -221,6 +229,67 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } // std::cout << std::endl; } + + // Perform fusion optimizations + if (model->config.perform_fusion) { + fprintf(stderr, "Applying fusion optimizations during compilation...\n"); + fprintf( + stderr, "%zu operators before fusion...\n", model->operators.size()); + std::vector new_operators; + std::vector old_operators = model->operators; + while ( + model->apply_fusion(model->operators, new_operators, &tensor_buffer)) { + for (size_t i = 0; i < new_operators.size(); i++) { + for (int idx = 0; idx < new_operators[i]->numInputs; idx++) { + for (size_t j = i + 1; j < new_operators.size(); j++) { + if (new_operators[i]->inputs[idx]->owner_op == new_operators[j]) { + assert(false); + } + } + } + } + model->operators = new_operators; + } + assert(model->check_operators_integrity(old_operators, &tensor_buffer)); + fprintf(stderr, "%zu operators after fusion...\n", model->operators.size()); + } + + // print optimized graph + for (size_t i = 0; i < model->operators.size(); i++) { + Op *op = model->operators[i]; + if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { + continue; + } + printf("operator[%zu]: type(%s) guid(%lu)\n", + i, + get_operator_type_name(model->operators[i]->op_type).c_str(), + model->operators[i]->op_guid); + for (int j = 0; j < op->numInputs; j++) { + assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; + printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + for (int j = 0; j < op->numOutputs; j++) { + LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; + printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", + j, + 
handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + for (int j = 0; j < op->numWeights; j++) { + LogicalRegion handle = op->weights[j]->region; + printf("\tweights[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); + } + } } void InferenceManager::init_operators_inference(FFModel *model) { @@ -577,124 +646,7 @@ void FFModel::compile_inference() { assert(op->outputs[i]->parallel_tensor_guid != 0); } } - // Perform fusion optimizations - if (config.perform_fusion) { - fprintf(stderr, "Applying fusion optimizations during compilation...\n"); - fprintf(stderr, "%zu operators before fusion...\n", operators.size()); - std::vector new_operators; - std::vector old_operators = operators; - while (apply_fusion(operators, new_operators)) { - for (size_t i = 0; i < new_operators.size(); i++) { - for (int idx = 0; idx < new_operators[i]->numInputs; idx++) { - for (size_t j = i + 1; j < new_operators.size(); j++) { - if (new_operators[i]->inputs[idx]->owner_op == new_operators[j]) { - assert(false); - } - } - } - } - operators = new_operators; - } - // Check integrity - for (size_t l = 0; l < operators.size(); l++) { - if (operators[l]->op_type == OP_FUSED) { - FusedOp *fused = (FusedOp *)operators[l]; - int ioff = 0, woff = 0, ooff = 0; - for (int op = 0; op < fused->numOperators; op++) { - Op *old_op = fused->operators[op]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == FusedOp::SOURCE_INPUT) { - assert(fused->inputs[my_off]->region == - old_op->inputs[i]->region); - } else if (fused->op_input_source[i + ioff] == - FusedOp::SOURCE_OUTPUT) { - assert(fused->outputs[my_off]->region == - old_op->inputs[i]->region); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - int my_off = fused->op_weight_idx[i + woff]; - assert(fused->op_weight_source[i + woff] == FusedOp::SOURCE_WEIGHT); - assert(fused->weights[my_off]->region == - old_op->weights[i]->region); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(fused->outputs[my_off]->region == - old_op->outputs[i]->region); - } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; - } - } else { - bool found = false; - for (size_t i = 0; i < old_operators.size(); i++) { - if (old_operators[i] == operators[l]) { - assert(!found); - found = true; - } - } - assert(found); - } - } - fprintf(stderr, "%zu operators after fusion...\n", operators.size()); - for (size_t i = 0; i < operators.size(); i++) { - Op *op = operators[i]; - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(operators[i]->op_type).c_str(), - operators[i]->op_guid); - for (int j = 0; j < op->numInputs; j++) { - LogicalRegion handle = op->inputs[j]->region; - printf("\tinputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numOutputs; j++) { - LogicalRegion handle = op->outputs[j]->region; - printf("\toutputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numWeights; j++) { - LogicalRegion handle = 
op->weights[j]->region; - printf("\tweights[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - } - } - for (size_t i = 0; i < operators.size(); i++) { - Op *op = operators[i]; - printf("operator[%zu]: type(%d)\n", i, operators[i]->op_type); - for (int j = 0; j < op->numInputs; j++) { - LogicalRegion handle = op->inputs[j]->region; - printf("\tinputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - for (int j = 0; j < op->numOutputs; j++) { - LogicalRegion handle = op->outputs[j]->region; - printf("\toutputs[%d] region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); - } - } + #ifdef FF_USE_NCCL for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 37605c44a4..3bfe429ddd 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2859,8 +2859,11 @@ void FFModel::compile(Optimizer *_optimizer, compile(loss_type, metrics, comp_mode); } -bool FFModel::apply_fusion(std::vector const &operators, - std::vector &new_operators) { +bool FFModel::apply_fusion( + std::vector const &operators, + std::vector &new_operators, + std::unordered_map> + *parallel_tensor_mapping) { // Context ctx = config.lg_ctx; // Runtime* runtime = config.lg_hlr; for (size_t l = 1; l < operators.size() - 1; l++) { @@ -2925,7 +2928,8 @@ bool FFModel::apply_fusion(std::vector const &operators, fused_op = new FusedOp(*this, operators[i]); allocate_new_fused_op = true; } - if (fused_op->add_operator(*this, operators[l])) { + if (fused_op->add_operator( + *this, operators[l], parallel_tensor_mapping)) { // Construct new operators new_operators.clear(); for (size_t j = 0; j < i; j++) { @@ -2943,7 +2947,9 @@ bool FFModel::apply_fusion(std::vector const &operators, (op->inputs[idx]->owner_op == operators[i])) { int found = -1; for (int k = 0; k < fused_op->numOutputs; k++) { - if (fused_op->outputs[k]->region == op->inputs[idx]->region) { + if (fused_op->use_same_regions(fused_op->outputs[k], + op->inputs[idx], + parallel_tensor_mapping)) { assert(found == -1); found = k; } @@ -2959,7 +2965,6 @@ bool FFModel::apply_fusion(std::vector const &operators, assert(new_operators.size() + 1 == operators.size()); return true; } else { - // TODO: delete fused_op to avoid memory leakage if (allocate_new_fused_op) { delete fused_op; } @@ -3490,53 +3495,7 @@ void FFModel::compile(LossType loss_type, } operators = new_operators; } - // Check integrity - for (size_t l = 0; l < operators.size(); l++) { - if (operators[l]->op_type == OP_FUSED) { - FusedOp *fused = (FusedOp *)operators[l]; - int ioff = 0, woff = 0, ooff = 0; - for (int op = 0; op < fused->numOperators; op++) { - Op *old_op = fused->operators[op]; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == FusedOp::SOURCE_INPUT) { - assert(fused->inputs[my_off]->region == - old_op->inputs[i]->region); - } else if (fused->op_input_source[i + ioff] == - FusedOp::SOURCE_OUTPUT) { - assert(fused->outputs[my_off]->region == - old_op->inputs[i]->region); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - int my_off = fused->op_weight_idx[i + woff]; - assert(fused->op_weight_source[i + woff] == 
FusedOp::SOURCE_WEIGHT); - assert(fused->weights[my_off]->region == - old_op->weights[i]->region); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(fused->outputs[my_off]->region == - old_op->outputs[i]->region); - } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; - } - } else { - bool found = false; - for (size_t i = 0; i < old_operators.size(); i++) { - if (old_operators[i] == operators[l]) { - assert(!found); - found = true; - } - } - assert(found); - } - } + assert(check_operators_integrity(old_operators)); fprintf(stderr, "%zu operators after fusion...\n", operators.size()); for (size_t i = 0; i < operators.size(); i++) { Op *op = operators[i]; @@ -3678,6 +3637,59 @@ void FFModel::compile(LossType loss_type, #endif } +bool FFModel::check_operators_integrity( + std::vector const &old_operators, + std::unordered_map> + *pt_mapping) { + // Check integrity + for (size_t l = 0; l < operators.size(); l++) { + if (operators[l]->op_type == OP_FUSED) { + FusedOp *fused = (FusedOp *)operators[l]; + int ioff = 0, woff = 0, ooff = 0; + for (int op = 0; op < fused->numOperators; op++) { + Op *old_op = fused->operators[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == FusedOp::SOURCE_INPUT) { + assert(FusedOp::use_same_regions( + fused->inputs[my_off], old_op->inputs[i], pt_mapping)); + } else if (fused->op_input_source[i + ioff] == + FusedOp::SOURCE_OUTPUT) { + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->inputs[i], pt_mapping)); + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + int my_off = fused->op_weight_idx[i + woff]; + assert(fused->op_weight_source[i + woff] == FusedOp::SOURCE_WEIGHT); + assert(fused->weights[my_off]->region == old_op->weights[i]->region); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + } + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + } else { + bool found = false; + for (size_t i = 0; i < old_operators.size(); i++) { + if (old_operators[i] == operators[l]) { + assert(!found); + found = true; + } + } + assert(found); + } + } + return true; +} + struct PropagationEdgeInfo { Op *dstOp; size_t size; From 1901f65bc2045860d4c26c26c2a158b270cb300a Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Sun, 31 Dec 2023 23:25:21 -0500 Subject: [PATCH 23/61] embedding return when no token --- src/ops/embedding.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 76236e65ff..3be3eac618 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -478,6 +478,7 @@ FutureMap Embedding::inference(FFModel const &ff, 0 /*mapper_id*/, machine_view_hash); // regions[0]: input + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection*/, READ_ONLY, @@ -516,6 +517,10 @@ void Embedding::forward_task(Task const *task, assert(task->regions.size() == 3); // Assert that weight and output must have the same data type // otherwise, a cast operator 
should be inserted + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } assert(m->weight_type[0] == m->output_type[0]); assert(m->input_type[0] == DT_INT32 || m->input_type[0] == DT_INT64); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( From 130ad92f8369d6ba39dd470dafd160b844e49e99 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 1 Jan 2024 01:39:41 -0500 Subject: [PATCH 24/61] use arg topk instead of beam topk --- include/flexflow/flexflow_c.h | 1 + include/flexflow/model.h | 2 + include/flexflow/ops/arg_topk.h | 16 ++- include/flexflow/ops/arg_topk_params.h | 1 + inference/models/llama.cc | 2 +- python/flexflow/core/flexflow_cffi.py | 5 +- src/c/flexflow_c.cc | 4 +- src/ops/arg_topk.cc | 185 +++++++++++++++++++------ src/ops/arg_topk.cu | 91 +++++++++--- src/runtime/model.cc | 18 +++ 10 files changed, 258 insertions(+), 67 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 01a2818a2b..305c8da513 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -571,6 +571,7 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, bool sorted, + bool speculative_decoding, char const *name); flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 16df99ab1a..01244a371b 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -146,6 +146,7 @@ enum TaskIDs { TOPK_BWD_TASK_ID, ARG_TOPK_INIT_TASK_ID, ARG_TOPK_INF_TASK_ID, + ARG_TOPK_INF_SPECULATIVE_TASK_ID, SAMPLING_INIT_TASK_ID, SAMPLING_INF_TASK_ID, ARGMAX_INIT_TASK_ID, @@ -674,6 +675,7 @@ class FFModel { // Tensor *outputs, int k, bool sorted, + bool speculative_decoding, char const *name = NULL); Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL); Tensor sampling(const Tensor input, float top_p, char const *name = NULL); diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h index 8b2d2aa11c..3822a5e41e 100644 --- a/include/flexflow/ops/arg_topk.h +++ b/include/flexflow/ops/arg_topk.h @@ -12,6 +12,8 @@ class ArgTopKMeta : public OpMeta { public: ArgTopKMeta(FFHandler handle, Op const *op); bool sorted; + int k; + bool speculative_decoding; }; class ArgTopK : public Op { @@ -23,6 +25,7 @@ class ArgTopK : public Op { const ParallelTensor input, int k, bool sorted, + bool speculative_decoding, char const *name); ArgTopK(FFModel &model, LayerID const &layer_guid, @@ -61,6 +64,11 @@ class ArgTopK : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static BeamInferenceResult inference_speculative_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); void serialize(Legion::Serializer &s) const override; static PCG::Node deserialize(FFModel &ff, Legion::Deserializer &d, @@ -75,22 +83,26 @@ class ArgTopK : public Op { template static void forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, ffStream_t stream); static void forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &prob, GenericTensorAccessorW const &indices, - int batch_size); + int batch_size, + BeamSearchBatchConfig const *bc); 
Params get_params() const; public: int k; bool sorted; + bool speculative_decoding; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h index 9d2a21034f..bd9c38e2a9 100644 --- a/include/flexflow/ops/arg_topk_params.h +++ b/include/flexflow/ops/arg_topk_params.h @@ -11,6 +11,7 @@ struct ArgTopKParams { LayerID layer_guid; int k; bool sorted; + bool speculative_decoding; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(ArgTopKParams const &, ArgTopKParams const &); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 10001ee916..e9c84efe90 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -247,7 +247,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor softmax = ff.softmax(dense, -1); // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); // output = ff.argmax(softmax, /*beam_Search*/ true); - output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + output = ff.arg_top_k(softmax, llama_config.max_beam_width, false, true); // output = ff.top_k(softmax, ) } else { // Tensor softmax = ff.softmax(dense, -1); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index de3f7e6929..a3c221474d 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3349,7 +3349,7 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM ) - def arg_top_k(self, input, k, sorted, name=None): + def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): """Defines the Arg TopK layer. :param input: the input Tensor. @@ -3361,6 +3361,9 @@ def arg_top_k(self, input, k, sorted, name=None): :param sorted: Whether the entries should be sorted :type sorted: bool + :param speculative_decoding: Whether you need to perform beam search + :type speculative_decoding: bool + :param name: the name of the layer. Default is None. :type name: string diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 80202f6f99..579fc5e2d1 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1489,10 +1489,12 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, bool sorted, + bool speculative_decoding, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->arg_top_k(input, k, sorted, name); + Tensor tensor = + handle->arg_top_k(input, k, sorted, speculative_decoding, name); return FFCObjectWrapper::wrap(tensor); } diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index a06b89de07..2727a1d249 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -51,6 +51,7 @@ using PCG::Node; Tensor FFModel::arg_top_k(const Tensor input, int k, bool sorted, + bool speculative_decoding, char const *name) { Layer *li = new Layer(this, OP_ARG_TOPK, @@ -58,7 +59,7 @@ Tensor FFModel::arg_top_k(const Tensor input, name, 1 /*inputs*/, 0 /*weights*/, - 1 /*outputs*/, + speculative_decoding ? 
2 : 1 /*outputs*/, input); { int numdims = input->num_dims; @@ -71,9 +72,14 @@ Tensor FFModel::arg_top_k(const Tensor input, // numdims, dims, input->data_type, li, 0, true /*create_grad*/); li->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + if (speculative_decoding) { + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); + } } li->add_int_property("k", k); li->add_int_property("sorted", sorted); + li->add_int_property("speculative_decoding", speculative_decoding); layers.push_back(li); // outputs[0] = li->outputs[0]; // outputs[1] = li->outputs[1]; @@ -89,14 +95,23 @@ Op *ArgTopK::create_operator_from_layer( int k = value; layer->get_int_property("sorted", value); bool sorted = (bool)value; - return new ArgTopK( - model, layer->layer_guid, inputs[0], k, sorted, layer->name); + layer->get_int_property("speculative_decoding", value); + bool speculative_decoding = (bool)value; + + return new ArgTopK(model, + layer->layer_guid, + inputs[0], + k, + sorted, + speculative_decoding, + layer->name); } ArgTopKParams ArgTopK::get_params() const { ArgTopKParams params; params.k = this->k; params.sorted = this->sorted; + params.speculative_decoding = this->speculative_decoding; return params; } @@ -106,7 +121,8 @@ bool ArgTopKParams::is_valid(ParallelTensorShape const &) const { } bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) { - return lhs.k == rhs.k && lhs.sorted == rhs.sorted; + return lhs.k == rhs.k && lhs.sorted == rhs.sorted && + lhs.speculative_decoding == rhs.speculative_decoding; } ArgTopK::ArgTopK(FFModel &model, @@ -114,6 +130,7 @@ ArgTopK::ArgTopK(FFModel &model, const ParallelTensor _input, int _k, bool _sorted, + bool _speculative_decoding, char const *name) : Op(model, OP_ARG_TOPK, @@ -121,9 +138,9 @@ ArgTopK::ArgTopK(FFModel &model, name, 1 /*inputs*/, 0 /*weights*/, - 1 /*outputs*/, + _speculative_decoding ? 
2 : 1 /*outputs*/, _input), - k(_k), sorted(_sorted) { + k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) { // overwrite layer_guid layer_guid = _layer_guid; int numdim = inputs[0]->num_dims; @@ -131,26 +148,42 @@ ArgTopK::ArgTopK(FFModel &model, for (int i = 0; i < numdim; i++) { dims[i] = inputs[0]->dims[i]; } + dims[0].size = k; assert(inputs[0]->dims[0].degree == 1); assert(inputs[0]->dims[0].parallel_idx == -1); - // outputs[0] = model.create_parallel_tensor_legion_ordering( - // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, DT_INT32, this, 0 /*owner_idx*/); + if (_speculative_decoding) { + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); + } } ArgTopK::ArgTopK(FFModel &model, LayerID const &layer_guid, ArgTopK const &other, const ParallelTensor input) - : ArgTopK(model, layer_guid, input, other.k, other.sorted, other.name) {} + : ArgTopK(model, + layer_guid, + input, + other.k, + other.sorted, + other.speculative_decoding, + other.name) {} ArgTopK::ArgTopK(FFModel &model, ArgTopKParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) - : ArgTopK(model, params.layer_guid, input, params.k, params.sorted, name) {} + : ArgTopK(model, + params.layer_guid, + input, + params.k, + params.sorted, + params.speculative_decoding, + name) {} void ArgTopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -243,8 +276,10 @@ OpMeta *ArgTopK::init_task(Task const *task, m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; + m->k = topk->k; std::strcpy(m->op_name, topk->name); m->layer_guid = topk->layer_guid; + m->speculative_decoding = topk->speculative_decoding; return m; } @@ -267,34 +302,64 @@ FutureMap ArgTopK::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - // 0 /*projection id*/, - // WRITE_ONLY, - // EXCLUSIVE, - // batch_outputs[1]->region)); - // launcher.add_field(2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + if (speculative_decoding) { + IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, 
FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + + } else { + IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + } } InferenceResult @@ -317,9 +382,11 @@ InferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW probs; int batch_size = bc->num_active_tokens(); - ArgTopK::forward_kernel_wrapper(m, input, indices, batch_size); + ArgTopK::forward_kernel_wrapper( + m, input, probs, indices, batch_size, nullptr); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -334,6 +401,39 @@ InferenceResult return ir; } +BeamInferenceResult ArgTopK::inference_speculative_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 3); + assert(task->regions.size() == 3); + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + if (bc.num_active_tokens() == 0) { + // Directly return for empty batch config + BeamInferenceResult ir; + return ir; + } + ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + + int batch_size = bc.num_active_tokens(); + ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); + + BeamInferenceResult ir; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); + download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + return ir; +} + void ArgTopK::backward(FFModel const &ff) { // ArgTopK does not support backward assert(false); @@ -345,6 +445,7 @@ void ArgTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->k); sez.serialize(this->sorted); + sez.serialize(this->speculative_decoding); } Node ArgTopK::deserialize(FFModel &ff, @@ -359,12 +460,15 @@ Node ArgTopK::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); int k; bool sorted; + bool speculative_decoding; dez.deserialize(k); dez.deserialize(sorted); + dez.deserialize(speculative_decoding); ArgTopKParams params; params.layer_guid = layer_guid; params.k = k; params.sorted = sorted; + params.speculative_decoding = speculative_decoding; return ff.get_or_create_node(inputs[0], params); } @@ 
-390,6 +494,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.k); hash_combine(key, params.sorted); + hash_combine(key, params.speculative_decoding); return key; } }; // namespace std diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 575e0183b4..0b8bb8b563 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -262,8 +262,9 @@ __device__ void mergeShards(int num_shards, int k, Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - // T *top_k_values, - int *top_k_indices) { + float *top_k_values, + int *top_k_indices, + bool speculative_decoding) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. // If k > num_shards, we can initialize a min-heap with the top element from @@ -313,7 +314,11 @@ __device__ void mergeShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - // top_k_values[rank] = max_element.value; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + } + int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; int next_shard_index = shard_index + num_shards; @@ -337,8 +342,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, int length, int k, bool sorted, - // T *__restrict__ output, - int *__restrict__ indices) { + float *__restrict__ output, + int *__restrict__ indices, + bool speculative_decoding) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; T const *batch_input = input + batch_index * length; @@ -350,15 +356,16 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, __syncthreads(); if (thread_index == 0) { int const offset = batch_index * k; - // auto batch_output = output + offset; + auto batch_output = output + offset; auto batch_indices = indices + offset; Entry *top_k_heap = shared_entries + thread_count * k; mergeShards(thread_count, k, shared_entries, top_k_heap, - // batch_output, - batch_indices); + batch_output, + batch_indices, + speculative_decoding); } } @@ -366,12 +373,13 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, template void ArgTopK::forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, cudaStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -390,24 +398,58 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - assert(num_shards >= (size_t)k); - num_shards = k; - arg_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - k, - sorted, - // output_ptr, - indices_ptr); + + // all requests are in the same beam stages + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + + // check + int beam_size = -1; + for (int i = 1; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (beam_size == -1) { + beam_size = bc->beamRequestsInfo[i].beam_size; + } else { + assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } + } + + assert(num_shards >= (size_t)beam_size); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + beam_size, + sorted, + output_ptr, + indices_ptr, + m->speculative_decoding); + } else { + + assert(num_shards >= (size_t)k); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + indices_ptr, + false); + } } /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, // float *output_ptr, + GenericTensorAccessorW const &probs, GenericTensorAccessorW const &indices, - int batch_size) { + int batch_size, + BeamSearchBatchConfig const *bc) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -439,6 +481,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; int k = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; /*TODO: This prints to 5*/ + // batch_size = input.domain.get_volume() / length; // assert(indices.domain.get_volume() / k == batch_size); cudaEvent_t t_start, t_end; @@ -451,22 +494,26 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, if (input.data_type == DT_HALF) { ArgTopK::forward_kernel(m, input.get_half_ptr(), - // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), - // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? 
bc : nullptr, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 37605c44a4..f72d320bc8 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5917,6 +5917,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + "ArgTopK Speculative Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgTopK Speculative Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // BeamTopk task { TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK Init"); From 4259d2dfa5c42488dad76d511517e45c0ad438c7 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 1 Jan 2024 10:08:38 -0500 Subject: [PATCH 25/61] embedding --- include/flexflow/ops/embedding.h | 4 ++ src/ops/embedding.cc | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ae93ef4d1d..0f1b1335d4 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -80,6 +80,10 @@ class Embedding : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 3be3eac618..40d5b600be 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -517,6 +517,70 @@ void Embedding::forward_task(Task const *task, assert(task->regions.size() == 3); // Assert that weight and output must have the same data type // otherwise, a cast operator should be inserted + assert(m->weight_type[0] == m->output_type[0]); + assert(m->input_type[0] == DT_INT32 || m->input_type[0] == DT_INT64); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR kernel = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(input.domain.get_dim() + 1 == output.domain.get_dim()); + for (size_t i = 0; i < input.domain.get_dim(); i++) { + assert(input.domain.hi()[i] == output.domain.hi()[i + 1]); + assert(input.domain.lo()[i] == output.domain.lo()[i + 1]); + } + assert(kernel.domain.hi()[0] - kernel.domain.lo()[0] == + output.domain.hi()[0] - output.domain.lo()[0]); + } else { + // assert(kernel_domain.get_dim() == 2); + assert(input.domain.get_dim() == output.domain.get_dim()); + for (size_t i = 1; i < input.domain.get_dim(); i++) { + assert(input.domain.hi()[i] == output.domain.hi()[i]); + assert(input.domain.lo()[i] == output.domain.lo()[i]); + } + assert(kernel.domain.hi()[0] - kernel.domain.lo()[0] == + output.domain.hi()[0] - output.domain.lo()[0]); + } + + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + 
out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + effective_batch_size = output.domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == input.domain.get_volume()); + } else { + in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + effective_batch_size = output.domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == input.domain.get_volume()); + } + forward_kernel_wrapper( + m, input, output, kernel, in_dim, out_dim, effective_batch_size); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Embedding::save_inference_tensors_to_file( + m, shard_id, nullptr, {input}, {kernel}, {output}); + } +} + +/* + regions[0](I): input + regions[1](O): output + regions[2](I): kernel +*/ +void Embedding::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + EmbeddingMeta *m = *((EmbeddingMeta **)task->local_args); + assert(regions.size() == 3); + assert(task->regions.size() == 3); + // Assert that weight and output must have the same data type + // otherwise, a cast operator should be inserted BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_tokens() == 0) { return; From fae7fba1994aaf3c04da250a04bec3beb217236e Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 1 Jan 2024 10:13:30 -0500 Subject: [PATCH 26/61] fmt --- include/flexflow/ops/embedding.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index 0f1b1335d4..ed89fcf37a 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -83,7 +83,7 @@ class Embedding : public Op { static void inference_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, - Legion::Runtime *runtime); + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, From 8d1d5842253a0b6c894bec14550dd1e88eb9c4fd Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 1 Jan 2024 12:05:12 -0500 Subject: [PATCH 27/61] hip --- src/ops/arg_topk.cpp | 90 ++++++++++++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 24 deletions(-) diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index 6db8abb8c4..f431d3d4bf 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -263,8 +263,9 @@ __device__ void mergeShards(int num_shards, int k, Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - // T *top_k_values, - int *top_k_indices) { + float *top_k_values, + int *top_k_indices, + bool speculative_decoding) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. 
// If k > num_shards, we can initialize a min-heap with the top element from @@ -314,7 +315,10 @@ __device__ void mergeShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - // top_k_values[rank] = max_element.value; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + } int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; int next_shard_index = shard_index + num_shards; @@ -338,8 +342,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, int length, int k, bool sorted, - // T *__restrict__ output, - int *__restrict__ indices) { + float *__restrict__ output, + int *__restrict__ indices, + bool speculative_decoding) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; T const *batch_input = input + batch_index * length; @@ -351,15 +356,16 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, __syncthreads(); if (thread_index == 0) { int const offset = batch_index * k; - // auto batch_output = output + offset; + auto batch_output = output + offset; auto batch_indices = indices + offset; Entry *top_k_heap = shared_entries + thread_count * k; mergeShards(thread_count, k, shared_entries, top_k_heap, - // batch_output, - batch_indices); + batch_output, + batch_indices, + speculative_decoding); } } @@ -367,12 +373,13 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, template void ArgTopK::forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, hipStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -391,28 +398,57 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - assert(num_shards >= (size_t)k); - num_shards = k; - hipLaunchKernelGGL(arg_topk_forward_kernel, - num_blocks, - num_shards, - 0, - stream, - input_ptr, - shared_memory_size, - length, - k, - sorted, - // output_ptr, - indices_ptr); + // all requests are in the same beam stages + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + + // check + int beam_size = -1; + for (int i = 1; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (beam_size == -1) { + beam_size = bc->beamRequestsInfo[i].beam_size; + } else { + assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } + } + + assert(num_shards >= (size_t)beam_size); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + beam_size, + sorted, + output_ptr, + indices_ptr, + m->speculative_decoding); + } else { + + assert(num_shards >= (size_t)k); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + indices_ptr, + false); + } } /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &probs, // float *output_ptr, GenericTensorAccessorW const &indices, - int batch_size) { + int batch_size, + BeamSearchBatchConfig const *bc) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); // Domain in1_domain = runtime->get_index_space_domain( @@ -457,21 +493,27 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, ArgTopK::forward_kernel(m, input.get_half_ptr(), // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else { assert(false && "Unsupported data type"); From 25097e084772ed9693bef408315385a11340671b Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Mon, 1 Jan 2024 15:12:07 -0500 Subject: [PATCH 28/61] SpecInfer: optimize performance (#1255) * init * fix speculative * fix speculative * bitmap+tree verify * fix. * fix * multi batch * copy metadata once * fix some corner cases * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * more fix. * clean up * . 
* load batchconfig * clean * hip * hip * embedding return when no token * use arg topk instead of beam topk * embedding * fmt * hip --------- Co-authored-by: Zhihao Jia --- include/flexflow/flexflow_c.h | 1 + include/flexflow/model.h | 2 + include/flexflow/ops/arg_topk.h | 16 ++- include/flexflow/ops/arg_topk_params.h | 1 + include/flexflow/ops/embedding.h | 4 + inference/models/llama.cc | 2 +- python/flexflow/core/flexflow_cffi.py | 5 +- src/c/flexflow_c.cc | 4 +- src/ops/arg_topk.cc | 185 +++++++++++++++++++------ src/ops/arg_topk.cpp | 90 ++++++++---- src/ops/arg_topk.cu | 91 +++++++++--- src/ops/embedding.cc | 69 +++++++++ src/runtime/model.cc | 18 +++ 13 files changed, 397 insertions(+), 91 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 01a2818a2b..305c8da513 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -571,6 +571,7 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, bool sorted, + bool speculative_decoding, char const *name); flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cda1f91c89..cf7bb3dd2d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -146,6 +146,7 @@ enum TaskIDs { TOPK_BWD_TASK_ID, ARG_TOPK_INIT_TASK_ID, ARG_TOPK_INF_TASK_ID, + ARG_TOPK_INF_SPECULATIVE_TASK_ID, SAMPLING_INIT_TASK_ID, SAMPLING_INF_TASK_ID, ARGMAX_INIT_TASK_ID, @@ -674,6 +675,7 @@ class FFModel { // Tensor *outputs, int k, bool sorted, + bool speculative_decoding, char const *name = NULL); Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL); Tensor sampling(const Tensor input, float top_p, char const *name = NULL); diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h index 8b2d2aa11c..3822a5e41e 100644 --- a/include/flexflow/ops/arg_topk.h +++ b/include/flexflow/ops/arg_topk.h @@ -12,6 +12,8 @@ class ArgTopKMeta : public OpMeta { public: ArgTopKMeta(FFHandler handle, Op const *op); bool sorted; + int k; + bool speculative_decoding; }; class ArgTopK : public Op { @@ -23,6 +25,7 @@ class ArgTopK : public Op { const ParallelTensor input, int k, bool sorted, + bool speculative_decoding, char const *name); ArgTopK(FFModel &model, LayerID const &layer_guid, @@ -61,6 +64,11 @@ class ArgTopK : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static BeamInferenceResult inference_speculative_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); void serialize(Legion::Serializer &s) const override; static PCG::Node deserialize(FFModel &ff, Legion::Deserializer &d, @@ -75,22 +83,26 @@ class ArgTopK : public Op { template static void forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, ffStream_t stream); static void forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &prob, GenericTensorAccessorW const &indices, - int batch_size); + int batch_size, + BeamSearchBatchConfig const *bc); Params get_params() const; public: int k; bool sorted; + bool speculative_decoding; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h index 
9d2a21034f..bd9c38e2a9 100644 --- a/include/flexflow/ops/arg_topk_params.h +++ b/include/flexflow/ops/arg_topk_params.h @@ -11,6 +11,7 @@ struct ArgTopKParams { LayerID layer_guid; int k; bool sorted; + bool speculative_decoding; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(ArgTopKParams const &, ArgTopKParams const &); diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ae93ef4d1d..ed89fcf37a 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -80,6 +80,10 @@ class Embedding : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 10001ee916..e9c84efe90 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -247,7 +247,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor softmax = ff.softmax(dense, -1); // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); // output = ff.argmax(softmax, /*beam_Search*/ true); - output = ff.beam_top_k(softmax, llama_config.max_beam_width, false); + output = ff.arg_top_k(softmax, llama_config.max_beam_width, false, true); // output = ff.top_k(softmax, ) } else { // Tensor softmax = ff.softmax(dense, -1); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index de3f7e6929..a3c221474d 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3349,7 +3349,7 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM ) - def arg_top_k(self, input, k, sorted, name=None): + def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): """Defines the Arg TopK layer. :param input: the input Tensor. @@ -3361,6 +3361,9 @@ def arg_top_k(self, input, k, sorted, name=None): :param sorted: Whether the entries should be sorted :type sorted: bool + :param speculative_decoding: Whether you need to perform beam search + :type speculative_decoding: bool + :param name: the name of the layer. Default is None. :type name: string diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 80202f6f99..579fc5e2d1 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1489,10 +1489,12 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, const flexflow_tensor_t input_, int k, bool sorted, + bool speculative_decoding, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input = FFCObjectWrapper::unwrap(input_); - Tensor tensor = handle->arg_top_k(input, k, sorted, name); + Tensor tensor = + handle->arg_top_k(input, k, sorted, speculative_decoding, name); return FFCObjectWrapper::wrap(tensor); } diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index a06b89de07..2727a1d249 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -51,6 +51,7 @@ using PCG::Node; Tensor FFModel::arg_top_k(const Tensor input, int k, bool sorted, + bool speculative_decoding, char const *name) { Layer *li = new Layer(this, OP_ARG_TOPK, @@ -58,7 +59,7 @@ Tensor FFModel::arg_top_k(const Tensor input, name, 1 /*inputs*/, 0 /*weights*/, - 1 /*outputs*/, + speculative_decoding ? 
2 : 1 /*outputs*/, input); { int numdims = input->num_dims; @@ -71,9 +72,14 @@ Tensor FFModel::arg_top_k(const Tensor input, // numdims, dims, input->data_type, li, 0, true /*create_grad*/); li->outputs[0] = create_tensor_legion_ordering( numdims, dims, DT_INT32, li, 0, false /*create_grad*/); + if (speculative_decoding) { + li->outputs[1] = create_tensor_legion_ordering( + numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/); + } } li->add_int_property("k", k); li->add_int_property("sorted", sorted); + li->add_int_property("speculative_decoding", speculative_decoding); layers.push_back(li); // outputs[0] = li->outputs[0]; // outputs[1] = li->outputs[1]; @@ -89,14 +95,23 @@ Op *ArgTopK::create_operator_from_layer( int k = value; layer->get_int_property("sorted", value); bool sorted = (bool)value; - return new ArgTopK( - model, layer->layer_guid, inputs[0], k, sorted, layer->name); + layer->get_int_property("speculative_decoding", value); + bool speculative_decoding = (bool)value; + + return new ArgTopK(model, + layer->layer_guid, + inputs[0], + k, + sorted, + speculative_decoding, + layer->name); } ArgTopKParams ArgTopK::get_params() const { ArgTopKParams params; params.k = this->k; params.sorted = this->sorted; + params.speculative_decoding = this->speculative_decoding; return params; } @@ -106,7 +121,8 @@ bool ArgTopKParams::is_valid(ParallelTensorShape const &) const { } bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) { - return lhs.k == rhs.k && lhs.sorted == rhs.sorted; + return lhs.k == rhs.k && lhs.sorted == rhs.sorted && + lhs.speculative_decoding == rhs.speculative_decoding; } ArgTopK::ArgTopK(FFModel &model, @@ -114,6 +130,7 @@ ArgTopK::ArgTopK(FFModel &model, const ParallelTensor _input, int _k, bool _sorted, + bool _speculative_decoding, char const *name) : Op(model, OP_ARG_TOPK, @@ -121,9 +138,9 @@ ArgTopK::ArgTopK(FFModel &model, name, 1 /*inputs*/, 0 /*weights*/, - 1 /*outputs*/, + _speculative_decoding ? 
2 : 1 /*outputs*/, _input), - k(_k), sorted(_sorted) { + k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) { // overwrite layer_guid layer_guid = _layer_guid; int numdim = inputs[0]->num_dims; @@ -131,26 +148,42 @@ ArgTopK::ArgTopK(FFModel &model, for (int i = 0; i < numdim; i++) { dims[i] = inputs[0]->dims[i]; } + dims[0].size = k; assert(inputs[0]->dims[0].degree == 1); assert(inputs[0]->dims[0].parallel_idx == -1); - // outputs[0] = model.create_parallel_tensor_legion_ordering( - // numdim, dims, _input->data_type, this, 0 /*owner_idx*/); + outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, DT_INT32, this, 0 /*owner_idx*/); + if (_speculative_decoding) { + outputs[1] = model.create_parallel_tensor_legion_ordering( + numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/); + } } ArgTopK::ArgTopK(FFModel &model, LayerID const &layer_guid, ArgTopK const &other, const ParallelTensor input) - : ArgTopK(model, layer_guid, input, other.k, other.sorted, other.name) {} + : ArgTopK(model, + layer_guid, + input, + other.k, + other.sorted, + other.speculative_decoding, + other.name) {} ArgTopK::ArgTopK(FFModel &model, ArgTopKParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) - : ArgTopK(model, params.layer_guid, input, params.k, params.sorted, name) {} + : ArgTopK(model, + params.layer_guid, + input, + params.k, + params.sorted, + params.speculative_decoding, + name) {} void ArgTopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -243,8 +276,10 @@ OpMeta *ArgTopK::init_task(Task const *task, m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; + m->k = topk->k; std::strcpy(m->op_name, topk->name); m->layer_guid = topk->layer_guid; + m->speculative_decoding = topk->speculative_decoding; return m; } @@ -267,34 +302,64 @@ FutureMap ArgTopK::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - // 0 /*projection id*/, - // WRITE_ONLY, - // EXCLUSIVE, - // batch_outputs[1]->region)); - // launcher.add_field(2, FID_DATA); - return runtime->execute_index_space(ctx, launcher); + if (speculative_decoding) { + IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, 
FID_DATA); + + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + + } else { + IndexLauncher launcher(ARG_TOPK_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); + } } InferenceResult @@ -317,9 +382,11 @@ InferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW probs; int batch_size = bc->num_active_tokens(); - ArgTopK::forward_kernel_wrapper(m, input, indices, batch_size); + ArgTopK::forward_kernel_wrapper( + m, input, probs, indices, batch_size, nullptr); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -334,6 +401,39 @@ InferenceResult return ir; } +BeamInferenceResult ArgTopK::inference_speculative_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 3); + assert(task->regions.size() == 3); + BeamSearchBatchConfig const &bc = + Future(task->futures[0]).get_result(); + if (bc.num_active_tokens() == 0) { + // Directly return for empty batch config + BeamInferenceResult ir; + return ir; + } + ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( + DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO( + DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime); + + int batch_size = bc.num_active_tokens(); + ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); + + BeamInferenceResult ir; + download_tensor( + indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); + download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + return ir; +} + void ArgTopK::backward(FFModel const &ff) { // ArgTopK does not support backward assert(false); @@ -345,6 +445,7 @@ void ArgTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->k); sez.serialize(this->sorted); + sez.serialize(this->speculative_decoding); } Node ArgTopK::deserialize(FFModel &ff, @@ -359,12 +460,15 @@ Node ArgTopK::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); int k; bool sorted; + bool speculative_decoding; dez.deserialize(k); dez.deserialize(sorted); + dez.deserialize(speculative_decoding); ArgTopKParams params; params.layer_guid = layer_guid; params.k = k; params.sorted = sorted; + params.speculative_decoding = speculative_decoding; return ff.get_or_create_node(inputs[0], params); } @@ 
-390,6 +494,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.k); hash_combine(key, params.sorted); + hash_combine(key, params.speculative_decoding); return key; } }; // namespace std diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp index 6db8abb8c4..f431d3d4bf 100644 --- a/src/ops/arg_topk.cpp +++ b/src/ops/arg_topk.cpp @@ -263,8 +263,9 @@ __device__ void mergeShards(int num_shards, int k, Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - // T *top_k_values, - int *top_k_indices) { + float *top_k_values, + int *top_k_indices, + bool speculative_decoding) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. // If k > num_shards, we can initialize a min-heap with the top element from @@ -314,7 +315,10 @@ __device__ void mergeShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - // top_k_values[rank] = max_element.value; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + } int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; int next_shard_index = shard_index + num_shards; @@ -338,8 +342,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, int length, int k, bool sorted, - // T *__restrict__ output, - int *__restrict__ indices) { + float *__restrict__ output, + int *__restrict__ indices, + bool speculative_decoding) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; T const *batch_input = input + batch_index * length; @@ -351,15 +356,16 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, __syncthreads(); if (thread_index == 0) { int const offset = batch_index * k; - // auto batch_output = output + offset; + auto batch_output = output + offset; auto batch_indices = indices + offset; Entry *top_k_heap = shared_entries + thread_count * k; mergeShards(thread_count, k, shared_entries, top_k_heap, - // batch_output, - batch_indices); + batch_output, + batch_indices, + speculative_decoding); } } @@ -367,12 +373,13 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, template void ArgTopK::forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, hipStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -391,28 +398,57 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - assert(num_shards >= (size_t)k); - num_shards = k; - hipLaunchKernelGGL(arg_topk_forward_kernel, - num_blocks, - num_shards, - 0, - stream, - input_ptr, - shared_memory_size, - length, - k, - sorted, - // output_ptr, - indices_ptr); + // all requests are in the same beam stages + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + + // check + int beam_size = -1; + for (int i = 1; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (beam_size == -1) { + beam_size = bc->beamRequestsInfo[i].beam_size; + } else { + assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } + } + + assert(num_shards >= (size_t)beam_size); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + beam_size, + sorted, + output_ptr, + indices_ptr, + m->speculative_decoding); + } else { + + assert(num_shards >= (size_t)k); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + indices_ptr, + false); + } } /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &probs, // float *output_ptr, GenericTensorAccessorW const &indices, - int batch_size) { + int batch_size, + BeamSearchBatchConfig const *bc) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); // Domain in1_domain = runtime->get_index_space_domain( @@ -457,21 +493,27 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, ArgTopK::forward_kernel(m, input.get_half_ptr(), // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 575e0183b4..0b8bb8b563 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -262,8 +262,9 @@ __device__ void mergeShards(int num_shards, int k, Entry *__restrict__ entries, Entry *__restrict__ top_k_heap, - // T *top_k_values, - int *top_k_indices) { + float *top_k_values, + int *top_k_indices, + bool speculative_decoding) { // If k < num_shards, we can use a min-heap with k elements to get the top k // of the sorted blocks. 
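// [Editorial aside — illustrative sketch, not part of the patch.] In the
// speculative_decoding branch added above, one kernel launch serves every
// active request in the batch, so the loop over beamRequestsInfo insists that
// all active requests advertise the same beam width before that width is used
// for the launch. A standalone version of that consistency check, with a
// hypothetical RequestBeamInfo struct standing in for BeamSearchBatchConfig:

#include <cassert>
#include <vector>

struct RequestBeamInfo {
  bool completed;
  int beam_size;
};

// Returns the common beam width of all active requests, or -1 if none are active.
int uniform_beam_size(std::vector<RequestBeamInfo> const &requests) {
  int beam_size = -1;
  for (auto const &req : requests) {
    if (req.completed) {
      continue;                           // finished requests are ignored
    }
    if (beam_size == -1) {
      beam_size = req.beam_size;          // first active request fixes the width
    } else {
      assert(beam_size == req.beam_size); // every other request must match it
    }
  }
  return beam_size;
}
// [End editorial aside.]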
// If k > num_shards, we can initialize a min-heap with the top element from @@ -313,7 +314,11 @@ __device__ void mergeShards(int num_shards, int const last_k = k - 1; for (int rank = 0; rank < last_k; rank++) { Entry const &max_element = max_heap.root(); - // top_k_values[rank] = max_element.value; + if (speculative_decoding) { + assert(top_k_values != nullptr); + top_k_values[rank] = static_cast(max_element.value); + } + int shard_index = max_element.index; top_k_indices[rank] = entries[shard_index].index; int next_shard_index = shard_index + num_shards; @@ -337,8 +342,9 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, int length, int k, bool sorted, - // T *__restrict__ output, - int *__restrict__ indices) { + float *__restrict__ output, + int *__restrict__ indices, + bool speculative_decoding) { __shared__ char shared_memory[48 << 10]; int const batch_index = blockIdx.x; T const *batch_input = input + batch_index * length; @@ -350,15 +356,16 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, __syncthreads(); if (thread_index == 0) { int const offset = batch_index * k; - // auto batch_output = output + offset; + auto batch_output = output + offset; auto batch_indices = indices + offset; Entry *top_k_heap = shared_entries + thread_count * k; mergeShards(thread_count, k, shared_entries, top_k_heap, - // batch_output, - batch_indices); + batch_output, + batch_indices, + speculative_decoding); } } @@ -366,12 +373,13 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input, template void ArgTopK::forward_kernel(ArgTopKMeta const *m, DT const *input_ptr, - // float *output_ptr, + float *output_ptr, int *indices_ptr, size_t batch_size, int length, int k, bool sorted, + BeamSearchBatchConfig const *bc, cudaStream_t stream) { // Adopted from TensorFlow's ArgTopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h @@ -390,24 +398,58 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry
); // size_t num_blocks = (batch_size + num_shards - 1) / num_shards; size_t num_blocks = batch_size; - assert(num_shards >= (size_t)k); - num_shards = k; - arg_topk_forward_kernel<<>>( - input_ptr, - shared_memory_size, - length, - k, - sorted, - // output_ptr, - indices_ptr); + + // all requests are in the same beam stages + if (m->speculative_decoding) { + assert(bc->num_active_requests() >= 0); + + // check + int beam_size = -1; + for (int i = 1; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } else if (beam_size == -1) { + beam_size = bc->beamRequestsInfo[i].beam_size; + } else { + assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } + } + + assert(num_shards >= (size_t)beam_size); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + beam_size, + sorted, + output_ptr, + indices_ptr, + m->speculative_decoding); + } else { + + assert(num_shards >= (size_t)k); + num_shards = k; + arg_topk_forward_kernel<<>>( + input_ptr, + shared_memory_size, + length, + k, + sorted, + nullptr, + indices_ptr, + false); + } } /*static*/ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, GenericTensorAccessorR const &input, // float *output_ptr, + GenericTensorAccessorW const &probs, GenericTensorAccessorW const &indices, - int batch_size) { + int batch_size, + BeamSearchBatchConfig const *bc) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -439,6 +481,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; int k = indices.domain.hi()[0] - indices.domain.lo()[0] + 1; /*TODO: This prints to 5*/ + // batch_size = input.domain.get_volume() / length; // assert(indices.domain.get_volume() / k == batch_size); cudaEvent_t t_start, t_end; @@ -451,22 +494,26 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m, if (input.data_type == DT_HALF) { ArgTopK::forward_kernel(m, input.get_half_ptr(), - // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? bc : nullptr, stream); } else if (input.data_type == DT_FLOAT) { ArgTopK::forward_kernel(m, input.get_float_ptr(), - // output_ptr, + m->speculative_decoding ? probs.get_float_ptr() + : nullptr, indices.get_int32_ptr(), batch_size, length, k, m->sorted, + m->speculative_decoding ? 
bc : nullptr, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 76236e65ff..40d5b600be 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -478,6 +478,7 @@ FutureMap Embedding::inference(FFModel const &ff, 0 /*mapper_id*/, machine_view_hash); // regions[0]: input + launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection*/, READ_ONLY, @@ -566,6 +567,74 @@ void Embedding::forward_task(Task const *task, } } +/* + regions[0](I): input + regions[1](O): output + regions[2](I): kernel +*/ +void Embedding::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + EmbeddingMeta *m = *((EmbeddingMeta **)task->local_args); + assert(regions.size() == 3); + assert(task->regions.size() == 3); + // Assert that weight and output must have the same data type + // otherwise, a cast operator should be inserted + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + assert(m->weight_type[0] == m->output_type[0]); + assert(m->input_type[0] == DT_INT32 || m->input_type[0] == DT_INT64); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR kernel = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(input.domain.get_dim() + 1 == output.domain.get_dim()); + for (size_t i = 0; i < input.domain.get_dim(); i++) { + assert(input.domain.hi()[i] == output.domain.hi()[i + 1]); + assert(input.domain.lo()[i] == output.domain.lo()[i + 1]); + } + assert(kernel.domain.hi()[0] - kernel.domain.lo()[0] == + output.domain.hi()[0] - output.domain.lo()[0]); + } else { + // assert(kernel_domain.get_dim() == 2); + assert(input.domain.get_dim() == output.domain.get_dim()); + for (size_t i = 1; i < input.domain.get_dim(); i++) { + assert(input.domain.hi()[i] == output.domain.hi()[i]); + assert(input.domain.lo()[i] == output.domain.lo()[i]); + } + assert(kernel.domain.hi()[0] - kernel.domain.lo()[0] == + output.domain.hi()[0] - output.domain.lo()[0]); + } + + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + effective_batch_size = output.domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == input.domain.get_volume()); + } else { + in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + effective_batch_size = output.domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == input.domain.get_volume()); + } + forward_kernel_wrapper( + m, input, output, kernel, in_dim, out_dim, effective_batch_size); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Embedding::save_inference_tensors_to_file( + m, shard_id, nullptr, {input}, {kernel}, {output}); + } +} + void Embedding::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 
3bfe429ddd..32b524f643 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5929,6 +5929,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ARG_TOPK_INF_SPECULATIVE_TASK_ID, + "ArgTopK Speculative Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ArgTopK Speculative Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // BeamTopk task { TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK Init"); From d7e8d728b67557bebbf9f76de9b806575b8a4cc2 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 2 Jan 2024 13:54:29 -0500 Subject: [PATCH 29/61] fix corner case --- include/flexflow/batch_config.h | 14 ++- include/flexflow/config.h | 3 +- include/flexflow/model.h | 1 + .../inc_multihead_self_attention_utils.cuh | 2 +- .../ops/spec_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + include/flexflow/request_manager.h | 2 + inference/models/falcon.cc | 5 +- inference/models/llama.cc | 5 +- inference/models/mpt.cc | 5 +- inference/models/opt.cc | 5 +- inference/models/starcoder.cc | 5 +- src/ops/arg_topk.cu | 11 ++- src/ops/inc_multihead_self_attention.cu | 4 +- src/ops/spec_inc_multihead_self_attention.cu | 60 +++++++----- src/ops/tree_inc_multihead_self_attention.cu | 62 +++++++------ src/runtime/batch_config.cc | 6 ++ src/runtime/beam_search_batch_config.cc | 4 + src/runtime/model.cc | 14 +++ src/runtime/request_manager.cc | 93 +++++++++++-------- src/runtime/request_manager.cu | 28 +++++- 21 files changed, 225 insertions(+), 106 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 13904aaa46..ef17ef43ed 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -45,6 +45,7 @@ class BatchConfig { int num_active_tokens() const; static int max_requests_per_batch(); static int max_tokens_per_batch(); + static int max_verify_tokens_per_batch(); static int max_sequence_length(); friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc); void print() const; @@ -72,6 +73,7 @@ class BatchConfig { // request id in batch config: int batch_config_request_id; + bool prompt_phase = false; RequestGuid request_guid; }; struct PerTokenInfo { @@ -85,15 +87,15 @@ class BatchConfig { // how many tokens before the tree, every sub requests need this part of // cache - int non_tree_cache_size; + int non_tree_cache_size = 0; // current tree size - int tree_size; + int tree_size = 0; - int this_layer_size; + int this_layer_size = 0; // input length-> prompt/root - int prompt_size; + int prompt_size = 0; }; BitMask causalMask[MAX_NUM_REQUESTS]; @@ -145,9 +147,13 @@ class BeamSearchBatchConfig : public BatchConfig { bool done() const; int max_beam_depth_all_requests() const; int current_depth_all_requests() const; + int get_speculative_request_num() const; size_t beam_width; size_t target_iterations; + + // how many requests is in speculative phase + int speculative_request_num = 0; inline static int const MAX_BEAM_WIDTH = 3; inline static int const MAX_BEAM_DEPTH = 8; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index e1480264cc..17a3f59e29 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -84,7 +84,8 @@ struct FFHandler { 
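// [Editorial aside — illustrative sketch, not part of the patch.] The config.h
// hunk below grows the reserved batch_config_metadata block by
// sizeof(BatchConfig::request_completed); later hunks in
// spec_inc_multihead_self_attention.cu and tree_inc_multihead_self_attention.cu
// recover their request_completed pointers by summing the sizes of everything
// packed in front of that field. A toy sketch of the beam-search layout, with
// made-up stub sizes, shows the producer/consumer convention (both sides must
// agree on the packing order):

#include <cstddef>

// Stub sizes standing in for sizeof(BatchConfig::tokensInfo) and friends.
constexpr size_t kTokensInfoBytes   = 1024;
constexpr size_t kRequestsInfoBytes = 512;
constexpr size_t kBeamTokenBytes    = 256;
constexpr size_t kBeamRequestBytes  = 256;
constexpr size_t kCausalMaskBytes   = 2048;

// The consumer derives each pointer by skipping all fields packed before it.
char *request_completed_ptr(char *metadata_base) {
  return metadata_base + kTokensInfoBytes + kRequestsInfoBytes +
         kBeamTokenBytes + kBeamRequestBytes + kCausalMaskBytes;
}
// [End editorial aside.]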
sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens); + sizeof(TreeVerifyBatchConfig::committed_tokens) + + sizeof(BatchConfig::request_completed); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cf7bb3dd2d..6f805e21bd 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -73,6 +73,7 @@ enum TaskIDs { DROPOUT_BWD_TASK_ID, EMBED_INIT_TASK_ID, EMBED_FWD_TASK_ID, + EMBED_INF_TASK_ID, EMBED_BWD_TASK_ID, GATHER_INIT_TASK_ID, GATHER_FWD_TASK_ID, diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index c128c1a126..d1e0e050b2 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -493,7 +493,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, } // todo fix this - int max_qk_length = max_query_length * max_total_length; + int max_qk_length = max_query_length; // The amount of shared memory needed to store the Q*K^T values in float. size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index a306f7985a..a0d01092bf 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -142,6 +142,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { Realm::RegionInstance beam_search_reserve_inst; BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; + bool *request_completed; BatchConfig::BitMask *causalMask; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index d160da4a72..02df0c0137 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -147,6 +147,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int num_active_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; + bool *request_completed; BatchConfig::BitMask *causalMask; }; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 1c4b0b2a2f..33714c106e 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -103,6 +103,7 @@ class RequestManager { int get_max_requests_per_batch(); void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); + int get_max_verify_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); @@ -113,6 +114,7 @@ class RequestManager { std::string const &path); void register_output_filepath(std::string const &); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); + void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void appendBitMask(BatchConfig::BitMask &bitmask, int newNodes, int preBeamSize, diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 
bfcec847b9..999ca37037 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,7 +39,10 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e9c84efe90..e54d6d8811 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,7 +41,10 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index b074d332ed..3df67b264c 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,7 +40,10 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9b29ae5410..0279f83239 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,7 +42,10 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index ba7b2cb43a..e683376e47 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,7 +48,10 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 0b8bb8b563..3302178728 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -405,13 +405,20 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, // check int beam_size = -1; - for (int i = 1; i < bc->max_requests_per_batch(); i++) { + + // allow last request different with others + int num_activate_requests = bc->num_active_requests(); + int last_request_idx = + bc->requestsInfo[num_activate_requests - 1].batch_config_request_id; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } else if (beam_size == -1) { beam_size = bc->beamRequestsInfo[i].beam_size; - } else { + + } else if (i != last_request_idx) { assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } else if (i == last_request_idx) { } } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index db64868cb9..7c8601d3c8 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1349,7 +1349,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 88dd3f92e4..b31e5d0994 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -50,7 +50,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( int hidden_size, BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask) { + BatchConfig::BitMask *causalMask, + bool *request_completed) { // q, k using Q_vec = typename VEC_K::Type; @@ -86,11 +87,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( // request_infos[batch_config_request_id].first_token_depth_in_request + // request_infos[batch_config_request_id].num_tokens_in_batch; - int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + int const totalCacheSize = + bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1; int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += causalMask[r].this_layer_size; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += request_completed[r] ? 
0 : causalMask[r].this_layer_size; } int const tree_branch_num = @@ -138,7 +140,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( ii * THREADS_PER_KEY * K_VEC_SIZE); } - int const query_token = bitmask.tree_size - tree_branch_num + qi; + int const query_token = + bitmask.prompt_size + bitmask.tree_size - 1 - tree_branch_num + qi; __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { @@ -163,8 +166,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << query_token)))); - // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { - // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); + // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) { + // printf("spec inc attn qkqkqk request id %d, %.10f, %d\n", + // batch_config_request_id, + // ti, + // qk, + // qi); // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; @@ -336,17 +343,12 @@ __global__ void spec_inc_store_kv_cache( BatchConfig::BitMask bitmask = causalMask[req_id]; - // int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; - - // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - - // tree_branch_num + sub_req_id + tok_id; - // bitmask.tree_size - tree_branch_num + sub_req_id; - // if prompt token -> token id // if tree token: - int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - - bitmask.this_layer_size + token_idx - - request_token_offset; + + int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size + + bitmask.tree_size - 1 - bitmask.this_layer_size + + token_idx - request_token_offset; kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + offset] = kVal; @@ -411,7 +413,8 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->hidden_size, \ m->request_infos, \ m->beam_request_infos, \ - m->causalMask) + m->causalMask, \ + m->request_completed) template void compute_spec_inc_attention_kernel_generation( @@ -420,7 +423,8 @@ void compute_spec_inc_attention_kernel_generation( DT *output_ptr, cudaStream_t stream) { // one block == one head per request - dim3 grid(m->num_q_heads, bc->num_active_requests()); + // how many generation requests + dim3 grid(m->num_q_heads, bc->get_speculative_request_num()); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; size_t smem_sz; @@ -499,11 +503,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; + } else if (tokens_previous_requests < bc->num_generation_tokens) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; } - // else if (tokens_previous_requests < bc->num_generation_tokens) { - // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - // continue; - // } // all requests in prompt phase should only have one sub requests; assert(bc->sub_requests[i] == 1); @@ -659,10 +662,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests - // print_tensor((float*)C_softmax, 32, "C_softmax"); + int token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; + (token_offset)*m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -860,6 +863,13 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + + request_completed = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + + sizeof(BatchConfig::causalMask)); } cudaStreamSynchronize(stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index b4af80976f..fc86e1498e 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -54,6 +54,7 @@ __global__ void compute_attention_kernel_fused_kernel( int num_heads, int num_requests, BatchConfig::BitMask *causalMask, + bool *request_completed, int qk_smem_sz) { // q, k @@ -90,13 +91,14 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[r].num_tokens_in_batch; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += + request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; } - // if(tidx == 0 && head_idx == 0){ - // printf("tree req: %d, %d\n", request_idx, first_token_idx); - // } + bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; + int q_start = + request_infos[batch_config_request_id].first_token_depth_in_request; // shared memory objects extern __shared__ char smem_[]; @@ -139,7 +141,7 @@ __global__ void compute_attention_kernel_fused_kernel( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); - // if (head_idx == 0 && qi == 1 && tidx == 0) { + // if (head_idx == 0 && request_idx == 1 && tidx == 0) { // printf("laod q %d, %d %.10f\n", // request_idx, // qi,q_vecs[ki_o][ii].x); @@ -163,19 +165,23 @@ __global__ void compute_attention_kernel_fused_kernel( if (ti < tlength && tidx % THREADS_PER_KEY == 0) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase ? (qi + q_start < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); qk_max = mask ? qk_max : fmaxf(qk_max, qk); - // if (head_idx == 0 && qi == 0 && !mask) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n - // ", + // if (head_idx == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, + // %.10f, %d\n", // request_idx, + // qi, // ti, // qk, // q_vecs[ki_o][0].x, - // k[0].x); + // k[0].x, + // bitmask.non_tree_cache_size); // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } @@ -217,8 +223,10 @@ __global__ void compute_attention_kernel_fused_kernel( float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase ? 
From a45826e9daa0364b49f353c1c85cf2a9800bc1d9 Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Tue, 2 Jan 2024 15:28:52 -0500 Subject: [PATCH 30/61] SpecInfer fix corner case (#1258) * init * fix speculative * fix speculative * bitmap+tree verify * fix.
* fix * multi batch * copy metadata once * fix some corner cases * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * more fix. * clean up * . * load batchconfig * clean * hip * hip * embedding return when no token * use arg topk instead of beam topk * embedding * fmt * hip * fix corner case --------- Co-authored-by: Zhihao Jia --- include/flexflow/batch_config.h | 14 ++- include/flexflow/config.h | 3 +- include/flexflow/model.h | 1 + .../inc_multihead_self_attention_utils.cuh | 2 +- .../ops/spec_inc_multihead_self_attention.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 1 + include/flexflow/request_manager.h | 2 + inference/models/falcon.cc | 5 +- inference/models/llama.cc | 5 +- inference/models/mpt.cc | 5 +- inference/models/opt.cc | 5 +- inference/models/starcoder.cc | 5 +- src/ops/arg_topk.cu | 11 ++- src/ops/inc_multihead_self_attention.cu | 4 +- src/ops/spec_inc_multihead_self_attention.cu | 60 +++++++----- src/ops/tree_inc_multihead_self_attention.cu | 62 +++++++------ src/runtime/batch_config.cc | 6 ++ src/runtime/beam_search_batch_config.cc | 4 + src/runtime/model.cc | 14 +++ src/runtime/request_manager.cc | 93 +++++++++++-------- src/runtime/request_manager.cu | 28 +++++- 21 files changed, 224 insertions(+), 107 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 13904aaa46..ef17ef43ed 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -45,6 +45,7 @@ class BatchConfig { int num_active_tokens() const; static int max_requests_per_batch(); static int max_tokens_per_batch(); + static int max_verify_tokens_per_batch(); static int max_sequence_length(); friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc); void print() const; @@ -72,6 +73,7 @@ class BatchConfig { // request id in batch config: int batch_config_request_id; + bool prompt_phase = false; RequestGuid request_guid; }; struct PerTokenInfo { @@ -85,15 +87,15 @@ class BatchConfig { // how many tokens before the tree, every sub requests need this part of // cache - int non_tree_cache_size; + int non_tree_cache_size = 0; // current tree size - int tree_size; + int tree_size = 0; - int this_layer_size; + int this_layer_size = 0; // input length-> prompt/root - int prompt_size; + int prompt_size = 0; }; BitMask causalMask[MAX_NUM_REQUESTS]; @@ -145,9 +147,13 @@ class BeamSearchBatchConfig : public BatchConfig { bool done() const; int max_beam_depth_all_requests() const; int current_depth_all_requests() const; + int get_speculative_request_num() const; size_t beam_width; size_t target_iterations; + + // how many requests is in speculative phase + int speculative_request_num = 0; inline static int const MAX_BEAM_WIDTH = 3; inline static int const MAX_BEAM_DEPTH = 8; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index e1480264cc..17a3f59e29 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -84,7 +84,8 @@ struct FFHandler { sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens); + sizeof(TreeVerifyBatchConfig::committed_tokens) + + sizeof(BatchConfig::request_completed); void *offload_reserve_space; size_t offload_reserve_space_size; DataType quantization_type; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cf7bb3dd2d..6f805e21bd 100644 --- 
a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -73,6 +73,7 @@ enum TaskIDs { DROPOUT_BWD_TASK_ID, EMBED_INIT_TASK_ID, EMBED_FWD_TASK_ID, + EMBED_INF_TASK_ID, EMBED_BWD_TASK_ID, GATHER_INIT_TASK_ID, GATHER_FWD_TASK_ID, diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index c128c1a126..d1e0e050b2 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -493,7 +493,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head, } // todo fix this - int max_qk_length = max_query_length * max_total_length; + int max_qk_length = max_query_length; // The amount of shared memory needed to store the Q*K^T values in float. size_t qk_sz = div_up(max_qk_length + 1, 4) * 16; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index a306f7985a..a0d01092bf 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -142,6 +142,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { Realm::RegionInstance beam_search_reserve_inst; BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos; BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos; + bool *request_completed; BatchConfig::BitMask *causalMask; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index d160da4a72..02df0c0137 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -147,6 +147,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { int num_active_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; + bool *request_completed; BatchConfig::BitMask *causalMask; }; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 1c4b0b2a2f..33714c106e 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -103,6 +103,7 @@ class RequestManager { int get_max_requests_per_batch(); void set_max_tokens_per_batch(int max_num_tokens); int get_max_tokens_per_batch(); + int get_max_verify_tokens_per_batch(); void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); @@ -113,6 +114,7 @@ class RequestManager { std::string const &path); void register_output_filepath(std::string const &); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); + void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void appendBitMask(BatchConfig::BitMask &bitmask, int newNodes, int preBeamSize, diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index bfcec847b9..999ca37037 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,7 +39,10 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e9c84efe90..e54d6d8811 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,7 +41,10 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index b074d332ed..3df67b264c 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,7 +40,10 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9b29ae5410..0279f83239 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,7 +42,10 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index ba7b2cb43a..e683376e47 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,7 +48,10 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {BatchConfig::max_tokens_per_batch(), 1}; + int const token_dims[] = {mode == TREE_VERIFY_MODE + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu index 0b8bb8b563..5b7978812c 100644 --- a/src/ops/arg_topk.cu +++ b/src/ops/arg_topk.cu @@ -404,17 +404,22 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m, assert(bc->num_active_requests() >= 0); // check + // allow last request different with others int beam_size = -1; - for (int i = 1; i < bc->max_requests_per_batch(); i++) { + int num_activate_requests = bc->num_active_requests(); + int last_request_idx = + bc->requestsInfo[num_activate_requests - 1].batch_config_request_id; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } else if (beam_size == -1) { beam_size = bc->beamRequestsInfo[i].beam_size; - } else { + + } else if (i != last_request_idx) { assert(beam_size == bc->beamRequestsInfo[i].beam_size); + } else if (i == last_request_idx) { } } - assert(num_shards >= (size_t)beam_size); num_shards = k; arg_topk_forward_kernel<<>>( diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index db64868cb9..7c8601d3c8 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1349,7 +1349,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 88dd3f92e4..b31e5d0994 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -50,7 +50,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( int hidden_size, BatchConfig::PerRequestInfo *request_infos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, - BatchConfig::BitMask *causalMask) { + BatchConfig::BitMask *causalMask, + bool *request_completed) { // q, k using Q_vec = typename VEC_K::Type; @@ -86,11 +87,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( // request_infos[batch_config_request_id].first_token_depth_in_request + // request_infos[batch_config_request_id].num_tokens_in_batch; - int const totalCacheSize = bitmask.non_tree_cache_size + bitmask.tree_size; + int const totalCacheSize = + bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1; int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += causalMask[r].this_layer_size; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += request_completed[r] ? 
0 : causalMask[r].this_layer_size; } int const tree_branch_num = @@ -138,7 +140,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( ii * THREADS_PER_KEY * K_VEC_SIZE); } - int const query_token = bitmask.tree_size - tree_branch_num + qi; + int const query_token = + bitmask.prompt_size + bitmask.tree_size - 1 - tree_branch_num + qi; __syncthreads(); for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { @@ -163,8 +166,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel( (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << query_token)))); - // if (blockIdx.y == 0 && blockIdx.x == 0 && !mask) { - // printf("spec inc attn qkqkqk %d, %.10f, %d\n", ti, qk, qi); + // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) { + // printf("spec inc attn qkqkqk request id %d, %.10f, %d\n", + // batch_config_request_id, + // ti, + // qk, + // qi); // } qk_max = mask ? qk_max : fmaxf(qk_max, qk); qk_smem[ti - first_step] = mask ? 0.f : qk; @@ -336,17 +343,12 @@ __global__ void spec_inc_store_kv_cache( BatchConfig::BitMask bitmask = causalMask[req_id]; - // int const tree_branch_num = beamRequestInfos[req_id].sub_request_num; - - // int const query_token = bitmask.non_tree_cache_size + bitmask.tree_size - - // tree_branch_num + sub_req_id + tok_id; - // bitmask.tree_size - tree_branch_num + sub_req_id; - // if prompt token -> token id // if tree token: - int const cache_idx = bitmask.non_tree_cache_size + bitmask.tree_size - - bitmask.this_layer_size + token_idx - - request_token_offset; + + int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size + + bitmask.tree_size - 1 - bitmask.this_layer_size + + token_idx - request_token_offset; kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + offset] = kVal; @@ -411,7 +413,8 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->hidden_size, \ m->request_infos, \ m->beam_request_infos, \ - m->causalMask) + m->causalMask, \ + m->request_completed) template void compute_spec_inc_attention_kernel_generation( @@ -420,7 +423,8 @@ void compute_spec_inc_attention_kernel_generation( DT *output_ptr, cudaStream_t stream) { // one block == one head per request - dim3 grid(m->num_q_heads, bc->num_active_requests()); + // how many generation requests + dim3 grid(m->num_q_heads, bc->get_speculative_request_num()); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; size_t smem_sz; @@ -499,11 +503,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; + } else if (tokens_previous_requests < bc->num_generation_tokens) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; } - // else if (tokens_previous_requests < bc->num_generation_tokens) { - // tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; - // continue; - // } // all requests in prompt phase should only have one sub requests; assert(bc->sub_requests[i] == 1); @@ -659,10 +662,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests - // print_tensor((float*)C_softmax, 32, "C_softmax"); + int token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * - m->num_q_heads * m->vProjSize; + (token_offset)*m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -860,6 +863,13 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BeamSearchBatchConfig::beamTokenInfo) + sizeof(BeamSearchBatchConfig::beamRequestsInfo)); + + request_completed = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BeamSearchBatchConfig::beamTokenInfo) + + sizeof(BeamSearchBatchConfig::beamRequestsInfo) + + sizeof(BatchConfig::causalMask)); } cudaStreamSynchronize(stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index b4af80976f..fc86e1498e 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -54,6 +54,7 @@ __global__ void compute_attention_kernel_fused_kernel( int num_heads, int num_requests, BatchConfig::BitMask *causalMask, + bool *request_completed, int qk_smem_sz) { // q, k @@ -90,13 +91,14 @@ __global__ void compute_attention_kernel_fused_kernel( BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; int first_token_idx = 0; - for (int r = 0; r < request_idx; r++) { - first_token_idx += request_infos[r].num_tokens_in_batch; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += + request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; } - // if(tidx == 0 && head_idx == 0){ - // printf("tree req: %d, %d\n", request_idx, first_token_idx); - // } + bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; + int q_start = + request_infos[batch_config_request_id].first_token_depth_in_request; // shared memory objects extern __shared__ char smem_[]; @@ -139,7 +141,7 @@ __global__ void compute_attention_kernel_fused_kernel( q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); - // if (head_idx == 0 && qi == 1 && tidx == 0) { + // if (head_idx == 0 && request_idx == 1 && tidx == 0) { // printf("laod q %d, %d %.10f\n", // request_idx, // qi,q_vecs[ki_o][ii].x); @@ -163,19 +165,23 @@ __global__ void compute_attention_kernel_fused_kernel( if (ti < tlength && tidx % THREADS_PER_KEY == 0) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase ? (qi + q_start < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); qk_max = mask ? qk_max : fmaxf(qk_max, qk); - // if (head_idx == 0 && qi == 0 && !mask) { - // printf("tree attn qkqkqkqk request id %d, %d %.10f, %.10f, %.10f\n - // ", + // if (head_idx == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, + // %.10f, %d\n", // request_idx, + // qi, // ti, // qk, // q_vecs[ki_o][0].x, - // k[0].x); + // k[0].x, + // bitmask.non_tree_cache_size); // } qk_smem[ti - first_step] = mask ? 0.0f : qk; } @@ -217,8 +223,10 @@ __global__ void compute_attention_kernel_fused_kernel( float exp_sum = 0.f; for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase ? 
(q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); exp_sum += logit; qk_smem[ti - first_step] = mask ? 0.0f : logit; @@ -265,8 +273,11 @@ __global__ void compute_attention_kernel_fused_kernel( if (ti < tlength) { bool const mask = - (ti >= bitmask.non_tree_cache_size && - (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & (1 << qi)))); + prompt_phase + ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); float logit = mask ? 0.0f : qk_smem[ti - first_step]; out = FlexFlow::fma(logit, cast_to_float(v), out); } @@ -810,6 +821,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, \ bc->num_active_requests(), \ m->causalMask, \ + m->request_completed, \ smem_sz[0]) template @@ -841,7 +853,6 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, dim3 grid(m->num_q_heads, bc->num_active_requests()); int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - // 0->qk production size, 1->total shared size int smem_sz[2]; if (per_head_size == 64) { @@ -890,17 +901,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << // "\n"; - cudaMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(m->causalMask, - &(bc->causalMask), - bc->num_active_requests() * sizeof(BatchConfig::BitMask), - cudaMemcpyHostToDevice, - stream); commit_tokens
(m, bc, stream); // After commit we update m->num_active_tokens to be the number of active @@ -1068,6 +1068,12 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + sizeof(BatchConfig::causalMask)); + + request_completed = reinterpret_cast( + reinterpret_cast(handler.batch_config_metadata) + + sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + + sizeof(BatchConfig::causalMask) + + sizeof(TreeVerifyBatchConfig::committed_tokens)); } cudaStreamSynchronize(stream); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index d2fbc0883f..c432208eca 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -84,6 +84,12 @@ int BatchConfig::max_tokens_per_batch() { return RequestManager::get_request_manager()->get_max_tokens_per_batch(); } +/*static*/ +int BatchConfig::max_verify_tokens_per_batch() { + return RequestManager::get_request_manager() + ->get_max_verify_tokens_per_batch(); +} + /*static*/ int BatchConfig::max_sequence_length() { return RequestManager::get_request_manager()->get_max_sequence_length(); diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 74843e9460..ff7bf1a819 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -85,6 +85,10 @@ int BeamSearchBatchConfig::max_beam_depth_all_requests() const { return max_depth_all_requests; } +int BeamSearchBatchConfig::get_speculative_request_num() const { + return speculative_request_num; +} + int BeamSearchBatchConfig::current_depth_all_requests() const { int current_depth = 0; for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 32b524f643..76bed36bda 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4805,6 +4805,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(EMBED_INF_TASK_ID, "Embedding Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Embedding Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(EMBED_BWD_TASK_ID, "Embedding Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 89d4ddaed4..88754f5a82 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -97,6 +97,12 @@ int RequestManager::get_max_tokens_per_batch() { return max_tokens_per_batch; } +int RequestManager::get_max_verify_tokens_per_batch() { + assert(max_tokens_per_batch > 0); + return max_tokens_per_batch + + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM * max_requests_per_batch; +} + void RequestManager::set_max_sequence_length(int max_seq_length) { assert(max_sequence_length == -1 || max_sequence_length == max_seq_length); max_sequence_length = max_seq_length; @@ -1126,7 +1132,6 @@ BeamSearchBatchConfig old_bc.beamRequestsInfo[i].sub_request_num, tree, old_bc.beamRequestsInfo[i].current_depth); - // assert(false); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = 
new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) { @@ -1146,6 +1151,9 @@ BeamSearchBatchConfig } } + // how many requests is in speculative phase + new_bc.speculative_request_num = num_active_req + 1; + // Add prompt tokens to the batch for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i] || old_bc.request_running[i]) { @@ -1184,13 +1192,14 @@ BeamSearchBatchConfig spec_infer_tree_width.size() > ssm_decoding_steps ? spec_infer_tree_width[ssm_decoding_steps] : 1; - printf("beam size: %d, %d\n", - new_bc.beamRequestsInfo[i].beam_size, - ssm_decoding_steps); + // printf("beam size: %d, %d\n", + // new_bc.beamRequestsInfo[i].beam_size, + // ssm_decoding_steps); new_bc.beamRequestsInfo[i].max_depth = old_bc.beamRequestsInfo[i].max_depth; - new_bc.sub_requests[i] = - old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + // new_bc.sub_requests[i] = + // old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size; + new_bc.sub_requests[i] = 1; new_bc.beamRequestsInfo[i].sub_request_num = old_bc.beamRequestsInfo[i].sub_request_num; @@ -1218,6 +1227,9 @@ BeamSearchBatchConfig request.tokens.size()) { // request is done new_bc.requestsInfo[i].num_tokens_in_batch = 0; + new_bc.causalMask[i].this_layer_size = 0; + new_bc.beamRequestsInfo[i].sub_request_num = 0; + new_bc.beamRequestsInfo[i].beam_size = 1; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = @@ -1227,12 +1239,8 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; BeamTree tree = request.beam_trees[old_bc.model_id]; - appendBitMask(new_bc.causalMask[i], - new_bc.beamRequestsInfo[i].sub_request_num, - old_bc.beamRequestsInfo[i].beam_size, - old_bc.beamRequestsInfo[i].sub_request_num, - tree, - old_bc.beamRequestsInfo[i].current_depth); + appendPendingRequest(new_bc.causalMask[i], + new_bc.requestsInfo[i].num_tokens_in_batch); } if (verbose) { @@ -1258,11 +1266,11 @@ BeamSearchBatchConfig // get value from requestinfo new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[request.tokens.size() - 1]; + request.tokens[request.tokens.size() - + new_bc.requestsInfo[i].num_tokens_in_batch + j]; new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k; new_bc.num_tokens++; - num_generation_tokens++; } } } @@ -1319,7 +1327,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens_to_commit = 0; new_bc.num_tokens = 0; - int max_prompt_load_size = get_max_tokens_per_batch(); + int max_prompt_load_size = get_max_verify_tokens_per_batch(); for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) { if (old_batches.at(0).request_completed[i]) { continue; @@ -1427,7 +1435,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens > get_max_tokens_per_batch()) { + if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1453,7 +1461,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - if (new_bc.num_tokens == get_max_tokens_per_batch() && + if (new_bc.num_tokens == get_max_verify_tokens_per_batch() && (j != dfs_tree_inputs.size() - 1)) { cutLayer = true; break; @@ -1542,7 +1550,7 
@@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; } - if (new_bc.num_tokens > get_max_tokens_per_batch()) { + if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) { assert(false && "Exceeding the space available in the TreeVerify batch"); break; @@ -1555,15 +1563,17 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( request.status = Request::RUNNING; new_bc.request_running[i] = true; - std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << + // std::endl; + new_bc.requestsInfo[i].prompt_phase = true; dfs_tree_inputs[guid] = std::vector>{std::make_pair( request.tokens.back(), request.tokens.size() - 1)}; } } else { // launch the request into running phase after loading all prompt - if (get_max_tokens_per_batch() - new_bc.num_tokens > 0) { + if (get_max_verify_tokens_per_batch() - new_bc.num_tokens > 0) { // std::cout << "Initialization running phase: " // << new_bc.requestsInfo[i].num_tokens_in_batch << "\n"; request.status = Request::RUNNING; @@ -1576,9 +1586,11 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; - std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: " - << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl; + // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch2: " + // << new_bc.requestsInfo[i].num_tokens_in_batch << + // std::endl; + new_bc.requestsInfo[i].prompt_phase = true; dfs_tree_inputs[guid] = std::vector>{std::make_pair( request.tokens.back(), request.tokens.size() - 1)}; @@ -1760,20 +1772,14 @@ void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc, // prompt phase, init task void RequestManager::initBitMask(BatchConfig::BitMask &bitmask, int initLength) { - assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM && - "do not support tree size > 64"); + assert(initLength > 0); // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 bitmask.non_tree_cache_size = 0; - bitmask.tree_size = initLength; + bitmask.tree_size = 1; bitmask.prompt_size = initLength; bitmask.this_layer_size = initLength; - for (int i = 0; i < bitmask.prompt_size; i++) { - for (int j = i; j < bitmask.prompt_size; j++) { - bitmask.mask[i] |= (1 << j); - } - } // std::cout << "see bit mask" << bitmask.prompt_size << "\n"; // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n"; // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n"; @@ -1810,6 +1816,25 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, // << "\n"; } +// prompt phase, init task +void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, + int initLength) { + assert(initLength > 0); + std::cout << "append pending bit mask: " << initLength << "\n"; + // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: + // 0000000..1000 + bitmask.non_tree_cache_size = 0; + bitmask.tree_size = 1; + bitmask.prompt_size += initLength; + bitmask.this_layer_size = initLength; + + // for (int i = 0; i < bitmask.prompt_size; i++) { + // for (int j = i; j < bitmask.prompt_size; j++) { + // bitmask.mask[i] |= (1 << j); + // } + // } +} + // prepare next beam, append layers to the tree void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, int newNodes, @@ -1862,12 +1887,6 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask, } } - // std::cout << "token idx: " << token_idx << ", " << pre_tree_size << ", " - // << new_nodes_start_idx << ", " << newNodes - // << "current depth: " << currentDepth << "\n"; - // std::cout << "new nodes end " << new_nodes_start_idx << "\n"; - - // std::cout << "tree size: " << bitmask.tree_size << "\n"; assert(token_idx == pre_tree_size); assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size); diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 51c52c3026..8380d6be73 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -35,10 +35,17 @@ void RequestManager::load_tokens_task( // Extreme long prompts are not supported, only load up to // BatchConfig::max_tokens_per_batch() as prompt - if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch()) { + if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() && + batch_config->get_mode() == INC_DECODING_MODE) { printf("Warning: too many tokens in prompt, only load up to %d tokens\n", BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); + } else if (batch_config->num_tokens > + BatchConfig::max_verify_tokens_per_batch()) { + printf("Warning: Speculative decoding. 
too many tokens in prompt, only " + "load up to %d tokens\n", + BatchConfig::max_verify_tokens_per_batch()); + printf("Got: %d tokens\n", batch_config->num_tokens); } for (int i = 0; i < batch_config->num_tokens; i++) { @@ -117,8 +124,16 @@ void RequestManager::load_batch_config_task( sizeof(BatchConfig::causalMask), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::request_completed); } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); @@ -137,6 +152,15 @@ void RequestManager::load_batch_config_task( cudaMemcpyHostToDevice, stream)); total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); + + checkCUDA(cudaMemcpyAsync( + static_cast(handle.batch_config_metadata) + total_copy_size, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + + total_copy_size += sizeof(BatchConfig::request_completed); } // add a size check From 8490e50d5744b6731df9fdc4147b2a6ebd4f2d71 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Tue, 2 Jan 2024 16:20:24 -0500 Subject: [PATCH 31/61] fix --- src/runtime/request_manager.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 88754f5a82..a285932b7f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1188,10 +1188,7 @@ BeamSearchBatchConfig int ssm_decoding_steps = profiling_requests[request.guid].ssm_decoding_steps; - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? spec_infer_tree_width[ssm_decoding_steps] - : 1; + new_bc.beamRequestsInfo[i].beam_size = 1; // printf("beam size: %d, %d\n", // new_bc.beamRequestsInfo[i].beam_size, // ssm_decoding_steps); @@ -1820,7 +1817,7 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength) { assert(initLength > 0); - std::cout << "append pending bit mask: " << initLength << "\n"; + // std::cout << "append pending bit mask: " << initLength << "\n"; // eg. 
4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 bitmask.non_tree_cache_size = 0; From c12f0c6ddaea6629214278167b047ffa3158b491 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 3 Jan 2024 00:28:15 -0500 Subject: [PATCH 32/61] fix request id issue --- src/ops/inc_multihead_self_attention.cu | 42 +++++--------------- src/ops/spec_inc_multihead_self_attention.cu | 8 ++-- src/runtime/request_manager.cc | 6 +++ 3 files changed, 20 insertions(+), 36 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 7c8601d3c8..42933cee27 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -52,9 +52,7 @@ __global__ void compute_attention_kernel_generation_kernel( int max_seq_length, int per_head_size, int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - bool is_beam, - int max_beam_width) { + BatchConfig::PerRequestInfo *request_infos) { // q, k using Q_vec = typename VEC_K::Type; @@ -85,10 +83,6 @@ __global__ void compute_attention_kernel_generation_kernel( int const batch_config_request_id = request_infos[request_idx].batch_config_request_id; - int const beam_request_idx = - is_beam ? request_idx / max_beam_width : request_idx; - int const beam_sub_request_idx = is_beam ? request_idx % max_beam_width : 0; - int const first_step = 0; int const tlength = @@ -106,8 +100,7 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + - batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = @@ -142,10 +135,7 @@ __global__ void compute_attention_kernel_generation_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + - (batch_config_request_id * max_beam_width + beam_sub_request_idx) * - max_seq_length * hidden_size + - ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -248,10 +238,7 @@ __global__ void compute_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + - (batch_config_request_id * max_beam_width + beam_sub_request_idx) * - max_seq_length * hidden_size + - vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { @@ -297,7 +284,7 @@ __global__ void compute_attention_kernel_generation_kernel( // Output the final values. 
if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { convert_from_float( - *reinterpret_cast(output_ptr + beam_request_idx * hidden_size + + *reinterpret_cast(output_ptr + request_idx * hidden_size + head_idx * per_head_size + vi), out); } @@ -727,9 +714,7 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, BatchConfig::max_sequence_length(), \ m->qProjSize, \ m->hidden_size, \ - m->request_infos, \ - false, \ - 0) + m->request_infos) template void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, @@ -944,14 +929,9 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { continue; } - assert(tokens_previous_requests == - bc->requestsInfo[i].first_token_offset_in_batch); int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; @@ -978,8 +958,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] // To get query projection, skip over Q entries from previous requests DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; // matrix B: key cache // matrix B's layout: [kProjSize * num_heads, total_tokens] // To get B, skip over K entries from previous requests (all heads + @@ -1117,7 +1097,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // requests // store the result attn heads, also skip the genration tokens DT *C = static_cast
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * + (bc->requestsInfo[i].first_token_offset_in_batch) * m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, @@ -1145,7 +1125,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, } tokens_previous_requests += num_new_tokens; } - assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } /*static*/ diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index b31e5d0994..a63417de51 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -501,10 +501,8 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase) || + (bc->requestsInfo[i].num_tokens_in_batch == 0)) { continue; } @@ -694,7 +692,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_prev_requests_squares += num_new_tokens * total_tokens; } - // assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } template diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a285932b7f..c867d2a979 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -468,12 +468,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; num_generation_tokens++; + new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].prompt_phase = true; } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; @@ -509,6 +511,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request @@ -755,6 +758,7 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; profiling_requests[request.guid].ssm_decoding_steps = 0; + new_bc.requestsInfo[i].prompt_phase = true; int ssm_decoding_steps = 0; new_bc.beamRequestsInfo[i].beam_size = @@ -902,6 +906,7 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].prompt_phase = true; new_bc.beamRequestsInfo[i].sub_request_num = 1; printf("sub request num == 1, %d \n", @@ -1220,6 +1225,7 @@ BeamSearchBatchConfig &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); + new_bc.requestsInfo[i].prompt_phase = true; if (new_bc.requestsInfo[i].first_token_depth_in_request >= request.tokens.size()) { // request is done From 284ad772692e8b5f0c012de7d2493d95f3380428 
Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Wed, 3 Jan 2024 03:08:04 -0500 Subject: [PATCH 33/61] Fix Request Id order issue (#1260) * init * fix speculative * fix speculative * bitmap+tree verify * fix. * fix * multi batch * copy metadata once * fix some corner cases * Replicate load_token tasks so that it can be fused with other compute tasks; this eliminates Replicate and enables a larger fused op * more fix. * clean up * . * load batchconfig * clean * hip * hip * embedding return when no token * use arg topk instead of beam topk * embedding * fmt * hip * fix corner case * fix * fix request id issue --------- Co-authored-by: Zhihao Jia --- src/ops/inc_multihead_self_attention.cu | 42 +++++--------------- src/ops/spec_inc_multihead_self_attention.cu | 5 ++- src/runtime/request_manager.cc | 13 +++--- 3 files changed, 22 insertions(+), 38 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 7c8601d3c8..42933cee27 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -52,9 +52,7 @@ __global__ void compute_attention_kernel_generation_kernel( int max_seq_length, int per_head_size, int hidden_size, - BatchConfig::PerRequestInfo *request_infos, - bool is_beam, - int max_beam_width) { + BatchConfig::PerRequestInfo *request_infos) { // q, k using Q_vec = typename VEC_K::Type; @@ -85,10 +83,6 @@ __global__ void compute_attention_kernel_generation_kernel( int const batch_config_request_id = request_infos[request_idx].batch_config_request_id; - int const beam_request_idx = - is_beam ? request_idx / max_beam_width : request_idx; - int const beam_sub_request_idx = is_beam ? request_idx % max_beam_width : 0; - int const first_step = 0; int const tlength = @@ -106,8 +100,7 @@ __global__ void compute_attention_kernel_generation_kernel( // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - const DT *q_ptr = query + - batch_config_request_id * hidden_size * QKV_WEIGHT_NUM + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + head_idx * per_head_size; __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; // DT const *q_ptr = @@ -142,10 +135,7 @@ __global__ void compute_attention_kernel_generation_kernel( constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; DT const *k_cache_batch = - key_cache + - (batch_config_request_id * max_beam_width + beam_sub_request_idx) * - max_seq_length * hidden_size + - ki; + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; @@ -248,10 +238,7 @@ __global__ void compute_attention_kernel_generation_kernel( // The base pointer for the value in the cache buffer. DT const *v_cache_batch = - value_cache + - (batch_config_request_id * max_beam_width + beam_sub_request_idx) * - max_seq_length * hidden_size + - vi; + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; if (Dh == Dh_MAX || vi < Dh) { for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { @@ -297,7 +284,7 @@ __global__ void compute_attention_kernel_generation_kernel( // Output the final values. 
if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { convert_from_float( - *reinterpret_cast(output_ptr + beam_request_idx * hidden_size + + *reinterpret_cast(output_ptr + request_idx * hidden_size + head_idx * per_head_size + vi), out); } @@ -727,9 +714,7 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, BatchConfig::max_sequence_length(), \ m->qProjSize, \ m->hidden_size, \ - m->request_infos, \ - false, \ - 0) + m->request_infos) template void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, @@ -944,14 +929,9 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } else if (tokens_previous_requests < bc->num_generation_tokens) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { continue; } - assert(tokens_previous_requests == - bc->requestsInfo[i].first_token_offset_in_batch); int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; @@ -978,8 +958,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] // To get query projection, skip over Q entries from previous requests DT const *A = static_cast
<DT *>(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; // matrix B: key cache // matrix B's layout: [kProjSize * num_heads, total_tokens] // To get B, skip over K entries from previous requests (all heads + // padding) @@ -1117,7 +1097,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // requests // store the result attn heads, also skip the genration tokens DT *C = static_cast<DT *>
(m->attn_heads) + - (tokens_previous_requests + bc->num_generation_tokens) * + (bc->requestsInfo[i].first_token_offset_in_batch) * m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, @@ -1145,7 +1125,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, } tokens_previous_requests += num_new_tokens; } - assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } /*static*/ diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index b31e5d0994..2d80ed2221 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -501,7 +501,8 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase) || + (bc->requestsInfo[i].num_tokens_in_batch == 0)) { continue; } else if (tokens_previous_requests < bc->num_generation_tokens) { tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; @@ -694,7 +695,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, tokens_prev_requests_squares += num_new_tokens * total_tokens; } - // assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } template diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 88754f5a82..c867d2a979 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -468,12 +468,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; num_generation_tokens++; + new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].prompt_phase = true; } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; @@ -509,6 +511,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request @@ -755,6 +758,7 @@ BeamSearchBatchConfig new_bc.beamRequestsInfo[i].current_depth = 1; profiling_requests[request.guid].ssm_decoding_steps = 0; + new_bc.requestsInfo[i].prompt_phase = true; int ssm_decoding_steps = 0; new_bc.beamRequestsInfo[i].beam_size = @@ -902,6 +906,7 @@ BeamSearchBatchConfig } new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].prompt_phase = true; new_bc.beamRequestsInfo[i].sub_request_num = 1; printf("sub request num == 1, %d \n", @@ -1188,10 +1193,7 @@ BeamSearchBatchConfig int ssm_decoding_steps = profiling_requests[request.guid].ssm_decoding_steps; - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? 
spec_infer_tree_width[ssm_decoding_steps] - : 1; + new_bc.beamRequestsInfo[i].beam_size = 1; // printf("beam size: %d, %d\n", // new_bc.beamRequestsInfo[i].beam_size, // ssm_decoding_steps); @@ -1223,6 +1225,7 @@ BeamSearchBatchConfig &old_bc.causalMask[i], sizeof(BatchConfig::BitMask)); + new_bc.requestsInfo[i].prompt_phase = true; if (new_bc.requestsInfo[i].first_token_depth_in_request >= request.tokens.size()) { // request is done @@ -1820,7 +1823,7 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength) { assert(initLength > 0); - std::cout << "append pending bit mask: " << initLength << "\n"; + // std::cout << "append pending bit mask: " << initLength << "\n"; // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 bitmask.non_tree_cache_size = 0; From e17fb8d923b38221d3ab8ba52677505c2c4a9f93 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 3 Jan 2024 23:32:45 -0500 Subject: [PATCH 34/61] change MAX_SPECULATIVE_TREE_BRANCHES --- include/flexflow/batch_config.h | 23 ++++++++++++++--------- include/flexflow/request_manager.h | 2 +- src/runtime/request_manager.cc | 11 ++++++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ef17ef43ed..3dcae464cc 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -167,9 +167,10 @@ class BeamSearchBatchConfig : public BatchConfig { int current_depth = -1; int max_depth = MAX_BEAM_DEPTH; - BatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - float probs[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - int parent_id[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + BatchConfig::TokenId + tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + int parent_id[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; int sub_request_num; }; @@ -178,10 +179,11 @@ class BeamSearchBatchConfig : public BatchConfig { }; BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS]; - BeamSearchPerTokenInfo beamTokenInfo[MAX_NUM_TOKENS * MAX_BEAM_WIDTH]; + BeamSearchPerTokenInfo + beamTokenInfo[MAX_NUM_TOKENS + + MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS]; - // why is this == MAX_NUM_REQUESTS * MAX_BEAM_WIDTH? 
- int sub_requests[MAX_NUM_REQUESTS * MAX_BEAM_WIDTH]; + int sub_requests[MAX_SPECULATIVE_TREE_BRANCHES]; private: size_t current_iteration; @@ -190,9 +192,12 @@ class BeamSearchBatchConfig : public BatchConfig { struct BeamInferenceResult { static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; BatchConfig::TokenId - token_ids[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - float probs[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; - int parent_id[MAX_NUM_TOKENS * BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + token_ids[MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + float probs[MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; + int parent_id[MAX_NUM_TOKENS * + BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; }; }; // namespace FlexFlow diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 33714c106e..f74b6c5b9f 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -76,7 +76,7 @@ struct BeamTree { struct treeLayer { BeamSearchBatchConfig::TokenId tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; - int parent_ids[BeamSearchBatchConfig::MAX_BEAM_WIDTH]; + int parent_ids[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES]; int nodes_num_this_layer = 0; }; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index c867d2a979..91a5d3be86 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -767,7 +767,9 @@ BeamSearchBatchConfig : 1; new_bc.beamRequestsInfo[i].max_depth = std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH); - for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + for (int j = 0; + j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } @@ -840,7 +842,8 @@ BeamSearchBatchConfig ? 
spec_infer_tree_width[ssm_decoding_steps] : 1; new_bc.beamRequestsInfo[i].max_depth = 0; - for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + for (int j = 0; j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } @@ -900,7 +903,9 @@ BeamSearchBatchConfig std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH, get_max_tokens_per_batch() - new_bc.requestsInfo[i].num_tokens_in_batch - 1); - for (int j = 0; j < BeamSearchBatchConfig::MAX_BEAM_WIDTH; j++) { + for (int j = 0; + j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES; + j++) { new_bc.beamRequestsInfo[i].parent_id[j] = 0; new_bc.beamRequestsInfo[i].probs[j] = 1; } From 429ddb59073f3155acd7f255c97f2153f99d130b Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 4 Jan 2024 00:06:48 -0500 Subject: [PATCH 35/61] =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/flexflow/batch_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 3dcae464cc..5c126293cf 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -183,7 +183,7 @@ class BeamSearchBatchConfig : public BatchConfig { beamTokenInfo[MAX_NUM_TOKENS + MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS]; - int sub_requests[MAX_SPECULATIVE_TREE_BRANCHES]; + int sub_requests[MAX_NUM_REQUESTS]; private: size_t current_iteration; From 4f61b9f348094f87cc4d32625a65ffb64156d325 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 Jan 2024 19:31:20 +0000 Subject: [PATCH 36/61] fix --- src/runtime/request_manager.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 91a5d3be86..56a2c122d3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -43,7 +43,8 @@ std::string LoadBytesFromFile(std::string const &path) { } RequestManager::RequestManager() - : verbose(false), next_available_guid(1000000), num_processed_requests(0) { + : verbose(false), next_available_guid(1000000), num_processed_requests(0), + total_request_run_time(0.0f) { // The following config parameters are set // during ffmodel.compile() // Initialize them to -1 to make sure no one From 29735f2432efd8290bf4ebb301fa96cbb5530eff Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 8 Jan 2024 22:33:22 +0000 Subject: [PATCH 37/61] fixes to run chatgpt.json prompt dataset in python --- .dockerignore | 2 ++ .gitignore | 3 ++- python/flexflow/core/flexflow_cffi.py | 2 +- src/c/flexflow_c.cc | 6 +++++- src/runtime/model.cu | 1 - tests/inference/python_inference_tests.sh | 3 ++- 6 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.dockerignore b/.dockerignore index a7470203e3..b9f228c009 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,3 +17,5 @@ python/flexflow/core/legion_cffi_header.py /inference/tokenizer/* /inference/prompt/* /inference/output/* + +/tests/inference/python_test_configs/*.json diff --git a/.gitignore b/.gitignore index 8fcc105f01..7f6a3c4137 100644 --- a/.gitignore +++ b/.gitignore @@ -186,4 +186,5 @@ gpt_tokenizer # pip version python/flexflow/version.txt -inference_tensors \ No newline at end of file +inference_tensors +tests/inference/python_test_configs/*.json diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index a3c221474d..00133dacb4 
100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -56,7 +56,7 @@ def get_c_name(name): if name is None: return ffi.NULL else: - return ffi.new("char[]", name.encode("ascii")) + return ffi.new("char[]", name.encode("utf-8")) def get_datatype_size(datatype): diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 579fc5e2d1..82a37a9736 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1596,7 +1596,11 @@ flexflow_generation_result_t GenerationResult result = handle->generate(prompts, max_seq_length); DEBUG_PRINT( "[Model] generate %p %s %i", handle, text_str.c_str(), max_seq_length); - assert(result.output_tokens.size() <= max_seq_length); + // If the prompt exceeds max seq len, check that we return the prompt with no + // additional token. Otherwise, check that the output does not exceed the max + // sequence length. + assert(result.output_tokens.size() <= max_seq_length || + result.output_tokens.size() == result.input_tokens.size()); output_length_and_tokens[0] = result.output_tokens.size(); std::copy(result.output_tokens.begin(), result.output_tokens.end(), diff --git a/src/runtime/model.cu b/src/runtime/model.cu index c885b29db2..23b7f0efbe 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -175,7 +175,6 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } - // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 3544f58e26..10c0821835 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -6,11 +6,12 @@ set -e cd "${BASH_SOURCE[0]%/*}" # Generate test configs +rm -rf python_test_configs/*.json python python_test_configs/generate_configs.py # Run all tests # Loop through .json files in the ./python_test_configs dir -for file in ./python_test_configs/*.json; do +for file in ./python_test_configs/*"llama"*.json; do # Check filename prefix if [[ $file == *"incr_dec"* ]]; then script="../../inference/python/incr_decoding.py" From ba4af39404bb92af10926222ceb6d9e88a147fb9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 06:56:36 +0000 Subject: [PATCH 38/61] fix --- tests/inference/python_inference_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 10c0821835..a1ee281914 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -11,7 +11,7 @@ python python_test_configs/generate_configs.py # Run all tests # Loop through .json files in the ./python_test_configs dir -for file in ./python_test_configs/*"llama"*.json; do +for file in ./python_test_configs/*.json; do # Check filename prefix if [[ $file == *"incr_dec"* ]]; then script="../../inference/python/incr_decoding.py" From 9c85a4f5900e45e2e7dfbc98f57bf43237b4dbc9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 10 Jan 2024 13:54:11 -0500 Subject: [PATCH 39/61] Fuse bias and relu in OPT (#1265) --- include/flexflow/model.h | 3 ++- inference/models/opt.cc | 5 ++--- python/flexflow/serve/models/opt.py | 5 ++--- src/ops/kernels/linear_kernels.cu | 22 ++++++++++++++++++++++ src/runtime/model.cc | 27 ++++++++++++++++++++++++--- 5 files changed, 52 insertions(+), 10 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 6f805e21bd..75b1dbcbe9 100644 --- 
a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1090,7 +1090,7 @@ class FFModel { std::unordered_map>> get_bwd_edge_map() const; - // Internal funcitons + // Internal functions Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc); Legion::IndexSpace get_or_create_task_is(MachineView const &view); Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain); @@ -1098,6 +1098,7 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, std::vector const &inputs); diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 0279f83239..e260f8fa36 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -196,7 +196,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor fc1 = ff.dense(final_norm, opt_config.ffn_dim, - AC_MODE_NONE, + AC_MODE_RELU, true, DT_NONE, nullptr, @@ -205,8 +205,7 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc1").c_str()); - Tensor activation = ff.relu(fc1, false); - fc2 = ff.dense(activation, + fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, true, diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index dfd1cde7d4..dd36fa6592 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -216,13 +216,12 @@ def build_model(self, max_tokens_per_batch): fc1 = ffmodel.dense( ff_norm, self.opt_config.ffn_dim, - ActiMode.AC_MODE_NONE, + ActiMode.AC_MODE_RELU, True, name=f"layers_{i}_fc1", ) - activation = ffmodel.relu(fc1, False) fc2 = ffmodel.dense( - activation, + fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 9373c2fb2f..c30c9f71c1 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -252,6 +252,18 @@ Parameter* Linear::get_parameter(int index) */ namespace Internal { +template +__global__ void AddBiasWithReLU(DT *output_ptr, + DT const *bias_ptr, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, out_dim * batch_size) { + int bias_idx = i % out_dim; + DT value = output_ptr[i] + bias_ptr[bias_idx]; + output_ptr[i] = ((float)value > 0.0f) ? value : (DT)0.0f; + } +} + template void forward_kernel(LinearMeta const *m, void const *input_ptr, @@ -343,6 +355,16 @@ void forward_kernel(LinearMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // use_bias = True if (bias_ptr != NULL) { + // fuse bias and relu + if (m->activation == AC_MODE_RELU) { + int parallelism = out_dim * batch_size; + AddBiasWithReLU<<>>( + static_cast
<DT *>(output_ptr), + static_cast<DT const *>
(bias_ptr), + out_dim, + batch_size); + return; + } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 76bed36bda..4270515224 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3236,6 +3236,27 @@ Op *FFModel::create_operator_from_layer( } } +bool FFModel::is_mlp_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + // standard opt relu + if (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) { + return true; + } + // mlp layer with relu embedded in first dense layer + if (l->op_type == OP_LINEAR && layer_idx >= 1 && + layers[layer_idx - 1]->op_type == OP_LINEAR) { + long long value; + layers[layer_idx - 1]->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (activation == AC_MODE_RELU) { + return true; + } + } + return false; +} + void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { @@ -3280,9 +3301,9 @@ void FFModel::create_operators_from_layers() { config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_RELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer (l->op_type == OP_LINEAR && layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_GELU && layers[layer_idx - 2]->op_type == OP_LINEAR) || From 197e308ffb872aee9a326eff1b6c6c0bccb075a7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 12 Jan 2024 12:13:34 -0500 Subject: [PATCH 40/61] fix spec decoding --- deps/legion | 2 +- inference/models/falcon.cc | 2 +- inference/models/llama.cc | 2 +- inference/models/mpt.cc | 2 +- inference/models/opt.cc | 2 +- inference/models/starcoder.cc | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deps/legion b/deps/legion index 626b55689c..d065278678 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c +Subproject commit d0652786784249e933dd62f675591da99a5e960d diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 999ca37037..cf6e90a7de 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,7 +39,7 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e54d6d8811..3deba47953 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,7 +41,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? 
BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 3df67b264c..484a09f62e 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,7 +40,7 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index e260f8fa36..9f75dcea4c 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,7 +42,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index e683376e47..ef5388b6ca 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,7 +48,7 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {mode == TREE_VERIFY_MODE + int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; From ed4dbd808eb20ddd99e6349c41a66ec782c3cefb Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 12 Jan 2024 12:20:52 -0500 Subject: [PATCH 41/61] Revert "fix spec decoding" This reverts commit 197e308ffb872aee9a326eff1b6c6c0bccb075a7. --- deps/legion | 2 +- inference/models/falcon.cc | 2 +- inference/models/llama.cc | 2 +- inference/models/mpt.cc | 2 +- inference/models/opt.cc | 2 +- inference/models/starcoder.cc | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deps/legion b/deps/legion index d065278678..626b55689c 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit d0652786784249e933dd62f675591da99a5e960d +Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index cf6e90a7de..999ca37037 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,7 +39,7 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 3deba47953..e54d6d8811 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,7 +41,7 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? 
BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 484a09f62e..3df67b264c 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,7 +40,7 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9f75dcea4c..e260f8fa36 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,7 +42,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index ef5388b6ca..e683376e47 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,7 +48,7 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {(mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + int const token_dims[] = {mode == TREE_VERIFY_MODE ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(), 1}; From 12fdbac30286eee17d4372ccd58230303dd422d6 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 14 Jan 2024 00:28:25 -0500 Subject: [PATCH 42/61] Add a background server for RequestManager (#1223) * add a background server for RequestManager * . * make incr_decoding work * make spec_infer work * format * update python inference * fix python issues * bug fix * add a Legion future to capture the termination of the background server * Add thread safety for background server. * Simplify backend server design. * resolve conflict. * Add server task timeout. * register callbacks to terminate background worker at exit or termination * [Python] enable decoding multiple requests * update README.md and default configuration * [Python] no need to use the llm context environment to start/stop the background server * require at least four cpu cores * [Python] add back explict start_server()/stop_server(). 
* fix * fix python chatgpt.json --------- Co-authored-by: Gabriele Oliaro Co-authored-by: zwang86 <46699021+zwang86@users.noreply.github.com> Co-authored-by: Zeyu Wang Co-authored-by: xinhaoc --- .github/README.md | 24 +- CMakeLists.txt | 4 +- include/flexflow/flexflow_c.h | 31 +- include/flexflow/model.h | 5 +- include/flexflow/request_manager.h | 76 ++-- .../flexflow/utils}/file_loader.h | 11 +- inference/incr_decoding/CMakeLists.txt | 1 - inference/incr_decoding/incr_decoding.cc | 8 +- inference/models/falcon.cc | 24 +- inference/models/falcon.h | 2 +- inference/models/llama.cc | 32 +- inference/models/llama.h | 2 +- inference/models/mpt.cc | 23 +- inference/models/mpt.h | 2 +- inference/models/opt.cc | 23 +- inference/models/opt.h | 2 +- inference/models/starcoder.cc | 29 +- inference/models/starcoder.h | 2 +- inference/python/incr_decoding.py | 7 +- inference/python/spec_infer.py | 18 +- inference/spec_infer/CMakeLists.txt | 1 - inference/spec_infer/spec_infer.cc | 6 + python/flexflow/core/flexflow_cffi.py | 48 ++- python/flexflow/serve/models/falcon.py | 4 +- python/flexflow/serve/models/llama.py | 5 +- python/flexflow/serve/models/mpt.py | 5 +- python/flexflow/serve/models/opt.py | 4 +- python/flexflow/serve/models/starcoder.py | 5 +- python/flexflow/serve/serve.py | 88 +++-- src/c/flexflow_c.cc | 94 +++-- src/mapper/mapper.cc | 3 +- src/ops/linear.cc | 12 +- {inference => src/runtime}/file_loader.cc | 15 +- src/runtime/inference_manager.cc | 48 ++- src/runtime/model.cc | 18 + src/runtime/request_manager.cc | 330 ++++++++++++------ 36 files changed, 681 insertions(+), 331 deletions(-) rename {inference => include/flexflow/utils}/file_loader.h (84%) rename {inference => src/runtime}/file_loader.cc (98%) diff --git a/.github/README.md b/.github/README.md index 528df18faf..0972135504 100644 --- a/.github/README.md +++ b/.github/README.md @@ -79,7 +79,12 @@ ssms=[] ssm = ff.SSM("JackFram/llama-68m") ssms.append(ssm) ``` -Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs. +Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs. You can also use the following arguments to specify serving configuration when compiling LLMs and SSMs: + +* max\_requests\_per\_batch: the maximum number of requests to serve in a batch (default: 16) +* max\_seq\_length: the maximum number of tokens in a request (default: 256) +* max\_tokens\_per\_batch: the maximum number of tokens to process in a batch (default: 128) + ```python # Create the sampling configs generation_config = ff.GenerationConfig( @@ -91,11 +96,16 @@ for ssm in ssms: ssm.compile(generation_config) # Compile the LLM for inference and load the weights into memory -llm.compile(generation_config, ssms=ssms) +llm.compile(generation_config, + max_requests_per_batch = 16, + max_seq_length = 256, + max_tokens_per_batch = 128, + ssms=ssms) ``` Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult`, which include the output tokens and text. 
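As a rough illustration (editor's sketch, not part of the patch), the snippet below shows how the returned list might be consumed; the `output_text` attribute name is an assumption based on the `GenerationResult` constructor shown elsewhere in this patch series, and `llm` is the model object built in the earlier README snippets.
```python
# Hypothetical usage sketch: consume the list returned by llm.generate().
# The output_text attribute name is assumed, not confirmed by this patch.
prompts = [
    "Here are some travel tips for Tokyo:\n",
    "Three tips for staying healthy are: ",
]
with llm:  # start/stop the background serving thread around generation
    results = llm.generate(prompts)
for prompt, result in zip(prompts, results):
    print(f"Prompt: {prompt!r}")
    print(f"Output: {result.output_text}")
```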
```python -result = llm.generate("Here are some travel tips for Tokyo:\n") +with llm: + result = llm.generate("Here are some travel tips for Tokyo:\n") ``` ### Incremental decoding @@ -124,10 +134,14 @@ generation_config = ff.GenerationConfig( ) # Compile the LLM for inference and load the weights into memory -llm.compile(generation_config) +llm.compile(generation_config, + max_requests_per_batch = 16, + max_seq_length = 256, + max_tokens_per_batch = 128) # Generation begins! -result = llm.generate("Here are some travel tips for Tokyo:\n") +with llm: + result = llm.generate("Here are some travel tips for Tokyo:\n") ``` diff --git a/CMakeLists.txt b/CMakeLists.txt index 90cab126e6..acbe7e385f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -264,14 +264,14 @@ if(NOT BUILD_LEGION_ONLY) LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/include/*.h) - list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) + #list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) file(GLOB_RECURSE FLEXFLOW_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cc) list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") - list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) + #list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 305c8da513..cab3d14ea7 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -611,13 +611,13 @@ flexflow_perf_metrics_t void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); -flexflow_generation_result_t - flexflow_model_generate(flexflow_model_t handle_, - char const *input_text, - int max_num_chars, - char *output_text, - int max_seq_length, - int *output_length_and_tokens); +void flexflow_model_generate(flexflow_model_t handle_, + int num_requests, + char const **input_text, + int max_num_chars, + char **output_text, + int max_seq_length, + int **output_length_and_tokens); void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); @@ -988,6 +988,12 @@ void flexflow_request_manager_register_output_filepath( int flexflow_request_manager_register_ssm_model( flexflow_request_manager_t handle_, flexflow_model_t model_handle_); +void flexflow_request_manager_start_background_server( + flexflow_request_manager_t handle_, flexflow_model_t model_handle_); + +void flexflow_request_manager_terminate_background_server( + flexflow_request_manager_t handle_); + // ----------------------------------------------------------------------- // InferenceManager // ----------------------------------------------------------------------- @@ -1004,6 +1010,11 @@ void flexflow_inference_manager_compile_model_and_allocate_buffer( void flexflow_inference_manager_init_operators_inference( flexflow_inference_manager_t handle_, flexflow_model_t model_handle); +void flexflow_inference_manager_register_model_weights_loader( + flexflow_inference_manager_t handle_, + flexflow_model_t model_handle, + flexflow_file_data_loader_t loader_handle); + // ----------------------------------------------------------------------- // FileDataLoader // ----------------------------------------------------------------------- @@ -1014,13 +1025,13 @@ flexflow_file_data_loader_t int num_kv_heads, int hidden_dim, int qkv_inner_dim, - int tensor_parallelism_degree); + int tensor_parallelism_degree, + bool use_full_precision); void 
flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, - flexflow_model_t model_handle_, - bool use_full_precision); + flexflow_model_t model_handle_); #ifdef __cplusplus } diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 75b1dbcbe9..dd6dc76b4d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -247,6 +247,7 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, + RM_BACKGROUND_SERVING_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, @@ -806,8 +807,8 @@ class FFModel { // ======================================== // Inference APIs // ======================================== - GenerationResult generate(std::vector &prompts, - int max_seq_length); + std::vector generate(std::vector &prompts, + int max_seq_length); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f74b6c5b9f..50a51705cd 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -18,6 +18,8 @@ #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" +#include "flexflow/utils/file_loader.h" +#include #include #include @@ -30,25 +32,29 @@ using tokenizers::Tokenizer; class InferenceManager { public: - InferenceManager(FFConfig const &config); + InferenceManager(); static InferenceManager *get_inference_manager(); void compile_model_and_allocate_buffer(FFModel *model); void init_operators_inference(FFModel *model); Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); - void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, + void load_input_tokens_from_batch_config(FFModel *model, + BatchConfigFuture const &bc, ParallelTensor const input, FFHandler *handlers); - void load_positions(BatchConfigFuture const &bc, + void load_positions(FFModel *model, + BatchConfigFuture const &bc, ParallelTensor position_input, int offset); - void load_inference_metadata_batch_config(BatchConfigFuture const &bc, + void register_model_weights_loader(FFModel *, FileDataLoader *); + void load_inference_metadata_batch_config(FFModel *model, + BatchConfigFuture const &bc, FFHandler *handlers); public: - FFConfig ff_config; std::unordered_map> tensor_buffer; + std::unordered_map model_weights_loaders; int num_devices; }; @@ -91,9 +97,15 @@ struct BeamTree { class RequestManager { public: + enum Status { + INITIALIZED = 1001, + SERVING = 1002, + TERMINATED = 1003, + }; using RequestGuid = BatchConfig::RequestGuid; using TokenId = BatchConfig::TokenId; + static const RequestGuid INVALID_GUID = 0; RequestManager(); static RequestManager *get_request_manager(); size_t get_num_processed_requests(); @@ -125,30 +137,38 @@ class RequestManager { int initLength, int non_tree_size); - FFModel *get_model(int model_id); + FFModel *get_ssm_model(int model_id); - GenerationResult generate_incr_decoding(FFModel *model, - std::vector &prompts, - int max_seq_length); - GenerationResult generate_spec_infer(FFModel *model, - std::vector &prompts, - int max_seq_length); + void serve_incr_decoding(FFModel *model); + void serve_spec_infer(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); RequestGuid 
register_new_request(std::string const &prompt, int max_sequence_length); RequestGuid register_new_request(std::vector const &prompt, int max_sequence_length); + // Methods to start and terminate request manager's background task + void start_background_server(FFModel *model); + bool is_background_server_terminated(); + void terminate_background_server(); + static void terminate_background_server_at_exit(); + // Methods to check and mark request completion bool is_request_completed(RequestGuid const &guid); + void trigger_request_completion_future(RequestGuid const &guid); + // Methods for preparing next batches BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, - InferenceResultFuture const &result); + InferenceResultFuture const &result, + Legion::Context ctx, + Legion::Runtime *runtime); BeamSearchBatchConfig prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); BeamSearchBatchConfigFuture prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result); + BeamInferenceResultFuture const &result, + Legion::Context ctx, + Legion::Runtime *runtime); BeamSearchBatchConfig prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc, InferenceResult const &result, @@ -156,11 +176,15 @@ class RequestManager { BeamSearchBatchConfigFuture prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc, InferenceResultFuture const &result, - int model_id); + int model_id, + Legion::Context ctx, + Legion::Runtime *runtime); TreeVerifyBatchConfig prepare_next_batch_verify( std::vector const &old_batches); TreeVerifyBatchConfigFuture prepare_next_batch_verify( - std::vector const &old_batches); + std::vector const &old_batches, + Legion::Context ctx, + Legion::Runtime *runtime); void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); @@ -187,7 +211,11 @@ class RequestManager { &inputSerializedTree, std::vector> const &outputSerializedTree); - + static void background_serving_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void load_tokens_task(Legion::Task const *task, std::vector const ®ions, @@ -233,9 +261,11 @@ class RequestManager { int max_requests_per_batch; int max_tokens_per_batch; int max_sequence_length; + Status request_manager_status; // tree width in each speculative step, if not specified 1 std::vector spec_infer_tree_width; + // private fields std::unique_ptr tokenizer_; bool verbose; @@ -247,12 +277,9 @@ class RequestManager { std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; + std::unordered_map *> request_to_promise; + std::mutex request_to_promise_mutex; RequestGuid next_available_guid; - // Legion futures for inc_decoding and spec_infer - BatchConfigFuture last_bcf; - InferenceResultFuture last_irf; - TreeVerifyBatchConfigFuture last_tree_bcf; - InferenceResultFuture last_tree_irf; // TODO: Move this two vector to request struct std::unordered_map models; + std::vector ssm_models; // Performance profiling size_t num_processed_requests; + // Background server handler + Legion::Future background_server_handler; + private: struct ProfileInfo { int llm_decoding_steps; diff --git a/inference/file_loader.h b/include/flexflow/utils/file_loader.h similarity index 84% rename from inference/file_loader.h rename to 
include/flexflow/utils/file_loader.h index 6f01a79b80..646eb18da2 100644 --- a/inference/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -30,18 +30,16 @@ class FileDataLoader { int _num_kv_heads, size_t _hidden_dim, size_t _qkv_inner_dim, - int _tensor_parallelism_degree); + int _tensor_parallelism_degree, + bool _use_full_precision); BatchConfig::TokenId *generate_requests(int num, int length); template void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); - void load_quantization_weight(FFModel *ff, - Layer *l, - int weight_idx, - bool use_full_precision); - void load_weights(FFModel *ff, bool use_full_precision); + void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); + void load_weights(FFModel *ff); void load_positions(FFModel *ff, Tensor pt, @@ -54,4 +52,5 @@ class FileDataLoader { size_t hidden_dim, qkv_inner_dim; std::string prompts_filepath; std::string weights_folder; + bool use_full_precision; }; diff --git a/inference/incr_decoding/CMakeLists.txt b/inference/incr_decoding/CMakeLists.txt index 53b7cf0c2f..3e1a1521d7 100644 --- a/inference/incr_decoding/CMakeLists.txt +++ b/inference/incr_decoding/CMakeLists.txt @@ -7,7 +7,6 @@ set(project_target incr_decoding) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} incr_decoding.cc - ../file_loader.cc ../models/llama.cc ../models/opt.cc ../models/falcon.cc diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c3f9052305..f88af3bc43 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -24,6 +24,7 @@ #include +using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; @@ -250,6 +251,8 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + rm->start_background_server(&model); + int total_num_requests = 0; { using json = nlohmann::json; @@ -266,10 +269,13 @@ void FlexFlow::top_level_task(Task const *task, total_num_requests++; prompts.push_back(text); } - GenerationResult result = + std::vector result = model.generate(prompts, 128 /*max_sequence_length*/); } + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + // Execution fence { Future future = runtime->issue_execution_fence(ctx); diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 999ca37037..e00f4e9cfd 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -39,10 +39,11 @@ void FALCON::create_falcon_model(FFModel &ff, Tensor input; { // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -239,6 +240,20 @@ void FALCON::create_falcon_model(FFModel &ff, output = ff.argmax(lm_head, /*beam_Search*/ false); } + FileDataLoader *fileloader = + new FileDataLoader("", + weight_file_path, + falcon_config.n_head, + falcon_config.n_head_kv, + falcon_config.hidden_size, + falcon_config.hidden_size / falcon_config.n_head, + ff.config.tensor_parallelism_degree, + use_full_precision); + + InferenceManager *im = InferenceManager::get_inference_manager(); + im->register_model_weights_loader(&ff, fileloader); + +#ifdef DEADCODE // Compile the model std::cout << "------start compile ----------" << std::endl; InferenceManager *im = InferenceManager::get_inference_manager(); @@ -256,6 +271,7 @@ void FALCON::create_falcon_model(FFModel &ff, // init operators im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/falcon.h b/inference/models/falcon.h index 01226a30dc..fce2dade3f 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e54d6d8811..14b8c31fa1 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -41,10 +41,11 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor input; { - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -264,23 +265,28 @@ void LLAMA::create_llama_model(FFModel &ff, } } + FileDataLoader *fileloader = new FileDataLoader( + "", + weight_file_path, + llama_config.num_attention_heads, + llama_config.num_attention_heads, + llama_config.hidden_size, + llama_config.hidden_size / llama_config.num_attention_heads, + ff.config.tensor_parallelism_degree, + use_full_precision); + InferenceManager *im = InferenceManager::get_inference_manager(); + im->register_model_weights_loader(&ff, fileloader); +#ifdef DEADCODE // Compile the model std::cout << "------start compile ----------" << std::endl; im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - llama_config.num_attention_heads, - llama_config.num_attention_heads, - llama_config.hidden_size, - llama_config.hidden_size / - llama_config.num_attention_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); + fileloader.load_weights(&ff); std::cout << "------load weight finished----------" << std::endl; // init operators im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/llama.h b/inference/models/llama.h index 8ecfcd7155..ba1f0236f9 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 3df67b264c..7e8fc8358f 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -40,10 +40,11 @@ void MPT::create_mpt_model(FFModel &ff, //------------------------------ build the model -------------------------- Tensor input; { - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -246,7 +247,20 @@ void MPT::create_mpt_model(FFModel &ff, } else { output = ff.argmax(lm_head, /*beam_Search*/ false); } + FileDataLoader *fileloader = + new FileDataLoader("", + weight_file_path, + mpt_config.n_heads, + mpt_config.n_heads, + mpt_config.hidden_size, + mpt_config.hidden_size / mpt_config.n_heads, + ff.config.tensor_parallelism_degree, + use_full_precision); + InferenceManager *im = InferenceManager::get_inference_manager(); + im->register_model_weights_loader(&ff, fileloader); + +#ifdef DEADCODE //------------------- compile the model -------------------------------- InferenceManager *im = InferenceManager::get_inference_manager(); im->compile_model_and_allocate_buffer(&ff); @@ -259,6 +273,7 @@ void MPT::create_mpt_model(FFModel &ff, ff.config.tensor_parallelism_degree); fileloader.load_weights(&ff, use_full_precision); im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 1969cd9c89..08597e1d75 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/models/opt.cc b/inference/models/opt.cc index e260f8fa36..3ff4c96fdf 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -42,10 +42,11 @@ void OPT::create_opt_model(FFModel &ff, Tensor position_input; ff.set_position_offset(2); { - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -254,6 +255,19 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(lm_head, /*beam_Search*/ false); } + FileDataLoader *fileloader = new FileDataLoader( + "", + weight_file_path, + opt_config.num_attention_heads, + opt_config.num_attention_heads, + opt_config.hidden_size, + opt_config.hidden_size / opt_config.num_attention_heads, + ff.config.tensor_parallelism_degree, + use_full_precision); + InferenceManager *im = InferenceManager::get_inference_manager(); + im->register_model_weights_loader(&ff, fileloader); + +#ifdef DEADCODE //------------------- compile the model -------------------------------- std::cout << "------start compile ----------" << std::endl; InferenceManager *im = InferenceManager::get_inference_manager(); @@ -269,6 +283,7 @@ void OPT::create_opt_model(FFModel &ff, fileloader.load_weights(&ff, use_full_precision); std::cout << "------finished loading weights----------" << std::endl; im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/opt.h b/inference/models/opt.h index 1ffe096bca..7c736a26d1 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index e683376e47..2327c86119 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -48,10 +48,11 @@ void STARCODER::create_starcoder_model( ff.set_position_offset(0); { // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS); - int const token_dims[] = {mode == TREE_VERIFY_MODE - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; input = ff.create_tensor<2>(token_dims, DT_INT32); position_input = ff.create_tensor<2>(token_dims, DT_INT32); } @@ -221,22 +222,26 @@ void STARCODER::create_starcoder_model( } InferenceManager *im = InferenceManager::get_inference_manager(); + FileDataLoader *fileloader = new FileDataLoader( + "", + weight_file_path, + startcoder_config.num_attention_heads, + 1, + startcoder_config.hidden_size, + startcoder_config.hidden_size / startcoder_config.num_attention_heads, + ff.config.tensor_parallelism_degree, + use_full_precision); + im->register_model_weights_loader(&ff, fileloader); +#ifdef DEADCODE // Compile the model std::cout << "------start compile ----------" << std::endl; im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - startcoder_config.num_attention_heads, - 1, - startcoder_config.hidden_size, - startcoder_config.hidden_size / - startcoder_config.num_attention_heads, - ff.config.tensor_parallelism_degree); fileloader.load_weights(&ff, use_full_precision); std::cout << "------load weight finished----------" << std::endl; // init operators im->init_operators_inference(&ff); +#endif } }; // namespace FlexFlow diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index bc113e4d52..0e9577d569 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -14,7 +14,7 @@ */ #pragma once -#include "file_loader.h" +// #include "file_loader.h" #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/request_manager.h" diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 4a146ab503..6706cf3c29 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -102,13 +102,16 @@ def main(): max_seq_length=256, max_tokens_per_batch=64, ) - - # Generation begins! + + llm.start_server() + if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: result = llm.generate("Three tips for staying healthy are: ") + + llm.stop_server() if __name__ == "__main__": diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index c9fb5cc7bb..8b9a116dc5 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -73,17 +73,9 @@ def get_configs(): "cache_path": "", "refresh_cache": False, "full_precision": False, - }, - { - # required ssm parameter - "ssm_model": "facebook/opt-125m", - # optional ssm parameters - "cache_path": "", - "refresh_cache": False, - "full_precision": False, - }, + } ], - "prompt": "../prompt/test.json", + "prompt": "", "output_file": "", } # Merge dictionaries @@ -148,14 +140,16 @@ def main(): max_tokens_per_batch=64, ssms=ssms, ) + + llm.start_server() - # Generation begins! 
if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: result = llm.generate("Three tips for staying healthy are: ") - + + llm.stop_server() if __name__ == "__main__": print("flexflow inference example (speculative inference)") diff --git a/inference/spec_infer/CMakeLists.txt b/inference/spec_infer/CMakeLists.txt index c877a3530b..1b25de8623 100644 --- a/inference/spec_infer/CMakeLists.txt +++ b/inference/spec_infer/CMakeLists.txt @@ -7,7 +7,6 @@ set(project_target spec_infer) set(CPU_SRC ${FLEXFLOW_CPP_DRV_SRC} spec_infer.cc - ../file_loader.cc ../models/llama.cc ../models/opt.cc ../models/falcon.cc diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index b369a13c1d..7578721dd0 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -22,6 +22,7 @@ #include #include +using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; @@ -385,6 +386,8 @@ void FlexFlow::top_level_task(Task const *task, rm->register_ssm_model(&beam_model); } + rm->start_background_server(&tree_model); + // Register requests from prompt file int total_num_requests = 0; { @@ -407,6 +410,9 @@ void FlexFlow::top_level_task(Task const *task, tree_model.generate(prompts, 128 /*max_sequence_length*/); } + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + // Execution fence { Future future = runtime->issue_execution_fence(ctx); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 00133dacb4..d6f84833be 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3812,26 +3812,28 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def generate(self, prompt, max_sequence_length): - c_input_text = get_c_name(prompt) - max_num_chars = 36000 - c_output_text = ffi.new("char[]", max_num_chars) - c_output_length_and_tokens = ffi.new("int[]", max_sequence_length + 100) + def generate(self, prompt_list, max_sequence_length): + assert isinstance(prompt_list, list) + c_input_texts = [get_c_name(prompt) for prompt in prompt_list] + max_num_chars = 5 * (max_sequence_length + 100) + c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] + c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] ffc().flexflow_model_generate( self.handle, - c_input_text, + len(prompt_list), + c_input_texts, max_num_chars, - c_output_text, + c_output_texts, max_sequence_length, c_output_length_and_tokens, ) - output_length = c_output_length_and_tokens[0] - output_tokens = [] - for i in range(output_length): - output_tokens.append(c_output_length_and_tokens[i + 1]) + #output_length = c_output_length_and_tokens[0] + #output_tokens = [] + #for i in range(output_length): + # output_tokens.append(c_output_length_and_tokens[i + 1]) from flexflow.serve import GenerationResult - return GenerationResult(ffi.string(c_output_text), output_tokens) + return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) @@ -4202,6 +4204,14 @@ def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( self.handle, max_length) + def start_server(self, model): + return 
ffc().flexflow_request_manager_start_background_server( + self.handle, model.handle + ) + + def stop_server(self): + return ffc().flexflow_request_manager_terminate_background_server( + self.handle) # ----------------------------------------------------------------------- # InferenceManager # ----------------------------------------------------------------------- @@ -4224,6 +4234,10 @@ def init_operators_inference(self, model): self.handle, model.handle ) + def register_model_weights_loader(self, model, fileloader): + ffc().flexflow_inference_manager_register_model_weights_loader( + self.handle, model.handle, fileloader.handle + ) # ----------------------------------------------------------------------- # FileDataLoader @@ -4241,6 +4255,7 @@ def __init__( hidden_dim, qkv_inner_dim, tensor_parallelism_degree, + use_full_precision ): c_weight_file_path = get_c_name(weight_file_path) self.handle = ffc().flexflow_file_data_loader_create( @@ -4250,13 +4265,14 @@ def __init__( hidden_dim, qkv_inner_dim, tensor_parallelism_degree, + use_full_precision ) self._handle = ffi.gc(self.handle, ffc().flexflow_file_data_loader_destroy) - def load_weights(self, model, data_type): + def load_weights(self, model): # Check data type and create use_full_precision boolean - assert data_type == DataType.DT_FLOAT or data_type == DataType.DT_HALF - use_full_precision = data_type == DataType.DT_FLOAT + #assert data_type == DataType.DT_FLOAT or data_type == DataType.DT_HALF + #use_full_precision = data_type == DataType.DT_FLOAT ffc().flexflow_file_data_loader_load_weights( - self.handle, model.handle, use_full_precision + self.handle, model.handle ) diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 2b114f09b3..e9cd789bcc 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 64 self.bias = hf_config.bias self.hidden_size = hf_config.hidden_size self.layer_norm_epsilon = hf_config.layer_norm_epsilon @@ -70,6 +71,7 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.falcon_config.max_spec_tree_token_num # Sanity checks if self.falcon_config.hidden_size % self.falcon_config.n_head != 0: @@ -84,7 +86,7 @@ def __init__( f"Number of q attention heads ({self.falcon_config.n_head}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 7ba0e78a37..900ab48bcd 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 64 self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.hidden_size = hf_config.hidden_size @@ -62,6 +63,8 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 
2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.llama_config.max_spec_tree_token_num + # Sanity checks if self.llama_config.hidden_size % self.llama_config.num_attention_heads != 0: @@ -81,7 +84,7 @@ def __init__( f"Number of attention heads ({self.llama_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 79a5bb940f..c0f995bf22 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 64 self.hidden_size = hf_config.d_model self.n_heads = hf_config.n_heads self.n_layers = hf_config.n_layers @@ -57,6 +58,8 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.mpt_config.max_spec_tree_token_num + # Sanity checks if self.mpt_config.hidden_size % self.mpt_config.n_heads != 0: @@ -72,7 +75,7 @@ def __init__( raise ValueError( f"Number of attention heads ({self.mpt_config.n_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index dd36fa6592..dc3f841a5a 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 64 self.do_layer_norm_before = hf_config.do_layer_norm_before self.dropout = hf_config.dropout self.enable_bias = hf_config.enable_bias @@ -63,6 +64,7 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.opt_config.max_spec_tree_token_num # Sanity checks if self.opt_config.hidden_size % self.opt_config.num_attention_heads != 0: @@ -82,7 +84,7 @@ def __init__( f"Number of attention heads ({self.opt_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index f4f28a70e1..4a6f191abd 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -23,6 +23,7 @@ def __init__(self, hf_config): #self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 + self.max_spec_tree_token_num = 
64 self.dropout_p = hf_config.attn_pdrop self.hidden_size = hf_config.n_embd self.layer_norm_epsilon = hf_config.layer_norm_epsilon @@ -61,6 +62,8 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 + max_verify_tokens_per_batch = max_tokens_per_batch + self.starcoder_config.max_spec_tree_token_num + # Sanity checks if ( @@ -84,7 +87,7 @@ def __init__( f"Number of attention heads ({self.starcoder_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model(max_tokens_per_batch) + self.build_model(max_tokens_per_batch if self.mode == InferenceMode.INC_DECODING_MODE else max_verify_tokens_per_batch) def build_model(self, max_tokens_per_batch): ffmodel = FFModel(self.ffconfig) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 55601f957e..d1a935e5fc 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -118,6 +118,11 @@ def __init__( self.refresh_cache = refresh_cache self.output_file = output_file + def __del__(self): + # Stop the background server before deleting the object + if type(self) == LLM: + self.rm.stop_server() + def __get_ff_model_type(self): architectures = getattr(self.hf_config, "architectures", []) ff_arch = None @@ -283,25 +288,6 @@ def download_hf_tokenizer_if_needed(self): else: print(f"Loading '{self.model_name}' tokenizer from the cache...") - def __load_hf_weights(self): - print("Loading hf weights...") - - self.download_hf_weights_if_needed() - - # Create file data loader, load weights into tensors - model_configs = self.config_class(self.hf_config) - - self.fileloader = FileDataLoader( - self.weights_path, - model_configs.num_attention_heads, - model_configs.num_key_value_heads, - model_configs.hidden_size, - model_configs.hidden_size // model_configs.num_attention_heads, - self.ffconfig.tensor_parallelism_degree, - ) - - self.fileloader.load_weights(self.model.ffmodel, self.data_type) - def compile( self, generation_config: GenerationConfig = GenerationConfig(), @@ -379,12 +365,27 @@ def compile( max_tokens_per_batch ) - # Create inference manager + # Download the weights from huggingface (if needed) + self.download_hf_weights_if_needed() + + # Create file data loader, load weights into tensors + model_configs = self.config_class(self.hf_config) + + self.fileloader = FileDataLoader( + self.weights_path, + model_configs.num_attention_heads, + model_configs.num_key_value_heads, + model_configs.hidden_size, + model_configs.hidden_size // model_configs.num_attention_heads, + self.ffconfig.tensor_parallelism_degree, + self.data_type == DataType.DT_FLOAT + ) + + # Register weights file loader self.im = InferenceManager() - self.im.compile_model_and_allocate_buffer(self.model.ffmodel) + self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) - # Download the weights and tokenizer from huggingface (if needed) and load them - self.__load_hf_weights() + # Download the tokenizer from huggingface (if needed) and load them self.download_hf_tokenizer_if_needed() # Create tokenizer (this must be done after we have downloaded the tokenizer @@ -399,11 +400,14 @@ def compile( ) self.rm.register_output_filepath(self.output_file) - self.im.init_operators_inference(self.model.ffmodel) - for ssm in self.ssms: self.rm.register_ssm_model(ssm.model.ffmodel) + # start background server + if (mode == InferenceMode.TREE_VERIFY_MODE) or (mode == 
InferenceMode.INC_DECODING_MODE): + import atexit + atexit.register(self.rm.stop_server) + def generate(self, prompts: Union[str, List[str]], max_length: int = 128): """Generate tokens based on the input prompt(s) @@ -415,16 +419,32 @@ def generate(self, prompts: Union[str, List[str]], max_length: int = 128): if type(prompts) == str: if len(prompts) == 0: return None - return self.model.ffmodel.generate(prompts, max_length) + return self.model.ffmodel.generate([prompts], max_length) elif type(prompts) == list: if len(prompts) == 0: return [] - return [ - self.model.ffmodel.generate(prompt, max_length) for prompt in prompts - ] + return self.model.ffmodel.generate(prompts, max_length) else: assert False, "Please pass a non-empty string or list of strings" - + + def start_server(self): + self.rm.start_server(self.model.ffmodel) + print("Background server started.") + + def stop_server(self): + self.rm.stop_server() + print("Background server stoped.") + + def __enter__(self): + # Start the server when entering the context + #self.rm.start_server(self.model.ffmodel) + return self + + def __exit__(self, exc_type, exc_value, traceback): + # Stop the server when exiting the context + #self.rm.stop_server() + if exc_type: + print(f"Exception occurred: {exc_value}") class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -461,9 +481,9 @@ def __init__( def compile( self, generation_config: GenerationConfig = GenerationConfig(), - max_requests_per_batch: int = 1, + max_requests_per_batch: int = 16, max_seq_length: int = 256, - max_tokens_per_batch: int = 64, + max_tokens_per_batch: int = 128, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, model_specific_pipeline_parallelism_degree: int = 1, @@ -475,11 +495,11 @@ def compile( :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional - :param max_requests_per_batch: The maximum batch size to allow, defaults to 1 + :param max_requests_per_batch: The maximum batch size to allow, defaults to 16 :type max_requests_per_batch: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 :type max_seq_length: int, optional - :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 + :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 :type max_tokens_per_batch: int, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 :type model_specific_data_parallelism_degree: int, optional diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 82a37a9736..9ad58695ad 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -17,7 +17,7 @@ #include "flexflow/dataloader.h" #include "flexflow/mapper.h" #include "flexflow/request_manager.h" -#include "inference/file_loader.h" +#include "flexflow/utils/file_loader.h" using namespace Legion; using namespace FlexFlow; @@ -1582,32 +1582,41 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { handle->set_transformer_layer_id(id); } -flexflow_generation_result_t - flexflow_model_generate(flexflow_model_t handle_, - char const *input_text, - int 
max_num_chars, - char *output_text, - int max_seq_length, - int *output_length_and_tokens) { +void flexflow_model_generate(flexflow_model_t handle_, + int num_requests, + char const **input_texts, + int max_num_chars, + char **output_texts, + int max_seq_length, + int **output_length_and_tokens) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); std::vector prompts; - std::string const text_str(input_text); - prompts.push_back(input_text); - GenerationResult result = handle->generate(prompts, max_seq_length); - DEBUG_PRINT( - "[Model] generate %p %s %i", handle, text_str.c_str(), max_seq_length); + for (int i = 0; i < num_requests; i++) { + std::string const text_str(input_texts[i]); + prompts.push_back(text_str); + DEBUG_PRINT("[Model] generate[%d] %p %s %i", + i, + handle, + text_str.c_str(), + max_seq_length); + } + std::vector results = + handle->generate(prompts, max_seq_length); // If the prompt exceeds max seq len, check that we return the prompt with no // additional token. Otherwise, check that the output does not exceed the max // sequence length. - assert(result.output_tokens.size() <= max_seq_length || - result.output_tokens.size() == result.input_tokens.size()); - output_length_and_tokens[0] = result.output_tokens.size(); - std::copy(result.output_tokens.begin(), - result.output_tokens.end(), - output_length_and_tokens + 1); - std::memcpy( - output_text, result.output_text.c_str(), result.output_text.length()); - return FFCObjectWrapper::wrap(&result); + for (int i = 0; i < num_requests; i++) { + assert(results[i].output_tokens.size() <= max_seq_length || + results[i].output_tokens.size() == results[i].input_tokens.size()); + output_length_and_tokens[i][0] = results[i].output_tokens.size(); + std::copy(results[i].output_tokens.begin(), + results[i].output_tokens.end(), + output_length_and_tokens[i] + 1); + std::memcpy(output_texts[i], + results[i].output_text.c_str(), + results[i].output_text.length()); + } + // return FFCObjectWrapper::wrap(&results[0]); } void flexflow_model_set_position_offset(flexflow_model_t handle_, @@ -2616,6 +2625,22 @@ int flexflow_request_manager_register_ssm_model( return handle->register_ssm_model(model_handle); } +void flexflow_request_manager_start_background_server( + flexflow_request_manager_t handle_, flexflow_model_t model_handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model_handle = FFCObjectWrapper::unwrap(model_handle_); + DEBUG_PRINT( + "[RequestManager] start background server %p %p", handle, model_handle); + handle->start_background_server(model_handle); +} + +void flexflow_request_manager_terminate_background_server( + flexflow_request_manager_t handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[RequestManager] terminate background server %p", handle); + handle->terminate_background_server(); +} + // ----------------------------------------------------------------------- // InferenceManager // ----------------------------------------------------------------------- @@ -2644,6 +2669,20 @@ void flexflow_inference_manager_init_operators_inference( handle->init_operators_inference(model); } +void flexflow_inference_manager_register_model_weights_loader( + flexflow_inference_manager_t handle_, + flexflow_model_t model_handle, + flexflow_file_data_loader_t loader_handle) { + InferenceManager *handle = FFCObjectWrapper::unwrap(handle_); + FFModel *model = FFCObjectWrapper::unwrap(model_handle); + FileDataLoader *loader = FFCObjectWrapper::unwrap(loader_handle); + 
DEBUG_PRINT("[InferenceManager] register_model_weights_loader %p %p %p", + handle, + model, + loader); + handle->register_model_weights_loader(model, loader); +} + // ----------------------------------------------------------------------- // FileDataLoader // ----------------------------------------------------------------------- @@ -2654,7 +2693,8 @@ flexflow_file_data_loader_t int num_kv_heads, int hidden_dim, int qkv_inner_dim, - int tensor_parallelism_degree) { + int tensor_parallelism_degree, + bool use_full_precision) { assert(weight_file_path != nullptr && "Cannot convert nullptr char * to std::string"); std::string const weight_file_path_str(weight_file_path); @@ -2664,7 +2704,8 @@ flexflow_file_data_loader_t num_kv_heads, hidden_dim, qkv_inner_dim, - tensor_parallelism_degree); + tensor_parallelism_degree, + use_full_precision); DEBUG_PRINT("[FileDataLoader] new %p", handle); return FFCObjectWrapper::wrap(handle); } @@ -2676,9 +2717,8 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_) { } void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, - flexflow_model_t model_handle_, - bool use_full_precision) { + flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - handle->load_weights(model, use_full_precision); + handle->load_weights(model); } diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index a2fb1d89be..bc26a79d3e 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -286,7 +286,8 @@ void FFMapper::select_task_options(const MapperContext ctx, if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || - (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID)) { + (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID) || + (task.task_id == RM_BACKGROUND_SERVING_TASK_ID)) { output.initial_proc = all_cpus[0]; return; } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 63b26bfe7d..6ca6038778 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -467,12 +467,12 @@ OpMeta *Linear::init_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - TensorAccessorW acc_kernel(regions[2], - task->regions[2], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); + // TensorAccessorW acc_kernel(regions[2], + // task->regions[2], + // FID_DATA, + // ctx, + // runtime, + // false /*readOutput*/); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); diff --git a/inference/file_loader.cc b/src/runtime/file_loader.cc similarity index 98% rename from inference/file_loader.cc rename to src/runtime/file_loader.cc index 7c6870d439..56558b3185 100644 --- a/inference/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "file_loader.h" +#include "flexflow/utils/file_loader.h" #include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" @@ -28,11 +28,13 @@ FileDataLoader::FileDataLoader(std::string _prompts_filepath, int _num_kv_heads, size_t _hidden_dim, size_t _qkv_inner_dim, - int _tensor_parallelism_degree) + int _tensor_parallelism_degree, + bool _use_full_precision) : prompts_filepath(_prompts_filepath), weights_folder(_weights_folder), num_heads(_num_heads), num_kv_heads(_num_kv_heads), hidden_dim(_hidden_dim), qkv_inner_dim(_qkv_inner_dim), - tensor_parallelism_degree(_tensor_parallelism_degree){}; + tensor_parallelism_degree(_tensor_parallelism_degree), + use_full_precision(_use_full_precision){}; BatchConfig::TokenId *FileDataLoader::generate_requests(int num, int length) { @@ -650,8 +652,7 @@ void load_from_quantized_file(char *ptr, void FileDataLoader::load_quantization_weight(FFModel *ff, Layer *l, - int weight_idx, - bool use_full_precision) { + int weight_idx) { Tensor weight = l->weights[weight_idx]; size_t volume = 1; std::vector dims_vec; @@ -789,7 +790,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, delete data; } -void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { +void FileDataLoader::load_weights(FFModel *ff) { for (Layer *l : ff->layers) { if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { continue; @@ -809,7 +810,7 @@ void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { case DT_INT4: case DT_INT8: // load weights in quantization - load_quantization_weight(ff, l, i, use_full_precision); + load_quantization_weight(ff, l, i); break; default: assert(false && "Unsupported data type"); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index cc76da58bb..6588cbceeb 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,8 +28,8 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager(FFConfig const &_config) - : ff_config(_config) { +InferenceManager::InferenceManager() { +#ifdef DEADCODE num_devices = ff_config.workersPerNode * ff_config.numNodes; // Check parallelization degrees assert(ff_config.data_parallelism_degree <= num_devices && @@ -53,6 +53,7 @@ InferenceManager::InferenceManager(FFConfig const &_config) num_devices && "Product of data, tensor, and pipeline parallelism degrees does not " "match the number of available devices"); +#endif } InferenceManager *inference_manager_singleton = nullptr; @@ -60,8 +61,8 @@ InferenceManager *inference_manager_singleton = nullptr; /*static*/ InferenceManager *InferenceManager::get_inference_manager() { if (inference_manager_singleton == nullptr) { - FFConfig ffconfig; - inference_manager_singleton = new InferenceManager(ffconfig); + // FFConfig ffconfig; + inference_manager_singleton = new InferenceManager(); } return inference_manager_singleton; } @@ -382,13 +383,13 @@ FutureMap InferenceManager::inference(FFModel *model, // input. 
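The same restructuring is visible from Python: the `FileDataLoader` now receives the precision flag when it is constructed, and instead of loading weights directly it is registered with the `InferenceManager`, which invokes it once the background server has compiled the model. A rough sketch of the new wiring, mirroring `LLM.compile` in `serve.py` (the `cfg` fields and other values here are placeholders):

```python
# Rough sketch of the new weight-loading wiring (mirrors LLM.compile in serve.py).
from flexflow.core import *  # provides FileDataLoader and InferenceManager

def register_weights(ffmodel, weights_path, cfg, tp_degree, full_precision):
    fileloader = FileDataLoader(
        weights_path,
        cfg.num_attention_heads,
        cfg.num_key_value_heads,
        cfg.hidden_size,
        cfg.hidden_size // cfg.num_attention_heads,  # qkv inner dim
        tp_degree,
        full_precision,          # use_full_precision is now a constructor argument
    )
    im = InferenceManager()      # wraps the C++ singleton
    im.register_model_weights_loader(ffmodel, fileloader)
    # fileloader.load_weights() is no longer called here: the background serving
    # task compiles the model and then asks the registered loader for the weights.
```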
assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_positions(bc, pt, model->position_offset); + load_positions(model, bc, pt, model->position_offset); } else { found_input_operator = true; assert(op->numOutputs == 1); ParallelTensor pt = tensor_buffer[op->outputs[0]][batch_index]; - load_input_tokens_from_batch_config(bc, pt, model->handlers); - load_inference_metadata_batch_config(bc, model->handlers); + load_input_tokens_from_batch_config(model, bc, pt, model->handlers); + load_inference_metadata_batch_config(model, bc, model->handlers); } } @@ -418,11 +419,12 @@ FutureMap InferenceManager::inference(FFModel *model, }; void InferenceManager::load_input_tokens_from_batch_config( + FFModel *model, BatchConfigFuture const &bc, ParallelTensor const input, FFHandler *handlers) { - Context ctx = ff_config.lg_ctx; - Runtime *runtime = ff_config.lg_hlr; + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; size_t machine_view_hash = input->machine_view.hash(); ArgumentMap argmap; Domain domain = runtime->get_index_space_domain(ctx, input->parallel_is); @@ -462,13 +464,13 @@ void InferenceManager::load_input_tokens_from_batch_config( } void InferenceManager::load_inference_metadata_batch_config( - BatchConfigFuture const &bc, FFHandler *handlers) { - Context ctx = ff_config.lg_ctx; - Runtime *runtime = ff_config.lg_hlr; + FFModel *model, BatchConfigFuture const &bc, FFHandler *handlers) { + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; ArgumentMap argmap; Domain domain = - runtime->get_index_space_domain(ctx, ff_config.all_gpu_task_is); + runtime->get_index_space_domain(ctx, model->config.all_gpu_task_is); Rect<1> task_rect = domain; int idx = 0; @@ -478,7 +480,7 @@ void InferenceManager::load_inference_metadata_batch_config( } IndexLauncher launcher(RM_LOAD_BATCH_CONFIG_TASK_ID, - ff_config.all_gpu_task_is, + model->config.all_gpu_task_is, TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, @@ -489,11 +491,12 @@ void InferenceManager::load_inference_metadata_batch_config( runtime->execute_index_space(ctx, launcher); } -void InferenceManager::load_positions(BatchConfigFuture const &bc, +void InferenceManager::load_positions(FFModel *model, + BatchConfigFuture const &bc, ParallelTensor position_input, int offset) { - Context ctx = ff_config.lg_ctx; - Runtime *runtime = ff_config.lg_hlr; + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; size_t machine_view_hash = position_input->machine_view.hash(); ArgumentMap argmap; IndexLauncher launcher(RM_LOAD_POSITION_TASK_ID, @@ -514,6 +517,11 @@ void InferenceManager::load_positions(BatchConfigFuture const &bc, runtime->execute_index_space(ctx, launcher); } +void InferenceManager::register_model_weights_loader(FFModel *model, + FileDataLoader *loader) { + model_weights_loaders[model] = loader; +} + void FFModel::set_transformer_layer_id(int id) { // We assume that users call this function with // monotonically increasing ids @@ -529,6 +537,12 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + // Request at least four CPU processors for inference runs + assert( + config.cpusPerNode >= 4 && + "FlexFlow Serve requires at least four CPU cores per node, please add " + "`-ll:cpu 4` in the command line if you are using the C++ interface or " + "set `num_cpus` in `ff.init` if you are using the Python interface"); Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; 
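The `config.cpusPerNode >= 4` check added to `FFModel::compile_inference` above turns a silent hang into an explicit error when too few CPU processors are reserved, which matters now that request serving runs as a background CPU task. As the assertion message says, C++ drivers pass `-ll:cpu 4` on the command line; for the Python interface the equivalent knob is `num_cpus` in `ff.init`, roughly as sketched below (values other than `num_cpus` are placeholders):

```python
# Illustrative runtime configs for the Python interface; values are placeholders,
# but num_cpus must satisfy the >= 4 requirement checked in compile_inference.
import flexflow.serve as ff

ff.init(
    {
        "num_gpus": 1,
        "memory_per_gpu": 14000,             # MB of GPU memory (placeholder)
        "zero_copy_memory_per_node": 10000,  # MB of zero-copy memory (placeholder)
        "num_cpus": 4,                       # FlexFlow Serve needs at least 4
    }
)
```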
config.computationMode = COMP_MODE_INFERENCE; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4270515224..c07c33efca 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4480,6 +4480,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, RequestManager::prepare_next_batch_verify_task>(registrar); } } + // RequestManager background serving task + { + TaskVariantRegistrar registrar(RM_BACKGROUND_SERVING_TASK_ID, + "RequestManager Background Serving Task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + // registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + RequestManager::background_serving_task>( + registrar, "RequestManager Background Serving Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // ElementUnary task { TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 56a2c122d3..46e17d4fdc 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -18,6 +18,7 @@ // #include "flexflow/tokenizers.h" #include #include +#include #include #include #include @@ -43,7 +44,8 @@ std::string LoadBytesFromFile(std::string const &path) { } RequestManager::RequestManager() - : verbose(false), next_available_guid(1000000), num_processed_requests(0), + : request_manager_status(INITIALIZED), verbose(false), + next_available_guid(1000000), num_processed_requests(0), total_request_run_time(0.0f) { // The following config parameters are set // during ffmodel.compile() @@ -53,26 +55,6 @@ RequestManager::RequestManager() max_requests_per_batch = -1; max_tokens_per_batch = -1; max_sequence_length = -1; - { - // Initialize futures for spec infer - TreeVerifyBatchConfig tree_bc; - InferenceResult tree_ir; - TreeVerifyBatchConfigFuture tree_bcf = - Future::from_value(tree_bc); - InferenceResultFuture tree_irf = - Future::from_value(tree_ir); - last_tree_bcf = tree_bcf; - last_tree_irf = tree_irf; - } - { - // Initialize futures for incr decoding - BatchConfig bc; - InferenceResult ir; - BatchConfigFuture bcf = Future::from_value(bc); - InferenceResultFuture irf = Future::from_value(ir); - last_bcf = bcf; - last_irf = irf; - } } void RequestManager::set_max_requests_per_batch(int max_num_requests) { @@ -171,19 +153,19 @@ void RequestManager::register_output_filepath( } int RequestManager::register_ssm_model(FFModel *model) { - int model_id = models.size(); - models.push_back(model); - std::cout << "Register new model with id: " << model_id << std::endl; + int model_id = ssm_models.size(); + ssm_models.push_back(model); + std::cout << "Register new ssm model with id: " << model_id << std::endl; return model_id; } -FFModel *RequestManager::get_model(int model_id) { - assert(model_id < models.size()); - return models[model_id]; +FFModel *RequestManager::get_ssm_model(int model_id) { + assert(model_id < ssm_models.size()); + return ssm_models[model_id]; } size_t RequestManager::get_num_ssms() { - return models.size(); + return ssm_models.size(); } RequestManager::RequestGuid @@ -203,7 +185,7 @@ RequestManager::RequestGuid << prompt.size() << ".\n"; printf("tokens size: %zu\n", request.tokens.size()); - return 0; + return INVALID_GUID; } else { request.initial_len = prompt.size(); request.tokens = prompt; @@ -214,7 +196,7 @@ RequestManager::RequestGuid "decoding." 
<< std::endl; } else { - std::cout << "Num of models: " << get_num_ssms() << std::endl; + std::cout << "Num of SSMs: " << get_num_ssms() << std::endl; for (int i = 0; i < get_num_ssms(); i++) { BeamTree beam_tree = BeamTree{}; request.beam_trees.push_back(beam_tree); @@ -223,6 +205,10 @@ RequestManager::RequestGuid pending_request_queue.push(request); all_requests[request.guid] = request; + { + const std::lock_guard lock(request_to_promise_mutex); + request_to_promise[request.guid] = new std::promise(); + } if (verbose) { std::cout << "new req: " << request.tokens.size() << std::endl; @@ -261,7 +247,7 @@ RequestManager::RequestGuid << tokens.size() << ".\n"; printf("tokens size: %zu\n", tokens.size()); - return 0; + return INVALID_GUID; } for (int i = 0; i < tokens.size(); i++) { std::cout << "[" << i << "]" << tokens.at(i) << "\n"; @@ -274,7 +260,7 @@ RequestManager::RequestGuid "decoding." << std::endl; } else { - std::cout << "Num of models: " << get_num_ssms() << std::endl; + std::cout << "Num of SSMs: " << get_num_ssms() << std::endl; for (int i = 0; i < get_num_ssms(); i++) { BeamTree beam_tree = BeamTree{}; request.beam_trees.push_back(beam_tree); @@ -283,6 +269,11 @@ RequestManager::RequestGuid pending_request_queue.push(request); all_requests[request.guid] = request; + { + const std::lock_guard lock(request_to_promise_mutex); + request_to_promise[request.guid] = new std::promise(); + } + { std::string output = "New request tokens:"; output = "[" + std::to_string(request.guid) + "]" + output; @@ -312,10 +303,22 @@ bool RequestManager::is_request_completed(RequestGuid const &guid) { GenerationResult RequestManager::get_generation_result(RequestGuid const &guid) { - const std::lock_guard lock(request_queue_mutex); - assert(request_generation_results.find(guid) != - request_generation_results.end()); - return request_generation_results[guid]; + // First get the future of the request + std::future future; + { + const std::lock_guard lock(request_to_promise_mutex); + assert(request_to_promise.find(guid) != request_to_promise.end()); + future = request_to_promise[guid]->get_future(); + } + // Wait until the result is completed + future.get(); + // Get the generation result + { + const std::lock_guard lock(request_queue_mutex); + assert(request_generation_results.find(guid) != + request_generation_results.end()); + return request_generation_results[guid]; + } } size_t RequestManager::get_num_processed_requests() { @@ -324,10 +327,9 @@ size_t RequestManager::get_num_processed_requests() { BatchConfigFuture RequestManager::prepare_next_batch(BatchConfigFuture const &old_bc, - InferenceResultFuture const &result) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); - + InferenceResultFuture const &result, + Context ctx, + Runtime *runtime) { RequestManager *rm = this; TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_TASK_ID, TaskArgument(&rm, sizeof(RequestManager *))); @@ -394,10 +396,6 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, request_completed = true; } if (request_completed) { - request.status = Request::COMPLETED; - log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", - old_bc.requestsInfo[i].request_guid, - request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token @@ -405,14 +403,18 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, request.tokens.at(0) == bos_token_id) { 
output = " " + output; } - { - // update generation result and trigger future + // update generation result GenerationResult &gr = request_generation_results[request.guid]; assert(gr.guid == request.guid); gr.output_tokens = request.tokens; gr.output_text = output; } + request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); + log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", + old_bc.requestsInfo[i].request_guid, + request.tokens.size()); log_req_mgr.print("Final output: %s", output.c_str()); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; @@ -545,9 +547,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init( TreeVerifyBatchConfigFuture const &old_bc, InferenceResultFuture const &result, - int model_id) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); + int model_id, + Context ctx, + Runtime *runtime) { RequestManager *rm = this; TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, @@ -581,8 +583,6 @@ BeamSearchBatchConfig std::cout << "\n############### prepare_next_batch_init ###############\n"; } - std::cout << "\n############### prepare_next_batch_init ###############\n"; - // Step 1: use result to update requests BeamSearchBatchConfig new_bc; new_bc.num_tokens = 0; @@ -664,7 +664,6 @@ BeamSearchBatchConfig request.tokens.push_back(token_pair.first); } } - request.status = Request::COMPLETED; log_req_mgr.print("[Done] guid(%zu) with final length(%zu)", request.guid, request.tokens.size()); @@ -676,12 +675,14 @@ BeamSearchBatchConfig output = " " + output; } { - // update generation result and trigger future + // update generation result GenerationResult &gr = request_generation_results[request.guid]; assert(gr.guid == request.guid); gr.output_tokens = request.tokens; gr.output_text = output; } + request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); log_req_mgr.print("Final output: %s", output.c_str()); new_bc.request_completed[i] = true; @@ -983,9 +984,9 @@ BeamSearchBatchConfig /***** Beam Search Phase *****/ BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam( BeamSearchBatchConfigFuture const &old_bc, - BeamInferenceResultFuture const &result) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); + BeamInferenceResultFuture const &result, + Context ctx, + Runtime *runtime) { RequestManager *rm = this; TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, @@ -1298,9 +1299,9 @@ BeamSearchBatchConfig /***** Verify Phase *****/ TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( - std::vector const &old_batches) { - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); + std::vector const &old_batches, + Context ctx, + Runtime *runtime) { RequestManager *rm = this; TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, @@ -1328,7 +1329,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( std::vector const &old_batches) { const std::lock_guard lock(request_queue_mutex); - std::cout << "\n############### prepare_next_batch_verify ###############\n"; + if (verbose) { + std::cout + << "\n############### prepare_next_batch_verify ###############\n"; + } assert(old_batches.size() > 0); @@ -2277,39 +2281,107 @@ std::vector> return merged_tree; } -GenerationResult FFModel::generate(std::vector &prompts, - int max_seq_length) { 
+std::vector + FFModel::generate(std::vector &prompts, int max_seq_length) { + RequestManager *rm = RequestManager::get_request_manager(); + std::vector guids; + for (int i = 0; i < prompts.size(); i++) { + RequestManager::RequestGuid guid = + rm->register_new_request(prompts.at(i), max_seq_length); + if (guid != RequestManager::INVALID_GUID) { + guids.push_back(guid); + } + } + std::vector results; + for (int i = 0; i < guids.size(); i++) { + results.push_back(rm->get_generation_result(guids[i])); + } + return results; +} + +void RequestManager::start_background_server(FFModel *model) { + assert(request_manager_status == INITIALIZED); + request_manager_status = SERVING; + // Start background task + Runtime *runtime = Runtime::get_runtime(); + Context ctx = Runtime::get_context(); + TaskLauncher launcher(RM_BACKGROUND_SERVING_TASK_ID, + TaskArgument(&model, sizeof(FFModel *))); + background_server_handler = runtime->execute_task(ctx, launcher); + // Register callbacks for normal exit + { + int ret = std::atexit(RequestManager::terminate_background_server_at_exit); + assert(ret == 0); // make sure the callback is successfully registered + } + // Register callbacks for termination + { + std::set_terminate([]() { + RequestManager::terminate_background_server_at_exit(); + std::abort(); + }); + } +} + +void RequestManager::background_serving_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { RequestManager *rm = RequestManager::get_request_manager(); + FFModel *llm = *(FFModel **)task->args; + { + // Update FFModel's lg_hlr and lg_ctx to the current + // task's runtime and ctx, since all future legion tasks are + // launched in this task + llm->config.lg_hlr = runtime; + llm->config.lg_ctx = ctx; + // Update the lg_hlr and lg_ctx of all SSMs' FFConfig + // since all future legion tasks are launched in this task + for (size_t i = 0; i < rm->get_num_ssms(); i++) { + FFModel *ssm = rm->get_ssm_model(i); + ssm->config.lg_hlr = runtime; + ssm->config.lg_ctx = ctx; + } + } if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding - return rm->generate_incr_decoding(this, prompts, max_seq_length); + rm->serve_incr_decoding(llm); } else { // Registered SSMs: perform speculative inference - return rm->generate_spec_infer(this, prompts, max_seq_length); + rm->serve_spec_infer(llm); } } /*static*/ -GenerationResult RequestManager::generate_incr_decoding( - FFModel *llm, std::vector &prompts, int max_seq_length) { +void RequestManager::serve_incr_decoding(FFModel *llm) { + Context ctx = llm->config.lg_ctx; + Runtime *runtime = llm->config.lg_hlr; + // Compile the llm InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid; - for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length); - } - - if (guid == 0) { - std::cout - << "=========== Discard request exceed prompt maximum... 
===========" - << std::endl; - return GenerationResult(); + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights(llm); + // init operators + im->init_operators_inference(llm); + // Legion futures for inc_decoding and spec_infer + BatchConfigFuture last_bcf; + InferenceResultFuture last_irf; + { + // Initialize futures for incr decoding + BatchConfig bc; + InferenceResult ir; + last_bcf = Future::from_value(bc); + last_irf = Future::from_value(ir); } - int tokens_to_generate = max_seq_length - all_requests[guid].tokens.size(); std::queue> batch_pipeline; { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); } - while (!is_request_completed(guid)) { + + while (!is_background_server_terminated()) { + if (batch_pipeline.size() >= 4) { // Block here to avoid launching too many batches auto const &batch = batch_pipeline.front(); @@ -2324,15 +2396,10 @@ GenerationResult RequestManager::generate_incr_decoding( break; } } - if (is_request_completed(guid)) { - break; - } - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); runtime->begin_trace(ctx, 12346 /*trace_id*/); auto const &next_batch = batch_pipeline.back(); BatchConfigFuture bcf = - prepare_next_batch(next_batch.first, next_batch.second); + prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); @@ -2341,30 +2408,51 @@ GenerationResult RequestManager::generate_incr_decoding( last_irf = irf; runtime->end_trace(ctx, 12346 /*trace_id*/); } - GenerationResult gr = get_generation_result(guid); - // assert(gr.output_tokens.size() >= max_seq_length); - return gr; } /*static*/ -GenerationResult RequestManager::generate_spec_infer( - FFModel *llm, std::vector &prompts, int max_seq_length) { +void RequestManager::serve_spec_infer(FFModel *llm) { + Context ctx = llm->config.lg_ctx; + Runtime *runtime = llm->config.lg_hlr; InferenceManager *im = InferenceManager::get_inference_manager(); - RequestGuid guid; - for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length); + { + // Compile the llm + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights(llm); + // init operators + im->init_operators_inference(llm); } - if (guid == 0) { - std::cout - << "=========== Discard request exceed prompt maximum... 
===========" - << std::endl; - return GenerationResult(); + for (size_t i = 0; i < get_num_ssms(); i++) { + // Compile the i-th ssm + FFModel *ssm = get_ssm_model(i); + im->compile_model_and_allocate_buffer(ssm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[ssm]->load_weights(ssm); + // init operators + im->init_operators_inference(ssm); } std::queue> batch_pipeline; + // Legion futures for inc_decoding and spec_infer + TreeVerifyBatchConfigFuture last_tree_bcf; + InferenceResultFuture last_tree_irf; + { + // Initialize futures for spec infer + TreeVerifyBatchConfig tree_bc; + InferenceResult tree_ir; + last_tree_bcf = Future::from_value(tree_bc); + last_tree_irf = Future::from_value(tree_ir); + } batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf)); - while (!is_request_completed(guid)) { + + while (!is_background_server_terminated()) { + if (batch_pipeline.size() >= 4) { // Block here to avoid launching too many batches auto const &batch = batch_pipeline.front(); @@ -2380,17 +2468,12 @@ GenerationResult RequestManager::generate_spec_infer( } } auto const &next_batch = batch_pipeline.back(); - BeamSearchBatchConfigFuture beam_bcf = - prepare_next_batch_init(next_batch.first, next_batch.second, 0); + BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init( + next_batch.first, next_batch.second, 0, ctx, runtime); std::vector beam_bcf_vec(get_num_ssms()); for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) { beam_bcf_vec[ssm_id] = beam_bcf; } - // if (is_request_completed(guid)) { - // break; - // } - Runtime *runtime = Runtime::get_runtime(); - Context ctx = Runtime::get_context(); runtime->begin_trace(ctx, 12345 /*trace_id*/); for (size_t i = 0; i < get_num_ssms(); i++) { @@ -2398,16 +2481,17 @@ GenerationResult RequestManager::generate_spec_infer( depth++) { beam_bcf = beam_bcf_vec[i]; - FutureMap fm = im->inference(get_model(i), 0, beam_bcf_vec[i]); + FutureMap fm = im->inference(get_ssm_model(i), 0, beam_bcf_vec[i]); assert(fm.get_future_map_domain().get_volume() == 1); BeamInferenceResultFuture beam_irf = fm.get_future(0); - beam_bcf_vec[i] = prepare_next_batch_beam(beam_bcf_vec[i], beam_irf); + beam_bcf_vec[i] = + prepare_next_batch_beam(beam_bcf_vec[i], beam_irf, ctx, runtime); } } // Token Tree Verification { TreeVerifyBatchConfigFuture tree_bcf = - prepare_next_batch_verify(beam_bcf_vec); + prepare_next_batch_verify(beam_bcf_vec, ctx, runtime); FutureMap fm = im->inference(llm, 0, tree_bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture tree_irf = fm.get_future(0); @@ -2417,10 +2501,34 @@ GenerationResult RequestManager::generate_spec_infer( } runtime->end_trace(ctx, 12345 /*trace_id*/); } +} + +void RequestManager::trigger_request_completion_future( + RequestGuid const &guid) { + const std::lock_guard lock(request_to_promise_mutex); + assert(request_to_promise.find(guid) != request_to_promise.end()); + // Set the completion promise in case other threads are waiting + request_to_promise[guid]->set_value(); +} + +/*static*/ +void RequestManager::terminate_background_server_at_exit() { + RequestManager *rm = RequestManager::get_request_manager(); + rm->terminate_background_server(); +} + +void RequestManager::terminate_background_server() { + if (request_manager_status == SERVING) { + request_manager_status = TERMINATED; + // Wait for the background server to terminate + Runtime *runtime = Runtime::get_runtime(); + Context ctx = 
Runtime::get_context(); + background_server_handler.get_void_result(); + } +} - GenerationResult gr = get_generation_result(guid); - // assert(gr.output_tokens.size() >= max_seq_length); - return gr; +bool RequestManager::is_background_server_terminated() { + return request_manager_status == TERMINATED; } RequestManager *request_manager_singleton = nullptr; From 18cd4850229e1fe29778d6383ee3f7175668a093 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 14 Jan 2024 07:12:32 -0800 Subject: [PATCH 43/61] Update README.md --- .github/README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/README.md b/.github/README.md index 0972135504..4a2a881c8d 100644 --- a/.github/README.md +++ b/.github/README.md @@ -102,10 +102,11 @@ llm.compile(generation_config, max_tokens_per_batch = 128, ssms=ssms) ``` -Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult`, which include the output tokens and text. +Next, we call `llm.start_server()` to start an LLM server running on a seperate background thread, which allows users to perform computations in parallel with LLM serving. Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult`, which include the output tokens and text. After all serving requests are processed, you can either call `llm.stop_server()` to terminate the background thread or directly exit the python program, which will automatically terminate the background server thread. ```python -with llm: - result = llm.generate("Here are some travel tips for Tokyo:\n") +llm.start_server() +result = llm.generate("Here are some travel tips for Tokyo:\n") +llm.stop_server() # This invocation is optional ``` ### Incremental decoding @@ -140,8 +141,9 @@ llm.compile(generation_config, max_tokens_per_batch = 128) # Generation begins! 
-with llm: - result = llm.generate("Here are some travel tips for Tokyo:\n") +llm.start_server() +result = llm.generate("Here are some travel tips for Tokyo:\n") +llm.stop_server() # This invocation is optional ``` From 75edadcbaf65fc4cea83eea91de73719ed5a4959 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 19 Jan 2024 23:21:31 -0500 Subject: [PATCH 44/61] Better debugging/logging tools for alignment checks (#1275) * only stop server if rm is initialized * fix * better logging * pass layer names to ops * add debugging functionality to hf script * fix * fixes * fix * fix --------- Co-authored-by: Ubuntu --- examples/python/keras/seq_cifar10_cnn.py | 2 +- include/flexflow/operator.h | 125 +++++++++++++++++- .../ops/add_bias_residual_layer_norm_params.h | 1 + include/flexflow/ops/aggregate_params.h | 1 + include/flexflow/ops/aggregate_spec_params.h | 1 + include/flexflow/ops/arg_topk_params.h | 1 + include/flexflow/ops/argmax_params.h | 1 + include/flexflow/ops/attention_params.h | 1 + include/flexflow/ops/batch_matmul_params.h | 1 + include/flexflow/ops/beam_topk_params.h | 1 + include/flexflow/ops/cast_params.h | 1 + include/flexflow/ops/concat_params.h | 2 +- include/flexflow/ops/conv_2d_params.h | 1 + include/flexflow/ops/dropout_params.h | 1 + include/flexflow/ops/element_binary_params.h | 1 + include/flexflow/ops/element_unary_params.h | 1 + include/flexflow/ops/embedding_params.h | 1 + include/flexflow/ops/experts_params.h | 1 + include/flexflow/ops/flat_params.h | 1 + include/flexflow/ops/gather_params.h | 1 + include/flexflow/ops/groupby_params.h | 1 + .../ops/inc_multihead_self_attention_params.h | 1 + include/flexflow/ops/layer_norm_params.h | 1 + include/flexflow/ops/linear_params.h | 1 + include/flexflow/ops/pool_2d_params.h | 1 + include/flexflow/ops/reduce_params.h | 1 + include/flexflow/ops/reshape_params.h | 1 + .../flexflow/ops/residual_layer_norm_params.h | 1 + .../flexflow/ops/residual_rms_norm_params.h | 1 + include/flexflow/ops/rms_norm_params.h | 1 + include/flexflow/ops/sampling_params.h | 1 + .../flexflow/ops/sigmoid_silu_multi_params.h | 1 + include/flexflow/ops/softmax.h | 6 + include/flexflow/ops/softmax_params.h | 2 + ...spec_inc_multihead_self_attention_params.h | 2 +- include/flexflow/ops/split_params.h | 1 + include/flexflow/ops/topk_params.h | 1 + include/flexflow/ops/transpose_params.h | 1 + ...tree_inc_multihead_self_attention_params.h | 1 + .../flexflow/parallel_ops/allreduce_params.h | 1 + .../flexflow/parallel_ops/combine_params.h | 1 + .../parallel_ops/fused_parallel_op_params.h | 1 + .../flexflow/parallel_ops/partition_params.h | 1 + .../flexflow/parallel_ops/reduction_params.h | 1 + .../flexflow/parallel_ops/replicate_params.h | 1 + inference/utils/download_hf_model.py | 4 +- python/flexflow/serve/serve.py | 31 +++-- src/ops/add_bias_residual_layer_norm.cc | 12 +- src/ops/aggregate.cc | 13 +- src/ops/aggregate_spec.cc | 3 + src/ops/arg_topk.cc | 12 +- src/ops/argmax.cc | 12 +- src/ops/attention.cc | 5 +- src/ops/batch_matmul.cc | 9 +- src/ops/beam_topk.cc | 10 +- src/ops/cast.cc | 8 +- src/ops/concat.cc | 2 +- src/ops/conv_2d.cc | 9 +- src/ops/dropout.cc | 9 +- src/ops/element_binary.cc | 9 +- src/ops/element_unary.cc | 11 +- src/ops/embedding.cc | 2 +- src/ops/experts.cc | 9 +- src/ops/flat.cc | 7 + src/ops/fused.cu | 7 +- src/ops/gather.cc | 9 +- src/ops/group_by.cc | 20 ++- src/ops/inc_multihead_self_attention.cc | 5 +- src/ops/layer_norm.cc | 12 +- src/ops/linear.cc | 12 +- src/ops/pool_2d.cc | 9 +- src/ops/reduce.cc | 18 ++- 
src/ops/reshape.cc | 12 +- src/ops/residual_layer_norm.cc | 12 +- src/ops/residual_rms_norm.cc | 12 +- src/ops/rms_norm.cc | 12 +- src/ops/sampling.cc | 12 +- src/ops/sigmoid_silu_multi.cc | 12 +- src/ops/softmax.cc | 48 ++++++- src/ops/spec_inc_multihead_self_attention.cc | 5 +- src/ops/split.cc | 5 +- src/ops/topk.cc | 12 +- src/ops/transpose.cc | 11 +- src/ops/tree_inc_multihead_self_attention.cc | 5 +- src/parallel_ops/allreduce.cc | 5 +- src/parallel_ops/combine.cc | 5 +- src/parallel_ops/fused_parallel_op.cc | 3 + src/parallel_ops/partition.cc | 5 +- src/parallel_ops/reduction.cc | 5 +- src/parallel_ops/replicate.cc | 5 +- src/runtime/cuda_helper.cu | 24 +++- src/runtime/graph.cc | 93 +++++++++++-- src/runtime/hip_helper.cpp | 24 +++- src/runtime/operator.cc | 101 -------------- src/runtime/substitution.cc | 9 +- tests/inference/huggingface_inference.py | 52 +++++++- 96 files changed, 746 insertions(+), 190 deletions(-) diff --git a/examples/python/keras/seq_cifar10_cnn.py b/examples/python/keras/seq_cifar10_cnn.py index 281a09ed70..66ea8530e0 100644 --- a/examples/python/keras/seq_cifar10_cnn.py +++ b/examples/python/keras/seq_cifar10_cnn.py @@ -56,7 +56,7 @@ def top_level_task(): if __name__ == "__main__": - print("Sequantial model, cifar10 cnn") + print("Sequential model, cifar10 cnn") configs = ff.get_configs() ff.init_flexflow_runtime(configs) top_level_task() diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index fd21436681..73c2c3e092 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -9,6 +9,14 @@ #include "flexflow/utils/dot/record_formatter.h" #include +#include +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + namespace FlexFlow { extern LegionRuntime::Logger::Category log_measure; @@ -227,13 +235,126 @@ class Op { assert(false); }; virtual void print_layer(FFModel const &model) = 0; + template + static std::string get_op_name_without_uid(OpMetaType *m) { + std::string op_name_without_uid = std::string(m->op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); + return op_name_without_uid; + } + template static void save_inference_tensors_to_file( - OpMeta *m, + OpMetaType *m, int shard_id, BatchConfig const *bc, std::vector input_tensors, std::vector weight_tensors, - std::vector output_tensors); + std::vector output_tensors, + bool before_kernel = false) { + // Check if output directory exists, and create it if it does not + char const *folder_path = "./inference_tensors"; + struct stat st = {0}; + if (stat(folder_path, &st) == -1) { + // Directory does not exist, create it + mkdir(folder_path, 0700); + } + // output base filepath, shared by all tensors from the same operator + std::string op_name_without_uid = get_op_name_without_uid(m); + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + + op_name_without_uid + "_shard-id_" + std::to_string(shard_id); + if (before_kernel) { + base_filepath += "_pre"; + } + // save batch config, if passed + if (bc != nullptr) 
{ + bc->save_to_file(base_filepath + "_batch-config"); + } + // save all inputs + for (int i = 0; i < input_tensors.size(); i++) { + std::string filename = base_filepath + "_input_" + std::to_string(i); + if (input_tensors[i].data_type == DT_FLOAT) { + save_tensor(input_tensors[i].get_float_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_HALF) { + save_tensor(input_tensors[i].get_half_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT32) { + save_tensor(input_tensors[i].get_int32_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT64) { + save_tensor(input_tensors[i].get_int64_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // only dump the weights once + if (m->decoding_step == 0) { + for (int i = 0; i < weight_tensors.size(); i++) { + std::string filename = base_filepath + "_weight_" + std::to_string(i); + if (weight_tensors[i].data_type == DT_FLOAT) { + save_tensor(weight_tensors[i].get_float_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_HALF) { + save_tensor(weight_tensors[i].get_half_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT32) { + save_tensor(weight_tensors[i].get_int32_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT64) { + save_tensor(weight_tensors[i].get_int64_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + } + // save all outputs + for (int i = 0; i < output_tensors.size(); i++) { + std::string filename = base_filepath + "_output_" + std::to_string(i); + if (output_tensors[i].data_type == DT_FLOAT) { + save_tensor(output_tensors[i].get_float_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_HALF) { + save_tensor(output_tensors[i].get_half_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT32) { + save_tensor(output_tensors[i].get_int32_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT64) { + save_tensor(output_tensors[i].get_int64_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // increase count of decoding steps + if (!before_kernel) { + m->decoding_step++; + } + } virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const = 0; diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 6f49983467..87fe2fb562 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/aggregate_params.h b/include/flexflow/ops/aggregate_params.h index f746881d89..deaa04b3e7 100644 --- 
a/include/flexflow/ops/aggregate_params.h +++ b/include/flexflow/ops/aggregate_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct AggregateParams { int n; float lambda_bal; + char name[MAX_OPNAME]; bool is_valid(std::vector const &) const; }; bool operator==(AggregateParams const &, AggregateParams const &); diff --git a/include/flexflow/ops/aggregate_spec_params.h b/include/flexflow/ops/aggregate_spec_params.h index eb662f4c07..69e8574cba 100644 --- a/include/flexflow/ops/aggregate_spec_params.h +++ b/include/flexflow/ops/aggregate_spec_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct AggregateSpecParams { int n; float lambda_bal; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(AggregateSpecParams const &, AggregateSpecParams const &); diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h index bd9c38e2a9..b2876c011f 100644 --- a/include/flexflow/ops/arg_topk_params.h +++ b/include/flexflow/ops/arg_topk_params.h @@ -12,6 +12,7 @@ struct ArgTopKParams { int k; bool sorted; bool speculative_decoding; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(ArgTopKParams const &, ArgTopKParams const &); diff --git a/include/flexflow/ops/argmax_params.h b/include/flexflow/ops/argmax_params.h index a8f629619f..9ddb8e1fe3 100644 --- a/include/flexflow/ops/argmax_params.h +++ b/include/flexflow/ops/argmax_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct ArgMaxParams { bool beam_search; bool is_valid(ParallelTensorShape const &) const; + char name[MAX_OPNAME]; }; bool operator==(ArgMaxParams const &, ArgMaxParams const &); diff --git a/include/flexflow/ops/attention_params.h b/include/flexflow/ops/attention_params.h index b72923a65c..89906407d3 100644 --- a/include/flexflow/ops/attention_params.h +++ b/include/flexflow/ops/attention_params.h @@ -11,6 +11,7 @@ struct MultiHeadAttentionParams { int embed_dim, num_heads, kdim, vdim; float dropout; bool bias, add_bias_kv, add_zero_attn; + char name[MAX_OPNAME]; bool is_valid(std::tuple const &) const; }; diff --git a/include/flexflow/ops/beam_topk_params.h b/include/flexflow/ops/beam_topk_params.h index 430f16e249..3e09848c9a 100644 --- a/include/flexflow/ops/beam_topk_params.h +++ b/include/flexflow/ops/beam_topk_params.h @@ -11,6 +11,7 @@ struct BeamTopKParams { LayerID layer_guid; bool sorted; int max_beam_width; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(BeamTopKParams const &, BeamTopKParams const &); diff --git a/include/flexflow/ops/cast_params.h b/include/flexflow/ops/cast_params.h index efef3de890..38a69e8a69 100644 --- a/include/flexflow/ops/cast_params.h +++ b/include/flexflow/ops/cast_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct CastParams { DataType dtype; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(CastParams const &, CastParams const &); diff --git a/include/flexflow/ops/concat_params.h b/include/flexflow/ops/concat_params.h index 2987b25424..b1a7e74c55 100644 --- a/include/flexflow/ops/concat_params.h +++ b/include/flexflow/ops/concat_params.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ConcatParams { int axis; - + char name[MAX_OPNAME]; bool is_valid(std::vector const &) const; }; diff --git a/include/flexflow/ops/conv_2d_params.h b/include/flexflow/ops/conv_2d_params.h index 9aac91e315..562d5adef9 100644 --- a/include/flexflow/ops/conv_2d_params.h +++ b/include/flexflow/ops/conv_2d_params.h @@ 
-13,6 +13,7 @@ struct Conv2DParams { padding_w, groups; ActiMode activation; bool use_bias; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &input) const; void solve_dims(ParallelTensorShape const &input, diff --git a/include/flexflow/ops/dropout_params.h b/include/flexflow/ops/dropout_params.h index 61aee12f9f..eb1a4d98cf 100644 --- a/include/flexflow/ops/dropout_params.h +++ b/include/flexflow/ops/dropout_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct DropoutParams { float rate; unsigned long long seed; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(DropoutParams const &, DropoutParams const &); diff --git a/include/flexflow/ops/element_binary_params.h b/include/flexflow/ops/element_binary_params.h index 9489b793a7..bfbb758b6e 100644 --- a/include/flexflow/ops/element_binary_params.h +++ b/include/flexflow/ops/element_binary_params.h @@ -11,6 +11,7 @@ struct ElementBinaryParams { LayerID layer_guid; OperatorType type; bool inplace_a; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git a/include/flexflow/ops/element_unary_params.h b/include/flexflow/ops/element_unary_params.h index 1aac85c43e..16cb015e3c 100644 --- a/include/flexflow/ops/element_unary_params.h +++ b/include/flexflow/ops/element_unary_params.h @@ -12,6 +12,7 @@ struct ElementUnaryParams { bool inplace; float scalar = 0.0; LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/embedding_params.h b/include/flexflow/ops/embedding_params.h index 71e5cc8b20..d813132048 100644 --- a/include/flexflow/ops/embedding_params.h +++ b/include/flexflow/ops/embedding_params.h @@ -12,6 +12,7 @@ struct EmbeddingParams { LayerID layer_guid; AggrMode aggr; DataType data_type; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/experts_params.h b/include/flexflow/ops/experts_params.h index 7adced3c8c..90cce47526 100644 --- a/include/flexflow/ops/experts_params.h +++ b/include/flexflow/ops/experts_params.h @@ -17,6 +17,7 @@ struct ExpertsParams { int experts_internal_dim_size; bool use_bias; ActiMode activation; + char name[MAX_OPNAME]; bool is_valid(std::vector const &) const; }; diff --git a/include/flexflow/ops/flat_params.h b/include/flexflow/ops/flat_params.h index 5f821b0416..fc006849e5 100644 --- a/include/flexflow/ops/flat_params.h +++ b/include/flexflow/ops/flat_params.h @@ -7,6 +7,7 @@ namespace FlexFlow { struct FlatParams { + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; void solve_dims(ParallelTensorShape const &input, ParallelDim output_dims[MAX_TENSOR_DIM], diff --git a/include/flexflow/ops/gather_params.h b/include/flexflow/ops/gather_params.h index 51f1184a72..de27cdfc7c 100644 --- a/include/flexflow/ops/gather_params.h +++ b/include/flexflow/ops/gather_params.h @@ -10,6 +10,7 @@ namespace FlexFlow { struct GatherParams { int legion_dim; LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; }; diff --git a/include/flexflow/ops/groupby_params.h b/include/flexflow/ops/groupby_params.h index 24a74f5412..4f6245863a 100644 --- a/include/flexflow/ops/groupby_params.h +++ b/include/flexflow/ops/groupby_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct Group_byParams { int n; float alpha; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h 
b/include/flexflow/ops/inc_multihead_self_attention_params.h index 7ae39f1cfe..58681069e2 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -16,6 +16,7 @@ struct IncMultiHeadSelfAttentionParams { scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/layer_norm_params.h b/include/flexflow/ops/layer_norm_params.h index c9aa40048d..3effce6204 100644 --- a/include/flexflow/ops/layer_norm_params.h +++ b/include/flexflow/ops/layer_norm_params.h @@ -12,6 +12,7 @@ struct LayerNormParams { bool elementwise_affine; float eps; bool use_bias; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/linear_params.h b/include/flexflow/ops/linear_params.h index 563304e89f..9a62ebd857 100644 --- a/include/flexflow/ops/linear_params.h +++ b/include/flexflow/ops/linear_params.h @@ -20,6 +20,7 @@ class LinearParams { float kernel_reg_lambda; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &input_shape) const; void solve_dims(const ParallelTensor input, diff --git a/include/flexflow/ops/pool_2d_params.h b/include/flexflow/ops/pool_2d_params.h index 7d4f1f1c12..54af7f9db6 100644 --- a/include/flexflow/ops/pool_2d_params.h +++ b/include/flexflow/ops/pool_2d_params.h @@ -10,6 +10,7 @@ struct Pool2DParams { int kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w; PoolType pool_type; ActiMode activation; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &input) const; void solve_dims(ParallelTensorShape const &input, diff --git a/include/flexflow/ops/reduce_params.h b/include/flexflow/ops/reduce_params.h index b79ba9157a..478649584f 100644 --- a/include/flexflow/ops/reduce_params.h +++ b/include/flexflow/ops/reduce_params.h @@ -10,6 +10,7 @@ struct ReduceParams { std::vector axes; bool keepdims; LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/reshape_params.h b/include/flexflow/ops/reshape_params.h index ffd88948ea..15753c8e17 100644 --- a/include/flexflow/ops/reshape_params.h +++ b/include/flexflow/ops/reshape_params.h @@ -10,6 +10,7 @@ namespace FlexFlow { struct ReshapeParams { std::vector shape; LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 24da4a2c08..949ae0c799 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + char name[MAX_OPNAME]; bool is_valid(std::tuple const &) const; diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index 64751a30b0..a4e4de59ab 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; }; diff --git a/include/flexflow/ops/rms_norm_params.h b/include/flexflow/ops/rms_norm_params.h index 81295322f0..2e4ceecf48 100644 --- 
a/include/flexflow/ops/rms_norm_params.h +++ b/include/flexflow/ops/rms_norm_params.h @@ -11,6 +11,7 @@ struct RMSNormParams { LayerID layer_guid; float eps; int dim; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/sampling_params.h b/include/flexflow/ops/sampling_params.h index 1449ddbf54..ddc98a3d6c 100644 --- a/include/flexflow/ops/sampling_params.h +++ b/include/flexflow/ops/sampling_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SamplingParams { float top_p; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SamplingParams const &, SamplingParams const &); diff --git a/include/flexflow/ops/sigmoid_silu_multi_params.h b/include/flexflow/ops/sigmoid_silu_multi_params.h index c8182505b3..eb152db5c1 100644 --- a/include/flexflow/ops/sigmoid_silu_multi_params.h +++ b/include/flexflow/ops/sigmoid_silu_multi_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SigmoidSiluMultiParams { LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 6fd1a434d4..61094f7361 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -14,6 +14,7 @@ class Softmax : public Op { using Params = SoftmaxParams; using Input = ParallelTensor; Softmax(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor logit, int dim, char const *name); @@ -60,6 +61,11 @@ class Softmax : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); Params get_params() const; private: diff --git a/include/flexflow/ops/softmax_params.h b/include/flexflow/ops/softmax_params.h index d805d9966d..63dc87641f 100644 --- a/include/flexflow/ops/softmax_params.h +++ b/include/flexflow/ops/softmax_params.h @@ -6,7 +6,9 @@ namespace FlexFlow { struct SoftmaxParams { + LayerID layer_guid; int dim; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(SoftmaxParams const &, SoftmaxParams const &); diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 2f7a706bf1..1461224ba9 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -13,7 +13,7 @@ struct SpecIncMultiHeadSelfAttentionParams { float dropout, scaling_factor; bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, position_bias; - + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/split_params.h b/include/flexflow/ops/split_params.h index f0f3b2e956..e21a1ab4a1 100644 --- a/include/flexflow/ops/split_params.h +++ b/include/flexflow/ops/split_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SplitParams { std::vector splits; int legion_axis; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/topk_params.h b/include/flexflow/ops/topk_params.h index 8b9a0f1bd5..01c6ae9da7 100644 --- a/include/flexflow/ops/topk_params.h +++ b/include/flexflow/ops/topk_params.h @@ -9,6 +9,7 @@ namespace FlexFlow { struct 
TopKParams { int k; bool sorted; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(TopKParams const &, TopKParams const &); diff --git a/include/flexflow/ops/transpose_params.h b/include/flexflow/ops/transpose_params.h index 42737ee3e9..2e3e34007a 100644 --- a/include/flexflow/ops/transpose_params.h +++ b/include/flexflow/ops/transpose_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct TransposeParams { std::vector perm; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index 14fcde74ba..d1a51b8b8f 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -16,6 +16,7 @@ struct TreeIncMultiHeadSelfAttentionParams { scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/parallel_ops/allreduce_params.h b/include/flexflow/parallel_ops/allreduce_params.h index c04676ffeb..a0daac8f9a 100644 --- a/include/flexflow/parallel_ops/allreduce_params.h +++ b/include/flexflow/parallel_ops/allreduce_params.h @@ -5,6 +5,7 @@ namespace FlexFlow { struct AllReduceParams { int allreduce_legion_dim; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(AllReduceParams const &, AllReduceParams const &); diff --git a/include/flexflow/parallel_ops/combine_params.h b/include/flexflow/parallel_ops/combine_params.h index 74ef01e08f..8ca05f7f50 100644 --- a/include/flexflow/parallel_ops/combine_params.h +++ b/include/flexflow/parallel_ops/combine_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct CombineParams { int combine_legion_dim; int combine_degree; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(CombineParams const &, CombineParams const &); diff --git a/include/flexflow/parallel_ops/fused_parallel_op_params.h b/include/flexflow/parallel_ops/fused_parallel_op_params.h index cba3844a4c..8c56b30998 100644 --- a/include/flexflow/parallel_ops/fused_parallel_op_params.h +++ b/include/flexflow/parallel_ops/fused_parallel_op_params.h @@ -7,6 +7,7 @@ namespace FlexFlow { struct FusedParallelOpParams { std::vector parallel_ops; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(FusedParallelOpParams const &, FusedParallelOpParams const &); diff --git a/include/flexflow/parallel_ops/partition_params.h b/include/flexflow/parallel_ops/partition_params.h index 921ab43eaf..33ccf6b02c 100644 --- a/include/flexflow/parallel_ops/partition_params.h +++ b/include/flexflow/parallel_ops/partition_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct RepartitionParams { int repartition_legion_dim; int repartition_degree; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(RepartitionParams const &, RepartitionParams const &); diff --git a/include/flexflow/parallel_ops/reduction_params.h b/include/flexflow/parallel_ops/reduction_params.h index fab7da2626..60b6c4f6aa 100644 --- a/include/flexflow/parallel_ops/reduction_params.h +++ b/include/flexflow/parallel_ops/reduction_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct ReductionParams { int reduction_legion_dim; int reduction_degree; + char name[MAX_OPNAME]; bool 
is_valid(ParallelTensorShape const &) const; }; bool operator==(ReductionParams const &, ReductionParams const &); diff --git a/include/flexflow/parallel_ops/replicate_params.h b/include/flexflow/parallel_ops/replicate_params.h index 06edbc1ddc..da1f94217c 100644 --- a/include/flexflow/parallel_ops/replicate_params.h +++ b/include/flexflow/parallel_ops/replicate_params.h @@ -6,6 +6,7 @@ namespace FlexFlow { struct ReplicateParams { int replicate_legion_dim; int replicate_degree; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; bool operator==(ReplicateParams const &, ReplicateParams const &); diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 03fc8e1633..94a8c23e68 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -36,9 +36,9 @@ def parse_args(): def main(args): if args.full_precision_only: - data_types = ff.DataType.DT_FLOAT + data_types = (ff.DataType.DT_FLOAT,) elif args.half_precision_only: - data_types = ff.DataType.DT_HALF + data_types = (ff.DataType.DT_HALF,) else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index d1a935e5fc..5c3cac9303 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -117,10 +117,11 @@ def __init__( self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file + self.rm = None def __del__(self): # Stop the background server before deleting the object - if type(self) == LLM: + if type(self) == LLM and self.rm is not None: self.rm.stop_server() def __get_ff_model_type(self): @@ -320,9 +321,9 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - #self.max_requests_per_batch = max_requests_per_batch - #self.max_seq_length = max_seq_length - #self.max_tokens_per_batch = max_tokens_per_batch + # self.max_requests_per_batch = max_requests_per_batch + # self.max_seq_length = max_seq_length + # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -362,7 +363,7 @@ def compile( self.ffconfig, self.hf_config, self.data_type, - max_tokens_per_batch + max_tokens_per_batch, ) # Download the weights from huggingface (if needed) @@ -378,7 +379,7 @@ def compile( model_configs.hidden_size, model_configs.hidden_size // model_configs.num_attention_heads, self.ffconfig.tensor_parallelism_degree, - self.data_type == DataType.DT_FLOAT + self.data_type == DataType.DT_FLOAT, ) # Register weights file loader @@ -404,8 +405,11 @@ def compile( self.rm.register_ssm_model(ssm.model.ffmodel) # start background server - if (mode == InferenceMode.TREE_VERIFY_MODE) or (mode == InferenceMode.INC_DECODING_MODE): + if (mode == InferenceMode.TREE_VERIFY_MODE) or ( + mode == InferenceMode.INC_DECODING_MODE + ): import atexit + atexit.register(self.rm.stop_server) def generate(self, prompts: Union[str, List[str]], max_length: int = 128): @@ -426,26 +430,27 @@ def generate(self, prompts: Union[str, List[str]], max_length: int = 128): return self.model.ffmodel.generate(prompts, max_length) else: assert False, "Please pass a non-empty string or list of strings" - + def start_server(self): self.rm.start_server(self.model.ffmodel) print("Background server started.") - + def stop_server(self): self.rm.stop_server() - 
print("Background server stoped.") - + print("Background server stopped.") + def __enter__(self): # Start the server when entering the context - #self.rm.start_server(self.model.ffmodel) + # self.rm.start_server(self.model.ffmodel) return self def __exit__(self, exc_type, exc_value, traceback): # Stop the server when exiting the context - #self.rm.stop_server() + # self.rm.stop_server() if exc_type: print(f"Exception occurred: {exc_value}") + class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 42fbb3016a..e670380901 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -58,6 +58,9 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -213,7 +216,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( FFModel &model, @@ -755,6 +758,8 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -783,6 +788,10 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); AddBiasResidualLayerNormParams params; params.layer_guid = layer_guid; @@ -790,6 +799,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 67810d3f5b..5f05458e34 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -85,6 +85,9 @@ AggregateParams Aggregate::get_params() const { AggregateParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -164,7 +167,8 @@ Aggregate::Aggregate(FFModel &model, AggregateParams const ¶ms, std::vector const &inputs, char const *name) - : Aggregate(model, inputs.data(), params.n, params.lambda_bal, name) {} + : Aggregate( + model, inputs.data(), params.n, params.lambda_bal, params.name) {} using PCG::Node; Node Aggregate::deserialize(FFModel &ff, @@ -175,10 +179,15 @@ Node Aggregate::deserialize(FFModel &ff, float lambda_bal; dez.deserialize(n); dez.deserialize(lambda_bal); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); assert(num_inputs == n + 4); AggregateParams params; params.n = n; params.lambda_bal = lambda_bal; + strcpy(params.name, name); return ff.get_or_create_node(inputs, params); } @@ -567,6 +576,8 @@ void Aggregate::backward_task(Task const *task, void Aggregate::serialize(Legion::Serializer &sez) const { sez.serialize(this->n); sez.serialize(this->lambda_bal); + 
sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } bool Aggregate::measure_operator_cost(Simulator *sim, diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 19b2edc14a..1edd430881 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -84,6 +84,9 @@ AggregateSpecParams AggregateSpec::get_params() const { AggregateSpecParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 2727a1d249..780a77450e 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -112,6 +112,9 @@ ArgTopKParams ArgTopK::get_params() const { params.k = this->k; params.sorted = this->sorted; params.speculative_decoding = this->speculative_decoding; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -183,7 +186,7 @@ ArgTopK::ArgTopK(FFModel &model, params.k, params.sorted, params.speculative_decoding, - name) {} + params.name) {} void ArgTopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -446,6 +449,8 @@ void ArgTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->k); sez.serialize(this->sorted); sez.serialize(this->speculative_decoding); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node ArgTopK::deserialize(FFModel &ff, @@ -464,11 +469,16 @@ Node ArgTopK::deserialize(FFModel &ff, dez.deserialize(k); dez.deserialize(sorted); dez.deserialize(speculative_decoding); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ArgTopKParams params; params.layer_guid = layer_guid; params.k = k; params.sorted = sorted; params.speculative_decoding = speculative_decoding; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index dc7e4ea3b3..a52ce1886b 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -91,6 +91,9 @@ Op *ArgMax::create_operator_from_layer( ArgMaxParams ArgMax::get_params() const { ArgMaxParams params; params.beam_search = this->beam_search; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -140,7 +143,7 @@ ArgMax::ArgMax(FFModel &model, ArgMaxParams const ¶ms, const ParallelTensor input, char const *name) - : ArgMax(model, input, params.beam_search, name) {} + : ArgMax(model, input, params.beam_search, params.name) {} void ArgMax::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -410,6 +413,8 @@ void ArgMax::backward(FFModel const &ff) { void ArgMax::serialize(Legion::Serializer &sez) const { sez.serialize(this->beam_search); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node ArgMax::deserialize(FFModel &ff, @@ -419,8 +424,13 @@ Node ArgMax::deserialize(FFModel &ff, assert(num_inputs == 1); bool beam_search; dez.deserialize(beam_search); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ArgMaxParams params; params.beam_search = beam_search; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 1f71be07a8..97afc94341 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -363,7 +363,7 @@ MultiHeadAttention::MultiHeadAttention( params.add_bias_kv, 
params.add_zero_attn, allocate_weights, - name) {} + params.name) {} void MultiHeadAttention::init_inference( FFModel const &ff, @@ -1013,6 +1013,9 @@ MultiHeadAttentionParams MultiHeadAttention::get_params() const { params.bias = this->bias; params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc index f4b06877e5..e13169f6c1 100644 --- a/src/ops/batch_matmul.cc +++ b/src/ops/batch_matmul.cc @@ -138,7 +138,7 @@ BatchMatmul::BatchMatmul( inputs.second, params.a_seq_length_dim, params.b_seq_length_dim, - name) {} + params.name) {} // return A*B BatchMatmul::BatchMatmul(FFModel &model, @@ -190,6 +190,8 @@ void BatchMatmul::serialize(Legion::Serializer &sez) const { BatchMatmulParams params = get_params(); sez.serialize(params.a_seq_length_dim); sez.serialize(params.b_seq_length_dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -202,10 +204,15 @@ Node BatchMatmul::deserialize(FFModel &ff, int a_seq_length_dim, b_seq_length_dim; dez.deserialize(a_seq_length_dim); dez.deserialize(b_seq_length_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); BatchMatmulParams params; params.a_seq_length_dim = a_seq_length_dim; params.b_seq_length_dim = b_seq_length_dim; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 18d0ec1587..d2054cacb0 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -170,7 +170,7 @@ BeamTopK::BeamTopK(FFModel &model, params.layer_guid, params.max_beam_width, params.sorted, - name) {} + params.name) {} void BeamTopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -418,6 +418,8 @@ void BeamTopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->sorted); sez.serialize(this->max_beam_width); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node BeamTopK::deserialize(FFModel &ff, @@ -434,10 +436,16 @@ Node BeamTopK::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(sorted); dez.deserialize(max_beam_width); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + BeamTopKParams params; params.layer_guid = layer_guid; params.sorted = sorted; params.max_beam_width = max_beam_width; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/cast.cc b/src/ops/cast.cc index 2a845cb303..e514236a31 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -112,7 +112,7 @@ Cast::Cast(FFModel &model, CastParams const ¶ms, ParallelTensor const &input, char const *name) - : Cast(model, input, params.dtype, name) {} + : Cast(model, input, params.dtype, params.name) {} void Cast::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -409,6 +409,8 @@ bool Cast::measure_operator_cost(Simulator *sim, void Cast::serialize(Legion::Serializer &sez) const { sez.serialize(this->outputs[0]->data_type); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -420,6 +422,10 @@ Node Cast::deserialize(FFModel &ff, assert(num_inputs == 1); DataType 
dtype; dez.deserialize(dtype); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); return ff.get_or_create_node(inputs[0], {dtype}); } diff --git a/src/ops/concat.cc b/src/ops/concat.cc index 80935e387b..d4d8e525fc 100644 --- a/src/ops/concat.cc +++ b/src/ops/concat.cc @@ -147,7 +147,7 @@ Concat::Concat(FFModel &model, ConcatParams const ¶ms, std::vector const &inputs, char const *name) - : Concat(model, inputs.size(), inputs.data(), params.axis, name) {} + : Concat(model, inputs.size(), inputs.data(), params.axis, params.name) {} void Concat::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index 7d8fd32570..94850a178d 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -389,7 +389,7 @@ Conv2D::Conv2D(FFModel &model, params.groups, params.use_bias, allocate_weights, - name) {} + params.name) {} bool Conv2DParams::is_valid(ParallelTensorShape const &input) const { ParallelTensorShape output_shape, kernel_shape, bias_shape; @@ -1026,6 +1026,8 @@ void Conv2D::serialize(Legion::Serializer &sez) const { sez.serialize(this->groups); sez.serialize(this->use_bias); sez.serialize(this->activation); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -1055,6 +1057,10 @@ Node Conv2D::deserialize(FFModel &ff, dez.deserialize(groups); dez.deserialize(use_bias); dez.deserialize(activation); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); Conv2DParams params; params.layer_guid = layer_guid; @@ -1068,6 +1074,7 @@ Node Conv2D::deserialize(FFModel &ff, params.groups = groups; params.use_bias = use_bias; params.activation = activation; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 9b11c9d912..58cb82d53d 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -118,7 +118,7 @@ Dropout::Dropout(FFModel &model, DropoutParams const ¶ms, const ParallelTensor input, char const *name) - : Dropout(model, input, params.rate, params.seed, name) {} + : Dropout(model, input, params.rate, params.seed, params.name) {} void Dropout::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -272,6 +272,8 @@ void Dropout::backward_task(Task const *task, void Dropout::serialize(Legion::Serializer &sez) const { sez.serialize(this->rate); sez.serialize(this->seed); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node Dropout::deserialize(FFModel &ff, @@ -283,9 +285,14 @@ Node Dropout::deserialize(FFModel &ff, float rate; dez.deserialize(rate); dez.deserialize(seed); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); DropoutParams params; params.rate = rate; params.seed = seed; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 42c6487581..4352f459b9 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -252,7 +252,7 @@ ElementBinary::ElementBinary( inputs.first, inputs.second, params.inplace_a, - name) {} + params.name) {} void ElementBinary::map_output_tensors(FFModel &ff) { if (has_inplace_output()) { @@ -1128,6 +1128,8 @@ void ElementBinary::serialize(Legion::Serializer &sez) const { 
sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); sez.serialize(this->inplace_a); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -1146,11 +1148,16 @@ Node ElementBinary::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(op_type); dez.deserialize(inplace_a); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ElementBinaryParams params; params.layer_guid = layer_guid; params.type = op_type; params.inplace_a = inplace_a; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 9fb2e6dc1f..0e1d115557 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -212,7 +212,7 @@ ElementUnary::ElementUnary(FFModel &model, params.op_type, input, params.inplace, - name, + params.name, params.scalar) {} void ElementUnary::map_output_tensors(FFModel &ff) { @@ -557,7 +557,7 @@ void ElementUnary::forward_task_with_type( assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - std::vector output_accessors; + std::vector output_accessors; if (m->inplace) { GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -723,6 +723,8 @@ void ElementUnary::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } bool ElementUnary::measure_operator_cost(Simulator *sim, @@ -837,6 +839,10 @@ Node ElementUnary::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); ElementUnaryParams params; @@ -844,6 +850,7 @@ Node ElementUnary::deserialize(FFModel &ff, params.inplace = inplace; params.scalar = scalar; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 40d5b600be..e630563b63 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -243,7 +243,7 @@ Embedding::Embedding(FFModel &model, params.aggr, allocate_weights, params.data_type, - name) {} + params.name) {} Embedding::Embedding(FFModel &model, Embedding const &other, diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 6a7d622e51..8c66f9c7bc 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -260,7 +260,7 @@ Experts::Experts(FFModel &model, params.use_bias, params.activation, allocate_weights, - name) {} + params.name) {} Experts::Experts(FFModel &model, LayerID const &_layer_guid, @@ -407,6 +407,8 @@ void Experts::serialize(Legion::Serializer &sez) const { sez.serialize(params.experts_internal_dim_size); sez.serialize(params.use_bias); sez.serialize(params.activation); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -432,6 +434,10 @@ Node Experts::deserialize(FFModel &ff, dez.deserialize(experts_internal_dim_size); dez.deserialize(use_bias); 
dez.deserialize(activation); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); assert(num_inputs == 3); @@ -445,6 +451,7 @@ Node Experts::deserialize(FFModel &ff, params.experts_internal_dim_size = experts_internal_dim_size; params.use_bias = use_bias; params.activation = activation; + strcpy(params.name, name); return ff.get_or_create_node(inputs, params); } diff --git a/src/ops/flat.cc b/src/ops/flat.cc index 669c457709..80aedbbb31 100644 --- a/src/ops/flat.cc +++ b/src/ops/flat.cc @@ -16,6 +16,7 @@ #include "flexflow/ops/flat.h" #include "flexflow/model.h" #include "flexflow/ops/kernels/flat_kernels.h" +#include "legion/legion_utilities.h" namespace FlexFlow { @@ -317,6 +318,8 @@ Domain Flat::get_input_tensor_shape(ParallelConfig const &pc, } void Flat::serialize(Legion::Serializer &sez) const { + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); return; } @@ -391,6 +394,10 @@ Node Flat::deserialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) { assert(num_inputs == 1); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); return ff.get_or_create_node(inputs[0], {}); } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index c6ba0b04c5..483028599e 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -537,7 +537,7 @@ __host__ void Context ctx, Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); @@ -1097,7 +1097,7 @@ __host__ void if (metas->meta[op]->inference_debugging) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; + std::vector output_accessors_to_save; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { @@ -1114,8 +1114,7 @@ __host__ void weight_accessor[fused->op_weight_idx[i + woff]]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { - int my_off = fused->op_output_idx[i + ooff]; - output_accessors_to_save.push_back(output_accessor[my_off]); + output_accessors_to_save.push_back(output_accessor[i + ooff]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/gather.cc b/src/ops/gather.cc index d7c1dee44c..85580ed803 100644 --- a/src/ops/gather.cc +++ b/src/ops/gather.cc @@ -125,7 +125,7 @@ Gather::Gather(FFModel &model, inputs.first, inputs.second, params.legion_dim, - name) {} + params.name) {} Gather::Gather(FFModel &model, LayerID const &_layer_guid, @@ -168,6 +168,8 @@ void Gather::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -183,11 +185,16 @@ Node Gather::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, 
deserialized_model_id); GatherParams params; params.legion_dim = legion_dim; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 50871983f5..f2f402737c 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -99,6 +99,9 @@ Group_byParams Group_by::get_params() const { Group_byParams params; params.n = this->n; params.alpha = this->alpha; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -161,8 +164,12 @@ Group_by::Group_by(FFModel &model, Group_byParams const ¶ms, std::pair const &inputs, char const *name) - : Group_by( - model, inputs.first, inputs.second, params.n, params.alpha, name) {} + : Group_by(model, + inputs.first, + inputs.second, + params.n, + params.alpha, + params.name) {} void Group_by::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -396,7 +403,7 @@ void Group_by::forward_task(Task const *task, // Create a vector of n outputs, where n is the number of experts. // Each entry in the "outputs" vector points to the Legion tensor that will // contain the tockens dispatched to the corresponding expert - std::vector output_accessors; + std::vector output_accessors; float *outputs[n]; for (int i = 0; i < n; i++) { GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( @@ -522,6 +529,8 @@ void Group_by::backward_task(Task const *task, void Group_by::serialize(Legion::Serializer &sez) const { sez.serialize(this->n); sez.serialize(this->alpha); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node Group_by::deserialize(FFModel &ff, @@ -533,9 +542,14 @@ Node Group_by::deserialize(FFModel &ff, float alpha; dez.deserialize(n); dez.deserialize(alpha); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); Group_byParams params; params.n = n; params.alpha = alpha; + strcpy(params.name, name); return ff.get_or_create_node(std::make_pair(inputs[0], inputs[1]), params); } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 8a3e9c96b1..7aa3503770 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -565,7 +565,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.quantization_type, params.offload, params.tensor_parallelism_degree, - name) {} + params.name) {} void IncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -929,6 +929,9 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.quantization_type = this->quantization_type; params.offload = this->offload; params.num_kv_heads = this->num_kv_heads; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index bc1358e49c..2218ffe392 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -56,6 +56,9 @@ LayerNormParams LayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -198,7 +201,7 @@ LayerNorm::LayerNorm(FFModel &model, params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} LayerNorm::LayerNorm(FFModel &model, LayerID const &_layer_guid, @@ -883,6 +886,8 @@ void 
LayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -911,6 +916,10 @@ Node LayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerNormParams params; params.layer_guid = layer_guid; @@ -918,6 +927,7 @@ Node LayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 6ca6038778..03c9e48af8 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -190,7 +190,7 @@ Linear::Linear(FFModel &model, params.quantization_type, params.offload, allocate_weights, - name) {} + params.name) {} Linear::Linear(FFModel &model, LayerID const &_layer_guid, @@ -1258,6 +1258,8 @@ void Linear::serialize(Legion::Serializer &sez) const { sez.serialize(this->data_type); sez.serialize(this->quantization_type); sez.serialize(this->offload); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } /* static */ @@ -1288,6 +1290,10 @@ Node Linear::deserialize(FFModel &ff, dez.deserialize(data_type); dez.deserialize(quantization_type); dez.deserialize(offload); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LinearParams params; params.activation = activation; @@ -1299,6 +1305,7 @@ Node Linear::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.quantization_type = quantization_type; params.offload = offload; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } @@ -1313,6 +1320,9 @@ LinearParams Linear::get_params() const { params.kernel_reg_lambda = this->kernel_reg_lambda; params.quantization_type = this->quantization_type; params.offload = this->offload; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc index e358448ddf..4621ab5909 100644 --- a/src/ops/pool_2d.cc +++ b/src/ops/pool_2d.cc @@ -269,7 +269,7 @@ Pool2D::Pool2D(FFModel &model, params.padding_w, params.pool_type, params.activation, - name) {} + params.name) {} void Pool2D::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -521,6 +521,8 @@ void Pool2D::serialize(Legion::Serializer &sez) const { sez.serialize(this->padding_w); sez.serialize(this->pool_type); sez.serialize(this->activation); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } bool Pool2D::measure_operator_cost(Simulator *sim, @@ -657,6 +659,10 @@ Node Pool2D::deserialize(FFModel &ff, dez.deserialize(padding_w); dez.deserialize(pool_type); dez.deserialize(activation); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); Pool2DParams params; params.kernel_h = kernel_h; @@ -667,6 +673,7 @@ Node Pool2D::deserialize(FFModel &ff, params.padding_w = padding_w; params.pool_type = pool_type; params.activation = activation; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/reduce.cc b/src/ops/reduce.cc index 
7a443e6ad0..454a35caf4 100644 --- a/src/ops/reduce.cc +++ b/src/ops/reduce.cc @@ -41,6 +41,9 @@ ReduceParams Reduce::get_params() const { } params.keepdims = keepdims; params.layer_guid = this->layer_guid; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -110,9 +113,12 @@ Reduce::Reduce(FFModel &model, ReduceParams const ¶ms, const ParallelTensor input, char const *name) - : Reduce( - model, params.layer_guid, input, params.axes, params.keepdims, name) { -} + : Reduce(model, + params.layer_guid, + input, + params.axes, + params.keepdims, + params.name) {} Reduce::Reduce(FFModel &model, LayerID const &_layer_guid, @@ -378,6 +384,8 @@ void Reduce::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -400,6 +408,10 @@ Node Reduce::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); return ff.get_or_create_node(inputs[0], {axes, keepdims, layer_guid}); diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 45da190680..49f99e2cb5 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -140,7 +140,7 @@ Reshape::Reshape(FFModel &model, ReshapeParams const ¶ms, const ParallelTensor input, char const *name) - : Reshape(model, params.layer_guid, input, params.shape, name) {} + : Reshape(model, params.layer_guid, input, params.shape, params.name) {} void Reshape::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); @@ -296,6 +296,9 @@ ReshapeParams Reshape::get_params() const { ReshapeParams params; params.shape = shape_vec; params.layer_guid = this->layer_guid; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -414,6 +417,8 @@ void Reshape::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -435,11 +440,16 @@ Node Reshape::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); ReshapeParams params; params.shape = shape; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 7de40fb389..ed9252c309 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -63,6 +63,9 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -228,7 +231,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} 
ResidualLayerNorm::ResidualLayerNorm(FFModel &model, LayerID const &_layer_guid, @@ -779,6 +782,8 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -808,6 +813,10 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); if (use_two_residuals) { assert(num_inputs == 3); } else { @@ -821,6 +830,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = use_bias; params.use_two_residuals = use_two_residuals; + strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( {inputs[0], inputs[1], inputs[2]}, params); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index b447a2a3b5..f4f5bb72d0 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -55,6 +55,9 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -141,7 +144,7 @@ ResidualRMSNorm::ResidualRMSNorm( params.eps, params.dim, allocate_weights, - name) {} + params.name) {} ResidualRMSNorm::ResidualRMSNorm( FFModel &model, @@ -459,6 +462,8 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -477,10 +482,15 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ResidualRMSNormParams params; params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 2a34f83be2..bf07ee6bb0 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -53,6 +53,9 @@ RMSNormParams RMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -129,7 +132,7 @@ RMSNorm::RMSNorm(FFModel &model, params.eps, params.dim, allocate_weights, - name) {} + params.name) {} RMSNorm::RMSNorm(FFModel &model, RMSNorm const &other, @@ -437,6 +440,8 @@ void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -456,10 +461,15 @@ Node RMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); RMSNormParams params; params.layer_guid = layer_guid; params.eps = 
eps; params.dim = dim; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 463b15aadb..9fc2316f9a 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -88,6 +88,9 @@ Op *Sampling::create_operator_from_layer( SamplingParams Sampling::get_params() const { SamplingParams params; params.top_p = this->top_p; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -136,7 +139,7 @@ Sampling::Sampling(FFModel &model, SamplingParams const ¶ms, const ParallelTensor input, char const *name) - : Sampling(model, input, params.top_p, name) {} + : Sampling(model, input, params.top_p, params.name) {} void Sampling::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -325,6 +328,8 @@ void Sampling::backward(FFModel const &ff) { void Sampling::serialize(Legion::Serializer &sez) const { sez.serialize(this->top_p); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node Sampling::deserialize(FFModel &ff, @@ -334,8 +339,13 @@ Node Sampling::deserialize(FFModel &ff, assert(num_inputs == 1); float top_p; dez.deserialize(top_p); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); SamplingParams params; params.top_p = top_p; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3b2ed7cef4..3ddd6b8d6e 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -52,6 +52,9 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -110,7 +113,7 @@ SigmoidSiluMulti::SigmoidSiluMulti( std::pair const &inputs, char const *name) : SigmoidSiluMulti( - model, params.layer_guid, inputs.first, inputs.second, name) {} + model, params.layer_guid, inputs.first, inputs.second, params.name) {} SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model, LayerID const &_layer_guid, @@ -366,6 +369,8 @@ void SigmoidSiluMulti::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -379,10 +384,15 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); SigmoidSiluMultiParams params; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index ba0a1288d6..03618423be 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -17,6 +17,7 @@ #include "flexflow/model.h" #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" namespace FlexFlow { // declare Legion names @@ -39,7 +40,42 @@ using namespace FlexFlow::Kernels::Softmax; /* Params */ bool operator==(SoftmaxParams const &lhs, 
SoftmaxParams const &rhs) { - return lhs.dim == rhs.dim; + return lhs.layer_guid == rhs.layer_guid && lhs.dim == rhs.dim; +} + +void Softmax::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); + sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); +} + +using PCG::Node; +/*static*/ +Node Softmax::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 1); + size_t id, transformer_layer_id, deserialized_model_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + int dim; + dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + + SoftmaxParams params; + params.layer_guid = layer_guid; + params.dim = dim; + strcpy(params.name, name); + return ff.get_or_create_node(inputs[0], params); } bool SoftmaxParams::is_valid(ParallelTensorShape const &input) const { @@ -48,7 +84,11 @@ bool SoftmaxParams::is_valid(ParallelTensorShape const &input) const { SoftmaxParams Softmax::get_params() const { SoftmaxParams params; + params.layer_guid = this->layer_guid; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -87,12 +127,14 @@ Op *Softmax::create_operator_from_layer( layer->get_int_property("softmax_dim", value); int dim = (int)value; return new Softmax(model, + layer->layer_guid, inputs[0], (inputs[0]->num_dims - 1 - dim) % inputs[0]->num_dims, layer->name); } Softmax::Softmax(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor _input, int _dim, char const *name) @@ -107,6 +149,7 @@ Softmax::Softmax(FFModel &model, dim(_dim) { // Currently assume we always perform softmax along the inner most dim assert(dim == 0); + layer_guid = _layer_guid; ParallelDim dims[MAX_TENSOR_DIM]; int numdim = _input->num_dims; for (int i = 0; i < numdim; i++) { @@ -119,7 +162,7 @@ Softmax::Softmax(FFModel &model, SoftmaxParams const ¶ms, const ParallelTensor input, char const *name) - : Softmax(model, input, params.dim, name) {} + : Softmax(model, params.layer_guid, input, params.dim, params.name) {} void Softmax::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -518,6 +561,7 @@ namespace std { size_t hash::operator()( FlexFlow::SoftmaxParams const ¶ms) const { size_t key = 0; + hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); return key; } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 5d234df822..9c6ed0e0b6 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -511,7 +511,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.qk_prod_scaling, params.position_bias, allocate_weights, - name) {} + params.name) {} void SpecIncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -853,6 +853,9 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/split.cc b/src/ops/split.cc 
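The hunks above, and the matching ones in the other operator files in this patch, all apply one pattern for carrying the operator name through graph serialization: get_params() copies this->name into params.name when it is non-null, serialize() writes strlen(name) followed by the raw bytes, and deserialize() reads the length back, fills a zero-initialized char[MAX_OPNAME] buffer, and copies it into the params before calling get_or_create_node. A minimal stand-alone sketch of that round trip, assuming name_len is always smaller than MAX_OPNAME, looks like this (ExampleParams and the helper functions are illustrative only; Legion::Serializer, Legion::Deserializer and MAX_OPNAME come from the diff context):

    // Sketch only, not part of the patch.
    struct ExampleParams {
      char name[MAX_OPNAME];
    };

    void serialize_name(Legion::Serializer &sez, char const *name) {
      sez.serialize(strlen(name));       // length first ...
      sez.serialize(name, strlen(name)); // ... then the bytes, no terminator
    }

    void deserialize_name(Legion::Deserializer &dez, ExampleParams &params) {
      size_t name_len;
      char name[MAX_OPNAME] = {0};     // zero-init keeps the copy NUL-terminated
      dez.deserialize(name_len);
      dez.deserialize(name, name_len); // assumes name_len < MAX_OPNAME
      strcpy(params.name, name);       // params now carry the original name
    }

The zero-initialized buffer matters because the serialized bytes carry no terminator; without it the trailing strcpy could read past the copied characters.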
index 9298850a99..7c6b631b20 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -50,6 +50,9 @@ SplitParams Split::get_params() const { SplitParams params; params.splits = this->splits; params.legion_axis = this->legion_axis; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -137,7 +140,7 @@ Split::Split(FFModel &model, SplitParams const ¶ms, const ParallelTensor input, char const *name) - : Split(model, input, params.splits, params.legion_axis, name) {} + : Split(model, input, params.splits, params.legion_axis, params.name) {} void Split::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); diff --git a/src/ops/topk.cc b/src/ops/topk.cc index b38ff85f90..7d30a8aff3 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -87,6 +87,9 @@ TopKParams TopK::get_params() const { TopKParams params; params.k = this->k; params.sorted = this->sorted; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -134,7 +137,7 @@ TopK::TopK(FFModel &model, TopKParams const ¶ms, const ParallelTensor input, char const *name) - : TopK(model, input, params.k, params.sorted, name) {} + : TopK(model, input, params.k, params.sorted, params.name) {} void TopK::init_inference(FFModel const &ff, std::vector const &batch_inputs, @@ -426,6 +429,8 @@ void TopK::backward_task(Task const *task, void TopK::serialize(Legion::Serializer &sez) const { sez.serialize(this->k); sez.serialize(this->sorted); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } Node TopK::deserialize(FFModel &ff, @@ -437,9 +442,14 @@ Node TopK::deserialize(FFModel &ff, bool sorted; dez.deserialize(k); dez.deserialize(sorted); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); TopKParams params; params.k = k; params.sorted = sorted; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index 500b7867af..7a179c4f7d 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -51,6 +51,9 @@ TransposeParams Transpose::get_params() const { for (int i = 0; i < outputs[0]->num_dims; i++) { params.perm.push_back(this->perm[i]); } + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -98,7 +101,7 @@ Transpose::Transpose(FFModel &model, TransposeParams const ¶ms, const ParallelTensor input, char const *name) - : Transpose(model, input, params.perm, name) {} + : Transpose(model, input, params.perm, params.name) {} Transpose::Transpose(FFModel &model, const ParallelTensor input, @@ -383,6 +386,8 @@ void Transpose::serialize(Legion::Serializer &sez) const { for (size_t i = 0; i < params.perm.size(); i++) { sez.serialize(params.perm[i]); } + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -399,6 +404,10 @@ Node Transpose::deserialize(FFModel &ff, dez.deserialize(dim_idx); perm.push_back(dim_idx); } + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); return ff.get_or_create_node(inputs[0], {perm}); } diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d5a8a1063d..d0efb01d54 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -562,7 +562,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( 
params.quantization_type, params.offload, params.tensor_parallelism_degree, - name) {} + params.name) {} void TreeIncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -927,6 +927,9 @@ TreeIncMultiHeadSelfAttentionParams params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 027d15c929..5d38e28903 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -55,6 +55,9 @@ bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { AllReduceParams AllReduce::get_params() const { AllReduceParams params; params.allreduce_legion_dim = this->allreduce_dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -79,7 +82,7 @@ AllReduce::AllReduce(FFModel &model, AllReduceParams const ¶ms, ParallelTensor const input, char const *name) - : AllReduce(model, input, params.allreduce_legion_dim, name) {} + : AllReduce(model, input, params.allreduce_legion_dim, params.name) {} void AllReduce::create_input_partition(FFModel &ff) { // Do nothing diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 7c266c5392..acc5c414c7 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -58,6 +58,9 @@ CombineParams Combine::get_params() const { CombineParams params; params.combine_legion_dim = this->combine_dim; params.combine_degree = this->combine_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -69,7 +72,7 @@ Combine::Combine(FFModel &model, input, params.combine_legion_dim, params.combine_degree, - name) {} + params.name) {} Combine::Combine(FFModel &model, const ParallelTensor _input, diff --git a/src/parallel_ops/fused_parallel_op.cc b/src/parallel_ops/fused_parallel_op.cc index c0a97bdda1..1a76cbfc40 100644 --- a/src/parallel_ops/fused_parallel_op.cc +++ b/src/parallel_ops/fused_parallel_op.cc @@ -59,6 +59,9 @@ FusedParallelOpParams FusedParallelOp::get_params() const { std::vector ops(std::begin(this->parallel_ops), std::end(this->parallel_ops)); params.parallel_ops = ops; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index 353b3ce398..e6ab09d088 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -60,6 +60,9 @@ RepartitionParams Repartition::get_params() const { RepartitionParams params; params.repartition_legion_dim = this->repartition_dim; params.repartition_degree = this->repartition_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -92,7 +95,7 @@ Repartition::Repartition(FFModel &model, input, params.repartition_legion_dim, params.repartition_degree, - name) {} + params.name) {} OpMeta *Repartition::init_task(Task const *task, std::vector const ®ions, diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5dca591328..5ca2b1301c 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -56,6 +56,9 @@ ReductionParams Reduction::get_params() const { ReductionParams params; params.reduction_legion_dim = this->reduction_dim; params.reduction_degree = this->reduction_degree; + if (this->name != nullptr) { + strcpy(params.name, 
this->name); + } return params; } @@ -89,7 +92,7 @@ Reduction::Reduction(FFModel &model, input, params.reduction_legion_dim, params.reduction_degree, - name) {} + params.name) {} void Reduction::create_input_partition(FFModel &ff) { assert(outputs[0]->part != LogicalPartition::NO_PART); diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 20face74e8..ba7bb6677f 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -55,6 +55,9 @@ ReplicateParams Replicate::get_params() const { ReplicateParams params; params.replicate_legion_dim = this->replicate_dim; params.replicate_degree = this->replicate_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -88,7 +91,7 @@ Replicate::Replicate(FFModel &model, input, params.replicate_legion_dim, params.replicate_degree, - name) {} + params.name) {} void Replicate::create_input_partition(FFModel &ff) { assert(outputs[0]->part != LogicalPartition::NO_PART); diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index fa6bf55fe5..57bc5a0458 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -287,7 +287,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", host_ptr[i]); + } } fclose(tensor_file); @@ -313,7 +317,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", (float)host_ptr[i]); + } } fclose(tensor_file); @@ -340,7 +348,11 @@ __host__ void save_tensor(int32_t const *ptr, tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%d, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%d, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%d", host_ptr[i]); + } } fclose(tensor_file); @@ -367,7 +379,11 @@ __host__ void save_tensor(int64_t const *ptr, tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%ld, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%ld, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%ld", host_ptr[i]); + } } fclose(tensor_file); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 6d33dd9f27..f8e8240ccf 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2276,6 +2276,8 @@ GraphOptimalViewSerialized case OP_CONCAT: { Concat *concat = (Concat *)op; sez.serialize(concat->legion_axis); + sez.serialize(strlen(concat->name)); + sez.serialize(concat->name, strlen(concat->name)); break; } case OP_SPLIT: { @@ -2285,6 +2287,8 @@ GraphOptimalViewSerialized for (int i = 0; i < split->numOutputs; i++) { sez.serialize(split->outputs[i]->dims[split->legion_axis].size); } + sez.serialize(strlen(split->name)); + sez.serialize(split->name, strlen(split->name)); break; } case OP_EMBEDDING: { @@ -2296,6 +2300,8 @@ GraphOptimalViewSerialized sez.serialize(embed->out_channels); sez.serialize(embed->aggr); sez.serialize(embed->data_type); + sez.serialize(strlen(embed->name)); + 
sez.serialize(embed->name, strlen(embed->name)); break; } case OP_MULTIHEAD_ATTENTION: { @@ -2311,6 +2317,8 @@ GraphOptimalViewSerialized sez.serialize(attn->bias); sez.serialize(attn->add_bias_kv); sez.serialize(attn->add_zero_attn); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2335,6 +2343,8 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2357,6 +2367,8 @@ GraphOptimalViewSerialized sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->position_bias); sez.serialize(attn->num_kv_heads); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2382,40 +2394,47 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); - break; - } - case OP_SOFTMAX: { - Softmax *softmax = (Softmax *)op; - sez.serialize(softmax->dim); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_REPARTITION: { Repartition *repart = (Repartition *)op; sez.serialize(repart->repartition_dim); sez.serialize(repart->repartition_degree); + sez.serialize(strlen(repart->name)); + sez.serialize(repart->name, strlen(repart->name)); break; } case OP_REPLICATE: { Replicate *replicate = (Replicate *)op; sez.serialize(replicate->replicate_dim); sez.serialize(replicate->replicate_degree); + sez.serialize(strlen(replicate->name)); + sez.serialize(replicate->name, strlen(replicate->name)); break; } case OP_REDUCTION: { Reduction *reduction = (Reduction *)op; sez.serialize(reduction->reduction_dim); sez.serialize(reduction->reduction_degree); + sez.serialize(strlen(reduction->name)); + sez.serialize(reduction->name, strlen(reduction->name)); break; } case OP_COMBINE: { Combine *combine = (Combine *)op; sez.serialize(combine->combine_dim); sez.serialize(combine->combine_degree); + sez.serialize(strlen(combine->name)); + sez.serialize(combine->name, strlen(combine->name)); break; } case OP_ALLREDUCE: { AllReduce *allreduce = (AllReduce *)op; sez.serialize(allreduce->allreduce_dim); + sez.serialize(strlen(allreduce->name)); + sez.serialize(allreduce->name, strlen(allreduce->name)); break; } case OP_FUSED_PARALLEL: { @@ -2424,6 +2443,8 @@ GraphOptimalViewSerialized for (int i = 0; i < fused->num_parallel_ops; i++) { sez.serialize(fused->parallel_ops[i]); } + sez.serialize(strlen(fused->name)); + sez.serialize(fused->name, strlen(fused->name)); break; } default: { @@ -2621,6 +2642,10 @@ void FFModel::deserialize_graph_optimal_view( case OP_CONCAT: { int legion_axis; dez.deserialize(legion_axis); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node( {std::begin(inputs), std::begin(inputs) + num_inputs}, {legion_axis}); @@ -2637,6 +2662,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(dim_size); splits.push_back(dim_size); } + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {splits, legion_axis}); break; } @@ -2654,6 +2683,10 @@ void FFModel::deserialize_graph_optimal_view( 
dez.deserialize(out_channels); dez.deserialize(aggr); dez.deserialize(data_type); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); EmbeddingParams params; params.aggr = aggr; @@ -2661,6 +2694,7 @@ void FFModel::deserialize_graph_optimal_view( params.out_channels = out_channels; params.layer_guid = layer_guid; params.data_type = data_type; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } @@ -2746,6 +2780,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(bias); dez.deserialize(add_bias_kv); dez.deserialize(add_zero_attn); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); MultiHeadAttentionParams params; params.embed_dim = embed_dim; @@ -2757,6 +2795,7 @@ void FFModel::deserialize_graph_optimal_view( params.add_bias_kv = add_bias_kv; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; + strcpy(params.name, name); node = get_or_create_node( {inputs[0], inputs[1], inputs[2]}, params); break; @@ -2791,6 +2830,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); IncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2811,6 +2854,7 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } @@ -2839,6 +2883,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qk_prod_scaling); dez.deserialize(position_bias); dez.deserialize(num_kv_heads); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); SpecIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2856,6 +2904,7 @@ void FFModel::deserialize_graph_optimal_view( params.qk_prod_scaling = qk_prod_scaling; params.position_bias = position_bias; params.num_kv_heads = num_kv_heads; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; @@ -2890,6 +2939,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); TreeIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2910,6 +2963,7 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; @@ -2967,10 +3021,7 @@ void FFModel::deserialize_graph_optimal_view( break; } case OP_SOFTMAX: { - assert(num_inputs == 1); - int softmax_dim; - dez.deserialize(softmax_dim); - node = get_or_create_node(inputs[0], {softmax_dim}); + node = Softmax::deserialize(*this, dez, inputs, num_inputs); break; } case OP_TRANSPOSE: { @@ -2990,6 +3041,10 @@ void FFModel::deserialize_graph_optimal_view( int combine_dim, combine_degree; dez.deserialize(combine_dim); dez.deserialize(combine_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + 
dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {combine_dim, combine_degree}); break; @@ -2999,6 +3054,10 @@ void FFModel::deserialize_graph_optimal_view( int repartition_dim, repartition_degree; dez.deserialize(repartition_dim); dez.deserialize(repartition_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node( inputs[0], {repartition_dim, repartition_degree}); break; @@ -3008,6 +3067,10 @@ void FFModel::deserialize_graph_optimal_view( int replicate_dim, replicate_degree; dez.deserialize(replicate_dim); dez.deserialize(replicate_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {replicate_dim, replicate_degree}); break; @@ -3017,6 +3080,10 @@ void FFModel::deserialize_graph_optimal_view( int reduction_dim, reduction_degree; dez.deserialize(reduction_dim); dez.deserialize(reduction_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {reduction_dim, reduction_degree}); break; @@ -3025,6 +3092,10 @@ void FFModel::deserialize_graph_optimal_view( assert(num_inputs == 1); int allreduce_dim; dez.deserialize(allreduce_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {allreduce_dim}); break; } @@ -3038,6 +3109,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(info); parallel_ops.push_back(info); } + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); node = get_or_create_node(inputs[0], {parallel_ops}); break; } diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index fb94135c8f..613df1cbcf 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -266,7 +266,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", host_ptr[i]); + } } fclose(tensor_file); @@ -292,7 +296,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", (float)host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", (float)host_ptr[i]); + } } fclose(tensor_file); @@ -319,7 +327,11 @@ __host__ void save_tensor(int32_t const *ptr, tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%d, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%d, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%d", host_ptr[i]); + } } fclose(tensor_file); @@ -346,7 +358,11 @@ __host__ void save_tensor(int64_t const *ptr, tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%ld, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%ld, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%ld", host_ptr[i]); + } } fclose(tensor_file); diff --git a/src/runtime/operator.cc 
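The save_tensor changes just above (mirrored earlier for cuda_helper.cu) only touch the element separator: every element except the last is printed with a trailing ", " ("%.9f, ", "%d, ", or "%ld, " depending on type), and the final element is printed bare, so the dump no longer ends with a dangling comma. A condensed sketch of the float case, with the file handling omitted and host_ptr, num_elements, and tensor_file assumed from the surrounding save_tensor code:

    // Sketch of the separator logic only, not part of the patch.
    for (unsigned i = 0; i < num_elements; i++) {
      if (i < num_elements - 1) {
        fprintf(tensor_file, "%.9f, ", host_ptr[i]); // separator between values
      } else {
        fprintf(tensor_file, "%.9f", host_ptr[i]);   // last value, no separator
      }
    }

This makes the dumped files friendlier to comma-delimited parsers that reject a trailing empty field.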
b/src/runtime/operator.cc index 0b3813f41c..36ac02a3a3 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -25,105 +25,4 @@ size_t Op::get_params_hash() const { get_operator_type_name(this->op_type)); } -/*static*/ -void Op::save_inference_tensors_to_file( - OpMeta *m, - int shard_id, - BatchConfig const *bc, - std::vector input_tensors, - std::vector weight_tensors, - std::vector output_tensors) { - - // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; - struct stat st = {0}; - if (stat(folder_path, &st) == -1) { - // Directory does not exist, create it - mkdir(folder_path, 0700); - } - // output base filepath, shared by all tensors from the same operator - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - m->op_name + "_shard-id_" + std::to_string(shard_id); - // save batch config, if passed - if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); - } - // save all inputs - for (int i = 0; i < input_tensors.size(); i++) { - std::string filename = base_filepath + "_input_" + std::to_string(i); - if (input_tensors[i].data_type == DT_FLOAT) { - save_tensor(input_tensors[i].get_float_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_HALF) { - save_tensor(input_tensors[i].get_half_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_INT32) { - save_tensor(input_tensors[i].get_int32_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_INT64) { - save_tensor(input_tensors[i].get_int64_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - // only dump the weights once - if (m->decoding_step == 0) { - for (int i = 0; i < weight_tensors.size(); i++) { - std::string filename = base_filepath + "_weight_" + std::to_string(i); - if (weight_tensors[i].data_type == DT_FLOAT) { - save_tensor(weight_tensors[i].get_float_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_HALF) { - save_tensor(weight_tensors[i].get_half_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_INT32) { - save_tensor(weight_tensors[i].get_int32_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_INT64) { - save_tensor(weight_tensors[i].get_int64_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - } - // save all outputs - for (int i = 0; i < output_tensors.size(); i++) { - std::string filename = base_filepath + "_output_" + std::to_string(i); - if (output_tensors[i].data_type == DT_FLOAT) { - save_tensor(output_tensors[i].get_float_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_HALF) { - save_tensor(output_tensors[i].get_half_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_INT32) { - save_tensor(output_tensors[i].get_int32_ptr(), - 
output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_INT64) { - save_tensor(output_tensors[i].get_int64_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - // increase count of decoding steps - m->decoding_step++; -} - }; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index e8b986582f..c0804d6e19 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -958,8 +958,12 @@ bool GraphXfer::create_new_operator(OpX const *opx, Node &op) { } case OP_SOFTMAX: { int softmax_dim; + assert(opx->matchOpX != NULL); + assert(opx->matchOpX->mapOp.ptr != NULL); + Softmax *softmax = (Softmax *)opx->matchOpX->mapOp.ptr; assert(opx->get_pm_constraint(PM_SOFTMAX_DIM, softmax_dim)); - op = model->get_or_create_node(inputs[0], {softmax_dim}); + SoftmaxParams params = softmax->get_params(); + op = model->get_or_create_node(inputs[0], params); break; } case OP_REPARTITION: { @@ -3749,7 +3753,8 @@ bool FFModel::convert_graph_to_operators( case OP_SOFTMAX: { assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; - new_op = new Softmax(*this, inputs[0], softmax->dim, NULL); + new_op = new Softmax( + *this, softmax->layer_guid, inputs[0], softmax->dim, NULL); break; } case OP_COMBINE: { diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 5b533bf3c0..6857b5cbc1 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -1,6 +1,7 @@ import argparse import json import os +import shutil import torch from transformers import ( AutoModelForCausalLM, @@ -9,7 +10,30 @@ LlamaTokenizer, GenerationConfig, ) - +######################### debugging helper functions ######################### +def pre_forward_hook(module, input): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("model.", "") + print( + f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" + ) + print("Pre-Input: ", input[0].shape) + torch.save( + input, f"./hf_tensors/decoding_step_{module.decoding_step}_{name}.input" + ) +def post_forward_hook(module, input, output): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("model.", "") + print( + f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" + ) + print("Post-Input/Output: ", input[0].shape, output[0].shape) + torch.save( + output, f"./hf_tensors/decoding_step_{module.decoding_step}_{name}.output" + ) + print("===") + module.decoding_step += 1 +############################################################################## def main(): # Change working dir to folder storing this script @@ -28,6 +52,11 @@ def main(): ) parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument("--gpu", action="store_true", help="Run on GPU") + parser.add_argument( + "--inference-debugging", + action="store_true", + help="Print debugging info and save hidden states/weights to file", + ) args = parser.parse_args() # Check if max-length is greater than 0 if args.max_length <= 0: @@ -64,6 +93,27 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(args.model_name) generation_config = GenerationConfig.from_pretrained(args.model_name) generation_config.do_sample = args.do_sample + 
################# debugging ################# + if args.inference_debugging: + # Print model and configs + print(hf_config) + print(model) + # Save weights to file + shutil.rmtree("./hf_tensors") + # Check that the output folder exists + os.makedirs("./hf_tensors", exist_ok=True) + # Save weights + for name, params in model.named_parameters(): + torch.save(params, f"./hf_tensors/{name}") + # params.detach().cpu().numpy().tofile(f"./hf_tensors/{name}") + # Register hooks to save per-op hidden states + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.decoding_step = 0 + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_pre_hook(pre_forward_hook) + layer.register_forward_hook(post_forward_hook) + ############################################### # Generate output with open(args.output_file, "w") as f: for i, prompt in enumerate(prompt_list): From 57d1883b5cef266371dd616f812abca44b37099d Mon Sep 17 00:00:00 2001 From: FelixBrakel Date: Sat, 20 Jan 2024 06:07:28 +0100 Subject: [PATCH 45/61] Fix incorrect innode being checked (#1273) Co-authored-by: Gabriele Oliaro --- python/flexflow/torch/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/torch/model.py b/python/flexflow/torch/model.py index 65b1669e99..df4042748f 100644 --- a/python/flexflow/torch/model.py +++ b/python/flexflow/torch/model.py @@ -955,7 +955,7 @@ def is_left_scalar_op(node): if len(innodes) != 2: return False return type(innodes[0]) is float or \ - type(innodes[1]) is int + type(innodes[0]) is int @staticmethod def is_elemwise_op(node): From 317cffd82f2dc6559f3243217e617b110c90be05 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 26 Jan 2024 11:36:24 -0500 Subject: [PATCH 46/61] Bug fixes and update Legion version (#1259) * bug fixes and update Legion version * fix * bug fix * update legion * fix arithmetic error due to num_devices uninitialized * update legion version * update ci * fix * debugging ci * Revert "debugging ci" This reverts commit 0b3148ef6adfcb64935e6b1e83a88494910a7b22. 
--------- Co-authored-by: Gabriele Oliaro --- .github/workflows/gpu-ci.yml | 12 +++--- CMakeLists.txt | 8 ++-- cmake/pip_install/CMakeLists.txt | 4 +- deps/legion | 2 +- include/flexflow/mapper.h | 9 ++--- include/flexflow/model.h | 2 + include/flexflow/operator.h | 5 +++ include/flexflow/request_manager.h | 1 - src/mapper/mapper.cc | 47 ++++++++++------------- src/ops/linear.cc | 8 +--- src/runtime/inference_manager.cc | 30 +-------------- src/runtime/model.cc | 61 ++++++++++++++++++++++++++++++ 12 files changed, 109 insertions(+), 80 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 3901d6b5f7..48dcda157e 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -222,7 +222,7 @@ jobs: CONDA: "3" needs: inference-tests container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -243,7 +243,7 @@ jobs: - name: Build and Install FlexFlow run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) export FF_BUILD_ALL_EXAMPLES=ON export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON @@ -252,18 +252,18 @@ jobs: - name: Check FlexFlow Python interface (pip) run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests diff --git a/CMakeLists.txt b/CMakeLists.txt index acbe7e385f..43ce4f7044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -413,6 +413,7 @@ if(NOT BUILD_LEGION_ONLY) # python related if (FF_USE_PYTHON) + find_package(Python COMPONENTS Interpreter Development) # create flexflow_cffi_header.py add_custom_command(TARGET flexflow PRE_BUILD @@ -424,13 +425,13 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. 
add_custom_command(TARGET flexflow PRE_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} + COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Creating flexflow_python interpreter..." ) @@ -567,7 +568,8 @@ if(NOT BUILD_LEGION_ONLY) install(TARGETS flexflow DESTINATION ${LIB_DEST}) # install python if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + find_package(Python COMPONENTS Interpreter Development) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if (NOT FF_BUILD_FROM_PYPI) install( DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 7ce38c4abc..105133a310 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -1,10 +1,10 @@ # Use setup.py script to re-install the Python bindings library with the right library paths if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") endif() endif() diff --git a/deps/legion b/deps/legion index 626b55689c..24e8c45234 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c +Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h index 71be1892aa..e8337818ec 100644 --- a/include/flexflow/mapper.h +++ b/include/flexflow/mapper.h @@ -83,11 +83,10 @@ class FFMapper : public NullMapper { Task 
const &task, MapTaskInput const &input, MapTaskOutput &output); - virtual void map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output); + virtual void replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output); virtual void select_task_variant(const MapperContext ctx, Task const &task, SelectVariantInput const &input, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index dd6dc76b4d..95be9ab581 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -202,6 +202,7 @@ enum TaskIDs { // NCCL tasks NCCL_GETUNIQUEID_TASK_ID, NCCL_INIT_COMMS_TASK_ID, + NCCL_FINISH_COMMS_TASK_ID, // Search STRATEGY_SEARCH_TASK_ID, // Graph @@ -397,6 +398,7 @@ std::vector class FFModel { public: FFModel(FFConfig &config, bool cpu_offload = false); + ~FFModel(); static constexpr float PROPAGATION_CHANCE = 0.25; static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 73c2c3e092..1b19bdb82f 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -406,6 +406,11 @@ class Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + finish_nccl_comms_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); #endif protected: void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 50a51705cd..4763eb1ef3 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -55,7 +55,6 @@ class InferenceManager { public: std::unordered_map> tensor_buffer; std::unordered_map model_weights_loaders; - int num_devices; }; struct Request { diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index bc26a79d3e..d46bfc2877 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -661,44 +661,37 @@ void FFMapper::map_task(const MapperContext ctx, } // for idx } -void FFMapper::map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output) { +void FFMapper::replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output) { // Should only be replicated for the top-level task assert((task.get_depth() == 0) && (task.regions.size() == 0)); const Processor::Kind target_kind = task.target_proc.kind(); - VariantID chosen_variant; + VariantID vid; { std::vector variant_ids; - runtime->find_valid_variants( - ctx, task.task_id, variant_ids, task.target_proc.kind()); + runtime->find_valid_variants(ctx, task.task_id, variant_ids, target_kind); // Currently assume there is exactly one variant assert(variant_ids.size() == 1); - chosen_variant = variant_ids[0]; + output.chosen_variant = variant_ids[0]; } - std::vector const &all_procs = all_procs_by_kind(target_kind); - // Place on replicate on each node by default - output.task_mappings.resize(total_nodes, default_output); - // Assume default_output does not include any target_procs - assert(default_output.target_procs.size() == 0); - for (std::vector::const_iterator it = all_procs.begin(); - it != all_procs.end(); + output.target_processors.resize(total_nodes); + std::vector handled(total_nodes, false); 
+ size_t count = 0; + Machine::ProcessorQuery procs(machine); + procs.only_kind(target_kind); + for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end(); it++) { - AddressSpace space = it->address_space(); - assert(space < output.task_mappings.size()); - // Add *it as a target_proc if we haven't found one - if (output.task_mappings[space].target_procs.size() == 0) { - output.task_mappings[space].target_procs.push_back(*it); + const AddressSpace space = it->address_space(); + if (handled[space]) { + continue; } + output.target_processors[space] = *it; + handled[space] = true; + count++; } - output.control_replication_map.resize(total_nodes); - for (int idx = 0; idx < total_nodes; idx++) { - output.task_mappings[idx].chosen_variant = chosen_variant; - output.control_replication_map[idx] = - output.task_mappings[idx].target_procs[0]; - } + assert(count == total_nodes); } void FFMapper::select_task_variant(const MapperContext ctx, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 03c9e48af8..0c7a0f78fe 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -467,12 +467,8 @@ OpMeta *Linear::init_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - // TensorAccessorW acc_kernel(regions[2], - // task->regions[2], - // FID_DATA, - // ctx, - // runtime, - // false /*readOutput*/); + TensorAccessorR acc_kernel( + regions[2], task->regions[2], FID_DATA, ctx, runtime); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 6588cbceeb..2a94df8b4d 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,33 +28,7 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager() { -#ifdef DEADCODE - num_devices = ff_config.workersPerNode * ff_config.numNodes; - // Check parallelization degrees - assert(ff_config.data_parallelism_degree <= num_devices && - "Data parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.data_parallelism_degree == 0 && - "Number of available devices is not divisible by data parallelism " - "degree"); - assert(ff_config.tensor_parallelism_degree <= num_devices && - "Tensor parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.tensor_parallelism_degree == 0 && - "Number of available devices is not divisible by tensor parallelism " - "degree"); - assert(ff_config.pipeline_parallelism_degree <= num_devices && - "Pipeline parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.pipeline_parallelism_degree == 0 && - "Number of available devices is not divisible by pipeline parallelism " - "degree"); - assert(ff_config.data_parallelism_degree * - ff_config.tensor_parallelism_degree * - ff_config.pipeline_parallelism_degree == - num_devices && - "Product of data, tensor, and pipeline parallelism degrees does not " - "match the number of available devices"); -#endif -} +InferenceManager::InferenceManager() {} InferenceManager *inference_manager_singleton = nullptr; @@ -296,8 +270,6 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { void InferenceManager::init_operators_inference(FFModel *model) { for (int batch_index = 0; batch_index < model->config.data_parallelism_degree; batch_index++) { - int expert_device_index = 0; - int 
device_index = batch_index % num_devices; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c07c33efca..f9763627c8 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -606,6 +606,15 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, // ncclComm, allRanks, myRank, ncclId); return ncclComm; } + +void Op::finish_nccl_comms_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ncclComm_t comm = *((ncclComm_t *)task->local_args); + checkNCCL(ncclCommFinalize(comm)); + checkNCCL(ncclCommDestroy(comm)); +} #endif /** @@ -1578,6 +1587,43 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +FFModel::~FFModel() { + // Destroy nccl communication groups +#ifdef FF_USE_NCCL + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } +#endif +} + void FFModel::clear_graph_search_cache() { this->graph_search->clear_cache(); this->search->clear_cache(); @@ -6853,6 +6899,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(NCCL_FINISH_COMMS_TASK_ID, + "NCCL Finish Communicators"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "NCCL Finish Communicators Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } #endif // Search { From d73bba1212be19dd8b07e0e8f591b6db2fe4189d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 11:41:51 -0500 Subject: [PATCH 47/61] Revert "Bug fixes and update Legion version" (#1286) --- .github/workflows/gpu-ci.yml | 12 +++--- CMakeLists.txt | 8 ++-- cmake/pip_install/CMakeLists.txt | 4 +- deps/legion | 2 +- include/flexflow/mapper.h | 9 +++-- include/flexflow/model.h | 2 - include/flexflow/operator.h | 5 --- include/flexflow/request_manager.h | 1 + src/mapper/mapper.cc | 47 +++++++++++++---------- src/ops/linear.cc | 8 +++- src/runtime/inference_manager.cc | 30 ++++++++++++++- src/runtime/model.cc | 61 ------------------------------ 12 files changed, 80 insertions(+), 109 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 48dcda157e..3901d6b5f7 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -222,7 +222,7 @@ jobs: CONDA: "3" needs: inference-tests container: - image: 
ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -243,7 +243,7 @@ jobs: - name: Build and Install FlexFlow run: | - export PATH=$CONDA_PREFIX/bin:$PATH + export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) export FF_BUILD_ALL_EXAMPLES=ON export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON @@ -252,18 +252,18 @@ jobs: - name: Check FlexFlow Python interface (pip) run: | - export PATH=$CONDA_PREFIX/bin:$PATH + export PATH=/opt/conda/bin:$PATH export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests run: | - export PATH=$CONDA_PREFIX/bin:$PATH + export PATH=/opt/conda/bin:$PATH export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..acbe7e385f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -413,7 +413,6 @@ if(NOT BUILD_LEGION_ONLY) # python related if (FF_USE_PYTHON) - find_package(Python COMPONENTS Interpreter Development) # create flexflow_cffi_header.py add_custom_command(TARGET flexflow PRE_BUILD @@ -425,13 +424,13 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. add_custom_command(TARGET flexflow PRE_BUILD - COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} + COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Creating flexflow_python interpreter..." 
) @@ -568,8 +567,7 @@ if(NOT BUILD_LEGION_ONLY) install(TARGETS flexflow DESTINATION ${LIB_DEST}) # install python if (FF_USE_PYTHON) - find_package(Python COMPONENTS Interpreter Development) - execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if (NOT FF_BUILD_FROM_PYPI) install( DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 105133a310..7ce38c4abc 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -1,10 +1,10 @@ # Use setup.py script to re-install the Python bindings library with the right library paths if (FF_USE_PYTHON) - execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") endif() endif() diff --git a/deps/legion b/deps/legion index 24e8c45234..626b55689c 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 +Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h index e8337818ec..71be1892aa 100644 --- a/include/flexflow/mapper.h +++ b/include/flexflow/mapper.h @@ -83,10 +83,11 @@ class FFMapper : public NullMapper { Task const &task, MapTaskInput const &input, MapTaskOutput &output); - virtual void replicate_task(const MapperContext ctx, - Task const &task, - ReplicateTaskInput const &input, - ReplicateTaskOutput &output); + virtual void map_replicate_task(const MapperContext ctx, + Task const &task, + MapTaskInput const &input, + MapTaskOutput const &default_output, + 
MapReplicateTaskOutput &output); virtual void select_task_variant(const MapperContext ctx, Task const &task, SelectVariantInput const &input, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 95be9ab581..dd6dc76b4d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -202,7 +202,6 @@ enum TaskIDs { // NCCL tasks NCCL_GETUNIQUEID_TASK_ID, NCCL_INIT_COMMS_TASK_ID, - NCCL_FINISH_COMMS_TASK_ID, // Search STRATEGY_SEARCH_TASK_ID, // Graph @@ -398,7 +397,6 @@ std::vector class FFModel { public: FFModel(FFConfig &config, bool cpu_offload = false); - ~FFModel(); static constexpr float PROPAGATION_CHANCE = 0.25; static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b19bdb82f..73c2c3e092 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -406,11 +406,6 @@ class Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void - finish_nccl_comms_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); #endif protected: void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 4763eb1ef3..50a51705cd 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -55,6 +55,7 @@ class InferenceManager { public: std::unordered_map> tensor_buffer; std::unordered_map model_weights_loaders; + int num_devices; }; struct Request { diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d46bfc2877..bc26a79d3e 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -661,37 +661,44 @@ void FFMapper::map_task(const MapperContext ctx, } // for idx } -void FFMapper::replicate_task(const MapperContext ctx, - Task const &task, - ReplicateTaskInput const &input, - ReplicateTaskOutput &output) { +void FFMapper::map_replicate_task(const MapperContext ctx, + Task const &task, + MapTaskInput const &input, + MapTaskOutput const &default_output, + MapReplicateTaskOutput &output) { // Should only be replicated for the top-level task assert((task.get_depth() == 0) && (task.regions.size() == 0)); const Processor::Kind target_kind = task.target_proc.kind(); - VariantID vid; + VariantID chosen_variant; { std::vector variant_ids; - runtime->find_valid_variants(ctx, task.task_id, variant_ids, target_kind); + runtime->find_valid_variants( + ctx, task.task_id, variant_ids, task.target_proc.kind()); // Currently assume there is exactly one variant assert(variant_ids.size() == 1); - output.chosen_variant = variant_ids[0]; + chosen_variant = variant_ids[0]; } - output.target_processors.resize(total_nodes); - std::vector handled(total_nodes, false); - size_t count = 0; - Machine::ProcessorQuery procs(machine); - procs.only_kind(target_kind); - for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end(); + std::vector const &all_procs = all_procs_by_kind(target_kind); + // Place on replicate on each node by default + output.task_mappings.resize(total_nodes, default_output); + // Assume default_output does not include any target_procs + assert(default_output.target_procs.size() == 0); + for (std::vector::const_iterator it = all_procs.begin(); + it != all_procs.end(); it++) { - const AddressSpace space = it->address_space(); - if (handled[space]) { - continue; + AddressSpace space = it->address_space(); + assert(space < 
output.task_mappings.size()); + // Add *it as a target_proc if we haven't found one + if (output.task_mappings[space].target_procs.size() == 0) { + output.task_mappings[space].target_procs.push_back(*it); } - output.target_processors[space] = *it; - handled[space] = true; - count++; } - assert(count == total_nodes); + output.control_replication_map.resize(total_nodes); + for (int idx = 0; idx < total_nodes; idx++) { + output.task_mappings[idx].chosen_variant = chosen_variant; + output.control_replication_map[idx] = + output.task_mappings[idx].target_procs[0]; + } } void FFMapper::select_task_variant(const MapperContext ctx, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 0c7a0f78fe..03c9e48af8 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -467,8 +467,12 @@ OpMeta *Linear::init_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - TensorAccessorR acc_kernel( - regions[2], task->regions[2], FID_DATA, ctx, runtime); + // TensorAccessorW acc_kernel(regions[2], + // task->regions[2], + // FID_DATA, + // ctx, + // runtime, + // false /*readOutput*/); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 2a94df8b4d..6588cbceeb 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,7 +28,33 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager() {} +InferenceManager::InferenceManager() { +#ifdef DEADCODE + num_devices = ff_config.workersPerNode * ff_config.numNodes; + // Check parallelization degrees + assert(ff_config.data_parallelism_degree <= num_devices && + "Data parallelism degree exceeds number of available devices"); + assert(num_devices % ff_config.data_parallelism_degree == 0 && + "Number of available devices is not divisible by data parallelism " + "degree"); + assert(ff_config.tensor_parallelism_degree <= num_devices && + "Tensor parallelism degree exceeds number of available devices"); + assert(num_devices % ff_config.tensor_parallelism_degree == 0 && + "Number of available devices is not divisible by tensor parallelism " + "degree"); + assert(ff_config.pipeline_parallelism_degree <= num_devices && + "Pipeline parallelism degree exceeds number of available devices"); + assert(num_devices % ff_config.pipeline_parallelism_degree == 0 && + "Number of available devices is not divisible by pipeline parallelism " + "degree"); + assert(ff_config.data_parallelism_degree * + ff_config.tensor_parallelism_degree * + ff_config.pipeline_parallelism_degree == + num_devices && + "Product of data, tensor, and pipeline parallelism degrees does not " + "match the number of available devices"); +#endif +} InferenceManager *inference_manager_singleton = nullptr; @@ -270,6 +296,8 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { void InferenceManager::init_operators_inference(FFModel *model) { for (int batch_index = 0; batch_index < model->config.data_parallelism_degree; batch_index++) { + int expert_device_index = 0; + int device_index = batch_index % num_devices; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f9763627c8..c07c33efca 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -606,15 +606,6 @@ ncclComm_t 
Op::init_nccl_comms_task(Task const *task, // ncclComm, allRanks, myRank, ncclId); return ncclComm; } - -void Op::finish_nccl_comms_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - ncclComm_t comm = *((ncclComm_t *)task->local_args); - checkNCCL(ncclCommFinalize(comm)); - checkNCCL(ncclCommDestroy(comm)); -} #endif /** @@ -1587,43 +1578,6 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } -FFModel::~FFModel() { - // Destroy nccl communication groups -#ifdef FF_USE_NCCL - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - for (auto const &comm : view_hash_to_nccl_comms) { - // Find the machine view that has the hash - MachineView view; - for (size_t l = 0; l < operators.size(); l++) { - view = operators[l]->outputs[0]->machine_view; - if (view.hash() == comm.first) { - break; - } - } - assert(view.hash() == comm.first && "Cannot find the machine view"); - IndexSpace task_is = get_or_create_task_is(view); - Domain domain = runtime->get_index_space_domain(ctx, task_is); - ArgumentMap argmap; - int idx = 0; - for (Domain::DomainPointIterator it(domain); it; it++, idx++) { - argmap.set_point(*it, - TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); - } - IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, - task_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - comm.first); - FutureMap fm = runtime->execute_index_space(ctx, index_launcher); - fm.wait_all_results(); - } -#endif -} - void FFModel::clear_graph_search_cache() { this->graph_search->clear_cache(); this->search->clear_cache(); @@ -6899,21 +6853,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - { - TaskVariantRegistrar registrar(NCCL_FINISH_COMMS_TASK_ID, - "NCCL Finish Communicators"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "NCCL Finish Communicators Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant(registrar); - } - } #endif // Search { From abf9fb8889504a7bb526401dc9f027e2d4640334 Mon Sep 17 00:00:00 2001 From: April Yang <114364211+april-yyt@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:39:42 -0800 Subject: [PATCH 48/61] Chatbot with Gradio, FastApi Endpoint, Langchain Integration (#1246) * add a background server for RequestManager * . * make incr_decoding work * make spec_infer work * format * update python inference * fix python issues * bug fix * add a Legion future to capture the termination of the background server * gradio finished * chatbot gradio version 2 * chainlit1 * chainlit2 * fastapi done * fastapi incr_decoding * langchain example & wrapper class * langchain example & wrapper class1 * added documentation * entrypoint * del apikey * delete extra files * rag search fixed some bugs * fixed rag search issues * updates before rebase * minor changes * reorganize files * Add thread safety for background server. * Simplify backend server design. * resolve conflict. * specinfer usecases with issues labeled * specinfer usecases with issues labeled 2 * fixed issues with prompt template * fix issues with rag specinfer * Add server task timeout. 
* register callbacks to terminate background worker at exit or termination * [Python] enable decoding multiple requests * update README.md and default configuration * fix issues with gradio and prompt template * fix issues with rag * adjusted fastapi entrypoint * update documentation * resole conflicts * issues fix * adjustments on usecases and api entrypoints * remove redundent changes * testing CI * Enable backtrace * restore newlines * version * add back misdeleted line * legion verion --------- Co-authored-by: Zhihao Jia Co-authored-by: Gabriele Oliaro Co-authored-by: zwang86 <46699021+zwang86@users.noreply.github.com> Co-authored-by: Zeyu Wang Co-authored-by: xinhaoc --- SERVE.md | 3 - docs/source/chatbot.rst | 64 +++++ docs/source/imgs/gradio_api.png | Bin 0 -> 256263 bytes docs/source/imgs/gradio_interface.png | Bin 0 -> 331678 bytes docs/source/index.rst | 2 + docs/source/prompt_template.rst | 55 ++++ docs/source/rag.rst | 90 ++++++ docs/source/serve_api.rst | 7 + docs/source/serve_fastapi.rst | 106 +++++++ docs/source/serve_gradioapi.rst | 30 ++ docs/source/serve_usecases.rst | 8 + inference/.gitignore | 1 + inference/python/entrypoint/fastapi_incr.py | 162 +++++++++++ .../python/entrypoint/fastapi_specinfer.py | 202 +++++++++++++ inference/python/incr_decoding.py | 6 +- inference/python/spec_infer.py | 6 +- inference/python/usecases/gradio_incr.py | 162 +++++++++++ inference/python/usecases/gradio_specinfer.py | 205 ++++++++++++++ .../python/usecases/prompt_template_incr.py | 187 ++++++++++++ .../usecases/prompt_template_specinfer.py | 236 ++++++++++++++++ inference/python/usecases/rag_incr.py | 220 +++++++++++++++ inference/python/usecases/rag_specinfer.py | 266 ++++++++++++++++++ tests/training_tests.sh | 4 + 23 files changed, 2013 insertions(+), 9 deletions(-) create mode 100644 docs/source/chatbot.rst create mode 100644 docs/source/imgs/gradio_api.png create mode 100644 docs/source/imgs/gradio_interface.png create mode 100644 docs/source/prompt_template.rst create mode 100644 docs/source/rag.rst create mode 100644 docs/source/serve_api.rst create mode 100644 docs/source/serve_fastapi.rst create mode 100644 docs/source/serve_gradioapi.rst create mode 100644 docs/source/serve_usecases.rst create mode 100644 inference/python/entrypoint/fastapi_incr.py create mode 100644 inference/python/entrypoint/fastapi_specinfer.py create mode 100644 inference/python/usecases/gradio_incr.py create mode 100644 inference/python/usecases/gradio_specinfer.py create mode 100644 inference/python/usecases/prompt_template_incr.py create mode 100644 inference/python/usecases/prompt_template_specinfer.py create mode 100644 inference/python/usecases/rag_incr.py create mode 100644 inference/python/usecases/rag_specinfer.py diff --git a/SERVE.md b/SERVE.md index f6e34750cd..e64756e8f4 100644 --- a/SERVE.md +++ b/SERVE.md @@ -187,9 +187,6 @@ We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruct FlexFlow Serve is still under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. * AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs. -* Chatbot prompt templates and Multi-round conversations -* Support for FastAPI server -* Integration with LangChain for document question answering ## Acknowledgements This project is initiated by members from CMU, Stanford, and UCSD. 
 We will be continuing developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as:
diff --git a/docs/source/chatbot.rst b/docs/source/chatbot.rst
new file mode 100644
index 0000000000..fc6f616fae
--- /dev/null
+++ b/docs/source/chatbot.rst
@@ -0,0 +1,64 @@
+:tocdepth: 1
+********
+Chatbot
+********
+
+The chatbot use case involves setting up a conversational AI model using FlexFlow Serve, capable of engaging in interactive dialogues with users.
+
+Requirements
+============
+
+- FlexFlow Serve setup with required configurations.
+- Gradio or any interactive interface tool.
+
+Implementation
+==============
+
+1. FlexFlow Initialization
+   Initialize FlexFlow Serve with desired configurations and specific LLM model.
+
+2. Gradio Interface Setup
+   Define a function for response generation based on user inputs. Setup Gradio Chat Interface for interaction.
+
+   .. code-block:: python
+
+      def generate_response(user_input):
+          result = llm.generate(user_input)
+          return result.output_text.decode('utf-8')
+
+
+3. Running the Interface
+   Launch the Gradio interface and interact with the model by entering text inputs.
+
+   .. image:: /imgs/gradio_interface.png
+      :alt: Gradio Chatbot Interface
+      :align: center
+
+4. Shutdown
+   Stop the FlexFlow server after interaction.
+
+Example
+=======
+
+Complete code example can be found here:
+
+1. `Chatbot Example with incremental decoding `__
+
+2. `Chatbot Example with speculative inference `__
+
+
+Example Implementation:
+
+   .. code-block:: python
+
+      import gradio as gr
+      import flexflow.serve as ff
+
+      ff.init(num_gpus=2, memory_per_gpu=14000, ...)
+
+      def generate_response(user_input):
+          result = llm.generate(user_input)
+          return result.output_text.decode('utf-8')
+
+      iface = gr.ChatInterface(fn=generate_response)
+      iface.launch()
\ No newline at end of file
diff --git a/docs/source/imgs/gradio_api.png b/docs/source/imgs/gradio_api.png
new file mode 100644
index 0000000000000000000000000000000000000000..7bf1b99a5e6448ddebb6f6c68932da548f12446e
Binary files /dev/null and b/docs/source/imgs/gradio_api.png differ
zX5UMzAJa>lVd%mu$vW*yu(xq;%?A5@ai!RdUwAp+2SZ-I7%wIZk$AjPB3to8`W~ULn6xT?5 zy{}ni_Y<3|=TV}=PVA})Fs>rAlyFHh&W#$+*~p=ARbsUxc<8ce{5lKu2j9g=x?o`z zcnIvWin{R*S0Oky2qg&cCKh^S3mlgRyZ$@Q2R^enFdGMYmBFy4??r}0TW^O;io1>- z&750k6Fp;~DQXL&$pu*S=%HcYh}VhUF@EAyEmhQQ7_@uL%@9~Rr|=aOwj>zIxE{7* z<_%`sK&RaWX@oKDykNX(pZ@+qXP?0b@tXnDIEZ5($_95RMfs5~)JXu=`Y@;-rw1@7 zEdeL8v@*%#J6cT+N%{p|G^h}C#C-RM8{9ds%*F$PWb6Ar(KW|Qs6gyZO88Hq0 z{MF%c<*`cQ4Q`_~lUA+=2W_uKACZ6X`SVVre=uFh6)8)vQ)HcG`1aka+BSd75XRfA zZ45VE;nsHjt2_H`dxWn6wL=GT9^2Ta1CXYz49g+mAZP)VmcFFJM9a|kIcq<=96qSu zy;i?Dxr0k@rc3u~r>S_%>lSZ=YeYF3F0qN z?x%LqGe!t*CCH#fc4ewsHoV*aV_kVXUyWB5_@s$mw^&J~&9)#Jj9iiM7aibFQUJEJ zU$=iyyi8v|Z7ua`5GBa$(FxbptqvK%?=n)F&CgqB>acT3S_4Wi}Cnn6Sw69mB}e;eg;eh9>_BD&CWtU|9)*0Og&Dp;GUZfv+7k-`P0#wrht1*`TlSM5t8Dc8npa?k0g}79aqnkX(U_ zuU?LCmFo35q9+M!^SoKy&~uu@nDV+w|F$*1qj%C8W>4PESoU$H_?q&)NA6b>+V)DU ztQYx}y@^yLAo>`Q=R2A5B0Y6o?`?5XL%0SDpnU4EqjQ~K+45tU^1tcT92KBfR?8K} z9~J?MOWYJ^g*ilI9E9+NDO+&}nA8bHi?@5)Yh+(7GV{oAd-4s*ej+XQ4ICg0;3%VKh@(RtHiin=UfNT5D08pW zSY0=E&72~00;x0ek&hO>(euMwg>|U2UNezr=Wx_}YBR6-#%wrA3PMcDViy^X#;KQj zFOU%oA$ZsOw~d0c4Gqet0MTV=s)*BRz9@fxBf8>4P8PE~$WXoRpsTCP~5< zb>O505CmGsGbD7(Ib3J8DII-YfXPMt2bJVc-H}||0A6uzqbg>eldhJU{MaW{?Oxe+ z0-^aP@VOwNYe@A3l2L_^_#r;u2+st`8BAqRb7i8+$t_0U9^N~s;lLd}{*PV&@8fYK zi_zVjIKI~WBrpy99S1H=U-=~T(jw7>b+U?##_vxT^Y>@*Ilj;Gw#kX~@{COQk(5Ul z3s@i;OaAg1vL=;qR2af{2;X>i5+O+vFhl^Q*6I&2SdE7reFUha>DRUwi#^*}Es)5D zvqjg##B|#xxq3!hqc`OKi4-LC$_0r`S{z_Vvdwa{oW-mqO@9pewagCuKZ-4X@$M&Z zfOXwV>wh4`nT*R;iz4kg2fe`c#A69^{I2QO4=#wae(iJOa(UhV6V>6z_AB0B{--dK z6v4k=@W!z3vvlaywPaVo?Y6JT0~5dokWR^Gc&JHINl|Zck!vb5aU4u#|K`PKUOkf8 zKv5(=dJab!n|k#=*05(qm3Nr1DC044OF0YKAp@hRR;*5Q+-2U4DUd`Uf%}ld9hhQg zA9bwCQF4T9aM8}l_E1tB-l*+Qd_l_@63r?H=XgCqQ#n`9LoDwVz+a{mO`6iUlNz1| zHM+vyT_zE670W~qSIgX85IPNfFI%i1{*24G9ZK$-n*tp}98qaKU7}Zh?R9ml&dyNW z2o#d{mWl#9(@EMnH$`q`(MfoB?dxQz{&|x?V>c4O(UUY1Oc_074CYgd{9v!Xqlq;{ ztb`TrTawmAM51AYv9hHpS?n5$1%0OMbdmYol+=p(57+h2&+l@E{;9|5OAdeR*k|5cu>&TXFe%EI8i#_L4W@G5AM@n3j61G3g_QD(|H*Zrp+vH zMja3r{Kv@XH!mF--6?5eOA#yE;!1507{y>1LZB9z8GfmqiT<}I{NtHavViqOgq1_< zn6m5KQj{^Xm6*z~jM%jr%1-iust7JLf((AEamEVt&go3W; z!G?+o1qOtp1qM+RT$1_+8RSn&$X~>eAtXiK*dY-pq_&5(F_2FWORmU>7^kc3!=AXyYu!N9rYmuGcbzWL5q6 zFPQ0{zevdd>yfveX*j0VsDZ?wo*Umz_8CEG#CjW8;sH3>9Rc*YN}saBg?6qjqqMl+ zo>27n!@uXH#sCUL06aY4=eqE}5)w4_37|E8*+FnP^xuMD#gqJ?LMmU-DP&D?#(=Yg z3MX%VKfwe`GKHn=>P#IQpK{^Q%;9v2gKdfOz6Z%k;(nt`4|G%~{1w@D`Pe@^n>`3Z`|1l0At&C8O1scHYLb^rAJCk5cF z-+Zr@{ZD3x1@jWUcOc4;3ot4F@}>X9FDZzAFCM&V@_#Zrc2Pia37S5XzWP^7_diJG zT_7OwpE*Mu|HH@_v0R?PID%c+~oBw||)1NjF^uL?w53?Qi|98&>l0`u{+hL_p zEm2M2Fc|{!o#3sB!+BbrW~ZNPy&TGj{xxRcp`X+e#^9{wYWvOinstW3hl>q7n%X)A zBbqdx(HZevS}ubbx7bqIl!19t39mG#qLL{5gD`J^BOO~Kdy@`lTOtk*b69*1yEz&B z?o^cGrv(3oOY^744wG63g=5p^f7m46cAzE7i31rTm8xv`^VOIQ0$0CcYpDc8chFdJ z52km-Vxw(IkDm&L!5M~k#)%>%~vt(&9E>Y7Ez&{T?TLer7Vf6?pxFMvD1!(rdbu%xjLyXRy#&3X!Cc z<1g9xjC49m|A@B~-gg)A>PRMMTa4cod^Oz3=i1$S`o1HB|D7;NA1jP--46?mL|N4T zbW>PG7k=hrN|!3OB#A#Trc}!nDNQ;yHchZwXBhlg1|PFPgUy64Jhtl#!7p%zp-`&bf0w6cF%SHP<6hH5i%1a@giLL_YY$H zPKPuHGv#qVzrBCIWr#UU%D&uSdn<-c{vbeQQjLi%i3Gc(yP| zW`pZ@pNRkO4@eeEa;DoKnzfaN&*4|1|6_Ja$2BB|p|aG<3TGgbE=F+GH;07ZP4P}> zmWK*oIaeBOFqZw>L%V6ygXg~SyCGu$Fu_JN9!ZoMQg0WPf4;9M)su_UkZvq&h5tKi z>_12KTj*k;9khEB`3Ypp@M z)d!kk3jddHDdfJq34kE9SR>Dzod6;V&U;M zU!4#hY?os+f&55vQ7YE8^@Rul`TMIbyX#bTNFg#_+|z?<*#|90tvV-zxreLL6H?+U ze*-eb%8xhBDCaLnUKf8Z5%AZQ!;DEjdnDNFbU4A z6Kdi-({y1!Pvl{{`VpBvkf@ielRA*@VJqSBo?s9 zhK(V;zIhvoQ1;-G3XB8*-P!e)@ztUt_r@e969t451{;9%?+C{LOR}yTni}!@OkSpn z`U*n8yo8VqdcDAh`$Onk{THNs4)2a_7T*GuP%U;GU^zitFR0KPqk*3gIrSv 
z@(ZS(aylV&H>%G(?Ch}L7~b--9gbo}&wkDlBEt_e`V>bS%c8^Y{usqW3n)k>sO@aur?~4Mt<;P4Hc`AWlwzx~k%XK>D{Y>} z;}NXy@*aAF572}}?d8{T#^C~ueBJ8k(OVM#jb83W;x$kx%(3p*7 z$UCPuw2JvWUh%@)O4O5Fr9XtM%Gz*-!%iYk9 zZZoAR1GXJwIEYC4=f7BEXI$YhD)ghHY;X5^?H#NG@A!#>VV`R0Nltd+=9oiXOEPtB z*$Z}4qLMx(m!k&54sK>u;{B~iXks;B7y$2^GqOFAH@x=Feu1~W%!{)+MEOVTb^X9v zReI)8Z)p%v9=Ziz?9s+JduwD{?2PFso$p)bnNPk_suF#Xd9`PzH&CwAa@joFvG|cI zC7xDhUk69{R+gpWqb4#w3(azi$m`ql?a94W`LtI6!@>#u`5{88+NfW?&3d**e_e^c z4B+xQ^plq5sg-?kxES~V;Pv^a9^1}*ZfuQ>Ft9P10?VvC+%|a!ej3+{4UkxNh&R2+ z(@l+|&-D=Qbjjf5EYH*+_50{#=tN>t)9njH4>wq;U=s*L(Gmgke`3$5lI?lSs5;{$Yp`6}y{QjK<5 zpsCB&*7?V`P1D7*u@ATBQXenp-|O($F3=38Fed@F-h=e(gauMUW~;z{@%2ZeDt)($FO4X%rY=81}{ z>aMi%Lxyk}w+7aG#eo6W6+3= zw?A(`ktsTC#ZrlXmejrU{JCF0(~QcN@wBW`_#n35(%~`v+DsrLsn=}_aYu0E--zL8 zFjGLE@z7N)C*+HHq#j1R05T{6H7!{s!WOJM&=|zVdPZf%+YC-^dmxTlsol`|)?f^^ zNHI40`M8h?gwm4QktB}D<$!u#$&8S`;U4wyyDbOn9&=?(aoMdo)EHqJi zdsJ-(Ug2#CMfH(dsB6(GkwkD<{bRGN!@G9T)M2GRS z?Q-IY8rwI=xZ@dJm%zAVcE?M6wP4x`#l9|gOTxF zAQOv)@>z)ruOeq*U|d5!s?R33EYv4C?M<|}?j3QJs1Hhzw^sDAOMdc@;WT$}RBnUx zK|dy~k&AyX{8$IXyx)BiL|?9PLzZim zinBc=JcJcF#&m2?ca_o_-ZW}G@gg)~i{he>q~D7$s8$CFFJ#F9q-`SqdwY#SgqDF? zbIyzAN82;??7?aE`=ta7&rN3Fq$C@I=GVqNnQYzm%!ejUsAt(#pR?hm+p%1N#!w1q z(^;Iw6Q+}egPs=#7;MzHJ=d#oA!x+*{=nxGK|7OCj{@GdT7a(95AGOA;0cL|eSv}}6==CX2weH(3h$l5 zs>i^yIx%64<_SO+a{~cnn6kWn^Y?+eq+f-}_ono3scuhwSBH<6Tb$kVrIV7&msC=-#Qb(H|zOs%I!D?&)@hyUKXyHJs&I_*)wRBXUN|@Ooj(^xoSYnvXVrDkq47U z(@pjhZ7;CbUz}|XGR6olOxxdL;6FTO9hAg$iFobk5}q~J2>r_ZIVAop-4+FQj8JQk z%51*ex}*#=fn*8Ew*!2@VKE&t=XxxUb;fO4XkBPKT{j)SPUuT_&n+$a-+eBzU9DDG zI17MFH)hIzQiX{;U+B0t0u4NjQn^h&1K5wc9r>3AE85G7rW2j4@2#(xwAp|dbBsxo zwdtLs)na*D`H!cLr}#MA+g=$-KR;;0z#sx4bebTd--Xcsv3T+4U-=JUN>+9IO9nG& z2s;L#TD7a~PdBo)Eaxa7uY;~+Sd*}*L!>r|SsD==UcmH(w7+((GcPNtwYo*-;%rJP z%BopvzjPH!jn?eJRS65Ep6OiqY!X4EC#)xUaW5cD!)ZF^5?!NiwcaVr1)ARtDR?b> z2Q3g`wcIL?AxKqa25IN;d3a~`T!_;@SckTMaOC^^s9rq#{o7LUeFTe*)sm#}on543 zq<5yI6AFSZ*+p?Hmtk!JIpfI?6a=+W?DdLg-w@Y@hv-wj%6ca&%Jnc;-MJeTgT^RN z&$6Q#6;2=$f}&GewaE+EpN+OQC0Z5j=nQ^N4QBph7(!I4#cI=`Zo!P5pmro>KL`j+ z7IOe?LjvUws+Xx5oMG0wIK2{SkboK4?eF9r!Gj`EyYzmVH#-!AEx#td`MNcfBm|5)BN5IT?yh1GLi9(P#%HJBu^YJQJvij z#UM}KxxecsrpPtSmp-$@Uv))YM}==An6(tkfDw}fs!DK@BmYL!`hhC8I2iWiRi*1@ zvEQfbqDP2e+=}uWzrc{z266b9Ws;K{MVC3)Gvz|L>n2l$1hunYqtx!xempDC(Q}Ln-uBJ&!6?cc?5a2*`&=_kqlk`+lr66uG=YY+wl{R?+V5OG0eoW zD54hc$zulj*2)z|BSy+s@rZ$qV37kS)H6KB55n?Uia3U6l9;t`@H!RisP`cU@=GRq?>pfg8lj(MCDg__cAZ__ zxPP~|02%;0oKn6WRDiz`if-K9Py}e1(E+YCO;;EGa3l^7eGcf|08Z}!HC=bi5_WRB z`=oM;y;eDCGVnDyvE+CdMd6G)N_!$0==W>4Mgkf>F=-F6bZb7J1iI!AMhP2q1L$wQ zWdoT9dG|{Yrhci$se{Aa2w}}$gIA@a1mXCzW~R%r~=-hJDgQq`vO7D}#(p=h`PDCr@*( z(^bQnV%897$=(s~#)oUt+ zX|9kZ8uN*~Tsu$|+gB(pUOOjCnbDW#h?@GL%#Cy1{@AoK(vzkQ$gPNLWbhmfc5CG= zkU0L!{k=n%Mi>EEXhB@T%ic}kd8q%(;>pP171*&{&SpA|;zTv=NCZcE@H3IJ;k7t<-mY#$yF@UvA6qmvY zdV&($FO?!zM|K_bjhIO0Jh|+l7~b`6vB5UMTB|{Nc+tl7swo)Og#b!G+n=;RAIM_P zR#IIvIVxdtyElch&Ttq=1YBfYtX9j%n#6;z8fTDHfpfBJHvrjhnAElQcVcvb!Vut{ z&TX9tlwWuaGd+*3S?kKnqjaSGDrJS;Y$J6swpYcm(1F`+wUc{x;;r*ENA5sdEqFMZo*bb?y>~Yw@Wqo(W zOVb5)ES?WH;S)8kdW84K^iA8z=}rp?3PQ};vJXp;r8!CDbw6f(J5MY-^?@sSySoc> z2DPuP3Blda2;$?otm}T6A%XXTx^J^S_}Tz1eb=HI-t)s=2zd$~qkjK}UUx8+)B-Wz zQ89aJZQ^jea~<0JK}lylmrkA0o8XY*H%|oWjunv0re_SG(8f7TSS~ z)M_D=0qw0R4`x{= z%YU$I=(QTVE7-zIQ34&g1yV|N_tMKEk!Ko<%YlP0>a)|2klUTR0cs4_nLz|}lEfj^ z3a4sC-nAUWX2S?L+3zqU`k{<$FB)IK0ews;;5x_GZTBa;6X4*Vs@_};#mUEh=-%^9Qzb&N+@_t+lUxUuzw+%w$A@PXfws0~2BBxsLoeINwA% z&{uz&2LQUjKn93f0yBMTcAR*;oav13^WTN0<9&k-(U^hb`Zr?DC@b$R~BLgsF z2!c$pz^1)xB*-;v4;fL0q^w;g1Ir4eee{}BrFqtyZHB*D9@%CW2r`;ZoO9&#d=cW# 
z$*{DjVSUjlSgNx}D)H0n7SG@P=Pmr#krLsQL=WC!_v@8wBJA|okd(aKeV~CiL9~bL z&x7uINwhhPac3G}NIsFV06I@oB0)%{RuGUR4d1#3aAPdIEp1nOa6d&YSDt*i6naTW z(SJ}j@3~bCO(9wBP~bXGjz-i&%uMnTI_7)u`$+DgLjMbV0CV(&Q9g@%Xr?vPVTa= zAy>}Qv5#`>Ew>AY2>UzbsMWRQ2SM1x&Y(9>)k`hoY8q}m#;TviI=6^y^U z$6AK+f6Z?p9eig%U{j3?jkODX3XnUYz)UaW%e4~~v-#<=b7N1`<`1b!1Xqbc{o*wO zHlayC&RIGET-j^r;nQn0+iVfDd_+P$S+1)Bj9w*VT`V8?-(Z&R14}IM6y=bPAfM=i zw}{Gj{LqU1F0Jt(Ud)qpE!s`CodXIf<7qy-pH2rGpTcK<;We!Ya6$sgndvW`GP(+WG*I;(*gdP&bM9h}Jc`Og4lqXS5~$oQ)>Xn#V{gxOGP?Ui72CV=UH z+E`_vXZslt?49>s09=!fh@UEs?R;Hqefp^QO z?#ar|`z#VHnOIXE=GX^&>rKLd1_t+QsIVn;V)-qF1Lr!w(ZeKX3H_tk)S7BDXcD z{U4pRzvScIH3_Z+)2)aQ2MNkjwC%M?;4_1oH=$02s^7j76&anQPDaF27iX(2l;X;f zfb*3DR8W_>EaZ9LDLlXL$fSn}OpwBjNY2V=Sp^o2gotD7|EKNczox|gze{~P0=HZ8 zaLv$FQsXE7ah~?i@Nq-h#I#^-P@?VPfM9G1mOFD`oud+-FT2!*@7u1fbUkXI>q2am z0Bi*k673YR18Z^PK+c6T;t#=8_8)02jJ|eaP!^t~$=n*14a20TeQ^5AQ~Z_Rf&VQJ zqu$W+6$3K7JAZiaI_~Io{mP`q?*rcm>UyvMpQBO}l-kBTMKEd@_sA zMD2&b3=-{FX{&s>Jdy-7?%VQbFq@;huq2)fA0n0kGSV*NCmt>^3Ir@kFT%+z?-{yZ zF@UsLW99bEjH#9j{j!$nHIJMc+pkU*Bo=V5=x-*KOTs!Q7J(Emw1pDjn0GMs(fNYJ`Zw4uoZO6F=N zVQe|0LG-p<5{i@`OTJ4J)*Tpy7&@Ahq={Njm&a2*TK+CjZ$D4}aJ+!>&K-)U@8(R8 zblXHfHq_WSmMBw+gT8+Xqjgi|Zw{BbK_heW>@wyYag<*`sg{Uv?s(6F^{QVubU2MS zPM|!Ujs?^k@>0e#EMr{>70Fv_i?!u@v(K z1F_(hMgPoyKDsCcN#}d&Z;0prOMpMtd`NevR2+2?X!rmNakS_q!j>n0*_LB}*H_Am z?77FjDt5y!X+nyfPf>_l@%BCE2Tyss9$f0;v$ov9jFP2;axyFYBv-lPxf?9FO4-uy zP{6!}T;+`T7xbb8=IXuQvguNW{L7`=Ee_0iJs98J-$pb97b256p%F*4 zhAF>A2WYu9JvqBN$sJG5OWUGn1}#i&UH4n)Fax0F&F|$Cp5$Awf^}Nouh1~O)Go!t z?YPtF*SOtvuOm&u=gi_YR!EUB@px1+W;`okX==(x_nC76Nk*u<14Qm=1Wn@jbh>k| zT6l%qdb;%E^QM}OzBJi7o4IGeLQi~1^uv8^gp2bdBKZlMbU4|)hQ%9weDal5diAbs zudxq5CoLLtX)QycH}R}0A95ES4{rwCQ^ebuN=(@AlW|gieXI8@u)J-Y;$x{XthKPP z6{VlfYxRBS4dE;=O_Xu#*>o~ae&G%m564eICwN3v3&Q;dsKA!ilD&m-n?&LdAH^!! 
z1{6e3?6rSlJ<7qpEs!>bds~Q5gsVTrBEwLVvvpeELjNgs;j@;0 z;IkC^Evug#kCJGR7n=s6=OVqCX~gWA<6y*mC`k4XUr3@Q=kKX2m~AI?_lN6NyE_tW z_(V0C0OypJf4MZd1wBW2?&|y*ao5YnJPR{=nGQPpyZIr5^sS35P;}gMV&ItfpF7L{ z@FC$sFuB+1KM!w`Jw86(Nas-zkP780QBD?eD(9$QkPMHo=;fccVTKqu1@nHK>@`4o z!F``#;m{gdz;f#J3qi2clY%T^)U}2Tbj(RX_}O*gJ;V2xhsZ?7*3-&8{7InTBR!pZ zY&nLU{rzV&X!MUO`=37)Yi7TvB+?Vz5we;sPNY%R4V}u*BfJ)JjNI7XmUUIk5Ynid zGq=#Lv3zSP%sxw4y3|yw=Qzd3ZEFMT5!!!dykd8&?KZ zKFnI6`e&I;#N}W~E$NxTGZUG+;MBAN%|CB7@IwBPn6*s~E4gaQiFf(;IRk;qvt{nE zrjYqk3cJCWvV6~tO;5n>%1QdjeP(9n6UKRMwk(MegI3d(7d?F{;?YUC@OH2_uEl3q zgPnlYbW5IMrl2x9Rg_1HEbU&QkpG;H;OO_My(-s&vkiEolj%5@NimUM=Dk0izW=Pz zee91es~vXDa0cXlbC&nZec$6jHL3gd*<$uKYG>*A8jZ@RdSJvQuTzG5qF-GXqJQbl=*&>)i*rA~E zTBR&8eb7UMlLryFYK9N8`?&1AJ)>HIV$xcXTJgS!bL4;Sy!+F~muIRagt&9H>~RX5 z?UG4chQLIePq7JuY02J{HqzPmu?YwW0eobiu&|)l2pZ+%ww;@l-)()n&QgUANt^Ll z=!v|3pTw!J!-g8i$8^!oeieu_Zksd2#L6lQLLYf)*Eqb=Z=)%v`6uHkjQ_oX64EiE z=-0FE@k$%(#Nr8>2B1pi$4+A|3Pv|rR|CV%u&3hQH?1P)y;6yuM^I-2!_;~`C3D3a zHS{$KB7cxJOi~zuc>vLq)v34Ta@rn@Gi05NzDKh6oe}bnYfu^~QN~5RGgBF#?z41x zPD|HvW5Bb09mDR_P8}gpgh^%7@GafB-->@LTGu`nC_7DhWVn=}L`VQkkDxt&o-v@m z?Oed1YKWc6XU|3hIWV{Py`Fi?-5&JTY!7PTkCgd?llpyfQ^^a@g0cFm5n{-GnAd5d zhk0=T65gtdzCa;THP-h_ayuSbm$Mb5yM~aZ77;9YA!s~K&LG7bV)HIq3Ud@?#`#aR z9>@QRip3_BFWwMw4wtZ{upG~o5Gsc#J>s=|GEsnLn8?3=pT?DJE%oOY!MOe2%3tWM zU^cvCtcR8>?R?)V0ypytVkASWl_#DwU%ZjCh7a?}F`;NIsqi3Z(s=KTQ_ z=KYu)v==_(?lQE6?`(y@wwS=brqVReeD`l3B*TtR`JP?ZDy>J1`3K(yoXklce<3yC zyCc|XM>k`{uT__PZ}iz-#0BXiGpe0nzXBKzR7H~ge_%nUtes`&Kr zV<6^zBbvf%^?^L&+o@d|1(|%BD@|@=N|RqmM=|Gfh@z6Jh;8^FmvPNk4OHtE`oFKj ze-1@93~#cp6QgIoTtC*vG#qzY-%a0Yxh>G+du`(m=_c=OZv36kWWFOA?7rxi(!k}O zln<@Kaq^YYQrTK`>IFV7bdwR+#07M2jW;-|S&dd0A3D3%8sM=?WyjriCX32bNb2nj zJ{hXvb~1yd&sJHk2FUMa*!{x`0dMtGAzPV}NSy)`MGvgSpHI*K{1vdjC)9;txCkQZ z@Bi%Q_q%g12IH?!{k;iVK!o`Hye@ETNB+%P{r~^5 z?M$Ss{=ZoO|L}|dzavZ{@&DQ*Y?Szv`ibSV9zj~|7ZxSLTmW{CXTOr_8yYI>vFG|X zUZbK4b0C>382|~r--8KFe_Y>BK_Cp@9U~SJ59+3=4Yz@&q({78rftJ>r}k`fjX^m& zSOdZT!g-Z8VJc8?#m(|Wn8_Te10{J@8Es=i`huhR`nXo}pdll5|Gcdsts2X3&$O#% zuItqSY|X;{zO-<{o`f>y9}e{Yx+2wT%$F};vT$Xv-lgJyezU2NBNH`*cX4-Lde`#y z#LcySc!5&UY=mA|H3SIU86iedwZd|*jGerogp`~Km*#?>f98J=4Eg&P`AKLHZlv8i zO5^AF@+*pJUM+Djjonu<)3Si4aL-B8^~R;#A-h7GP`i1;^Sc_^4jz_ong^qE0>-0% z6Je$}vU7-et)~~FC9>5hekWF2D3qp);V=0NurfK3<(Nc|OLG_zR;9x#<|P#TQc=3% zoxM2jzwof7rG8r#x;2(oGIW+7oMz%d57i37arYEg{ghQk%}_AcT>s0H?SX|xr}yWr z>+b(1%05-f_Bcs0zY56z&PW`2_B;2pGQsx}`=_!&VtWv)!5_V z_U#b2sp3uGfurSJI^%VgciGrdSbZTMx^>Q?KfU@O0FcJkgvDJ<3y3DDOZQHEX6~sEjwE^Uj_BMnXgP|SUWcU zH^&nVb&Uu3&HnV4)uyYR!Q_qN(XajrC_mQzPEktRuJ*=@HQ23YDvZn8rZqO#fYBL` zftugSSRSZMBuRW&AerNTDIH|D+Jgl=jEDupObtgWc9vbP=C@l@xK~)cKGc_d>@%; zgq)vu%}}~qJjb_J^0D^!%dc^HS&4n=7&Y+15gno^Yf422Wtp?@B)4j@0wzx>#KvLx zR%eY6>zh3jqyF;cf;fw5gk(Nr)N)!AJC(jg#$KSll8CHd>YFU0rn?3C2DF00*-hkK&C z$8Ph+!YFZa32xsn=kJY3e*0#Hd+XbUdVsi;u>_Mp9h6=j(+Di@mPQdRzi>ZY+{Kk^ z7^R4J2=0=8ZJ2YkdKGAyu1okcO{o#M=~%q|8;!q2!Ly5l`eoCl77xybcLovRLcmSt zM0D@jP#V&#WJK!0PkkQ$7470<>hM+!3qjqbHq)rjkI4tvpLa?~_JG6|HcxUGaZICC znCDDrx{2g|HU{~IxcfQ@EcCxIiaKf3jE9J=}h!Ul@P2Nd;_Y&j6rLNmlj|-tpqX&SWqYKlw(SO?fj!}sk&tYOh-?WF zJ_R#YiMM?}%KureT+<*WOZ$Y4Efy$__!Uz;Din-G{y~A}^{PaF0Q29Bps*R!cCIEC z7;isIHCnK&|7*_c{?eT?dx3;cz6P1|zcl0-UJ07OMsBZUh_)~~i=wx*Kufn1?q@X! zP-Imuk2boz@cwY)DYc1RxuVrlo-g~lK8OH6mEY>|%f^=gH^|Gz$r=4i!zilLpCCJ! 
z_?rWL=@0I|g^NjP|CScL-MQMdjjgTJ>w30(hw+o=1_9k3u8f2YB0eeDxUbq_VIW_r ze6pr2Z58RfBt7CWXp{|K^M@n+F!YXBGBur8>EOkTcFqwx^*ld61h523pw@m085yy|f91gL< zWB=>o6tqYqH5An0-WX&Q`@*p=7$5x>^y8I#kiVsG zQLRkWv9xUX6Hd+~tBL$nnmMOSpxta!AcFeUszgvA`CrXX0h2{;u~Lnd?b)+PWwmTX z_N5*2om}v3{7Y{)903;pdf3gK&H0?Lb1hlU1shjK7+>X0B&w;J_4Zey1HW6GOuJKD zB8P_5hVOFghv3gI?Ic67ljFti1FX9TK)soy2-s&TrV5VT+Q~?62aYzVy7wM1K1WdN zZy3T~h$&m8*6m|2m*VICpnb0Wr_O3xx6O#KVwU_v3GY@OqoFI>ezsJ;RduG|B`PwI^lU%D; zq}na+m60^U)9KNj{_lp0Ke7g6X)r6a|J9w>Vw%6hjL(2AwKmI1Z1)2b2K1%^^^98vVThkC2UC! z32^!6EE&CC{P>JDcHOAtu>I9yg@>hybWPt%VuO?P_+kk?L*%Ba(_Tqa+3n>T2AYul z1=7DkTQ^AERA{|+o{9ndMATUA%5_mJwmgk7J%iYSRkeArtII}jEE!Rdaj{cl9XU~H zy8-2_4*``cZWbenU0$i>qZ}N9U3` zYQ#)>c`qm7Yz$&^;P$PaMC)Qq-`vn~8E;_!ULWi&0$poop7l0kdPEEDSwJo(nTbXa za@%@3o*77xXO;m*Q?(6ieiH+u>J*N3Q{_`*!C7%KS zE+|tDN2J+K8N4AGJ}fv!!}j*c^PbXN`%NlHHcEX8T`l`ItcKFeeNNA+<4$q_F)YcI z40yBZ=W08wzw};x@&_&Vr;CbXS{>V?m%$w2r>(__W&A+$EqStbGV-Mrd0yF9t$#)P zgvVm2JkgH4u~095@M`M>@tSmh&RHmgqi(VFvHn!QiNV9GTxVE~vV$!yHMi<7m2|Sw zYSkD1Z9cBCY2fznlYzLI4k35wj--lSG@W$S8z%6VX^uP)xU65!TlSYVT^NZjCcT*R z7U1n4sb`&WkI1MJ8Y@O;#F0xr+X)4WhuMaei`|S{bMJ)A=THq)x{i5R^cvp?J{Mg3 zCVRt-5IuFT7{>gQ)0f~Q1WM9=OObonGa~h2Z*##>sRjJh%W}Zom6RzYy?ITRq_C;M9ID_N53OY}_s1ykE$Apq`$Ot0vjF z;OQtc0dA>mExK^G6GO>c-(lFQ3PRgFS8%v4a06LrJU%$2`#nuI$=J|`YFWQvT41)Y z`HshRTQ46%_|PH6Ceu%!eD1R`oxFj8fdf0SIR9&y&taueD<`M3r6CizU9j8W>dA@v zjo-Qo_&g5Pe_FsSwkgv7i`Pf=&%a zKC2Jk5He*hqt0}_-|)c-`F;1EyfZ8Q=W7yi@_hCs05#=ve3`eM`MfWDOMCI`i$!L# zF6=YQQngS;(rrH5xz}h!pKiW{jBk{55{E8}u1$Stg2tsfAVv=kd=?)9EwF9(Y8n!A zoixl*SDfou_fm0*+HxI1`r7-%P%)+F15@{&5{cLN=01_)TARTL#Wwst8!cGj0ah|K zQR}#GmwJ2AH%s(@MTQ!0Kpp-`ngI9x89X7p1q*e)&j=J7hEdPej|`8OK|NjP`=bRN z=A$$fvh>~VT>0)!32YWy+v86|_X3dDyIBgN^>el5OE{1}=u2mBJD95YW)BE($B*rr zcoK{e-xqNZj{;73wkx|7C2S(vKc2vLy(a3Jx?*EcIm0^OhpjK zP=9ei6uHkE^Q)16m9miq-IyiPT7I=UlEf64;%6js7vnpwW+B8~@;J~QV{Ot;6Gqsl zv>W-8)mmE%41Im6paX78owqdR4Q@PMAO!bP zcD0q8X=)t}%sK>~ZSGH~#BB~F4}@lUvq%}b*REwu#W7k-txB+j*BhehOUJ&oE@s%S zO1v|ix2X*lagRy9$aSEXEA*^u>@LgXm@gvWb=5#Ck0DDUU5CgiaelsF*A*PCl6Gp` z=&a7Qg-&x{BV&&*RxC^n(S^$>|D4vU=KPV-)J z9?;+S-bXBe0xjRo)e>?x$}D-@FxL08V`{JfQeEQM`@(ga`ehl{1rT&)hSC6}54zsn z1`L?1p}TBJ$^+1bjzu*gwVLLu5-$2qE?GT3Xh7Yv^BDev#LLb>!bPoBHR-xrFvec* z*P(JM&~N{Q$_36M+T@ma203^(lt$SPbfcQCf!q6__CEAN^HgU|yfpzaTaM;M~t&z?FCsA-)3g&IxNY_4DuQ z<~<1k^!X(=2NX9F>SJCqIjN4FVpq`?CFlUB z5$GqHwqV^F@Z8q3 z@)|y1ztKu87MoISZXr?$YC!H+nfyXEnOwMu&+KK42M`bv*!W9FUrJ{_cZ`d?03|5F zY+5(Bs_Ih)ycUOP4!Ab9&HDpm#L$Wq9Q^I4Q=`uA$aCof?#$i#<(n_ywuiBf?emoy z@S%X!^W~cz@$VBF)D>`ps*~t>c`u$QX~&~Gm&O}0#|shQ{{2~}1>?N7kH-qgoklt0 zU89g=NO7@TzWm3AqR}fjrD5vFBDSy|DD}>wz|N!niFLF`FUAk&N7aK!pAoDWY-AUq z(9d726+I-gjE7O55VH5`!wy_QRZ7uxcBg-Xz=19%l^ zYyPV-tR3wdR3zj*a)YIjTiWU0f2AB|n1-e;xui(5eG1TcA&QyT&E96^2jW>EhASUU zQYGKeT1$(xGOslG0oaz)s08}jDHR2qddt#3|5Qh?!Y*0){5oY=a~kfGzkZrox9rM3 zJLxUU_ae&*6$y2lNd6rgM;bs|UhQvlWWM{}^Iktd99Udl@xwSdh74gf9lP_Z z;=QUT8AVf>M|D>p%S;Ps>hAJyr|gaDyUiJ#j*!`U4rWu?5uf)^F8SQ7Ykc?^a^X|? z2T|Tdi~hY}MK_%zeeX<<8vIB6p_!uS%A4!XK$fjAbhBOCJLj4`n^YEb&vmJs3#5f~ z+2Jg>N_!+~;a4PBA(^xKCvKN)WDVsbP7%T5EZo#i|G)qfTW&Ea(Ca`n_hr35Yx+7l zcSDY1s4U$Xv09LW-~3})TDPR@7iJ$T>?pmt%&3oNGFZUdN>bpLtthmk9`Ad|^~L1b zUdX%D_GXU=bAC%8Ys$a#qvjoqRn1;nnx;i8gm{4L-TJ%}fi77h!#gL4mRc~DirA-^ z(}hW7p8M4-I!gG%)%(|xy{>h_kiexC8UJBWMssd|kY{ZJkj zDlhvn1zNrwri5v<=&6|+hO?#>!?^DaC|`0%ger%T=T6g+tB3g^a24`nkWt_D<)<0H z`2J@ypXoGXT@cP6^G#d?<55J)8q{XtYKBU^AB9g^6gGjItCH$$bqa#5fAqQtENX9A zlJ?vYR8$r-&bGY&QVlt;?v~*?{FA$r?0(R^e!YPOAw}nWmHspoT6<<^wW{NNcp>FB zyB1%QfevwOSA32DX*b4=dhQgz7tEHXs$X=DVw>#M`|=&W2tZ8YCk4tGv`jC~TE(F? 
zqyauh!KBKf5cLbCFKOk?oh*4>$oLQo2(%GCOl8IwZ>jY0D)f+*r4fW?*6dsRsSqa_ zV2eD5vP1Mh{v7Y{_^${jlpF__kaR3JsEeo-r!?L*2{WI9pb_7$Yy~rTn@?xj;{kBn zQM4ZDEGpJkqxJj5_|6kL9p1!A0JU0*W10L43AK_BSF|m$T+`);9RZu;5*R`S3?8{iOCFG&nrFl2;__T;~1Asz$g}!wL&2GJ#l!_Ml8Ka`V za}TKRzB}6T6PNA=B9eQaQCcYzyI-$K7r$=#FdZ7Na>EK)a(3&L+7J9_-*)l(`8)q# zdQR*5a`OC!Le9OYSH2^yhfN}y1@_Nq{z`1Oa5*<@XNa~i-=*VK1yqi$GU zI_KX`Oz3rO-Z#l|E>#M~x>7KBKlLR#A)AqY0@LsMb1cf(`VhAaIVM%mhx@{{LjFotn@f`*2 zy=&ics;(2O{7!l;%RDI9=ZfyFxnt0IbJf8YG;(h3TZ31NVFuW$zLkg|TxJcq4khQH z#-rr_iJUyU*k)n-Wjpa!-A*9*g@=cST|7!5JvFqQ*cnxhr0o;L4n@MI8h$DyC0uR) zxL@t7dty;S(b-s66^s>1?bwn~VA++C{HJ2(?Ec$>?a3dX&H(|wJxV4utKj?u0{T8r zPST+16hvp#6u~^KIrBCS-ik`i*7XT6B*a3e^NYTFkc~lw*kRGs^G@=(b;)u$tEf#v z!J5614#{*jD4zW++4_|dK&zldXGdG`#K0iuoyIi2E({+o!a*WjbC=)aab+Fat z*}p7erC}5ubon^3i|M*koi%SO?xio}a%bAbSZduX-J`wM%D77dvh*Q&UHr85v-m8s zV^JU?Q!Op}&EbZc)aC&~tVU|WI^ZImN4ZXop@GTDBMWAbNC1rf5YkFVk?}C0-u7gE zEIu*U5>_hSo4C4~Y~x4Kj3ShlV;m;duxK#F7C2jXY{tIL)}mf)ju`3OK?N!T-1?mw ztE2I-0uM&7ovQ-678VYac=oB&@l5>4%$T&ZGK1*I-9XNWNF9f{S<1eG8=Cx3nN$i9 z@5|kJ*M989Z@y2&uevA9!voxgEKn*ZxJNi!P7d!5GggR{fA>qk$wWiN9Lw=dZTBwW z-?(O>cxI;GPFV~UR?VcrmV$oed+*t z^}P+HxlaZ%MbhLXYmasMM|s-NAI#OrHBZdjHOgLU7x)O1=oSRZ7o6489VU1_0MiADlyYW!_ zwjPx<+_a(g`0YL_E|Gg1zw(vYlYl{mJfB*RI{mW-+wWz8qA%->cv{+7&KC8pUv*@d zdtyT}K`^tEmAO%MW52Lv=k>drbyV+eqP@@Vy7l`G)?yF2xgT8i1dIjv(h9s%7Fl1T zXR2CR_Vpmd|5{A{U5keu$a=HgO1j=c<6GsF#3xfLd{IMgr17kBJ?g*e99I2?^L&QR zc5`qn<7Bg@T~xxBZbnIuS8`O$U9KED%O3}06=XSuU7DdDALy;J-F4o^2)&bVM9P}? zQ)3f{gQWoxO?ALlrL-5EwUco@Pd5&i;F6IO4=CD^RgANrI3Xd}n4*qnCZ^1+) z4m!HDm|MdTb?DDh%Y(Ot%3M9MHPZk_l%Pvu0-0)A6!~7!+0(V0Ngf8Z5M_abDa>jw zR8MrCJFJ=o-uDgd^f{=8!1!mc5iR#;t#5q$MNS?8U1&F4W0I`4DdA$u0&)xMF-KxN zsdttJ6y>tvPAzwvjvvpCa7B*#05RjpWWm#;U9IN%3Q8c0xUHQ?{K|C9N_7n&Hlq%n zUo`21eF{X0Dt_`GHT3JW!}b(jFV-sYk*8iYXK&vXph;XEO3teD)gB+|1>w>pP8DMP z4klT`E&F2NSzzd*??|nY83GiVyO?%<(?80Z=TzXbUlfvNT!fPfk8&rZBx*U`OAGs{ z2uU4@XRjICC9Yd?1iK4ZK{s0qYKUh3B+XAI`1P@UEdLMDN9mg4bW$AVUB9RQJ!{q# zQd08GZ^oe7DYylFP5Ptohmd5)OG*P$xsB%X7R`3tc0)Hb7GhpaSqS!-{`9lSNSxI~ z1d32mzwp;G@6LzW(rp?3_p%r>%{OB|96bRodfE_&8l+lr5dNyZZ@a4vf4YkhEPm6# zYdhC%_ektRZo1p9%j%-*<GLXDTI4Gu;p zwn$UzOk}viaIH!g6nmF++y{Y@eGhrR#AMh&FOh|_G|L3%u62kz<%?HuElu5KA!fZ` ze<$cLVTSo{PY~+JwCCx>SX;vIrIfMvW`0#%M;||sk6-ki6x6m6}x*= z*(hKuHiSoz^cIjri&?j}!CV4{K zPPtRAc3RG!thiJKwuW22t4E=*xFIjw50FgXUA~9TT?a>BCH-(UERbsVY=f>gC~R&SvrvDJhq}EVVo8CxdGi#fa^=*LF<VVLa8C}#q&}l|a9i+-Fxcwyt)TJgxVyQ39&C7qf5jLh z##h3s*wvG0F$*BE^wHLOc-qk@jLJFD_Xf()++Jg9(Gr-*Mm<^(!1pbwNZ%c{`|08v z>Hv0cIVKLHCnJfiM!qgI9szLog2PPD+quIAbI%vPgqj7~LFe_|mTw9lzDE_k`O31x zceE3N@;xrv-v3N~#G1s2YgSnWKt=6zNxe>GaIMIMQ|Z%StX0-|1uvniXkDDXa$#G0 zxnv4 zWUa}E7BaGhpDAe?JN>KLxZ2RyqE-o6_8#0(YxnA$mZfH+zmte1nf*b)%9g?Y5R8>7 z;l{Q*X}w3F)UZ{ipE=*=>DBe{P+dZk2X(brZ$RVhFt#bs?+_nK==s^Dute@43CdYJ zfw|M>!F*zN<4aAcCo9r6!Z|+>yq)_g0LbF>^HXpw7G3BrN6_4dcyI|GZ?|C5LjSD-LvsiIHsKb=9c3@m7ug8;1a^=8_kx$Y?L9 zei^A{*M!1o=Gv7FXKaqe)ANo<8nSgG0|?Fc6LoU{5!W^qA4B(qHS77P-T1dvh>lZ~ z(Hq;vLMO>AhrLEuUybYDItK|^h0)v5byfETSD~f-oftG*x}nJF{F6p_6vn;Q9D8iYBaQ(0 zvhnDl8isUgPPjVus@R#_TGWvDu663A{TyX&{B_9Pz(udK#Uak*ZKqt^{~VOu$|VR< z^8vdPI<+Js_I%iSL@xCFV871VzC}Nz?Lx=YcU-=~VP!52_T?K_ODfs>y}Hb_(=VJE z!n2p_^EFl+9$RjPX`J?zDFT|_`JEWaky95{;TWA^OI06g#UcI3jPHzkJC+1kodn6T;ScZDEh(PgVd~K;@kz{!dx1rl5{{o=2WH<$d6T2a-=Qq4LvDV%-%+8jW zdJ9r+%OoO~;sYDSXvax5bNjt9%N4- zTQ!^oogh_m0Sz<9ZRP;#MK!&S;IyxvSkjIPQU!vUdvQRO4E2mz!NJC`Zp-R?wMUf( zxS3m5gMw(m7ot?P$iI}V18ka46>@Vs^9!F)4cA$nGy2qPun-&Eer8bQ*yLnJhZV`SpTl3YXsh{p*anPol7O@6kv%tF1App%Q2>;wGXFTpbowQCzU#J_o#-T- z!X{kBb_*ZK;(x?a(3;H;3!b@OZZqJ%c{P!5MjNLtNCFzy1HZJIxpe56(hwxcye|TA_{YKO2sKyYac91m<+ 
zmY++KQmnSLA1ozMbWd)(9rbHq=nd}TK@qx&k!a@)?pP6l^LCxCJtxG#5#%7(lj+^2_?1^1 zeYTTPHT}o{c2F;lPWX-IKUWeG$r~!2EJfwAADxFT<8t*wvzZx*E+qIa-pv!@Cdg#L zZm!n_0&W}BsZePK)qW4g5)x=-neP7xEZiX=C07AlLXykN#HRLqLfnO+9hW;I$5~YX z5j_6gquNM|ynea$|6qF4R5t{OM0k`w4Z~+v2B`FB_(R}{1-+cQD?P!{93;+fUW{3M znJVm6M!r&(vHJTrfFi*x5-c~aPEPTBRTdb9)H(6E!R|~{*9NU?zC|;hKVf3&=vUVS zY$>Pe6P48t%#-A=9B#cLJ`ur%D=}TfqHj1wAZa>g8tl|~Dv0bX2Aa1F^ZCnfk1$yY zaI2;miT=ABvlk_yv)U1-g(=EWeLy?~$G^OUk!bQ7#P7pfgqUW;FLo(69`Ypk7GECW z2j4Dw5Idnl`vhQ9dhEtU-Wbv_cbE>Ny7a^rVdAbG;u~4GaEs(x`s@+0P%nS;h)fPW z!R%KkA|ajA6N004hxh}Z6x%lxn_;6#N%%WNEI%0MTxM*_z9`iz5zk6BVJ>1jg z?LJvpW&MlGdhy(4|1B;z_OS(v;!;UQpdT6El<2?PuYVJ>(f!R-FCrOgtr|fS$))BO z;%W!5al@(PpqjebCP@RTs|H@XgOmeYk==$}BbFV@ob}ry4xCmyrK{X-{9R|!&kA^xQA+h5_lzc81IM{Y5v*X5X+B%+)eCu{hM#0c|PVnBYMY zmw-Ro#{-h4JxseJmG}Lf!Ybj|+nw4d>X+kf#6lcW9=dPe;(GN06UO_!ENg#mlhr*r zmMcC;*Ul;gWP0ZZh#2&V_J*8~fT@cJ+cCHY#@A)(4nQK2mVV|@JadCO!3TH zUBG$~#n^Oo?Z|guM!DD>bpQgz;aHpfho^mg=rbJ~bhfyz4D4OJ(oF~U;-O3M**XKyGu@UsTiWup`5LQ@hjRA0s{txrG7C4>mk0UyA%bC-F zM<4Y9+ClQYgM+ z%kuCxHRs$gdokbT%mR^Mtt04AjX^wsYrQ7mejIgY=zzpsoq@T`0|a(3mYG}?-G6r= z?;rZRuS$r}t#23c!B?1@*4=28)vTe16*g`#63&%77dC~qTO7{t~(HkR{y zJ4i?5n%e=bU%tHMqI6|OHq%`u^Wjx-*h&6VJl6|CU?D$8J||haY4jjD0@P!C`{Seu zZ`9p~^6cY2t?v0x-xQRcbMOJ(^lHQlzS~MwL({r5#%Ty~Q;FwU9`9EDi15*uvi7%C z!m~V<@{W&u#lZL3H3hRX;G-7eDLmWY?h}#om5i#N=4o?2?U7keaA`fAKj?X{_XG0x zU>27ye1{yT8>umI13GOYqad5nm&)`ouxsq{3J{>x)qwKp3pz}hZ4Xe{x3U6iXxpT& z_qieQKnOY+dEh4gBrr5LlQV`+YpfIFz;Gm|tj5&1xz2V2;5X_Wk3KwXu1ugSdAHgD z%%-%8c6(9nFc zRX2v4;oJ{^icbU-C4ovXAZZQY+P9MIm-Kh~^QefvI`w)#54hxbpDPR66sGHkUMsNx z8tw`_aMUt?$_-tJ!qv%TIgSlzlC6aW#TKCC-~aF(haDzeUX6NWqo+Xj&z~Slja; z9tevBLPpoq8#-OKvD*X3kGo7AdEmUhc>FVYIaMm>2nB$Jf=u^>bVsxXx7|X}n;YUq zr$Fc3nrP7sF?Y{xpO`)>d*26oV6n*a-d2IWRO=dI&fSHG@Ewrfhwr4zd_<9ylC6PeWZ;&pz@S3gLJmsw~~MRz+L zWwYGNE$>N%b9pa+>z&z}{D0Vc�!_t_|=}1Ox;u6s0Q)2m&g-gD47|^Gnw9a5(2a``-87 z*S_|(M@=LqZsN@qm>k)J4)lShvYDHEd&7hn2^}g{kD_SKW9ryGMDsV5AHtozQGiyy zyepTD-O=kQ_a*2jGFrxh&oYhvL^5HF{G`n8w)$bDnDTPV&ts)7d^L}x0RmCEK6(`} zH_NtnJ(d*ZoWW(66C#b)f%H8w>3Yf_MRs%iDYOXsA)n~=LOhmV|HS8bbtUgkjH@uiIt={)@L6@wg0HZe5MTD=zeNCXoMHr*j=7W`@X(H~LeU@o1H# zCiZ7rx3RjUMx#1#bwfMMJ_lR-U_Kvd-V)E%uNJAVn?AS~s}Z(-;j&Q0j*5m*(P2cnzbUeSiU{{oDxJ4i%j~$_>22I*YCUU9Y}I)_D$* z)XF7*!Xz^LHZff7PSvDd6WQ7t1(}+RXVO~nl6~GaTy|xT(;R2VGXYA6s9&o}oIBz3 z);XmmI5in8xdg56VxHkmdE}mW?3M5NA-h{aV$d(1;e~tX+|;P(vRW%G{{DP9ao)za z*G$-hW4o~F+?ytGYsBMU{VY}Q8qY0h0>mr850-l$wk)+bkYX4sUE;^R zvZC7z6YGXRx8e@Kugc`&iy}J_uSb5$dk0 zsOPcg5Bo$%T#+u{9mcd5>2tDv@y5x3E`XPE{1{NlgyTNnv%5fw3{$fM05{_wA_G!Z zKkMj`Qy$e4-F47oOjlWqijJV(!3vDP$?9`_+RvyZyjuF{cqg3b$+fopi5i zt&H(iqe<=xJB^*f0LAJu{fshg_FK^1Q8UGkuC8Jp5JcdSc~1iK7|%mY)F-Q^K5Eff zR&(*Uc@Y(%i_{OW(Zi1qVR+n=H}Jd(n-3*Ond9tiN?h-Fgv`a#x33Bh`6uY2#e8-l z5P%hNEpfycAU=!Dj4TiANZ6OFZqihTK7L@DZoz=TqU1rL&u;eBcnTb@y+itC%K<6= zvJBxXwG6)7W2uu50IL>nj!7vQ+knKg-_zM2?;#BFp|r%6oFuD(S8%@D01;yt9JtTQ!YhXa3LX6O|Dy$v`9yK`U@k1HXfcL) z?ZgdTQHCj6UmV}~nt_jJxYA&kzox^i*Gg-;ywT##>T^qcJrIA#3Onikuu0ZUs7XSo zAR}JXs%Q|PuD23jw=qM!H0~meWqD3Xdyi5mHKX1GqbzcVCQi;xtG%bV^ z;l8?OPGW(&huBUHj~QYfHDJ;Nx>+0ObxIqh1i`(Gkqgy5kT=7(g-Y?@duV1gJ$O*#R`0eaOR4 z421eMP+(w9+#C=K7s+JCu%Gg5R+y2UF9GgB;FSawNyfNBjw>#XjTiqKWs0>6=3(mN zG?qD@7Z|N}NDL0JcHU_o_@%t=F~@_()(Qe#7fr?2j% zwNOLQ5p0DCchwS=HcH(}I-eiyDLGwtJfIP;*je=-lG=w3-_dtG2ZT5F{^X<-HMaK*4$=rBA5Jh?Tdl zW0_e{#$#3ZH0#JIi46bu@I-|Bx1yU?;^%q8y|(+ki;hlk%aMdmy`>C=?2Cz=5bR6$ znfe2lu3Z4i=%4q7*W<3pWjyTjT#ihr>-Nwf23Mp;(oyYB)#Xn>2ZV~|_6-yh z_=fX@PW}UUT5_ zvZqC&iDItOSUv~}ip|&|o%B!~yTAN&t4dW-umT`jr72?VtrCgA!T6g6hh-~IIF1hM z_OpZDMUWcSaGY>FJ07Ca%WIqRnWQZmpc=o>vEVpU5BOLh|98%n5L`v3Hu_RedEHD= 
zwPMyWn|VtIG4z_z`=RxA`6BsIKob7u75k&-$Y$>Yf&8aOM*C~5G#vnxyOiJww1)ir zCICq&xjZvJ7Z&geRP9TJ`bSm9ybztHs`}+) zygvbnng&nWA(3L{&4HvW&_}K`!O!dt2+{>@=0`s(=Ng|pf?%Zt=MuH~j6wX>o3^9n zD3*Pf>r*VozXaEnML+zuN}*5uOjB-v92jMHqd~O2V7%GGdZ;>2_tv;s-pRz{drz)Z z(Go;uKK1uKBPwvhQg(kCoa}xF=Us2_F>O-~XMHB!D?Fb6^dMHJXZg6^jupzd8DFpAofTIpaEf{d=;5&bxIwRHtoJZpkIJ!_9C^Q6?*U@BKnktl?2j5oClQ*hJ&BJXPg@ss z-46-Zc{96U(eNL6`)M(JOpYQyo7x;EOy$J@cy+%+e-hfn3;rbSDF)DiDRQ=v9y4zx zlBpKz#(znSyxD0p=YH#H?h%f*oVh|;;yAsTQrn!#F(`O9x1-si=kl6(Tsg;Q%Y?Vk zO54KGbm7-IpW36%uM+@zVR&M9R4kqU3-G|}UAGJ8RoMs|UHMNk=#o6oQ%^yHY>W5k z+!torK0HoGe`H)Q>!NaArPLuNTL734DhUv2m(M~_A5Ubd?&}m5#*vNmbk!K31q*gt z{4w-NEY%Y@VBj&2A=f0N8owv@Tz*oTWaSjBMvO>vQ52vbxN!I=ppQQ*VWR?J4_CGs zV69nydr^p(nTBd4i~OpCMRtuS?>^L?rg;t7$MlA4q3u7y>se@B=XpKA z^y~ZpW{^O7Nr_2}&Ff3~A}XPBYW~rAUdaj|Fn%Y`Uv;b?;D>c&WaV!tkZahB5rT); z%S4tpbb1)9smn;aF;7VYn8J;rK@+>XOpe%%4;OoPFJuPgWFwyejgfA3MMph=MES7O znqpGkMIT%X&iytiWXvmIa-VmnXruCkM1uSy$$xsLw$;fr7oT4Iudvkry=T|A3aFd( z#RLKHIfYxZuae3y+kkR~MmvWr#g!&9%={xiM~<4^DF8j01Dcl3086)|S8Wqmt1^7)YB4L@@ao9mp{ zcjPe@&}9Ei_GoH^4mNaHrVq64$JvI|0;4z5G91~^4GibC?r&X&taw#9wyy+b)elIQN=&jNM;*VudeJ`ozi#%<4i~L-6~WiHoHEdf&bEdUvr^q z;IlDZzs}F(7`1BL6lGWDshmvymrP#yjRx-!s`c*nQN~W7yu}jT=m~J~yRON9sA8!_ z-cMXdu7|E)bl9K!VG-5}bi5MfG$6-biuM7>h4X>TQP8j6JT2N6;1tKCsunR2g#P-2 zHTlyx{Zfho;kHHe;cDcEl#M|rJ28NWYqrE(PT5W+xodf7cHI0b-XDFtT)o%#VCDnczvu%WzG!hXk3C^8-x9@N9 zHQBE+xL!Dy4+AsbP-H4_GBYfMkHeV&-m(-T86tsn1BAHeW!_AaS#%XGWbJq(XF`MH z+fYn%#X4WAlIOLhR;+_XlP(F3TN}*C_E>)e1mXrZ{#wd+I4^Rk>cu{0oYQ$6V0#0? z9Z@58m6c-YRlFcTn}+>CIqD?o63e#j8%CZX&du&=(o{t(<>~7|uisH*_SW|gUHENN zP%l5{`0NNQ(p|ex~pDor;*eI}ZWzo9!2`^x^h8 zKM9-J8;lT{#&hY^#WAMNazKr)V2M%6s|4cR96YS?H%*l>=k-(v;_7eZsGYJZ4lDyN#MThui(?R_$7%#T}e@ zkmIwtwe%2-L1j_dDkftFj@)pcao~WA4OKh9V7~&wCf@^;>AOY>O;2I~DZDt_(tqt` z3{cQO6vz?bC>|^>fqqEXyvlW&Yb|o3-pHCl5e~G{kHtz!^8B=JX-J(Pcc}jAx>}V` zVl&=~PeuZg1JP#q`dDex^k=az@3BI*qmRYh^!E%eS)9?Z?kmnVO-tsA9(+npL1EVM z`A(5-wDeup9> zS=X0_Y7aCO7(Onb+ywob-@~gvCV}X`KB*BCrv0k#Y&3=}flf~8`ZQ(-?HUgd#n1^} zfqbv>IC6~^w_^YdtpIHR_?4rdV%L7}3IE+%|04aZj;Mx)23x!YxJ(|y59F2Y@1vqO z!-y^Qs|V8oWf25wB><>V1AR#EAzqHzO^!ZRPIo=MiaZ$(c|zo**(xzRe|)|LitoGC9g3@t zSaGy0&>MG8IECt-Knm3xwO2Tu*F>7p_C~y=hg(x$&Zp5_tWdKJFn@D?2hOwu5B5I~ zm6j6MmRo!e za2pURNGrYO(9cnI^1dAAY7#fT+@^R9Z*OsABbwJrPS$Pa-*v7gC)n;&kv5mK7q*{j zZz{Ij(mY5iT`iuwrKLr<=EnfdaswcI(b83`Fe6*pA^Mvn_kUv~>#uJ`&~J@3mwv^k zB_-8MH`KKUOJ25$Sr*~_#u4`ZJyFeLSj+5ygsO~_>xH-P*?%0myG%~)Wjsl5soXmE zaKh?Q4btMC^lKq6%>3Jt;l7(w3FiL9pHW;OWo6|zjql0a37x^hNz4GH$Rw{FsG-n4 zpS%7W>ePQ9HdFO{%sg^9^EvwpA}t6K0|EXi^4R*^9aVYN|H7yE4+4ws(|&8|T#kEx z>Cs;OehsJ_>jxT!!AGe+YUukMIKqdF&OAWwqkT@n47C4VeD~y!PnU8&>p3algVRjgLWOVC+KxWd zOC6y5-t6je!M8cKB-&l(o&AS_-h=mWi4i&$52vrvd$P9yBde{Bj75%^8|~@aDtZ*_ zsR8mUv-#=tEbk@Azh1xHT&=V*M58f3ubh0Bp)Z1D7xG=u>T`FjNH1pJnxq6CC_+xk zZEq2l3aIj*6Jz@OYT^Is!v?&4s$@fD5KBk>vvz;Xeg2&4i`90buI`H$^krW?L@sdO z-Xy2W27-FS50%5g?34#&o>42vu6=!vfN^uZ7l+WiijD3HUO1bdKah|sAmPQB;Ijk>329newEX!}DVDB_}zpW`{-sLkG2 z*KUvHyvp{|cqFawc)98p;h5@HX^g(}9g2i^hc{M`lREdL4X?j?tlB>;FbsZ+KwcEFxd@Z@=w9&aTg zPs%GsBs+qJUR?yvmfrZ`&$5z(dgmfz_V631268|tC%43Id{|l|mrGKFS9d-h9$r)nuyP&bNmaz(Cmw$Jg2heKLPKM%gr4 zfcO)0f|BVak8HMQ%QYzf_A*!hY10SFoP%T6{PQZ`?nM9-db)t4=Qv=*F70i6!p&XN zw8H<3?p`tCL|dEm^dXD_gwpK#V>C4%s^jtA_S!f4`2YM6wdWVfydWdeLp7y-evg25 z?k|88_elTf9+_wSRQyY)bIjVW09Wxt{iHV$E(;8*vvBD%4l{0e%jf;>QtB`dv~@6| zCq_tQ;5kj_ZWbJvHwMpeA1q2cl~jCnSt7qY8FI~{`@3=1as|_$ynyU0vkT9*WJ|p2 zUFpUz8yH0w53QOGpwIna?6xIIe^RdfK?fN9@gzuwBN+Ew*?OyK`&8>6cDHJ8`3~g{ zW5-WD0Xt$Pvl_8o-`_Om{qN~F$l{v{j01OjK_&X>tG6m2d1kiXx!Pf$$R%UD(jl`? 
zH&$MCXU>?bYzzIIfN1~f2UIbWjBur?b~qt3Kui(%(my*ZZM@EiE&t<9tmEI_y}F49 zt>)QkS0xQjpVy9UcL4M(0l!&zIwmDFDkU5D*D-r-gtF2jw)eN+PepKOrUtoQSULaW zW9CQ1+AVIbi#1BmFKc8gT?yx|ea9wr`SWEZHs8xicS1_Qy|a80SWNBcpf}jFmHJnl zfW?@A_fGHOFR=}HY;5}UWVy?G6|7g#PH2p$Bij$NoNtxSNrgLl#~(P&zW**{vp6f6 zZ%THVP4?|y=gwca_;){O5TAEbV93<^=|^hGQm&hyUuWE+S=n1=H~ysDe$o`s`>X6H zNeHTRS>*e=dM_(d7&k2&IJmnuB;ti#r*~QG{zNUqlo-3z1Zw$wbWW7|cYo(Uk8n@S zcfomw6;MzT&@Z*7-3YnCJl7V|5$`j4fsStPZZ?}}IEx+h()AiO*R?@05wnKii7&+0 z==VIC6@UK@|GxLP7+GZAq_Y-HNthL@>ZgdN5A%5gHS$DcKdk+T>|6?nNb_T3tCF+< zH8rX9XVE2t#;Mn?owYQ;ZGHV|mnSN1T$lT9iU>8td6cHG0M%IxU7_)%i| z3ktpJ?3BS!7hP?ClB+Mi5|bi$d&30nTRZq1%7t`-7e`hpHGQvL`kdC=(su1=wW`#`IF8jV{xIGMtzu~oyX5`t|6VorT zjNs3>auKmiRBQo@!#St{2Q=~piRTT%pFU_1cX;|O%l;o`CH9ltSWv7|%B9pRB#4I& zQ#BrhNwCRr#RGx_#O;~-COoz$KU+{V9CH%6Dckie*TkO|_qd!zD#eYj|MkuPF3fL! zP%(JrZvXiB6KiM~^%P8G@mKr~yh|Ho$u{XhOG=7Zhh1e+GO#M1+cx^4Wsmu|&n&TU zm1R9edv&;_|KhgYG3y^p{<5x;uV8jPFet)>eY*GKjREPvuynOgR$81(UK1+nU+#Xv zSniK&voLWTe7kT_Q#;-y`K!<$?|IMH>+r1?ZIIhhx@d)h9}%(mCttt!fBL<9s=jFI zOEj+5_W%pK_eWEbO}cun!B_c9$sZR{h?`ARBZYt1^vc<$|Cg`wg$tzI&k-N~WN3q# zZ>MEL>U6`N{AuHJU%AYb`S>a0pEia@%6UH`p(7J1x<9RM=3`mMq^o!A{?=pupUg-$ zg#02PQ4_SM`}fZEe=q%KA^&$R{^ooBvygu<-hV9e?}C^A+SET=`w#B)A0K(9hyLfV z`R`Wx+aLZpwa?yP|Aaqhy668N#E~Pt2-8ygNpbx$+X&BjV9 zzvAX&YzQoEEZ|4!m|_iGvxU0DHXmI4gGB$fz2GxEQQ{RK-tKJ=%3EloVbZN~&~aol znF|sAXp>hb`4WSB%C->yz;rHLJ}i(RO_)WrfZflE8@gnZvi$%>8J}0XD+AT-4{b*_ zE7tcVf)Ae?Ne z%o}zc-S#88LOlgmw5?icy>Vob;3@1r;u-cV{`?uSw68TyX3TW`Hv?Nb>=eYNRh056 zeCmTibLR6DfjtWDrq+uqX)q_;TKuV3TUFKO@VjTPaqawWV$`*l+hpS1+q92!CmzRcheKKsg*Dz6&; zx?N@4+8(t%G-HnF2DL>?zan#j)h*HRnau1)1Uc0u&z0z9q{G)cCO1V39$$}LyPbma z;jMC8jj6-~^58$fD7omWbnZB>&QDI3_K&}nbjM4Ggi2xFdu(S}#Gu^`qkiuw0O6i& zY0>Z+>?C3O01Kgh`HWvwyicebeK79XUX&o3Te*( z78SaiK;A`3s1OU4UL5!TaaNw^t3YghOLTK4;MxyDy&D6h4-xN2l*TXwuf6wk$x;ACU?Y z{50-lz5Ig$G7p$@bd`xqbL5QlGgCIn+>dBQ&!KibRCISUpMSHEn3PWGg*K(J2WIQK zsH5yw!cuP{YkW22XwS{5?UDWeSi|h|#GxysWn<1e6-#+AI%Y(x5E3tS`q`hwKN=tl zhHd0pBJ=FFNG^~D(g>fCHF96h9Gk9R8!a}SZ!b3q5qillZH~S_<+@{MGE#7*rOKI6 z<(6o{Y@pQKatc)#rHC&7%e7H~D)0uKRyvf1zwSU&65p)lL?EJW!)=7|gAa?`n8(V) zXuWfFCq%q<$0NkEDY~+A_Jde`2oi;W4({XL1fJ&kyIOO(WSFv@3OxxAD6!Cp$(?BNn9Sdj%HiK$wh`Yvs%P-{w;Zjvtr4vPmLnf!d1HOl5wb zYXp*rg1)-ov{-8ND0FC>kmxc9Iu8}Z^IHPytfe+%58NxOf>+(wNWOd zmq3i(x%Qpm* z(8@`?D_T8hp|Q2C*t}%~jN+kL%>|Eo1{_0jYxiR+ap*)8VOAx-*)&Y%3U*pn=pX$lim-O@xo72g2eD@`su{5EcZM$^%8if)D&D(>Iqgve+*zLM+O zgTe2H9xiTCSbswij%<3{lp$X9<52q`JJaR9*M-O1U2S&dhODvnRZdQz$q-<2NXt1= zW)^j+y~OT36n-)4Fcdp@4rKo{*;i^`oGrm~an!k*F&8&OT}RL9gRNKwn0N2i5fWAN z>ab6dU9PW$Jv8hU$uGva>^scX6YU3mz(_GCiu+=xHj*l4i-NXi{2Ik_?ut6LfcQdB zu;bN^t6Kps{2vnu+aibC*t(tEgZ?nJPH!5*PkQWZE~N#R`Rwq!Jm1@4lzjGk)iWcgUiJ+@4gjce#8nXwL>5ooQ&!O zPe4smcfNce?kvqQ5OG5f>RNqmaLX~pt(Se00d2$jtHJqRIAtE`PU0{%={*alp&{hg zm*X6N8-xGrpv`pl&8DQfjZLV5&U#A&fW$0L)IVSWS`-4SwCE*es7lMN?DLfCUR86kd&Z z#m2J}b81@5PJ*XC6Pzo27hBMS9r(#>YJofiFV(`l!jQo%Lj4K2S==UnNVmAV&Q)Dv zy+|iDKaalTXOT0?ddW1O* z$mD$jj^#XE)JW@mht@W`9gV^(%b#{*;3Ix06G|qzNLIy~urFbHt2PReRyZw7{`L4c zBZDp$b};jP#ASnMc9AjPuB5oeqn;qlP^~aomUXPynkmq8zTpA8j6^EYKPb zwpqAmS?kD9abf$a!{MTUA-lx7ZIEeh$gLs58fQkca~H49eg&d)cU*cpiM=zlesedH zdQpK@ueikX$@qdF(NZF=dZL{)K-<+P`8mV-ZB>d$KOVX>b|soV6AkRjkE?x!@{)wL zxoDAyx?!UVn&uEr9XNVU%7+ z67%Yu>>)TZ19xzH|{AoBWjWsUIk=F@2YYcuDTUuxJ5UIh_FE_gxxl;rTo;{m5H6T?~Ln>kNyT0{7<-{adKXpnThMt5)79s z((FDCy}2U{h3y|Y9o?`nN0a;eQ&OddmJT+&1V06NQTK+BZ@#e4Bs{3hFA^Dfbu!|B zl3JzAPZU>0>TS3j@|UkBq-f+ZGI0&k53j7wwXwh+A|#h>einM+g9y&dg;w*vRZX~z zU&}Y;jf1pZUi*azkKNyKgU%ew9ImW5;fd+#1I>_xlU`G^fVb*P@bxkYcH-M4REEI( z1M}4po8HM)H=2x{jMXm&noAoWk)ZD0J!3Fz+k3Z=bM}zlI54j=>x%G{N7MD1BQzz+ 
zrED=#4eY3v81W-K?l|Z$;`XqSMQ+^ITdy8O$$7|27Mi;k5Pi-g$)qh$VE0gDiBKKv z6jnCa+rjX3Fnq*f1%JGF3Py}x7`V-WU{5pL1sNNQGjjMA`@uA~beKz`sI`r?H(3WD7 ztIPu0KqJ5wv}6+S@+fSQa+_w2k3~sR$`}JCG6#RKwqP>ehz9}y{(YH(89+*wf@%V{ zK{~sc4`D|{_50uHae%J!!t%lKjrbBZYh_rNlQ zQaR`0_`-*B^;Q|t6-B3(uR0cE>5U|-`LT(7$-q>v7D!fcp#yxV%iSL2(IDn@@S2o< zoKE=EE4z?BPb7{RE^GK=?!5W{*>2^K1 zga~TJV63blUg5!Y>D?ADc+?v=w3SR)5MO0UUrB|U)}7QV4le}mYbSwfYLM`I~ z9?cTwgU@CkOkvgYzPUD%V$NMR96nQtUQuJ)&SqZDGN zU>oa&qvo;0Hjy8EQR`i}C+IhPUp<;Oew&mdMe-AYi7Cnz*!A)Kqhcd*}QT24{$;MqzQ(iq~owzV*+>~IqyNDp~w zETyHxmHM39YfryWqdR{KQ(I;nYVf zxm0CTP7WY|A(Gc|`vpLxA;0+5_VAE-_Up}jb2Qx@(Qq+2VtP$uyC)w9#`Vb;49LP#bif+u?B zS?sQaIvne)GKtc$c9~Np1ha?XUsf(gR|!_N7=b6^vJ6J1>&0LP@Pn2Wcu!o4;Z2Ag z!QTGhSkUdsiu3XeD9pg23#twJ#l~EG5?b9>+>0;Y-Sir_5)Z?+)l<_3 z#c1^<4UEGpV2F%|Mfb>vYq8x!_JG0aLcLfrop3;yGF=u?{uD?qg24Zl{GubXg5K`I3w|KaW?U zcCy%uEjI9nV)Fu=Tv{V8Vf+gLBK&LIBHN8T{L;)hc`@tPPX-eCd&9~!MDq&Wa*0W~ zJz*WD+wUJlQ|*6%3XD6Ht4{3q!YwWiNEljul{?L{qK};Y?_6QB{25t`5$f{4`-=b% zn&p(iX{TGoD7@aNt&7W28F02O8l`{eMoEiz+;^D!o)66O zkD1Cg)9Lzi(cYG8B88a^zVcMk_xUQ9Yr5T9!w77{}WmvBCO==OT(1us`nD`rCAfOY4NX9FY!6+#E^baUS!2*Y?S>E`!-K`Ie!Uz_=B>; zNA80m&Kp(9qz1zczRp^gYj(H8502^GyC-HVz={enV@a+nh+2oOjhd3WsO_` zTzl8}==Jr!HTc@(*GcXlY}-eBD-U?5T1j{E1~D)xEmfTiGn!P}b%phX<0os5J9Fkl z;8NokFDy(C{hmKzt(3F~z*mzKUR`@zE$ihyHR&?OyGx?$iZZ>(lX*qXw!54#nPNEN z*Vy3mXz@unboWYxcrCrT9s}^?@r_?4yRhn_(rSd3FoAMVo`7c?b6JpqeHGt;y%Png z&)&z1RgCve#$Dvf;e<3Ac4ME25m`Vyf7+Q^Q-yYhUs*Y4#rhYCj(V6=t3R z;gIHVg*RMA-^?7R^L38z?oV3uBDxe!H*zLW8Vs-RoJjFoS^=X4k9QVV87@($&JAqf zlLJ_LPH=}o9ss;SA)a<}Mr5eXr4iZSi`hp%F8J#&(ekH{)sQ&%_gHE>?9mFWKpOtXg%6M~^_Tc!^oU zJs8^>+%7rL82TcDs%~pTr-iTjjp9SmnP#n7{wM;k3r3%kiiYK(e=^h+O}(mmLvsYA z3qS4*KR`rbgPulm&}aWVgT7}io5TUcX0>r!If*T&Xh}EE5t13)CKteT10}lHF1mz; zcdXgm6Af3mahjN$^d+5(HYN8DlSyFauh}%g{P>S@6@2JtI03{3+EXMW#pxb(gO zD##|U`X&%f+(W-0R`G83oeP!X_$9ux+i1rZv9~XQVd<5|bxfcQ>?%DcT#zyXOOWY? 
z0MmR(T}+nEPd3rbPrn*3kb3Q7SOqM8SqGO^H~=oFGeq>h(`@Q;`B(-!(M2j6kovt} zABhqZ?wEZ2id)1F!)SpxRg1$_dyK;_12t-lW4dnDd6nXf_|Wd+psgbtkna7;Opac; zfMD9Q$*Xm;g-;4Qb+jy5OuaH~@P$q8hmE+$<_fH9&4h?3v|7pF&74L~5F|3Vjx3=o zOe0op>vI57%?OXLU+sl&YNqEUpKbx$l9{8ec6sa@dyB!${z42%EP%v?Dg)$e zbx-d2?CAk^=}GJHGSxWuPU_iAcIq$yr-4?byx}=9@S2tb@!%XlPX%JnGVfa&#SOmm z@c-ajpFL@*rFWGKJ^B;pS{d_x4T{h(abchMh;PoS@C8MxpI9w2rTOcVQKjaGgCGl| z2*s{2x^M+b-K#3(9?uf)bGT%0&B=T`my;j$pHHVI-{8BNS9ztMn7I*>(!<_%qj4*crOccou3VPTeU@wB_p@yu7FO!|#leg5JeJS4TE78EkjT z1CPz5OZ&+IFsVHJxLI`72LP5infWPtl-VR;N7!BJnhW40h-&GSYw$Gq?ivCIYOH4s zFoLI5Ip@-^L0+|)cTOp{APa`Ni`~)Jcy`esC(;67`#J;L7rIqEFiXk_Ew7ISft(hG zwg9xNb5Q7o?y19<3y8I^6RL9`EQ~f_lHT=So~+hMw+Ha|pwr}03#oBa6?}hx{dGhm zkN+LWQ&i=+vcHMbi5{@6|ANAXZ#b=qgz?b?t@^%0hS9JUoKatVQ7+p9Y!XU;0@_C2 zbiH)AyQUI&%P^_!4FJS&6R`8fGSdvCt9A_z@eXPNW)JLY7A_-APJ48>(bX;hX9zZ~ zec2s{)`3g&oC24panS`*=?E(>Q4Ra~kd=W9MIa>-ejJkp0!}7ixuz5$vd7O5rJP;2mee2w-<6r-Pf(lPQXD=GC8Yy>vd> z7#GJ^k*dOh0T{S|OYbS1B)S~VF5nhqz(wD9gl-i zTrX)QX>pckJVjb)T+(mZprG?bN8n>K|_={3Fm>}Kd>^&jIV6l||R1nR3V8z;8 z;6uZKg89d@d|>=)`a$FH=&vcj*|u??6wib@BcI9-!5ni*QNFF>54pVGlr7hhZ{)lqn+wO+e5jfk`6FA9CtC{Ow=$DR+FC?stqYmYc*RyTt zaT994N1fUFO3o*Kny%mO;!j-FWz@`%EcjBF{A;PQs$o`0 z#K2OymZA;Xk+)oM2>@ZbqtqwpcErbaRuDaPwOk|$al0a&(BizBqB0!u9!K3x7t@@;>0jO3G5Sq4fAM4k!KMBh=ctnpqPXl zkK)c2I;A>C$q=5$9h_{W_H=(<6hWEh6WvlJXxIZYTY6?|TzWgzDYq7+;8*Y3YS^ad|KK>Ms7Bar5^5G2tIWfyu2Z;h5<9`4 z;DwfnHoE&|3%iPEI6dpr>i|0iU>82OZo(mgXZFhg^d-NoGx{!PYnZx3mgCW^c!*`; zSdtq+SI=sxLWDREaHXECR2bEjfK&N5RH6iF8zaL|tS2w^Doe!SC%EF?uz_CDVq-|Q z?g(3bV;O(#@{bFoXp%6*Ay5y=aN4>P^);cYV)Dn6pzHf%J1u=Zz=*xs_Zx{!r-hqB z)shAfWMgw!zmo^ zi@@*M#h?;Ioiytr?K;2QEi}ZyyfKGqx)$rD=u)2COaMgp3MEMHIZ87IxSinK7D)TkBxPw2Q>0S z9m{}A9mH9S@h5yrJX&q`QFWsiQ1J0L)zY2m5kYkZsA`hp^=)z^qSI6Q_;MLzr&9Vb zSc~{8ZP1>MNs6W7cXtt`96J=4=% z$$eYMs07_}5fG8%xkb^V?+#Hfg(@IOW)VdO$gAhyXktcAi}OI)nUbZhK<=%-9ms3u z2^rO;>jnC$R@&Tp!Xq9zqUiRfTdMNUkJNa?%KA7$GzxwlB?bx3e)K=%ZI6gg-*&f( z1-4svj2v_W83EC%Ah8_!zIv0lPSJ!XQp)K7S$D;u_Q7p&H}nh1)G;8|mqQ$HuB=ULl0Lj5IwwFa*=93bg-=?BN^JQp2^Kl&G|*GtTDf%cfW>s4v$FNBU@EvH>jpwGX-w^@)n9lQx54dXatiNj6p!hD- zmzTVb)yXS@3=;QGIKU`V0x?xN{68xZ7Ic+KsSO0~p#@+_Ft3#M=z8OqOIBaPZ1Rp& zMq$)lEC=w;6#?_+yy|u4(;(TjXN~>$$4ure-)ta)hRTwX!7Px>2sdD)e z#)yTc-fW>S8C-w!b6yW2M*TlnNE|fE&|>;D@X4~t3>W8uZO=st#CI%=rQ@D19q)so zm)RBT^zxq8Rv7^*%y`!=!V`v>q+~NhYR74i7-)d^^p@_vztRK_1n{9i8Z(ixQX9r7 zkh{|wP)=sJ)O5RP*Fb7BNAWS53*WW}3o-&vW_i?|maD870A+3Dl6W~O6_bXf`ecaU zGd#80(ermOPPcI+uR~FXg40WT>xdrValsuC<8mX0bF<|Z36Y8RshB$Fh5u|*o8gmK zHQcZstoXXr(K^E13}}N!Q_l@*soCz15}XfFv+u8|-(^WuFga`zgNK~bZhrEVN8rB= z!0iH9++Fb^&+|gW&#Gje61ZetW+pvSC{5ZXd{D8rHLpu-`yVd;ie&_*GX?`yVYlG+ zTp1gcjj39b#9Hh}A>?%?B9h%O!6)(z+b1g)2_XS2(0fx}HiNe!QzKYHkH?Y=_4(FL zz3ci|d6z%+RA8(_rCo%?H~K?!>&<$>+NR_Hrt66)>#pU244r3*!cykHVrcnmkChF# zGrHJ&z;G^=O0}JLRALYDZf%o?18!?04_Ng_c!>S0L^Y|n@4_jA2vw~ByI*@ zRE8zefxyCP`W$ZI9dAx*1?{bvz3h4YD4rwqj1RqW;& z*K2bD-9G;OJ-pW}e*u4}=WxA3cCO;G!kfP1&T^|33Hu z7NP|MlY8?6r}1^LIhy7S`qstEmMo|5J112^an>I>ecZm=z?G_vm>*|6`@jAx$@Ahx zD#0-9G~bzj8HgoiBTug+s}A}@f8k%gi#ORs`y{8{Ri6=o{`*#CVG!B4=Ikp*9}N%k0>PfI*TEPw|AsE#V2`KX|o!TE2%nOKmltE~dZM zMz{VC_P#PK%C-Af7!?EoB~{V@L`u5D1d&p@5kx?`dxlUcX^;-3yE_J?OG-Kh=^k>1 zVVIfou=jq?|6K3c8~6EsK5+SPdGS2=z3#PsEAAD01zJ1~D~EABUfS|#Gr1<~_&IqB zy@D)15D>fxESl8__@P%X;~$!w5HMkKjz&L|leAlxvXs#Gv*Z)5MX`8Z0}85jL&B7Y zB*xzY;lKxj#_e-Mg&qTU9?J!K@;ljQ(1^@crHwvZE`qC3m8J=`dlbKKlB zH#FRDBfECH4-Y zD|YwVPkGxknF0NwdU)Ot;QF7UPv$W$i>-U`<<+$N4bs1@TP`WVx07U95;vf)U{12PXmtmvLKY{FuDj@5~#bBbup0NTaTbO8pX4CdW+nsx-0 ztncQ_%|)%{G7zU5}vJb#Gi*%`JYOEsbpLN)R@&+h8eT4})z7%rGEoitOx78epFmIuct)E*Tc6!2+LZ_x 
zzyk88ww|L*^{>uf-L)JqA8QHsj-GtdKFC$6tx*~yL?@$0`x6ZXTc%$uOWhPwpNDKn zi-~lF&beU!!zN4R^a1A=GU(MEbwRGCN(L2w%6#ApGG_?j&v>H!iNx*pyZ!_%lB%7E z-wN+?>#?)DZSXS154hWo?s6Oww z2eYiEHI0))_K_lo%eaoMNk*Yf4{Jn=PfDYD(3}^#eJPN!V@=3k$UheMoL3R z)x)@E-P=5jpLZVH87B(Jm6MTP)is`78!S~hlK~c`guw!yS-Rf|mn^O0>Hp`8{Xb10pQL6W6PaeyvPKJx znd%*A2O35s7QQSeCCzw4JG@Jxk}V%p{rsv`Re<&9-<$y9PpUO3vwqDchw&SjmEAb; zz1F(=M1@-5685ZVi-boZC4jZ8a|0OD-!^s_y;EY88k4TE1KnHf=m1D2G(Bw%+>BvC z4(q{Odd5Gy-td?Ve|Q(X*747#+@CIxw4M*3>pJrQdI7yoULjy+P3c=goAb@{l|lhk z+f$vpX`3+csQPb%A3%Y$Wj>0~d zhz_^MrPOI_+<|-$cONSOO>rNf*Hvet&DAiTbeMJng-F6V%WTG7%=-$P4gi6RE+&*V zrwNRQssUk=iawyC0BLgCC!0LI&hFM5d0jSLC`0*qz4P`rwfrxrsHS9M($6`i z%g?!I^~?f-5!44q8lZV2$#1P!`0s~5zjY%C$PzW__8aAU7e$>d8Mmja z2U0CHxr7aue3%zE{#lXx43tR=9D2~&&6l4NA8qw(8*Xj`qqnbc^Osl}sN5D0f9~wM z)dJz34z9oT&!{5Ze}#~gV}-r&a&x=qQ-6y4>eKF6RZ{}Tg{%0VN?kD^^i0#5Co!J`*y;=?0 z?)K(&{SCf!wMh#ud6P}*1LNc$jHaY+4%(Y>;`Pt6iV}!{a^h3OjMzUCCJjaS=duz8 z;3hsU_EWj6?Z7=arAjuK5-N-tw_8m{a_rt9$x|_WahZT?`mk|r=$$pgLf)(7#TA}j zZb3P?v%|tdNTTo}dUfufxu##{?0jK=Vv4%w@q3I$2rC1m zWJ2;I!R6vwaH(Z_)kPqOIL(9W;cl$1ZCXjv?)M;`_wJjI?IOl~LBaB?(&rq~K-{0g zk|qa^MsGFB0BIqp`1B77E-hax1*2kDimCuZ~jOd5*#x%z$TeJFCFpy@;`Em zw=@M1ALL+h|4S7}Yxw{fv}>`nMYvNE7ZML3OqoGw$l%D=ZeP&ek? zzDU4JNA!<}h=<>+dB&10iJrPPb3)6&i;H0;A1XAVK9nL|K7i9pa(P zm$Ao_N=m`A&a-S^44NFku(m*o#nBlr$YNq=XLPO#ZR1{3RcCS-r{46 zBpif>Y9OY_Yo@yCdZ#C*Yi+L4qY3LrCKZ4?U9AA~lTf>sOuQo^^9 zS3P9H7$m-#^nT%S+h1}uUP`p<<*Z&VLREhv?H?~5gN)t~IN6R9ve8f9CK8K%YCAu^ z85&k82HCenhHPG=X&NfDJ?Rs5sy}*_9+06669l`g+$`Pi z@uD@`dvO$S-KK5FV6NJ-6Lke9CaEZbYV&M8yKL9ccXLXz>ljFP%D2om{e^$9IuHhA z+QgDLKMtegxi|g@+f4&7)&X32KgWieRPWoS7}m+bvSH{G$|1#L?$X|xu~M7UxNbar zjoalbkbTt&Wbn`e7gFriVlZ>a0uq`?rWb9s3+(|;Tc8P(43D*XjZD#ZcazmU$tF%$ z%ilL>H*lY%+AB}RJ%IFkz)>1N!wJabLDp_7jmwP0oCmzI46R~9+;#afdvr{p)PXe+xu+A)9&?#s zW7{%gVH_wT10VX-DJ0wxeh5A#DE&?(VN|;;iJ5BH@yfI(dhTiGKfAGAx5qf%wpu+^ zrtkHHN($Y6({4R8c6?gz7SPoHjZxFxCjIotAjzuc>%d)X6tz=TS zfL!^26ES;%vvjH*y45icEMUya0K*^oIvY@9ITITPK(RHKFfM5Vg7XS;VoNI-V$q@M zC9&eLwlz%tQNHlmN$P$IHZ5BtDBhtQZo8~YfODe_W&cDR6hb>8Yj6VF+hH}>0LoAJ zZ8aM=jB(Rq9sPZmC7T3nUt4GRnjcLGs@wHBG%osSr6$SPrtz?$&v76UUZ>@rG}5fZ z12+D1c9LEkFu!yK`cCOdFU)Rr)^1!6+2)0mibMN0ssJSjkMEP3$+$i_- zkm}ENwQK}(u|Vb_5{t&xY~)!iM#wORot5^T9(QK9X0;NFVMh7eMnKNi$0z%n$u3j3vF^*6sBL~;plBa= zSE{|0qSl;xOxvUXc!fvitiE1N{M~TIi_$SsTAJREA;a_1U%46+>v!ser21PLY7Z7% zplGL#QXsj-pN#Q3r1rFUPnqkU?cpn#rEJdoTS^08isG|?oXE2TlpQTb_Rwn)Q+mLi z@j^9F#M+m_$0}sxGLR+y!y9s`XzE7QcmbJgXLT;5*8F=w%)3>fxS$+J)C! 
zeX_KPqhYNMF}SMrb)qVAUK*$lc)0y&uX(0J0`Bzr<@0x~bjv?teM#>+s4!P_Ez_$H zDCj81m8IDh$=Kf|O%gkeTBms}Yh+ub5IiGG>@=NuooJ;W$Il;;^|IzTWb24{l|zr# zE={%doT|CzZ$M8%j=ULMsD)fJ<<*HThr~9QCZwONLCv_{#wUDi5?T(sT3fvQGPcg@ zbtmeV3>}5}2j9A0J(p8H!5LeOk&~!~sOkE!Do`NtBBg0g$zUaxOA6DSi!vu9a)OFu zgixVJk84X5TYReR4Hi?e`7pq3J`&5@{Dkj3WfJDP9CQ0)>%nH7)i5wD+G}~uzfwb?m?~Q15d}#So~?S^qrIglrPvfvs24DZ zy+d8cIsyg;TF{iXJWr&V{z>1s5Dz32G$KUOYM@_3+JP!4=r8mT95O;SBmNFm^Imne zFOzgzh#8^d_CAZtsU$U-_B$?@Oe&uA?)FUS_~C>{X0BaNuY|ES&{F>U>!Nh2?U!Hu z9Q>VvDfwkvd zy7YSftK%dxu<4cdLG2FTqKpMtbLyTVIM?(lF6Wmy3vb9vB3}^m4Z(l5vM*9=rLmzK z*&EI8lBzyX%bHz#)f!}wE_FQLY6tYdc&9x|LVER9gSC93pu|X)@>yFN*OP94;`SRN z`~7D%DEGgNJXg60fH{VyYBqOH7JnTq>b!OB?x)5`!2_V2u#QDh^W~0V(9u(o-N=FIXxG4QaPWBeoYqeic*7sMf4!K|1ze5~Vdz$Ml{0&HA1++4S2P znSjB8Vq-^4IUrq8F&c}eQc@C}SCQNn zR)WnLG@|NqgM8EDt?$kSM*6z*qDS<$lR-ugiJkFXtXr#kod86_7&Qy7g@F^nF=%C4S zLQ7_(fV&usqE28p1S}|{S*Nj3$c(MiS@&ODG|;*`%&Hp3e9#8eEP9=gc5M~wxFX^^ zzuCZ)L{+opgUaSr|mr;0U)h>m{#dLPxM`;Mbt7 z`?SW_h~(*RW}iaMm5)~B-3B2xS!%(5xwcb#wA*3(?8RtW+3^Rb6j# zOC|tKP+xT`Zn`~trx&bVJT#TOYkC{B4r!U=LxJchXReU8T_SmfB~lUgDZCSIXMUx$ z+_6A4Z1|Uq^$WGVf=!iIuY%p!@!J(>sdQR=kRL~h!0SB4>P#383v)&*E%Ezm?d8jI zctMHOR3Yz^zJDC05aeihx%VCD&c3K{n{wy@@iQi2CSZmy{x$Cl7nv^L0n@7f_OY4v zVuRD1jTl8Ipz?xJG;KCo z&FHcTTE9_Z+>C9dVR?5UkP+TvOX5!zhsi-aI_hJEBd^^z1cO4nqQAGcx~@sj?bJ_+ zP9-L@LYIJY1Q9n1wG`2Ah=mO~G(SB=lkfw*)9OZ`l?D@ircAIoP!gt@oRpMNX3UzQ zBREjJ)yraFFcJ$#8*ry>a9dbbIUv4?wcw#*QT><)q7xZcFWw`h*sXolvn>?523psc zq`nl|u`XxJq&GqSOY4orMO1_{k18QwNhMhUSrvUt)b}2I9=z7Wa9qGB;^Mx6<|nFa zbm5z6SO1Yle^G9p1fP(!2Bi^a-mLH<&~aB<^4YUzqMj5=7YTagg4vAg&C3!b9*P@L zg?#?JYwBTan^uV;x^(ekfN=#-$rpRTyFH1yW6uBR)GO;j4F%mqT%2@jsz`f%6%E~* zr44h89CKCWRZ>sAgG+^Bkk80Pxfc6rkOr8&(g{c|{wgr&E3#;$kuA2WVpW~`v(R;p zOMb~zTYd?eMVzJhCLhm~Ov4w%_VV&t?ZeI%Jn9=0%%9oahUoW8li5mc&AoaP<40Vk zMtY5|Hm3TF?4^o)L17h{qJ6?$?x4`xC zB9k7+Pit6xL5y_VnbFaOr}5V#$VLXk-OLe_H|Uh}F>C%O+Q6`w9#}9{$LHsONhm5H zDER70D_qrn=^;$8nbsGyUwEAo6h}aI|g4&0TR;aQK|QN z?c*nYP_+2e!J)UUax5_nv^g0l4r;wQzI=UMgIY(7?-=Ey!`H_ zP+ie}Cbsdmu(H-353`Np0wmZfSL|4mo=RI=+uU$I6lin~R}vBqr{tBL>^*5~^6!Ly z?fayhzbam77Vaz)Hzg|+tj(XX$yeCKq*v|&Y?ukMY!yVz@Ben9+$Q9Ba5C||D9nMe z)a6RNm?CR>|5w7X%Pn|W-k`sg_5YkM!0641m)^afo2H^D5H@_Wnb#C?db&dcFX6^c z+9TE9$XN@~qv$A;L3VC})fD^pd?-}xI(LF6&}coz;D=U$FO*@Ue;h|IbrDIKu+G> zF6a}qP@>9|e9rBBAu5t>rK(qrhb-$}6W<4aWrJs*HR&!urDj2^Y2O#~Ea=#0iT<{oH22B)dQ_&To78-R)26xz(3Z{w zu_e-idFzt-JY>V(3vKbX&x5f`2?*v z7dLecQ{IorQZr^r~&EZpX@ar107UM{>{dJnA z9mQJ)Z~Vk(m`5g;q2KrL%)XfoV+Zw#WZ^;(iQkT6EU zs3p}Rxc4|SFy{J#(j7Y{{SQ{>&VccmAv|N#?C}@vK7^&_D&TNE=SdOaZ)*5WI4r^S zgQusr2^jBFv8!JVos8MKM>^5Ou2B>%g^go>rZC*x^e)dYlDU<6lG}Px4(NT#J}se1 zUD1Z;G@N_Yk{Pe@dh*HqG@M1YP+MTx8?$*y<xQ znvr)_9#Ue7A{rv1IwH$q7296YNU7NltzWJjpCddWTZ3sRN^So%)PcR!cDs}RC9%y% zMxhp0m$d~I^wKs+ol+X*@0N}tDVuH*Y(cQTJ6=Zf#S5nu!v^k=ZdsBO+D|P|5FDN+ zBK5df)dGvQzIJ98axf$1`|I%W0zrAt&Yuk^tKr=SRDm22?`A|PMvoiNhVO3(Sx?=g zfW+OCVf4xv=N1bDY46ODe3NNx1}f9U{M$~~K7)&1+d=9|HEW65M$~Myfgm53LN1hS@t1vH@PZEAHrR!8kH!**~?I7gQ9HAC7 zsoAjfqRK!x9_FM6C>!am`+OG*z_jj>NQe47p~W4EI*qQiI=An-iV_xnQM^pv#`?u zr($029?=8ZM2N)4V@3~JInr<6KdK@3kqA6LBayYK{o>bG>#*+d>Q5g@1$akj8hBg{ zOW>m#2C>udEjmk9oo@35?=J@s0PYQIUN#l7Z3?mmCJ{8A#`yKu#wKil&{p~C*IYES ziXX5^nC@^KCLzh%+5Zj9?K^)d&lTu3soh^d$wNqyXs0{JB*(0@qqYf}t>#`Ovq zFPnk5_%r{x&Ms(gcjCc=xZE4qh=Zi3{yA-UULo2nwQ0e7gX`1_i7cdR{G{xH)WK4e zQ!I8&Z7rA2$40J8vUFX{NfMk;JSQTbju55DRx+`tV}&~0j3H=E@QHa)V*^L*e{6xZ zGh1M;PSJ0$Kud*hyXDb-J(~wE#!b#^I85pihGH=F;97h+Ne=*hqa6p};( zsWM7kG|?WCTZ2!QcJGo#ZH(si0CV0=POB-FbPiHk9^JDLc&{3(9&2#Nz!O#JdC*_o 
zUH1{O%5OK{(};~Iw>XIbfeln1TcdK{n>B1%nuU8IlLfnrG6j2|Q`CTRc~e|QYvMw23^OAq1`E6(4j#)U-DUOQY}|OCb>bbG-A%ZufdBpbQM<{? zMkI^*?Mbat%stXJ4$bogxT{eb+}T*}u&heg75bVBeeWe**RpLAf|0z-d_kI9B?0zj zm+{2Z&CSZ%(c}Z}+MB6rxdrK|4_CdLLReWCS~Np-9#4gj1L`(|P5(dJnQ>YzEYm5}&gEk6 zHTE*tU}c^4Fm~o*Xt@4?H9YeiNdilBbBBFY`Dt8p+te8kEg_a+qcgn zo!dR-YYTI#bQ7UApBm(Vf2x}YQTM{5+I)zeb(o!ctPYC~qn?R%yY$Mfj;PTT#WLHa z*?LLT(w?Wwn4o9Jq0+>3A*=2gIl=37TaFlMMc^T%wRDC!TsYiA?8no;q3{3DJm3oD zhZ-R&0waUs&&8uTuk=Di_X%xgfE1jroFQXg?5(Ss!e>&4D`pP_p>P88ME^6X@c(}D z`OYN0Ae~^)?t39_h0|UXKLQ}{)3T*?U`_kci8SZ3EK#BKqW}*TyR^Y3OiigT2k<5= zUre%a=L(Hc1WsdBjh9NC;4PMWYm;zFTlosC34KKu`pSTODww;>#=T|2KL5 zXY`}bWA;DU;eLgUlyw0UagwW?3Y&UC4u?V0V?mp_*Wosq()#sN@CFCw0qlVtoTkfT z{@eBcIk&%ga=!Wp6+|ijHx-2Q-rQx=sVXSYZ+s4*B;%xi^0X4rC+PL-B3&)5@^>Mg z$Wha2_{SSJ|7T@?^OQD7{3nwTVWz`rIRfy}TRKXJoW^R)sR}Lig5zwaxyL?DD{?Up zhV~So<)kML1G0X z0WZ5U&>|k_qWsx%rsh@C>08Tp1g8AwzZaMECnwEIA*+^v>n-U*2}C4jgMMC*s~hBY zpjQ3@bn+gmh|^jbz)0BW$7!5DVDEm#8AzZqISIYZ=k8?R4S$^$`hsp^_TDwK5p!aI zu&WYRpl%1$HjKCoZsh{aSH)#dfN6$}>sK7x+*sZiX1$t1pxUvA5iIi=|>f)XYz4{^In%5r+Q~KBCgUu;m&Q zuU6nUy9NICzghD)jxUq9^lj?z@e%=4WgvSu$8zxkDt^alh7w|u7rZKfO<_aqZz`SQ}od=qK zGJ!Of2mrt=$fFDL2h<|;e}f$VwX3vi{{fOc+Hjq7mBF7I>M}Tkv_tN{1n}GLk_)0c6Mm zy8G2wu@1WhPVa8M2Ij7XdZE0NToe8D(Bd4?lXf4vCW#;_ZmxRuTrM$?n6&tY93RyqISw(W{2=)Tn)L5wb-+)iYr65W+qlK#WjHiJME6~W-d8+0rx0`B^ zqS_&bLoi6s!bgb7sgncWd?+C@b{?Gbar_uH>@dcNe((`!{*C!bpM=vh9z z$$q{?@rRiEoYLGkf6xJcn-$NkS6AV}W1(IgjdECZ9l5!*IvN2WAvW}aW|8N!$#g{1 zHx){=k3MSe^sg53O8<+)o!GlraUzY{r)nU&C~DF&BLeN-P{S=vf7~IogBMt*1cBH- z@S7VJiyl4g)h*DeR>z`DFR5J5*j>2xTc9qvOKA4F+Nd39^dxsJty#Z4OK^Dqc5CWc)j!%ZkD^gNLLQ_F|P;!{0P1!4?8^e7781V``%_59LARRJ7OU8$&Ep!sEm-d6OtC*|E7%A}k-%=NCvcerc~E@ko+ z#a*loZfkUKw^5}FpvWJeS7v)v7R{kkr7CEt$>J|pf8*Tj}UXn3zHS*Q5;%j_xtl#|BVu)e_=STmoI0>=Y`pVfxigktFLrt zE<4SL2?;%G$CD}4?w`$kf23XZuK&GOjEoKW>ay0ezk!PX({A&jmrt{tECW0fkuzgI znnP=tSd@lS+mWEY$Japwpzt2U(ptA%ol{jJMAiT>TdXcg9A>b}^I zkpAANac01!SZTOCldmYjX7Dw#K(kaH0F~I4QHS5spYo}XQG3vf<(?Rm)nu11?Y!F%>M=du-yInQ z3RR!I?4Q8#^>vx+-kpO_GfQ-DsD8)(SYmv&Jk{$M?xwtEV8QLD1N`vR49cl&+3O>{ zo0jU4hhazMPu6Rypn6lrqAB#_I}Ye+uhLB9eyKwF3L-2dRk%okZN>oJ3FY<>2ggrH zHTlnS8C`y$t%_rRk7Zve+~g^IUgElE1ecl?8tDagnpkY3Hd*qfu`f=PZKPEu6!3cQ zy+pXI2pD3XgznEJ5z}BBw`oPt<8n&BW3&4;_?dDskqH783A_jszrgRjP*QC=P5=XOZC_2NukW99Pj}A4PXGy$SHM$fakcyv+ogE<% zD+%tg*T`PQg-4bgkbXufRLw7}7AwO&xJ{VZf^>Hzv%HDlow?{s2P^$iqxl-stv7W5 zt{kDY=xN)#Ny|4XEl8TIQ5?IW9Z7%Jt~sLA;2}8y9p[{nDX{3WX?9_}sg8x^3D z(hDS=_-nZ!3_^YzUi}keyf}q*zIkOjYtGGC z6Hh}iYFmJjq9V*7(99i1S|KM~8(0fjE%Ci~_vjwuc*-);g#z?F@M=}69t)yc<~8SH zWBSQd56EJyAVS1p<*8Hms3h?DN#N7}?(^4(RdZiI%Tvo=nf-dX6eI0?y}Zz8w{7L} z3`V++@!^0Xz%6Uootj@73dN9pyY2iBo#WWtL>NrrzU(LrzN#byhkd`0{q-4_>IC*mc0~2#r^=8aK(qW{%xB4vXRg& z;u?R@W+?+!yC78}(g`x{FirR_A}%K!zyWsgrfF``hp7!vdY%UPEk;QO=rVFZ?~YHZ z@-?Y%^;Q&D4lPvc9NB>G#=(7YJhSq8CGFc^@}2&l$P-SaTBxmdC`F>U33l+yXsmQUlP&3IbVYx%iZo-zJ_nGefh~ zsLX8W4jg(cDKZP(0=)X%hS&L_4UfQ2?Khx9l-aCaJS81S&YaZd)5eNhw2?T2nuYX} zkD%=#%2u|fltizHD0X4q%Y{A?3@q)Ks%Y)yj?vKiV@=DHd-fnCZ8`($haZ<~Mk#nwVHSBn68KXYw{#F1EzLi9l*mTI+b{hEgu4! 
zT*j@_BoI#xcc5X;y+Tw6@V}z3Rk<#aY-!X zfHHnuA=kFh9&GaK`=ynvpZ8rt+I*f00f*ba59Y%(`b&!P`M1d`C2@{DtLo@6#_ySBxJYZ)P?gYZ*Qc#kkvu}+v z5Nq5h@i6XPXm_duY$W%i_b9I)bW$(xI19^I+#|hbA0a#+7cM++SQ2!;2`hb@VsPGZ zw$3T@ZFeNVcH#<;j0`2|YQnMyvA?KirR;P))N7!BBSZ>!9-Yd*nzmke^wX}Md*xaa z7U&F-uujy|^@)V`Es&6&RfkVSP@u)~1(55Ye$#pFzKGI=u> zg^e(ES5@bCylunc$>d~j=rpXzKlaxig@M$JnD{eq0}-YZ(JeRZ@ub2v+s^8{8QNKi zOyTdh?3a7ouP_1)dJhU5#Psl}2*_jpHQp*;ISvF7Fu10A)X%HrT&Vh4`{ejBHctbLjQH?>r1QeF{9=q zS-mGDwB(3#kk1z3R;dT)q|N}#N&i-{gjae8N?YX2 zD5@E>KkViusc;=@OFM3pj)vqYT<6G5HBoAW4d}uN$dY3f4mn(b*smo zDxWoI^$M?aWYc3@ zPbpK%^K+SXj?#7CLp_S6(RY95-Z;oT@s7D1|Cs`L7t+A%{;iDBtZpqCL2=Gwhd1-bC4XXsE$;AeCg_WXt`i zF0xuv3J_fk7Mo*7Mc`-}dE@tT*)(91E zlY0ab6Re|0VM+5d2IzmwMa3Q+TWK-gy_cO-**eqFLNFHl2;=^-C>uyy>(*5f@%^IZ zGbh17Q<(L&P;Hu4{#l3t9~xhTt`P8l_WKs~mc2n6?r;Smb0rE{eR^@#lFUIW#8Q$PZo;b;Ne{6p-Hy}Pz?RxQ zd!TxMmE@sG4<(!(@Mrq?W(5Eh)k4o{XLwAD$^0Q%*~so$3S-hbfNP(LEW9c8o=P?* zvLuzB@U09}bs%VQj+RBOkCLFOvc4(GQ|QgN+0y~e#MGgudFqAab?__&#V_to$Rk9i zj@^8&OucNB%B@R_G+*TIQuL`{GJ&ZnoR^%|c(N}dV`1!o3V+^&a+FtIe^w0dT2!LJ z(R7_Q)qOuY*)dh#bf({Tnu^L(bB_61jHF1;x}&C3;Tp&?^_-f58z{&L0D0fA8Wsg-M&Ejl37jl1|KZ-{C(oYO{$02%}8E#k`oAe*qI7F z+=l8qGKrO@Hr6d+tipDAX8^AT!2yLP${L0}(m`yjDPt8Z5=QI+qJ{)CU(jA|sM?AW)7|q8e>k;a>s;>2{nZnl3StM83wi%1= z<`mLrJqwE1rGjri`PAfRmgKF-XTPIm>gq9~KsPA8GV9_Tomao}fC7>Lj9tC@Wz=G< zt z3MlaCwJBe5y>b_wcrOh|B)3yn@_);&~^G_#jA$XBi}LU zC+pFT{&bGsKZVFlcFK_{uM5mx#p95Vni7CQ-TWi4CYljbin7d9-^8zi?Vg zE@5LZhgMym`21nMClV93AA6I1>=X<3^=2UoKqd&}X&lv^HX26sC1Ctj3twvig|4n5 z`z3yFTRTGbTie+7Q5vuw-;cq@Ped%VOP!!z3Nem+DUzxuqsQ>%^uT=V=FV*ZSojEn zT~+XKJjj6$p@I6?BlMN)S7sa0Z}NPf#>#I4rbom*yXF?dQHnLxVY>aoUgrn?l=VrC z6B0VQDGgp7nFficP6r(lQW{;AKu@-Vm@Y;DiF{V-mu@ zddYwgGx?m8_*`Rl!^w%VjRZC|HU-WF%-oHu_Sk7a4WFL&F4W(%1Y8Db@+rHY*Zq55 zHl`vFJZg^uK9T$l*a@T~tyr8B2osE$+zG+M6sN8;5-{?jwl$VyWMB%7_9FEA209ESj7Y^UW50 zJ-RiT-)+}Px4!KGZ?&_RZp37aW^y*L-I$|8gf`%QbaFc*@h%ZQ`dHoW9ij65H-5o{|{ zS;`1u0n$D9Nn<#)zE5nMHc0}ZnBmIJvFqH8m1KCXJVFpnM}ajDXDZA-bv0J)C*&I2~V_v(iHMWjC`4H^Ld*}fH=Yd zkZu(Zco!Af?(6ZAcO^Y9&(mh-wInbBu6CTBCd68_z)oSUF?LQt$ zt)3dQnP57z`qm$+ITv!kuX;V}2t~`MxYorMgdW$@LF}z52&y_wi@|UPkHd($0h50D zvD7D0amcMz1MuAuaQt=1`d2HYM>|&Xt5(Oj-t5#{j&Xz}-;rq_YpU0BqtFvpo9P@= zFeC>GnVk>Ea@1tByRmH6)Kx2G`@0q)dt2;Wn5$F0c5W2>4GI%7rG*t(+@E40wVa|% z$piAu@bcnstRX^>`QIzge1kA1?+>bU zB}W9-*yxp&f}wuw-gYf(y=}Ixe!;#AS+2SV^OT zu4`!bojMW9T|1?p(k*J^PFL}lc1k4&B(C2h^*ri#(Z4V)A-UekHk@w* z8Oj!(&0^HmFo3aj;+eaA-YT%Tu$bG)Fr)r}`Q)=&zWU3}e4DGJTn|ljw@&vpDw|YA zooiTxsiF4$B-K4Oe5Sp|vpWaA!#jI9*#!at!|;!$nR8(9N-W??Z@LP>oWo8|-a2)Wto`==|RatO}hh7~v5uNGID}r{-Uif0~XL67Vs#5hzAbXJh*6q=J zlu5uAxL)@F4zvN+FNaseEUIszpZ?)-p>jn-?&<&I9J6oWQHF! z{j;V!o~E0X@$;_l?oKHUnnaE;8nLtIx~nx=KG3Tl*+QRe6rZ$%I&rf#-`s5QuCA8? 
[GIT binary patch payload (base85-encoded literal data) omitted — not reproducible as text.]

r@0-DyY8nfA4`z`_iG_J^Zc|ePP?vZv~F{z<^4sgedf{|BC~2<=-*sp;ER@lbKitdku%8$e zjCm#}@lV_O@4w5@O3gF7%~aRSTM3)cOW2qetLQ>&@@u+IHNEP+Tmao=5!#HpQ~6BS zO@jg7@H^KS&D~Qd&&MQbn^CO@?j{STpkBdh$wx}Fn= zZ+dl)7YVCk5rQ7HR{Ac#zm+d>{U9N+1<6Goyud!lll9-70oX5e zCyB{w`Iux@YviO)vylDx@PHQQ9s%5Rs$L|ERIjMp+Osmr_-DuuTJ#1NCx0V!NT;fw zsoh<&XqF2v6U^5c0UFji3Ki;DHM~Em-yuyYNnpr+Ih6uh`oTXrsK5O#wU6xNX-)X- zRPMW2jii&MXuG;`Njjw(*6j$HE=q{c%+XAiQeja#0vs_ifBW!1yc}*M8}(Q+ne`Z* zxcS5zr!-dWEF&AJ={xmeEBMV-dkLTIitpe=uX+)FF2?AN-&htq^l8$N^Hq|Bv!i_& zSs|;I^3YamI)Bn@Vs4f9F1zoxZ#Hw4VB|S}sZfmrO_Y}B`afba*fB3f@E&wO7V7w}1n@Gbr z{2A7ulydOS{wbnpr0{9kdFHpzwOW;hd+aOCueWofRNmL&PBQ3Bh?j}7K&G>AvW4_W7C*nr(^H@Yx2SnI1LOK`RVl-#V(f{G;{_-cv*}#hH^|Y94{Z0)2zefElY!%J9P9$}A?4Wr* T^D+Ds`1e>=`LCjfCISBoaCY0a literal 0 HcmV?d00001 diff --git a/docs/source/imgs/gradio_interface.png b/docs/source/imgs/gradio_interface.png new file mode 100644 index 0000000000000000000000000000000000000000..9584d76fb3532a3090516e3aedd60208185435c2 GIT binary patch literal 331678 zcmeFZXIN9+);5ZW^eR<40)o<{*HBfYqeyRtDpEu5h?EEjsEDBS2LX{Ty+#QE0i{bX z5`@r;lu*N2k!QbqUweOHf9LNxFV_{aNYq@wRrM=UZeJP^zVT4a_%@w9r*B)&U!&zB40Nx3Djph7 zzkgDC)P?Rre{ViLL46DBNtdrB$7AM`y?RgaBZ+c&xqdn|!!3@CH@bX27YTkUvn*b` z0TrMTokRYGSM_CkW^GDmkz(y#lJ-j0$?1 z+Rb%^={j$^D65mxM*8QFI|_ItlRMtnw5r#;k57hVzoZusO&-%a*Miq=gM29MY~fSaj&~c@J{yD*!r(Jye6D9^IrLw ztoi*JqZcA<&%qbd%o1c2RHPhDYRv;^x$R9TkXB#^ZNav2>M-ifUl{7#*Q`$dpISL2 z7c@=E9h9kT7}@z`F5B2XQ;Rwfy{D-tu0`Dy%~VS59h?0kgP`I<8`;6D7q{|yuWhvv z*b7!EYJIc0aj?x=p-?I~e9_5D%%Jd84%%z9Y`ct*)hMJk;HG333>V3k#zS$(+4!ZaE?;;cwC$Ql_Uc6w%lQIjz_|%{y{TFH1!i zHORRLY2h(Bw}K{#(H$XM{Ql&T-{~EHPDgC-uj~11KH^ijI?3sM*-el78~y2Pdx^!! z!<;TNJB$h0B;n)0pDJ}651?Ij>}~|<9)3*F7^2T#*#4(eyOm1Q9@Eoo0{_S3Req*gstg9qK>bB@{@R| zL#IPe_Q{KMNqhhH&%rU((ru51qPQY?yiXf($lcv*R+uT}9W$8MHU{tWM%(AlpI>fR zw2|QL1$^l^-Sq28xS+)L5iecoTXJk19jmZB^-puvi;7OspXp5SCcvR zfeoL@`iQ2(dx@XEpuVB1MouH%#tgY-K|_d4jO4s`7ynhX9p~6H>X%msUk2R0@(I6C zS&CcotKzSlo}~3ps<>-kLhn-jW$W+`u=za|kYUwpuixhhzShE2_Sx^}u0L5QMc6a+>7zk{zjGBflkDk#+0pl~=49 zq3=yDLm%9|!BxV&b~S_3lGo>@;)f3(Qw!@pMt*!>L~OY5@paMa1KCnf!}+&ej-RfL zy<1gF;NAXe(5cmF-l_Ae`G6%U%&1z=nvmt=6B>r^SeJVG!%N!Y&d6l z>0_ng$j5a`?uJLiwV~9|6miLjpfmew2NL4RJb~INy|PccycG?abO&SKMEuFd0SC@k6mcZuFTIR~xK6HL8`;n7AWmi-H#Mq;V{wulD zU^#d&6ar1AoKRX#5PiJ*F#VyzcmFs@5d;=fsF51;)h?scDapzJG^jjiF=&|Ip%e7& zkpsp7ztYfw?_1`e&yseaVu(<4)U>0U3%i@JU=7yX>`-8avVjCmDR$KPvrb%e#tzLv+%ve24cIEu);P0rHu}vVx%l%pPqfc(B^%^ z^P^ATVlj$!USXvE%fUQx5zP|!n?xCDWLg|ltEJm0rIEW@{m%1r`A<&2oQ;W$OP%vp ze55~`f0z)gy;Ezvp0FN_rdY34c*Ne%PNwh<)DjpN5DQvS2x=;B5^MU{B*!H8T&CTo zJ;42`-wU+hM(#96h(ZWy$llTM{K~I1RO4W6(CamR%!iU(_9N4M$g$Ls&*9*S__6eX zA7Lty2mZ&4?3c_4f8*C)QnVcx_j=oyDiUGwv`q#2bh)LyTv7ye+;sczUP-&3^jlF<#^?59)FMr;b=Ky~D|qj&wZTCv z=CGSuTET9s)jrMYDDT!)M>ZGVj!IOLQnAH%Yz-$@z4N%e=ebC_iLPXUQc{L#@>qvG zWW>XI6F;EH9~7*FA;nY*!iwolr^Z6Z1S*3(@jbh|0xt=wMEx{$^IvbDLPcrQIv&EWe7ln7cvW27 zQL6V-x@4Hq$ZGkR{DS_%ukQyTrQMVCUNb#f86y%ic9_+J#4wVhyqa6hw=4!KKKp%X zkJ?xL(pq!dEpAxo(5OqG*<|6qk=35PsNWXCS{>7{HxkZksx|&5Pca0Q z^YTWIO8Bj|{OxzT4h?q<^NOZRGs_QE>w^y3C0|PpNM<tEFkw zk=WkUBu72$0B=rtN@XR&5c&wJx$^R`a5scPmLenGDSi#gO(`vXAjKw!47NMEx1F2Z z;hiW1(v?*_-Txeeq&?xg%@?F+pA|T%kaMuU)L&3m+g4kZ5u@ORaakN2M5Us1)6w!_ zJ5PI~;EY!oS%d?uKDzHd0_`+*Y(U2qCYD8lEZv%rA$f%%5018qE*R@slC}8m$sHZ$ zv4~2KyI-4W`XO4AyUnm+gkC$ICUfs;~W}SE5v8HQubS zZ~W7`BqaB!!79d@qRHyGee?TX?Yeu~(cB?NlYl?^AY_RMP9?7>rkHv3`fzCs)oFd# zdN{QDbbLqX_e68})9?#4>fc_Tl0GcYF*U_&nx@2)q`+gI%(~akzZd4IMi*oxi+@TY zu=mV!n;@GK1M5Jy!qO=5_S~W-L_U;>GCYyFfu}Kz=i|llI3h^$pyj^9tvaevx^2ly zPTFG@gH_qEuSY2H7ljQGS-q$D*DeO%rWNWb1M-PjdlL-@9UZ(Iz%?-*!3Ac#OTg6y 
z;H7wh<$td4T;RvM_~&zcJUplq9>L%D=mPK9pBKOj`^?$<#aGdIM8IE{ftPqP6bap2f6#V?)?2Y@E7RnBX4gHIS~;* zKR;nVabb5)M-fq3Sy_>rVj^No4A8*jrtJseI=jUvk_Wn-)?8(jR?`Z)O6v2KYA}V}SzB|uYbumYa<~9KxSuDN&3(Bj8 z%F34r2=e$pa9V!Mmbg9;L{C6)o8bNN?LZ32*E^XOU15%MEg|UN(^HMV4ns!temnR% zPWYjOL-(8CE{Q(e{BY@u>Jz*R_yok%oXY>_i>W)=(CoU2CTA+m|MLxhKO6o)S-+SI zXET59%1$F8j%^$xL)V?tA@CJ7o%6->?^txee~NV5YXu3PJ9q*D#Q6EsBWQU0B)cxH z*&|Ww-059VsgS}Up}$8O?skonf89Z3a_%xNS^l{b5EI4!is}E~VhVSor$aA}3GA$n z$+z{ggMy;jJ5N;;e)`ZyyXv7?w(4Ag__(9vr1J8b2}Wfq;3e-pUhS7?W^>2 z1)3yzmh|b|RxaSr6EFy-axC}jE{u4{^XtvuJD`Y`yA zb!5GlL)B?~RUe!8+ia7&ld3|kPUlXNU!X6!6)8+Z$KuxDB47RT+?8QqSiAw(_dCq4 zb1Zw;*I46D2D2*QK7TF&;lPlh54M-PH&%Jf<$bWLn5aMKs7%~h@LyKY^(vqUAF@C~ zf_D|=R|X0rY}9o9h8<8XBbA~tAe@yg@nSJQ0_=Xrht;;IPclj6}L z|4@00fKHJyrwF8#q)(3V#yBsd2jTgT3Pyi!s`qMbIy8-g*d1{U0MCx5$?V;zIcI>t zSJ@x}8+YVrQJuJOh8X{Pf2Hwo1wSdWpj*Pkr%%jzr2 z*DHYg!p{vP-a_F4yOr>!`bzmJHJ`5RIzLxfU#0PeE_5yWJXumNn&EGwzEgOizAbuF z{kR@svrX&Q;Ja|{L*5$UJ#u=G0`ger6~gGgT=f)}yX>*&>Xj}Ju+XXE&X4&TW}|Ay zWh0O+5prx*EZ5Fct!!t~Z*OJu20^LW{v=kWGodG{uwQzXmNWY1<}86DLy?)FAmJ}p z*v<{;%Cg}H+bC%L?k>0|T^2lC{1)8){0i9NcLBHHQ>0WNS@qgzZrwGKNGSOgWot7#>bop5dkG9 z;Z9*srW9af8Ezw)QzMT-eJiALVg#n_mcrHd3cww2hP;JwsUx7k$er?#qdl$Dwv`Uu z435yv&{PGd`)Sh&*>&-C6Jq>UfkR?ZR&84>#1;2`@Y!aW6MOrHEAWR(2hhRu{PxgG z7#xbH?8orTpq-NG=FOoWKk~>c{I-ZT5?#eFt3X)p)f(zIP5u^$yl|1|8VKrGFr$#F z7!o)TQ5VlXU{752NtuhEJ&gdd@WajCwSW(osK!;Ztzn_yzHF|gynLMu=IIdBT1}k` zHC-)<7qKf1og>%Q^DBdc?kwNlPK72}*U}Vqdy9t){%M-b%ceQD(Uf)*7q-jRB5j-f z0BFm*9>+-|a0MWaclVu%5L$`M$>h=?LG3I>q2Qao#X0#Ci_Hz%6QmNvv2%I5H{2!| ze?G>gNyTMgrumrA@ES>X>EHG*b2!IV4yqj@)7}p?O?4=8mIo|5g>yR z+;;?%GuL-VtVnjfmd%%kqPB@+$>>VVAH`?;7rx|%m(Oyys&j-=Uwh6m(?Ioel|b$Z zfTAxp5!B*RE#j(}&p*GYsm`>VykhmRctsRCiW~uqPR8kgwkBA=iLzV4-3&K(Z=rGM zNv*fz6Oc92x4<7TD-vR(qv8;~`rPEP-Tu#$icX$kTJx2fj1*<)Wk3TyjLbUV)Bqy} zCW%KLiz8)WOz3_n__z)t6ey_ziSsm4H0>rtkhOYtmoX8NvocMyZr@3jth9%L^bfAD za`m+Uu4ix8&j;tyxn5dcK8)s4i;i|ONV*q26ChCU-%X(;#P62tH#VTM)m2o?@c3v) zqn$g9mo23!o*fRjh9W=^mIs#UxWdpQ+wkdT=hkO5T2ZooE^i z@O&QQt3lG(#t{9c1+$gNOJoc@6jw=Nlk~{K*ExZ-iqd^d9A{dEz|XxSlvdiZ)9Te( zkX<*(n=3SxD?^BY1T00N2KB;d7AXPIt#7^oapOXz`huMNJ(7XsObQ=ah!A8Pmip<^ z#Chx7K-Y>J<4HKXCcDH8^kd$=b0bsSOS6ERfsU`Nxa294t!APD!Tfbwq=_yw2!mjd zw|352%xrWcmz0zQAMcZZp&4kGwQix|19><|csRmP=B~cQg?0y=Yuimq9J_5Sz^^Mu zUsmtCLJ4Z_8RsB_$t*mxXpYkCn4eU2od%37VaT zDNdvR=U(LVyq|T(ZFF4jrla2@t(}UUUT+ zF}wWhWOr*0Hy5cF-OkMMPZz9pA24Mn|*K4V_|0<1T6;z?P+f~VU7>tCe3Sdnm0tJ)qS%n zCP$mwPqZ3{V^bdVZQ&jmEj1m=T{{tJL>XH--B!|nBu&^?0-0_K_?4$vT|K)jAVOMH z*7CTeLUrkfu{BlWnvZg-l&e|JouKG)7WXQZx@cXsL%9-4qzZ@Pv5e)Zs)L zbmn+Db;n9BQ2)5^#4i#Ik0y;J6H*VyWu%;ao`8l=-q)YhRoSAPaLs^4-Oz%JE~8!r z#8l=1`e=P^Tfp7g8-|`dQkFUXYFjzMAKYHPKUt|qf}TP@+#l=jV65p-4FZ(8ZZ#rw zCt@cQ+xb2>8rl-FLV%z!OQgr8n@^OxiHT#q)Kwtz1c)(@p#$d+r4Pmh>V=*nXW`H& z*vW29Uv5WhOw&x@QAYW^k-VWF{C4%HZ2!G<+1Wvsu=;XG1pOPTtdnoivtuYLViBI^ zq@s)z>FE)a^q5TBHt2AX7Ci8)Gs^1?LulwPy;eWJX~n6@$!wXPD^2~;ERptaOyv}g zH({;nbL}aaEyzGiOUwPXz?DctqbqER`#*Cxvm7%i<~sB~m?*9LAI?xk0RWRMmc{T> z5>A1Z`(p*Fqd!Th0{N8)-a-nxPUY@zs)f<*jk>1n7}@e4{U{47=-00q5Vu#^`u4VR zpx#dpw6wp85OKL|llo3*d*M47?D&Vq?C`{Tmz3oGb1w>x-AXk$X}1XDUm(M4@O(JX)zK`bcT7juvj9 zcfvP8hx9@?BOI516d8kqwwe>zjzag_-;lxARik(XQ*-@EF0kq1V!*uRu!`M9fLNoW zLzO+;(sJ$3-NyvEQ*fs+znuPMy%BbkHE_Q_&IfKvY;JAMaZT^-TtFV)WqQUwXEG~kkG0iV)M;~`=t~c5af)Hq{`_<)vzwUKF8`oQ;Eqm8ROWXPy zWU7Q%fffBRGRdT?aPWNK`w}CbaO2K#UqOxU%aE~XCcLb`L)qC_CF?Y|!*Sylx*nOe zNEg4c-ci~EE$b8Cv5>gNd-_dvo(VX!a5*6K$%J`5wNZggGk3klY{bAA@dp8cHd4F2 zDI8zSM^Z}}JqIs^Wrc7l?4Y6W4|(d*Yc*3J`@E!k5KV)AXg80Rp!Fs$!NygwB?yEV z)xjPn)^s?&0v;3z7IwjWFJoBW%1TsV*gKugS*n-_Mwx@*dSx6fzvp$-V4)#{t`Pqf 
ze~TIm|Ni;ecv1KKqC7N{+8WU*rgt;Z?&}iPe`gc~D4$}+a%NZ3+*#UNPF295y@b$3 zj6CT`?3Y~~=^1v+?kFud{)aI|<7$;904*MxDJh=11+8aMs)HVWe&X?KmMALxB2i?S zON~=Q7?AGFt_Q6Pq#wmgPm?oR1qXnec}ansM&Fwfmw{)C19*A6mY3fRh^~b>1#!^S z9&2OxeP=X(f|~FtsxypE_%LJ5m!hm$F~58|T@^yi0xh3jIa1PoibRnWj5)sGnwdtVs22nfk7W_0=f^{<&KCs;m z(^cF`0mprZ0Uc)hjZC4*M0a}LFd5AR&KZJ-KD=DYr&3~OZ27%gQaJ_w`t!>y2bD0Z zZ_we+l_lz_sEiVQAmPFPXn-^5vzaODr^DF)_&ZS^4C3_Z_hUl|!6}cqlG(#wdbV0X zR?{s4bHT=tLGx#zha*QrD3->IJ7{|q$f|v#>E(p>oYz_@Oq!0{7s7&1dXv}PyiW={ zI0V0$H_bZ2E+ZA-V*`MH+vC9u-LdYXbl*HUY4#<9vQ~vP2w8Dk37BI}%8tmqgspuM zKW3}Md%pY9E6$~Nj#!2`ftzK58R-~J=x_nh3-~2h0kC6w32V(|&BN~w%=rysLHE*@ z4JXG{wmiFqXfd806iZXp5A-ccYd!Y%6O}x}7AIX7NYbc) z7%hRd##64@{tk@g=;SLEh=LPjWBz+fjp6>I$^FeAWiHI2e&aDqk_FmtnH6i7q{&7^ zgwn&Nmu9!qg0D>lZ-j)AK~Hv4fdLCAf5PcBsUMjF175$*&p?MN$(`fBk}?}Zg@di~8!e1R)~?BYKziwI4a^XsrtLU;;MTZK zmmqVTy>qzHv~4wdJRvVp*NaS+$?{A$mKD}hIPe~I&f>@ew5#i<2resr0_4g-m^M>^ z2oND&%IzJ;cbA2FX(qun3h0aWouNFBo`Y?*GAk$TIrdjv!G$#NK})pKroPOt^4u;% zXU~aO1PD>QmhBp6_^0mYnq{;o!j{?miPiv=HEX_3+8ehY3u}`zp7K|&7MvL8|#*%^?#u7-M}m(v&=9I!Xx;yV(r53N^VM$ z#`RC-elFyXq%o5ts_7B6_1(ZEl;k|JHX}mNnj%-!M%~?;yRU9DV)+v!X?H=6Pt#kZ zY^XH=>3lEUUDqNS`yu8k(UbIHlJWj5Xo%ZpPaE~!g$PPksf_g zh4iu3B18xkPFuB>){%Orr2S^9==onhQymH7Q4d65t<%Pf$i?tYJNW3-Glqz2fX5ufy&O9s#P3*@C;(@tfQI_awYG_~Zo zRX#kiT5j&Y`(uP<)xDD*OW?>c;z}<2t^oUx2j-XQ{Q9zGR&*xEy{J(-IiA-E)5EkD zZm>(aJjwy!twF1$->Kvu_)wBq@^WYY^TyiyqU>pkuGZa@(;c9Rtj)3BF zcCyppMG>w{K4Ff~{d>)PS|;qv0_+=SvL63_vV{+*Cs+Y(gK+tAYJ@zW9JdPHiW)uY{(J5(I z_NH=B=(nuTa@9?bpHIAlob*{E*27L$!mA3rmcnbNeT`)nQuyAdRR%o=dk#C-RfSq4 zUfA^Wn~-kW9}tLJo%YYAXuX8W=wEDZk3g0WLHw5o=J|-&sQ0ET6PmW#;>3l+;zLhUUZ2&_K>o;{!YHV*Fixw?HBYc$HPMLD9Sw&;I#7Q_kld= zKwD<*wKWuTaztN)Xsz3tgZC!LDHc~8KAwFBr_w|x1Bu<-wX7-r^q}GEIr;h5N|H`? zRmGGx6HdK)*0&PmH%LU9_V3H^khp5CbWC>;9WRzsqxedzu-JAsU0hFdqp+%He~Y%Y zOjky-jjO6#sDHnetmnqJLJJGW4$D2w^_6*~j@APPIuV+z2%Iv|lx2d8)AO@+Z>(Vdhp!*%X_<@C~jUuk?pOYo|jC`VSHE zDa@h@6dzKo0_{@YMNaD}9%NCY?v^^lfW27Pzj$fonZZ-JR?xOvOSzV9hO*6x;Yc4V{f+j@dzEfJo3He~V;-*lwG)16(k-IY_3SA+_UO3q%53oq8K!Q(ojH z1kkl4zYkeB5W0d#D{N8tVjhtUv_`7A>k2c9Yq0L4x~${xmS_;83LS=`e^Bmt0=}Yb_qb!^?cr|H0*C z-_pFAjckU7*~tf6X(SxQfx|XX@VW8#owlZ%Y3n;YADBssrtWN0B#n6fY=+1^^u-``$*hUBBow! z#RkiH(z950o}BukfaO2vra#l(TR7POK%it;fu;!juR!(xzb8=34-bG^mBXDSTBM`V zIm1$+FsG+5dAvI(|iMky?Gwznx@d*Ju#SS1|TQ65k-trr0K zIV;9x%Z!|k2ktV{R0Xduo zoo!1j$jVf${*3L{CnaE4?5?_geC~(6<>bVCv%n5fiA*`4$EgYdo*&A}M5d-(M=l`_ ziq61P-xn=N-J(haAw514IgtS*?7y{yj|{S+zaWm1arsQZDHw7_f2~Ne)X~}4(F1ZN z7KO*;&jHr!RdjnrrKO)0V2Kldcc7vLDV+tjbD%kD(EOPGdhdbERa zqDBH64-0eiq~k)0ki#JcW#oqqw-6xXrEC0R~(v{KtTEB9agH*3t;u)S}6+ zPnxnaa}l;}C06W`36kPkYP|4I_@WXCV8}6T5SwH7dyS-l&;9@$9m{vyx(>L-%0F%~ zn)3cKu)Iv7Hn-%ydHZSsJ1LYvKBb9b>!~@wh2elMKUi`0I8WBPTYe~7F_7V>)H>-` z=ZFHIF?!Wc)mm&?Iq;;eF)?MPr>uP*m-NGh zd8ozTMukHx#nw=*L~-zYVVhrwa*7bRdCCKKH@_G`HUz4`o#IEBbm z!bV#j&NfShsG)i5BW3Q)%pH7KHDG4w2zcJOeDzP_iG}*nzWfM{p%hEsv75l=z|JzT zhg5I@a(vqkXBG~WdxnIMz7!#BudXIT-OL1bPyIKV(XD_@nVlkCgD z)m1~vu!pj?bMA2_c+r@j6F|lR5nHleoRg8*#ae2)|Li31@yihe9j& zPCP01o#%JUWPiaDD9=-cHA z1fgn@Gtoxe*U`~22|busJH?y~7U^Uua zO|$?DBHu$HypRDv9htM>T4M1LD8>;eF1I4p z`@0Ns>8Su1cz{$vR|97_UzETHW>4kg6EbiV4~mnwH349BTH2WfGYBJQ^VJjCGxm5g zWxb5V$iF;iXn8t_TLXxz@_?u5;y~Q6=q)#zG*K~#&L_qz3Jui8}daN8y^n4$zr_(&X!x~L2T07MD}J+Io)R2 z7<1wu=TsH5-Q`THlyOk8ZEvE%YEIvjVGF<=x^bTkD@qd(8LO`xMiub#kI6F5Yl=Yb z6;MxgQEgROCggCZ$`PX8P^*as?A)H@@V@9=Pop4Hu;(P+|GnJNS@6ro`Z! 
z@>}nl;Xrha@iQ+3nfTPkJH5nv@h&JseCZv%ZfYH(m{jB%kii{qMZx{DPO;qA?Y zJCs+FwjIPmDcM*9_g1e=hoGl;#NUl_aoh$#IK!FZ#NCTkT6He2xU)Fb==YoAgK`ly!&@bAygj0M3jRQXK|yuUfyX|lZw z?s{Dq2O&JSP5N*EI7E++zq=WFynoYob(9iq0?a-6Z0?U73b8ez$%aN6iSHzwG!NHa zIAp5gEJDL3B|OyXIc#Ef`iHm5V3@#EQQJGvKsg{&kULXNiOQ-0O#$qlbo5(s$Ec-2 z`C+O6agN*PwaIXKaRz3M-Gsan@2ly*Qc`QSSS;kl3~aESWmGv=@>)WUq798LB?a#u z16+M-@U=^1IM?TiaxLRxf37+cK)&BqY;`Vce&a0%@EpK@f;cETqc8fN4*>N)S>!A? zZ@D8KQVVgG{ZKCo8*XOpV0oVa1r?ga0EPg3+b2@0l=b1VqP{^TjhF$6N_SX`ths^}87__LT%0N6* zrQNOQ)eFx#8P+#N5V*Tc9}Va?gP;kumH%uMrKn~Jz<5RdJ#T&rHaQ8w+DXrxmKH_r zOoi%L?g!^UjKxxXz^`Dd<7TJ)W_0AZbY1)aIvqoLrZNxinK|Vpw|)-4I0(>4eS_zo zTSz8A_6g#dnR7rB?rxbiXd$$$s;6nBkmAN z&`_bA-s8Q$nN#>#OQiXAg4skS>r8Ef1jT_(7K`)s&X7~&+M`ySZbm6R42dtG49|6}>LQi_F$u$@L(}OZ! zr(n9t`J5dj_bo$STKkz(xHaWw2S$={Wh)M zTQQ+;Z^aA#>$tF|9tbHOp9TQH?;wcg53|sB_j->WYe%BA5`_EidGmTr|8?QI6_8Or z+or1>0pjcYnHoLdN4WbJSF|&Ofx<;_YorGK<1uZV@CINjK7B3+tEu*8g0NnD(v5o* zKz8WA1HY=%&(GoDyksNNUEsLI?_Y)gy)kU;2LP=9_IGLcP~lpVEXlt%2Y7$*Tl+U8 z*korgU#CpJ8)s!HcFf6_2m!bZ6c0yM*mhhS%t$D+sGvs!koIrBh+b2aCjq>*vn1XM ztly2HPj9hA5e{UpY#$Dk45|Y7Yv6@UG-9m+XvgO)dF=pZi z4A^K3RLtvR!;=5W?udY~3j#QD!g@xxsYMMe7nQMNi3cFcvo3s3GtPPQ9oJN`Qn*33 zv;0GVoJ~&VTP8@p3Gmp-0yRjWIOrc4jH=Uj$&o*GzkrrtwdKDqt}<3ju-Xz565_v< zmvZCZhiK+pLXVVIXiv;yVi3IhS@68^y}J$^G)lY~yx-?dFu`c$pYv9|R@>*FaZ}YP z!)KUei^vbCO4iV~$a!_{Y?*29B7uB_{f)17@OnLpxo|A75uj^J&WxNi&u7F^dP>}F zGLlr%A=@#lUWps4eI@Bdc={$Kg<+>0Ou$`@$fSi3#&Up~!r zEuSI{A7&U`9LypYY5pAu9GKz!1V;*PaEr1 zI-|BDYh<-Pm?+~&bGK71aI*k|Vr73p>sAM=k$+Xe>9))MSOrX|Si~tu{+)2)+qwVn zRBOP#i<{m_^aukGrC4g7iOJZP$NuAg0XRQJit*D+WgMsFwjO4gNKeTJ?X7xz{-O@R zPB!zLn(qW=cOFhvkC0ca0c7NRzTm}KbvOxM)|42Z9|L}g;AV41Ek$F9UnABblt29P z)T8GhC0c&Hr7XE@JhVXR@EHeG2evuZN^!jPN(sv1)>2tMO;46JV+Y@i%Q}6(`KVXn zNLA(-1rz>%Glf6HwwDqDc@}yg5RQ_g+llJzR4~liTZ$Ufr6&B7BQ#l>YlDUf_`n_x zq*=hhqIjyZ7O87Zn*##i)gz9YiHHoJ@v>Gk=Tf7(Q1XeIv4E_bw(Z|qr7q#K8UCzU z5upHh1JYV)F!Fu*_Bs*6aHCm{eDu4g$kO3h5Y@jsl82pu`_Nh#d8Lj8OFA`tfDgIl z6y-B=U?Ma3-df_r@$!moYq&<4jTVZ*B(&cC)wBmRGTF#Nac$RR-AR#IS}{Ff{r2pP z9;RS&-NE&!^%_eQPy-_iS^#Qzp|F#amv<6jJ5-L5^KaOGWcRcXVI-Ok_8PIXHN&59 zue4{9@K8R~PgK~m^WVxdmY=^k?KP3ba%}NqWCSWyJC$pos3j>a#7~)1$d35#Ix*E> z^{ytL^55r?oeS)*$@ipT0Zp1q{8Qw$7*cw-=Ji*70BH$6c|X`gMy0UR+uERtq8$x5 zDrvQ9+!*5g`grxa*t$jByxjtrv;R=%k~(#?-R1{YPn11NPY1T$4KhXX-kK`T5;m z-Hj5y;ql#GsA z;wtIT+GSGwR_QA;`P-!AOnpj|-n&szM2o-xXSo_Qu-Xd{bc_X9F}nSpLyG>JrwA7Y zYKhimM2!9^#``&Oq;2fT6l-sXaj*T4&h8{BCkKb0^q7O#PBGtTS;+NHlDHGddYd;r z#}0K|n^qROlQO%S2OmF0=2tbKFry1LiD(z#jOS&x)5?GCKVfFzKetY^KLLXHq+`Bu zrhQ&h*ZHlm_R9yHXEE`8oA-yFnCLt%KRq6S!2#wv&3l=j9*gM}Uw%W^A3iSP`C?D3 zr?hRczfmbtbEschfm*x65-j%I%IV;yeOJ}yVQOvboZhiDV&%>X#jv%=bF;u({=3!E za99$TBcER8CzIzT2j8|7k9gFyVTXsQcQ)40ju}iw=Jwt>J%%GwmoeT5pQzmhy7l_C z-l)EM;1G<=@y4~R18)=AfG|i5JijBmvA+4QAf9-)`D7zvpqjb594G;euGk1;Vn0F; zF1;B2F8U$gZ545{Wmkt}5~N^}lem(9)V%q-$UQQ}DzxMsJgwNk+?-YlvvUaES)^BW z)p7OTJ79#Lg0_QGK|#Jr9a06Y1KcT^%}B3rn$Z$r$R)?t?y{!o#p{0pLsjaMNpx?WNNr_f#<-v)QJM!p+XzVU5s@64q!wSe#XWtM+&_ zBg(Jqr^-cmbmzvW`4sDGLqj{193c{1Y`1^H6Wgna)tx>Be7S8Lnda8q=&`O>TZB*WA zslxUuSyVqgv%I0#QzXJ^!qsVf)N5S*KQbNt3_u~|3Ti&^0$ir_SWknRep7#e(iKW2 zyHh*2SrvQGjVSvqrB|mrA7i3TaTYq;8C6xkDm(K+~bwYYdbMlHzm#SMx{-Zv+ zsuDwu2|S%4TS+D*TCY_fJv6r#S;b7rGDgFWcfYg-8ksmn&9^2}P0CY*+L#zOXltsD zS|1-)Z6dYG-mHHNY~DY>?Cu-P*3G1|BX8!w4(4gVYl$T-UjvYv-ykj1<{Ndj zud*LiMyQcL)Jnm!GZA5j-J4&a9Lei98mAe0E%sXNr851Gds_>Xs;$6!>A8nHzu1f` z+g|K_RLV%(u_z^xY<6aNi>|9WiDyybT5T9fBr8Z*Lyy4roxq^LQ%qtC{cvTgUTBm1 z{=!Jh>r&lx8QImQj_@37Sj#M%q*${BZZ9>V+)_MD!#8( zjtGM%N~EZo1BK+=w}Cy9Ru=abw$PIa@xjfyRgI66OfyGR=o#^o`|9;I>=Bbkm_3iW 
zpMMFYk^6L4FUf4|^sBSd**r8{H??^Z+SboU7g|%)w08TWYt0g>Ri(4nC738r%OPx` zo-xn5?%9#_n16)F=oh8C6B_9pjXPsn6P@rx1@_R80aFTHwu`eD(G#HHSud89X18Jm z=i$}Ow4~k}7Pb2Xd9aQixecVp2#AW#_`3PDS1lV{Sl-2Q)c824xX!Y@J|NJy09y0LDV{;bvVL6U;>oh&{cqO+B;}(k& z*!^BggSwaqTPd>CRQoR8+sHqe{H?pGJHedq&WtG%^$bljv5q-feG|G=xYL64YRf`! zzY1%7cX(gx}P594LQj(wjxtf z+!-}-@vg`8h7ngl&1+WCx!my7UrN6jMXZmy7~!)TPaZI{(vQTOixR%;;P5gT+Z2)y1@P%?2b3+ThbY}u z06k{ZYq35)Ut{Sve~F^y5w%(u;2w%Be%!7>JlRiRVWjY7EZm1t^goJo^%YokyAj!W zg8_Pdt2sURa3kp_1onQ!PRA&3)sZ~F_Vdx*%fYYBZOQ}Z5h#h&-g=AXIw31GhKPf* zARQ8#cO%~Iba=suK+#O*qG@Y2o~t7=o{U;m{n;n__L=be-ZYYlR3 zaqo1imlR|kH(@`?0kiOgfA@1RB+EkVB6~j@$**3~1#OL%_n0};Ex#6MYan9tb;-@e zZ=0TU%s4EPh!pWlXs(v$2KHC^a*nTSh#a6|r3>aqyFr+=%R$nB6A6-Ej~wg{t>DfPfG>8_QpP?4@IN!Cm&tjP{P zwEc7$+LLe^-e@7x$fP}KkZ1DsmJ~~Whpl{l)=0_Gdk%R33nNzB|0`cDQ9+Dbc86l{(3@=U)Z89R#kS;xlBf>O?J6*{`s$&Z=T!%c1Lrl_?WTJ zf9)}a2_t)}{_{TxJ-l~8W$UkNYt^^^pvN_zAKCsMV&8UrI>X3?DO1>ilbo9q2E9U& zwW5=t?V-6>^;e+`>B!($i3*3e8rY$DJ*|0lIWpd{D+m>c#p!k)Dl+d5c_PBwX<13J zBge>AuhOOdq!bqI)dQ0-*MZh2sG%5?;E^#&t@mIug?;8Cgcm#5PLc?@{vYgVfQk^MB&ZaTUPF&c2^|T&MuL>k z0)!+$2>JHwxo6J3b}@>&aR= zjQAbxPLR<8#opjkIk<&;z3e0mZ+HBu8U~}NrRuq{h5F%_DqQd5rQ9b6!`Yp=CjB$X zqTjUG%O=Jt4a5FswK|(_&Y1Cqa<4a4B4MG)5o&TiI&(IooIPz^x*APtCLA{@739V3 z&hudzE*vJ7fwm*PO8Qg!$b6*o4Ns0Y`NHGDyKnO3+gODILuWE`l#N1&aI}keKZUO? z585d@xz&}pP7HBN4jjuhjE_9Y_Mds`=5u>-%F&j9e#D%ubbNf;-5nXJgXCTleZyFO z4No-ZJE_9OEfl&CS(!FUMSHBDINGWys4Y>yd8PNgdTSb5iVcP}!a44;t zv?y0{q5`;tsG-nP9gbGA?G~)eGwv<*S$)JV(p&ui0 z1uzuBV5u@sURB@e)MU&yupV6>ZT1vkoJ)tD&YkB+DP=enK45O$*eyNVf zVxLsbGFtUeL3J|THR3z+Ur3SIn>47eM){ZKBHQ*OGJ%Twd>uy2B)Ehz4o~u!Zf8izPd#ehmSjcfFJL9R?Lb6M_6R$UO&U>Mgyjb@dTg@F5 zaHqJe-fqoDoT~{fmB%?-O4QH4iTOD{dII!r?SJlV3FRE)or0|7#iTiHY@@0vB5upS z3+I_!=ZMZP)e)HHG~-tWvlA%~uNqbBqqtP8_&sZe>tw$*#6D1apC70#=zo5B3dI}p z!rj6nL;Gi)GNM7tYm0NT1YYbZtRQ}x*>h-=?G<{!7|0{`Kzu7jJsGbX(ey8I57I{v+&Rd z>!b-qF1(_`sRcTI;&+s*RKh_(v{$M&L__V_Mpi@l3 zu@LtZ*qR0m(>HH468U}4!4w$BR@>9BrDU%}K73%?l-FR{>39PE?Frle+Ip`VPhA6)|TNxD4B>JfYTTut^fN zsB59sIC9*cJvi)W>+-s-ikLO&YhxPP?V62|;8PoBvgrNNZ;Yv?R>%32x6aI=>NS=a^3`V_xdDYq5dYtKK^nCgMiki z**`&X<$1_RyS(DyA?ebYIFdu~gM{*u5+8HH@w3~_Yheggj|Cdrcdo%^NyksjutFx5 z$o}@;vwo)Gs#|dh%wl_Op2^~_w1keytU8;JaKt_J)ny}l?%J6oLg^&|F!o>@$1J@- zXi}#CE@Q2yKFSzbPu^jlav}s-OU*EzVp;U3+O`cIi&TH>LX zQ=J1ht3DgDoqfHDvw`52DHiu?-|y8Y?L=jSLRPv}<~S!-FOK0i)5tZ5nV-G{u z59=woB5yTIYCih!VdUH)YR8zAalVVS+js5w)TC~R^ZN5H&#k{F{!;2$k6s4hqQ@yG5 z;@^a7kQ!Wu20s3w2A2I=d2*dnA$>>;CD<7|{4SR~k6|nSxPa3#4S*d|v9ssS)B$#g zZV;xmQYU=7&#+$$O7-Z4j8v)*SkbAO9YHa4v1cl_t_^&@$=jumujEr}-&W?}79Tmu z&&p!QiV-~BuswOawlgiSrN%By(tXEgf=G6PWr|!Q7A{XK4UlgF`o4iAmL#@(%Gvoa zkIC-uVXpK~Ec#C33SDt`ybeMMZ~Y~WMeqHx&iAtBt&0dql>|G(%*8mn`KP-uRvAm- zhujgR2H11X%IMLV?{*)2O=-1nC`|Ogy5(~I71arvB5|K|#rKbU7fBAAvsH15UJ-FR4o3sSBd(*}3S0-u>$XBTU51%dFJu4!pNX#j7f! z&TM|ae_tsMOD(oxSYg|YI~4q!&eZrUEEZHF`4k40Jc4y&E}waFko0vNry;?<-E7q` zzvBL3Y+O2PL9kZs&rw#*^_`?P2iqp+9=Q2yZYId-iV6k}HbmUaDak2O!(-S4;mL(h z#av+$MQ>-0ezX^G_oXQ{`xn>P3z7EHQ(%JT8uY5x*xHr-#@?AfpLL&okdQ$x_n$%z z7^dj$%*ZKzFtTDx$>`maF}T=~Z0D44K+b||g)GB+cuH70l*Cfmo&aeOawy$c2Y8QIqZ9VEx$nOy3NS>mHN2~)n zXE}f|j1Gmn-|wn>L?RqHx;=Y`;Gi3t2qaFLum|0BQ)(+qXRm&Hpksrcc0v?kgrvkG z)jrtjPdSOtz7d!ZtD=>`2!56Y$_vy|UTdxiYd7-#9TN+dDU`}QbYNakv8DIO8=E)R zj}>$Qk=J^N;)PnAJ5tn$qo}halw4earCaM&o0xV91*PMy1FX>S91bLMIdA)bL{{M) z13}!{d~lnyV7HZBi|1TF-+tbJX>#`J&I+RN4 zU~#u^WVvhUyVK?uQ+%%b%hbYSKg!WAkYyJF@#m68DbroXR_7$;Z~NNpoX&J99qEN3 zad;W?S{ch2i$iJ7*bw^)ED6jGS9(seN@hQ?y!ETDMI-G=gL~F_T*|4#B~3qf$rxy! 
z2Ut$2LJB|X3kv1g{5+`bTO(-oQi--_xei*mm^L_ENCOy*Au6tAne!qv+ntumYXC~S zUP;txjk~k9iPMB7%fYXdlq*ipcXD`2E4U@vowXzw(`wX0Jt$%^*pZo@R3djI!ee3% zX;7V1qJ}PA_9K-GD0p7sQ}%Q-Roit{q2G;_8+1m_9)!iUT^Bf5Ral1R!B80&gVH-G zoo@?wxL&1s7WMBH4^Hju+UN^>yy{Q`?%b&#xhvWnb3o&ZxLpF=G%9$!n5>lGFKDc0Ab;H?57DjMr~xsrx^!C|A3 z|9SA6St6+|T&7&v5>Y){8&7y(PnjYx43*m)1>@9O^)1V83`l2Wp%QVP(qj)a-N}@G z6FB0Spl^M9pE!R&Y~*5Y$-`SP4Q+DU{R*DhETK$uexz=`o5-nMFWY!*E9MxUv{s!b+>6?j9dRxd@QGiZRK)pSVm5{ zyrtKWRBh=l@<8@CD8DK1H>0^@PN=GA#bgg{o!U$tNz-Phf`M&tfTi~F;HT^S zFt%UKRZpECL`$_ZUT9;kj6YW|cbY6zvN2a>a3gc=dPxFs>*O82-a`*CU=BU?Q7>U7 zLn3&pD=HA^m;(L(`;%l-0Oo0zm!mU{KvCv(Fv7Man&4>blJ ze%pKb=N6P(Z~qXk6-qk+^)2ca*MVb)-w{RC%`R@7e5AP+^(hZOQr!>`UsP1oFc_zj zQaKmKKDM%r*J__j8Ex*1Y;CZERvO{A_dp~Bn=}|g5bs!Ljm{J8dnrjATt&3-|6bZZ z(KMH?bj#_faP90ub6@OU(CAk_jiQn}m?cXc`BJ!Bv6xs&K*u`rPg5Cb$rg#>Q94w6zzHQhNb z$S5(B^FMo5)KMD{Pu2C+Sj1dy0%EB^z}29{G9^jLxP%-R{^gJ32G3wlRN`<@+ul(1 zNZ10UrzFR7d9BcJ(WL0JeGx668oQ3p;vvmeGWBdGgo<0aV3<)5Wn4vk>FH_Xgb&Q@ zA$xABg*UI@?9i|Q_mQM~Jm#zCNH`nd1QbV&ci)6fWJYPV;$_afnx?ITlvCR57_P07 z4}B%1pUrwB8gjHDVX2P*)Moiw%CQ4A=Xyl~*V32VVFRsK(-L`weJ3%8gMV_SzdhUy zE^WI{&*nlimM-~yUSmm0A)-B&bY`IsT=pV|d|35}6I?JzKICLf?D(I0SSzIch{hNE zbnRD#AZbv3E}Ss;j+lT4oTxeX0)|r3xIYu~TFO0g475?+&Cm@xiTe1_^>?gr*t=SAm%X%BopMIi4G%_q#J~gz-sJ>ql;(h*T8Sb zw!dGnetQUU1Kc|fa|(v6@x4*U?@3DI*3d;Su9-Hl|B9p9Q)*!U0c9at<(3X(i=ZP! zd4b#XnWZ%5dR$fBgtR?=R^7xJu12w2*W`#7sI0 z^aVA8gk2`MH#sdl)FrrUALQPWz%Gy=L|{ADtv)C!W%-l~rmJZ#MCmUF^-l)Y26V5; zeuUdznesKTv5Ac=iE5JP&aAOjqT)zYUnM+J@uOI z&C`5TmAW&ay6-vdO1VNowE5fD9z-?OUDvg5>`YG?G0iDkty@}@$SJn(LVpzxp;t^M z7&YRX+4TW^m+Vr+T>S}SHvEpCwmu6P#6m?^V^td*EQX;NkXcY zIo30CH0qpO;H`p)Mv*8#Tc>`noGnch$Acl9LWaA27JoZie2(-wx&vUAWj2h@I(G;O z@l5dY9(DXPYh`h5k4&1Y@OWM!)(IZaJ?emU`QQ}7)dFlY|_5rLP)k` zA-c_@NWHQlV%Z$Uwk~wvl5ebd7gJ@IttiY;5H`+33csCJ+~kjyNR(shS8BT7&6zCe z2I(CwbFY|#9sC(thH*`P0QD?|!gh2iIl&qoSrvAv&w1PJs6D9dWy~f^NeLB|pI}_I&sE(TD&Fb5YT?4yF$8v8*aZouh51V4|qD=9u({87xM z1?^#ah+YixR@4zMX3k8sOX(cp0e8v{kyX1x>+8w)gtCgpr^pFs^_&*_9-94OqV~)w zn)A@2E`s8EeMfCL6aC)AYHK7O?du<3-CTEiyd8aNLN;Q>&Gt2cF4+!K_5V5!YZhUS zIrWZo$4(t_yS}ZqXk4Auo<5OWPGP|uQAX4PHC&ZPzh9_x6Xp_SW%3fd%`jsahn$JB z;GF&VxchKl?hL%wz3pC_qiejiB|s9B+|M2H9mc^ZZ<-^v67P9vQeIb;QN70)vUU#5 ziRO-du9zVLuLhihTS#*fT&s*7I|yY{!$r>9YP;TAsG1(|@McW+U_C4B(1Gf+$ocD1 z=$zr_Im5Gy2X4YmU>-asLEHdFCositifZdw0|bSyLLjBi5>o?8W!IKa-hFARaL<*; z%DZ{(xDB$_TDRQD^Rr-Q-{9uNX#fJTOUMKS(k>N`MI(r0j#RA+aanp)I}Mun|=K;$ytNhurwR!BSU1{XeXM+6%p|BzEqU%*rJQw-6Dv2C}* zx|?G=LMD=0D?^F-ebUL^>}%3;OWtU{b+N|BJ`45Di{Y)TtI_nEjzJ_o2j(Dvu-TWJ z=<)e(ieIs#mEF?m2%Ajm7`WbC1+5%j>%}cSQ~pub==N@W#lpRUKKh7~HoJ~rPK;@(`bGTyuNcxKa|V9-FmJk@kxrMv#}XPdFM+u75v zsmk!;)vx0z=#mxY)RA*=k(_;BwGA*b+-xc_#lMX>k>l=+4`P2STwyA<+Vi z4~GZ3^S|cQ5`uB)qfw(Q{Z=Jhr}{ff$8^Q-89wD}f;0fZ=1Gs}E1GB$LRKgyu6FGC z;ga?-WPQJWBKX+ep#>3sWw(aN#i6k>GR(gDh0I+Cb+k=Xs+Bb z!eWhB+RvjfU&}!IhZm-HCH;rbML2dQ`4;F7qx4ki^NXw~%;`KiPmir<_LLPWdh(yS z<5o7;*#EZX*0%FwML ~0CfTQ03#(sX*^MFktib56(*B{sg=}m4 z*`n?%$cFU~ftH61i+t=UEU$|I-g9n+E|4P@`rC+{&s@dQfW}~AcF9zkoc-dfGgiLB zUds<#zP!Y#&PRC;UbTCLb<`i}o<^c?TLV3^Y2emaO*Ra?ythzK0Xchs?RS#x`!Ez8 zx#i+Fcf1wjF2gI(|JWWG&|`U>e{nh2=}K3l;}hriybep3TTnX>+0M1THB*T^A$&t= zx&mf6i0~b264juOEDb%b>N^f__fy*?a)!qO1u{`V4-7l3)Aw0oUIssB46(6>e23l= zv*CSJSKAtug4U#G=et}}CyU^fz|qS%83>NJsvq~@vs%dzAhh6CwBlleGgk7)%Z*`a z?8zPJuEckomOi!$Ym}7QEnBR_aE?)zUSJ%Zu-bC8Gwe_8B9OK1R9366_ z&WdklRJ3vry5_`Di+T?5m@w|Qv>jydRqJW1INJZ0W~l;2SYR%d$LC05TS|sSFhP2u zt&HJ)jD3uXHZ)vU#fj0kMu+7r9J!$SeD7zcvHdnPii}l1+#4|ioqVA zBk#d5@2mu_4X=G~-g72;hYpR8dLK#e5NYu*LY8)VHwsi4eK^SDn!Bz{E))?m8unaU zeri9FLoo@0o88wRXk|=`b*wEfm^tNT zsT;Jw8U88P?rkgFTcJ4hh%T00Rjk?^K58Pv91XH!?Q 
zQ}4{=8^IpS(#|+!*&;QzaYuk-)Y6Kxdi%_%61vXTMm^4^*D7SO|RSw5;mI>XZTZQ!zgGsy`A=rLrfRD>ZuiKd9qu#Gw z82zCwKBuzq>U;u)XMiy`R4({xd|CoNy({cka@5f$83V1jb=F1Y8K!P%eU7U4t7*=a zsUc~NAP{FdA)t<4#Mfipc#7p1ZcCm^YeK2LKOMp6@db{k&LYRKD-UH16nSK@Bhhsv{9nGH19&z$+O@ZZG8%Oef$YNB?U9DDwO!{cL=*YG8i zc(yAH!#eviZWgcd0&5KHbkz=M}}sue%DHK>l~7!Fze0CyZo3`E$6k3c_qO_X|3{== zMSm%6%2?xxaoxz3XbX#}e~8|MriPW%cHjEm4IWe4Y?ZP5~cwF1(XT|Ll+R+&;K z@xMp0ui}NQ9qgp6JZP);MapNYkkLE#aV>YF`?L;=}s9j~cSU}owfA6KXu|#iY zwci@s-r%yCbMi(UhgY-Zjh!E4u2Fp>&`+@}dZg#VU7UX!)^aS(83n?V7)^JJ%Iw?O z&ESTX6|gT#ZQ(}Pxi|}pLwGgq=lp7B??YWVCI$%m)z0RyXwKpxOIAEhA2Tvv-Om0R z+v*3<-;)d4*nZ?2(>aVe&$=k2fx7e7_aM!4VI|P|1D8>de`~Fo+u0+C`dj!oP1ixA zB{Xg+%yChxGa%yVM{ZWDVh3o_%e|M=)hpgV3`)QTaXH1~n*DiaLPXf~Dn#;4WAnG; z*8BcokB%T?LYgA1Te2XP=+ctJS2SbqXlY}8p5`)P@WIqUtqOlO->z>fyl_1tYj`26 zRll+&P2HT|u*&$G2$!~ykIub?`{enB^PfQjvqgb!?aI0*+*rd74d1OMf%KW?49f_m zMxhy)q4nu5Z1+r{S9!%GA%j3uT)+1a+vJnU6(+JiVcR#@EIJ{Ag4-@HFI#1F9jmOk zE)ncsm2t`^hY>^#oT`ipVEVE;Mb!=YLjplDLBd2)T)#O;Q5vM6nJgiOoEZl9^t@-9 zQ_!@zbi2TEzC?~Lu4)BqAMT_#JsP?0Qf1*FE*3eGjl-z@VbEthdi2#6F7c3LX_4a+ zg=x+*TpcCF=4@-EEw0*d>-f)eqYqcXeUxV5wG{g!taZ)=VQpf8#vhawT4ECeNrbPX z-=XHiBqUAv!{pgXJ~?Q!D1wdstbEUvjO_f|(HakMtNpJFL8W>k-aD*^k3`+lCo*^J zV)#*{-94@T%g^^w2aQ{^2E?Vk_gU4D$|iQc>b9_(jjs>caS z9F}=EATDxpJkp9W=EwB`bG8`3YgGOa#jbS_?D=zFF3I8S8KBH2lpv?Kpqp31w~OFQWgTNHFM zvdRp_o8!)SIrz`5pmaPAX`dcC7(2OX6-^^1Dhwwev%J z3IcwQ4_6*9Zbbp$)ho*5%SKde;-XijSNExw#Q4y+w!?h!dK%MT3U%m+)3?sc-tmvh zM2Fl{W0`;^@RwF=F5|F5s=hO`Yt1>$KVpa5J6?pP&(f-hM$m;-#;E>X4SfYifg)+u z5S}Pv-m>si49kKKXA3&cM^oNn9i55YWy=iOs0hn|{{<&ekcjbNlV!+nwTFYAzwNF9 zv2GC^-Z@QT&W=w@ONl5{^>{arYhZt#zAw}VG&Jftk%Q(?-mxH19+d2v5xDwww{^M* z(_-qOS%lG0fkcSg@V)YF0%c{Zy~774??0SMDoH`d3k3>A5?^ob@?Z%EEW48zL@{%n zip&D4o)qI+RH;8MpNTbEN_KUAdM+9HqQ%kjoJo##O4d{PRy(4a8^+Leb}jOhws(Dx z-_M8~_W`)>U~xOC|H6-TWy#+*5L8Q!)ENJ|SSjK=$!*vSH)~rQh!46h9A;!-8THy3 z0#q$2=)5>;rAmWH~FsdsfFZi&1CCIt-FyB)Xe zuGz8~S2i=a_~G~V=z#J>h91Caxc{l#3-*%%j)drsGF=go5FQM+&zfIWO#S%u!6mX2 z7d{{ztJu3bot@aKG}aq{?L7G9`C{!UNg9aaQmS@&?mW+77uX?>N(QjQu~A5ZeycQR zpp|v|u067-fkc?E+mG(EGXcw|@rYr7z^A-{6 z=Ly&~6ZbDsL2gkCH76A_JyJls2GDh7h=ztmhE^ZGaAjWhs%HjSN{l!6A56$VU? 
zZLjUxq-PG~pRHK2iYV-7sEIxOegt z*!^rnqeP}M4)Im7BTc>PCi!#Rq^@P@#Xsmn$0GG14>vZYXHi9O+(Y8HYqS(}lHvO1~=f^^1zb_3Fl_gtVpDTD!bmpO+6qt(X?CQ8HJr@?;`i^FD)5(*}^_B5`T$9Be3x)G3W61I#!M966jUUT!9#;fH*Xy)u=PnL>k?)XY=j= z#dtKy=wOG@4dL;)X;)p8<$6Zr&Bnj${HD#bJRpU2Aajc%G{ZmOWc?$Kq*PQF+hcR) zOwo?{QqLys14gBAy1a0DhH+@R%e0cZXzp=n3hhGWLaQIJCHLux=T1xqGNaG7=_3*I zFQk2{WhpaYZZK{;*Cd=#1pK*nhjyC; z5IO=tIZ~j|tc*IxBmlH%)b0C*dmd(sLDMk+r`IemfE=ZvgbNyt92o;A;UXgUCvDj&Spi;c zgp;>x-;aoIz!2o4oj3RXG@J4-ETda5z$h^9uF>DmL1-z1_q6u&V6WcU^3&z=uk6=D zWgsT6BT}ilKYk(}AQ$WYF7WeG{`yPj(IaBnTkK_>A-o)HUnd{~+TXXqijp9UBy=U* z{=h%Pb5hJDx_;;f!Cu*G^ZMYWzAi3mZ(|$9EGXdK--))VU?F5sD zazSe~a-dM9fW>aG@ZTPqgO!_~g568}A=@Sk0P3K}FI(r~C68zo|V6rhgCI3!0gskOUTmeDRa0)7& z)$yOR3!LnkRD3{Y!FS+V%hV^5M^w zzL21k43d*YrInn2o5TEXGTXYd6L8k`7@C~{JWtTYX&4}a5l&@fz64l$(I<7|2;=mP z6zqrr3}_haZbRmQ9|q8`62Z`O%qlXD0~n0_8bPIZF4(#fF7!Nhuc**B<8Cys}$R%Lc<-x9( zX9qfM2qJ0-)}EesrwNj|m6RI*&A7U77Hl`P?Pc`5eg9*l{yOD9K0MR`3s0$gDfQ#0 zivT#yj#NnKEdVe@H$kcN|Mt_T$3-Lk!fa*r-L38@Yyjx_0SL?Fq2y;QwBn%_8BCfl zz<9FnmZb9ZaN22YV9H(s1*6Jsh3w2+x6*^}^@TI}O|ch9JH!7?neH!p191JpOnQ_t z-UaGd?HT$4fPZ43lS!b5V6x*|LjLuC{-k3RV5?O0jUe2jfi8X%v%4{J_rE5|0W^L& zQ0()bb+p}JxhYV1>o)}f?YNjY3MzZvH+Qc+*a5lftJxI* zee^jR{~Le$FZI5E39Mmv>+SU`&?>&2!~$58G=Oe8eY4`zBky0MW=dKC6WnS<1iHDf z8CwVF2+{!SUS%P>=4o^XKrG{3Hr?Rcd+!JQkeV%!n(tflQGmCGy8Mjq#l1Fk^hq%B zYk<$+2o}T^1jH)(EC^(HPZvJ`B0_fqpp?!6yS9I7T(B}aaRa5{H~F#i&``DYWVe!2 zZ*Jk2-4e+Fvnvfuho)eM8x1x*DIYV|EoD>}faA0PXYfL3;Nf3K3{4Q%kvCJj{Pg(i$L~>8ORbm<^5$>is%Wm@NG9z7Wvi!fpEq zSq`ry$;flz6OgagN5>8OOH}`x9L8TqTT=eV*Smgv-7x|XRZ9ieLLM8KM3_;Uo=k9E z`0VC{C;uHE=6}BZt*5}{qQsEV(LefI`7dhY$N-!fGR{Uzrz7tI%mw?pXK#c8&GEk^ zng)pATfsW?40cg0A3^GN(fpt4W~S@|WbZX4r*&QCMVy=AFD>OK5`cV{QJ*eug@zrm zU6++-sDS%68JOxSIai2dxdQ-J5ddJ7@=O5N2H0G=cYvvfb>kLiARW4kFr68DodcpL z;Ll@Q!Tm77@RllQ^r{V#(!27nu;uR?^feE>P$bQ{4X`c%p58(f*MN6#u8lq5z)A;z zekmR`6}9cv%URg`-T8k>o!=g*^loNNcEw?jqW(P=~n z;A$D|er78?>M~RogAM!SOP^VOV=FzHAGZ0QU;6vL0^ocI=R|)9y6=D&Dt1318@llP zs21Vd=g09JFrjiN-fDRh&E6R~khCQG%jevB3Qjs|C!!@9vXAltm+jzvz}6Vv7o_rW z^PNuW8*9aA;N&o1b5^D+zHG9&4SNch`}-h#GV9+tbKdLs1JA(YA%J(Zj(nSeSekr6l4tJ|^YPSGQvc=4CzS$u zmNt;AA;dYEIfaRU|9PnYQlB3mVxNFc+HK`P_?Cs+H>4Hd1Ox=qr=Yb-n?K(F%b$mE z2Eo`9ZwwSe@@8pn5V-ttehX0G@J47dFT6Wx!{GfPIJlR7unJsq+!zG^6F8jjelQih zlNZ|%#t`E0z5av!pq>BKUjq6!ALIZ}LdgdRA+O{}EN?)Nb^qDUUplt` zY$s$p{-d40^7l#qIqd(joxf5J$ZP$_75&E*{r~IPzeXBH39A_aI4#0Jjv6cnpxi=P z`?erb(Xu_>#`))#&3}4gQ>V?Yl{90A)w3Ec7`AboqN@ui8_ucZP<3*`XWu1!&;n7` zwwxc)OUNOeeXI1TE*09GkVarW#=0+d$h^1T;g{#~{OB$S78hh@eREc{MLh6m9iYt#ejO;=ny4sK zDX)Pp6b~Ya3;sHrkaTwUMY~ea!-hhZ-5o=lT_$97y3ft(c~<5}dRF>h5-m%G{aFWK zk#ztEOufHgo8#(RpG0FQK7;agmM)wNclk!#g!iH}>US@kPz2lxa>tlYdseFEPx-z2 zRqhA`x0C2#Jao}w-)8Anjd7 zD*Ud!Q9Y;o=FB-=j6zHFX+S&tt9`tc&imwr8Izmu`F2kJgzcQdmh>mjee0f0dMakg z;oVoZ1nYq;t@OtW1{FLzYGo<&1Do6zI$?i)*~g`Q#p(5*B;HHE4!I3rM=p%2o{pSX zg5PU?(r%s>Gk&(U$IogJikZ2<)!4>BaMJ=vZx^hDGGi^i)I2>vs96|=*^*Hvr+&V{ z{`HqmsjndSDF?9Amdks>Q18HK&JkfHH#h$I)(%P8 zwa4I>-Y5Br z1JH=|5x{MbGB<>H<$zpUcDG;a%?m+E2k~wY*@xda5|SdU{@@P3dcZ|;38*zUeIbdb z_&wt#KNZY`^2(Zkw{||ML^}#?U;q4Qx(`J?#T=vH?7+{gi?<(dK72`Hqsz$TX@HN- zOnK4ttnqT7kf|~r;tTrTaKLMF`t(99H@HbTZLQ@L&<_O5q9xN$a~?+nFsb?zkN=j;_v6m`sCeR09WZfcAD;wx799; zow-w=>-2#XiR(Qb9)d612bJc>Q=yx&c z=U|aMJowYwv_iRH;5Du-1fLl%>LCEWov?o64c;9P$gr5)SKgy6vO71q%?*~?lWKOQ zkT0sVY*Y9a3@(&92YD2)fCiZ-fRw{JF3NYc(4O4It|X;Lzf0U;waO{ypB*Y++bxm0 z4sw~}Qh3J%8w|ZX%np1J3as$YIHr=Z> z;BH7g9C68XP!?ogr7gGBS}B2+xYST?5wnji3!`-@ibOu6&dd$&=jTxe_==h*yy=jcOHN%;0!Ac@8cq^B4a5K}i{VzN>{#v|%uS6Baez zn%L3hn%-fF12^T}n(n+DVYY@;o9fHSVEX-}$9~=jvtMu1SFz123ALc`+7j|s7zL8h zJ!076ixe#b%XvVhddontS{I+&(UGPLlffGiM073wMPb+ 
zOAfG6l45`#$<?ty|5obr1$gS4;%voSUSL^>$_ITo{SJt%>Hm`gS5=Ukb`{cw=eQT0tIDrIhVm+ zr<%S}PDGASpzR~l1(4Ib^oRtBK|OFaSv*=a&4kU+x&C;=9hAw14uR(59d;ejtvaLl zX@ioagVJKs-+{VMErd@QZD0dT&9}a@9mxlz zEu5NaGLG(|4iIL_tyvH{H^qZeDeOD zke`AqNZe_gHN)xLBZ@#OpzkD6POKGi0N=aF4jd=Txeiyn>)ycYZH+aD*q@T+yDcT4 z>IeDeU=<*fmah2mAmKB2OhMFw8LqixadQI+_-fzkJOj9J_8lXDfd^2hJy6mKAkC$d zz*A0(%>f=q{&NrB7)q%zX3qBYE_#IVZz#{HZBVh<7}*ndA{0;|UjfFzYD#x%6UZd$ z02Sfm;TBdsPNQfX0p<3i5q`5p?}|5i#hn$$PWzXR0WPO%$e{qI8(jkR>qtKb%aas~ z%=?ZqyVpl}y(pu~AUf?|G`r+1zQM`e>#Lolk5RYp>gu@yMbQ4E5k0URH*;RFJaw?V zLp)%VPlNOD895N&NmZ^9!Y}D=_`DDMP3^~S8nQQS$@dEry+=K$pR?LI0CgQ6{atMC zj^;wH%`rn&);=vg{Qp9MBejb|NCC{fc$X5yiVmE7=4gL*ng~WMd37}n27$|hBn?Pw zP!1D)X&9&)F5#fFfuj9!KIZ&)7{%Oime8+A6g&q1xR@R!Atrqh?3hW>|1B?q?ER)Lb$RRYA2dyVisk)&c7)??N_It-_7*m^la*lOyPlGoHL9*tON z#_7vH7Bf<yJo;dCHHL37Rn8aNwPg|?<$w+5MY6{&0 zge)|moBMetA{O63vY!C5zp|n11vU>MmaaHv7kL+VS&(Txa7Wwe6vn7(G-B#0-e~F& zR|;EFu72ilH18^3IP*8~^u0Wc$@a65?FI%;&wsAZ?vRgwoBFSAH&At8uPMvQG1a=# zAH*sF$6DeQv76@Zw-X;Fm7G4v;{*N!owVNEefCh?Hioj6bkN>#(TSTSvMO|@G;W;- zX3-LuN8?0Lr?YMsBDo%G;pc0sw?PM#$}uT`Nq1N<63cK`-Z9 z^W;Te4XLstTt*dhul4oy|MgL;^l6l_nOBjVUF!@KFDNt2%M=dq@Y1Bj=C*1s)W2wg zLbg<1G&AREZFuoUd-av;A>U_j$S4AHt!s;~YL8Syp>@Gc-wKogH4Ri9Iwv~;1I5x{ zLE9aD-)8auRg#tdG@T5bV+-WOmd4|4rv@F#{nrdEJF&oCY^`mQEO43v-KYL^a+(gV z)xd1!UquI+WjSG?)q4ycWa*C*a%qjSUsx_+=+z%&Oyy?h&2 z6FlvKiJI;LH-`WG&w$MQ|GVArd;t+0#v8+(W0oeE?Cwk1y>}JDK6Q)qol#qz9JL=@ z-WBlxHfJlo5Ab8VuFl8Cr)dlaf*}PT7cC-f6BBtXkfFkip)7=oqT_;sR%Dte0(tWb zQ7$pkRr6_M7U4({Trsn1jIqbUlwX&(yLtWZMbd{SAif(bILHM-)%{D3lbciE*6uc| z-wsh^tjHjQ>`)k(dxb`<*GjvMN#{xIaVl#F5%0GKT-7kF`@k&n3RA?_*QDx1PZiHD zLC;T$9iMm1#g}rx+iz@M1q&j>YAO#%_`ghr03<;z7r;|h>EvI zIxwrIW%Gj_O<4LTqKJ;lY`)0~>&v0o%N@b;1sZ>^XWzOCS`jRIc%k?X%=68G!YFnOp|9>i>R91WD2W z!FghQCN*_>SB(LQ_fF+Vx4B1zVGiEhXWrouzldMwE;1TSIn&(By938NMD!_U$V39b zlA<}4t#lKM%h+l@!!M03yv9xr1gv!oqye0Y>;`35e%Lnw#B40cff$yhtIU;%sx9Uq zeHsJ;*2$qppoyiaM;#2(7mk<3#}F0GH4PFBE!4sW28vOaJ1#RT-8BBD8J`HC7tW=! 
zYFlk^)ISo>(gyw{I3o>eD9c&;q!oEhkx1~bHyJj4YqWa?OF@%!wW@}W*E`H!;JqZX zSWYP^m22}#5Dk$gt{#BjpM52MA?AIQYH~$OZ{8@OPGs41rSgL9?lnews9~G1nvNnP zJR@6ITLkd{RnhJkwq=CuD~qg3+t~`=RcEW z5IqbH1Q`F+kFxXj-|P30Ke2ExLbyd@v&+}VA($b;w(R~B#mDEtjL_BT>;mtCDBJrJ zP>&x7A0a=hzzndN5M<%u6iH5Jp&i+xVv)1IKUw)Qp^<1Ew9@4{w+#i@&QIbUU|aT7 zEYD~(`Sbt~rq>n-#!%JE+u5^-0!lO7IxrCA@`F}}`LlB6LgHpQ|1bLux<10qzJ10G zzKJ!vcV{J2Uv=e%GUXA>Xvz24h`TMclxV$kE4`gLaN?pjG>_w0fr#eG*^_^Rel15) z%=$UCH*I=1$Kf+dYO7x>#{?!DHQiTE#=iVQ;BzPz1^-@LDHkVw0w?Vf{kM*v`v#B> zAN=n=%(QuPyVhEp4?DHfV|m5{8LP)8`^o1V@fRWnHFfEZ%d8Xh%F~dlH7C~SgSHak)VVoKi)|T=TDC=$};K26ZR${(8R||*=VT^e+ ztw(AbiovE)$P{k%&JnHB%WVT$*i{42jVe#5*>#{J8h}jz1FbQdl zFmg?Utj`$6)J0%&&+ioSYjaNwWG-C=vo;vx9H~bnT6dt2UwlDZxl*Ec-MKNwta!zI z^Rcn7e(u|9W!ETa>Umz~h%BENY55RcL0`V5)nQ66#t$nz$IvX)n_OW z?!OYlE(%S`25=UPd+|=Mx~+U>3!6W@NUL?LY}P>k2Yq9T$k3`5dFaj6 zSAfnW6rg|ko(7$f1zA=DG`}@2r{!VBtB`YG%EI5iepQ>zx|o4qUe?Cype#L&{o=Rn z3Ins8wM*+@>{KEnN1xO59BDX~}- z04S7?3GfW=j_g7w`D*slNpjA!>VcR4Zd#8RV8G$_0&HRbr4w(rz$}n=nmkv<@oD>x zh4iSxU|Kubtk0nxdhnt0iZ&}8PByE!|5OU&tX+!^Wym*UQKX$xH9BtiTcT?~?>>C$ zG&0a}y|}hVWc?cg42#X)SYLbcUhhf|K=%Oi;HGEyjOnSae~{$q{UCT|98)`8B6+;m z5%ZXEjJ@hJHsu{7L7}vk&*>6-87kr-yDDc$9)!yRE?;L%_Z_JTs6??zQ;T-{E~1h2 zbn(r-Q)3Z|b*Zye6&C(rXgEXT<4TbGphwGr9#g;Cwl9dpqY~P}>v@9?6wc_wKICHg%{KgKGvG&3299?a3 z{xJcK2#})9w7>70*e9t*>pW{uWO3^Pj5?iywt<4*r&!CjQCCFjg{MALjVtMGEyY!l zi#@+Iq!k^W>UQXmr7Zce8PQ95QO-B1*6-4mO}vNF*2;&_jyZ$$Jp(}}69d-TQeyAN{NcC{PJ?f77F6KGu@jXbC7 zOnsx@sp{&a8d&$Nkhm!{rp?0dk10wXMa1LwYL`4NdocOl%2%zbL^{!@ZP6}U!tb`Wb(w<*UK9|^|sZLnp+ zH%wjn+Q}cAWIn76@$AT?x9FeoTYR-NtXtejEM74eauD`g+SOOlteEVIbfgrrgAh_n zfi>GM_WCzp4^>!E6vgG1M5UpF6avQ_iT9~*{{V1(+M>cnaT>}ZV@EXf)>o$-bJF-U z-xeLI1n3$z1N3)tYBX(picnRda}YwDjm@rj7koig?EkU%-Cs?nTiX^C6p<<@y>|#U zdKD1qBGOBwi*!N@B@7}U9YlIlq*v)ZQF=#uC-e@1KnN}HJ~L4(A(V2p9wXtCznX>SvCyqbc`owdo zbg@&ds2`J?zKlkYkyh)NaHP&`wl-7MjN55Rr6{-b9d~cdLnBJHZKDOhgr{ROg*!9c zC>A%-t&YV(AQ$!N4YK1V*6;UeIouQY8Sj4dY`OSpcX?S*qFevuDY#67BR_s^nI1_r zP+yzIR&0mb`uX+jYv6{@a7H;`hlbRZ zrhMkhkEdMD1EkcYBr%0(umA2Z_In1@>ve~@Z-o%v`&Z8k-OH05_d{iTm{&TxLd5Jx zP{Z!4jbtL5`mQdXD(V-w<8wq-2@IFKjN_5b(baoE(atc7LaZL5tM(LD-HXZ<3uH78 zR!)W(LXF!=)$D546}s9#HW_<)KgG@EW;fn6!=>1+ZHY`U)YxFkLf3l$0_D?lz)!rjop0 zJ^1kS_)A+d8!5$DO_-q0_V<1I^`E}tgSPbr{0ITNR zW5@h9{8yE8Fv|bd1j-$l;gf{xYqMfFSp%G3)%otVAnSOrvzoQ=au;M}Ic{rVpBQGI z%r0?emVot@@&-?c9)_bbM*|D!fw9sjt5WBm)A?0<0;`I*{0!*OBfylIp9?8YTb^ z*4bdxxkjRdL0Lk16d#nv!Ye3+? 
zu!OuJ^+Ru6PtMI7`VaU4rDquQ!)MUA>%cON!&JuS3{0fNACv7Xp3g^kDLH_>d{jba zpC7+^Alh2j^+kC00s}A9nQ@jcsjE6aqLnS$Nx?knj^`<^6QqM|bskT>`fab>wq~;{ zt;NAowrr=a9kx8{N$PmEb@n*vgWNWeAevko35LTaE3C6jDnMPHSMNR$7fy9)(#R6g z+3zm^o8YMSJ%q%?vTJXPc=yK;0yd@5*)wsYj!6zU0Hz+`XWZaNyaxaAAxT=yC?y7l7=R1 zt$p=v!NOzUhp}`@h$#gt#QsBkr!ei^uY6K3>vHI-kX1{G$hGUfrwWH+9?Iuc2PdY2 zQZ9EQ8IB5F6^1Dz|d6+*opO3=BgqfbMW=KEe`N` z3{mn$UjI%4!tRjzg1`0q)-jpg+k!|RN#D)b|G-(m-6ti$kKTBN(SB+8+rRi1-IlBk z5j^K9Bmn#t>$eu2mzuA|Z3vwbyhMI{55%L{df)TQ07@NE^dh<~rl`CBC8i z`^2|fDtsv2mX*DLEjZXbI70VgdLLxj?0|ZGw-#+zP{IekI2GHm7j!T?$n^n`ywG|l z6wPIoP3Y?eg&Q1Kp48C!^e>ZJ1rWFdSrOmoMNF?%U=$=@Q;P~WbprFJ+;K}*Zn-vU z!(BME^PO*A%}?W(9hBh)z8M%3^C=))`;Xk#WImf&xABVi&-9-12seo^dokX%yqG=< zSL`^Quq~e2FB8ex&#F8*y2SQ@hvB=WjmoYy)Dm%K#>1D;rCe4k(C|1Y-q=m+WInjn zb!F40avhi(HR<9Cnn>^$UCAgm9G!U+`o!+!pi8SL?i-N=c35^)Ji~$9pFrQf#SvAj03$bxW(#|< zBUl#qkt{Fdj8C;JnLgo6Tx=A3DGj?+UMd zWXEVhQ;-)cbq70@W4tAvlA8gz=tu_xAm!h{MTBf$!svHHM?&jTiB@OuHnQiai8Wci zQFqKmnlR=7Lg^6_GNt=G)kwXzLqAok@Q(UlN(H0l8*9UPtJEPx{-tY_4i~Q zbZnE{&IYU2D>Orl{r4uIRE9}vBHI_-LOk(6HkqU9nPU`#Aq%G;i|q!cK6mY3%pYzJ zP1-aTIBUhAyGzbMK1|7kt^i@GI$FKjyKpDun-02~_{6v%9=j%s6DtI}U8JWEtwMKCkKCl=k@8LqBg6{gn1k}H zv{OE#{-EmVh9@en=TL*jrv2@Fa6!)Nj{|I@J`TWY0V?;G4V)^zn9$Hc%47#^`10sE zE}u4E^x3kj;CRDCv*z-3uK#4x@Cbk+?q)dPpZ?U34f>FgulCF95uGceGhUIVebcFK z6Fd^T*dT5puGj+E9R2pf9lw#m-JubQ&!(KnHKsVwh{OkAxR2F(#pC{7sp4bLA&gEY z5wcoly?l-D!PS>uah^LMDdRiaxLqWw4;km#Q&eler+`J{qbM?{%nmGj;`NyhfFnQi z1RVcPIFq*x-vHs$Ko6gV&YX8XhS<8+2Ncy59a_d)#J;(jno0L^A=?W`H3#6oUF#eRKaP8cwG(o!dn)N{NlLy*9GApMr zfH3WrxVM-UT;=kav)4*0#wb8X~6AT?UlRWHN9{)zH?6q?~g3d=; z$x5Qikxa{2=)3`PnuX3uckhi%k#t!d%-~%-#y&p%^viX?oJH0Qfb=9ej0w&Ye<8N0 zuueEdNjz8aem!H~u%>y~;CZJ!4fL?6nLSy^r-Ml{)TH4X z))mg%A**`~|L+0N%9j(VFiC$hrj${z6qt3tps25_k;2xox znTYzQG#gJ23piX(y40p}7}5{?(boFFLM-R;bpoVm!cf<2(?bT+p3npk!{iC1H}Gz+ zR5!R^6t{HH3bNJo6my?#sA!v{S!Ec(==bBa=+|1UF7#kDvISL?KzO@FS_50z7mLM4 zHeDJz;afh=zUq2xzprXNs?rylw(0HDSdt2C%Ap?~h!F+nd#$w5wyH~>#1D!W9}DiC zL(=Fme2R>4>0TJhi=&9 zdF$q97a+RirthRkG`@@wh%!E^mJPN0N`ltvo zTR)^weeQTNx00L zalXp1JT&V0gYjDJV>OAdic?+gADM>w0j7eY5k%#ZSB&*R_`E#dMXj>O61UUY=Ffg%wKlMsVQ_ z>o$=*Fq)ZBKFSy`de)1I;s)LgnRVujC!{3b83{rLGhd)Sz9_A?LCv_1%t$$mE3PCb zFDb*5r{ANXX7`L^-!r|a|!%k1vw2TQo#~t-n5`R z#J#3>B5$qT>)SAT-(;UUrHGU?(j@pi0ZEJV^AEzGS%LDEkhXGWp3^yb|IBJFi<;A+ z;(nX_<&as*iTFi-5MR^LSH&U6`-(j%#)gi%t#R#Q_DDR7^{j5^LqY(#LakzQS~wdfhbom2svfyE!!)?(XR8r{!-JDWNUF zZMUXCk;7uDdNBga^wj+XvK0l^T6N_NHtwBJUd=1QTgg)$S{DE{nDmB5WIrtwr>ORB zvRa4~-@*R&T)2?VT%p06P}as9-E3wHM%D#F>sD#ivVS~!=mnj zj5+>M)TEc92B-ch>KBYRePedT^6)}`G@cSs98HPu)R-+tz$I*p#EkqmW>24Y9z1{y z^KqHz4tBB%sO%5bO<04%&BQ76(|)mbM;LnOmA+vA)C^DOj!_e6`>K&bgDRY5Ga9dQ z)vWcBmg0B-ry&^0{WSK`l!WytqAtbVi4^xMINh;n@0u!(y?U1ShMM?*NQuImPbZ@* zbhhauC&yoEj%Pv*Xhh*h3*BD~wrsL}m+hw9JA1t1sr{ZQ2@VPTEr32bOc4OWI>n=R86^$*K=sSt_sK##d8;R9gMrmm4 zCL|nnxEuq$R9P~ONsmm&zkCiCp%JpH+${Cf{w%PA74YFu;0jf;IVlY4guV^)vODv@ zHu`qlEr8q3RB0OPRcplYCUNAXd4_Z;-Q~!*53?Rxr_iu+6b3HeIaq__@ams^ z^1s3IKO4%W)yxt^aUQnGr?P1jMv*XSsAP;65#4H$PP3h=X&T)bD#zFO64Sgv-Ba@Z z$zBX+=Fs_=ph)O0qb`44!-t|QXrGAHM3T+M12Ok)(jz%Ri~b0mS6xI5-BP%N^28qO z5dw{Tsh{Y1Q)8AG+p0>j+9tM%VxS1p+o(QtaDUPtMJX9KZzB@2^fcps3$r&K&s8V4 zy?bmUUmXZk&x+%+F*jB(B$TO1RPS-uuOyeV3aBL@XOtI*YFZ^fNa-ELYLy)YWDPdz z_`1Fg(%5vFmMi7H1NXXsN!<84ad>4vBhCSdLR`qOMcSshuN(xw`syvR3X}U{g8Pws z>)JIfsv%+siH5kx8!79d{5*)**=bWhNbr7xwVuRF7zF8gYLxf7GA%|e@&lV%bG9Zl zh%F@gJWpgO(>;5Pa3xBK(+~T7CxJ0fd8owf zXrEjRL~8!st^L)<4YEjQ2;XNRYvq?@4N;`1uVnJO?+q%h?g`$2!*KNEnTmxw!Ouh` z#7#1X3t`{twCXSovT5Klm751BKDx&EEWl-=3;FxZF zOy=whcaONnN}y$A2I=?}{m5)(np1CFZoXVyVrM#LN-|_~B#&vaeR}otYCQ@b|E#qb z5-}wIWn0**Et7)z7WRq5h@PqdK{w 
zAw|;Gd=&vjqWAT0=bvqSrjx|=Kn78LOntSS)UE8%A60(yPk*q>Dfh64T(&?rdg>jy z^a!_5$7cng@Y!fJ&I#Rea!|fw5R_^~&1ga_EdLcH&i zxyU*z>tItlu%6K!YNcY19Q>XZM*DCa%5lWz@J{dQc!O&YL@knbNABUQ>uGjum4j!C zUpw2Vjz(j z4t1#uUUglVMfYcZE-GUW_{Yle^X+Qn)QT>Hw?{~WopnnVU{mBOUi`jJ@R%%pbK_I= zzVkVBb#m3e=V`KeUy_dCi*7-q)U!_?<)BjHU=Gmx*&EiXs*|`=WYoJ#>iJJBKxFD` zmX%X6ee{ubzq_wRhSUMJ?gZB#QK|xI#NPbS$ERwjS-0W+`u%Mzp8-0|5H`WaO*m2M z6*=4}QV>`l^MmEuGmFi@<+}s#Sv7O|Kbf$!d1m%`zB~DhpXJ<&?h7E6I@2xY^NoRQ z-4ofa(6n<_h!%b^RAHgeU4f4Kyad?YAYga2FSMhIl6*8O>>mrJh@(_HuoBQsRnHpiIZ!&Z_s^*h@rgKeR`s(w9h%$$3g29Jwo zv~9eT+y~3WfOs_uZRGvUQ$fO@TYn9)AXY>GJRrBQTNrb9aC8ng zix%Qu&jfwd8k?VcS3)+1 zAS{&g5bZPht$Nvi|9DMOcpg=+oFk}q{8fUO??hk$Gh(j%ps+Ec*)N{|Cn~jD=WTn?dbi^K#_l)?Wsxt>NV)Wqcu0{nX+KWExENB>@d78x^vT{3h9}$lXp5{>W24G5 zl~q6s%5piUe|c`UH^S!Xu=N{n)lOO1+I;-!AkUCZ<|g!!+JB^Z|2`u>sYlmXL;Mkv z!KPFUWUZ_rGm|8cLiut7^R+0kR6`me8;8>^7d3N|U?tJ@BwLT^j8>3h>SS@YxoC#d zSk+T57%Sp<#lT?m*8x)=q6?s~RN=>m242wyQVi_$7~BGTh(=qAr`62a1{K(w%hleV zer*W)bLerFM`_dDuwNMs4KEYJJ|a`{LFd2wy2(&XmM>)*?^w9 zmdDsrFbf7f<_OH&rb6fgQyJ@h*GCQUG22O2R7^~uW@DTdu0M9i+@Co7+usKdG z8^MD!cGq_^ti%y?JGM2_E9(7VR@@13A37!1RkU92p|@U%L8YAJ*_xTn^=F{&cR$_r zx;~KKbEr|XRPpJky$K%Mg%)LMhv!$Z7mEr8o4e985qOedltzn?wmw+3jbFDksybuZ zlwy_cr9eJrvV=t6T**-vU045jyCOJLJ1G15el7x4gBeXD3A+V+d8slM%k!VyqWEOo~Skc z_OE$ae5L3nolbPDO=hF~EBV-5R6L$WiYDp~GA0D?iW&lUm~7YOx^<+89vMRl|GKIk zMZvCDX50F1{a0;u48PLS51Zs0FpJn9ywDg*Iy*8-%apyI)cV8R?gz}l$4xF#u%BvhILKs&C(~BjR7wCax$U0>erb|MAwRZGy&@e zj4e_%aW299=JQ1kaqdWi1lR?RSZ(08k=Dz!G$A83@p-(}>Ce`Wm*J;I`-ton5wZo* zL76uXuey?1W}8q_uY7fVU?t%`X;yyYL;cALR&U!s9&sojSDH7FpBCW$VZyqR@737r z`*BNz$!g0qft+oq^;Cbcl)7N_GaE@oUzB14{w#DZ)hE+u>UH_<_{ZOfZZTTAUVR|8 zTReql?ocvyv!h8~Ii(wWcL&6mEaVokSHC{w5EXt4&T{rv2uRRR4tL&-(Q(XQFdNG` zt=oijYeI6w6s1nL2)Uqk7OT3Ea(Po5@Js=%W0r$5+b_jqHsj1$s@kg$$o?B9{^377 zfK+OnhRQWguJqZn4{{HKO{)c4Xs^rc9{O!Za4E)0e0==&Hq-0(t$Sc!oT_B?6J;ts zV48Ze$e1R%0QdTbGuQN%4Jbjn!9m7B!!|$ev(;HzJt|6@&%W20Xc_+)Z{hx=j{<}1 z=a`#gT{AD6HED(vmq>$8V2UKIoZ4y)5M13;i{b0o%(-mFt4=`2B#xPbQg~qhMC$z5 zuLyUGjGsJFz)dd1QUra)FW_vKPs80L+tB)(Wxj@OoazYFh({v}>Vv$RXFSCg=}4b# zFGG^_-Wy&@kYB=EKJiQCW@%<<{Dn2H4(z=q;jS2(t>?4307V!eG#Voc#2t&$mNh5( zS-pluQ|U79XVRvvvi-V$aS+qb`yP9-bTFi@taz=wDj>utU0%#*s4mNSG|?(EGeE?U zWG$B?mjQTyN)UBS0Up{yOt`&vhEq4I7rVr6uI zY7RX~Am{T*j7^;R#G2p!QEwg+FM{c4`(k%d{aW>vNHwV8EEJ2ArD8r)-;j8TH)LDE zO?mRYq=<-r{Or9$mM@2v`fGh#ZvC>i#!A$Nvh+XjEED`&LOCUumk{IB{U<(BxU=$u z4M==rshm`#$9P@Qjxw1h=FW#Hp(2&N$_R*wpeqwyz&|>ZA3#1=8XBMz2K>DJFZKX{ zw20JQWMZ&KB4@ag=zJ}ZL4UrAD}R@!U8oNVReXtXc0)}X&y8t-&#=m&`;4EyNHjVP zjearLE0*^`WPUWI;N_S-PJ#4&3Q+IwdNyrw^|@H-N-WV}oe@@=NQtl3iehO4u=Fxp z(_iGo4@6&C>e5g$CX%%>{%jgf)WJ6JW2gnOPfVsteU#V7atuNaD~;72-)*Xr`4*Q3 zO|))|IMwgpm}N`wc#ZXY696(hvOSCeLWG#y!!Adk+DF$OPhY}BaP+G_S><~D-sfmw z2SIj9`&GfmVKfrF-B6qyX}9|Jm^MATb$-skq5ut%W;1zPxT9JSQ%LlE{qA4Jqc>nN zI>q`0>0SRm$2aiu^hi^ZZ=qCud(&^fpwWaTRkcc(o6#GPS17+n&$=)6)`_PuJtaLW z&|%^uL>o%JQ@}>^IV4i`!A9L}r1vJB9>N3?LTn4lxW5jbfoq?eCU%T2cji!a(7DeP zq?avfYzW%FAk%DgTyy`TSfo6zYIGtIM*a_`JQo>pC3sK)NgGqVk!R@-9ZnSjuTI`>t2M^ZU;c)gWeMVs`-_4YpF3C&3U=GECl31lXcTnv zSG`%6Xy?i@yIz_IF<*K+rc+#(9V8G#bnPC@)fLj|Nj)r)upf}OwspHFaxvM^c`qPM zPiB8F)|JK%QVWJ*Dy^tpAH7QzROKb6e)P;zV()VozY9TGy;1T?RaGUG9ZyjFagS-6BckS3Q$JeBl?TMUq?VR%t`(VooF(seBis#x>G3 z9tY#Wk9^u_YfGJPB%Z8!tV&8{++Qg!y?WIypUn1Io)IEXROQyU;q8WB)-|*mhwH4@MfxlFE56t#>K_F5^d|Ic5W~y^Z(xTx=`c!?2fL7h)6rh?fbcvL z5lkQBXC(?AX5Wib5$C48_HI(j`8{C6c4uI?fo+(XCmjdPQ3Rhx;rGwnTF?5b2pC9w z9U6|S#1ZC+KiaKcKkTG1;;Fp^Hma1-1n7GhPi$Ianaf1C1_Kj7WodAK2J!52oiP+N z?8-JcFDJ=u(7*WJb+5a7Io`v$^s-OY%PW5Do&J#cp?6D=-O$ncp=(0UGut12l&7uQ9z72grnYaM-E$~CcOf!G 
zJZn~Y3Nvdha$6fWi!AjuqY<>fr>oW>$6s+Wc7l1=i9GjAnt}1d&e@(g59T>DksHhXodHy7 zs0N0!7bvOSKuAx=e9#p^0$V-A^utslRW!iO#4x~)_IADW;G};g$^VU>E}bdabAZAS z_yG_Y-hU7nVWrV5H@o?WHbq1NM{e@yW#snwItN0)Fh<#wrbp-?h2P}%7~r>AuU+ZU zL(NpG-KKEFS-WonOH!F7qVd~4bM<`dPnKxyv_lUkK05Pr>MI2=)htHD_>|Z?^1k}P zdUhO7fl=<;54v@2_c~BDRhlc#(5!}UL^1u-XZ$FQhxdX4-(UVu@7{9%v5z$zj{0r5 za7ykA(-Oz7>ha*dx*m)^(&OAb>_sOc4L<2@fEsUmzBgofxTDRx?abD;75ZZ?F9e) zbv!P=j@jS8&aG?axCtw8$2U!@j-CniP6q^TMVv^kzZ?E(o!Ev-YVVN%Qju-sPT{_jbilSl9`kA%#4Qf}th-EplONr6qN-QCJVGuY zC1JJlEdr_6n8MgtYE7WcdZ7A@yBBeRbI3#H#x_*Br;E1k3dp<}{>U`gF-bAU7O9fP zgpKt4wMo`b`VSeyZv-FYEiEEiZfnvp$`~u_Sk#+RE zD{~YLz?b(C4HF}*;687>;sp0;luxVN5q@RwFm>M8rd=^HAGE*oprY?>qr=KgL8TH& z_f8$KFZl^Qdyf=BLxb&CL+^nbTsO`E-v0*cUcz3?tG@mJv5fUEXe)Z@!5*^w32n(j ze!9Hd&koO2T)c@M8&Zpv!mVE{b$pBX#KQM@h%wOjWZeq7U*vB769VceCj^ruuxF9@ zoqhZfK$XG~Z|qsBd9W?!Hq@T&f0M<8RCqO|Jadp}tQ$z(IWEqzC1qY%o3hQ3v8LHH ztD#uLj{f>=Wn&5!3)zOv0Tvv4p{qZ5QTZg)_jnmu1zl$Y@yAa<(&tg^{jgS#+lRgv zI@0+wx*BSd$L2PgEgH^qbIItCN|8A%@`AkodII4;8N~St_F4wfNg_ zj;SweQ0b9>QgUy?{xQA(o2VJVm*#V{AI<5u_ebp`?4Ol|t_~ZURoyFh(i!}T)JnXLiMF;P*T>IMz{L_<;vCBg}_x`^_{a;k$ z{|@zk5svg9hVg%v`cF2-_@^oUe`BdYFrWpdpj*7RCo74g-iS39miM`TUU2A61GZfMa|`+B?E=c)2<%+ zJ2bFq**tNHcpUu1X7FX;-CE5EadZc7H3EAsmF1nKuh+LPr;T^mSSO<EzO36Vh)s}(hK)GEn~ zcMZ|2#;mJ@@=qaqO_sZo4jgFr%{m{$_F% z)l#F7dnc&^tqIHCAH;dE(tCnw2$pUzzqP23e5EHD3@a z>cl5tt?anZq2YZnpwN9TL36`0$MqQ4K5>b5%GEJzJ~>7@n9VS+-$~~9%)T~xii_iZ zj)`Otpgnl?xl`_2ar`r&IuCYcnp-a9h1c|PKlDumAWzU2&rx$iIDDN`hHQV>b!6(sI9|eV>R8E z2>>H%4Of1b7a$VZx6VXQ`t(~H04aUEeyJy%pZ?U7TP?=MRWJGBYu7kd1FNe_JK9nK zlmR976D&$ninOO&T0)#nlshs8=cuLgH8b2_h0biU5-jLNs*)vC9ek>uh3O@l!D%MiAb zaxs*1Ng!=a8DHH)Ci+>pMjM4lR6(F~^k)V}j&!)0kf;TB=x( zc?!BiI8T2$@TK|>{z;kwWQyC*y|%7<=IG2)E^?j{{NGXvydZ=Ikcmyv{$+K%Px|Iz zF~gp3&v38mEuZ4gS*02A`k-p2L@ZgUfo$&EeC*7y6Q6VYadOv=*z(w^PKmDh>9PE= zN!@WEw=24};75sv-?bYlWru5KdM|a)K0&;NpQAUcLr(aVC_v2pl9(EZfxm(Nt->3$ z5ub)VSU_g?0;D@K`OioN!V8FdyD=XuAfz6rb9Y5fa=&enm#>QRUB@SO2R(X@ca6m3 z-Hr>FkE#<#YuUYmEZNoBKBrgFoZH^Rk6I)syCUWGzI*L_T2ZB$_=P(>ML7>x>$fE7xba)qv!>d+gj109V2djoU@7rm+<0@H?|FasRxWos;b0UzN>&SphJF*xfrmKV<+A z=Bn8rl6+rcwv>a%u4CdE`{@B7z{kyeL-h;#OA-4*vFr6q*wZQpeJ)?zruCuC5=X{S z1puz-ji$P}a#DEvd%&!_ov<2Z7*1OQL3lQ18m)IL*cA9Cq|cd19_0}#hR>tPPMk1b zq3fx-l)iYgzTGpNvKiwH9LAj?R}3jC-#Hb0=GAy@T-v4xNl!^<8O;WCOHY~3n|-JB zC5$zMAyruaiyje^2CX63<0;Rz#IT~Wm8QpXjL_eS0-!PLPga{nBjrl&E%REZZDr@f z2`f<%p8Y|Vc5&8U2LUPX_!#?BCFJ=LTVxz`%rl;|O#L9eIA=9F@#B`)T$**6q80n^ zCaMYwk>6V-gJg~~Z?R}rgr|F2ew~J`O7uyDEjj!lu0eOr>)XliW|&V88G4XER!!Fx zDOP!p{?Ir*JapfXOYiiV9K|Xf9;BGn_p^gVoi?LJ3dEI{BaPyBMPhv#*nP{kNs^^l z^2yeI0za1IXg^*3lZ2SD(`|sK_j?~L9wz)ELUpiA-%%zqg{Eo=G84U`u6%ny2hg#~w$6ZmM%P>Y2&WF{>fvekryaB{e-@PrUQ{Lp z^=JwbRG z8*~xpx+&DTql3L&;k-zh;LPu;`cPzbtDM*OJx5o8mvMCkM5ucV>7Bf-;K#c&mXrL% ze$-g32cr2RRopx5wgRA^j3l=Q|FKxZ*+|i*IN9{=^d9)@eO=HE35+ zJ-qQy?_mvvF3gw_50#N4JjEIZM||HH2Y7yS)Fyt}49x7CihKdMW;#FTqr(h*Xh#8d zehjvJ_>2zhbewgK;RThkV_MNR9KBqrR_{AT*O+B?8mA@+N%z*?p%oq$xl5z?iL=i> z^|mK|#f_t)FW|Ge=1B9goiTHOc!XD9R*^>HsLFlNme~3P;(<+=Tkt~76r{Y+iJ@I@ z6;!u&jY~8C!<&XZhi%1$Oy* zb~L1Zc%MfMVtzo>8ba|&8C3fM1~64#;uUtM{Kp%c4rd`z&TydL?2%8Ge_w6p#|7aP zbu}jPk8e~0C98gE-Pl%Lpb@l-Et+yOp9*_~=5YuwD1BZE?RJxacRkX(RuZidr`YKA z(gYYn0lMt-tle$eU#>BWjE<$g(l zoq#P+v$!g~EfC^lvX-7A*o-%GH9df{YU3xL>XeA$?G%rU;asP0hNif&nw z%p)L6_>8sCXC|S#L&9_(?v5BUD&Czj}5SErdK zyKprnuXoX#s|cWV+!;`xtsCa63m43cH)x)`vru;#sl1BwQr?VHr6po^rJpR1VV7cO zQIN`1v!FGVmN5Ud{R)@O{XW19on$Kzy+BVq^e$e2uC#N6)>nB4eUDl$@@@|as{3^z^ zURSn#WHXszHiy-QZZdUnQ~;6Ay|D7gy%2}PZ06HxtbX^)>~S3#Hi;8tRPib{^E+Ey z+drr~1KIp6cHBd!CGQ-#C=Np_g3ah_bkrm;6!rVDw+b~9CvEtp=Vjx}$=cqB?_ 
zcyyw~PA~^s-=cQ*`w>wJ**!1xIz~{DWC}E{tJ@M;s+*Q+cDE;&{qd2X)r?N{X<8(| zm$xO9$~5Y`HMV#$IJJ;+MLuzA!1Xbqd&)yK2hH7kKFLX_a(!v{6Pc}<8#HyJAuf2T zioxUebzv)>-N@#*~sFkT|IPE1mkV%k@k3B z~m94&lv|Qg2nw z@R&L29rezSnI0_&eNw%VJLY{8(t6dMI6fdWF4wl~1JL2kalf7C!6TQYFY3Jr4s*{&k98rZ54X;2c)fR)`t2a@ z_ULPT;)a=g&5zlk5DSq0Gugz>ERB0D&C8XMdrPUZO8`>@YiUM>E{aICn z6%1m;XOMO;ogRL%vErMQE$>(Ek+JQqQ_a*0HCSFb`AHqjNc^3LjQYwYoGEhMibaUx z+eMCU*B3OuVCvwx^Njjw;bv{ka_mHNq*`P^%3+s*l+~vn@y2@G-Np4l)8_W{r@oXY zyWTbzZdT~?f-D&q;08kKsXw#jW>CTQZW;Q z%1wTnqST>mRwH+Z+*bkl^v>$vFaCGlA1IH z9S+g^=tVNs#|75SVYA5o(#ns|Bkq%pt9|w1epS{GzE9`l^@1w=?QRnVVJ2Hi5$%yt z&*?991aRAEzXJ8Jjh)KKxT)5@>Z-})KI@5iBmXx?*mgx4|Bh&+d?V0HY031I?)x?7 z5|kL@h>g(Q7fu~?&)wj$UEWROq}K>}P4JeR+FDikZoOH&B`7E3#hm(^_fKXU6#Tly zPGRS(!^v0c_D;o{i`g}8yo$q`IX@&x8)3}11S9!Uqzu0F-eWbHT&g+2)toZLNDBln zYz^OdL&=eyF!W|WhD?{|@+ODq0nr}KZWLhgS9!vRAG?o|M2v<$X6T;WSKMcqZPt(qg~u zkiGPC%vF4o&8$Ru+Wnk=70zdnc)UR)#mHgr*KvzRQ=^$cPOXiYnC&R!Xrab$n_6FS zUsHY_=_4HQdT4)O3h%%vJgFj-flmWEH!^iu4MJ08LJ3CpyRh~a1xog zQ^h;BNF9?G){Bav8k5!-Wj(#8D+zD}Pkz?$r@o1p$w5l%Rg4?qH%Uy;8Dx3!myr9h z{tj=r{|0Z0Wd?V)Z6w?oGddD}d2EN^fF6?@ma7ddw5&{ax}VJF=H>@$Wo^{?*h{P| zHc&3Wt)8kx<`Qw->;vm=?c6jzNV9SO<9&eRR5~U<wKMOWeNG#zpke38nPF=h1#uV58q8-2yZxFYe5~&0S#qJnQDI;fRLj_^3 zcaaQ)WDW*|Fyjmm=Ug4bRX>|wXj?05RcJ7BUu*J90AB2K;@(@Cu->pu0G;eq+vz8k zG5Q~HBGu#c8W$s^Bm*j%pCST68-Cq2k-1H%X-xK^*7{Hd0ZyOJ8sd!DbkCMT@RGeqMvByAOuvcdS0MCvR08W=7v?DHz~PMP4`?T3u$ucIb#+L^6`ppp!+t(rnw}2`ta*Y8GY3u$zTKBmi z*vXETC@n1JmzF>f zP`f&igPJlqw(RDza{2-m8)#yT)KLXU5^4O1m?kFs$$ubG`Z0&)bXkwT#3HBvOzXDN zy)jYKyT3EW_MPU_tL51I{u~~36pZFtt@4Zv_@)mlGP4T$U+qfwL&WqD6N3-4vm4IQ z0-%lwJk_F8A+$lI7a*wWeT*f$7J5^%8!5dG+S?9Q69GgPUfw z*Pdt-Apo7kSDz-h_T#Y>lph%=J;?pT}~a zYOztyKh&ve$G%z6WbP~pe0F8h`5iaLfnpOLZtGNp7k=2crD>M8$*LmbG}~+j@1evf z8Vj|F^ugWvFfyxs|GY`#&`SQ`PFA7!npb5M9mP2sF~kHFujNx6%(|qOcwJ_AM=eJq zLY@Xy|Eg7=P(Hsf=)YJ1LPzha?20TpJAMJah4m*n?QVSAntQzE{d3m~scnfNaZ+a{ zkun5{NOO$PG}5XxE>rUo)8nj(<(*T*%Ej0v%HWt%m7Cj+>}9~hwa?>GvmyOkc(AKD zL5tTJ19vE`{Khqu$Z9&DmK4e9i{K}}!~s!^<5IMp>1qYNukF_~+Jm3)GLJI`+aAtC zY=f0jQLW&77f$PM`KlOiYbMQk&WYy>PBrnDjTqkVhH&Duo&ABrELAGLhQrG2a7mKY z;>$Em@SbZw2G^^bu|DlUT9`E4ezq<-UoscHE86GGUaHA43slP+5QC?CT+eNGlhT z(`)J^Z->!zC+`C4ya&d@C3hspzA%P;Ws0g|mJ!mBDMjB0q+OOjaB;sE+3PqZ>4rTc z<5frqk}HLw_9;8Flo=Sli8f}qt$~8x02S0H}lIGeI3Z)zjh)d6((>6aKIPq z2)90L5_&9KlW0B;Z55vW`Drx#>L4Rto1DA&br^}4LGVY;iFjI%pC|} zx*?TW{J%Egiq+cRxr**y&J~Cj#X6vGS|sSH4G=sQY<0b8%;yW=4il;;f4X<^OO5c0a>y{U$4D%M5_1uvAh# z)Zn{byu8na>Aemm6^gz4WJ!SAe537*PXo+AplVaBbWR?RWEU9=yXIF?KS(4g@cZBs zos_p((YA)u-ZQ7>b+H5Xz1*HsyBS5O#Sbct{Jt5iG?_L`97toe@p zozOE;ICbP!kgUB&)G=vMt;M+Y?0=YG9CNF_y_=scXZYqb-Y#Hj?h z{IU2blfLBE0{JhGa$eS5|KS`QOuYH2A`x*^!?rMC273VOV-}e1 zuC+Y#qrsu>+rha7!i{H8|fOlBuL-1?gGF(ghJ>Y4AY0az4A z5{eW%UD5Mlcof8VN}(1#{7>7EozxR;*p}MIHaEyT<(2J-FV5>wTPH1Z+rB498F(PP zv-<7?Z)?!t(LKo#%@k|ty^(xz2)9IU&u5$Uq3|w#Z&r{*Hk+VrQtFJ9aC5>nA?w&K>?LLbQ;nk^HQ=NSkuyzF5Wx9O2d){g2BxecIoz!BRKpk`T)l zTHi4*^uu9UJONLE41W!L$-EDck>G8DHP@u4oK?nQ9^QrskJIUOf#h~x#PBj!t6o~yo-6Fti$b(y4k zaW}b=FzSVR{li^X^aVQ7kLT^pvsuG>u4nP#`+6ttKwqx4HGHZH7`G>_ez>V!M4ac1 znLf{W-PFzEpyt~jbFe6pDxp~>k47N4DXaB_**SMNR-30wQwDNnRovkx2PK7@Lot+1Km+wh`*L}KAqHw>6mPaLe zOEV^`v|2|ariXzt{JfO}Fhai|6QoZweABHUYhVJ>dayf}`U18N7(z(7Js_b{$t9Y= z9b*}zas}Idw!0rW`*PC_*qkA&r-Vrqq}_5Xc!L>nogT?nF?!TY>Tj>+pN8NK0X9mK z$~`}eD@*j11*{0cq%0pe$JlN5Acu4srM9?+&k@}@tGEq0^m35vHT>KI=Mn^_kPrHo zzyfKQ%INlTj?nl7gjt|*ndgwBQ4hVttV-3Z%?hoEwfuC4%Ukor`|!>SQA(dkcBOZS zxn%^cWcQxvYnN57v+fon&u1BrK9JOPO$U6W(U^OBZ;L#*RUEPnB3+`*8SYZ#S{ex| z_f+4(%MGpO(7Bmerk^HBeAPMu+KKYU52iC7#UHK`f%94Ali`blP>a0{O@826B9m5V 
zIh0Mi7NBYAuf3^C?e+qJ>=??MamaZHw#0WLaNyVa#34nmWVVH6f2}tA(75IZ6y6kTYW;0Xd8KW>qP|M?l*IA%{gF!-h4x9 zx=p*CHlqsV!TnkO$g zI``Wb^oBXl9MJ`jS#=Edk_an9?_=It+{T4$1qtQ;eJh6lAI+>JY5b%Zi~b4}TVWR$!n-+b@(`22E!Z?L#3 z`fdgtjf6)rX?e#k>Dx8)U9H?W5GgR_*{9w+gzMXeEv~v8Kmx^FDt^G_tulg|9n__jH}z9?(*F}|eB>6HJ@|My&X8r;+O?ta z0MESgw|VXJzQv;ZV|R01FX9t)odf_LvVo?uC1*=tCLpq5aZ}NjBc%gF1Md)*G4Ir| z!UzwLwCfHJ;%Xd6iUTx5;VOvKGbrw+mbT@;&=+?eQl=4!4?Qq2X5pJ5v6%-|rR93q zpYkG1Q>@38f+@!j^E?op`E3MEF_EOOy63yG4?%O4h7-AJD(8i@aTt?j)Qm}OnX({b z9*mP9AP8siHi^$SA-g94J2>hRniH!GRi@P)C2GmX z>+iaf2NQku{9x2$s?-)SU%EN<_S4dR@b&U16Y1E3J&4~WoHHyfj7GhwxtN@i8)(_% zjbi!8{%t(H{g1|zWfs2qv$6`$imcPNodLSP&XA#JMYG4OhnDEL6KKXPO6}jpq#A{z zxr0!ej*3nc@Qhj2ptKlws=ib3mU+oyXKXC7jIfN|BgX)7Bntelgc3z4CvU+>!BIO) zXO3O!&W@t2?@&XA_z;4jtSL(-8lAHhj2x0eGk<#nE>PN1cmDI8nYU!9@2h3|4sTj# zq)(rw!JiAZmrZ+H-g@2^ zf>~U7V2d4IczQe7p^j$_fskg4tB?EVfeXryos7`9HTmm!u%8_ zOv1T)8qfsnND}!vxi5mdYWj$_><5ZcxDrfLl}Fj}FeVw|lTkk#Eef9V;Uw9LQRe3T ztln}sccb>6YkbF<+CeUx3gYNBY=?{IYwoJ%!<e7dlr&^B0@G2G7jpuJ9pFQaYGyX&17L8BdPObu5 z*0aqza~dRxN)o!lfms`o?T}gme$$C;2e_>_+kwW8zh0XwjrT8o@NFj@!g|s~<=DE^ ziaNQI*C^x=^4&tWT!{s`f9pv0zFkL1Tchy0b&5IATi{OU5x5HEyBagynxo57kaZw3 zPckf}3I5YG#bVUDy2DD7w6y8!?f&q7k*UW0C9Zizd65jhLEKS9PFIMh`s|mV2Za`h z{!I)q^_tnLfUeTO&2n4X`48bpDu>p9{;t)u`?VY!9(Hr!>VC3%?iDU-l4?;$mz0(N zb;~lq3o@+CfImm&-JcB72gy$UH20FNGoqkcAcH6$PBw+;g10GZksC8EPd}VZ${j>M zq$0#=>2!|os8Nm?3MiRjxDX^mF*hrR)l{N75}doF2}zBk5)WDP&>5{g`f=pMkC~6nuh>{KBC*^sk1uPnU#<#+c;5>n0-M-U~0p=lbZVN|S z2&~6Bc5(`-m$F<-fiFkD9l4WlR?0?P*5C-xXK+GKAtr#-(71{1`e(6E546mX2#b&{ zHrvr5cOucwQ1f*IH?g>8a(vmVep2UcQvjzQ<} zx;xixD1|)uNx8sm$yp=qyDTW5btNLxY1b_?nj1N(zuA0t0tvA;qFl`tOw~S5>Pwrw z*q+Y~%hQSF3>pSn75z^Nt$q!V$3{vh{_#Y{&~PJR?$svaa=Es`=7B5|+9We7O5?U& zEGTsQj7y5a@!MfdMdwsLhF?))kM4S)G$zG7I){y-&cO~$!pV4%zRfL#2GHyPa;5ZH zmZ4_3H1?0bo9K(zv6XC+XfIh?Hq|!&S#X&t+*)}~y0S4P0c=VsPyvM%+}zZU3joKl zb$cMdiMa-SpO%7oT({#y+|sT~*h>T~-2u5$wH#(fQL?Gk053#rvVFF>V7OccN#~A2 z*5alvm!|zufCm@nfyzc!y-8e)WD+XV(nasraJA#CkYa+no?c3vO_=N!H@>U8bj}t8 zCfNwDBKb$5yV|4bRp%u3qk(I;rb2on4hrOF)wmBMZaT5uCrdlGS+Q^Mb3OzQ!&1{Z zOFDEgoQH){K*Kn_a@bVd-tH$s1s6#H?t+tdPWaK|^yve1(wf;~*Rc;=R>>KHGOoi2 z?@87fZ#)C-IPn-$eh)}dA$s3jTQp=T67Vw2i2b%#eY2py-~;5@xr>=Ie-sUz+{M7E z5uVNmhI|f&-%3j-B=;K{28)OJkEpl0&I*keYB?dSxXPE=s6R(+)xf*6^iqk*iRXG{ zsP$A(?b(%VKr#lQ-?^0fvM=oAQO0}#HrqZ`;1w7>=9`pl(;)afQvG?w%#0q`#==g& zrXi6BJt@|m{zI|u@TU6-Ca^CvWSzP9q|oPeX{9AK`WO9|U<{CgFuUjE+Rps*5X4Go z0B>iUP2IBF;)Py3)9Ub-T+9 z3~vAK8VRJ-L^w~TEKN`lu4-w9we?JCLcDDgx75ji>G0WF4hoV8yy?-1o`-2?ZpoKY zv*9N{=T4;;TDBj}Hl?f7-N+E@S0a#=rUr4j>~(=S0RTAL19((}2O0)3Vaav#wmZ8` zp!~sT6=0$k#@SLid79$&>AzX*Y9RB0jb#Q|vSLeG84p?3fneb3>lO=U4pn29tN-u z<#;>3oHh@;b1$WVz?0mkFd*v5_{{eD%`=arhhy+1Y>y)kh>fFE` zT{NubcjWgOWrPj;1U&_#z^NFK|P7i&wPV*O7lIal8fLhO_-emyo ziNH1fMyEq68GY?ceHydLXY#o^yF9vqtM$F6Zf5)K*X06H^|x^wOe1)P1*Sk}rB;pD zc@Hni(WH#S2ax-MN7vlmNB)y_DgJxbO$;$jyxRJe^#DK=mD!=pL0?@~P+8sg#C%8y z^<-fZ*KyQo++JC#)CTh296mjpa6c)LUfwl|LCp$ayLi1F>y2tm*%0`b2Q|B(D#YHwcBo7!_A*@s?^$fxIh zM*X`nz5Y>fDv9cAw%CJ1bN_l|4Fmu`xe)uO_^vXW#IpcM5n;7o?Du=U_brw;m4M9u z32zmr_K}ResL937;x-h-cC>>C)Zu|ZuVg4|0AGS%&F$LXVfLv9N`8|`4KEVgn86%Q zs1nE2(GGVjm8>q8s$%s4VP_>d-X0jr@C>xZj|4{nwG1 zKfy>(edU^ZhKCmd+d@kP^KH zuI>PA%B6jti)df~wj<@W#xou*&o;tNhSOdBQ|p;yLd8U>$6wz_ZqwNBx?LTRm%nT4 zpOe*AEYRuvrh|TdlX+ruxX}d`Dnwcc(|F5~XZ&7@tRz9|0$8}YWsMn)r32V_1^#zK z&s*`7JZ%%LO_%p*c8*T~Jb@&@!WBo^68SEv4=GL*>)B*j+=$QJ=Yw_V{sy>rA2%Hy zM)**zQ`qr7XBsJQ_cYQZQQk1P-^RrNq%ggR{`>|4X(p=Ge^+G*PtNE5lI|1g^`ErX zQ=CH3yny-r(?|`73y(C9T5CB#xmzi%yHoD~+86!dlobQK|LhWhU7FhURg8j~OKYa( z%X7f+vYzS+yObl@c)^|uoR=JHER1;hM*4emtcTjOI&n2 
z{w~3GtozQt^84`JbE3Y4{r`uke_!y)eYgK4y0E~=82(g$_M5Z4_gGcu2vD@eP%iMU ztP$MfSaoeSXgT(G(&aRXcQWfiEn2H#%d79}sNzvJdzokz6OG(PdiBJR)SvN9{MZ7eWiHlrojV zic+-?Cx4Hf#Fv#fea-v1a;o`?CU4j(Qj7~;z*OyOF`5<`l0ko?4wd{qjvs?Y4EQ+R z@tQL*?bxw@Rva_s-;kxmVo}iiks)#nB@K6sM&3ZRmy5hXexfT>5rMgOak!G2J0!VS zB+H7t9_WSa-izg{9BHzuZ&hR$aWpIewe#wL4{4Efxs*APk@k2z&2xRLnEPnvtL2+RUqJHRIM+v*Y($Mu8KIYVzM8e+xVy9N&4b z-{9!+x&&qJ5=6tnP$pMw{!Kdc<4&RxjW{E zM+I~pvgEC(Ra^mk&W1ITy`GW(>Zvs;NECz@EL74M0f(63y$yo4^bPZH&Fh&HiVD?r zJol@ApjET-;j@3C6;tFn!Q%M81k0Yp_XXO4gF07PP(s{+th{j3Cp^+#yR%dnGzR{g zYrf$y)5Dz2yF1s-s`Evxob}25dBv+C8*i7@aaJtC(y+eyl73X!bcbXfZHwqo`|zC9 z=^^*&xdLMbg>v4D-#`N}nES$&EU)a)Gm;+2J5j=MS58nOe~R(6wy*(pS#UKdSF%6I zU)OwB)0AbB$$#>C(-R(t+DeASg6eJMPXqDUrjn-%vh$=QK*Me9Y|gzGa*d-Tq^>Wn zJ^&p%J{Ijfy^l-pl!I18Obg~&TiMaXWO~b$s)8xl;M=Y)M1CKLSAix1iO|1HP{5wXoSciME(rK7DOagb=Fn{CPansIAF7Dk5o zq}ENU%_iYbiMZ@uVR?%|2`eVPEkg1KUhupM<_Kvs`gy6hnXQ2CKUPEs{&SBW;28o8 zhf9dhCb`S155Kx2NHbp@pMvxs^u;)Klt~N}02k!6Zc}2aF|e^dLt|?POPyX}Au|hf z=#rm^Gggx3-hjkxIb~sKUEw#Gqxg)0de*>!MR3nbvZzQw;>su!e@KxUY9LDPVwi?n zLF$a(GQ4Tx2Ciy@in6fvbf~li$6{TpWGbG4ev(qp>XYT%AGGM%cmJmEsKDKFW=G|` zqMong_fm^Q4R=rAW>HsqcK5VZ?KO30NI|@u416(usjBoMwQev}WnD$ex)Ae_WO`cd zDh&P5pyI7{Wy!%%R`6cuMN!|wlmQP#mXrb3yFPP(#UUUVt^M4yy7lYpZ=Lr%i|&2v zF`wIfvv7f>oo5=?&SsCAAHLFtls|s^waKc9T1k#6O+LhV9}XrT&weBZjTO7MpQw+g zCL0!(>43YsyUAtUc+mQgU3%-0H>f=PU|te*dGcu;#tA&QtR-g~TId!$mSW0o=-iXE zvBWYZqr8eFL8eKKs{eqP1eDxy6J}dB9gmb2Xnu9p^f#>^D@?S^qz1J<-~7kRc`BR3 z)qu^yYh?Npj(?&gJp7q(T1ocGh91&zX?i3rOf?*$a(YU@w#(D0S%`G@#@7N)pF%t^ z8S;Sh!p?K(m@g0Z8LClpAASUsda}VOt;t!#RzrTbBq-ipj>j}d} zuin6agFVE;oj33G^rXy~?b)*u4^kD Wo?qAL~7dqC^1+9g!;p*&$$)cB(PSBD%s zQ#ln1kG*y_(lo3GsKjm#+_{bacM$|CPe~!E4Sfb62alirjOJ-hf^#3E5%yzQ5N5;K z*o)rC(rlQS&GZDU!HW5;SLjtY0o`8^1a~Va}ymC#@fj zZT<%|-PCNygm2>S8aCQZx*rUl7+4DS{BrT>43$gtBc}r{xj~5>BtKbpA!L~R@I^$r zcQ}T-Kk?ByUVl@NZu_sfH_Yi4U?}9gzwTwLAX50@`^Pi8Z2j*efQL8fMyVs^Gj1jB z3a3xyKJ1&HB5*ONE`G^H%;R!_a?S7YL^iXk3Zv?y^mOe_sMk?uLHqD9;BHV_Z7p5o z1?CXcuPiaCk#b}=6{a~W96SygDlU_pI0QJd6yXhelQ2U(yAnY`*z^$%(LpvvWwP%3WMKMCACfzsK09hx;`D&=K@3a2_u-8Y!T8M? 
zqXAg6YTxiB2zJ_D>$($+^IPgI0f55#1J8d+%-kePz6=_iDYuuo`P9W8P>mIFw#mHH zSh}2%PiY*Ie>R>eVTF@6M!WM(Z%4Mc)aEuF_m<_)5+;8iQl}rI3r0TO-bny3a?|H- zflrz3;&02Pcpowl`~QrWYHun+6U)8v<0O;g6}GXQw{WBs&&ld;Y%DV&Tzb2EtP96^ zmJ%X)J9jEnh^wG(s{PwRJ$$GMTW<{}b;P}c>>jKyyOspvLZ|j+7xkAzi~6XO`Y3z( zv244zAG+;{B&hLen#8=XSqTQgbGc{2g~HTQW%lUz1r{xlL0q;G;8`;!EEgrB|8rzM zo;yG_m~*Fg&yR<1LGIOdRmtyJ0h^Za?|c0yz5=ccpTZ_=7ImJ~X5w%4W4f66B2%MMtyuBmfW)_C3j#B}sSDGmMi)}FqY zi`96Vb^1BoeDe52aH^y)M51&6mTm7`J~ojc@}fUh-%#^}CAe|g8EFv>=H-%}FbLcZ ztkPlau~-PGA4SHDE=7gV#rO^EZ$Z6M)I+rg;qO9Ui81LQaZrGHIh;5p&>ib;Sxll|SPvXy;v_PopGtvqPB&^9b{?isUy7 z*V|O5z$|ckro`-IXOsZLJqxr04U3I<5S^mA#*xlomH3Pa|OTA zyBSG>8ZK!tOgHUcbxvC!JL9MS#j;>u$*M##QyyQyU~*TRrDT7qc{K7vd))F=&5|Cks??zk+YzXH4+AL&$*67VFtcD%d38`3S1xjtNIw*96yLo*3Ko2gP*1eg(O zMhBn#n<#i%ckT(k1swJNvyNAeMpw;#q`QK;ZYiW%0+w*{@nkK!_n!XmKY%RH$S&&Y z$rWE~ny(2qSgz3D<<{b|V-4#lEErXHZJklTcas)Pb$qi~1Oz&_vSi(=TDrtv*WGcf zGY5(!gtzr}xj&2KS+VA8*gfA}j_@w=s-<gZln>pBwzZWhwJY;1PDTko4xjMzO?chIca5T+ zHYO^I%>No@S1G>L?|F`NL(gGm;Gm1#Qt>S>jxEfUTLVIfQ>HBVkxdj;zpfm^6}A~3 zwd?e;`cTuapHJW`TBx#uM zQyDP3m8?vD){L4wu435()!erMxfleQsqB8H|>XD-AJTthq-3Nq7a- zY(AcMjauBoe`0tIn#R>$U;CXwRC%Ls8M$c;xCtRtvqzYON3CKg5;x3$&%28TC4Nqq z`$VZ6yzvoUHLSMb{eZ6#x=&V7hi7_@o6#M^WH{2e$MI&0H6Zn9eN zw{jgnLw|P@`1UNchGng4Wi|ah_KRtuJ#FicIgc^dL;IeFf^nlB!40}?-6Q_otGRxQ zmrGbuoS;YlDSZeHt>Aj5sr+X@eZPEOid$=PiKx5Qj#)wm@ZchyuJn)tO}1R>8p^gT zLsuNf+&U@0_pb#)*KeallFfbUryV0)zXzY45JIDqcB%yx{C0s-dwEhk2B#-wLeR;~ z{KGr1tP}m;M$fqtHP01gb@H)BuuQPdTJ;|rmJ8GL^Z#%L|8K85@BC#vWro*4>C9|` z!+xdKLZnQQ3;P%=p@Ce-C%`~tFCbxO(TQNE<+|C%2Uf@J_}(85;O#RKS`yqPXg2-Z zVA}9G_o1xjn0nhZ^HXk3#z^SAzpUumcensA`$+JFxP%D(Sxp9Ve>;#>tS{R~sgtMu zt@VfScTaDVtGf=vO8%&2l)O2cCdhKbw9}HVLwrJyhKkob(!OZNJlGK3RQ%2F4M@Mr zlch;qf_pQxgF*_v2L+1DTHs?lO#{t4%x{up1mA*TpUh~J{p3@|!_1>GXqvZ<+BEGlCOE>ORuYTJM!-hWf(`#3`H2d$Wze~aEEA#(zch0u|N7~(h0bvFIIGOn)n)j`kFAu)wzR1vL(d%Xf6bwB>9t8iUuxT z8(_SeGqo%r-!YoZfYf!krv3%WFAl=!jU#t%7XhaTlPu0er9^nwpM>j zD-kf-yF09KlWC`&TIQpv9+_h3uJePNYzG2`(t&Km)Yu?>e+BFDa@G++mq*l|sd-RPfgD#-bHLX2=9L2;gPFR1-)S(i#-1e^cP;XCp7#7qw-H}AcL z0=1KSAn#d8>N48422iL%WFKQ(KE;{espd8yU65sh>0cR{wWzV+zlEf25N9PR%OEAO zI0Atw-SBb@$TqWXQkbydZ(FNIadFAH+4-;<{E$l;;jIBDLxKF6?yv)>l~tWSvTXO% zIe02RMovn#OBC5=6~Hnld16t)g$(Qnbze^d9KwzM4~>EHYyllCP=e#j`_txKyN+gC z+?c^Ako%THgZhf^Wfc9Om!vuL)#H6`^SG)7V?zG^pFyoq$B#C7l}Ym2^7M>Uaq9iN z0tXN-R%pUM`7H-P`c=uV>w)zHBl+@l_4=^y_z&elnz9+Lswxs7 zmtCj6xr)Ek2-WDaG31OJwIlRQo{i^}*}XIIb{vxIOFP_Ly%~v5piHlD60Y-fp0&%T z864~!@WbwBPGjG;vESr(&eREvNqSn<|7$APzdrFsBvR_&l6MC=@KrL)d4jN36d4;r zWJOQ?$F3!cnJiACD4^o%I8IK^BcFI507dNi=fn3*>##*0K2Q4R>!Q%WI6Ht8MW-+WIFQmi$5|C0Qw2g1FEBhC#7m)UKWhiQxa?@Ua16iU}G#$S|O7 zQN;cCr27yQtl07;wC5a^n!rg#(t``hi8ruu?+*aJDxI_bKQmYFgY%h-<^ONyN_dID zo4osDNo1j~xRB&~jVX1Z`CyZBwdjtyM+m-`5Bai4Jw^D5n9kj;zx~HA1U^S7=-3)W zsNRk=qI=0QIH2K`wsJe0@(+~hai$(k;V$;EXSK79s>5b~stgeQ-g>C6UjQ2k`l{(W*wM0y>{T%cJ$O}kFJg>tYBI)4{pK6~p%q`)Uc z18_SYMzA@b`|+(ieou<@^V^MI83{O?&bM{Ui$ui0ggjo}Oj!HsIA&nD?GOMsXftU! 
zm_KY#q&DlPCaQ_-GKLN@tTx`#5+%$LzO;TFJ2oXkmP3EF%4@6S`YrgcN~Ddr%l`6- zZdeEr5}pot8us&fplgc(tHX|3(-4mPRJyop^~&g@WC=B1!M6^A`hmP-A-t(N23gjm>~lll6}(4zhD5^Tr|uH+oiT%22nt=# zBT;CbmL^39Yv|fgmEdV05*=Q29uTN%uO>#iii@hw;R1JbyaOIc70`4vy9`Tz_1ElK zr_1^GI(cFdGUX=Cy+0~) zX`XWIIY)p~oD2QqWVch;)hj6oS5j49j<$fX)2k5RmQ5HA=g?qB-<8hBpwpF~h7@h2ESzFqF{(=C+#1Yhf=Ix+g;KH1a^l!JGSlqI$o8q6BoZ$5Am#XQ!Uz=6 z`DSmu_CjC!grNmbi0PWvtw=!pw}$_rk9xHTNBJ^&qG3wlxN<$N5*gZBaAt-K2Jjh1 zy$dX*(hCf3Ij~129P|AA?^^&F=mavmyiZk+S^ldGI{8AdaO zF`?GynuAVifY%F>KYyPVhvc(_DTlunbNDV(fIYgUB}u2SpJE~`yxWYll>dG(x@_$- z1vEFHl*h-2QymJqqLMR3(*jWXnvM|a)D9C9m zV2oZc_uEa}wa7jDsf>HVS5al2Nm4xBvCV9jWG!a;_qu{3wjT=qaQN#TSQVW5H5%P; zfC;8MDjc~TE2G0vD0kRBw)odY`AT4k96S5=_`T*6ne;#Od|StK^Py&$&|E5k48Oh9 z#p_})*S0N`*?UN-MdeIZnQPiP)1{{waw0nC14mQ) zEN|&TRY;Ru2TLkW)kj3L5FH(8B?YtA{<~HM zM?po;9;E4P%9`YV@yPmPXl_S(`E2AP%qOrJXp^H%AbM3qT|yMq4?a5m?e6(@MW0rN zWP-dlOv0+V)(O=*$^_kDm)`t4Wc45?lbb427G@tKRhvB0@C`Q#_=lR=6odcIJj-xy zo*n()6p2CeO>jK7C?xGCBkpcp-y3>wUEwZOING$%$qpJ}7HU}HE`p*;DwfjgPXe$0 z78(S+j)XbS@UO@G*zv!0D2!Md+GlOx6<~+q-1|`aiN_b~9LHqT&r0kw?A_WAD!5cES#3VLE39pz1*|m;OEoL0{Qj#-V2skdhwe3 z&pj<<(tWmE+?XU9x5KPrQ|hm0C4aNutI5gN&n!^UEEHD^g6fa|1a~E*s9hl2S|@2 z&oKrsxGQ?`m9;0LA+=g?dwnV~=8F5PE3MRGe>ov?KA&i=_y=fn#w}RL)Q?pV?b3b| zmZQ!r-KGl$$70b&+P#~ekOvLdW%!p@kgV`ov0URm=H(Ed`PSo-Fi!fxxCOR=J1gZ@ zH^@gA1ayJe@x#ZSHTjLN4u+2{l(zk5$W^K&N0Or~^JK3;ZXVJ1gB?3yLWkM+6>6E8 zgiLuwHLr?JVryTmey;Po??`s~ltzEZ&;Y|MPfp^69wMU4C%gF_xj!qG=|9@34yKFB zW<5EeHMpnBYbN25rkx?yr49M2p1`}%*sphI(|^LoX?fBRCp5@=i2YKI`|*uKP&M`5 zT|~u!(Q|GT{7k3HtsF$)_48Om9!r8!9k zVz@6BjnVSEI2smg>raW1w^#&{n{=k2e4l|<0*`FmE6wc`p)@*fc~Qo~6tvt@i`(&Q zRBb2%@IPN0*|cq@D8ORt@$Muf3dUUNy;K3j?Zs!_?uv07L)@L%dE46cOvbXme`^B1 z!y@nw-;UEL%n_2OLVwVJa0`3&I$TGT zj26~0>wymA7ZQLfWf%d5c4x|5uO)lL;q3)atS$Wy4zz#N5X*j!TfQ(0y#(KW*zZ#4 zexmC+2l!jS9gdhE>HWuf#?qpZkY`td=}#j8IbLF8WrWhX{~-gH-mFi2^mk3N4f694 zO0+2T#`)x?0_R>@wEL*biHwpSRABX*cmM~i29cCBvDl{V;MX8LkVpQs7#!cB5;m?g z7CbvYQ1vkR6&$#*#lLR8+!7aW*(t}=7>x9;VP?i`TC zc@sc+@OdHEfBqr$mU|2xl@-IX>vH0QwthDvS6Tt`-^t2x+>o1Ad9D`TsWiXb z@@vmXZ#?0P=n~oNMHEc|C$JT1fAWJyl$vNbBap9u1!riQ+SklBAIkEJ&4l(w`q4b3~G(+2fwCuh`erMV4 zv((EykPkMyWlEW5n)BHS$xKp@oKh)t^9cN`uFp=Pxn_Z8?n?q{2pl7Y_m{@4Ug%52%y z_~2`KqF%Q z*ZrbQSOV&yOIM+*c)A6A1{N<|OSEj;M0UHjswaxD>qZ|&2}$t@`Jv6TW+n z6lxL?zQ;M%K6OFwr3f|=M zg=WIQzVrUr;dgZv%VGQK4iYsf^AbU?9mwkECS9*R&Jg)yyCqVJLc~XwBJvZW8RsUF z?qzce&VP8D*SRcA42L+d>1@Ka65FSrq$H-T0=UnBjvH62;l1L)Q#W9^ciga^{K*?g zB5X(*OwTB;Jx-Kfk`u%QcJvD1zu0Gc#*Mh3A9HOBQ8|8{Cb5N2D<^tvl(%4Fn6D)n z|J}#eS!rL%KFF0F;Q?ugY0seOVr=QaS8oK{fhFJYX&G`eJwGwPXBB=F#tFFZ+W&Tt35y zfO}x&e;{1;VFT4Kn)cI+W7S;%C=C>=+R4BRT}igC!f?342YAY8Mu=bXU-Um9ZovbO z$PJ$QFSrfoOf{dW(HCbt4kmDaUU=4UDLfq6^1DXH@Vbv&>=@L3PWR)C6N};W1Mx^A zj@1$)Yk{_A2L(wV+-E5^OojcyySrO`SV|k-F`z&|q)Shv*AQADknd%ebM86U?cv@Z-x%Nc{GkKJAS>^hYt1>I*`D{? 
zH*La*p7js#l#-!7UT#>)_t!XRcY^Iv=Ls75t4)$_zK~J)QA$QfQC7B*ceto;6rp^FC)k7NWK|&t~ zr&Q57|IZ=88KbSvV{oOcbhYQtX(N58;ThciEO9W*L$bCj53eM8Wb!tKs1(F>z$2|n2Q6rA97mZ1D^%{RF=+~S`+js8;W zSm`uuzo3~p;s>iB2%hg_;}zvmJ8rd?!f_s%hK|>zOXu&0OLWR&?Mi(Q1|Tc2_R0OH zge!031{ntg`zHlTk)8Nq!YuxXG`~0OTrRuZZ%MqCzkKSYN|e7i5Rbo~BYbd;tL>mF zqAlm}8>F$Ke&xX&3Ej~;8!>ItEQRg0cKpurb1r4DecL-6@~afG)I$xvgul-Jk-_%& zJE~i})LTeL?Xm8w_M|0ftK8^1g;uhmU(uHM(I&tz0+gaChX`10$>OT^bA{o;zpw>3 zhrMCinGRTb$^P4>2Ggy6z4?59i!c6r<2J-IT}DizPtty2vISBW{Y49U4vl00S=AC% zdTj7J`M`W&to7oBqqX4UXsD-UoyWs$Z%wN=?N=*|V;9Y5=pBgG$96}I zC53FuL!xqm_7u0KFux+?4jyebOzz9QnRxo9P#=$8Wa?L|LFW)mM-U<0tiRew$G#OP zn7%2Dcl|TfcSr$r>q53qlOOK?P!&(M2_R1stJ&Z8RAa^X_4B9+-o0=BI`?uT_3?Iw?<>ihJe8iqEP%l zX8`_U)qBf!8h@q!GyXbYqo6PVW`}y+HKZ?>Dcl^?1KJnr9H}Se!vDdacgiO;c z!p9CjMZ5uzlZ)(GPd1ttLmuT{Qx&|LTUT#OHe#A}=W+f(^V;d`nPKDc-GQlCoNuUM z;5zSsi_{+n`A^bAO_ozRIR6(p*xWzwr6vYvb{@!AJQtP|*4G@{;zaMYK+8N!^hah2 zD8PMW%T3Ea9XmR3nhQ2xs{Nx?r0X05zw29Zs#9JMxO2;f#R(RzQ6NC&{k37j4EkF& zL_LZqDU$OkAZNt1~n0p8n{SFn#3+SdZUFk6$o1`IthNu`gaAO zIz9Nx|8nqusm9+hYyKph{J+JXdGm9=B5@5Ce5Lo4QmO4|BO@IeZ-0w9Z)`yE_pY{c ztp8BNzd0;9IF;Ra^&3Mk3~et!nf9a&cQWz6T4cC5$tm$`7HgLO*?qb!31C0ZkHTzodXTZ2i z^O?y{4bQvx@<$r}S_OkA-q!9tK*^5$Z=R$T4aB+i4z@u@POs9$KzHHO%EHufnrlnf z&Pu6jbFtR=Ei}z`+E?MFfhB)A?>WF0oKBFAB@qVvrS1Q)I{!4*f51@ThZa?^lsjb; zwHM*)Jy@at`Nx;7Qv?BaJ^^HBpi7}fj^6)Vlq%;1>l~jX8wZ8oS)U0y@>H8vP`JUE zH>Ra@)(fgem1@-|4!;)7Yg4G6Wgm379GTu48F63eZdF0dnOSphRLZq@*|cH4rUR;S z6I=2r!|v%l;8FneJZEIYRXP8VyDl-!seU|c{DUlE4Hz=?O&N^#RZBQW=NrH= zdO7|<2>b3o(-gbo-=KQP4W#IzY`(lUci=rHO|y7rsmyfdTcyb6?BzGvxOuAT4xGoN zSq8wzna}=li)`OT;dMHDmqOJv%g*?&HIJ%Z0KYvu^A5D}>5Ky|4mPVX2|oU<-)8&U zj6LUi?90=~xN_#lP_P&=v=x5H3yr}5%x=B=Gr>&G4!2OTiJH$WiCE6QOs?tCo=vvz zf^hq>$S2iIu{@hEQ>hmfIlOi#f2Zg@!--JipJwip&AgiKlmSH|Xqt$Vk=pB)D(WcB zHMd{R{3!AkAzteM8&DnT$jxl#QVprLTVEz=s7@C-ch*JJvH|440~hGwaREq$9|vJk zM5lH%BOpEF!Ffussrs4Opg&Pp)=w`+`qr8UPwBgflMyp-x&9HM`JqtqAPHRhkx)KN`Nz;2INd z-r1i-r-sv+jbSm19)ix3`wJEA=kthDqJOzt)e(MH<8wU;mDX37&GF1KMd8fbdWBDC_N$iQqKSJN!2X$vLIyj&dQuvG?{r&t&uUN;uHP}A zsJ96rm>{?X!3oKpi1n%F$Ur~6snVl*23(d)XTB_8xz!P;UdxBvl!eQSIb9hu!8wz! zzvw$e$#Zy#N1Rza3UaGdd~M$z*(4j<@F(0lwhzKsTPCO&m)*h#*m9ZA3Lw%+LSS^} zztuVqcB*Dc3}bf&$<=O|)@4!vBKAMCh^YV(djKNNbWeLSSeOQg_*YZJrTI&DtMH=E zx+5j_N=?8jYyegv_yl8nC*hz=j{BrwU~tHU>M0;$s}^`wJr?t=o7 zu_er>5a8?!D81{up?AJB9`{K#ZQH-#&6_5(Qwn14vkH|l{HbYz0lo5#ax*$M8p--I z%Te<@k8!;aF<V!p~Rg#3)KR0_a4VDM`3j#E-si!BB z6U@Tu2yi*A_CKe;%i+}&cjmRO{&nxeN7E%di_`8rZcUR3^b0fbS8BTe7^lFq8Yi8k zr>bfVr^w8`yy>@9)%2Yb4B$_7p|@U20uf`}=}QC7#LQo;WUn)&rLEeN>xiKm=iR*#KupAz6M?P_<~;+_kUk^@(R=&Ywe zbCV5ZkYD-Xany&APo=dZtjTgxKs3{m;Y586P<;Wsv#P%5h29&F8$BHa{kI_Nz`>eP z*mNh}Uf7>uj|?uAH+fq4=@@{0hM{NM>1(~8%@jFe}+{+45V}Fr>~#Y*Z}WFumZG~*Zr)(?UxW6RX|48XAW`MKF0uR*wTB} z;2d~?0Y6~wQ<}~wJ|MX%2)rI-O@RKaw~WHB6EHHS3ulc;e>o3N1!US|a#nqi%VjtX zPttxK!oMf?{!2pU=OO&3G}TW{ID^6aX^#G50_LYV`p;>@pFZI~X3c+^qyJaU(H3rP z>f)BQejw{XiJtdEdzC4kqTyFBC|>=-bjR%7+8CiDzIvvrz(WDua~e0PZ4sXY3>~O! 
zi&PL}Y3Zs|>~SS)DmIR`0!2EO$V6(QN<6~Wr{k@^P~BydYcVRDe(hZ6H<;-@b2N1f z^`61uPJ)YaWJb$myv6r3-DUT&O!~n`t?V4-=H~It$Lc<7}OMMDG!onTBH2i6)#X2Se=sx9lH_0LJ@u^vWeIGn!(CV=faRx1C^GoC(`MJxe6Y`2G&h8Vm$ZV~_e&HVoO?-+?MfLBwv zIOu3bYu>Ybch`UHGenq24wYFK~T)Bl(3Dd-Z|`~<8|C+W4uL3YKjw$XL&Zkg4OGCaS8)KPr&oqZoBn(R9Z zHE(U<;QbV@ds4r=D2d*QQfsK1BE0}kI9T^?EgcISw}#Di#M3YKWle2(=gR&l$%9{S zV6oU|3Jbv-rMJ!YEj(BWG|%3<#W8KUH!RYA1cq`sZjB=rjGcMPn-5nIV^syP# zN+-1+Wy9FE0w@}>+g#w|ql(+ve%qfAc|@BU3Px)8Gjrfzgp*%K2^qMeW#bU>?tK`n zNSi&Hnmx{CPygodwtd{0_c?!HuqoJFK^NAhp)HMjU-4i~HMY?elSgl!C~*TxIrflM zm&Uxty=J`mb{4T9>==x{AokS{_Zog47}n?8hLEuOB3w}iR`D0X31c_)*IQ5oEHsP0 zXvZpt(G5gg3bZxzUL?6}woJsKW7DSS56_y{EF??Po0f>o#9ZcGpm|$^n7>GH^VW(U z^G#@a?WM9_Te_I{Z4Q>FGhpqOim?`i4ZA$*&2mo(x)^aq(u;;&otaBL ztJ~bt=|?G3xmin^u^L;AD(}q`#Bbrmie_p7CzImxs2xQkEfe^Xib1OjP8b%;=cnbn z_wi*1EKSgNxheLC@{Lxp*x#6f)s3=nL}HgZ$mzS{uS~sqA?9tNh!TOv3|DaU;pO1N zmiG~z%gUPOjjq6(N4`DZL8 z2Rj#w;KOKPBO8utnwecDk+6o=gAqh=NrigP=YGg0myEqNYNYK^8TAjuY!D@9rqU`K z2zDil$~jf7_e2{rUtWlj8)>$_(r;*dSw;+YpE<4+%h}C>8NOHgL4tV}!$mQplBwKi zdKI+sP&1RZ{_(_iu9$4c4Zd=j2#Fktz(WtzkhaDl$4boOA05z^6y--&;lN>Cefkvh zAYbu(E9H2a_4tfFNKb#(^eakYaKp2@nWYgzSW`Pz4VG|ibNwwsAw_^{}9l%ik zeOgKH^=^pc4^5C1(fP%`Y##MYu~Vg~HJMC{tjywyLcSz`U9E!6?-9UIY`5=}3nIk~ zzwvcU*DQ7;wz+t7u+ibBk|ptwa?bIq|0<<_{Wgoi^O(WF{;aG3#O5-f$Po1`Ra{o_ z*Rm1Ph6KdQq29&?aKod9fxSHuO9pwgv_# zZXQqK1FzYP^=YiisPG~~65jmbF>+BXDZVSCE(@dG5ox=~1@1C-Dqe7Bq7jXhxz3mk zYW9T${_y*iF7@_4wKcmuzGYv05v-4FtV~+e(3BF=xs7@OcK_Gy|Lf%!Vc)`D2#?H` z*OYo)O|Y0fBafn!s=RgaNB6*I`Vt_a>rl{C zbZT+^cxPsEnytPaT09rP%KSz5ez>g21>gkEiF;q=n7(B<_2Zps0?WRz6Q;$7m4JpW z{cvX)^b|BM)2=$ip$IKi&L3i$N>ZS3=Oou>`tvIMyb3?B!nyPRA6SL{l-vWbv_DvS zxxI(4J%0@9?_K3ST5bv8;eu@Wn?LkZox~=Mu6bhKdyb%m-ZaF{G9+3XqE4#;!DW+Y z0ETq>H+Ke_%?BY{C&!~S{`>1ZUK^8jY8s_KM3?x@09?r8QD>IbVwUu(70WpPZPA{Z z#w7vkPD!&l*;LcG@opD(;!s|cy$y_!!36s=dHZr?jWcDv+dksJS8=xA$8jOi`m=@f zQ;*0_=%=xoS|+c&a!l82zAfaBR|o8kBVIk^z5VEA_mHV4^P<1Ehw){xY{3P7cxG4mt`QNpvFS zaEM=!r;UYuk0du-Wl#81L z&SZa4ySDX}o1H<1ozL5*5l1_n=du;If?eOJ_kd$BJ?t-8dU9|sJ8xP(PqV@U2;@%#G zdMZn^c1lerd!H;a33-F-199K@cj?Y)B%b>$9fz*zQQz#CQgbnH+Z{jF)Az%>ub*GQ zQPS&Uj3hXz~rn=#Rp3RCEdpE$=*v<%F)QKIG%~2!WCk$C`|NZVVwl*(ugXwr$t2} zyTIBSIjXV>OlgJ491XSaU;Hu45bUj8+!{)A*;vp2p4unD$NVE!v+!rfgR%g?$7(9Knn5cl~sKeUK4DID)YS{Pg z5*lmfa(Tv>6vgiC*Z_%kC{*e@J5Eu1f&R zKr;pIdlA4yXSNZiVH|G?gjM?R9%p|XO9A-%(8rbX;x`)0<|$X?4UQ9X7nO4zaWpSU zZqjJ^(pp1~KR&s}z(Ae&;m)lpHKEJ4NFODJDH0gjRV*ZP9%jVXu*t=+$=Srsjhba#MY;qoD*1I&$fQcWw$} z7Ebag!ntIqlVm93LKNDYVv(=C&25*vRM=u7B z`sRin2(p3M>;f3BKcBr3rwvfTOVAJJJL+t1bgqMB;ue#%8n`1l>p8h4o)hMqG52%O z<0~r6Tx25AQqq`0ipe%GGmcC$CAW+@$Yi}e-zYTe2lLC^oO&q{!=9nrnTqgZCQnp= zlj}rlD-0K;7+OynCtESvO;s+jF~<0Tq)bNIQ0`Kig@x(f$~{b+8Kkl2(`OzE(F4-7 z8E?FkW*bXQyf~15u50(Q8fTYK`>(r8H@gG;PsCP&58u5mUcVGg0E?w>NNO>@ zLetfyw)g_jxa;$*&Bhq1ueZJ%xrHn~V1v-DuSYOBb829JGiKr8jiQCEb7n{yQ6wgc z+pRp7uN&FL;DM71>*(;y=E2bLa|3HElF+(NLqhSQyOIZ`6ZVACyW?sbFhI@9c)!wS zoC_}9GSL}wWRiom#dS?f0la7;G@2fu6&Mqn{$AVC@VV>v zUqhq6F32o~^F>G0Cpvl5(HCWn&D2}z(9LIAl&VALG-~TZ-rjI~b8rQH!D}udo*@Hbr$UPk;T> zK8OmhdQPMUPqqrO}6lo;P{(VkjdnEal2Hgv@ro}(;r>%% zI*m|dU4lKG;VRSV>N}yMKGjv$J1o0MpIpZsbF~Rz-y5d&RUck@Pi_^3dD`%-igv zm3S_DbqAgcM(kYM`ENKD_bK$<$Ct|_;?N3HUB^??2y9_@F>>gE$7denE440TJC{ED zkIN2n$2I)=u5NB1z2g_-IV7GNfuwkdB-cn@?_bL=yR5?O;wqyks5JsF3#vdL z;K@EMlj-V(q`s|sBpuh|W+wx4R~lad)e%{lxsY7%o>!vC1?e;Qr@bdIGpTi?k(Mf|1=q5Zqkt>RQ=7d4F z`MqLhjyBm%hU>-3NqG~%t2~sZwtdo=5S@DBg&|A6xsC*n2REOo4I7=)U2!qvE1To? 
zzcaK|?Ig6apRpRu?&t!;eBqKP=-7%uhE&Vu>XD6S4QWXmfsww}Lsz&Lr^LjtjaQAL zksxdQ8h)#9zP?MaYwQ23S-`Z0mMRn=T2RHkn5u^jL+lBW=}lFV;{)N z7zQ|%R0nCF6Ppr65~9)kML<$w6nI+ZeA+$0DIn5~uiCeCumr^o57BGtE0!h_4nmk{O_HTIYxHIZs zy1m$F`uIc&Ok28M&Ek!HubH!~I=a`&_;$L{U7=-o@O9SKdQ#DR@$?tyhb)j4vqp~= znrmYn&%?+eR?xnon_u*vch+{cb_#v)VjI(-H%$&to7*Ndt0Y5~=8q)LX_08G2GY2D zvw;?(k;5AY&}cI&WP9wYd(zk@{S9CnAb$PfY9!f{47&LseAqfItn6)msVKTZ4WFS2 z&fm~~1+IxFNF?cTmGQX)5U;G^D^?-gx7h}1VZ_;7e&9tgQfvNmrP7pG`J)PqNB)%$ zFmHn{%=DF_0>DJfXI7C41HeJCz)o0n$h;wBgfyD0Q^>&kPMXHDk3@eratQW1>4GcL z=MgXC`^i0w!eJf#Yn&Un!%;AVDd$e>JtD!9OJLhCrF*{>hapzUm^sKyf-A*9 zSOd`Z&?mP_-|j?N<%Ai%qhE~vCS+WUlDkBEtGyU?PjeEe!%=s%fcXvtzLpstNO zvKzPKI{!j!BW1wYgkh!pN`Mlahto<q$ zu5KIIfxw0}6835P#D}B~L&R}^#grn(CQeDqzW(SU-8^T9hkFv!^Zw2rG-!5p7^<~e zHiwaXC|zSYU{!4QRaeb#JS=?a%mGu&mP1UO$Z3#u?VU2|vBy&TwQB^!q!ncMYh{|5 zi^VqNyNzUzKGwmAvd4^>gRq*Y4|5$4Cz~3JbA74?9@v*kaI|5P>FM=N13f|CKF%T1 zY_o57r9yz1@n*%t>)*XDJ}ktqhaPX2Fk0cm5@=z|72wFwsPQIc+RyjGdL~)moltA> zp?4PHV&WbOwNm+O_@-e^nk)8ZeEhFSlN3mQ$0^4P*~rby8!``rpO!I##MS?diqXz7 z%LC2)r(sQNm4IGm!-B4T^SIGY?JFRdkuq;E$uEGx6=R>YPWpWw0?DvkdEF_eFZ#|+ ztf+~WjGl=Kee!X>{oHl=1CkT6x1dEsTsq|U-#2y{V5|9hB zMz@3_;7oD$kB8+^KvlW+&#H1HxW=|W+3I70xu(>L=iBe*Y(t_A*up6j%!dVsXZAH_H?mxAIfO&dFD4zmvx4V zWgCj{lC<`HiUxPq=R5Am`nK*xfzY>Q@|z`d2&Xk$FsaYi-NO6F{{$uxMwdxi>sq~J1=3p1(!JS}!4`1)-h4*1;FGx}{mrB92Tb%d z6^ZU%R)9@0KZ&j2@LC(wIyJelaANxIIP|YPxOZt-5&t{o-_KRQQ{;Z}2>hvHg`qh2 zKrNmoW$k{jZtJfv_;sYuzcREQ_MK+1&jk?6driC=KlMdPd9Bie%$Vy04$Fs`Ej{-9 z`ETjF3cB;=;+jDwZ_Lok_aup;X`?`G-pVT+*^YV)R#8CmL^JM58)zbRWxydZb|P|7 zTPLR4vm-xjUP&lTqo`n6%Ap@}!F8g;L@2arw2Q>W84u0J>)3Ya`(Qn>Z8>i5Sv2v( zME*ibK1aK!Ab-wCUgZ)krQHosrb|)}5MhHx-eU=^WZ*p7gK)ic=`5-Cf&*>y0F^YN zY5#uuj1&Z5#_hPOjV0r`?7XPsQqGHT(@|S-tV%Q5bMV@``iCDmZv?)jo%#utCPG$;Y4i zv+j{|Zv{60D?W^VcvZmQyWm4Ap5Tl5ON}61tI|+}=whx|nLKwwDvhI1Pq7z1%Wzb< zOjXE3QZ8;O?g zYjXQe$o6KS=_)cIp97Ni)657F9L6F$FR9ky(U?#Vq!ES;R51WSRHL5 zbYU>ci$yQzc@j)GKx!^ufRJ$)A(jL^_u5^HryJ1I4S@!7kp@+~sXn6kg7=M=yE?3U zA$n=fSwlf5!HZj4+OPwn?rBme_3|=OEtaQ4Vx z*{}6*m$|5nGkwOIS;B1ZiJr!5Y-J{onN>zL+oLfH zg`!3fv+GCFNtBJE_6$jB&3c|@;>=jsrD!gEfmf@7`}l`mpxH4%MyPvwDImJ z)|VOzxXo&34o1hseiYWeZKPXT+9cs-wx^YKtq|0|vO^Kga<{U3rUf;bWlY|Nw z_KyX7x|hs2?98l+?#=`sSLKV@z-z*R7?md!@#|HNa`tE6pO|Z{#DBDM2cs+Pg%6XJ zHPyhc^|*Pg=hb#f12kIi8FCoFjf{4;-=t z(w*=F$V!tnGC*H)$ zPlbzS7`-ammmpg8&Iar6H8)1%Yb>z||IpN~-!)a`pPJg&`ohRy($u3Sz;HLit2>JM zSeL$=+G;_YL}Q%Ni)0ix8DFR ziA5?DVXp{c|HzS$T)dZ>WyiMCiHfPFrcnz`@Nh8HNZsK#nC1S402 zwst}bl9lWO>UB)Eh)x{C2o-7klXBR2OP=?&-RWkl52(y@e0FVEl=Ea;uHR~W#${{d z-pPgl>E5QgZGOn9!q3kJJdpidhSn{?!+pNdJZ=XhD+rtnwM_7UcCzsJ>>u9f?musI zv>pxlJhkb=sA%?s^(kST?T&d|ie^>~3}21#xxTvlXF2 zPAsSIjRN{#$^<54ng?nh5oRt{9@iH%?)P#^=7wDbaa6NbjTqju?J^)c1 zL#`L~j@Wc}&f7G+hZ=V}{70}+Xw+KReCSEH>>Wavn1lhjeX+iV96xCbC>-aYzAwLwlDujeck-;@)FLkTC1ocR}YY!L1jT;Q*j$D#;ZKR7S-; zk)y5w0`m$e(yHC{X0330n91$*ah~BTB{MmEF=q0i=ury?Bm(& zJI7DqvRA5E06IWO7kaRA&&5`Wuck}ZL!n}rHv95Q1?7jN_wIgu{OGsYvI&GM$jdH5 z%w0vyEA1fPnwEip+QCyjo!}|hcxQ)vRIs-Dq8yfmH~;w;zuf{%qikGcQ~rnrLq~@G z2)qKoQ7+OscZ+a~TD=(@1q*A{{A=pLRtqV#u)3UFF+*VN`Qqd6H_q~Lha?K92I9tYWeOCrzBrXWn(N675U3 z)6f{wY0*7pcLz}XQ|~(ED=ZzCOjwKf(%F;um}lVY=XL%*4#0fK6Tm**)ilw#@crR- zpT`$p8vv1~-cSr6=FYf zrzFsN7aavjeyk!C~Ypw=V{ zUYR`x8+>VTph7{zj%t@6v3X2NmT$9g9-fTLG+fMf1Xe;qtwIT153bd?I zb38f*l$e)ede0Z4@O{**f&Yk`AEO%!Ap^7C)(={6wbVA|!z^XphmTU;=UY<2i`m?W z5$GJ0=I!ry38KxO-@oexLKSn znjXu7u8Pkli`^+K|LSce()6}|N`K9C*@pneH9vIuyNz4C*FtvmzX8@dbv4(gIlqyb z1_nlaT3)^c_3%FF*>h&5>e5A8c`XRn<`GV0DWO3kaU}vo^0!}>uu7Vy-SR-{({#xd z#7sALf!eh6@KhE>45XbibVd9!aESchvu_bBY>X=@o)q*O;?FZ=OBDeN5!k!6Ao98G zG8_ijK0`fft|={Jg6EVXQ2M=`veYV 
z2Ip_zJH;R^WMI+8{KlMzLm{?Yjj0|CfD;}#b;2d@Pd%`)^c3Bi$Dyf>7$vrsRoCNA znJz)>GFIr_MpDLeqk@LCdASDY#)9nUg>o{|(g8%PuV4_M%&F6#1^9%VC+;y+h@PJo zb51HczNO$H>MWzU&Qb7Y8B|i>bD}Pe4u=F=ygcEHzg}+_=pnAd&o}{gd)xjrrc3>J zWOy45b0h~OGsin^g}=F}fWh8%nJFIvo1X=vJ&d`6*IFS>9fo^|FZ~l67E60g8m7_) z_K?z4BLlx6*yVf((30RqXhH@7885o6E;ddW?>7#pLR(bBc1>fU{G=c56NZlMJQa3n z&W_h!h3he8IR!P0yEYVvWe@fMJ|Rto!CRH7Y(Zawm3-PC;YBH*^1=MqN<{!xXu4({CLq|-dV&{Iy$H9y4+Pw|3bI;$aG zW8;O0Mo^M91gBD&iMNzWh$2_9Nztm|wR(zY=Go!7`dR$ z$vg!OjeK$1;)R&gnD&+n>qh&;r%B{xc^&X+c93O8iUu1;v z$%Yv`Z^1ItI*oH@LEHD5FfFZ!R7E~yx^jh@n9!gtPRIYWir$P00gjh+QshWS=&~@7 zPw158h+@B-t+AfeIq+pxRwydks%OvD^76WyqjAx6>H?ySTq9@bH}E0=Kz4~~XxO8H zE*bC5Y12v9EsJA9k;eUZVdTzR>J8s-C6(iOarZf}X&vBl<05MFeU!U7B3{K6dre1O zN=TsG;nr5Ct{;*EKj^$+nowTdjKJ0nd12R;Gf4^FUZlh2ei72wlAygGL6kn^FeXPOs#`X-%X9PX%fFq1l+g; ztvtv~D--WIvq){d*`t@`O4lf@S0m=h4&mORnMzxpDvTCppfOfCk{jKZ6@GWKDNkgC z1q=ORu62t;$weD?NCyTHZq=cJo*67R0#Zox7sy-&VEj+BMD|+u{G*R?!{+u`(nklQ z;I3eV26=1zIJzjO@n(fnw)q=>q)MpO{*{W`d+kRWv@I@yx}b#13s|e{;@hH zKudbsnVCF!V7Aq5t5ZyTv-7gN>4-dTNLt#$MMf03RG6Eg{SKm06UW>4jk9UY#@oQvbOI>dnbwl2GA6b)Db(B2GMZDE-Wso z%Y#DcX=IgaG#b7uFP}bwNA^Gn7JDNUk;4~;P`N|NwQ7?DatB)Msxt}cD0N7ZPgCL) z@+u{&roF&V_*#NMStzD@SZkX}BCb#=QP3|<`XrA@X{Sat=^^igwn_7*1?c1;4>3Ln zkq{4LluJ8!L)X)lga7R6G%?v=!iUzV>Ol)DJ3M*A73a;Y;OepcBb;M_nU;x|Z>!Ie z3$(z~nHDJO37+rcNLgh!Gnl09@@ztOJAjQ(P9krW>(&v&@Pdz9Ug17@+%n0Jd2n3q zTo}tudfk342V2Yt&TTZ(Vixact#0e%B^;aJtqV}Lr{?TPFAI)Hb=m2maOI)X2hFLa&Il8 z^+hqpHX5xVqa}5WiKk(gInbUDGGWy1W9_?PZua@}w0B|d!BIpQ;xtw7B9p}ah<%=T zd5V%gnPZW}M;>iWk>%{7*Rm0R0!I`#oOnZ(G_c(`ZRTSQ*=b4H{d+ZY?=On&Z+P%C zJ_j0;ZeM>F>!*M&+Kbn~Ppz%XoBMFbCTV>xHeCZ=7~vcbB1?uceTiCh4I+iJbAeYy zxjEG70O5`997CC$8}odo9L$YP4`wT{-4>cAx|M&Ee$Jq|f3@?Q{;;|r$Mj3QIGw9{ z3vf=zMdRX(74nx`tE^e_K(U2uH)J%&}F7qWI&CVjKL(U3YHroNo5mg^0yb zJ-`W$GR}e~IS02^7Ne`2YaQ!_e5W07NA0>MyUDov(OGR)TsY!m`?3FWKyQHxa0icJ zb+=8Not00f;$Q2IbQvKXXM~;J3?9#t!Xxyd?YTHVMUa$duna8e0L>X^n}VXC-Xya% zF;N-!Z}rGOLQhDWsdNq8PjB2r4GaREhoEE%8h!M8vhSi$GmAg+LD?<9B&RG=5?qqr zAxZh{!!VErqtO|f36wO|5>rz`VAtf>^ULg@zxOBtDY2|Zs8;eh3*;WbaMz+8n$dn| zbS9Pw`QS9M$0Y&51iCr1YV<>X?^7fSn$4lpZ(Wf@x_bETz2u{h$9>CDDTyX;U?N)_ z;31u1f!}L#0bDH+T3MrkYRW@dhQ$vRc7C9vlc@D2W@vr=3zP_K9CV=H42dSf?PV<^ z+Y>+}?nY)PwHYmD>>%LdB7^rL!3jASIeXx3_vxqY30JThcwcv(dRAh9lEQ{mu++Gw z_{MFAHzw_MPIfXp{O|N^mvaSY^EO(_yyIKcfVywAfmS9rZ#~c`%~j6ftXdLp<-e2{ z2^O^N@0_#nJ>DG!3aGx;zUxu!*&OOpm|vTM=d-0@@56|Cjti9I>AK=r`A%zRCf-w? zun_D_2;Mx%cYUl1@lu(Ur;m4ui_jIAE2C|g@iMLw4Sc;fzkIzxBU{R_KKNj34&ln! 
zVN;LAaj#D_RE*`|CfeI4`(+QhG%`8jAhwOTlOw=1^z%(A2JNM&$}(bsckJ=91g=$8 z$4-u3!5BXBeNybsY3k2ydgQmV2uv30%C>l|b8a*q?|oU{?&0Oi?(WZ=Y&)#0ptbGG zELwq2i~}9upoxcOk4*gG!m#X}pzS~m^qUup-@)8AE-KACIA~OK%Et=ixy?7Fk;SPN z_j=!dN%OER1`Anwm#pRzh>DJ0Er-@O3QV;)0Ar`0gf!v`gV0Ca$NdxA*zV1IET8!N zqR(~_q`Qq()qO{Yu`8b^elC5ev+fCr?}=&J*IswY{&uP0<_XMB+-C4>$i}i;-?SSV zu4wiqXX0aj=AvRyL?o;4uRASE!fW{d?e@N71neMR22foqyL8$rl>MvbFsya(B5yZR zFPxUuz*m<9?*gbjuxk0 z**KPbJ)nUj4~Q(~Ccma?_$s2Ny#s=)(C>I);~nHQ8bM38(GVFFE1~Cp+c6X-jN3IY zGWsMLi5n)Tg*?)f63aoz7dTEe<%f|ekhEeWiI>G!dyZ*;s}d0FHWM%tCzX<$+FF|s zK|MdMlWeT6vAE>7csNvadGKaSCS5;GAnh_y0QfDMhog;~;Ke3(W3zl3LBH7qIp_}D z*X>e}+-yVlS=wGY+4kyBY10Y7aGBrUH{Z#pr>($@xbW<=`fR-yusGh^L5+%AFn@Kk z+ec{xxrWrh>Tcv-W64PtW+x2MyS zAKPeB8g);fEKiaIB{#J_SpKWC6 zDl+*TfLUBX*q7NbmZh}l(TrXLic6p>0vKi&iXwI4&^K8)Wh-P1L}AKB&uh z`jP3;R>g@uYDdVXz0_T&u;ZASYTa9+D+D}&D z_Qd<`Xvww$lf2eS%izq74B!qDPI4yqL5Z z)Luiw%De-n-FYls+HLmAL@R~#ick->4eh__>^t4M9P8o;onS6s^i9|2BoE(giZx*m zPVsj1RooG6KO6>$HDHGzvZl-r<{JL{$o6bF?~9WY*X@!}I|A4f>ECW5ao{R@tF$bv ze(L;T!lXvG8lq9R@Zn&OwRron4g7-TLSk85A>Lbw=SRiV#NKp!({Cw!o%w8y#(HjX z3LWO`WT+?NrPZc|y)T*Bo3_NU9`-=h)q_z;7!tSLPoHA)v2Nw*R<)#=HuC{B?(N5! z*o+2b`wkKoMSi^V?Zhm2*`rcPyr^kzX0eQe#Sc2*^nVz8&$uM}_7AviWokpSoRyW9 zrI~xuQY*{RkaAXL?iC6yAhWVCaWOYO_gxoU#EDhCsx*l6)YS-dYtLR8eAUYS33fBixEfcb zo!55^AyYkt>!(HgBl(nu_hIJ1qmms z(^*~4Zg(?4-;8k#v5ug>;#*y9nz3{JF=6i0`l)`pI}ko@=Or~PTB8|JkZ?t3kp`

rKSwRgzCwlt0E!GnJ#%;p~w%}c{|G%*k7+hXcUgw%%%6C! z(?&p#tf3};>qxzy(@B(Qd%Dq=3UJ*e1y9`5&K~>N%;OxH+!!G2)a0rB{p0MheW*#t zcWJ(3$|up0SvUA~JkiBcMkPTzP_x|&81_IQ#5#7Q%a*K~U)k`7}O)81i4w(KPa$T6c!wOG4D1=V{^i}6)Pu-_seJ_d5lSS4G}7{o>QQIlCF;m(=zwk_e@9#MV~ zy+qVZSm`4cD_#>eLzm>mwGP2K9|7tD?-2Ty!$}(gF5uko?od~%!CR8 zfIz&Tn_PNR-5O}zqe@AE{-}P3M?O`RT(2FML**pGZfNmy(?BWB#!$uWs5pNPZy#g4 zvUZBqEhNd_F(AYn#l4~mxz*tQhb!;Schd*vb>liPwlNQ3IFrrU)c&pKfCU^|FG0E$ z*GE@%XnyAS(&#DFyQ1*lOlvJe@F4~-3mgxtR@%pCG>w8*E79m)1nsBa&$CKvSIw#E zU|q&K(<-xKDvt?kxyujam<0qz;0g0Me1X&FvCNiQ<|vh0H4}-DF#g$jHw4noQAhem zMem3b?PIQ?_y0}qOZ$Gz5-d5{UG7`X`Q;%rFsJKLb;n!cyLU)}Zgc#`Yorhvhy&Mm85njjg`JEVgP zw``S#G#>)GH@`E5!$!jgJSPnvP3e}8^LfP95mv?PzZ4>Go@lEe;T~e=Y*0Xj6U!n>nwMvC3Tuj;wi{|>6xe%e&s^3h>x z4xOtKCirlTVcs%}-g~qwc_aPpk>$%v%(ZEi7UiPTCZ81jUA0V6ord;SG>u5ywXv=q;dhM}~9&Np6(X-%EO5Bk1WpKd;vXAeWZDnb9@n9|dG?zE>Cg`+!hl zOT-j=RDp9`qCVnjI1DGOT2r{|3>8xta@BnVY)^#hl(>DwPc;uQ+fM@6i#^7t zzlfoK8Rw+LjPr(&SI)BIka@z!KGQ&c51RX!R)*MrV$ggZVCVH>fyf22FIniN{ozKC zwK$s5QR=?>g7#fo+9KK4q!hoxyt^sTK=u^uTi@-cH!gxQ0T*Zi-2Uuahef5f}wNsjXm@E z52|}cavs{w3T-k)d3^`-h#%(2o|Yj}hXCD4-p@V*96|3-Dh&f&Gr|NtYvrq@A$2!< zlZy7sGQ@0Qm+D1K7s$)6c*Or1iXRnBrEIm0Ywq4d&Z6yyJuc|lE$8rM@ zdZE%NNVv&^*)7C~umD(CSOvt(sYe-@*##THz+bHz>S}E)z7ixP9)yid&Zd+JH-I26$miNYqXNpW#?v?Ao58cr`_JQ2A__1*VCu zjhxb)7W3PXb%BBHVvPe5UrBGeDYZ%9&jkL}pf4tA%lN=QM6PmrvYL|uK85@+@fK8N zD$y9Bcv?&Q3bLJw$z+6#)u|FKigNyQolp4LZ662{e`vS~pzhiLinvbd`2^1@$5XEY zRm^Oq@&fZi#&ZU89{^z?ALF63`IBGr*Yx=KVdn}Jz^4Gl5ABOiy$sBG+3e^5q&dwU zhuv}K8gj}BuXG)Qq3x7vd|l0iwx~eloxGf^Dlq(`wW(XK#xqtoHv^+MgmN=o=+j`b z#SM@I(Jt>!Eq{df@qM^EBR4-cp>%2j!}lY8rncVoAI0Kw!u_K>8}xoatW}Of-3Ae(aebbxE8l#`dfX3 ze$>5C^@}_sa~nVb4&O!IJs1Tfqu3Bd&o!5uEjAVIpflv3vpTNN8G-90LbcKdo!3pc z@lM~s^+e7xA6}RS5`*>WhvEHr#qtTu%6Vq=iKYh<$kHuDerqk|{cUHbkWHs~~SZtNwk>{s9IW)v3Cl z0{k{D#v{`pzDKdl$D1wXRcpd7;}f0EJh&m~o|$`>*y9bsZ$*D+PCT`z5hUay^73$R61V^RGd6)Bau-Bo+H=A~R%%Q751k z6d=@vL`-rB-i!&3Z};3^Bvql3I#br?UXbko?8Woz&{G3)$Sf3X?U0Nc63)+TU|$2h zC8R?Yx(&d#^jVUh19JHN${TFBWk}0>b!xRCXrR_K(HnW?9T+>t`KX=n*=eBlEcK)z z)rFj7QDzL9%n1M}Sp8)KiC+=v^0J1Y?Kt4PdHEnftQ;>NH&Duk9;$aVW?{qFS4El6j#JeF?*r*C2){eSPDB zN9xXo6jOIlR1v?DXw`inJ@3Hb>8Rcr{z}|BQvXkiAf1;mEdI#Du86no>-%i-pC(e5 zTFh$`AOYopdLO=oqS*%T`ePLp!M;=`7Ia$AhdTL&y?*s7n*7`;E-Eo`wYBXV?)p=! zVpFoK9zhPsV<`&1U+w}@44k$F)Ego{oXQj3Ddo8$kEiX|N?v{^fwxz(Q>8?&+APyj zCfY1@D$a!uvoy58vdG_FP8_c>6iwG%938UaE}o~6C*8H9!Nr z(w1BaJ&|8IjTf}edx`BRiFo;xbMa%RLZC13W6e=6u+Q-6M=0M6H3pTTP`rpm{u_$_GH(u zN**fLrQtH6B5(rV6wN{EsHX)?$u&nL?tmADiZnQDt}T3zGKeXq$hey3BppnccgBBo z@>F4PE>qNe`h!GSOIX}KzP78sbn51? 
zaVYvEa#NIwxXd{a+0#;_J({=rpJzut{mf+U!@_36k@GA(oZ z8&Jn3e8}>tlQ;g5DCQl&CH*?lBGu!h6jYc|xYM?9nj5&Te;yyDq;~6zl5WQl1%Cq= zB??hfvlvD-oMCd*rRA+f^Hj|&{OsU0VR#wTIO^$woeWo}8sYIiimJb$Oq|}VByP0Z z+hVbRT(*J?+u&?IV)kvH+$^%oQ`Dt{3^NOJ5zS4H|&SB1eCz z9uJL-ZbifE0ADuCQ3gZ)F*6ZaQ|{(h!26`4J{j(|KdgQD^h#`Uv0fK%%sVQ}GHUtJ ztga-_WMcr-S!kAT^(UV$DR>LM`)ML#xgaa|8ZO3Hg&2K#ibv)2mMxmYsB(ryAIy?P z>%SALwxRkIac@D9CU!MS%qe@LAoXh+JU>E`sL~EVt9l&kv!GXwzR+sVSQ;HB$p|)x zD~V{=yhMHNSl-Na#;n~Bfp%KkSvR=Q8(AIimXhQptCZa6UcSYLI=fzjQ|bMcrSsHk zZ1}A64D#txv^0vND-^MWWBVfaCw2;cPk*^L()xf@QW0^oI;t(*} zvK_?{lF<5HEqu|RU7~A7v(ZYy6F_c1 zHXOSFgq(KHyf?CljD(KwXI}~9jN5LK579~VdVRk23fXD%1;B`~tx%Bv7Y^Ea*1n%q zBk{7>z|jDZ_eL#rWS73TQEz*i|Me#v4$27cmy#V^Gej2E%mC7D+NNaePJb5#GD$OB zM#xdsXhTQZ>ZegitG}n!O*Nn7PB{a5ZzPI3VY2Rj zobZJpyqef^ws8~wLk*vWD5=rgTtYe-N0AZEo2KWZve8)<+<1_qlDNBzo7egV8vzR$ zM~hiS0?yai&m>NZ4DVrkENG<*5K)!~9EA{Tb=4S(S?y00ZKgt| zU1^3PcopvsOyIDRs~3U)Ew!eoudSoWjk6JJ=yqk9V^&_7E!lJ4C4O@MRQDQc{SD6w z{XokFm#Y0u-op2mFkPU2`RxOeWi>;920@>ads7Pk-K46e^+B~`EweJBJ!TY`w=SW} zHBthA7C3MGDHEX`#K}eTKA;~VT1Lkl^~J4ra9?_h68KFDur%kHB5Lp|!P$TsIY@z} zs`AMJ^n1XAAEFA2VK;$qg%_uoXUs!4l8Upa^OqfxTRs&n5D!P~@oq0J$8$a%{e1t@&6E>1RfoG0<0{ne>+Vb6 zw?R+&m0R}t-F26?HuU;9ZlSMJPJZLI?fS+`b*3>7iq525>XqDj^}LDZo;#EhK#}pOa*grt_5Q*s;hx@a~dfu6bre-0!4%uHG7QL+3+%Z zRu6SMzD_u+Kt>O}nVcrMhD7Ji%fL)M6Wn>`Ep;y*ZGHZ?pGLC{17Y6Cy6J2OG9f3t zBCMxq8FpN8KP0aUWkJ{I4F0A7dHvvCf~r8df?$8Y|CDdQYI4Ek-cnZBp%``(+`1`r z~Eo1#?U}_7!A^doEs)4Ju`4CT-75%A}ffTUPB=adxkAI!~E%$9qeRJh{ zD~3e%fJT)jO8krZylgRnR!u09H?IIuzWa&iM$-#iH`c?VRzH`MGLgD2vUKUGVhzM^ zdBppOxdwugX+^DdV8l&MJ&RwP)xu?FQu%rGX%542@*vw!yhaTy7WHD$bax}jm)cnY zzGE>f#FAZ&|JoI=^tQA|j(D)aUNwL7)s5UaImy7sjqQZW;`(R~& zS%_VDbAx$C^IfdknDLE)!-!uc^hV|#;MwTkE5ee0-2HqIGiYhQif`oQgb64KFmh$)F9ws}BaZf4f zkFcfB@&y|AV1jx$cAf>4Io+^baKJm809_k4o^>Z>m=J~(h(ZXX_T}I&AvkJ#wyj!~+kKMK_g?B=m_DZ5FejIt zRlr{iN$|LjZT51UD)u)Q@3bxz>B27P!T%l&T55q9Csroa7gx2>Nwr~tWIWIN0O+$jB$=ng^Wti;B7 zN3`LYw^_bH`~~JM7bVMrTa5>}W&zl(1}z28?EO3v-46oxiXMF?L(e+8G~?Ki=hTlc zA#eSje3WY1C{gs4f0Cr{&Q48hmyWZ896lP^g%;M*KX)NF^eGToTG6kUOv@A*bk+Tx z$nO@Bt3OSt6IB7J{iWaJh_GQf;>Xc9Sn(J7s+irfC5|N;T)b*DQQ8^Ew#BLKOHh#csD+kbXmE!j5r$wntEm<{M7PUG zDR#r63tT5MYZKfYlzOGs>00c&7wM~;b)6nL&rQB9T7OS-OQx%e@e!h0OR1OltM_&- zDMy-;5(f2i5=^tEf=}pUfSLc%?dkYjo%$}!;=AlgKK zu3x)q$O~y;dgMc7HW^~68qg}-H6CwH@KTitug69mXqX7Y`epIz8MVhmbPEx$VPGqh zN`^@gig@fkEiST3^POw%iH2HwHV(C;B_z?^vvaLZ}9YU7z?|2||K<~#!zA>a2has?zr$ z=e=XXzYcOv&+(3YW%xQ7$=O?G?kammYLXMO+}crA_CJi8BzbTt$-M;m%&}OBG!HS! zz4{s3P(MK*-8b6mE}W+KAjQ`y3nY*k%QSBKPKshXXclpHc)r{ZUkA{IAo0Wa`Nk)T zZ;K;m4fO^QLf+Gj1yL|WHGFBQww!YN2T;NEQOSKF3tznC6^RqM`R9L3BEJ=DbOI*^ z$fp?Etc;g3^bE!0SlD?of3AAi+D=h|yUi(3RxIJXJqAzf*}mu^+4m~)_RZW2o>I_! 
zsMkVhDXQJbZ?I46)ye$#SzPYT)E{@;o|{bsysqM%ZaKTP`iHrzgVL!l4*n;!-dKeY zlG=_zuWAj>9jEkK>2Sz$x$1M`;-ufjSC~jbQB*HHRo|y;DXpS#7Ww3qNAdJp2|TpK z?{pojvZ1s1duYNvm?pgIl%HT`apjv~!)L{ZA~KIw$GQv!2h{<&)G)(V<{Zzev|T^I zzOJ;-t8-9RkDXS}tWo zgb!(&z4+QdG~^fKcNp2y6xJ5Wip^ZLnG$ z5bXt{5}5ISi@+Xj^*`5$==v!NwLO;`9%dO*oYX^78Ed`Ds=Uj6A9Bv>7b+7>4KJsv zTlw0I4!iWkQI2N~_C_oCNxYrZGEr^V9H3Tgc*^@POU)wk0DkR=dpCI@Kpbi>dWWvZ zMT~pla}~P`5Mq*@o;uaX4}2Gn39P)Q36{muTm)WtZ0%}0KD(VnjaWU$84e=oD>%=g zXb`AnNEwz3eD&#DNN%Lz6|J`!ceYLOZtqiVH4b6%u6HR5G~YFIIVLoCxOc~{khlUX ztKc89bB+>7?|Yj+3bxy>O{~k>Hz-FmKLl3B^&~ZX3bamiEBD5yUd{y;_BcsQ120^C z6tpkp(HjNgM?N#*tAxeMS#fFoN8@R0P%6DPghwf7RVy*Z@ZA<}=?{E-5^!I*dFg?^ zWVvQ0R7Vfxn#J^JaD~@Cs5TuV)Y9|(xF%uX{bDdzh%eSL4TB`~@rJEWV_e`=te*u= zGX%3-=ymnvvLCtjA;~FF8{tqh^0xGbYAHGfirw4-Ry~>NCyRT)O&w{m>+7(Ef2b-L zbq!maKt~_;-G@U{p;SIO=vSVfpbB?^L1^LD8>yk+fN9XeiB#oQ$nmRURz z8l1>4R|GY6wzWn5{BbvB@XF9xqie3`)c2rbyNB+|MqTmweZQZhALJ3S(_b>pw1H_x zUTSJ8c-l1<76oB}R+_cq(~&E#+@8Mqr#E(FtEqWhI7GVmQwFFE8cUa%u z6D%ukC5&-;^yF1QJ!JEm93#Eepp{PKn2RW9@Pc2KMO0obCu)qGP+Mn()nGoTg6(a{ z*|#GPfK-J3VvQ5>+fX(V_gU(7mcc&i z;j!R=S7!sy%VeF|fJrfQYeHPEhvJ43ARM>mqr0DN9<`4b%ZnC{jB#3m#6vl|IycDAwNUV5LIhLbp>YMCAQ2mFh#@etL<_&*Fu4N3zF!Dkv?cTSZ}d=$ zI<6TbcPf5tC#%qe$>$4uH;U6_a#}bH;;uVF_b)w+p+5 ztnGN%sxT4xW`80{r@+*8R$)zju5EU7F1JyjI(StjkNe>h4&0M zdfTs%&{ymN4GG)xcP9`ymP6=Gb36m!5Gp0m-0PPir1gU}C5poMS5@WP#bE@6<0tQ% zIkj!rQp%WqpX5PE&zbo7xt1BFcFjOpy0ahep_FGbm`X~Ars{2{uxNGXPL_x%E@_Rg z1KbHbDP(@_jHGxJXo4JcdNP6E)wUnIyMC%=%!@juh`-l;(umUoGvtYrS-)+?SH-i0 z0BKi6{JAcYv2cRwtFrcv^>QY3QUXowCi`>x&=pXsL-b-MEBN|IIuXR}M-D3xk+Hk5 z06-h0Mmp;aDA*ZFU~BQfHa0}j)X($@g;Czx3NlD7#!iHy5e73>Gu^oac_=}!iI-gf z^iM(oqqIRV%nJ8N7XF_8Lmt>zJ|>qh-d7od%xbFP_8k53G8ahc{5id;SO(oeWvwTDH% z!*p69GrCsuB_+D0Uq3oOusDWQydA1KzxH8jrdXEc9${8BHZR3%Xbi;GjK&oj&vMhF zvKz>Wwdw8ce5eljO`qO3*F>hw@rRYQ#-~EflucnrmT#CdqwqmB6I;YOBTN*HOk#&n zd*&=qxEqX0?ylM5qr9HM+O>xW{i7QW9r(LXiTtL*&po2huRgM3(0~TM!-SYR%*uY2 za4^pfX6v0^vH|yrxRHL>MCtCpV6hvGxb3N?kx#eun+rCoLWLvpxGuvlIh)h>yBgun zW3>DIL_^|kC2<{Lcyr_ozDtiMc1KnkBWa?*LL_5laD*p|?009%@CE9Kd9SoRVy5~$ zB8nEclGt&x-YzSUxr(p8MLC$gk?Tt65{umTyTIo%>a5tGVR$v)9V+ER0Q;NH?=--O zqufU`O!@D-J_^uJYuPx_JJ+jgBZ1FNWrSpEfJ&5@@NFqANLGA?GZck1PGh|6(L~M9 zu2bH^6ZBTC_m)3%JJ(2{Or#1t9o>h(#ij6enM(K1Lo$(Z02&P3XXV^kz4il`?`YY- z6uHVnme-2C@S0gFq_|Se|1Xq)y zyOk8p54#eqB~$dDO_c?G+GzSofjFV`>ECd#)Gw!js6Zqm1m-L@$#O0uMFHq{3VUJ# zb_RpRbp@X&^8if<5JQ@HMLv*;rB-@(n4xok_`UJs+FiUI2Kl9y zgToK4)k!w|SSic<38)mNV%8mya^1`bovz9!=}kSz1s-u|9k8ZwDsIepA*!VskDvNW zDMB=gZ1s)4ZRUG|$&aIw&Dif`_%qT<$Uw+Mk07@t1{+xd_iqe30ZGL)#%xcx9$$Fa zBEf$5a@EO;d6v*$*nuC$#fvc;_X5h4c;h;8++jz1UVTL*{+>W5G5_O@1lpNvfB5BR zJrva!>{(ZEmI8FKGJ&Z7bGicY`jgP;(M@D)TlWC4ByKcnOyDNoHSD27;T^|mGHQi& z@Inq*v*`9e0+^jKXByMw=j{wHop87c;xOmke#Qn(VR>?8> z8pB=aX6dAomI|u3($1mC{P-)J%abRsrEhQlgv#_G zGpy}`+`tnAvwH3O@4C;i5UfVlVzLTyB^6O&ygW+yOBx_Vv+fQ`(3Xc_Sx$|v{@9?9 zUu_5`EO$xyhHNapix|#s;_6ITrwc^vYCSItQOsO@?*YMwzTik>(#Df~r=a?H|MtPy^$vJ0L7Cd784kesy!z|51ec9t2p~@L;anu>%}|sCib}>BpU>7 zMep|6MfzA9Gnrw!$W5DH8%h$b%-PCiG1Q}nlQh#62^_DY+iQ#b_*d^3Dh<_qGF3Q2 zFO!tdu}V*ea%4nSkcAn)WVr~=bh%gb`Euk>cZao3*wVACDwc)_vqjaMl71?JH%CEs zM~kcpU%63{n<6 zk(0{}&+dd9snoAsZWovv1Rq#TR>&XY*pOmD;*ADDFhP9471~(U_sCmYaOw54FwT?7 z!toAl`Y@7cYr{Zg|@Kxqs;ui&CZg^<0#3naE+Mr%FIT|8A%>ylE z(;*mfeFFp1MW29Hx}FsOK~i)R0nG%2X30dUF_@<962CZl3>E#5v+cTmSI%Z2w-TS{t6) zJo@RVPXeQRZ}6HOJRM%Rk4^<- zvQla(OXC5EzXj5#RK{}b1T=OqaFJ_mnU zd0;7+aXojIkY7s=pzh<_O7SRtFmdA`zx;dYS8ip@S=D!~e#ds*dfCw1lQ0)EdX6iH z!s-|Ad~hu7L-}$GWOvXli>~uG5|m%7K>#5YQkF1uvn4Pze>|s7*;pYh$L26X#{kvG z@_E|yVvqV7P>nRqvUO!{=I-0w+i)oO4Sa6NnXqJ&zER?vJG52^YcXECf~5nWTe 
z*ru7T2?i*p8AWke2Dy|=a)nVU!9JSB|0MMsIp2t|9214g^=L4QHEb33ryJTHRXUpa zio5+@@si7D&(X+M!)JB(25NPavSeH)5 zH&m(SM6^^>eKWubzg1X3D`VJM43U?)*q6~tMv*r;k5#KRpCpeeKh;DTw-44&b&a&x zi%*VPPw=|n@(eq(Au^u}3OrwU3(Rhd^zUO1(<6OgueJf(>>4 zlCja)fxE65T!R^DoRHa^m)n_DYSmDeW7K^!cj%VO%LE#@Vew6ZpvETpu*Miq zR~s-hMUKIH*6M*?#4++2fNrZPcuEZ|Xz-cVkG0ulmub?!#tf*|&+p1OcD3i9eTo}9 zOSH}l`iin0>bY`6dQkA@LsAMA@l}nPZB~~CyGy$ROeWgQ6pTNg)-9;*9Nggf#$_cq zSdWr5*>c5tzQb?eO5) zhctCX^xomL##F-ZFp?a0Z8z>7ca=U3@k-BU)-@O}e??N@_x1Ua^YaBUf*0qcU11PT zp);YGftXl&L2os+AR9#bj3pMaVqfkqvwR?9Cw?V3rg-PX=-1MAX=NWRWb16Yrkkjl z-p!sBvh!8eTp)BV&%ledcB{hfT4zS*4T-Y`3hNfyE>$Ffmt|~gx_<6`0+RIsODJNE z6M9Kkw=zd1u2tN*tM&p!wwo@uz17IMVdC+#YMrla9OM%~IcoL+b`@49QQ34rsS5ob zDplU1iZsjUM@2E!3TvM+*F~S>UI)mL~z3Mz3^SRdsC5Ju)UWHa~>l@O>{&eq#C*s}*rM#+fZSqb?UVaV!64im6X`+vkA`*u*n4RyLfA5n1Z}Im((>StY-j0#$ zCre$erJotO+sJ$nG|_p1;W5nG*ZB*yi)#4ebo+pE4%^@hy&f|Ve&WGtN!fZDwOLNq$ z(rr;yF5RlIjPU_HvGZSLaPU}}oXYu-qkA4`^h}@vOj?AieNmc{FbG@nI;Y8KTm$On zGnaTQ>vV-t1)koA>S}tz*}GJw(b#4DY(iC``ElcyR9l4^C1a)%2{} z{O=XDf6Hv)4|bIp00rR3=?*4kHqa!^{eYj<%+yv6B(?M)9J z*ej*Q8E0w(M&RqXf?}+{L=8&0M$*`?DFCzUVgY5%<1@PdoRsb^$S7LO*a zsU|J~pt6IjfQ!fE-`aQb*3dD;Hh>Rdd}nplW_{ zP%?PJ{73g5IG2sO@sGk3M=-^6-5U5&_uC97L!p&b)RFv?<#&rqW)(7KmrvFWl{3Fj zS4bt_mK?oQJJn!t)62P)q@+Ucy3lvK z$4`kH9fyM`{k1LOYnD^YUC?kiwdz2pAg9`RI&o7*x!H}`MK5BTPc^|m;~P#_4!gta zl(PES=2G3uD{ppZRn!Nxzw$}SWSbzpJJ>ImT6AaU)!K&)?+HbBRVz&zO+kXycW;&l zu5{68PoRl&(gNR))TP!`zKI1w&@wZ-|8@wsqctg4!1+so2zC3ZF@{OMe*6FYZ*aHW z+LJAuKKIm$&4@;R)mJm93zL%M&`cHew46o_H$-KoFBktl+~hWUe`o4S2PF{-S7__- zcD=v-t{n6%WZTk8Rf0jZROPt-sK&2QX);8i`|jH*t5;8k?e2z4fVW;(uR9xdkaf;^ z=;&cMK`N*AUhTlWp#eNJi`LN6H>7+3;hjQxHosEd5+xDQ4Q!yLmOs<`wA^Zd{WM75% zG;i7g@j50w+8QgozkWy^5P@+!z+=#aGhC0bHQZP4*Y>mOTCexvT)8amJX z;$diVPQHKsM~~lL5;%co8PfLdAF$cK>a5F!|B=KY-m=&TOC9(No%x*F1wE)Y_Fn3n zcehDl_(*mNezvLSs}9h=A9F+eQ_fSb9~_^v(ti>6ayQFi48i^NWZX}aca$b$2NFD| z>{jZw|8>z(EpDNz?Qyzg`yB2mY&A|YLqBwQ(}~!#XJ*`{e~5flyT-SH_sPtZeJJOn zuXbqN`m}n}MWs4}`g_Uw6PY9Trenm;$SQYTFgGR`=Mb|o!(ibmCi*k)Gdj!QyRYml zik6^q*XhJR3Lt>MC9l7JRsrzJ<@8YEDMeM%$rxLB%XsQZ;oF}&e)o8&Us7Ak0|5Z;p z2FP`ueHtR7_AOV3E7Y9pxJgsJs_!Ps(ds9p5?ih82DeTtK}*laR~Y^TcF7ek9SuTM zQ_}gw@SE1}TN_+Gwz{7`8J_T|yp7>r{^+TV?{17x_F1x3_RZb;(z$KD zy01o3!h@Bn6#KfRX3|50_G}TnsPVoTeW+|})JG4&iwkvdx#D$N;q5e!Km+07Oa}?0 zkqgP;O^a~?uSk%?{>zU)2TWdCD;Aa$*;XM#JL}Z{uSeN+ zV#B~qK+h`kk<|v=vf_|kVFq>G<;;!{`C68VB^^I>>4oE|kPFt=GB0-Ld$gA2WL>`egj-Xr@hmP0h^!YlC{@M&|1_6`NBYV@a|u=; z5>~bVde|f3A@jl1-w$w8z#6!Jax0Q~~A3x*9!#_tgti+Z@^XC+y5>WfZZu^YKw~zntE`aY^nyrF$ zO+HF)XC$VE*Jaq$of*oCKO9Sb)*0hIzrba8SZoLHQ+I1e$CbYD|L#_0L;dKoTgf7` z1U}zBT^E$dgd_t4{$VDF$dH&F8?faZ=e)f3G+HTZCTd zICRJL88=@u-KrFqGo*jO#pZC7;_RLK^1mcPnjGeGdbcU`C0X4f*M{`hUX-gotfSx% zeo?v)i+WvKUNLe|av{;&?))UO-@bgqd9~yC9Eeop;hO`-J}F@9jMkvbYOq?kel4bV zYf?*y+Gj&~i{-45$@NDQkFxVsrB5_!Lks3G+z`v$+I$44Qt({!>DSTWnz3ii)2|-z z^cwG;^BC0dYk;c%Jb|QjTn|5Uo2<~4!Pofqspl!{sn7-!`OcjKC?p1TRowAmU3I_I zIexT0YF_q}Lp?)azwQ45xs58jN|6{=$ zTyS`s{^c$3wEkaLxHZ$$2OcYXe4H+P)WE;w75EK@$t+IvS9b@w)1b(-z-~x1W0J^3q#3p{Y>#^twXDkZ59zTC`mAaC$SZooWNLOa}jj z^<99t$XT~t9+}hi?;F=T!BQ@6NQYS8z#7EBpGe!@3h4G0^(6qS`PMo7_gLx3AFPA` zGIGaVIEU({Oq6?`n5Z!V>qG8oMoSQB-M9CLUOOanMakje1LZ0ot*t2s>h72%}Vb@2N9~=M+3os!NZCeoF{? 
zVrv&qUaGgFWh%sE7=iSMfiub;gJbk82w+?C}$z_6jy*~yI(Lza{hc0csxb1R8$QP1}7 z@f%E3c2jd299NhJTlyrgj~(KKc0bo2)61wix39~uzlrv$^HO=mKr`cnI?M5P>BG9; zB85w@&p&bZF<86cTH|DsymI?dlE>at&nd?Rak?=SpFmKN?N-5#F6SO2*WQ!7J zbMUMl;xWos^A?8Q<+rBVnP)dScecHx?>=y+x>#`^`QV6urf$ei&#y6uMMnh6P0uPKX1L;x#_$9@ey0p(+}mn`YwHmms^jzCVF^<9@4?s1`U5-nzra`mX2&)!5Q6G zGR5Lc{W8Dk5c?eUQXMVdlOJ}TMwtf5>>X;npD%59B;nAVU-~b)5SO2o3oqv|l;y1$ zMp`FGGLt3dBd*5&`};m6{M`PlEmUvCwIHX&42E86uBcA**>M+r3{pNUvsZeHzp*A` z=HmD7a9*g-B&Diqs)yiqa@@C!@R{;W9^4@;^9MXj@%*qOk@6mZwTLp570HEJNiituR7ma zNgC+AGI~wg2n}ByHk?cQzFv|sb81~;Y@j{uKD1%m99HU^XUgt7VOtwLMn>J`{f7;a zGanIbAXIUyS-g7wk7kZ!s}un-;YPxm9M)f2$*3C`({3Q~ISk?FV7sFTYK55Xv5Nk` zK>G8>k5-Lo((`uUeL!ZQL=z>m;=VUdbb0joWd&0YDOB;QTOtbpAYH_A>2I$vkm2C`P)ibx>UVI zBg)!7S}rx+(X>fqcX|!qRUc*OWz^o3lsnjwW(ezzcVFzK|CmZOlralNLYU<1hW74kuCFPSN7L&_Cnbyg!8o-UUTo8;Q9@iPqIf?KdacA z#56(cxt;w5L>t~svlth~@y(}HO?%db>cxbEuClPWP8A=#xL97#3lm+&@5l#P?v4j+ z;rKfl(6Z3nmAv#IpW?dJ8Tv$=s()5R!bD&ohbNjm@y{;Szf>kI;|&8DKy_>LmX!zc zL(94&TToIfru4_1ZSRwJSGv75%5gPmGHqCKeLGf2p7{K`VX6s`9HhYsFUS8nHdF)! zM~=K;Cph`-W4o4Qj`TvIqx-(~9a6|Y|NfTshCjJsk2dle_wJGM5<`dRceXWbVcoP( zQhF9|8dtm0S77zvyR_H5dej=1&}Z`hu=kx&O?KV378Fqg1r_Nk(whR(iv>ZDjx^~V zX^|2L3B`i+UPQWrl+Z#)2mt|U(nLasARQ7Q6q7(`_lfUU&becp_nyCZjPH+1#)v%6 z-gD12*IaAwExjDis3O_KRKT<|Anhm{THcvdZ$ZuIX5}(q#B8BiZ^O6;7;#;>2zbGWRXj3t>-r8*hE?ZKt3PEJyI zL+P5O)#R~IN~8SWxfHL?t#SR8JY+agl81^t5bJj?=fVBMXpvO#MwEHD>Miro)DEp8 zDmtT;yFUlS%@^0ZX&@5`K+WjkyAJEgv*&!iFeGtRNnrh%Uw74h70EsWdb{oVYAAvs zX6G#`%>7IAbIevC??8fzAN>hs)fA}&$oRIf^M?*a8&WV=)Uy}W*@pWrZw0!~GL2J>5xSV%cfn#eXDOVwUm{k1yL)OjpNCVk6{lG`~gTExmYh(U+uYbjV0HtpV#LHn5eJ* z%8t2H()ZBJCUgVIqo`a+b3qAza%o1teCevc>aTV8lIF*Zn%8Y=f>4%K2@3Q z(Xd?~2N(oCMA&Dt=YtpG)X7>^$kN{3&)(n%Q&`F^{fLer=>ni1c7 zGP<3w41B8on&Bn^+*~g6vvDC$bjOHM>E$u%=-qex8d2$joiNkiqhmkz)xFCT*&_E? z+JSxmpGKE3V^zETXa*Km!M-zW)ehfFSsHb3_~=Gb_nx+XS$nas9p6q9{pcsZ<;m|+ zx&(jWSFt@b9TqP{kMezil=^-{b65qf`>nFZCF0PHleg&aHJ$SQp{lVT(bmMVx{%bG zg1)U)>mSLKUy(g_Z}uG5kow6UJa6UAQ$X{0fwwHP3?fv+vetJ(pTe%Q6~l61|p!+e-vi}nd%6wG&7-Tf`R4#o%E zwX);~GXf7Pe3s$5W*!a_27gzu5Rv?gHNU;MQU~-e2O3PV_f2)*rN&~fs-h}$*TVt|X6%)Fq{mE?*OvKXAZIq$vsZk+PL4}Kd>X6UZ ztLiPKL>5brTd3$unS1e6Lyy#2&1JJl0vAmaH6qv0i-ihHVV({$R(fl3htHk~-udN5 z8b2XgR`nRkxVc9uEdcN!BR@8UU9q$4ZcNGk!WyU)s-`YnZt{W*^H!bqueF3dB^vVk{eYdne=Lu9NHb406J7WTGvF%@wD^`;=*%*Iky=l&i zco9f!Ste^=J&H2d^xv4OfBCWn=6pr;oLBE~Elaz|QpJ2{I%_uRVCi7btM3uL!irR% zv54&d)HC>(hx76-W4Va({?OISEbYE~f_COOt_A~1ktK}+>o+P}dbTlb-hGA9k9e%> zWWQ_=v8b5IOu>cS9P6TQ$qVOr$WldsZ(r(3eWQ@yOOX8_eZBleMt!|H)sk&GV5uKA zguj6FK$dk(ytaC(#8qE@y}4!xs0>^o31Qr+|BLO9$Qa$`^hismTTW2f;9=L$y5pPo zB#QBFO{u)s0H=_*^H68boSQJam%Ii@jA}nLe2n|BjknAxaAikof0cyU*<1RS8(Zn^ zVUP>)`0N)t>3Rhj=1)aP)K(mnC$DfFb@OG1t33W6c4ht5S|s9rYNQ>AvSPs2f+4{O zBay01W?bCwPC*tWw?}r36_0@9n#y#C43I!+41x;IXU$v`1a4Wids)-au|4+Db!zSu zGDsGAc1nPK>CH))0{5IV!Pts{dE9a8~a+iOf(+>V1|MDU#6Gfxjl zc)!4%mAyx%hg~mhiganig>kj4?IMTi?oE01SZp!tsF0$^oUXY^F?;WT#Bkh z5HLYzfqdF!J!0;(q-&>n^P9;x0Z}9NfQabr2@_mjsrHV3V3Y#Rnh=y)FRU|d2d@75 z-RXUMb4-k^E(CHjEK%8icTEli5#Yb+yNB~2 zWgvs8Y(9Ijd2pANqc?BHtJ^w+qAGR15+^csst%}juAC_0&?yn(=a7j}c(%RpgsF*h zgicJfpv3&C7o8d_JD5%G*WQa-@*Dmir-Hvom!dBioPUsRPJPb zvC5Xu7rMrc9No1yKZSfa9~}aBF@n~`d#KMi|1AcTtyHN>UI62<1vIMS$eaqA)WIF! zf_*T?qZhp|vZ>6~sx`H=nc+(vPF|22yv@N$qj7Uq&br>y>$ws8y&i$Z#0g36@a!6? 
zLeMYF*`JD*&Y(-~4If>X2fqC7L{d48^|vK`n)JhN(7y6kt@RA8lt!$Hc6jXzuCWTF z>FGIUjn~V5j3WB3<_t)@F>bg~bKxmaxL)A%DSj(RDAE0vob^|d0r~cS zB^hRjx8ZM3roYd6#h-qzw@othC4S$VacJ}XR{y6;t#7fCu#pRN(qYrsl0!TCj31j| z+?P^=%aJ=Q1*6*o1HW_YSM3abc1?c;9Q_zX-!|PJM_>0_*T;b##kLr<;qR)iCkfU@ zFJ^AW{5Y?=#z|>@HM%}v^tO4ySO&Sjw%CV|hq`%9&KvCkCrtZ&uE%3w=W4(JJK(}5 zParAm031mT%7V2;_q{$%eW0q7Eemc{OdzWTun}VV>stTT_T8p38*OvvDTbB09j~*r zBao9ME;F2j^KBvvlX;s>61c$ey#smC8;R;t(=Nfv5d-7;Qr~%SC@S*ijK&%F*2qt^jNI`!8RE}oWK6V zb?=&#hOJVCY3pT`B8j?Y$-+d}m#={a=2?lH_Mz~^=;}yUi}SCjY09gH08f8whr2k( zJ8=`m$gHJbZRIsjpZz7rj|6u~BaIzTuLCDx_Z^a^Co>?w2m3OOAUe5UPL1&B`vSVW zo&$)b^&2vyQk~g{Ya#^REa{;P1z4>Ir)I#>!5Nqs(hAxnS0?v&%yQ$@e*7z(-B_O; z7*S(%oTq!RAhiK8V>I=>_6zq$#uxNXf3xR=TJ0NWYq#fAQ-eN2DZ9sl}s?Cxty zGN6?uJx&Q#efmDHmo27&0{-PalR3{jHJ{q6~BhzNWf#%D4 zrEX1d|yJ@zg5}h&ZE@chaqNSC=+`f3O6e((Y_NmKf{vEd=JeQ<{ zHvGMaYl(fS;*M#Z9Iz4@k2z(p?Qh+V4_B*P(6Gw!LJ8e}vWC^^Q&3CEi`w{}eC8J+ z|CC5y9jJ&qFfpK7C0AEvgtyP#jmR~UkitN0y@1PWq*O(k{eAk=`2O9*^z^dy{z78k zX?pn|Q!qTLkI^y9`1|_fMOA#aNYPN9-9Nm9UkThFN%oq#B>R*#qM`La1o6wkOJkz= zZXn7Jz8?p+N6&JprZF}CdcV}MtN_wk9MV>rHuD*`j~+Vo_gXbcd+(_Bh#j3+R9eko zp256r-h96rK9$PG-cSkP5%WZ}T#*JFAP`5$c3FMk~2iPkVA zDGQb+hi@zfj7pObkr(VRZOLt(PNnOI^|hos+LOl(UM|%WndK~a*AClh{V!W1Zc@23 z!S9pZXozBLIZ!z4BYJeCxf$z%6F?DY30wW>e%wdJenG+(Tg*oJT+3d15u&umab-Xh zoYMxCIYqtp5siFD9)-tBR1)5mOunG9VneD(6=vW0@-+*7q!3q{DKkC5^{lwW?g~=H zMDCHxRKILcM%_od7Rj&uF7^*ieku$kZ;Z$Uq2Cd65FWmn!Qrm5x+g*dY=(vkR?2i| z&sAWJI5p4Ye%-dPLa7@a*zOj8VOp5gMfK;u%2@KD= z-bljD9oj)0aBkI3yzA^IvA0gI?LFUl@+mMUPjbG8AN=bw@HCSG1hEKU=FFRp(ttzW z8OoY*N`uo+z&4 zl$1B*1b>1|M(c?VmM`<;MJLFRfa{2qCop}{L0zoIx{o{w+KIFH@x?-ZJx4L+^zjUc z0zL;`R}&zcYnts~O=D`_*4Q2MM4PKPrNVjf{%&R zlb3CILj+wTorj)HJ>V@H**u7u1)P3h?B4f@#_hSP@Fj%-<~+J38=$3XbO?;)Vc0(f zd+vgrXpgKzxs`v0+m`9^?o`Z6-CR)0vnGga=!#YVPYjZr98TFt*CalM1T27CS9kqF z){fA-^2@mS#vJtIgg%&E`sGI%@0Ga2=ELt7Nbz#nKuI-ye**<_UVR)OPs-}unyxbS zFOms=On_PE#CMBxsJj7!W(d~Fe#{jgr9v2gD_wa3Jby<*vhKO=S_b)gyGUV>n|;}E zEPMW>m(eG!4!D%lz^9Bjc@sl5EB+@X~h8oMh=7@`zkE`mT`|jTeXLPS981l^@n9(ez01 z)a4n`UD8FOG{JcPnCBPDvm{+HC-F+HvkLv&ayMu+BlaReKReK9%Z5CiN!+XE;fTs`31nP$GacJFCs#*o{wdBrFDIN`s;;x9 zwH_1nvzkWn!J6SDxv==N$6@e9xE^&&xubJotgrjVK{D30Gqqd(Azs;Wq-G*coUGCY z4JA}~J#DaBM29|IdqWN_?jwW>qq-dYpIwMKDg9K zfqw;Ox%l(dz&-NMj3;3p6xKcr$yLN#FKt^$1WA^U>SOKh`8*hZ6+bZ%zRFgBY5iT8 z#f%L9EcN3yBQ}Cz>Dxzi`UFmXrX+0kRi|JKNqI_0r>WEP$Ic7 zeQjGklZ_HxMsP%2`geG~RzIj{@j4FO>G!AypuS3z!}aF6M;tgbt0BG})j*zvuFs?z z@*$`*3KBLR8|>J25IVtmU>gL+$fU*nC=j(ypoClBYH^=ozQEyP%V9IvG`OmGt~>6` z*n@rTTk>CQE`r4T23t&>#hZLtu$3jks85Zwq)jGBaa-d0H_PMB$QUjl(G{tL=i-Ao)6LJL>3tWs_Okl0W$IsBfJU8K9K&T&~~5b>0Hz?W8lq)P3|5TbT! 
zJ{5YRJLh8@4{bhtWnEE*U@pYb5wwJC8%Cv(X~}VIk0?7%l4oKOTN_8n27z{e%zuo&L;HrjaFlBXXZYQg|m-!02xr6TK~ECFak- z)|AS2lp`ZwzcAw(e`%-O7lBr_TdUl*`<|}3#=<>8OqLA`rpT$9KNzp*2Ypuojr5QByOS2Q^BA>*<^6f2gT2kVW(fB+ zgEt$g3B%^ITT6_H{EWrX=#gZ<9Z9yhi5iM5{wQAvLH<+$BpBmk<4YWF``%WTa&2$+w+&XwA)FIH*V$-VW zSsaRh$P?x0$!ltD6PZY4Kj?#4Y$$Xr^mu=`V_YQ4-g$a{tOv(K*qY;TeEb)G zNFlPxn_&Q?`$jtl6ZCjHDI-ddHDn@ue$Hv{dIl^;P0^lc6aw@?UtUk2cVBKLLnRd@M8@-TJ|th&EAf(RZ<* zHkx_8mS4ku8)&Z-QS>!~fXQEM65a{H+J0^BCo?aS zw*^h=bP4EC#^^CvE zg2^>Y_j)O>LRN7PN)$O4L5$rMsUg{yVP1)oKyR;h4(}Il%ATrct3Wa5(&gBuBtuID zj$f(~CrdIPRZqLri(r|ZGZuLW?#BlpJ%-$9tn9|<|<`?wCb6mv&w>w3^e@F9KEDM!ZIHX6Y@*D55+^F(Q z4@{|BWunGJovkS_dFtUL&au6$CL6Uz#&`#`F8tcuKkWpN|u_Q-&Mq>8Ke!1>EkHK87oaF84!U&h@qXg{BOH$z{v z%2%IrPzS5&)?vG-$vohTo*d~;e#NKd;34}S0hr(rEM!SFcFCZ~-8e7BS97Hz1W4iJG2?28$wkihmMPJhW3 zhTqF+9iO9BCFf5s#=Gy-9aw~j$>!0WEx+*HD@I$qD23exZ$eF|0pIQK{YW+Q`WiMT z>sbKmy`kTDIYx&`PSOBt#|Aqo%$ueuSTvA5uvTqxNM*(06Ro$*LKqGH#L;SHx_eh@ ze!L>Vgfc2ob0Lq$#P7*wFWL5sJ?tBj;edS|qJ0u)mAqkWQ5B_Q_YRdcqU_L?=Nx~* zo!HdGFQ7K3Xx)_$w$T?wgce_4wa(XEaVEHbE(Tqtlgr)sZZj(6Q}=W=V7zEM^T}D4 zZBWLhT*W5l%XARDPw?53#!RNQ>6!GhfyyP13cX6#&csg3)$rRac=<8*xkam1(`{Ec1{#~GmYASem7mQuScwG_&ZA85ZWO7FTS-nh8(rU+$-bM&ahlNEJWY6 z;M{L8z;>a=7sdkhnyPjW>Ua9w>ZV8J!Nj#db&yyJAe4NYTJbe&is`-8aw)~U`XG9i zcp-qvl(H%YyY1d*e0_8=d+c$Zh3T)EHd@wL3FD{gg_|5AQLcv;6=k(qFD*;SZPHRzJ7)Muo7q!(3^n)WL!k8f8MSfC)E%$*&DFP%weM+1jn8ox#7O4=?BGH<@fPQx16k*HFumWZl zZu#|oZG3%Zm46l4*!(BkphQcQ=1s;k8f+X2*?Cb+>;-gUEgS|tUu{iMoL@8G3-ktS zZiO(*CB>~pJ&5Jf?j~?)xjcO+>3iJcx(7bY*}0RwKk=g`WwEkP8#Kyrkkqn){w#zz zpU1>j_%bg?U2a6@B6tV5-*93Q@Hr6PiUk6FoR$TF@>`Iv90CLnDhdRn^h7b`K-4*a zU7b|O4(>F8W~uuaqatBwHK8@X2}R41VYSbPpU`TOl%&q;>I4;_-#rc?a|bqfRX{A6 ziGqViRaJ-u8`wXZY2^S^tZTNv71GA*yKf((fNhiZ(IMS#O;oe+rSeeupwl_UbT>;r zHP$>=oWa!TJYkj&rjeQ+@Z3G=QM-Lt{00mf*MWr7Y0R&$c{~T>M__yr#0Rb&U%&y9Q7@=?Kex=! 
zL!$bjP;}wy_^UKhD`~9>obZ-p_KC&&*rAjhSNdE4x*jkTLF z-~@@ma^+CGn`3q~IXpwk&}QJ7C8AVVduF~v2>fYLIdp6tZr{-4&3y8DXPIt-`6HS( zJqmV84Mbn-oC3TWucR7gdzsMk3+e%Z3rVF~{rswxx^Xlj0q9)O?aYa~jzd>I(vo=K-Q zLlutx`m8Z*}AHVaB>frXAQVrmr9!Jh-9s(=wZP;3vHhG^}s-{(Tfd6dR`VO zH309>u5AoM^Ml=+kI3Y%($7yvEFi201SaLBhKQd>WAr0 z-M$Ar!3PD^Yi_KnlJImXOE6gP3|l!#r4u*MOp}Z8KSWn3VY$*9mngXCAS^53V|BBP zx_Hn%xL=!utkD3p!^HNH;tLiMyO8SoVKj0YxJGnR-eZX-D5&|E7pmb|w znKQ#y7=vBYUOIBaeWj zUkUco*7On!vZRgFKZGcNI67M6EqcHvSLK_y7PpkkD4}$H)Y6i(J~>HNCI(O!(`Imb zZNO7B>lY=;=V>`m_CgjU%YF!_%n?G-Fr_+Qi|AX#JySe7y0T>mWeeG=taD=Ck^3=e^SFG^#DjKfb)L9C6NT$Lh6qE^{F{1bl9_E zoQX~mh-dEPaMmM;}-Z zyA=;5eqgPvTs>;Et&^=yxT?@8Aeb!C2!pF~xhh^OtGR~8GKPr?i z1b2R@x!~QFN2Davwxb5g;xehw)G(Bu*PB-KpDzqmJMWa2W0Za)}bBzWAtW-I0TS& zyh(5?MtA-(WxN1W#`>S~2w*YOpeFcfdI}oty-T#PPm{Z%Z0T$AML1F(i9U_d5mlt* zPX}5jCoMvIFM^Yj#@RYC(R#dtszv0jZ%cmsLy+{6ibpj8YlXAB6dDw zfTf1rEiagK24*l?JCg_pI#F9S1FM?JenR`;!8IBl#n>uy#eI!QmDYmND z?G?1+9(uID4HNr)K6a`wWy^}*lGKAZhC~v6wVdv|YGgK;5wP98q!h6E*5`);=f<)r zUI94sG6QydO#;SngU{!IjPF4tX0qw-i)GJz250SR9V4RhnBg{TgyCXOK5l_C#D3al z@#df>1S9sDkmvJEYUAOFekox4FN&(e;?bjBPw7@2ZD&W^`Hik3bCvslEP&f@GIggs z0WzW^xfDjOz!?M(R7;fk8%eUKwH^S1O0N6EfuYoJ@VolGDIptsAioPhTU2*BG-|<9 z493n7XdX$HO4XdE!v{d5bH;X9q7%n`l4OlARfPCmoyk#EQ>15A_uiCN0HX!mH-8DZ z5$yc?VS{$2#eFeg)0Mjw51WrkC_c3ydv-{b;ZNRO5;5D>trXB;;lug8 z5`8I-aq2Nb!y(|876NC>06NwC@Lz&8^N(OHvj}1j87|E)^V??}c*+l+qlaZ}L(TMg z<=KM?snCiJM?0 zbmtCBM4fTnArf#cZ84@*@7=Fhdab1`-9rSF4ve~eIm%BD^s28BD`pR+*4*p6tqm;= z;Y}X{Fk@IyPkcirKjg#Gacy8S$Pm2;TX7Rn2m}tTYQ6&T#@b(_WD`w+*tXLYU@7Ej z8B&NiRGpoTlBK$GJJCr8UJ|}z|UXsqdk$5={^*EnVEDq6gaz9>EILnJznjM zDNQ{W6}6c}&E{v57zh0{vX{bmoc;2oKj%kHAMJh#dQFHmlLT6Id??z4houF| zHh}XT@Fs437*|kGyQeYbs>i9@?aj>npwSwU1XaC#Wyh{?R&HnO^Rh&E~G%K zmuW*o9OLgau5mVfm`Neb=+kGn_}%^(nQJ;}z38)z=ba?4H3z11$2L1H>wt<}3QSSu zO-+~H)D2pHo1Xk^A^nX*s$v8l+wfy+7u_R7apVII(V^57s6o{;OFDpwp-YyPenW|( zF?zfh3TF{lD(u&g^(+(%JWaL@9|g&BvSjMOh0OUyuMG$rmXCNpBQ70MuQgD>=g@3oI;1-vRP6wE&#(pTIeSh4njC*=`#RJ7tTf zg;wH8b}{gMquNOmIaIspxM%r_Jhx+#*MK}oTe4{VuGKNCq`R*UKjG4OSUwpAa%uC% zo?rw{u}Dg`tNjQt7*L60ACamNqB^%+wOp<$0Ljr>vPOIgcI5YYkV_Www@Z(0t;9%+ zJDgoBWgCf&yh1eC_mKtMojzcKCM7&9-9b zX@qZ#Eyxo7_6cPT8$LRmy?o!CyZv-$MapPe0|XM~_yE;QkuUBRfZf?woMHylY~4DR z0VM_oocKivpynkQF@W|N&|Du97pKUYNnLEa98sQt5G`{Jn)l`Zv>^kogo2V}!(%c> zYb~(KaH*O}4|%+HUcV&LKLDLRCBmj!Vi z<0@N2eZ>~Qah(&mvK^4hbVzLh^S+X(BHTfLBJlx-Kc(NGx<3Jh)LR}MZsK4X7v`vNT(^i}_D7TBffQ?V`UoWB6`w8IiZF1hqjzmo>N{EfsO6>ZvPP1MXhx@dRhITnCYzA z<1P9NEI!=RL*J3ILKiHatDv)O8HfXUhP;GpS2}xbW#(Hjttc=SH>{0nxkyM*_=7%K znqX8&!u6*Wj@a@h&k?s;X$$d>LH&;FEb6qYY-m0y|9JCQFW4T zz>;ER-z=N2?!oKV)O1z=hr?*ZBxXaP2fs1f;r6=I!OykBzd5g^s-K)pZCcF@&r$Kd zlg4@5hOyI&|@TMksIqIV9@U-2>Uj`JJ{t{p9S?q2yd)hDRpoGd0 z6om<_fw_`P-hyt?Qm+9{!nnhufW{{4SQoS7qe8*#Gy4g-^WchXTYEPMJ1y7#ItSrw z%?#_93fsi`?q4{I@#zFOP4o+4WN9VYs0_pQKJr|;^G2dl`duf8&X!G@z!p&dXwu`1 zFJE>!u5&14AJBg5aGqb!_85tAUrhkZ_(<~@xNd@UegP4h9oU3HFmSp4-k|1?Id zqeDpv?s`(7hjUVx3oTOuFon3gD*)HYsAi3&uNbSmRaM6*nN} z%ZDcb*?~BI)^LDh&(bAT3~g|Mfvu?12g{T0Rn|;iN|-Enh_+8lQ#NEo7!#2Qk~y$g zWU?YIyqRupO*!kL=D1i>s)jbzAqq4%@6;|bpY$i3aJyC;#;B^#5R~L!UNuy`VquAYaU)efv!( zdV{+oNu882kY*p8RseC3q|p6Qu&bVBh-COPK5!)!@}r_PR@rTR%$rVId!XTF)w)xH zG$EeG=fUJ?n}NfAWfP6_y9Q`8b2YEVc|p_>lXi1~bbQNN+eW~6vJnTW5R+1Wc-N=+ z+f38(T)>!mD8k97X{>pW*~(ui*FIL4)1qQoFm|B-?ewRZ`tQ$|Y(Zt{*SwU7XpvH) z#QDjgI$tD9()oaXdE+0`l5~Oq%4y*x+@_UcN9*DA|gY?qfJ}&U5XUP*$_|gub!@|Js zMKVTN4+P6NAOXT^RA4_*7h9@35@J~y9DnE&3w`@jhYk^Rz;)sxI4{=AwFqGEx3bv- z(2?Z6J+I|-QZpXq^R;w5wyq!@a6DPkU}-N7ek|NHdDa{n!1Z4INp59H?^GfbPTvKH zXxP{ixve(()?{o7j+Rj2_*7l&0w4I-dE`fVl|9t&a>af_wF@yGlum++rL>lmSe{G+ 
z;n=lPvU04ZC@JmnLs|J)<;?hKP41h9l)TH4=ZTGm_PT{(y%2Qp7CCAZI9~Xj`Hzd7 zRv80tNE@!dV*#OF%?bB@2%XnG8D;7qC^a~Hr+2nQk7@D*Oimpoh|d@RiouF{Whmd8 z<9L$Uw3hi9gj)Tq)NjcdUROrQklS}$GoswFGw8$cNYK|9OTWuaqJnYH?I25%=L=ed zjbYRk^Vie@-Y1gDZzg@xbw%eXYkCnk)2Qev$Pe}eOfJARKURW)PVuSfjqV*c>=SUK zq~8Ig-@kuP@rGuc`d;+vg5bG`^MxPJ6@GkEe53H3zzv~@H)XeOg^k`ip92wUI3Lv` zMB8cBJFGB7k_qr_CXv|>k009w2#AS^i-{Kr6kF^E$sm?Zo4a<|XTFFD1PA}6O7%aV>$O#nX`kX8~zJxS9%~GcYAE=MIi|n zfX~)Bkh9eIFMKvYJDHTA5o|-Lx#$D5r)X&%_cu;t?w89= z@fq&)q~9$1MoXvm7#MMv8xGwOZ1;D^*UY!u^I}hS!a3qaTb`X6^XHHJz(phT>b>Q6 zHviM(7jjz-9IwywU*(HeKFgM>VA_zWD>Pj6OZ$vp;n~vXO%GZJ8hW$y(ek(D)tTy< zgF!tFosqBFKA3SD)U_+sE9N6Oby0cD5ig3F8#bPXIaHo0W8T#>c-207?Ai2!ggWwn z(chEso;^#DciFt>6Z(m&JI5*=Ec@(8}j}YZ_MZ?NA-&pk%orPW|G!= zm7B8?$)7oJv%L!jY*z_wwC}=;^TJmRo<||Ax0IKC%L`t=G@mlXQB! z_kOnH-R8B=SDHR^@k)zBuBx6?`tzC^dYSUO+P{@yb5Pkn*b_Zo<}9L{-*o4l86%4P zjCI&~AC?&k!d~0;>w7fzf701I))~2h574CPh zm{|UV52SrCqc@FTwPUM~5|yw1O&+Kdy2Bdh@ayU0bZx<(NxvUbua&}Ll65L!(QeUb8h&AWU+4*+st#Abenkdy>Yi58x zgjFD(*Z$CmErhT%$KIAm%xot&?hhaYYBAfPoEGstag8XrcFAzN^ilU`s>$|5VMy?$ z^_Cm{TQO!)?TO`2eJkPdf*wObiMby*3D2We8Z0j+i0#~&!GCz>nW^maAxYbUCxEA- zcL$gRc2Vp=_5q7eSwJssS$<#slAS&|#^&N+t@*Vh@A3ykCYyWa$6mfsJW-XN2@{*g ztI&J#4S7AXfod1?{A3inYh(m-2l*aBW~u6Vo;MY-_G>A$@?<2k`Q84)VfA_6IdIk~ z@p-hjv2nuXMy_eU1J~o;9W_YsP)}|oZP#|^t*rQ~cU2iYPs}=2Nwvn_e|Y^|iRR}O z`P!n>wFi9p*tK^STO+K&^doVn&z+<BGKhX7t0?B=~s znAX=9-6B$dnp8QwL}t*9%QEAHyO-^EwU2Sy2#apE@_Vw*7Bg2jiPZazSNHc!2&>X0 z5pVSCU+=Rm-FT$LuPhK@3?N_{!*x&|#rAgP8kf9GB=EOS?A^xeA`Lq%f6c_@-`SQ1 zz$u~hN{zQtZu-IWvyF=*Jr33hUC){oo{!GAdT{%vK7!pjXW7WbWZTM=_0i&tgq+Jo zG;7#mYWSu`j^GmYfK4#dLA>LwrG0jn$JN?)p=NmY=KfA(e$VXzdW)Oh^0TAp3|F7r zFRK%vagcK^M4gGd?_y7rciV@CWUQ8VB zX5wlvZ=x_yN_!YDgp2;{Y`E_N6;4wmAj>$7`5-L+I=d2&>0bBM#F9(bOtYBbzm;+x z_!#%qksD$ON4c{V-Y>|5Lp*22>H0pm{mw(HRG7!D-B4_HG1Ss@ogArr>HK`ukj}QL zOfIZS|0@?N+e z%o`K$a>KnV!i;f;KplH$PT&Dsd<12BZa7E-GQhbeA zTOAM2sA^h$y;d6TjOX2v^TlwDA8xsU<%zCfr_(@m#x>jL?8O~&I_=5RWP0A;ph|z9 zw08Tw4ZD7vyM-juaKu?FP~6!q$w^KvLeL zT7){f+{))!nKAhWJs#Y2axSF+OPc^VgEnrLv;&}I_V&K`mziNLxh&y4%xs4(hN0RD znd@c-V%K1?nVYshsC#m;!Ju0C?+Qto?>PP9fHl%z&UggLVT&sU0EZ$M6HT&m>kb?_ znt|n+5MxH47y#^)>@ARQUL$ZeeMXXf4}M*JXtONu`DXo0_rR)L zgyuES*7pIq>uNGMPOnNhoIhG;veNv6NM~^5duzcU%lU$$mw-%}ye5HAC?)*rmYm*t zrG0G{cWJpDJTM+}iGz=eI<7YBlF%H0vz&LLt0&v83n{x-R$?rneHzT<@ZuPa-8Tlj0V1BUT?BF7b0vk*E{Nk&Nnj7FQ9uiu5<&fu!2PAo?sq{k$lP6q)D%r$^693uWZgk|K1uq! 
zd~fsw5!%JhO|G7`nNce@IcBBazFlEax)kdhnuK-B*y>8a%1Dn4vR1hdj-B>d=Ocl` zL6km&Vb>kyYj;O#gzNr5Ydi2u&&4+)y_VS(GE{L&F;KTj0)S2n-T{JiTn1OZ_>&$Z zo7hdY4%CsZ8rMkxEM_khur3QA9r;IGr)zvk)!p-;VqI#(W0u4(!Aio>%4#}sysoX& z=%nWXjR%pI+n`HXPom>19z2m|(|St@dd~ES@8uKDpZ7BEo_+Nj2o!ij^k6XCGhPi| z&KF|LOlNj)H1$2J&=Sfodv#C3{T9g5-4djiOG&L;g2Bnlb?5-d3~rg0eG9D;Fw?DE zFISl|N>7z59+R%$^R{}6&?3}Wvb1th>*3?IMfaVj;kuEo|HD59c-vg1HvQd1`WN=&2Ye$*7b)$kGI$aCRgp@P^m#nA^2GBB}nih!Op zWU#YFuppjo#QxRP)6PI8DXLgitxpzhjc3ZaTwLhvROqr^<_iupHc$}_+4?0 zuLdXbq2zW}67A=e>z%Xhd}L&@ZAV`|ZbY^^5+fS%I8)(Rw6q#>b``{*t(7aW@MjD|mL=IEv zPrC~Dpd0SfMBl}Mwqd%W!crcx@0QFp#BQ062)M5CciJ{^0b9T_8;8cWp(7lwN!jtz zqP&iqvWd{7$g456&VRbx^^AsdB7iwR9&Zo6_CKr9JSIc47R4O=Wf@2 z1QXZuI;ratU00W#yX+5n4s4j5R=|NTox@~Eox@B}dWW4is2?W86=HEGG<=4a&pWl} z4rt$IaPHVxD21k|zpcIf`eTKXs9OHdManoEEA&iK(c}BSoAibv+6A~0Ahgk;=_F$@ zDuICHO7~w4_jIu`-F5J?_*}M6D{%C>wX1!4VeZ@~08i{xwG<)F?<7NCLV* z5wBC;kA#HY5Z;+f+@GJ73h5#}CCjq32+pQb<5q$=gp8&BN}gM3 z@MrBph%%VlTI4RhegrF~-!9$ifHxDZ60m!_It8$%BTQJ$-p958Di;QgIS!Bm{eV;)R}pO>T| z)R*Ktg@Q`lB%X1Oi!dduEox8V@g%#hcr!~xDiw47BK8PRpn{YJD*knWdb5(0e4YL0 zXwQ8{!sOa>=9X%Xr(9q5s6X`Zl@7?z>g_}i?_V3&fglRg2OmD*(p=+5Le6Z?mC&)% z37RHhxGGpk16t= zUX|4anc)*n&CQEeUdXr(yeo1YEO_CBjZv+X!o=WBNX;a74}xfS8Z~_=r`MKJJ_u`} zMO{@*sXQ{K6Dysk#-(lRjquOh65LgJl4h?o^L;|hWBq?Thv{%F8#cXXi|(J zT$BKBp^7G}xlfR7O4JQ_e7XK!bxyIACmB_4qQ9vzT{-@KThT13?AG}|EysbNi4*4a z>lr#>eR5%i0j>%Z&%<+)f7_M6tcju1K~1kDZEZrRA*ey+S#UwuUuNe3yMR0#UldrH zrN`p;-1i@Y^hd|_e1NJ)n0|BDMUl^v#KZ?cg_ggl{^tQC4v-K(No6hL=47h8U;OJ| zr2o>JuL=Y_WRRJdS`mu(Pp6`uh$v~?lmBO)Unl(c!>e0>zGe6J#4|8IYht!nIr$&L zU+I84fML(`b7DJmK{BZc@B9ycOY~3m{kvt?0Ax6dbjWTMJj@!ZVt?oP*HQlSU<-H+ zt=E;8f`@?ORw6NP)sO$p+uwtI-IG~Ai;9Th+D-C9SXsl^?O-RqyIKc=g19(=oZE|I z{$bCWgVmdMwf~pFf5zk#c~So)LkZ~sNi1u45!QTteE$|${qpyj;N%6k5V~%!t;}@j zs-R#GG;S)nm+c3v=mQf>)462!0wjDP7cunt>7hR7Z|VO_ggTc2Q#9Xj1YD|4l;G}{Jsfl*tLVH-=Q)s|&i5w^vi<3xCgs`BZ1<%NpZ;9Ibo>2y zIU@S{z}P0I`k8-m@(q)3SXm>iS((}N?5vbSa+Xh3#U_uq55Bti{T^cBfO4N>lpVq* zKArPn(;s#@qWdAr^mggrF4+Im)%N)pT~OWLf!BRXEX(LS`7n7etex*0KK?D1P*aeh zYT0!Zq@6wLj*qHIjLB?xA@uvgILN@o#u{<)+?@=Spr@1l0{XvRj~nZ~bo&PjmLUSX zd73@J$Ycil0RjD&*W-dUKKkGMevh6lKq467rrc%FK_l1DocBdc-1*8});Co;B(?CpX}A zXO@H7dfj6CH~h>OrH;1PoMn`ax~BOL_y3xhm^rWm$@n4DrKV&`x)y3A#=}~IiOuQ) z+_x|MKPvL)I7iB!WcX!-DMsQPdF^2qCJC*jb8h;+NAwN68Seahcm8G+P$M9kr;XIb zhkVSbYIpmZ&luk%fb-LS63h8MG0wkK*$CrwpZhV);&wtSyS+Fhk8Pp0@4tK0*Ont0 z0FMV)d{?bIMR-Xi5mJ!`n8>LC!-9b*!d>8-{TsIawIp+z!0yR7@=53yBhbhVd24R$ z%Q}Jl_r<|)f545`dRd7(e*Wye5gK{c$8^s3&{IRN}_-O&HJ;3wOs{|B3A Bp7H`__ + +2. `Prompt Template Example with speculative inference `__ + + +Example Implementation: + + .. code-block:: python + + import flexflow.serve as ff + from langchain.prompts import PromptTemplate + + ff_llm = FlexFlowLLM(...) + ff_llm.compile_and_start(...) + + template = "Question: {question}\nAnswer:" + prompt = PromptTemplate(template=template, input_variables=["question"]) + + response = ff_llm.generate("Who was the US president in 1997?") diff --git a/docs/source/rag.rst b/docs/source/rag.rst new file mode 100644 index 0000000000..4b869c2352 --- /dev/null +++ b/docs/source/rag.rst @@ -0,0 +1,90 @@ +:tocdepth: 1 +******** +RAG Q&A +******** + +Retrieval Augmented Generation (RAG) combines language models with external knowledge. This use case integrates RAG with FlexFlow Serve for Q&A with documents. + +Requirements +============ + +- FlexFlow Serve setup. +- Retriever setup for RAG. + +Implementation +============== + +1. FlexFlow Initialization + Initialize and configure FlexFlow Serve. + +2. 
diff --git a/docs/source/rag.rst b/docs/source/rag.rst
new file mode 100644
index 0000000000..4b869c2352
--- /dev/null
+++ b/docs/source/rag.rst
@@ -0,0 +1,90 @@
+:tocdepth: 1
+********
+RAG Q&A
+********
+
+Retrieval Augmented Generation (RAG) combines language models with external knowledge. This use case integrates RAG with FlexFlow Serve for Q&A with documents.
+
+Requirements
+============
+
+- FlexFlow Serve setup.
+- Retriever setup for RAG.
+
+Implementation
+==============
+
+1. FlexFlow Initialization
+   Initialize and configure FlexFlow Serve.
+
+2. Data Retrieval Setup
+   Set up a retriever for sourcing information relevant to user queries.
+
+3. RAG Integration
+   Integrate the retriever with FlexFlow Serve.
+
+4. Response Generation
+   Use the LLM with RAG to generate responses based on the model's knowledge and retrieved information.
+
+5. Shutdown
+   The FlexFlow server automatically shuts down after generating the response.
+
+Example
+=======
+
+A complete code example for a web-document Q&A using FlexFlow can be found here:
+
+1. `RAG Q&A Example with incremental decoding `__
+
+2. `RAG Q&A Example with speculative inference `__
+
+
+Example Implementation:
+
+.. code-block:: python
+
+    # imports
+
+    # compile and start server
+    ff_llm = FlexFlowLLM(...)
+    gen_config = ff.GenerationConfig(...)
+    ff_llm.compile_and_start(...)
+    ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm)
+
+
+    # Load web page content
+    loader = WebBaseLoader("https://example.com/data")
+    data = loader.load()
+
+    # Split text
+    text_splitter = RecursiveCharacterTextSplitter(...)
+    all_splits = text_splitter.split_documents(data)
+
+    # Initialize embeddings
+    embeddings = OpenAIEmbeddings(...)
+
+    # Create VectorStore
+    vectorstore = Chroma.from_documents(all_splits, embeddings)
+
+    # Use VectorStore as a retriever
+    retriever = vectorstore.as_retriever()
+
+    # Apply similarity search
+    question = "Example Question"
+    docs = vectorstore.similarity_search(question)
+    max_chars_per_doc = 100
+    docs_text = ''.join([docs[i].page_content[:max_chars_per_doc] for i in range(len(docs))])
+
+    # Using a Prompt Template
+    prompt_rag = PromptTemplate.from_template(
+        "Summarize the main themes in these retrieved docs: {docs_text}"
+    )
+
+    # Build Chain
+    llm_chain_rag = LLMChain(llm=ff_llm_wrapper, prompt=prompt_rag)
+
+    # Run
+    rag_result = llm_chain_rag(docs_text)
+
+    # Stop the server
+    ff_llm.stop_server()
\ No newline at end of file
diff --git a/docs/source/serve_api.rst b/docs/source/serve_api.rst
new file mode 100644
index 0000000000..6a607cbf0c
--- /dev/null
+++ b/docs/source/serve_api.rst
@@ -0,0 +1,7 @@
+**************************
+FlexFlow Serve Python API
+**************************
+
+.. toctree::
+   serve_fastapi
+   serve_gradioapi
\ No newline at end of file
diff --git a/docs/source/serve_fastapi.rst b/docs/source/serve_fastapi.rst
new file mode 100644
index 0000000000..0aa6634670
--- /dev/null
+++ b/docs/source/serve_fastapi.rst
@@ -0,0 +1,106 @@
+:tocdepth: 1
+***********************
+FlexFlow Serve FastAPI
+***********************
+
+Introduction
+============
+
+The Python API for FlexFlow Serve enables users to initialize, manage, and interact with large language models (LLMs) via FastAPI or Gradio.
+
+Requirements
+------------
+
+- FlexFlow Serve setup with necessary configurations.
+- FastAPI and Uvicorn for running the API server.
+
+API Configuration
+=================
+
+Users can configure the API using FastAPI to handle requests and manage the model.
+
+1. FastAPI Application Initialization
+   Initialize the FastAPI application to create API endpoints.
+
+2. Request Model Definition
+   Define the model for API requests using Pydantic.
+
+3. Global Variable for LLM Model
+   Declare a global variable to store the LLM model.
+
+Example
+-------
+
+.. code-block:: python
+
+    from fastapi import FastAPI
+    from pydantic import BaseModel
+    import flexflow.serve as ff
+
+    app = FastAPI()
+
+    class PromptRequest(BaseModel):
+        prompt: str
+
+    llm = None
+
+Endpoint Creation
+=================
+
+Create API endpoints for LLM interactions to handle generation requests.
+
+1. Initialize Model on Startup
+   Use the FastAPI event handler to initialize and compile the LLM model when the API server starts.
+
+2. Generate Response Endpoint
+   Create a POST endpoint to generate responses based on the user's prompt.
+
+Example
+-------
+
+.. code-block:: python
+
+    @app.on_event("startup")
+    async def startup_event():
+        global llm
+        # Initialize and compile the LLM model
+        llm.compile(
+            generation_config,
+            # ... other params as needed
+        )
+        llm.start_server()
+
+    @app.post("/generate/")
+    async def generate(prompt_request: PromptRequest):
+        # ... exception handling
+        full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8')
+        # ... split prompt and response text for returning results
+        return {"prompt": prompt_request.prompt, "response": full_output}
+
+Running and Testing
+===================
+
+Instructions for running and testing the FastAPI server.
+
+1. Run the FastAPI Server
+   Use Uvicorn to run the FastAPI server with the specified host and port.
+
+2. Testing the API
+   Make requests to the API endpoints and verify the responses.
+
+Example
+-------
+
+.. code-block:: bash
+
+    # Running within the inference/python folder:
+    uvicorn entrypoint.fastapi_incr:app --reload --port 3000
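+
+Once the server is running, the ``/generate/`` endpoint can be exercised from a short client script. The sketch below assumes the server from the example above is listening on ``localhost:3000`` and uses the third-party ``requests`` package; adjust the URL for your own host and port.
+
+.. code-block:: python
+
+    import requests
+
+    # Send a prompt to the /generate/ endpoint defined above.
+    resp = requests.post(
+        "http://localhost:3000/generate/",
+        json={"prompt": "Who was the US president in 1997?"},
+    )
+    resp.raise_for_status()
+    print(resp.json()["response"])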
+Full API Entrypoint Code
+=========================
+
+A complete code example for the FastAPI entrypoint can be found here:
+
+1. `FastAPI Example with incremental decoding `__
+
+2. `FastAPI Example with speculative inference `__
diff --git a/docs/source/serve_gradioapi.rst b/docs/source/serve_gradioapi.rst
new file mode 100644
index 0000000000..ed19e05347
--- /dev/null
+++ b/docs/source/serve_gradioapi.rst
@@ -0,0 +1,30 @@
+:tocdepth: 1
+*************************
+FlexFlow Serve Gradio API
+*************************
+
+Introduction
+============
+
+Users can also set up the API endpoints with a Gradio Chatbot Interface.
+
+Requirements
+------------
+
+- FlexFlow Serve setup with necessary configurations.
+- A running Gradio chatbot interface.
+
+Example
+========
+
+In a running Gradio chatbot interface, hit the "Use via API" button on the bottom left.
+
+.. image:: /imgs/gradio_interface.png
+   :alt: Gradio Chatbot Interface
+   :align: center
+
+Users can easily access an API endpoint for sending prompts to the model.
+
+.. image:: /imgs/gradio_api.png
+   :alt: Gradio API
+   :align: center
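+
+For programmatic access, the same endpoint can be called from Python. The snippet below is a sketch using the ``gradio_client`` package; the server URL and the ``/chat`` API name are assumptions and should be read off the "Use via API" page of your running interface.
+
+.. code-block:: python
+
+    from gradio_client import Client
+
+    # Connect to the running Gradio app (default local URL assumed).
+    client = Client("http://localhost:7860/")
+
+    # The api_name may differ; check the "Use via API" page for the exact value.
+    result = client.predict("Who was the US president in 1997?", api_name="/chat")
+    print(result)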
toctree:: + chatbot + prompt_template + rag \ No newline at end of file diff --git a/inference/.gitignore b/inference/.gitignore index 8ab99cb1eb..1da34a668b 100644 --- a/inference/.gitignore +++ b/inference/.gitignore @@ -3,3 +3,4 @@ weights tokenizers prompt output +.env \ No newline at end of file diff --git a/inference/python/entrypoint/fastapi_incr.py b/inference/python/entrypoint/fastapi_incr.py new file mode 100644 index 0000000000..34f61739fb --- /dev/null +++ b/inference/python/entrypoint/fastapi_incr.py @@ -0,0 +1,162 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Running Instructions: +- To run this FastAPI application, make sure you have FastAPI and Uvicorn installed. +- Save this script as 'fastapi_incr.py'. +- Run the application using the command: `uvicorn fastapi_incr:app --reload --port PORT_NUMBER` +- The server will start on `http://localhost:PORT_NUMBER`. Use this base URL to make API requests. +- Go to `http://localhost:PORT_NUMBER/docs` for API documentation. +""" + + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +import flexflow.serve as ff +import uvicorn +import json, os, argparse +from types import SimpleNamespace + +# Initialize FastAPI application +app = FastAPI() + +# Define the request model +class PromptRequest(BaseModel): + prompt: str + +# Global variable to store the LLM model +llm = None + + +def get_configs(): + + # Fetch configuration file path from environment variable + config_file = os.getenv("CONFIG_FILE", "") + + # Load configs from JSON file (if specified) + if config_file: + if not os.path.isfile(config_file): + raise FileNotFoundError(f"Config file {config_file} not found.") + try: + with open(config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "tiiuae/falcon-7b", + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# Initialize model on startup +@app.on_event("startup") +async def startup_event(): + global llm + + # Initialize your LLM model configuration here + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + ff.init(configs_dict) + + ff_data_type = ff.DataType.DT_FLOAT if 
configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + llm.start_server() + +# API endpoint to generate response +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + # Call the model to generate a response + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + + # Separate the prompt and response + split_output = full_output.split('\n', 1) + if len(split_output) > 1: + response_text = split_output[1] + else: + response_text = "" + + # Return the prompt and the response in JSON format + return { + "prompt": prompt_request.prompt, + "response": response_text + } + +# Shutdown event to stop the model server +@app.on_event("shutdown") +async def shutdown_event(): + global llm + if llm is not None: + llm.stop_server() + +# Main function to run Uvicorn server +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) + +# Running within the entrypoint folder: +# uvicorn fastapi_incr:app --reload --port + +# Running within the python folder: +# uvicorn entrypoint.fastapi_incr:app --reload --port 3000 diff --git a/inference/python/entrypoint/fastapi_specinfer.py b/inference/python/entrypoint/fastapi_specinfer.py new file mode 100644 index 0000000000..416aee6dc5 --- /dev/null +++ b/inference/python/entrypoint/fastapi_specinfer.py @@ -0,0 +1,202 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Running Instructions: +- To run this FastAPI application, make sure you have FastAPI and Uvicorn installed. +- Save this script as 'fastapi_specinfer.py'. +- Run the application using the command: `uvicorn fastapi_specinfer:app --reload --port PORT_NUMBER` +- The server will start on `http://localhost:PORT_NUMBER`. Use this base URL to make API requests. +- Go to `http://localhost:PORT_NUMBER/docs` for API documentation. 
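+- Example request (a sketch; substitute the port you chose for PORT_NUMBER): the /generate/
+  endpoint defined below accepts a JSON body with a single "prompt" field, e.g.
+  `curl -X POST http://localhost:PORT_NUMBER/generate/ -H "Content-Type: application/json" -d '{"prompt": "Hello"}'`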
+""" + + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +import flexflow.serve as ff +import uvicorn +import json, os, argparse +from types import SimpleNamespace + +# Initialize FastAPI application +app = FastAPI() + +# Define the request model +class PromptRequest(BaseModel): + prompt: str + +# Global variable to store the LLM model +llm = None + +def get_configs(): + # Fetch configuration file path from environment variable + config_file = os.getenv("CONFIG_FILE", "") + + # Load configs from JSON file (if specified) + if config_file: + if not os.path.isfile(config_file): + raise FileNotFoundError(f"Config file {config_file} not found.") + try: + with open(config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required llm arguments + "llm_model": "meta-llama/Llama-2-7b-hf", + # optional llm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + } + ], + # "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + +# Initialize model on startup +@app.on_event("startup") +async def startup_event(): + global llm + + # Initialize your LLM model configuration here + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + # Create the SSMs + ssms = [] + for ssm_config in configs.ssms: + ssm_config = SimpleNamespace(**ssm_config) + ff_data_type = ( + ff.DataType.DT_FLOAT if ssm_config.full_precision else ff.DataType.DT_HALF + ) + ssm = ff.SSM( + ssm_config.ssm_model, + data_type=ff_data_type, + cache_path=ssm_config.cache_path, + refresh_cache=ssm_config.refresh_cache, + output_file=configs.output_file, + ) + ssms.append(ssm) + + # Create the sampling configs + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + + # Compile the SSMs for inference and load the weights into memory + for ssm in ssms: + ssm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + # Compile the LLM for inference and load the weights into memory + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ssms=ssms, + ) + + llm.start_server() + +# API endpoint to generate response +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + if llm is None: + raise 
HTTPException(status_code=503, detail="LLM model is not initialized.") + + # Call the model to generate a response + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + + # Separate the prompt and response + split_output = full_output.split('\n', 1) + if len(split_output) > 1: + response_text = split_output[1] + else: + response_text = "" + + # Return the prompt and the response in JSON format + return { + "prompt": prompt_request.prompt, + "response": response_text + } + +# Shutdown event to stop the model server +@app.on_event("shutdown") +async def shutdown_event(): + global llm + if llm is not None: + llm.stop_server() + +# Main function to run Uvicorn server +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) + +# Running within the entrypoint folder: +# uvicorn fastapi_specinfer:app --reload --port + +# Running within the python folder: +# uvicorn entrypoint.fastapi_specinfer:app --reload --port 3000 diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 6706cf3c29..f7707816c8 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -41,7 +41,7 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 4, + "num_gpus": 2, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters @@ -49,7 +49,7 @@ def get_configs(): "legion_utility_processors": 4, "data_parallelism_degree": 1, "tensor_parallelism_degree": 1, - "pipeline_parallelism_degree": 4, + "pipeline_parallelism_degree": 2, "offload": False, "offload_reserve_space_size": 1024**2, "use_4bit_quantization": False, @@ -64,7 +64,7 @@ def get_configs(): # optional parameters "cache_path": "", "refresh_cache": False, - "full_precision": True, + "full_precision": False, "prompt": "", "output_file": "", } diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 8b9a116dc5..fcb1b8f891 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -41,14 +41,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 4, + "num_gpus": 2, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 2, + "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, "offload_reserve_space_size": 1024**2, @@ -75,7 +75,7 @@ def get_configs(): "full_precision": False, } ], - "prompt": "", + # "prompt": "", "output_file": "", } # Merge dictionaries diff --git a/inference/python/usecases/gradio_incr.py b/inference/python/usecases/gradio_incr.py new file mode 100644 index 0000000000..2735b665bb --- /dev/null +++ b/inference/python/usecases/gradio_incr.py @@ -0,0 +1,162 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functionality: +1. 
Configuration Handling: + - Parses command-line arguments to get a configuration file path. + - Loads configuration settings from a JSON file if provided, or uses default settings. + +2. FlexFlow Model Initialization: + - Initializes FlexFlow with the provided or default configurations. + - Sets up the LLM with the specified model and configurations. + - Compiles the model with generation settings and starts the FlexFlow server. + +3. Gradio Interface Setup: + - Defines a function to generate responses based on user input using FlexFlow. + - Sets up a Gradio Chat Interface to interact with the model in a conversational format. + +4. Main Execution: + - Calls the main function to initialize configurations, set up the FlexFlow LLM, and launch the Gradio interface. + - Stops the FlexFlow server after the Gradio interface is closed. + +Usage: +1. Run the script with an optional configuration file argument for custom settings. +2. Interact with the FlexFlow model through the Gradio web interface. +3. Enter text inputs to receive generated responses from the model. +4. The script will stop the FlexFlow server automatically upon closing the Gradio interface. +""" + +import gradio as gr +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "tiiuae/falcon-7b", + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# def generate_response(user_input): +# result = llm.generate(user_input) +# return result.output_text.decode('utf-8') + +def generate_response(message, history): + user_input = message + results = llm.generate(user_input) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + +def main(): + + global llm + + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + ff.init(configs_dict) + + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + 
generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + # # interface version 1 + # iface = gr.Interface( + # fn=generate_response, + # inputs="text", + # outputs="text" + # ) + + # interface version 2 + iface = gr.ChatInterface(fn=generate_response) + llm.start_server() + iface.launch() + llm.stop_server() + +if __name__ == "__main__": + print("flexflow inference example with gradio interface") + main() \ No newline at end of file diff --git a/inference/python/usecases/gradio_specinfer.py b/inference/python/usecases/gradio_specinfer.py new file mode 100644 index 0000000000..08cde3f00b --- /dev/null +++ b/inference/python/usecases/gradio_specinfer.py @@ -0,0 +1,205 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functionality: +1. Configuration Handling: + - Parses command-line arguments to get a configuration file path. + - Loads configuration settings from a JSON file if provided, or uses default settings. + +2. FlexFlow Model Initialization: + - Initializes FlexFlow with the provided or default configurations. + - Sets up the LLM with the specified model and configurations. + - Compiles the model with generation settings and starts the FlexFlow server. + +3. Gradio Interface Setup: + - Defines a function to generate responses based on user input using FlexFlow. + - Sets up a Gradio Chat Interface to interact with the model in a conversational format. + +4. Main Execution: + - Calls the main function to initialize configurations, set up the FlexFlow LLM, and launch the Gradio interface. + - Stops the FlexFlow server after the Gradio interface is closed. + +Usage: +1. Run the script with an optional configuration file argument for custom settings. +2. Interact with the FlexFlow model through the Gradio web interface. +3. Enter text inputs to receive generated responses from the model. +4. The script will stop the FlexFlow server automatically upon closing the Gradio interface. +""" + +""" +TODO: fix current issue: model init is stuck at "prepare next batch init" and "prepare next batch verify" +""" + +import gradio as gr +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required llm arguments + "llm_model": "meta-llama/Llama-2-7b-hf", + # optional llm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + } + ], + # "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# def generate_response(user_input): +# result = llm.generate(user_input) +# return result.output_text.decode('utf-8') + +def generate_response(message, history): + user_input = message + results = llm.generate(user_input) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + +def main(): + + global llm + + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + # Create the SSMs + ssms = [] + for ssm_config in configs.ssms: + ssm_config = SimpleNamespace(**ssm_config) + ff_data_type = ( + ff.DataType.DT_FLOAT if ssm_config.full_precision else ff.DataType.DT_HALF + ) + ssm = ff.SSM( + ssm_config.ssm_model, + data_type=ff_data_type, + cache_path=ssm_config.cache_path, + refresh_cache=ssm_config.refresh_cache, + output_file=configs.output_file, + ) + ssms.append(ssm) + + # Create the sampling configs + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + + # Compile the SSMs for inference and load the weights into memory + for ssm in ssms: + ssm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=256, + ) + + # Compile the LLM for inference and load the weights into memory + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=256, + ssms=ssms, + ) + + # # interface version 1 + # iface = gr.Interface( + # fn=generate_response, + # inputs="text", + # outputs="text" + # ) + + # interface version 2 + iface = gr.ChatInterface(fn=generate_response) + llm.start_server() + iface.launch() + llm.stop_server() + +if __name__ == "__main__": + print("flexflow inference example with gradio interface") + main() \ No newline at end of file diff --git a/inference/python/usecases/prompt_template_incr.py b/inference/python/usecases/prompt_template_incr.py new file mode 100644 index 0000000000..8bffe9ddad --- /dev/null +++ b/inference/python/usecases/prompt_template_incr.py @@ -0,0 +1,187 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script implements the usecase of prompt template upon FlexFlow. + +Functionality: +1. FlexFlowLLM Class: + - Initializes and configures FlexFlow. + - Loads configurations from a file or uses default settings. 
+ - Compiles and starts the language model server for text generation. + - Stops the server when operations are complete. + +2. FF_LLM_wrapper Class: + - Serves as a wrapper for FlexFlow. + - Implements the necessary interface to interact with the LangChain library. + +3. Main: + - Initializes FlexFlow. + - Compiles and starts the server with specific generation configurations. + - Sets up a prompt template for generating responses to questions. + - Use LLMChain to run the model and generate response. + - Stops the FlexFlow server after generating the response. +""" + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from langchain.llms.base import LLM +from typing import Any, List, Mapping, Optional +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate + +class FlexFlowLLM: + def __init__(self, config_file=""): + self.configs = self.get_configs(config_file) + ff.init(self.configs) + self.llm = self.create_llm() + + def get_configs(self, config_file): + # Load configurations from a file or use default settings + if config_file and os.path.isfile(config_file): + with open(config_file) as f: + return json.load(f) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "tiiuae/falcon-7b", + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + def create_llm(self): + configs = SimpleNamespace(**self.configs) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + return llm + + def compile_and_start(self, generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch): + self.llm.compile(generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch) + self.llm.start_server() + + def generate(self, prompt): + results = self.llm.generate(prompt) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + def stop_server(self): + self.llm.stop_server() + + def __enter__(self): + return self.llm.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self.llm.__exit__(exc_type, exc_value, traceback) + +class FF_LLM_wrapper(LLM): + flexflow_llm: FlexFlowLLM + + @property + def _llm_type(self) -> str: + return "custom" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> str: + if stop is not None: + raise ValueError("stop kwargs are not permitted.") + response = self.flexflow_llm.generate(prompt) + return response + + +if __name__ == "__main__": + # initialization + ff_llm = FlexFlowLLM() + + # compile and start 
server + gen_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1) + ff_llm.compile_and_start( + gen_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64 + ) + + # the wrapper class serves as the 'Model' in LCEL + ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm) + + # USE CASE 1: Prompt Template + template = """Question: {question} + Answer: Let's think step by step.""" + + # Build prompt template and langchain + prompt = PromptTemplate(template=template, input_variables=["question"]) + llm_chain = LLMChain(prompt=prompt, llm=ff_llm_wrapper) + + question = "Who was the US president in the year the first Pokemon game was released?" + print(llm_chain.run(question)) + + # stop the server + ff_llm.stop_server() + diff --git a/inference/python/usecases/prompt_template_specinfer.py b/inference/python/usecases/prompt_template_specinfer.py new file mode 100644 index 0000000000..dfc92e9ac2 --- /dev/null +++ b/inference/python/usecases/prompt_template_specinfer.py @@ -0,0 +1,236 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script implements the usecase of prompt template upon FlexFlow. + +Functionality: +1. FlexFlowLLM Class: + - Initializes and configures FlexFlow. + - Loads configurations from a file or uses default settings. + - Compiles and starts the language model server for text generation. + - Stops the server when operations are complete. + +2. FF_LLM_wrapper Class: + - Serves as a wrapper for FlexFlow. + - Implements the necessary interface to interact with the LangChain library. + +3. Main: + - Initializes FlexFlow. + - Compiles and starts the server with specific generation configurations. + - Sets up a prompt template for generating responses to questions. + - Use LLMChain to run the model and generate response. + - Stops the FlexFlow server after generating the response. 
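+
+Note: this is the speculative-inference counterpart of prompt_template_incr.py; in addition to the
+LLM it creates one or more small speculative models (SSMs), compiles them first, and then passes
+them to the LLM's compile call so that generation can use speculative decoding.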
+""" + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from langchain.llms.base import LLM +from typing import Any, List, Mapping, Optional +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate + + +class FlexFlowLLM: + def __init__(self, config_file=""): + self.configs = self.get_configs(config_file) + ff.init(self.configs) + self.llm = self.create_llm() + self.ssms = self.create_ssms() + + def get_configs(self, config_file): + # Load configurations from a file or use default settings + if config_file and os.path.isfile(config_file): + with open(config_file) as f: + return json.load(f) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required llm arguments + "llm_model": "meta-llama/Llama-2-7b-hf", + # optional llm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + } + ], + # "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + def create_llm(self): + configs = SimpleNamespace(**self.configs) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + return llm + + def create_ssms(self): + # Create the SSMs + configs = SimpleNamespace(**self.configs) + ssms = [] + for ssm_config in configs.ssms: + ssm_config = SimpleNamespace(**ssm_config) + ff_data_type = ( + ff.DataType.DT_FLOAT if ssm_config.full_precision else ff.DataType.DT_HALF + ) + ssm = ff.SSM( + ssm_config.ssm_model, + data_type=ff_data_type, + cache_path=ssm_config.cache_path, + refresh_cache=ssm_config.refresh_cache, + output_file=configs.output_file, + ) + ssms.append(ssm) + return ssms + + def compile_and_start(self, generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch): + + # Compile the SSMs for inference and load the weights into memory + for ssm in self.ssms: + ssm.compile( + generation_config, + max_requests_per_batch, + max_seq_length, + max_tokens_per_batch, + ) + + # Compile the LLM for inference and load the weights into memory + self.llm.compile( + generation_config, + max_requests_per_batch, + max_seq_length, + max_tokens_per_batch, + ssms = self.ssms + ) + self.llm.start_server() + + def generate(self, prompt): + results = self.llm.generate(prompt) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + def stop_server(self): + self.llm.stop_server() + + def __enter__(self): + return self.llm.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self.llm.__exit__(exc_type, exc_value, 
traceback) + +class FF_LLM_wrapper(LLM): + flexflow_llm: FlexFlowLLM + + @property + def _llm_type(self) -> str: + return "custom" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> str: + if stop is not None: + raise ValueError("stop kwargs are not permitted.") + response = self.flexflow_llm.generate(prompt) + return response + + +if __name__ == "__main__": + # initialization + ff_llm = FlexFlowLLM() + + # compile and start server + gen_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1) + ff_llm.compile_and_start( + gen_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64 + ) + + # the wrapper class serves as the 'Model' in LCEL + ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm) + + # USE CASE 1: Prompt Template + template = """Question: {question} + Answer: Let's think step by step.""" + + # Build prompt template and langchain + prompt = PromptTemplate(template=template, input_variables=["question"]) + llm_chain = LLMChain(prompt=prompt, llm=ff_llm_wrapper) + + question = "Who was the US president in the year the first Pokemon game was released?" + print(llm_chain.run(question)) + + # stop the server + ff_llm.stop_server() + + diff --git a/inference/python/usecases/rag_incr.py b/inference/python/usecases/rag_incr.py new file mode 100644 index 0000000000..15e7f3d092 --- /dev/null +++ b/inference/python/usecases/rag_incr.py @@ -0,0 +1,220 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script implements the usecase of rag-search upon FlexFlow. + +Functionality: +1. FlexFlowLLM Class: + - Initializes and configures FlexFlow. + - Loads configurations from a file or uses default settings. + - Compiles and starts the language model server for text generation. + - Stops the server when operations are complete. + +2. FF_LLM_wrapper Class: + - Serves as a wrapper for FlexFlow. + - Implements the necessary interface to interact with the LangChain library. + +3. Main: + - Initializes FlexFlow. + - Compiles and starts the server with specific generation configurations. + - Taking in specific source information with RAG(Retrieval Augmented Generation) technique for Q&A towards specific realm/knowledgebase. + - Use LLMChain to run the model and generate response. 
+ - Stops the FlexFlow server after generating the response. +""" + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from langchain.llms.base import LLM +from typing import Any, List, Mapping, Optional +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate +from langchain.document_loaders import WebBaseLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.embeddings import OpenAIEmbeddings +from langchain.vectorstores import Chroma +from langchain.vectorstores import FAISS + +class FlexFlowLLM: + def __init__(self, config_file=""): + self.configs = self.get_configs(config_file) + ff.init(self.configs) + self.llm = self.create_llm() + + def get_configs(self, config_file): + # Load configurations from a file or use default settings + if config_file and os.path.isfile(config_file): + with open(config_file) as f: + return json.load(f) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "tiiuae/falcon-7b", + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + def create_llm(self): + configs = SimpleNamespace(**self.configs) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + return llm + + def compile_and_start(self, generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch): + self.llm.compile(generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch) + self.llm.start_server() + + def generate(self, prompt): + results = self.llm.generate(prompt) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + def stop_server(self): + self.llm.stop_server() + + def __enter__(self): + return self.llm.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self.llm.__exit__(exc_type, exc_value, traceback) + + +class FF_LLM_wrapper(LLM): + flexflow_llm: FlexFlowLLM + + @property + def _llm_type(self) -> str: + return "custom" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> str: + if stop is not None: + raise ValueError("stop kwargs are not permitted.") + response = self.flexflow_llm.generate(prompt) + return response + + +if __name__ == "__main__": + # initialization + ff_llm = FlexFlowLLM() + + # compile and start server + gen_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1) + ff_llm.compile_and_start( + gen_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64 + ) + + # the wrapper class serves 
as the 'Model' in LCEL + ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm) + + # USE CASE 2: Rag Search + + # Load web page content + loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/") + data = loader.load() + + # Split text + text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) + all_splits = text_splitter.split_documents(data) + + # Initialize embeddings + embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY')) # fill in openai api key + + # Create VectorStore + vectorstore = Chroma.from_documents(all_splits, embeddings) + + # Use VectorStore as a retriever + retriever = vectorstore.as_retriever() + + # Test if similarity search is working + question = "What are the approaches to Task Decomposition?" + docs = vectorstore.similarity_search(question) + max_chars_per_doc = 100 + # docs_text_list = [docs[i].page_content for i in range(len(docs))] + docs_text_list = [docs[i].page_content[:max_chars_per_doc] for i in range(len(docs))] + docs_text = ''.join(docs_text_list) + + # Using a Prompt Template + prompt_rag = PromptTemplate.from_template( + "Summarize the main themes in these retrieved docs: {docs_text}" + ) + + # Chain + llm_chain_rag = LLMChain(llm=ff_llm_wrapper, prompt=prompt_rag) + + # Run + rag_result = llm_chain_rag(docs_text) + + # Stop the server + ff_llm.stop_server() + diff --git a/inference/python/usecases/rag_specinfer.py b/inference/python/usecases/rag_specinfer.py new file mode 100644 index 0000000000..512b973955 --- /dev/null +++ b/inference/python/usecases/rag_specinfer.py @@ -0,0 +1,266 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script implements the usecase of rag-search upon FlexFlow. + +Functionality: +1. FlexFlowLLM Class: + - Initializes and configures FlexFlow. + - Loads configurations from a file or uses default settings. + - Compiles and starts the language model server for text generation. + - Stops the server when operations are complete. + +2. FF_LLM_wrapper Class: + - Serves as a wrapper for FlexFlow. + - Implements the necessary interface to interact with the LangChain library. + +3. Main: + - Initializes FlexFlow. + - Compiles and starts the server with specific generation configurations. 
+ - Taking in specific source information with RAG(Retrieval Augmented Generation) technique for Q&A towards specific realm/knowledgebase. + - Use LLMChain to run the model and generate response. + - Stops the FlexFlow server after generating the response. +""" + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from langchain.llms.base import LLM +from typing import Any, List, Mapping, Optional +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate +from langchain.document_loaders import WebBaseLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.embeddings import OpenAIEmbeddings +from langchain.vectorstores import Chroma +from langchain.vectorstores import FAISS + +class FlexFlowLLM: + def __init__(self, config_file=""): + self.configs = self.get_configs(config_file) + ff.init(self.configs) + self.llm = self.create_llm() + self.ssms = self.create_ssms() + + def get_configs(self, config_file): + # Load configurations from a file or use default settings + if config_file and os.path.isfile(config_file): + with open(config_file) as f: + return json.load(f) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, + "offload": False, + "offload_reserve_space_size": 1024**2, + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "profiling": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required llm arguments + "llm_model": "meta-llama/Llama-2-7b-hf", + # optional llm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "ssms": [ + { + # required ssm parameter + "ssm_model": "JackFram/llama-160m", + # optional ssm parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + } + ], + # "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + def create_llm(self): + configs = SimpleNamespace(**self.configs) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + return llm + + def create_ssms(self): + # Create the SSMs + configs = SimpleNamespace(**self.configs) + ssms = [] + for ssm_config in configs.ssms: + ssm_config = SimpleNamespace(**ssm_config) + ff_data_type = ( + ff.DataType.DT_FLOAT if ssm_config.full_precision else ff.DataType.DT_HALF + ) + ssm = ff.SSM( + ssm_config.ssm_model, + data_type=ff_data_type, + cache_path=ssm_config.cache_path, + refresh_cache=ssm_config.refresh_cache, + output_file=configs.output_file, + ) + ssms.append(ssm) + return ssms + + def compile_and_start(self, generation_config, max_requests_per_batch, max_seq_length, max_tokens_per_batch): + + # Compile the SSMs for inference and load the weights into memory + for ssm in self.ssms: + ssm.compile( + generation_config, + max_requests_per_batch, + max_seq_length, + max_tokens_per_batch, + ) + + # Compile the LLM for inference and load the weights into memory + self.llm.compile( + generation_config, + max_requests_per_batch, + max_seq_length, + 
max_tokens_per_batch, + ssms = self.ssms + ) + # start server + self.llm.start_server() + + def generate(self, prompt): + results = self.llm.generate(prompt) + if isinstance(results, list): + result_txt = results[0].output_text.decode('utf-8') + else: + result_txt = results.output_text.decode('utf-8') + return result_txt + + def stop_server(self): + self.llm.stop_server() + + def __enter__(self): + return self.llm.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self.llm.__exit__(exc_type, exc_value, traceback) + +class FF_LLM_wrapper(LLM): + flexflow_llm: FlexFlowLLM + + @property + def _llm_type(self) -> str: + return "custom" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> str: + if stop is not None: + raise ValueError("stop kwargs are not permitted.") + response = self.flexflow_llm.generate(prompt) + return response + + +if __name__ == "__main__": + # initialization + ff_llm = FlexFlowLLM() + + # compile and start server + gen_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1) + ff_llm.compile_and_start( + gen_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=200 + ) + + # the wrapper class serves as the 'Model' in LCEL + ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm) + + # USE CASE 2: Rag Search + + # Load web page content + loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/") + data = loader.load() + + # Split text + text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) + all_splits = text_splitter.split_documents(data) + + # Initialize embeddings + embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY')) # fill in openai api key + + # Create VectorStore + vectorstore = Chroma.from_documents(all_splits, embeddings) + + # Use VectorStore as a retriever + retriever = vectorstore.as_retriever() + + # Test if similarity search is working + question = "What are the approaches to Task Decomposition?" 
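+    # Retrieve the chunks most similar to the question, then keep only the first
+    # max_chars_per_doc characters of each chunk so the combined prompt stays short
+    # (a heuristic; max_tokens_per_batch above is only 200 for this example)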
+ docs = vectorstore.similarity_search(question) + max_chars_per_doc = 50 + # docs_text_list = [docs[i].page_content for i in range(len(docs))] + docs_text_list = [docs[i].page_content[:max_chars_per_doc] for i in range(len(docs))] + docs_text = ''.join(docs_text_list) + + # Using a Prompt Template + prompt_rag = PromptTemplate.from_template( + "Summarize the main themes in these retrieved docs: {docs_text}" + ) + + # Chain + llm_chain_rag = LLMChain(llm=ff_llm_wrapper, prompt=prompt_rag) + + # Run + rag_result = llm_chain_rag(docs_text) + + # stop the server + ff_llm.stop_server() diff --git a/tests/training_tests.sh b/tests/training_tests.sh index 2d1f00883b..a6cab7d117 100755 --- a/tests/training_tests.sh +++ b/tests/training_tests.sh @@ -2,6 +2,9 @@ set -x set -e +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + # Default to single-node, single GPU GPUS=${1:-1} # number of GPUS per node NUM_NODES=${2:-1} # number of nodes @@ -87,3 +90,4 @@ $EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -config-file /t $EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -config-file /tmp/flexflow/training_tests/test_params.json $EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -config-file /tmp/flexflow/training_tests/test_params.json $EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json + From d21ed66a5baf2bfdeb06fd74e080abbd6eec9ce7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 23:01:16 -0500 Subject: [PATCH 49/61] Bug fixes and update Legion version (#1287) * bug fixes and update Legion version * fix * bug fix * update legion * fix arithmetic error due to num_devices uninitialized * update legion version * update ci * fix * debugging ci * Revert "debugging ci" This reverts commit 0b3148ef6adfcb64935e6b1e83a88494910a7b22. 
* update mapper interface * add ncclFinalize * Only delete nccl communications for training jobs --------- Co-authored-by: Zhihao Jia --- .github/workflows/gpu-ci.yml | 12 +++--- CMakeLists.txt | 8 ++-- cmake/pip_install/CMakeLists.txt | 4 +- deps/legion | 2 +- include/flexflow/mapper.h | 9 ++--- include/flexflow/model.h | 2 + include/flexflow/operator.h | 5 +++ include/flexflow/request_manager.h | 1 - src/mapper/mapper.cc | 47 ++++++++++------------ src/ops/linear.cc | 8 +--- src/runtime/inference_manager.cc | 30 +------------- src/runtime/model.cc | 63 ++++++++++++++++++++++++++++++ 12 files changed, 111 insertions(+), 80 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 3901d6b5f7..48dcda157e 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -222,7 +222,7 @@ jobs: CONDA: "3" needs: inference-tests container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -243,7 +243,7 @@ jobs: - name: Build and Install FlexFlow run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) export FF_BUILD_ALL_EXAMPLES=ON export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON @@ -252,18 +252,18 @@ jobs: - name: Check FlexFlow Python interface (pip) run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests diff --git a/CMakeLists.txt b/CMakeLists.txt index acbe7e385f..43ce4f7044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -413,6 +413,7 @@ if(NOT BUILD_LEGION_ONLY) # python related if (FF_USE_PYTHON) + find_package(Python COMPONENTS Interpreter Development) # create flexflow_cffi_header.py add_custom_command(TARGET flexflow PRE_BUILD @@ -424,13 +425,13 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. 
add_custom_command(TARGET flexflow PRE_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} + COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Creating flexflow_python interpreter..." ) @@ -567,7 +568,8 @@ if(NOT BUILD_LEGION_ONLY) install(TARGETS flexflow DESTINATION ${LIB_DEST}) # install python if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + find_package(Python COMPONENTS Interpreter Development) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if (NOT FF_BUILD_FROM_PYPI) install( DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 7ce38c4abc..105133a310 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -1,10 +1,10 @@ # Use setup.py script to re-install the Python bindings library with the right library paths if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") endif() endif() diff --git a/deps/legion b/deps/legion index 626b55689c..24e8c45234 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c +Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h index 71be1892aa..e8337818ec 100644 --- a/include/flexflow/mapper.h +++ b/include/flexflow/mapper.h @@ -83,11 +83,10 @@ class FFMapper : public NullMapper { Task 
const &task, MapTaskInput const &input, MapTaskOutput &output); - virtual void map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output); + virtual void replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output); virtual void select_task_variant(const MapperContext ctx, Task const &task, SelectVariantInput const &input, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index dd6dc76b4d..95be9ab581 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -202,6 +202,7 @@ enum TaskIDs { // NCCL tasks NCCL_GETUNIQUEID_TASK_ID, NCCL_INIT_COMMS_TASK_ID, + NCCL_FINISH_COMMS_TASK_ID, // Search STRATEGY_SEARCH_TASK_ID, // Graph @@ -397,6 +398,7 @@ std::vector class FFModel { public: FFModel(FFConfig &config, bool cpu_offload = false); + ~FFModel(); static constexpr float PROPAGATION_CHANCE = 0.25; static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 73c2c3e092..1b19bdb82f 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -406,6 +406,11 @@ class Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + finish_nccl_comms_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); #endif protected: void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 50a51705cd..4763eb1ef3 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -55,7 +55,6 @@ class InferenceManager { public: std::unordered_map> tensor_buffer; std::unordered_map model_weights_loaders; - int num_devices; }; struct Request { diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index bc26a79d3e..d46bfc2877 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -661,44 +661,37 @@ void FFMapper::map_task(const MapperContext ctx, } // for idx } -void FFMapper::map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output) { +void FFMapper::replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output) { // Should only be replicated for the top-level task assert((task.get_depth() == 0) && (task.regions.size() == 0)); const Processor::Kind target_kind = task.target_proc.kind(); - VariantID chosen_variant; + VariantID vid; { std::vector variant_ids; - runtime->find_valid_variants( - ctx, task.task_id, variant_ids, task.target_proc.kind()); + runtime->find_valid_variants(ctx, task.task_id, variant_ids, target_kind); // Currently assume there is exactly one variant assert(variant_ids.size() == 1); - chosen_variant = variant_ids[0]; + output.chosen_variant = variant_ids[0]; } - std::vector const &all_procs = all_procs_by_kind(target_kind); - // Place on replicate on each node by default - output.task_mappings.resize(total_nodes, default_output); - // Assume default_output does not include any target_procs - assert(default_output.target_procs.size() == 0); - for (std::vector::const_iterator it = all_procs.begin(); - it != all_procs.end(); + output.target_processors.resize(total_nodes); + std::vector handled(total_nodes, false); 
+ size_t count = 0; + Machine::ProcessorQuery procs(machine); + procs.only_kind(target_kind); + for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end(); it++) { - AddressSpace space = it->address_space(); - assert(space < output.task_mappings.size()); - // Add *it as a target_proc if we haven't found one - if (output.task_mappings[space].target_procs.size() == 0) { - output.task_mappings[space].target_procs.push_back(*it); + const AddressSpace space = it->address_space(); + if (handled[space]) { + continue; } + output.target_processors[space] = *it; + handled[space] = true; + count++; } - output.control_replication_map.resize(total_nodes); - for (int idx = 0; idx < total_nodes; idx++) { - output.task_mappings[idx].chosen_variant = chosen_variant; - output.control_replication_map[idx] = - output.task_mappings[idx].target_procs[0]; - } + assert(count == total_nodes); } void FFMapper::select_task_variant(const MapperContext ctx, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 03c9e48af8..0c7a0f78fe 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -467,12 +467,8 @@ OpMeta *Linear::init_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - // TensorAccessorW acc_kernel(regions[2], - // task->regions[2], - // FID_DATA, - // ctx, - // runtime, - // false /*readOutput*/); + TensorAccessorR acc_kernel( + regions[2], task->regions[2], FID_DATA, ctx, runtime); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 6588cbceeb..2a94df8b4d 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,33 +28,7 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager() { -#ifdef DEADCODE - num_devices = ff_config.workersPerNode * ff_config.numNodes; - // Check parallelization degrees - assert(ff_config.data_parallelism_degree <= num_devices && - "Data parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.data_parallelism_degree == 0 && - "Number of available devices is not divisible by data parallelism " - "degree"); - assert(ff_config.tensor_parallelism_degree <= num_devices && - "Tensor parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.tensor_parallelism_degree == 0 && - "Number of available devices is not divisible by tensor parallelism " - "degree"); - assert(ff_config.pipeline_parallelism_degree <= num_devices && - "Pipeline parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.pipeline_parallelism_degree == 0 && - "Number of available devices is not divisible by pipeline parallelism " - "degree"); - assert(ff_config.data_parallelism_degree * - ff_config.tensor_parallelism_degree * - ff_config.pipeline_parallelism_degree == - num_devices && - "Product of data, tensor, and pipeline parallelism degrees does not " - "match the number of available devices"); -#endif -} +InferenceManager::InferenceManager() {} InferenceManager *inference_manager_singleton = nullptr; @@ -296,8 +270,6 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { void InferenceManager::init_operators_inference(FFModel *model) { for (int batch_index = 0; batch_index < model->config.data_parallelism_degree; batch_index++) { - int expert_device_index = 0; - int 
device_index = batch_index % num_devices; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c07c33efca..440ae19047 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -606,6 +606,15 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, // ncclComm, allRanks, myRank, ncclId); return ncclComm; } + +void Op::finish_nccl_comms_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ncclComm_t comm = *((ncclComm_t *)task->local_args); + checkNCCL(ncclCommFinalize(comm)); + checkNCCL(ncclCommDestroy(comm)); +} #endif /** @@ -1578,6 +1587,45 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +FFModel::~FFModel() { + // Destroy nccl communication groups +#ifdef FF_USE_NCCL + if (config.computationMode == COMP_MODE_TRAINING) { + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } + } +#endif +} + void FFModel::clear_graph_search_cache() { this->graph_search->clear_cache(); this->search->clear_cache(); @@ -6853,6 +6901,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(NCCL_FINISH_COMMS_TASK_ID, + "NCCL Finish Communicators"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "NCCL Finish Communicators Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } #endif // Search { From be28d718c06c199866126a8bf4f1e35dfc4509a1 Mon Sep 17 00:00:00 2001 From: April Yang <114364211+april-yyt@users.noreply.github.com> Date: Sun, 4 Feb 2024 19:58:39 -0800 Subject: [PATCH 50/61] Docs Modification for Python Usecases (#1291) * modify README * fix link issues * update legion version --------- Co-authored-by: Zhihao Jia --- SERVE.md | 69 +++++++++++++++++++++++++++++++++ docs/source/chatbot.rst | 4 +- docs/source/prompt_template.rst | 4 +- docs/source/rag.rst | 4 +- docs/source/serve_fastapi.rst | 4 +- 5 files changed, 77 insertions(+), 8 deletions(-) diff --git a/SERVE.md b/SERVE.md index e64756e8f4..e9bab3d702 100644 --- a/SERVE.md +++ b/SERVE.md @@ -182,6 +182,75 @@ FlexFlow Serve supports int4 and int8 quantization. 
The compressed tensors are s ### Prompt Datasets We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json). + + + +## Python Interface Features and Interaction Methods + +FlexFlow Serve provides a comprehensive Python interface for serving with low latency and high performance. This interface facilitates the deployment and interaction with the serving platform for a variety of applications, from chatbots and prompt templates to retrieval augmented generation and API services. + +### Chatbot with Gradio + +The Python interface allows setting up a chatbot application using Gradio, enabling interactive dialogues with users through a user-friendly web interface. + +#### Implementation Steps +1. **FlexFlow Initialization:** Configure and initialize FlexFlow Serve with the desired settings and the specific LLM. +```python +import gradio as gr +import flexflow.serve as ff + +ff.init(num_gpus=2, memory_per_gpu=14000, ...) +``` +2. **Gradio Interface Setup:** Implement a function to generate responses from user inputs and set up the Gradio Chat Interface for interaction. +```python +def generate_response(user_input): + result = llm.generate(user_input) + return result.output_text.decode('utf-8') +``` +3. **Running the Interface:** Launch the Gradio interface to interact with the LLM through a web-based chat interface. +```python +iface = gr.ChatInterface(fn=generate_response) +iface.launch() +``` +4. **Shutdown:** Properly stop the FlexFlow server after interaction is complete. + + + +### Langchain Usecases +FlexFlow Serve supports langchain usecases including dynamic prompt template handling and RAG usecases, enabling the customization of model responses based on structured input templates and Retrieval Augmented Generation. + +#### Implementation Steps +1. **FlexFlow Initialization**: Start by initializing FlexFlow Serve with the appropriate configurations. +2. **LLM Setup**: Compile and load the LLM for text generation. +3. **Prompt Template/RAG Setup**: Configure prompt templates to guide the model's responses. +4. **Response Generation**: Use the LLM with the prompt template to generate responses. + + +### Python FastAPI Entrypoint +Flexflow Serve also supports deploying and managing LLMs with FastAPI, offering a RESTful API interface for generating responses from models. + +```python +@app.on_event("startup") +async def startup_event(): + global llm + # Initialize and compile the LLM model + llm.compile( + generation_config, + # ... other params as needed + ) + llm.start_server() + +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + # ... exception handling + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + # ... split prompt and response text for returning results + return {"prompt": prompt_request.prompt, "response": full_output} +``` + + + + ## TODOs FlexFlow Serve is still under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions. 
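For completeness: unlike the chatbot and FastAPI subsections, the "Langchain Usecases" steps added to SERVE.md above stop short of a code listing. The sketch below illustrates that flow under stated assumptions — the model name, the `ff.init`/`ff.GenerationConfig` parameter values, and the exact `compile` arguments are illustrative placeholders rather than values taken from this patch; only the call pattern (`ff.init`, `llm.compile`, `llm.start_server`, `llm.generate`, `llm.stop_server`) follows the snippets shown in the section above.

```python
import flexflow.serve as ff
from langchain.prompts import PromptTemplate

# 1. FlexFlow Initialization (parameter values are illustrative)
ff.init(num_gpus=2, memory_per_gpu=14000, zero_copy_memory_per_node=30000)

# 2. LLM Setup: load the model, compile it, and start the background server
llm = ff.LLM("meta-llama/Llama-2-7b-hf")  # model name is an assumption
generation_config = ff.GenerationConfig(do_sample=False)
llm.compile(generation_config)
llm.start_server()

# 3. Prompt Template Setup
template = PromptTemplate.from_template(
    "Answer the question in one short paragraph: {question}"
)

# 4. Response Generation: render the template and feed it to the LLM
prompt = template.format(question="What is speculative inference?")
result = llm.generate(prompt)
print(result.output_text.decode("utf-8"))

llm.stop_server()
```

To drive the model through a full LangChain `LLMChain` instead (as the RAG example earlier in this series does via `ff_llm_wrapper`), a LangChain-compatible wrapper class around the FlexFlow LLM is needed in addition to the steps above.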
diff --git a/docs/source/chatbot.rst b/docs/source/chatbot.rst index fc6f616fae..c41307e231 100644 --- a/docs/source/chatbot.rst +++ b/docs/source/chatbot.rst @@ -42,9 +42,9 @@ Example Complete code example can be found here: -1. `Chatbot Example with incremental decoding `__ +1. `Chatbot Example with incremental decoding `__ -2. `Chatbot Example with speculative inference `__ +2. `Chatbot Example with speculative inference `__ Example Implementation: diff --git a/docs/source/prompt_template.rst b/docs/source/prompt_template.rst index 4e0f1beab5..7f987b0f18 100644 --- a/docs/source/prompt_template.rst +++ b/docs/source/prompt_template.rst @@ -34,9 +34,9 @@ Example Complete code example can be found here: -1. `Prompt Template Example with incremental decoding `__ +1. `Prompt Template Example with incremental decoding `__ -2. `Prompt Template Example with speculative inference `__ +2. `Prompt Template Example with speculative inference `__ Example Implementation: diff --git a/docs/source/rag.rst b/docs/source/rag.rst index 4b869c2352..640b2fe131 100644 --- a/docs/source/rag.rst +++ b/docs/source/rag.rst @@ -34,9 +34,9 @@ Example A complete code example for a web-document Q&A using FlexFlow can be found here: -1. `Rag Q&A Example with incremental decoding `__ +1. `Rag Q&A Example with incremental decoding `__ -2. `Rag Q&A Example with speculative inference `__ +2. `Rag Q&A Example with speculative inference `__ Example Implementation: diff --git a/docs/source/serve_fastapi.rst b/docs/source/serve_fastapi.rst index 0aa6634670..62a28e5937 100644 --- a/docs/source/serve_fastapi.rst +++ b/docs/source/serve_fastapi.rst @@ -101,6 +101,6 @@ Full API Entrypoint Code A complete code example for a web-document Q&A using FlexFlow can be found here: -1. `FastAPI Example with incremental decoding `__ +1. `FastAPI Example with incremental decoding `__ -2. `FastAPI Example with speculative inference `__ +2. 
`FastAPI Example with speculative inference `__ From ec2002e98a40bc7814ba38ba5dbc0ba87c9727e3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 15 Feb 2024 22:16:52 +0000 Subject: [PATCH 51/61] fix --- src/runtime/inference_manager.cc | 2 +- tests/peft/alignment/align_test_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 15d02edbbb..e480e74baa 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -617,9 +617,9 @@ void FFModel::compile_inference() { op->op_type == OP_RESIDUAL_RMS_NORM || op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { - reset_inputs.insert(op->inputs[0]->region); op->reset_input_grads[0] = false; } + reset_inputs.insert(op->inputs[i]->region); } else { reset_inputs.insert(op->inputs[i]->region); } diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index b0cb5fe428..dbe7a0be40 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -1,8 +1,8 @@ import os, re, torch import numpy as np abs_dirname = os.path.dirname(os.path.abspath(__file__)) -hf_path = os.path.join(abs_dirname, "hf_peft_tensors") -ff_path = os.path.join(os.path.dirname(os.path.dirname(abs_dirname)), "build", "inference_tensors") +hf_path = os.path.join(os.path.dirname(abs_dirname), "hf_peft_tensors") +ff_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(abs_dirname))), "build", "inference_tensors") def print_unique_files_list(dirname): files_list = os.listdir(dirname) for f in sorted(files_list): From 098e88016fe8557da498ae876701f96df46ae966 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Feb 2024 02:48:11 +0000 Subject: [PATCH 52/61] fix residual rms --- .../ops/kernels/residual_rms_norm_kernels.h | 7 +- src/ops/fused.cu | 3 +- src/ops/kernels/residual_rms_norm_kernels.cu | 59 ++++---- src/ops/residual_rms_norm.cc | 138 ++++++++++++------ 4 files changed, 134 insertions(+), 73 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 6eb5c0ae21..dfc9937cc3 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -66,9 +66,10 @@ void backward_kernel_wrapper( GenericTensorAccessorW const &weight_grad); void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, BatchConfig const *bc, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, GenericTensorAccessorR const &weight); } // namespace ResidualRMSNorm } // namespace Kernels diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 55892ab7e9..c589f6a5be 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1026,9 +1026,10 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, bc, - my_output_grad_accessor[1], my_input_grad_accessor[0], my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], my_weight_accessor[0]); break; } diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu 
b/src/ops/kernels/residual_rms_norm_kernels.cu index 969c6458a4..4b92e70787 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -332,6 +332,7 @@ __global__ void ComputeInternalGradientsCUDAKernel( template __global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, T const *dY, T const *X, T const *gamma, @@ -351,7 +352,7 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, if (reset_input_grad1) { dX1[index] = static_cast(dX_val); } else { - dX1[index] += static_cast(dX_val); + dX1[index] = dX1_residual[index] + static_cast(dX_val); } if (reset_input_grad2) { dX2[index] = static_cast(dX1[index]); @@ -399,6 +400,7 @@ void backward_kernel(ResidualRMSNormMeta const *m, RMSNormBackwardCUDAKernel<<>>( N, + nullptr, output_grad_ptr, residual_output_rms_input_ptr, weight_ptr, @@ -421,9 +423,10 @@ void backward_kernel(ResidualRMSNormMeta const *m, template void peft_bwd_kernel(ResidualRMSNormMeta const *m, BatchConfig const *bc, - T const *output_grad_ptr, - T *residual_input0_grad_ptr, - T *residual_input1_grad_ptr, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, T const *weight_ptr, cudaStream_t stream) { for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -448,7 +451,7 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, ComputeInternalGradientsCUDAKernel <<>>( N, - output_grad_ptr, + output_grad_1_ptr, residual_output_rms_input_ptr, weight_ptr, static_cast(m->rms_ptr), @@ -457,13 +460,14 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, RMSNormBackwardCUDAKernel <<>>( N, - output_grad_ptr, + output_grad_0_ptr, + output_grad_1_ptr, residual_output_rms_input_ptr, weight_ptr, static_cast(m->rms_ptr), static_cast(m->norm_ptr), - residual_input0_grad_ptr, - residual_input1_grad_ptr, + input_grad_0_ptr, + input_grad_1_ptr, m->reset_input_grads[0], m->reset_input_grads[1]); } @@ -532,17 +536,12 @@ void backward_kernel_wrapper( } } -/* - regions[0](I): RMS output_grad - regions[1](I/O): Residual input 0 grad - regions[2](I/O): Residual input 1 grad - regions[3](I): weight -*/ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, BatchConfig const *bc, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, GenericTensorAccessorR const &weight) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -552,24 +551,28 @@ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - assert(output_grad.data_type == residual_input0_grad.data_type); - assert(residual_input0_grad.data_type == residual_input1_grad.data_type); - assert(residual_input1_grad.data_type == weight.data_type); + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); - if (output_grad.data_type == DT_HALF) { + if (output_grad_1.data_type == DT_HALF) { peft_bwd_kernel(m, bc, - output_grad.get_half_ptr(), - residual_input0_grad.get_half_ptr(), - residual_input1_grad.get_half_ptr(), + m->reset_input_grads[0] ? 
nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), weight.get_half_ptr(), stream); - } else if (output_grad.data_type == DT_FLOAT) { + } else if (output_grad_1.data_type == DT_FLOAT) { peft_bwd_kernel(m, bc, - output_grad.get_float_ptr(), - residual_input0_grad.get_float_ptr(), - residual_input1_grad.get_float_ptr(), + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), weight.get_float_ptr(), stream); } else { diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 28fafcf224..c0e517f5c4 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -732,37 +732,47 @@ Legion::FutureMap 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - // regions[0](I): RMS output_grad - launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_outputs[1]->region_grad)); - launcher.add_field(0, FID_DATA); - // regions[2](I/O): residual input grad 0 - launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, - 0 /*projection id*/, - reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region_grad)); - launcher.add_field(1, FID_DATA); - // regions[3](I/O): residual input grad 1 + int fid = 0; + // residual input grad 0 + launcher.add_region_requirement(RegionRequirement( + batch_inputs[0]->part_grad, + 0 /*projection id*/, + inplace_residual && !reset_input_grads[0] ? READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // residual input grad 1 launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); - launcher.add_field(2, FID_DATA); - // regions[4](I): gamma + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual && !reset_input_grads[0]) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + } + // RMS output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // gamma launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -776,45 +786,91 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 4); - assert(regions.size() == 4); ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + int expected_regions = + (m->inplace_residual || m->reset_input_grads[0]) ? 
4 : 5; + assert(task->regions.size() == expected_regions); + assert(regions.size() == expected_regions); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_peft_tokens() == 0) { return; } - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_input0_grad = + + int rid = 0, t_rid = 0; + GenericTensorAccessorW input_grad_0 = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[1], - task->regions[1], + regions[rid++], + task->regions[t_rid++], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_input1_grad = + GenericTensorAccessorW input_grad_1 = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[2], - task->regions[2], + regions[rid++], + task->regions[t_rid++], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + + GenericTensorAccessorR output_grad_0; + if (!m->reset_input_grads[0]) { + if (m->inplace_residual) { + // mapped to input 0 + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + } + } + GenericTensorAccessorR output_grad_1 = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + peft_bwd_kernel_wrapper( - m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + m, bc, output_grad_0, output_grad_1, input_grad_0, input_grad_1, weight); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - ResidualRMSNorm::save_inference_tensors_to_file( - m, - shard_id, - bc, - {residual_input0_grad, residual_input1_grad}, - {weight}, - {output_grad}, - false); + if (!m->reset_input_grads[0]) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_0, output_grad_1}, + false); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_1}, + false); + } } } From 5688e16b374c6cd1b95433879ec68c9b002248d7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Feb 2024 05:02:10 +0000 Subject: [PATCH 53/61] fix --- src/ops/fused.cc | 2 +- src/ops/fused.cu | 43 ++++++++++++++++++++++++------------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index bdb6d4d7a2..4c934f8612 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -652,7 +652,7 @@ FutureMap FusedOp::inference(FFModel const &ff, offset += numOutputs; // add softmax output grad if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { - printf("operator %i is last SOFTMAX! adding output %i\n", + printf("operator %i is last SOFTMAX! 
adding grad for output %i\n", numOperators - 1, numOutputs - 1); assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index c589f6a5be..b89b6909cf 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -44,6 +44,7 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/ffconst_utils.h" namespace FlexFlow { // declare Legion names @@ -161,6 +162,9 @@ __host__ void int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif // Domain my_id[MAX_NUM_INPUTS]; // Domain my_wd[MAX_NUM_WEIGHTS]; // Domain my_od[MAX_NUM_OUTPUTS]; @@ -172,9 +176,15 @@ __host__ void if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; my_input_accessor[i] = input_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = input_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; my_input_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } @@ -191,6 +201,9 @@ __host__ void // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_output_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -439,13 +452,14 @@ __host__ void assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -668,22 +682,13 @@ __host__ void std::vector weight_accessors_to_save; std::vector output_accessors_to_save; for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } + input_accessors_to_save.push_back(my_input_accessor[i]); } for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); + weight_accessors_to_save.push_back(my_weight_accessor[i]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); + output_accessors_to_save.push_back(my_output_accessor[i]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; From 9225e0c966cc5156ee6967c25be62c59bb1c2b4b Mon Sep 
17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Feb 2024 05:39:37 +0000 Subject: [PATCH 54/61] fix --- src/ops/fused.cu | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index b89b6909cf..33b0aeca19 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -441,11 +441,12 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { @@ -805,6 +806,9 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, } for (int op = fused->numOperators - 1; op >= 0; op--) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif ioff -= fused->op_num_inputs[op]; woff -= fused->op_num_weights[op]; ooff -= fused->op_num_outputs[op]; @@ -813,9 +817,15 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = input_grad_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } @@ -832,6 +842,9 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_output_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { From e12bff14f266d4b6ee1d868c3e883c76b916079a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 02:19:11 +0000 Subject: [PATCH 55/61] enable inf debugging in fusion bwd --- src/ops/fused.cu | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 33b0aeca19..965e08d6f9 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1195,6 +1195,29 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + 
false); + } } } From ed9afb7c0e1bff9f4966ff0afbe6c3b55e2e9cf5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 02:25:47 +0000 Subject: [PATCH 56/61] hack to silence warning in fused bwd --- src/ops/fused.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 4c934f8612..a81bf716bd 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -652,9 +652,9 @@ FutureMap FusedOp::inference(FFModel const &ff, offset += numOutputs; // add softmax output grad if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { - printf("operator %i is last SOFTMAX! adding grad for output %i\n", - numOperators - 1, - numOutputs - 1); + // printf("operator %i is last SOFTMAX! adding grad for output %i\n", + // numOperators - 1, + // numOutputs - 1); assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); launcher.add_region_requirement( RegionRequirement(batch_outputs[numOutputs - 1]->part_grad, @@ -700,7 +700,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[i]->part_grad, 0 /*projection id*/, - READ_WRITE, + WRITE_ONLY, EXCLUSIVE, batch_inputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); @@ -721,7 +721,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_outputs[i]->part_grad, 0 /*projection id*/, - READ_WRITE, + i == numOutputs -1 ? READ_WRITE : WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); From 96d0e9b00fc1e33ec34e682f8b231b098f52bffc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 02:43:25 +0000 Subject: [PATCH 57/61] fix --- src/ops/fused.cc | 2 +- src/ops/fused.cu | 35 ++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index a81bf716bd..d5f1ace86d 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -721,7 +721,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_outputs[i]->part_grad, 0 /*projection id*/, - i == numOutputs -1 ? READ_WRITE : WRITE_ONLY, + i == numOutputs - 1 ? 
READ_WRITE : WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 965e08d6f9..99d9e3410f 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -14,6 +14,7 @@ */ #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -44,7 +45,6 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" -#include "flexflow/ffconst_utils.h" namespace FlexFlow { // declare Legion names @@ -444,9 +444,9 @@ __host__ void RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; Kernels::RMSNorm::inference_kernel_wrapper(m, bc, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { @@ -454,13 +454,14 @@ __host__ void assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::inference_kernel_wrapper(m, - bc, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -678,7 +679,11 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; std::vector output_accessors_to_save; @@ -1195,7 +1200,11 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; std::vector output_accessors_to_save; From 2cbc0b717bd5063627595059ece7c357f74cba23 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 04:31:05 +0000 Subject: [PATCH 58/61] fix --- src/runtime/model.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 603e87a937..10ce05ca1e 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3798,9 +3798,16 @@ bool FFModel::check_operators_integrity( } for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(FusedOp::use_same_regions( - fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + 
old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { + assert(FusedOp::use_same_regions(fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + } else { + assert(FusedOp::use_same_regions(fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; From 36cb2b39d1ff573e2b8f60dcc81deb1b4a4378f0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 05:39:13 +0000 Subject: [PATCH 59/61] fix build --- inference/incr_decoding/incr_decoding.cc | 2 +- src/c/flexflow_c.cc | 2 +- src/ops/arg_topk.cc | 5 +-- src/ops/inc_multihead_self_attention.cu | 1 + src/ops/sigmoid_silu_multi.cc | 4 --- src/runtime/inference_manager.cc | 4 +-- src/runtime/model.cc | 14 +++++---- src/runtime/request_manager.cc | 39 ++++++++++++++---------- 8 files changed, 39 insertions(+), 32 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 7f2ea21148..d376c3e39c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -270,7 +270,7 @@ void FlexFlow::top_level_task(Task const *task, : model.register_peft_model( LoraLinearConfig::DefaultConfig /*mlp_first*/, mlp_second /*mlp_second*/); - + // Start background server rm->start_background_server(&model); diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index a9ba9158ee..58acf3d010 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1616,7 +1616,7 @@ void flexflow_model_generate(flexflow_model_t handle_, text_str.c_str(), max_seq_length); } - + std::vector results = handle->generate(requests); // If the prompt exceeds max seq len, check that we return the prompt with no diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 53332791c4..53b259a703 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -431,9 +431,10 @@ BeamInferenceResult ArgTopK::inference_speculative_task( ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); - download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + copy_tensor_dev_to_host( + probs.get_float_ptr(), ir.probs, batch_size * m->k); return ir; } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 92bafaead3..83fdbaf927 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1644,6 +1644,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, // Copy C_softmax to m->softmax_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->softmax_activation_buffer = allocator->allocate_instance_untyped( sizeof(DT) * total_tokens * num_new_tokens * m->num_q_heads); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index e87bd16699..98cd662efd 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -570,10 +570,6 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); - size_t name_len; - char name[MAX_OPNAME] = {0}; - dez.deserialize(name_len); - dez.deserialize(name, name_len); SigmoidSiluMultiParams params; params.layer_guid = layer_guid; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 34c807dee4..91a6dab9b5 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -236,8 +236,8 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // Check whether we need to reset input grads // We use a parallel tensor's region as the key std::set reset_inputs; - for (int l = operators.size() - 1; l >= 0; l--) { - Op *op = operators[l]; + for (int l = model->operators.size() - 1; l >= 0; l--) { + Op *op = model->operators[l]; for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->region != LogicalRegion::NO_REGION); if (reset_inputs.find(op->inputs[i]->region) != reset_inputs.end()) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 10ce05ca1e..a64fb8ec9c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3799,14 +3799,16 @@ bool FFModel::check_operators_integrity( for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || - (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && - (old_op->op_type == OP_RESIDUAL_LAYERNORM || - old_op->op_type == OP_RESIDUAL_RMS_NORM || - old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { - assert(FusedOp::use_same_regions(fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->outputs[i], pt_mapping)); } else { - assert(FusedOp::use_same_regions(fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + assert(FusedOp::use_same_regions( + fused->inputs[my_off], old_op->outputs[i], pt_mapping)); } } ioff += fused->op_num_inputs[op]; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7bc1966abf..41c371d4e2 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -435,12 +435,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } else { int processed_tokens = - old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; + old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); bool request_completed = false; // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + 
if (request.tokens.size() >= + old_bc.requestsInfo[i].max_sequence_length) { request_completed = true; } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id @@ -469,7 +470,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, log_req_mgr.print("Final output: %s", output.c_str()); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.finish_time = + Realm::Clock::current_time_in_microseconds(); total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; @@ -486,10 +488,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; + << std::setprecision(3) << total_request_run_time + << std::endl; outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -509,11 +511,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = + processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = + new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -527,10 +532,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + get_max_tokens_per_batch() - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].first_token_depth_in_request); new_bc.requestsInfo[i].prompt_phase = true; } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -538,7 +543,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[depth]; new_bc.num_tokens++; } // Update profiling @@ -2399,7 +2405,8 @@ std::vector> return merged_tree; } -std::vector FFModel::generate(std::vector const &requests) { +std::vector + FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); std::vector guids; for (int i = 0; i < 
requests.size(); i++) { From 21b77f11c3cacb06c294bdb17a2b3be52e8fdb83 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 18:47:01 +0000 Subject: [PATCH 60/61] fix --- src/ops/noop.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ops/noop.cc b/src/ops/noop.cc index dabdf835dd..45bd76d59d 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -90,9 +90,10 @@ OpMeta *NoOp::init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + NoOp *no_op = (NoOp *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - // OpMeta *m = new OpMeta(handle); - return nullptr; + OpMeta *m = new OpMeta(handle, no_op); + return m; } void NoOp::init_inference(FFModel const &ff, @@ -167,7 +168,7 @@ void NoOp::init_inference(FFModel const &ff, set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -244,7 +245,7 @@ void NoOp::init(FFModel const &ff) { set_argumentmap_for_init(ff, argmap); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, From 9075d3fb7ea3ef893c46f554551f681a109d8f90 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 20:29:41 +0000 Subject: [PATCH 61/61] fix --- python/flexflow/core/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 2614518acf..522dbe7e44 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -88,7 +88,7 @@ "offload_reserve_space_size": "-offload-reserve-space-size", "use_4bit_quantization": "--4bit-quantization", "use_8bit_quantization": "--8bit-quantization", - "enable_peft": "", + "enable_peft": "-enable-peft", "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", }
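A note on the final one-line fix above: the dictionary in `python/flexflow/core/__init__.py` maps FlexFlow's Python configuration keys to command-line flag names, which suggests the previous empty string left `enable_peft` without a corresponding runtime flag. A minimal sketch of how the corrected entry might be exercised from Python — assuming, as the surrounding keys suggest, that these dictionary keys correspond to `ff.init` keyword arguments; the other values are illustrative:

```python
import flexflow.serve as ff

# With the fix, enable_peft=True should be translated into the "-enable-peft"
# runtime flag rather than an empty string.
ff.init(
    num_gpus=2,                      # illustrative values
    memory_per_gpu=14000,
    zero_copy_memory_per_node=30000,
    enable_peft=True,
)
```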