Add an LLM demo to apps/ #8216

Open · wants to merge 20 commits into main
108 changes: 108 additions & 0 deletions apps/hallmark/CMakeLists.txt
@@ -0,0 +1,108 @@
cmake_minimum_required(VERSION 3.22)
project(hallmark)

# We may need to set this for some of the subprojects pulled in by TFLite (e.g. flatbuffers)
# set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

enable_testing()

# ----------------------------

# Compatibility cruft
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif()

# ----------------------------

# Set up language settings
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED YES)
set(CMAKE_CXX_EXTENSIONS NO)

# ----------------------------

# Find Halide -- this provides the Runtime headers and the CMake helper functions we use below
find_package(Halide REQUIRED)
find_package(ZLIB REQUIRED)

# ----------------------------

# go get various deps we need

include(FetchContent)

set(ABSL_PROPAGATE_CXX_STD ON)
set(ABSL_USE_EXTERNAL_GOOGLETEST ON)
FetchContent_Declare(abseil
GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
GIT_TAG 20240116.2)

set(FLATBUFFERS_BUILD_TESTS OFF)
set(FLATBUFFERS_INSTALL OFF)
FetchContent_Declare(flatbuffers
GIT_REPOSITORY https://github.com/google/flatbuffers.git
GIT_TAG v23.5.26
GIT_SHALLOW TRUE)

set(BUILD_GMOCK OFF)
set(INSTALL_GTEST OFF)
set(GTEST_HAS_ABSL OFF)
FetchContent_Declare(googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.14.0)

set(BENCHMARK_ENABLE_TESTING OFF)
set(BENCHMARK_ENABLE_EXCEPTIONS OFF)
set(BENCHMARK_ENABLE_INSTALL OFF)
set(BENCHMARK_INSTALL_DOCS OFF)
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
FetchContent_Declare(googlebenchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.8.3)

set(protobuf_INSTALL OFF)
set(protobuf_BUILD_TESTS OFF)
set(protobuf_BUILD_CONFORMANCE OFF)
set(protobuf_BUILD_EXAMPLES OFF)
set(protobuf_BUILD_PROTOC_BINARIES ON)
set(protobuf_BUILD_LIBPROTOC OFF)
set(protobuf_BUILD_LIBUPB OFF)
set(protobuf_DISABLE_RTTI ON)
set(protobuf_WITH_ZLIB ON CACHE BOOL "" FORCE)
FetchContent_Declare(protobuf
GIT_REPOSITORY https://github.com/protocolbuffers/protobuf.git
GIT_TAG v26.1
GIT_SHALLOW TRUE)

# TODO(srj,zvookin): Exact parameters should be double checked.
FetchContent_Declare(sentencepiece
GIT_REPOSITORY https://github.com/google/sentencepiece
GIT_TAG v0.2.0 # Old: 53de76561cfc149d3c01037f0595669ad32a5e7c
)

FetchContent_MakeAvailable(abseil flatbuffers googletest googlebenchmark protobuf sentencepiece)

# ---------- Set up targets for flatc
add_library(hallmark_flatbuffers INTERFACE)
target_sources(hallmark_flatbuffers INTERFACE $<BUILD_INTERFACE:$<TARGET_OBJECTS:flatbuffers>>)
target_include_directories(hallmark_flatbuffers
SYSTEM # Use -isystem instead of -I; this is a trick so that clang-tidy won't analyze these includes
INTERFACE
$<BUILD_INTERFACE:${flatbuffers_SOURCE_DIR}>/include
$<BUILD_INTERFACE:${flatbuffers_BINARY_DIR}>/include)
set_target_properties(hallmark_flatbuffers PROPERTIES EXPORT_NAME flatbuffers)
add_executable(flatbuffers::flatc ALIAS flatc)

# ---------- Set up targets for protobuf
FetchContent_GetProperties(protobuf SOURCE_DIR protobuf_SOURCE_DIR)
# Include the script which defines 'protobuf_generate'
include(${protobuf_SOURCE_DIR}/cmake/protobuf-generate.cmake)

# ----------------------------

add_subdirectory(contrib)
add_subdirectory(src)
add_subdirectory(test)

# ----------------------------
17 changes: 17 additions & 0 deletions apps/hallmark/README.md
@@ -0,0 +1,17 @@
Hallmark (HAlide LLM Advanced Research Kit) is a Halide-written execution engine for Gemini/Gemma models;
it serves as a testbed for writing efficient ML kernels in Halide.

To build with CMake:

- build and install Halide locally to ${HALIDE_INSTALL}
- cd apps/hallmark
- mkdir build && cd build
- cmake .. -DHalide_DIR=${HALIDE_INSTALL}/lib/cmake/Halide -DCMAKE_BUILD_TYPE=Release
- ninja (or make)

To run the tests:
- ./build/test/llm_generator_test --model_path=/path/to/model.tflite

To run the benchmarks:
- ./build/test/llm_generator_bench --model_path=/path/to/model.tflite --benchmark_filter=all

41 changes: 41 additions & 0 deletions apps/hallmark/contrib/CMakeLists.txt
@@ -0,0 +1,41 @@

# --------------------------- Generate flatbuffer files
set(tflite_schema_source "${CMAKE_CURRENT_SOURCE_DIR}/tflite_schema.fbs")
set(tflite_generated_header "${CMAKE_CURRENT_BINARY_DIR}/tflite_schema_generated.h")
add_custom_command(
OUTPUT "${tflite_generated_header}"
COMMAND flatbuffers::flatc --cpp --cpp-std C++17 --no-union-value-namespacing --keep-prefix -o "${CMAKE_CURRENT_BINARY_DIR}" "${tflite_schema_source}"
DEPENDS "${tflite_schema_source}"
VERBATIM
)
add_custom_target(generate_tflite_schema_header DEPENDS "${tflite_generated_header}")
set_source_files_properties("${tflite_generated_header}" PROPERTIES GENERATED TRUE)

# --------------------------- Generate protobuf files
add_library(proto_objects OBJECT llm_params.proto transformer_params.proto)
target_link_libraries(proto_objects PUBLIC protobuf::libprotobuf)
target_include_directories(proto_objects PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>")

protobuf_generate(
TARGET proto_objects
PROTOC_OUT_DIR "${CMAKE_CURRENT_BINARY_DIR}")


# --------------------------- Ordinary code
add_library(hallmark_contrib
llm_params.cc
sampler.cc
weights_loader.cc)
add_dependencies(hallmark_contrib
generate_tflite_schema_header)
target_include_directories(hallmark_contrib INTERFACE
$<BUILD_INTERFACE:${hallmark_SOURCE_DIR}>)
target_include_directories(hallmark_contrib PRIVATE
$<BUILD_INTERFACE:${hallmark_SOURCE_DIR}>
$<BUILD_INTERFACE:${hallmark_BINARY_DIR}>)
target_link_libraries(hallmark_contrib
PRIVATE
absl::status
Halide::Runtime
hallmark_flatbuffers
proto_objects)
194 changes: 194 additions & 0 deletions apps/hallmark/contrib/llm_params.cc
@@ -0,0 +1,194 @@
#include "contrib/llm_params.h"

#include "contrib/memory_mapped_file.h"
#include "contrib/status_helpers.h"
#include "contrib/llm_params.pb.h"
#include "contrib/transformer_params.pb.h"
// macOS system headers #define this value in syslimits.h
#undef ARG_MAX
#include "contrib/tflite_schema_generated.h"

namespace hallmark {

namespace {

using odml::infra::proto::LlmParameters;
using odml::infra::proto::TransformerParameters;

const ::tflite::Metadata *FindMetadata(const ::tflite::Model *tflite_model,
std::string name) {
if (tflite_model->metadata() == nullptr) {
return nullptr;
}

for (const auto *metadata : *tflite_model->metadata()) {
if (name == metadata->name()->c_str()) {
return metadata;
}
}
return nullptr;
}

LlmParams::Norm TransformerParametersProtoNormTypeToLlmParamsNormType(
TransformerParameters::Norm norm_type) {
switch (norm_type) {
case TransformerParameters::NORM_UNSPECIFIED:
ABSL_LOG(DFATAL) << "Unspecified norm type.";
return LlmParams::Norm::UNSPECIFIED;
case TransformerParameters::NO_NORM:
return LlmParams::Norm::NO_NORM;
case TransformerParameters::RMS_NORM:
return LlmParams::Norm::RMS_NORM;
case TransformerParameters::LAYER_NORM:
return LlmParams::Norm::LAYER_NORM;
default:
ABSL_LOG(DFATAL) << "Unknown norm type: " << norm_type;
}
return LlmParams::Norm::UNSPECIFIED;
}

LlmParams FromLLMParametersProto(const LlmParameters &llm_params) {
const auto &transformer_params = llm_params.transformer_parameters();
LlmParams params = {
.num_transformer_M = static_cast<size_t>(transformer_params.num_stacks()),
.batch_size_B = static_cast<size_t>(transformer_params.batch_size()),
.seq_size_T = static_cast<size_t>(transformer_params.max_seq_length()),
.model_dim_D = static_cast<size_t>(transformer_params.embedding_dim()),
.hidden_dim_HD =
static_cast<size_t>(transformer_params.hidden_dimension()),
.head_dim_H = static_cast<size_t>(transformer_params.head_dimension()),
.n_heads_N = static_cast<size_t>(transformer_params.num_heads()),
.voc_size_V = static_cast<size_t>(llm_params.vocab_size()),

.num_kv_heads =
static_cast<size_t>(transformer_params.num_kv_heads() == 0 ? transformer_params.num_heads() : transformer_params.num_kv_heads()),
.enable_kv_cache = true,
.enable_dynamic_shape = false};
switch (
transformer_params.self_attention_parameters().attention_mask_type()) {
case TransformerParameters::UNSPECIFIED:
ABSL_LOG(DFATAL) << "Unspecified attention_mask_type, assuming causal";
params.model_type = LlmParams::ModelType::UNSPECIFIED;
break;
case TransformerParameters::CAUSAL:
params.model_type = LlmParams::ModelType::CAUSAL;
break;
case TransformerParameters::PREFIX:
params.model_type = LlmParams::ModelType::PREFIX;
break;
default:
ABSL_LOG(DFATAL) << "Unknown attention_mask_type: "
<< transformer_params.self_attention_parameters()
.attention_mask_type();
}
params.ff_params = LlmParams::FeedForwardParams{
.no_bias = transformer_params.feed_forward_parameters().no_bias(),
};
params.final_proj_params = LlmParams::FinalProjectParams{
.no_bias = transformer_params.final_project_parameters().no_bias(),
};
switch (transformer_params.feed_forward_parameters().activation()) {
case TransformerParameters::ACTIVATION_UNSPECIFIED:
ABSL_LOG(DFATAL) << "Unspecified feed_forward_parameters.activation.";
params.ff_params.activation = LlmParams::Activation::UNSPECIFIED;
break;
case TransformerParameters::GELU:
params.ff_params.activation = LlmParams::Activation::GELU;
break;
case TransformerParameters::SILU:
params.ff_params.activation = LlmParams::Activation::SILU;
break;
case TransformerParameters::RELU:
params.ff_params.activation = LlmParams::Activation::RELU;
break;
default:
ABSL_LOG(DFATAL)
<< "Unknown feed_forward_parameters.activation: "
<< transformer_params.feed_forward_parameters().activation();
}
params.sa_params.qkv_no_bias =
transformer_params.self_attention_parameters().qkv_no_bias();
params.sa_params.post_proj_no_bias =
transformer_params.self_attention_parameters().post_proj_no_bias();
params.sa_params.pre_norm =
TransformerParametersProtoNormTypeToLlmParamsNormType(
transformer_params.pre_norm());
params.sa_params.post_norm =
TransformerParametersProtoNormTypeToLlmParamsNormType(
transformer_params.post_norm());
params.sa_params.soft_cap_value =
transformer_params.self_attention_parameters().soft_cap_value();
params.ff_params.pre_norm =
TransformerParametersProtoNormTypeToLlmParamsNormType(
transformer_params.feed_forward_parameters().pre_norm());
params.ff_params.post_norm =
TransformerParametersProtoNormTypeToLlmParamsNormType(
transformer_params.feed_forward_parameters().post_norm());
params.final_norm = TransformerParametersProtoNormTypeToLlmParamsNormType(
transformer_params.final_norm());
params.skip_absolute_positional_embeddings =
transformer_params.skip_absolute_positional_embeddings();
if (transformer_params.self_attention_parameters()
.has_attention_scale_type()) {
switch (
transformer_params.self_attention_parameters().attention_scale_type()) {
case TransformerParameters::SCALE_TYPE_UNSPECIFIED:
ABSL_LOG(DFATAL) << "Unspecified attention_scale_type.";
params.sa_params.attention_scale_type =
LlmParams::AttentionScaleType::UNSPECIFIED;
break;
case TransformerParameters::SCALE_TYPE_PER_DIM_SCALE:
params.sa_params.attention_scale_type =
LlmParams::AttentionScaleType::PER_DIM_SCALE;
break;
case TransformerParameters::SCALE_TYPE_INV_SQRT_HEAD_DIM:
params.sa_params.attention_scale_type =
LlmParams::AttentionScaleType::INV_SQRT_HEAD_DIM;
break;
default:
ABSL_LOG(DFATAL) << "Unknown attention_scale_type: "
<< transformer_params.self_attention_parameters()
.attention_scale_type();
}
} else {
if (transformer_params.num_kv_heads() == 0 ||
transformer_params.num_heads() == transformer_params.num_kv_heads()) {
// If MHA, PER_DIM_SCALE is used.
params.sa_params.attention_scale_type =
LlmParams::AttentionScaleType::PER_DIM_SCALE;
} else {
// If MQA or GQA, INV_SQRT_HEAD_DIM is used.
params.sa_params.attention_scale_type =
LlmParams::AttentionScaleType::INV_SQRT_HEAD_DIM;
}
}

return params;
}

} // namespace

absl::StatusOr<LlmParams> LoadLlmParams(absl::string_view tflite_path) {
MemoryMappedFile file(tflite_path);
if (!file.valid()) {
return absl::InvalidArgumentError("Could not open file for llm_params");
}

const ::tflite::Model *tflite_model = ::tflite::GetModel(file.data());
const auto *metadata =
FindMetadata(tflite_model, "odml.infra.proto.LlmParameters");
if (!metadata) {
return absl::InvalidArgumentError("No metadata found in model");
}

const ::tflite::Buffer *buffer =
tflite_model->buffers()->Get(metadata->buffer());
const void *base = (const char *)file.data() + buffer->offset();
const size_t len = buffer->size();

LlmParameters llm_parameters;
RET_CHECK(llm_parameters.ParseFromArray(base, len));
return FromLLMParametersProto(llm_parameters);
}

} // namespace hallmark
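
For orientation, here is a minimal, hypothetical caller sketch (not part of this PR) showing how LoadLlmParams could be exercised. The field names match the LlmParams struct populated in FromLLMParametersProto above; the program structure and output formatting are assumptions.

// Hypothetical usage sketch (not in the PR): load params from a .tflite file
// and print a few of the parsed fields.
#include <cstdio>
#include <string>

#include "contrib/llm_params.h"

int main(int argc, char **argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s /path/to/model.tflite\n", argv[0]);
        return 1;
    }
    // LoadLlmParams returns absl::StatusOr<LlmParams>.
    auto params = hallmark::LoadLlmParams(argv[1]);
    if (!params.ok()) {
        std::fprintf(stderr, "LoadLlmParams failed: %s\n",
                     std::string(params.status().message()).c_str());
        return 1;
    }
    std::printf("model_dim_D=%zu n_heads_N=%zu voc_size_V=%zu\n",
                params->model_dim_D, params->n_heads_N, params->voc_size_V);
    return 0;
}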