From be29d91ef242f630942e12092aa20af9ab2936a5 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Fri, 10 Nov 2023 16:27:28 +0800 Subject: [PATCH] Initialize workload of xsave. Implement a series of micro workloads to experience the functionality of xsave/xrestore in user space. These micro benchmarks are valuable for performance tuning and debugging purposes. Signed-off-by: Dongcheng Yan Signed-off-by: Yi Sun --- workload-xsave/.gitignore | 4 + workload-xsave/Makefile | 51 +++ workload-xsave/README.md | 60 +++ workload-xsave/run_common.c | 41 ++ workload-xsave/start_test.sh | 73 ++++ workload-xsave/work_AMX.c | 119 ++++++ workload-xsave/work_AVX.c | 67 ++++ workload-xsave/work_AVX2.c | 68 ++++ workload-xsave/work_AVX512.c | 87 +++++ workload-xsave/work_DOTPROD.c | 77 ++++ workload-xsave/work_GETCPU.c | 63 ++++ workload-xsave/work_MEM.c | 130 +++++++ workload-xsave/work_PAUSE.c | 39 ++ workload-xsave/work_RDTSC.c | 37 ++ workload-xsave/work_SSE.c | 66 ++++ workload-xsave/work_TPAUSE.c | 58 +++ workload-xsave/work_UMWAIT.c | 53 +++ workload-xsave/work_VNNI.c | 83 +++++ workload-xsave/work_VNNI512.c | 80 ++++ workload-xsave/work_memcpy.c | 123 ++++++ workload-xsave/worker_init4.c | 70 ++++ workload-xsave/worker_init_amx.c | 77 ++++ workload-xsave/worker_init_avx.c | 61 +++ workload-xsave/worker_init_avx2.c | 61 +++ workload-xsave/worker_init_dotprod.c | 78 ++++ workload-xsave/worker_init_sse.c | 61 +++ workload-xsave/yogini.c | 537 +++++++++++++++++++++++++++ workload-xsave/yogini.h | 112 ++++++ 28 files changed, 2436 insertions(+) create mode 100644 workload-xsave/.gitignore create mode 100644 workload-xsave/Makefile create mode 100644 workload-xsave/README.md create mode 100644 workload-xsave/run_common.c create mode 100755 workload-xsave/start_test.sh create mode 100644 workload-xsave/work_AMX.c create mode 100644 workload-xsave/work_AVX.c create mode 100644 workload-xsave/work_AVX2.c create mode 100644 workload-xsave/work_AVX512.c create mode 100644 workload-xsave/work_DOTPROD.c create mode 100644 workload-xsave/work_GETCPU.c create mode 100644 workload-xsave/work_MEM.c create mode 100644 workload-xsave/work_PAUSE.c create mode 100644 workload-xsave/work_RDTSC.c create mode 100644 workload-xsave/work_SSE.c create mode 100644 workload-xsave/work_TPAUSE.c create mode 100644 workload-xsave/work_UMWAIT.c create mode 100644 workload-xsave/work_VNNI.c create mode 100644 workload-xsave/work_VNNI512.c create mode 100644 workload-xsave/work_memcpy.c create mode 100644 workload-xsave/worker_init4.c create mode 100644 workload-xsave/worker_init_amx.c create mode 100644 workload-xsave/worker_init_avx.c create mode 100644 workload-xsave/worker_init_avx2.c create mode 100644 workload-xsave/worker_init_dotprod.c create mode 100644 workload-xsave/worker_init_sse.c create mode 100644 workload-xsave/yogini.c create mode 100644 workload-xsave/yogini.h diff --git a/workload-xsave/.gitignore b/workload-xsave/.gitignore new file mode 100644 index 00000000..144a88f0 --- /dev/null +++ b/workload-xsave/.gitignore @@ -0,0 +1,4 @@ +yogini +*.o +*.S +result diff --git a/workload-xsave/Makefile b/workload-xsave/Makefile new file mode 100644 index 00000000..8a1acfaa --- /dev/null +++ b/workload-xsave/Makefile @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (c) 2022 Intel Corporation. +# Len Brown +# Yi Sun +# Dongcheng Yan + +CC = $(CROSS_COMPILE)gcc +BUILD_OUTPUT := $(CURDIR) +PREFIX ?= /usr +DESTDIR ?= +DAY := $(shell date +%Y.%m.%d) +DATE_TIME := $(shell date +%Y%m%d_%H%M%S) + +ifeq ("$(origin O)", "command line") + BUILD_OUTPUT := $(O) +endif + +PROGS= yogini +SRC= yogini.c work_AMX.c work_AVX.c work_AVX2.c work_AVX512.c work_VNNI512.c work_VNNI.c work_DOTPROD.c work_PAUSE.c work_TPAUSE.c work_UMWAIT.c work_RDTSC.c work_SSE.c work_MEM.c work_memcpy.c run_common.c worker_init4.c worker_init_dotprod.c worker_init_amx.c yogini.h +OBJS= yogini.o work_AMX.o work_AVX.o work_AVX2.o work_AVX512.o work_VNNI512.o $(GCC11_OBJS) work_DOTPROD.o work_PAUSE.o work_TPAUSE.o work_UMWAIT.o work_RDTSC.o work_SSE.o work_MEM.o work_memcpy.o +ASMS= work_AMX.S work_AVX.S work_AVX2.S work_AVX512.S work_VNNI512.S work_VNNI.S work_DOTPROD.S work_PAUSE.S work_TPAUSE.S work_UMWAIT.S work_RDTSC.S work_SSE.S work_MEM.S work_memcpy.S +GCC11_OBJS=work_VNNI.o + +yogini : $(OBJS) $(ASMS) +ifeq ($(DEBUG), 1) +override CFLAGS += -march=sapphirerapids -g +else +override CFLAGS += -march=native +endif +override CFLAGS += -D_FORTIFY_SOURCE=2 +override CFLAGS += -Wall +override CFLAGS += -O3 +override CFLAGS += -mtune=skylake-avx512 +#override CFLAGS += -mtune=alderlake +override CFLAGS += -mavx512bf16 + +LDFLAGS += -lm +LDFLAGS += -lpthread + +%: %.c %.h + @mkdir -p $(BUILD_OUTPUT) + $(CC) $(CFLAGS) $(OBJS) -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) + +%.S: %.c + @mkdir -p $(BUILD_OUTPUT) + $(CC) $(CFLAGS) -S $^ -o $(BUILD_OUTPUT)/$@ + +.PHONY : clean +clean : + @rm -f $(BUILD_OUTPUT)/yogini $(OBJS) $(ASMS) diff --git a/workload-xsave/README.md b/workload-xsave/README.md new file mode 100644 index 00000000..6c167d6d --- /dev/null +++ b/workload-xsave/README.md @@ -0,0 +1,60 @@ +# Intel SIMD Instruction Microbenchmark Suite + +The Intel SIMD Instruction Microbenchmark Suite is a collection of microbenchmarks designed to evaluate and debug various Intel SIMD instructions. The suite includes a total of 15 benchmarks covering instructions such as AMX, AVX, AVX2, AVX512, VNNI, VNNI512, SSE, RDTSC, PAUSE, and more. These benchmarks are primarily used for debugging xsave/xrestor related issues. + +## Features +* Provides a comprehensive set of microbenchmarks for Intel SIMD instructions. +* Covers a range of instruction sets, including AMX, AVX, AVX2, AVX512, VNNI, VNNI512, SSE, RDTSC, PAUSE, and more. +* Assists in debugging xsave/xrestor related problems. +* Lightweight and easy to use. +* Open-source and freely available. + +## Getting Started +These instructions will help you get a copy of the Intel SIMD Instruction Microbenchmark Suite up and running on your local machine for development and testing purposes. + +### Prerequisites +* C/C++ compiler with support for the desired SIMD instruction sets. +* Make installed. + +### Installation +Clone the repository to your local machine: +``` +git clone https://github.com/intel-sandbox/workload-xsave.git +``` +Enter the project directory: +``` +cd workload-xsave +``` +Build the benchmarks using CMake: +``` +make +``` +Or, if your platform is not support all SIMD feature, you can try +``` +DEBUG=1 make +``` +Run the benchmarks: +``` +usage: ./yogini [OPTIONS] + +./yogini runs some simple micro workloads + -w, --workload [AVX,AVX2,AVX512,AMX,MEM,memcpy,SSE,VNNI,VNNI512,UMWAIT,TPAUSE,PAUSE,RDTSC] + -r, --repeat, each instance needs to be run + -b, --break_reason, [yield/sleep/trap/signal/futex]Available workloads: AMX memcpy MEM SSE RDTSC PAUSE DOTPROD VNNI512 AVX512_BF16 AVX2 AVX + +``` + +## Contributing +Contributions are welcome and encouraged! If you would like to contribute to the Intel SIMD Instruction Microbenchmark Suite, please follow these steps: + +* License +This project is licensed under the GPL2.0. +* Architecture/Workflow +![2023-07-07_15-03](https://github.com/intel-sandbox/workload-xsave/assets/1448148/1485edf4-91f5-4f46-ab34-a4eed9ff77f5) + +## Acknowledgments +Mention any contributors or references you have used for this project. +Contact +For any questions or suggestions regarding the Intel SIMD Instruction Microbenchmark Suite, please contact [Yi Sun ]. + +Replace [your-email-address] with an appropriate contact email or remove this section if not necessary. diff --git a/workload-xsave/run_common.c b/workload-xsave/run_common.c new file mode 100644 index 00000000..37307388 --- /dev/null +++ b/workload-xsave/run_common.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * generic worker code for re-use via inclusion + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#include +#include +#include "yogini.h" + +void thread_break(int32_t reason, uint32_t thread_idx); +/* + * run() + * complete work in chunks of "data_entries" operations + * between each chunk, check the time + * return when requested operations complete, or out of time + * + * return operationds completed + */ +static unsigned long long run(struct work_instance *wi) +{ + unsigned int count; + unsigned int operations = wi->repeat; + struct thread_data *dp = wi->worker_data; + + if (operations == 0) + operations = (~0U); + + for (count = 0; count < operations; count++) { + thread_break(wi->break_reason, wi->thread_number); + /* each invocation of work() does "entries" operations */ + work(dp); + } + unsigned long long tsc_now = rdtsc(); + return tsc_now; +} diff --git a/workload-xsave/start_test.sh b/workload-xsave/start_test.sh new file mode 100755 index 00000000..6544f5f3 --- /dev/null +++ b/workload-xsave/start_test.sh @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (c) 2022 Intel Corporation. +# Yi Sun +# Dongcheng Yan + +#!/bin/bash +option="" +result="" +repeat="" +num=$# + +# store test results to a specified folder +script_dir=$(dirname "$0") +result_dir="$script_dir/result" + +if [ ! -d "$result_dir" ]; then + mkdir "$result_dir" +fi + +# mode1: test workloads in specific break_reason +test_single () { + echo "trace-cmd record -e x86_fpu -F ./yogini -b $break_reason -r $repeat $option" + trace-cmd record -e x86_fpu -F ./yogini -b $break_reason -r $repeat $option + if [ $? -ne 0 ]; then + echo "Failed to execute trace-cmd record." + exit 1 + fi + trace-cmd report > "${result_dir}/${result}${break_reason}" +} + +# mode2: test workloads in all break_reason +test_all () { +for ((i=1; i<=5; i++)) +do + break_reason=$i + test_single +done +} + +usage() { + GREEN='\033[0;32m' + NC='\033[0m' + echo -e "${GREEN}Usage:${NC}" + echo "first param: break_reason" + echo "second param: repeat_cnt" + echo "remain params: workload" + echo -e "${GREEN}Example:${NC}" + echo "$0 2 100 AVX MEM VNNI" + echo "$0 -1 100 AVX MEM VNNI" + echo -e "${GREEN}You can test all break_reason if first param is -1.${NC} " +} + +# main func +if [ $num -lt 3 ]; then + usage +else + break_reason=$1 + repeat=$2 + for ((i=3; i<=$#; i++)) + do + # add workload cmd + arg="${!i}" + option+="-w $arg " + result+="${arg}_" + done + + if [ "$1" == "-1" ]; then + test_all "$@" + else + test_single "$@" + fi +fi diff --git a/workload-xsave/work_AMX.c b/workload-xsave/work_AMX.c new file mode 100644 index 00000000..2afc722f --- /dev/null +++ b/workload-xsave/work_AMX.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * work_AVX.c - offer the "AVX" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include +#include +#include "yogini.h" +#include +#include +#include +#include + +// #pragma GCC target("amx") +#define WORKLOAD_NAME "AMX" +#define XFEATURE_XTILEDATA 18 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ROW_NUM 16 +#define COL_NUM 64 +#define BYTES_PER_VECTOR 1024 +#define load_tile_reg(tmm_num, tile, stride) \ +do { \ + asm volatile("tileloadd\t(%0,%1,1), %%tmm" #tmm_num \ + : : "r" ((void *)(tile)->buf), "r" ((long)stride) : "memory") \ +} while (0) + +struct __tile_config { + uint8_t palette_id; + uint8_t start_row; + uint8_t reserved_0[14]; + uint16_t colsb[8]; + uint16_t reserved_1[8]; + uint8_t rows[8]; + uint8_t reserved_2[8]; +}; + +union __union_tile_config { + struct __tile_config s; + uint8_t a[64]; +}; + +struct thread_data { + int8_t *input_x; + int8_t *input_y; + int32_t *output; + int data_entries; +}; + +static void init_tile_config(union __union_tile_config *dst, uint8_t rows, uint8_t colsb) +{ + int32_t i; + + dst->s.palette_id = 1; + dst->s.start_row = 0; + + for (i = 0; i < 14; i++) + dst->s.reserved_0[i] = 0; + + for (i = 0; i < 8; i++) { + dst->s.reserved_1[i] = 0; + dst->s.reserved_2[i] = 0; + } + + for (i = 0; i < 8; i++) { + dst->s.colsb[i] = colsb; + dst->s.rows[i] = rows; + } + + _tile_loadconfig(dst->a); +} + +/* Set_tiledata_use() - Invoke syscall to set ARCH_SET_STATE_USE */ +static void set_tiledata_use(void) +{ + if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) + printf("Fail to do XFEATURE_XTILEDATA\n"); +} + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries; + + set_tiledata_use(); + + for (i = 0; i < entries; ++i) { + _tile_loadd(2, dp->input_x + BYTES_PER_VECTOR * i, COL_NUM); + _tile_loadd(3, dp->input_y + BYTES_PER_VECTOR * i, COL_NUM); + _tile_loadd(1, dp->output + BYTES_PER_VECTOR / 4 * i, COL_NUM); + _tile_dpbssd(1, 2, 3); + _tile_stored(1, dp->output + BYTES_PER_VECTOR / 4 * i, COL_NUM); + } +} + +#include "worker_init_amx.c" +#include "run_common.c" + +static struct workload w = { + "AMX", + init, + cleanup, + run, +}; + +struct workload *register_AMX(void) +{ + return &w; +} diff --git a/workload-xsave/work_AVX.c b/workload-xsave/work_AVX.c new file mode 100644 index 00000000..b06db085 --- /dev/null +++ b/workload-xsave/work_AVX.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "AVX" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include +#include + +#pragma GCC target("avx") +#pragma GCC optimize("unroll-loops") +#define WORKLOAD_NAME "AVX" +#define BITS_PER_VECTOR 256 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) +struct thread_data { + float *input_x; + float *input_y; + float *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries / sizeof(double); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + __m256 vx, vy, voutput; + + vx = _mm256_loadu_ps(dp->input_x + i * DWORD_PER_VECTOR); + vy = _mm256_loadu_ps(dp->input_y + i * DWORD_PER_VECTOR); + voutput = _mm256_add_ps(vx, vy); + _mm256_storeu_ps(dp->output + i * DWORD_PER_VECTOR, voutput); + } +} + +#include "run_common.c" +#include "worker_init_avx.c" + +static struct workload w = { + "AVX", + init, + cleanup, + run, +}; + +struct workload *register_AVX(void) +{ + return &w; +} diff --git a/workload-xsave/work_AVX2.c b/workload-xsave/work_AVX2.c new file mode 100644 index 00000000..4f237d56 --- /dev/null +++ b/workload-xsave/work_AVX2.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "AVX2" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include +#include + +#pragma GCC target("avx2,fma") +#define WORKLOAD_NAME "AVX2" +#define BITS_PER_VECTOR 256 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +#pragma GCC optimize("unroll-loops") + +struct thread_data { + u_int8_t *input_x; + int8_t *input_y; + int16_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries / sizeof(double); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + __m256i vx, vy, voutput; + + vx = _mm256_loadu_si256((__m256i *)(dp->input_x + i * BYTES_PER_VECTOR)); + vy = _mm256_loadu_si256((__m256i *)(dp->input_y + i * BYTES_PER_VECTOR)); + voutput = _mm256_maddubs_epi16(vx, vy); + _mm256_storeu_si256((__m256i *)(dp->output + i * WORDS_PER_VECTOR), voutput); + } +} + +#include "worker_init_avx2.c" +#include "run_common.c" + +static struct workload w = { + "AVX2", + init, + cleanup, + run, +}; + +struct workload *register_AVX2(void) +{ + return &w; +} diff --git a/workload-xsave/work_AVX512.c b/workload-xsave/work_AVX512.c new file mode 100644 index 00000000..63542e4b --- /dev/null +++ b/workload-xsave/work_AVX512.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "AVX512" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include +#include + +#if __GNUC__ >= 11 + +#pragma GCC target("avx512bf16") +#define WORKLOAD_NAME "AVX512" +#define BITS_PER_VECTOR 512 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +#pragma GCC optimize("unroll-loops") + +struct thread_data { + u_int8_t *input_x; + int8_t *input_y; + int32_t *input_z; + int16_t *input_ones; + int32_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries / sizeof(double); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_z + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + + __m512 vx, vy, vz, voutput; + + vx = _mm512_loadu_ps((float *)(dp->input_x + i * BYTES_PER_VECTOR)); + vy = _mm512_loadu_ps((float *)(dp->input_y + i * BYTES_PER_VECTOR)); + vz = _mm512_loadu_ps((float *)(dp->input_z + i * DWORD_PER_VECTOR)); + __m512bh bvx = _mm512_cvtne2ps_pbh(vx, _mm512_setzero_ps()); + __m512bh bvy = _mm512_cvtne2ps_pbh(vy, _mm512_setzero_ps()); + + voutput = _mm512_dpbf16_ps(vz, bvx, bvy); + _mm512_storeu_si512((__m512i *)(dp->output + i * BYTES_PER_VECTOR), _mm512_castps_si512(voutput)); + } +} + +#include "worker_init_dotprod.c" +#include "run_common.c" + +static struct workload w = { + "AVX512", + init, + cleanup, + run, +}; + +struct workload *register_AVX512(void) +{ + if (cpuid.avx512f) + return &w; + + return NULL; +} +#else + +#warning GCC < 11 can not build work_AVX512.c + +#endif /* GCC < 11 */ diff --git a/workload-xsave/work_DOTPROD.c b/workload-xsave/work_DOTPROD.c new file mode 100644 index 00000000..15a186b1 --- /dev/null +++ b/workload-xsave/work_DOTPROD.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include +#include + +#pragma GCC target("avx2,fma") + +#define BITS_PER_VECTOR 256 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +#pragma GCC optimize("unroll-loops") + +struct thread_data { + u_int8_t *input_x; + int8_t *input_y; + int32_t *input_z; + int16_t *input_ones; + int32_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries; + + __m256i v_ones; + + v_ones = _mm256_loadu_si256((void *)dp->input_ones); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_z + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + + __m256i vx, vy, vz, voutput; + __m256i vtmp1, vtmp2; + + vx = _mm256_loadu_si256((void *)(dp->input_x + i * BYTES_PER_VECTOR)); + vy = _mm256_loadu_si256((void *)(dp->input_y + i * BYTES_PER_VECTOR)); + vz = _mm256_loadu_si256((void *)(dp->input_z + i * DWORD_PER_VECTOR)); + + vtmp1 = _mm256_maddubs_epi16(vx, vy); /* 8-bit mul, 16-bit add */ + vtmp2 = _mm256_madd_epi16(vtmp1, v_ones); /* 32-bit convert */ + voutput = _mm256_add_epi32(vtmp2, vz); /* 32-bit add */ + _mm256_storeu_si256((void *)(dp->output + i * DWORD_PER_VECTOR), voutput); + } +} + +#include "worker_init_dotprod.c" +#include "run_common.c" + +static struct workload w = { + "DOTPROD", + init, + cleanup, + run, +}; + +struct workload *register_DOTPROD(void) +{ + return &w; +} diff --git a/workload-xsave/work_GETCPU.c b/workload-xsave/work_GETCPU.c new file mode 100644 index 00000000..7a502439 --- /dev/null +++ b/workload-xsave/work_GETCPU.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * see yogini.8 + * + * Initial implementation is specific to Intel hardware. + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" + +extern unsigned int tsc_to_msec_from_start(unsigned long long tsc); + +/* + * GETCPU_run() + * Run the GETCPU spin loop either "loops" times, or until tsc_end + * return loops completed + */ +static unsigned long long GETCPU_run(struct work_instance *wi, + unsigned long long loops, unsigned long long tsc_end) +{ + unsigned long long count; + + if (loops == 0) + loops = (unsigned int)-1; + + if (tsc_end == 0) + tsc_end = (unsigned long long)-1; + + for (count = 0; count < loops; count++) { + unsigned long long tsc_now = rdtsc(); + int cpu; + + if (tsc_now >= tsc_end) + break; + + cpu = record_cpu_residency(wi, tsc_to_msec_from_start(tsc_now)); + record_wi_duration(wi, tsc_now); + record_cpu_work(wi, cpu, 1); + } + + return (count); +} + +static struct workload w = { + "GETCPU", + NULL, + NULL, + GETCPU_run, +}; + +struct workload *register_GETCPU(void) +{ + return &w; +} diff --git a/workload-xsave/work_MEM.c b/workload-xsave/work_MEM.c new file mode 100644 index 00000000..12a41196 --- /dev/null +++ b/workload-xsave/work_MEM.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * see yogini.8 + * + * Initial implementation is specific to Intel hardware. + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include "string.h" +#include +#include +void thread_break(int32_t reason, uint32_t thread_idx); +#define MEM_BYTES_PER_ITERATION (4 * 1024) + +struct thread_data { + char *buf1; + char *buf2; +}; + +static int init(struct work_instance *wi) +{ + struct thread_data *dp; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + /* + * set default working set to equal l3 cache + */ + if (wi->wi_bytes == 0) + wi->wi_bytes = SIZE_1GB * 1024; /* small calibration buffer for high score */ + + if (wi->wi_bytes % (2 * MEM_BYTES_PER_ITERATION)) { + warnx("MEM: %d bytes is invalid working set size.\n", wi->wi_bytes); + errx(-1, "MEM: working-set size minimum of %dKB.\n", + (2 * MEM_BYTES_PER_ITERATION) / 1024); + } + dp->buf1 = malloc(wi->wi_bytes / 2); + dp->buf2 = malloc(wi->wi_bytes / 2); + + if (!dp->buf1 || !dp->buf2) { + perror("malloc"); + exit(-1); + } + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->buf1); + free(dp->buf2); + free(dp); + + wi->worker_data = NULL; + + return 0; +} + +static void *linux_memcpy(void *dest, const void *src, size_t n) +{ + long d0, d1, d2; + + asm volatile ("rep ; movsq;\n\t movq %4,%%rcx;\n\t rep ; movsb\n\t" + : "=&c" (d0), "=&D"(d1), "=&S"(d2) + : "0"(n >> 3), "g"(n & 7), "1"(dest), "2"(src) + : "memory"); + + return dest; +} + +/* + * run() + * MEM bytes_to_copy, or until tsc_end + * return bytes copied + * use buf1 and buf2, in alternate directions + */ +static unsigned long long run(struct work_instance *wi) +{ + char *src, *dst; + unsigned long long bytes_done; + unsigned long long bytes_to_copy = wi->repeat * MEM_BYTES_PER_ITERATION; + struct thread_data *dp = wi->worker_data; + + src = dp->buf1; + dst = dp->buf2; + + for (bytes_done = 0;;) { + int kb; + + for (kb = 0; kb < wi->wi_bytes / 1024 / 2; kb += 4) { + linux_memcpy(dst + kb * 1024, src + kb * 1024, MEM_BYTES_PER_ITERATION); + + bytes_done += MEM_BYTES_PER_ITERATION; + + thread_break(wi->break_reason, wi->thread_number); + if (bytes_to_copy && bytes_done >= bytes_to_copy) + goto done; + } + } +done: + return rdtsc(); +} + +static struct workload MEM_workload = { + "MEM", + init, + cleanup, + run, +}; + +struct workload *register_MEM(void) +{ + return &MEM_workload; +} diff --git a/workload-xsave/work_PAUSE.c b/workload-xsave/work_PAUSE.c new file mode 100644 index 00000000..d69fa5ab --- /dev/null +++ b/workload-xsave/work_PAUSE.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "PAUSE" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include + +#define DATA_ENTRIES 1024 + +static void work(void *arg) +{ + int i; + + for (i = 0; i < DATA_ENTRIES; ++i) + asm volatile ("pause"); +} + +#include "run_common.c" + +static struct workload w = { + "PAUSE", + NULL, + NULL, + run, +}; + +struct workload *register_PAUSE(void) +{ + return &w; +} diff --git a/workload-xsave/work_RDTSC.c b/workload-xsave/work_RDTSC.c new file mode 100644 index 00000000..e32d27fa --- /dev/null +++ b/workload-xsave/work_RDTSC.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "RDTSC" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" + +/* + * This "workload" runs the worker infrastructure without any inner work. + * As a result, it spends all of its time using RDTSC to end promptly. + */ +static inline void work(void *arg) +{ +} + +#define DATA_ENTRIES 1 + +#include "run_common.c" + +static struct workload w = { + "RDTSC", + NULL, + NULL, + run, +}; + +struct workload *register_RDTSC(void) +{ + return &w; +} diff --git a/workload-xsave/work_SSE.c b/workload-xsave/work_SSE.c new file mode 100644 index 00000000..b5b01575 --- /dev/null +++ b/workload-xsave/work_SSE.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "SSE" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include +#include + +#pragma GCC target("sse4.2") +#pragma GCC optimize("unroll-loops") +#define WORKLOAD_NAME "SSE" +#define BITS_PER_VECTOR 128 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +struct thread_data { + int32_t *input_x; + int32_t *input_y; + int32_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries / sizeof(double); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + __m128i vx, vy, voutput; + + vx = _mm_loadu_si128((__m128i *)dp->input_x); + vy = _mm_loadu_si128((__m128i *)dp->input_y); + + voutput = _mm_add_epi32(vx, vy); + _mm_storeu_si128((__m128i *)dp->output, voutput); + } +} + +#include "run_common.c" +#include "worker_init_sse.c" + +static struct workload w = { + "SSE", + init, + cleanup, + run, +}; + +struct workload *register_SSE(void) +{ + return &w; +} diff --git a/workload-xsave/work_TPAUSE.c b/workload-xsave/work_TPAUSE.c new file mode 100644 index 00000000..3c922cc8 --- /dev/null +++ b/workload-xsave/work_TPAUSE.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "TPAUSE" workload to yogini + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include + +#if __GNUC__ >= 9 + +#pragma GCC target("waitpkg") + +#define WORKLOAD_NAME "TPAUSE" +#define TPAUSE_TSC_CYCLES ((unsigned long long)(1000 * 1000)) + +#define DATA_ENTRIES 1 + +static void work(void *arg) +{ + unsigned int ctrl; + unsigned long long tsc; + + ctrl = 0; + tsc = _rdtsc(); + tsc += TPAUSE_TSC_CYCLES; + + _tpause(ctrl, tsc); +} + +#include "run_common.c" + +static struct workload w = { + "TPAUSE", + NULL, + NULL, + run, +}; + +struct workload *register_TPAUSE(void) +{ + if (cpuid.tpause) + return &w; + + return NULL; +} +#else + +#warning GCC < 9 can not build work_TPAUSE.c + +#endif /* GCC < 9 */ diff --git a/workload-xsave/work_UMWAIT.c b/workload-xsave/work_UMWAIT.c new file mode 100644 index 00000000..b726b540 --- /dev/null +++ b/workload-xsave/work_UMWAIT.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "UMWAIT" workload to yogini + * + * Copyright (c) 2023 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include +#include "yogini.h" +#include + +#if __GNUC__ >= 9 + +#pragma GCC target("waitpkg") + +#define WORKLOAD_NAME "UMWAIT" + +#define DATA_ENTRIES 1 + +static void work(void *arg) +{ + char dummy; + + _umonitor(&dummy); + _umwait(0, (unsigned long long)-1); +} + +#include "run_common.c" + +static struct workload w = { + "UMWAIT", + NULL, + NULL, + run, +}; + +struct workload *register_UMWAIT(void) +{ + if (cpuid.tpause) + return &w; + + return NULL; +} +#else + +#warning GCC < 9 can not build work_UMWAIT.c + +#endif /* GCC < 9 */ diff --git a/workload-xsave/work_VNNI.c b/workload-xsave/work_VNNI.c new file mode 100644 index 00000000..4ccf0ec7 --- /dev/null +++ b/workload-xsave/work_VNNI.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include +#include + +#if __GNUC__ >= 11 + +#pragma GCC target("avxvnni") +#define WORKLOAD_NAME "VNNI" +#define BITS_PER_VECTOR 256 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +#pragma GCC optimize("unroll-loops") + +struct thread_data { + u_int8_t *input_x; + int8_t *input_y; + int32_t *input_z; + int16_t *input_ones; + int32_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries / sizeof(double); + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_z + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + __m256i vx, vy, vz, voutput; + + vx = _mm256_loadu_si256((void *)(dp->input_x + i * BYTES_PER_VECTOR)); + vy = _mm256_loadu_si256((void *)(dp->input_y + i * BYTES_PER_VECTOR)); + vz = _mm256_loadu_si256((void *)(dp->input_z + i * DWORD_PER_VECTOR)); + + voutput = _mm256_dpbusds_epi32(vz, vx, vy); + + _mm256_storeu_si256((void *)(dp->output + i * DWORD_PER_VECTOR), voutput); + } +} + +#include "worker_init_dotprod.c" +#include "run_common.c" + +static struct workload w = { + "VNNI", + init, + cleanup, + run, +}; + +struct workload *register_VNNI(void) +{ + if (cpuid.avx2vnni) + return &w; + + return NULL; +} +#else + +#warning GCC < 11 can not build work_VNNI.c + +#endif /* GCC < 11 */ diff --git a/workload-xsave/work_VNNI512.c b/workload-xsave/work_VNNI512.c new file mode 100644 index 00000000..1849da85 --- /dev/null +++ b/workload-xsave/work_VNNI512.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include +#include + +#if __GNUC__ >= 9 + +#pragma GCC target("avx512vnni") +#define WORKLOAD_NAME "VNNI512" +#define BITS_PER_VECTOR 512 +#define BYTES_PER_VECTOR (BITS_PER_VECTOR / 8) +#define WORDS_PER_VECTOR (BITS_PER_VECTOR / 16) +#define DWORD_PER_VECTOR (BITS_PER_VECTOR / 32) + +#pragma GCC optimize("unroll-loops") + +struct thread_data { + u_int8_t *input_x; + int8_t *input_y; + int32_t *input_z; + int16_t *input_ones; + int32_t *output; + int data_entries; +}; + +static void work(void *arg) +{ + int i; + struct thread_data *dp = (struct thread_data *)arg; + int entries = dp->data_entries; + + for (i = 0; i < entries; ++i) { + if (clfulsh) { + clflush_range((void *)dp->input_x + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_y + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + clflush_range((void *)dp->input_z + i * BYTES_PER_VECTOR, BYTES_PER_VECTOR); + } + __m512i vx, vy, vz, voutput; + + vx = _mm512_loadu_si512((void *)(dp->input_x + i * BYTES_PER_VECTOR)); + vy = _mm512_loadu_si512((void *)(dp->input_y + i * BYTES_PER_VECTOR)); + vz = _mm512_loadu_si512((void *)(dp->input_z + i * DWORD_PER_VECTOR)); + + voutput = _mm512_dpbusds_epi32(vz, vx, vy); + + _mm512_storeu_si512((void *)(dp->output + i * DWORD_PER_VECTOR), voutput); + } +} + +#include "worker_init_dotprod.c" +#include "run_common.c" + +static struct workload w = { + "VNNI512", + init, + cleanup, + run, +}; + +struct workload *register_VNNI512(void) +{ + if (cpuid.vnni512) + return &w; + + return NULL; +} +#else + +#warning GCC < 9 can not build work_VNNI512.c + +#endif /* GCC < 9 */ diff --git a/workload-xsave/work_memcpy.c b/workload-xsave/work_memcpy.c new file mode 100644 index 00000000..8dd91f5c --- /dev/null +++ b/workload-xsave/work_memcpy.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * offer the "memcpy" workload to yogini + * + * see yogini.8 + * + * Initial implementation is specific to Intel hardware. + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#define _GNU_SOURCE +#include /* printf(3) */ +#include /* random(3) */ +#include /* CPU_SET */ +#include "yogini.h" +#include "string.h" +#include +#include +void thread_break(int32_t reason, uint32_t thread_idx); +#define MEM_BYTES_PER_ITERATION (4 * 1024) + +struct thread_data { + char *buf1; + char *buf2; +}; + +static int init(struct work_instance *wi) +{ + struct thread_data *dp; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + /* + * set default working set to equal l3 cache + */ + + if (wi->wi_bytes == 0) + wi->wi_bytes = SIZE_1GB * 1024; /* small calibration buffer for high score */ + + if (wi->wi_bytes % (2 * MEM_BYTES_PER_ITERATION)) { + warnx("memcpy: %d bytes is invalid working set size.\n", wi->wi_bytes); + errx(-1, "memcpy: requires multiple of %d KB.\n", + (2 * MEM_BYTES_PER_ITERATION) / 1024); + } + + dp->buf1 = malloc(wi->wi_bytes / 2); + dp->buf2 = malloc(wi->wi_bytes / 2); + + if (!dp->buf1 || !dp->buf2) { + perror("malloc"); + exit(-1); + } + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->buf1); + free(dp->buf2); + free(dp); + + wi->worker_data = NULL; + + return 0; +} + +/* + * run() + * MEM bytes_to_copy, or until tsc_end + * return bytes copied + * use buf1 and buf2, in alternate directions + */ +static unsigned long long run(struct work_instance *wi) +{ + char *src, *dst; + unsigned long long bytes_done; + unsigned long long bytes_to_copy = wi->repeat * MEM_BYTES_PER_ITERATION; + struct thread_data *dp = wi->worker_data; + + src = dp->buf1; + dst = dp->buf2; + + for (bytes_done = 0;;) { + int kb; + + for (kb = 0; kb < wi->wi_bytes / 1024 / 2; kb += 4) { + /* MEM 4KB */ + memcpy(dst + kb * 1024, src + kb * 1024, MEM_BYTES_PER_ITERATION); + + bytes_done += MEM_BYTES_PER_ITERATION; + + thread_break(wi->break_reason, wi->thread_number); + if (bytes_to_copy && bytes_done >= bytes_to_copy) + goto done; + } + } +done: + return rdtsc(); +} + +static struct workload memcpy_workload = { + "memcpy", + init, + cleanup, + run, +}; + +struct workload *register_memcpy(void) +{ + return &memcpy_workload; +} diff --git a/workload-xsave/worker_init4.c b/workload-xsave/worker_init4.c new file mode 100644 index 00000000..1c0d53bb --- /dev/null +++ b/workload-xsave/worker_init4.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * generic worker code for re-use via inclusion + * + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +#include + +#include +#include "yogini.h" + +static double get_random_double(void) +{ + unsigned long long random_int64; + + random_int64 = (long long)random() | (((long long)random()) << 32); + + return (double)random_int64; +} + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = sizeof(double) * 4; /* a[], x[], y[], z[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + srand((int)time(0)); + + dp->a = malloc(entries * sizeof(double)); + dp->x = malloc(entries * sizeof(double)); + dp->y = malloc(entries * sizeof(double)); + dp->z = malloc(entries * sizeof(double)); + dp->data_entries = entries; + + if (!dp->a || !dp->x || !dp->y || !dp->z) + errx(-1, "malloc failed"); + + for (i = 0; i < entries; ++i) { + dp->a[i] = get_random_double(); + dp->x[i] = get_random_double(); + dp->y[i] = get_random_double(); + dp->z[i] = get_random_double(); + } + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->a); + free(dp->x); + free(dp->y); + free(dp->z); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_amx.c b/workload-xsave/worker_init_amx.c new file mode 100644 index 00000000..08c9d68c --- /dev/null +++ b/workload-xsave/worker_init_amx.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Yi Sun + * Dongcheng Yan + * + */ +#include +#include +#include +#define YOGINI_MAIN +#include "yogini.h" + +static void init_dword_tile(int8_t *ptr, uint8_t rows, uint8_t colsb, int entries) +{ + int32_t i, j, k; + int32_t cols = colsb / 4; + + for (k = 0; k < entries; ++k) { + for (i = 0; i < rows; i++) + for (j = 0; j < cols; j++) + ptr[k * rows * cols + i * cols + j] = random(); + } +} + +static int init(struct work_instance *wi) +{ + struct thread_data *dp; + /* int8_t x[], y[], int32_t output[] */ + int bytes_per_entry = BYTES_PER_VECTOR * 3; + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + union __union_tile_config cfg; + + init_tile_config(&cfg, ROW_NUM, COL_NUM); + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + /* initialize input -- make every iteration the same for now */ + init_dword_tile(dp->input_x, ROW_NUM, COL_NUM, entries); + init_dword_tile(dp->input_y, ROW_NUM, COL_NUM, entries); + + dp->output = (int32_t *) calloc(entries, BYTES_PER_VECTOR); + if (dp->output == NULL) + err(1, "calloc output"); + + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_avx.c b/workload-xsave/worker_init_avx.c new file mode 100644 index 00000000..d2b4299c --- /dev/null +++ b/workload-xsave/worker_init_avx.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = BYTES_PER_VECTOR * 3; /* x[], y[], output[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (float *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (float *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + /* initialize input -- make every iteration the same for now */ + for (i = 0; i < entries; ++i) { + int j; + + for (j = 0; j < DWORD_PER_VECTOR; j++) { + int index = i * DWORD_PER_VECTOR + j; + + dp->input_x[index] = j; + dp->input_y[index] = j; + } + } + + dp->output = (float *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->output) + err(1, "calloc output"); + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_avx2.c b/workload-xsave/worker_init_avx2.c new file mode 100644 index 00000000..03c02171 --- /dev/null +++ b/workload-xsave/worker_init_avx2.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = BYTES_PER_VECTOR * 3; /* x[], y[], output[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (u_int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + /* initialize input -- make every iteration the same for now */ + for (i = 0; i < entries; ++i) { + int j; + + for (j = 0; j < BYTES_PER_VECTOR; j++) { + int index = i * BYTES_PER_VECTOR + j; + + dp->input_x[index] = j; + dp->input_y[index] = BYTES_PER_VECTOR + j; + } + } + + dp->output = (int16_t *) calloc(entries, BYTES_PER_VECTOR); + if (dp->output == NULL) + err(1, "calloc output"); + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_dotprod.c b/workload-xsave/worker_init_dotprod.c new file mode 100644 index 00000000..1810870a --- /dev/null +++ b/workload-xsave/worker_init_dotprod.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = BYTES_PER_VECTOR * 4;/* x[], y[], z[] (ignores ones[]), output[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (u_int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (int8_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + dp->input_z = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_z) + err(1, "calloc input_z"); + + dp->input_ones = (int16_t *)calloc(1, BYTES_PER_VECTOR); + if (!dp->input_ones) + err(1, "calloc input_ones"); + + /* initialize input -- make every iteration the same for now */ + for (i = 0; i < entries; ++i) { + int j; + + for (j = 0; j < BYTES_PER_VECTOR; j++) { + int index = i * BYTES_PER_VECTOR + j; + + dp->input_x[index] = j; + dp->input_y[index] = BYTES_PER_VECTOR + j; + } + for (j = 0; j < DWORD_PER_VECTOR; j++) { + int index = i * DWORD_PER_VECTOR + j; + + dp->input_z[index] = j; + } + } + for (i = 0; i < WORDS_PER_VECTOR; i++) + dp->input_ones[i] = 1; + + dp->output = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->output) + err(1, "calloc output"); + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->input_z); + free(dp->input_ones); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/worker_init_sse.c b/workload-xsave/worker_init_sse.c new file mode 100644 index 00000000..2e06de61 --- /dev/null +++ b/workload-xsave/worker_init_sse.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + */ + +static int init(struct work_instance *wi) +{ + int i; + struct thread_data *dp; + int bytes_per_entry = BYTES_PER_VECTOR * 3; /* x[], y[], output[] */ + int entries; + + entries = wi->wi_bytes / bytes_per_entry; + + dp = (struct thread_data *)calloc(1, sizeof(struct thread_data)); + if (!dp) + err(1, "thread_data"); + + dp->input_x = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_x) + err(1, "calloc input_x"); + + dp->input_y = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->input_y) + err(1, "calloc input_y"); + + /* initialize input -- make every iteration the same for now */ + for (i = 0; i < entries; ++i) { + int j; + + for (j = 0; j < DWORD_PER_VECTOR; j++) { + int index = i * DWORD_PER_VECTOR + j; + + dp->input_x[index] = j; + dp->input_y[index] = j; + } + } + + dp->output = (int32_t *)calloc(entries, BYTES_PER_VECTOR); + if (!dp->output) + err(1, "calloc output"); + dp->data_entries = entries; + + wi->worker_data = dp; + + return 0; +} + +static int cleanup(struct work_instance *wi) +{ + struct thread_data *dp = wi->worker_data; + + free(dp->input_x); + free(dp->input_y); + free(dp->output); + free(dp); + wi->worker_data = NULL; + + return 0; +} diff --git a/workload-xsave/yogini.c b/workload-xsave/yogini.c new file mode 100644 index 00000000..8d49e680 --- /dev/null +++ b/workload-xsave/yogini.c @@ -0,0 +1,537 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define YOGINI_MAIN +#include "yogini.h" + +enum { + BREAK_BY_NOTHING = 0, + BREAK_BY_YIELD = 1, + BREAK_BY_SLEEP, + BREAK_BY_TRAP, + BREAK_BY_SIGNAL, + BREAK_BY_FUTEX, + BREAK_REASON_MAX = BREAK_BY_FUTEX +} BREAK_REASON; +#define FUTEX_VAL 0x5E5E5E5E + +int repeat_cnt; +int clfulsh; +char *progname; +struct workload *all_workloads; +struct work_instance *first_worker; +struct work_instance *last_worker; + +static int num_worker_threads; +static int num_checked_in_threads; +static pthread_mutex_t checkin_mutex; +static pthread_cond_t checkin_cv = PTHREAD_COND_INITIALIZER; +int32_t break_reason = BREAK_BY_NOTHING; +static int32_t *futex_ptr; +static bool *thread_done; +pthread_t *tid_ptr; + +unsigned int SIZE_1GB = 1024 * 1024 * 1024; + +struct cpuid cpuid; + +static void dump_command(int argc, char **argv) +{ + int i; + + for (i = 0; i < argc; i++) + printf("%s ", argv[i]); + + putchar('\n'); +} + +void dump_workloads(void) +{ + struct workload *wp; + + for (wp = all_workloads; wp; wp = wp->next) + fprintf(stderr, " %s", wp->name); + + fprintf(stderr, "\n"); +} + +static void help(void) +{ + fprintf(stderr, + "usage: %s [OPTIONS]\n" + "\n" + "%s runs some simple micro workloads\n" + " -w, --workload [workload_name,threads#,break#, ...]\n", progname, progname); + fprintf(stderr, "Available workloads: "); + dump_workloads(); + fprintf(stderr, + " -r, --repeat, each instance needs to be run\n" + " -b, --break_reason, [yield/sleep/trap/signal/futex]\n" + "For more help, see README\n"); + exit(0); +} + +int parse_break_cmd(char *input_string) +{ + if (strcmp(input_string, "sleep") == 0) + break_reason = BREAK_BY_SLEEP; + else if (strcmp(input_string, "yield") == 0) + break_reason = BREAK_BY_YIELD; + else if (strcmp(input_string, "trap") == 0) + break_reason = BREAK_BY_TRAP; + else if (strcmp(input_string, "signal") == 0) + break_reason = BREAK_BY_SIGNAL; + else if (strcmp(input_string, "futex") == 0) + break_reason = BREAK_BY_FUTEX; + else + return -1; + return 0; +} + +static void set_tsc_per_sec(void) +{ + unsigned int ebx = 0, ecx = 0, edx = 0; + unsigned int max_level; + + __cpuid(0, max_level, ebx, ecx, edx); + + /* Structured Extended Feature Flags Enumeration Leaf */ + if (max_level >= 0x7) { + unsigned int eax_subleaves; + + eax_subleaves = 0; + ecx = 0; + edx = 0; + + __cpuid_count(0x7, 0, eax_subleaves, ebx, ecx, edx); + + if (ebx & (1 << 16)) + cpuid.avx512f = 1; + if (ecx & (1 << 5)) + cpuid.tpause = 1; + if (ecx & (1 << 11)) + cpuid.vnni512 = 1; + + if (eax_subleaves > 0) { + unsigned int eax = 0; + + eax = ebx = ecx = edx = 0; + __cpuid_count(0x7, 1, eax, ebx, ecx, edx); + if (eax & (1 << 4)) + cpuid.avx2vnni = 1; + } + } + + if (max_level < 0x15) + errx(1, "sorry CPU too old: cpuid level 0x%x < 0x15", max_level); +} + +void register_all_workloads(void) +{ + int i; + struct workload *wp; + + for (i = 0; all_register_routines[i]; ++i) { + wp = all_register_routines[i] (); + + if (!wp) + continue; + + wp->next = all_workloads; + all_workloads = wp; + } +} + +struct work_instance *alloc_new_work_instance(void) +{ + struct work_instance *wi; + + wi = calloc(1, sizeof(struct work_instance)); + if (!wi) + err(1, "work_instance"); + + wi->workload = all_workloads; /* default workload is last probed */ + return wi; +} + +void register_new_worker(struct work_instance *wi) +{ + if (!first_worker) + first_worker = wi; + else + last_worker->next = wi; + last_worker = wi; + + wi->next = NULL; +} + +int parse_work_cmd(char *work_cmd) +{ + struct work_instance *wi; + struct workload *wp; + + wp = find_workload(work_cmd); + if (wp) { + wi = alloc_new_work_instance(); + wi->workload = wp; + } else { + fprintf(stderr, "Unrecognized work parameter '%s' try -h for help\n", work_cmd); + exit(1); + } + + /* register this work_instance */ + register_new_worker(wi); + return 0; +} + +static void initial_ptr(void) +{ + futex_ptr = (int32_t *)malloc(sizeof(int32_t) * num_worker_threads); + thread_done = (bool *)malloc(sizeof(bool) * num_worker_threads); + tid_ptr = (pthread_t *)malloc(sizeof(pthread_t) * num_worker_threads); + if (!futex_ptr || !thread_done || !tid_ptr) { + printf("Fail to malloc memory for futex_ptr & tid_ptr\n"); + exit(1); + } +} + +static void initial_wi(void) +{ + struct work_instance *wi; + + wi = first_worker; + while (wi) { + wi->break_reason = break_reason; + wi->wi_bytes = SIZE_1GB; + wi->repeat = repeat_cnt; + wi = wi->next; + num_worker_threads++; + } +} + +static void deinitialize(void) +{ + struct work_instance *wi; + struct work_instance *cur; + + wi = first_worker; + while (wi) { + cur = wi->next; + free(wi); + wi = cur; + } + + free(futex_ptr); + free(thread_done); + free(tid_ptr); +} + +static void cmdline(int argc, char **argv) +{ + int opt; + int option_index = 0; + static struct option long_options[] = { + { "help", no_argument, 0, 'h' }, + { "work", required_argument, 0, 'w' }, + { "repeat", required_argument, 0, 'r' }, + { "break_reason", required_argument, 0, 'b' }, + {"clflush", no_argument, 0, 'f'}, + { 0, 0, 0, 0 } + }; + + progname = argv[0]; + + while ((opt = getopt_long_only(argc, argv, "h:w:r:b:f", + long_options, &option_index)) != -1) { + switch (opt) { + case 'h': + help(); + break; + case 'w': + if (parse_work_cmd(optarg)) + help(); + break; + case 'r': + repeat_cnt = atoi(optarg); + break; + case 'b': + if (parse_break_cmd(optarg)) + help(); + break; + case 'f': + clfulsh = 1; + break; + default: + help(); + } + } + + dump_command(argc, argv); +} + +static void initialize(int argc, char **argv) +{ + set_tsc_per_sec(); + register_all_workloads(); + cmdline(argc, argv); + initial_wi(); + initial_ptr(); +} + +void clflush_range(void *address, size_t size) +{ + uintptr_t start = (uintptr_t)address; + uintptr_t end = start + size; + + // Align according to the size of the cache line. + uintptr_t aligned_start = (start & ~(63UL)); + uintptr_t aligned_end = (end + 63UL) & ~(63UL); + + for (uintptr_t addr = aligned_start; addr < aligned_end; addr += 64) + _mm_clflush((void *)addr); + + // Ensure clearing of unaligned portions. + for (uintptr_t addr = aligned_end; addr < end; addr++) + _mm_clflush((void *)addr); +} + +static uint64_t do_syscall(uint64_t nr, uint64_t rdi, uint64_t rsi, uint64_t rdx, + uint64_t r10, uint64_t r8, uint64_t r9) +{ + uint64_t rtn; + + asm volatile("movq %0, %%rdi" : : "r"(rdi) : "%rdi"); + asm volatile("movq %0, %%rsi" : : "r"(rsi) : "%rsi"); + asm volatile("movq %0, %%rdx" : : "r"(rdx) : "%rdx"); + asm volatile("movq %0, %%r10" : : "r"(r10) : "%r10"); + asm volatile("movq %0, %%r8" : : "r"(r8) : "%r8"); + asm volatile("movq %0, %%r9" : : "r"(r9) : "%r9"); + asm volatile("syscall" + : "=a" (rtn) + : "a" (nr) + : "rcx", "r11", "memory", "cc"); + + return rtn; +} + +static void signal_handler(int32_t signum) +{ + //int32_t current_cpu = sched_getcpu(); + + //if (signum == SIGTRAP) + //printf("Break by trap, current_cpu=%d\n", current_cpu); + + //if (signum == SIGUSR1) + //printf("Break by signal, current_cpu=%d\n", current_cpu); +} + +void thread_break(int32_t reason, uint32_t thread_idx) +{ + struct timespec req; + + switch (reason) { + case BREAK_BY_YIELD: + /* + * Schedule out current thread by executing syscall + * instruction with syscall number SYS_sched_yield + */ + do_syscall(SYS_sched_yield, 0, 0, 0, 0, 0, 0); + break; + case BREAK_BY_SLEEP: + /* + * Schedule out current thread by executing syscall + * instruction with syscall number SYS_nanosleep + */ + req.tv_sec = 1; + req.tv_nsec = 0; + do_syscall(SYS_nanosleep, (uint64_t)&req, 0, 0, 0, 0, 0); + break; + case BREAK_BY_TRAP: + /* + * Trap is handled by the thread generated the trap, + * Schedule out current thread by trap handling + */ + asm volatile("int3;"); + break; + case BREAK_BY_SIGNAL: + /* + * Do nothing, main thread send SIGUSR1 to sub thread periodically + * Schedule out current thread by signal handling + */ + break; + case BREAK_BY_FUTEX: + /* Schedule out current thread by waiting futex */ + do_syscall(SYS_futex, (uint64_t)&futex_ptr[thread_idx], + FUTEX_WAIT, FUTEX_VAL, 0, 0, 0); + break; + } +} + +static void worker_barrier(void) +{ + int i_am_last = 0; + + pthread_mutex_lock(&checkin_mutex); + + num_checked_in_threads += 1; + if (num_checked_in_threads == num_worker_threads) + i_am_last = 1; + + pthread_mutex_unlock(&checkin_mutex); + + if (i_am_last) { + pthread_cond_broadcast(&checkin_cv); + } else { + /* wait for all workers to checkin */ + pthread_mutex_lock(&checkin_mutex); + while (num_checked_in_threads < num_worker_threads) + if (pthread_cond_wait(&checkin_cv, &checkin_mutex)) + err(1, "cond_wait: checkin_cv"); + pthread_mutex_unlock(&checkin_mutex); + } +} + +static void *worker_main(void *arg) +{ + // cpu_set_t mask; + // CPU_ZERO(&mask); + // CPU_SET(1, &mask); + // pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); + struct work_instance *wi = (struct work_instance *)arg; + + /* initialize data for this worker */ + if (wi->workload->initialize) + wi->workload->initialize(wi); + + worker_barrier(); + + printf("%s will repeat %u in reason %d\n", + wi->workload->name, wi->repeat, wi->break_reason); + + unsigned long long bgntsc, endtsc; + + bgntsc = rdtsc(); + endtsc = wi->workload->run(wi); + printf("Thread %d:%s took %llu clock-cycles, end in %llu.\n", + wi->thread_number, wi->workload->name, endtsc - bgntsc, endtsc); + + /* cleanup data for this worker */ + if (wi->workload->cleanup) + wi->workload->cleanup(wi); + + thread_done[wi->thread_number] = true; + pthread_exit((void *)0); + /* thread exit */ +} + +static void start_and_wait_for_workers(void) +{ + int i; + cpu_set_t mask; + struct work_instance *wi; + struct sigaction sigact; + bool all_thread_done = false; + + CPU_ZERO(&mask); + CPU_SET(0, &mask); + pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); + + if (break_reason == BREAK_BY_TRAP) { + sigact.sa_handler = signal_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = 0; + sigaction(SIGTRAP, &sigact, NULL); + } + + if (break_reason == BREAK_BY_SIGNAL) { + sigact.sa_handler = signal_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = 0; + sigaction(SIGUSR1, &sigact, NULL); + } + + /* create workers */ + for (wi = first_worker, i = 0; !wi; wi = wi->next, i++) { + futex_ptr[i] = FUTEX_VAL; + thread_done[i] = false; + wi->thread_number = i; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + if (pthread_create(&tid_ptr[i], &attr, &worker_main, wi) != 0) + err(1, "pthread_create"); + + wi->thread_id = tid_ptr[i]; + } + + sleep(1); + + if (break_reason == BREAK_BY_SIGNAL) { + while (!all_thread_done) { + all_thread_done = true; + for (i = 0; i < num_worker_threads; i++) { + if (!thread_done[i]) { + pthread_kill(tid_ptr[i], SIGUSR1); + all_thread_done = false; + /* + * wait 0.5 second to prevent from + * sending signal too frequently + */ + usleep(1); + } + } + } + } + + /* Wake up the sub-thread waiting on a futex */ + if (break_reason == BREAK_BY_FUTEX) { + while (!all_thread_done) { + all_thread_done = true; + for (i = 0; i < num_worker_threads; i++) { + if (!thread_done[i]) { + syscall(SYS_futex, &futex_ptr[i], FUTEX_WAKE, 1, 0, 0, 0); + all_thread_done = false; + /* wait 0.5 second to prevent from printing too much */ + usleep(1); + } + } + } + } + + /* wait for all workers to join */ + for (wi = first_worker, i = 0; !wi; wi = wi->next, ++i) + if (pthread_join(tid_ptr[i], NULL) != 0) + err(0, "thread %ld failed to join\n", wi->thread_id); +} + +int main(int argc, char **argv) +{ + initialize(argc, argv); + start_and_wait_for_workers(); + deinitialize(); +} diff --git a/workload-xsave/yogini.h b/workload-xsave/yogini.h new file mode 100644 index 00000000..69a58ff1 --- /dev/null +++ b/workload-xsave/yogini.h @@ -0,0 +1,112 @@ + // SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Intel Corporation. + * Len Brown + * Yi Sun + * Dongcheng Yan + * + */ + +#ifndef YOGINI_H +#define YOGINI_H +#include +#include + +struct work_instance { + struct work_instance *next; + pthread_t thread_id; + int thread_number; + struct workload *workload; + void *worker_data; + unsigned int repeat; + unsigned int wi_bytes; + int break_reason; +}; + +struct workload { + char *name; + int (*initialize)(struct work_instance *wi); + int (*cleanup)(struct work_instance *wi); + unsigned long long (*run)(struct work_instance *wi); + + struct workload *next; +}; + +extern struct workload *all_workloads; + +extern struct workload *register_GETCPU(void); +extern struct workload *register_RDTSC(void); +extern struct workload *register_AVX(void); +extern struct workload *register_AVX2(void); +extern struct workload *register_AVX512(void); +extern struct workload *register_VNNI512(void); +extern struct workload *register_VNNI(void); +extern struct workload *register_DOTPROD(void); +extern struct workload *register_PAUSE(void); +extern struct workload *register_TPAUSE(void); +extern struct workload *register_UMWAIT(void); +extern struct workload *register_FP64(void); +extern struct workload *register_SSE(void); +extern struct workload *register_MEM(void); +extern struct workload *register_memcpy(void); +extern struct workload *register_AMX(void); + +extern unsigned int SIZE_1GB; + +#ifdef YOGINI_MAIN +struct workload *(*all_register_routines[]) () = { + register_AVX, + register_AVX2, + register_AVX512, +#if __GNUC__ >= 9 + register_VNNI512, +#endif +#if __GNUC__ >= 11 + register_VNNI, +#endif + register_DOTPROD, + register_PAUSE, +#if __GNUC__ >= 9 + register_TPAUSE, + register_UMWAIT, +#endif + register_RDTSC, + register_SSE, + register_MEM, + register_memcpy, + register_AMX, + NULL +}; +#endif +static inline struct workload *find_workload(char *name) +{ + struct workload *wp; + + for (wp = all_workloads; wp; wp = wp->next) { + if (strcmp(name, wp->name) == 0) + return wp; + } + printf("Can't find this workload, please retry.\n"); + return NULL; +} + +static inline unsigned long long rdtsc(void) +{ + unsigned int low, high; + + asm volatile ("rdtsc" : "=a" (low), "=d"(high)); + + return low | ((unsigned long long)high) << 32; +} + +void clflush_range(void *address, size_t size); +extern int clfulsh; +struct cpuid { + unsigned int avx512f; + unsigned int vnni512; + unsigned int avx2vnni; + unsigned int tpause; +}; + +extern struct cpuid cpuid; +#endif