diff --git a/BM/amx/README.md b/BM/amx/README.md
index e1f308cf..0b5fc8e4 100644
--- a/BM/amx/README.md
+++ b/BM/amx/README.md
@@ -16,10 +16,14 @@ It will be checked if the result is correct or not.
 
 - How to build:  
 
-    gcc 11.1 or above is required.  
+    gcc 11.1 or above is required for the amx_bf16 and amx_int8 tests.
+    gcc 13.3 or above is required for the extra amx_fp16 test.
 
-    To compile,  
+    To compile,
+    for the amx_bf16 and amx_int8 tests:  
     $ make
+    for the extra amx_fp16 test:
+    $ make fp16
 
     To clean,  
     $ make clean
@@ -47,3 +51,7 @@ It will be checked if the result is correct or not.
     f. Break sub-thread which is doing TMUL TDPBUUD calculation by futex  
     $ ./tmul -b 5 -t 10 -c 20 -i 4
 
+    g. Break sub-thread which is doing TMUL TDPFP16PS calculation by yield
+    $ ./tmul -b 1 -t 10 -c 20 -i 5
+
+
diff --git a/BM/amx/tmul/Makefile b/BM/amx/tmul/Makefile
index 8161d0d1..665d6bf9 100644
--- a/BM/amx/tmul/Makefile
+++ b/BM/amx/tmul/Makefile
@@ -8,6 +8,9 @@ CFILES_AMX = tmul.c
 all:
 	$(CC) $(CFLAG) $(CFILES_AMX) -o $(BIN_AMX) $(LIBS)
 
+fp16: CFLAG += -g -DFP16
+fp16: all	# same recipe as "all", built with FP16 defined to enable the TDPFP16PS test
+
 clean:
 	-rm $(BIN_AMX)
 
diff --git a/BM/amx/tmul/tmul.c b/BM/amx/tmul/tmul.c
index 07ea217b..45f76d8c 100644
--- a/BM/amx/tmul/tmul.c
+++ b/BM/amx/tmul/tmul.c
@@ -28,19 +28,25 @@
 #define FUTEX_VAL 0x5E5E5E5E
 
 #define DPBD(c, x, y, type1, type2)								\
-	c = c +											\
-	(uint32_t)((type1)(((uint8_t *)(x))[0])) * (uint32_t)((type2)(((uint8_t *)(y))[0])) +	\
-	(uint32_t)((type1)(((uint8_t *)(x))[1])) * (uint32_t)((type2)(((uint8_t *)(y))[1])) +	\
-	(uint32_t)((type1)(((uint8_t *)(x))[2])) * (uint32_t)((type2)(((uint8_t *)(y))[2])) +	\
-	(uint32_t)((type1)(((uint8_t *)(x))[3])) * (uint32_t)((type2)(((uint8_t *)(y))[3]))
+	do {	/* c += dot product of the 4 bytes at x and y, each read as type1/type2 */	\
+		c = (c +											\
+		(uint32_t)((type1)(((uint8_t *)(x))[0])) * (uint32_t)((type2)(((uint8_t *)(y))[0])) +	\
+		(uint32_t)((type1)(((uint8_t *)(x))[1])) * (uint32_t)((type2)(((uint8_t *)(y))[1])) +	\
+		(uint32_t)((type1)(((uint8_t *)(x))[2])) * (uint32_t)((type2)(((uint8_t *)(y))[2])) +	\
+		(uint32_t)((type1)(((uint8_t *)(x))[3])) * (uint32_t)((type2)(((uint8_t *)(y))[3])));	\
+	} while (0)
 
 #define load_tile_reg(tmm_num, tile, stride)						\
-	asm volatile("tileloadd\t(%0,%1,1), %%tmm" #tmm_num				\
-		     : : "r" ((void *)(tile)->buf), "r" ((long)stride) : "memory")
+	do {	/* tileloadd from (tile)->buf with the given stride into tmm<tmm_num> */	\
+		asm volatile("tileloadd\t(%0,%1,1), %%tmm" #tmm_num				\
+			     : : "r" ((void *)(tile)->buf), "r" ((long)stride) : "memory");	\
+	} while (0)
 
 #define store_tile_reg(tmm_num, tile, stride)						\
-	asm volatile("tilestored\t%%tmm" #tmm_num ", (%0,%1,1)"				\
-		     : : "r" ((void *)(tile)->buf), "r" ((long)stride) : "memory")
+	do {	/* tilestored from tmm<tmm_num> to (tile)->buf with the given stride */	\
+		asm volatile("tilestored\t%%tmm" #tmm_num ", (%0,%1,1)"				\
+			     : : "r" ((void *)(tile)->buf), "r" ((long)stride) : "memory");	\
+	} while (0)
 
 enum {
 	BREAK_BY_NOTHING = 0,
@@ -58,7 +64,12 @@ enum {
 	INS_TDPBSUD,
 	INS_TDPBUSD,
 	INS_TDPBUUD,
+#ifdef FP16
+	INS_TDPFP16PS,
+	INS_MAX_NUM = INS_TDPFP16PS
+#else
 	INS_MAX_NUM = INS_TDPBUUD
+#endif
 } ENUM_INSTRUCTION_TYPE;
 
 struct __tile_config {
@@ -118,6 +129,77 @@ static float convert_bf16_to_fp32(uint16_t bf16)
 	return *((float *)&u);
 }
 
+#ifdef FP16
+/*
+ * convert_fp32_to_fp16() - Convert an FP32 value to FP16 bits by truncation.
+ * @fp32: A FP32 value.
+ *
+ * Return: FP16 bits; underflow gives signed zero, overflow or NaN gives infinity.
+ */
+static uint16_t convert_fp32_to_fp16(float fp32)
+{
+	uint32_t u = *((uint32_t *)&fp32);
+	uint16_t sign = (u >> 16) & 0x8000;
+	uint16_t fraction = (u & 0x007fffff) >> 13;	/* drop the low 13 fraction bits */
+	int32_t exponent = (int32_t)((u >> 23) & 0xff) - 127 + 15;	/* rebias 8-bit -> 5-bit */
+
+	if (exponent <= 0)	/* zero, or too small for a normal FP16 number */
+		return sign;
+	return exponent >= 0x1f ? (sign | 0x7c00) : (sign | (exponent << 10) | fraction);
+}
+
+/*
+ * convert_fp16_to_fp32() - Convert an FP16 bit pattern to an FP32 value.
+ * @fp16: A FP16 value.
+ *
+ * Convert FP16 to FP32; infinity, NaN, zero and subnormals are handled explicitly.
+ */
+static float convert_fp16_to_fp32(uint16_t fp16)
+{
+	int shift;
+	uint32_t u;
+	float fp32 = 0;
+	uint32_t sign = (fp16 & 0x8000) << 16;	/* FP16 bit 15 -> FP32 bit 31 */
+	uint32_t fraction = (fp16 & 0x3ff) << 13;	/* 10-bit fraction moved to bits 22..13 */
+	uint32_t exponent = (fp16 & 0x7c00) >> 10;	/* 5-bit biased exponent */
+
+	if (exponent == 0x1f && fraction == 0) {
+		if (sign)
+			fp32 = -INFINITY;
+		else
+			fp32 = INFINITY;
+	} else if (exponent == 0x1f && fraction != 0) {
+		fp32 = NAN;
+	} else if (exponent == 0) {
+		if (fraction == 0) {
+			/* +0.0, -0.0 */
+			u = sign;
+			fp32 = *((float *)&u);
+		} else {
+			/* Convert subnormal into normal fp32 number */
+			for (int i = 22; i >= 12; i--) {	/* scan for the highest set fraction bit */
+				if (fraction & (1 << i)) {
+					shift = (1 + 22 - i);	/* shifts that bit out of the 23-bit field */
+					fraction = (fraction << shift) & 0x7fffff;
+					u = sign |
+					    ((exponent - 15 - (shift - 1) + 127) << 23) |
+					    fraction;
+					fp32 = *((float *)&u);
+
+					break;
+				}
+			}
+		}
+		/* fp32 has already been assigned by one of the two branches above. */
+	} else {
+		u = sign | ((exponent - 15 + 127) << 23) | fraction;
+		fp32 = *((float *)&u);
+	}
+
+	return fp32;
+}
+#endif
+
 /*
  * do_syscall() - Execute syscall instruction.
  * @nr: syscall number will be saved rax register.
@@ -191,6 +273,34 @@ static void init_bf16_tile(struct __tile *tile_ptr, uint8_t rows, uint8_t colsb)
 		}
 }
 
+#ifdef FP16
+/*
+ * init_fp16_tile() - Fill a tile buffer with FP16 test data.
+ * @tile_ptr: The tile whose rows, colsb and buf are written.
+ * @rows: Row number of the matrix.
+ * @colsb: Bytes per row of the matrix; each FP16 element takes 2 bytes.
+ *
+ * Every element is set to the FP16 encoding of the constant 2.718f.
+ */
+static void init_fp16_tile(struct __tile *tile_ptr, uint8_t rows,
+			   uint8_t colsb)
+{
+	int32_t i, j;
+	uint16_t *ptr = (uint16_t *)tile_ptr->buf;
+	int32_t cols = colsb / 2;	/* number of 16-bit elements per row */
+	float f = 0;
+
+	tile_ptr->rows = rows;
+	tile_ptr->colsb = colsb;
+
+	for (i = 0; i < rows; i++)
+		for (j = 0; j < cols; j++) {
+			f = 2.718f;
+			ptr[i * cols + j] = convert_fp32_to_fp16(f);
+		}
+}
+#endif
+
 /*
  * init_dword_tile() - Init buffer.
  * @buf: The buffer for saving data.
@@ -297,6 +407,41 @@ static void calc_matrix_tdpbf16ps(struct __tile *dst, struct __tile *src1, struc
 			}
 }
 
+#ifdef FP16
+/*
+ * calc_matrix_tdpfp16ps() - Software algorithm for instruction TDPFP16PS.
+ * @dst: The product of matrix multiplication.
+ * @src1: The first multiplier.
+ * @src2: The second multiplier.
+ *
+ * Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src1 and
+ * src2, accumulating the intermediate single-precision (32-bit) floating-point
+ * elements with elements in dst,
+ * and store the 32-bit result back to tile dst.
+ */
+static void calc_matrix_tdpfp16ps(struct __tile *dst, struct __tile *src1, struct __tile *src2)
+{
+	uint16_t *src1_buf = (uint16_t *)src1->buf;
+	uint16_t *src2_buf = (uint16_t *)src2->buf;
+	float *dst_buf = (float *)dst->buf;
+
+	int32_t M = src1->rows;
+	int32_t K = src1->colsb / 4;	/* FP16 pairs (4 bytes each) per src1 row */
+	int32_t N = src2->colsb / 4;	/* FP16 pairs per src2 row; also floats per dst row */
+	int32_t m, k, n;
+
+	for (m = 0; m < M; m++)
+		for (k = 0; k < K; k++)
+			for (n = 0; n < N; n++) {
+				dst_buf[m * N + n] +=	/* add both products of the pair */
+					(convert_fp16_to_fp32(src1_buf[m * K * 2 + k * 2 + 0]) *
+					 convert_fp16_to_fp32(src2_buf[k * N * 2 + n * 2 + 0])) +
+					(convert_fp16_to_fp32(src1_buf[m * K * 2 + k * 2 + 1]) *
+					 convert_fp16_to_fp32(src2_buf[k * N * 2 + n * 2 + 1]));
+		}
+}
+#endif
+
 /*
  * calc_matrix_tdpbssd() - Software algorithm for instruction TDPBSSD.
  * @dst: The product of matrix multiplication.
@@ -441,6 +586,16 @@ static void tile_dpbf16ps(void)
 	asm volatile("tdpbf16ps %tmm2, %tmm1, %tmm0");
 }
 
+#ifdef FP16
+static void tile_dpfp16ps(void)
+{
+	asm volatile("tdpfp16ps %tmm7, %tmm6, %tmm5");	/* AT&T order, destination last: tmm5 += tmm6 x tmm7 */
+	asm volatile("tdpfp16ps %tmm4, %tmm3, %tmm2");	/* tmm2 += tmm3 x tmm4 */
+	asm volatile("tdpfp16ps %tmm5, %tmm2, %tmm1");	/* tmm1 += tmm2 x tmm5 */
+	asm volatile("tdpfp16ps %tmm2, %tmm1, %tmm0");	/* tmm0 += tmm1 x tmm2 */
+}
+#endif
+
 static void tile_dpbssd(void)
 {
 	asm volatile("tdpbssd %tmm7, %tmm6, %tmm5");
@@ -511,6 +666,46 @@ static bool check_tile_bf16_register(struct __tile *ref, struct __tile *target)
 	return true;
 }
 
+#ifdef FP16
+/*
+ * check_tile_fp16_register() - check calculation result.
+ * @ref: The result calculated by AMX/TMUL.
+ * @target: The result calculated by software.
+ *
+ * Check that every float of the 2 results differs by no more than 0.5.
+ *
+ * Return:
+ * true - OK
+ * false - Abnormal
+ */
+static bool check_tile_fp16_register(struct __tile *ref, struct __tile *target)
+{
+	/*
+	 * Tile register should be stored from tmm to memory before this compare.
+	 * NOTE: a NaN difference fails both comparisons below, so it is not reported.
+	 */
+	int32_t rows = target->rows;
+	int32_t colsb = target->colsb / 4;	/* despite the name: 32-bit floats per row */
+	uint8_t *rbuf = ref->buf;
+	uint8_t *tbuf = target->buf;
+	int32_t i, j, idx;
+
+	for (i = 0; i < rows; i++)
+		for (j = 0; j < colsb; j++) {
+			idx = i * colsb + j;
+			if ((((float *)rbuf)[idx] - ((float *)tbuf)[idx]) > (0.5) ||
+			    (((float *)rbuf)[idx] - ((float *)tbuf)[idx]) < (-0.5)) {
+				printf("Mismatch: idx=%d, ref=%f, target=%f\n", idx,
+				       ((float *)rbuf)[idx],
+				       ((float *)tbuf)[idx]);
+				return false;	/* stop at the first mismatch */
+			}
+		}
+
+	return true;
+}
+#endif
+
 /*
  * check_tile_dword_register() - check calculation result.
  * @ref: The result calculated by AMX/TMUL.
@@ -633,9 +828,9 @@ static void thread_break(int32_t reason, uint32_t thread_idx)
  * @arg: The index of sub-thread.
  *       Index from 0 to the total number of threads - 1.
  *
- * Two results are generated by AMX/TMUL calcultion procedure,
+ * Two results are generated by AMX/TMUL calculation procedure,
  * one is calculated by software, the other is calculated by TMUL.
- * Interrupt the AMX/TMUL calcultion procedure by different reasons.
+ * Interrupt the AMX/TMUL calculation procedure by different reasons.
  * These reasons may cause context-switch by Kernel.
  * Check if the thread context is saved and restored correctly
  * by comparing the two results.
@@ -663,6 +858,10 @@ static void *worker_thread(void *arg)
 	/* Init the test data in memory */
 	if (ins_type == INS_TDPBF16PS)
 		init_bf16_tile(ptr_tile1, ROW_NUM, COL_NUM);
+#ifdef FP16
+	else if (ins_type == INS_TDPFP16PS)
+		init_fp16_tile(ptr_tile1, ROW_NUM, COL_NUM);
+#endif
 	else
 		init_dword_tile(ptr_tile1, ROW_NUM, COL_NUM);
 
@@ -675,6 +874,12 @@ static void *worker_thread(void *arg)
 		calc_matrix_tdpbf16ps(ptr_tile4, ptr_tile3, ptr_tile2);
 		calc_matrix_tdpbf16ps(ptr_tile3, ptr_tile4, ptr_tile4);
 		calc_matrix_tdpbf16ps(ptr_tile2, ptr_tile3, ptr_tile4);
+#ifdef FP16
+	} else if (ins_type == INS_TDPFP16PS) {
+		calc_matrix_tdpfp16ps(ptr_tile4, ptr_tile3, ptr_tile2);
+		calc_matrix_tdpfp16ps(ptr_tile3, ptr_tile4, ptr_tile4);
+		calc_matrix_tdpfp16ps(ptr_tile2, ptr_tile3, ptr_tile4);
+#endif
 	} else if (ins_type == INS_TDPBSSD) {
 		calc_matrix_tdpbssd(ptr_tile4, ptr_tile3, ptr_tile2);
 		calc_matrix_tdpbssd(ptr_tile3, ptr_tile4, ptr_tile4);
@@ -719,6 +924,10 @@ static void *worker_thread(void *arg)
 		/* Step4: Calculate a result by TMUL and store it in TMM0 register */
 		if (ins_type == INS_TDPBF16PS)
 			tile_dpbf16ps();
+#ifdef FP16
+		else if (ins_type == INS_TDPFP16PS)
+			tile_dpfp16ps();
+#endif
 		else if (ins_type == INS_TDPBSSD)
 			tile_dpbssd();
 		else if (ins_type == INS_TDPBSUD)
@@ -743,6 +952,14 @@ static void *worker_thread(void *arg)
 				       ins_type, thread_idx, i);
 				rtn = false;
 			}
+#ifdef FP16
+		} else if (ins_type == INS_TDPFP16PS) {
+			if (!check_tile_fp16_register(ptr_tile3, ptr_tile2)) {
+				printf("Instruction %d test in Thread %d Cycle %d: failed\n",
+				       ins_type, thread_idx, i);
+				rtn = false;
+			}
+#endif
 		} else {
 			if (!check_tile_dword_register(ptr_tile3, ptr_tile2)) {
 				printf("Instruction %d test in Thread %d Cycle %d: failed\n",
@@ -788,7 +1005,13 @@ static void help(void)
 		"      5: break by futex\n"
 		"  -t, --thread-count [Should not be less than %d]\n"
 		"  -c, --cycle-number [Should not be less than 1]\n"
-		"  -i, --instruction-type [0:TDPBF16PS 1:TDPBSSD 2:TDPBSUD 3:TDPBUSD 4:TDPBUUD]\n"
+#ifdef FP16
+		"  -i, --instruction-type [0:TDPBF16PS 1:TDPBSSD 2:TDPBSUD"
+		" 3:TDPBUSD 4:TDPBUUD 5:TDPFP16PS]\n"
+#else
+		"  -i, --instruction-type [0:TDPBF16PS 1:TDPBSSD 2:TDPBSUD"
+		" 3:TDPBUSD 4:TDPBUUD]\n"
+#endif
 		, progname, progname, BREAK_BY_YIELD, BREAK_REASON_MAX, MIN_THREAD_NUM);
 }