Align HCL source to 1.18.0

HabanaAI · Oct 13, 2024 · 7e01f03 · 7e01f03
1 parent d108b1c
commit 7e01f03
Show file tree

Hide file tree

Showing 386 changed files with 15,469 additions and 9,436 deletions.
diff --git a/dependencies/habanalabs/include/uapi/drm/habanalabs_accel.h b/dependencies/habanalabs/include/uapi/drm/habanalabs_accel.h
@@ -2797,6 +2797,8 @@ struct hl_debug_params_read_block {
 #define HL_DEBUG_OP_SET_MODE	7
 /* Opcode for fetching trace data */
 #define HL_DEBUG_OP_FETCH_TRACE	8
+/* Opcode for direct I/O operations */
+#define HL_DEBUG_OP_DIO		9
 
 /* Opcode for debug read memory */
 #define HL_DEBUG_OP_READMEM	1024
@@ -3658,6 +3660,20 @@ struct hl_nic_args {
 #define HL_IOCTL_DEBUG		0x05
 #define HL_IOCTL_NIC		0x06
 
+#define HL_DIO_CMD_SSD2HL	1
+#define HL_DIO_CMD_HL2SSD	2
+
+struct hl_dio_args {
+	struct {
+		__u64 device_va;
+		__u64 off_bytes;
+		__u64 len_bytes;
+		__u32 fd;
+	} ssd2hl;
+
+	__u32 op;
+};
+
 /*
  * Various information operations such as:
  * - H/W IP information

diff --git a/dependencies/hl-thunk/include/uapi/hlthunk.h b/dependencies/hl-thunk/include/uapi/hlthunk.h
@@ -2167,6 +2167,14 @@ hlthunk_public int hlthunk_nic_user_encap_unset(
 hlthunk_public int hlthunk_nic_dump_qp(int fd, uint32_t port, uint32_t qpn, uint32_t req,
 					char *buf, uint32_t buf_size);
 
+/**
+ * This function retrieves the mask of enabled NIC ports. This function is common for all ASICs.
+ * @param fd file descriptor handle of habanalabs main device.
+ * @param mask returned mask of enabled ports.
+ * @return 0 if success. Non-zero for any error.
+ */
+hlthunk_public int hlthunk_nic_get_enabled_ports_mask(int fd, uint64_t *mask);
+
 /**
  * This function retrieves the NIC ports and external ports masks. This function shall be used
  * only for Gaudi2 and later ASICs.

diff --git a/dependencies/qman_fw/engines-arc/include/gaudi2_arc_common_packets.h b/dependencies/qman_fw/engines-arc/include/gaudi2_arc_common_packets.h
@@ -162,6 +162,16 @@ enum scheduler_type_t {
 	SCHED_TYPE_SIZE = 0xF
 };
 
+/**
+ * Max number of MMEs
+ */
+#define GAUDI2_MAX_MME_COUNT				2
+
+/**
+ * Max number of EDMAs
+ */
+#define GAUDI2_MAX_EDMA_COUNT				5
+
 /**
  * Total number of engine groups supported by firmware
  */
@@ -218,8 +228,9 @@ enum sched_cmpt_sync_scheme_bitmap {
  */
 enum {
 	SYNC_SCHEME_FENCE_ID = 0,
+	EXT_SIGNAL_FENCE_ID = SYNC_SCHEME_FENCE_ID,
 	B2B_FENCE_ID = 1,
-	EXT_SIGNAL_FENCE_ID = 2
+	GC_USED_FENCE_ID = 2
 };
 
 /**<

diff --git a/dependencies/qman_fw/engines-arc/include/gaudi2_arc_eng_packets.h b/dependencies/qman_fw/engines-arc/include/gaudi2_arc_eng_packets.h
@@ -41,10 +41,11 @@ enum eng_arc_cmd_t {
 	ECB_CMD_NOP = 1,
 	ECB_CMD_WD_FENCE_AND_EXE = 2,
 	ECB_CMD_SCHED_DMA = 3,
-	ECB_CMD_STATIC_DESC_V2 = 4,
-	ECB_CMD_SFG = 5,
-	ECB_CMD_RESET_SOSET = 6,
-	ECB_CMD_COUNT = 7
+	ECB_CMD_SCHED_DMA_V2 = 4,
+	ECB_CMD_STATIC_DESC_V2 = 5,
+	ECB_CMD_SFG = 6,
+	ECB_CMD_RESET_SOSET = 7,
+	ECB_CMD_COUNT = 8
 };
 
 /**
@@ -211,6 +212,10 @@ enum nic_scaleout_eng_arc_cmd_t {
  */
 #define WD_CTXT_COUNT	8
 
+#define EXPERT_MAPPING_CTXT_COUNT	2
+#define EXPERT_MAPPING_ENTRY_COUNT	32
+#define INVALID_EXPERT_MAPPING_ENTRY	0XFFFF
+
 #define MAX_DIMENSIONS	5
 
 #define TENSOR_DIM0	0
@@ -460,6 +465,23 @@ struct virt_sob_ids_t {
 	 */
 } __attribute__ ((aligned(4), __packed__));
 
+
+/**
+ * \struct  full_hbm_addr_ctxt_t
+ * \brief   Full 64-bit HBM address context
+ * \details Full HBM address used for patching, as one 64-bit value or as low/high 32-bit halves
+ */
+struct full_hbm_addr_ctxt_t {
+	union {
+		uint64_t hbm_addr;
+		struct {
+			uint64_t addr_low:32;
+			uint64_t addr_high:32;
+		} __attribute__ ((aligned(4), __packed__));
+	};
+} __attribute__ ((aligned(4), __packed__));
+
+
 /**
  * \struct  rot_wd_ctxt_t
  * \brief   Rotator specific work distribution context
@@ -507,6 +529,8 @@ struct rot_wd_ctxt_t {
  */
 struct rot_wd_ctxts_t {
 	struct rot_wd_ctxt_t rot_ctxt[WD_CTXT_COUNT];
+	struct full_hbm_addr_ctxt_t weight_base_address_ctxt[WD_CTXT_COUNT];
+	uint16_t expert_mapping_ctxt[EXPERT_MAPPING_CTXT_COUNT * EXPERT_MAPPING_ENTRY_COUNT];
 	/**<
 	 * array of contexts for Rotator
 	 */
@@ -517,6 +541,13 @@ struct rot_wd_ctxts_t {
 	 */
 } __attribute__ ((aligned(4), __packed__));
 
+enum mme_operand_type_t {
+	MME_ADDR_A = 0,
+	MME_ADDR_B = 1,
+	MME_ADDR_COUT0 = 2,
+	MME_OPERAND_COUNT = 3
+};
+
 /**
  * \struct  mme_wd_ctxt_t
  * \brief   MME specific work distribution context
@@ -535,7 +566,11 @@ struct mme_wd_ctxt_t {
 			 * value of the switch bit to be configured when pushing the
 			 * descriptor into ARC CQ
 			 */
-			uint32_t reserved:7;
+			uint32_t mme_operand:2;
+			/**<
+			 * mme operand to patch from mme_operand_type_t
+			 */
+			uint32_t reserved:5;
 			/**<
 			 * reserved
 			 */
@@ -554,6 +589,10 @@ struct mme_wd_ctxt_t {
 	/**<
 	 * Virtual SOB array
 	 */
+	struct full_hbm_addr_ctxt_t weight_offset[GAUDI2_MAX_MME_COUNT];
+	/**<
+	 * hbm addr offset of tensor for patching
+	 */
 } __attribute__ ((aligned(4), __packed__));
 
 /**
@@ -563,6 +602,8 @@ struct mme_wd_ctxt_t {
  */
 struct mme_wd_ctxts_t {
 	struct mme_wd_ctxt_t mme_ctxt[WD_CTXT_COUNT];
+	struct full_hbm_addr_ctxt_t weight_base_address_ctxt[WD_CTXT_COUNT];
+	uint16_t expert_mapping_ctxt[EXPERT_MAPPING_CTXT_COUNT * EXPERT_MAPPING_ENTRY_COUNT];
 	/**<
 	 * array of contexts for MME
 	 */
@@ -590,6 +631,12 @@ enum edma_op_type_t {
 	EDMA_OP_COUNT = 6
 };
 
+enum edma_operand_type_t {
+	EDMA_SRC = 0,
+	EDMA_DST = 1,
+	EDMA_OPERAND_COUNT = 2
+};
+
 /**<
  * Total number of EDMA engines involved in compute
  */
@@ -665,11 +712,15 @@ struct edma_wd_ctxt_t {
 			 * alternate address of RD_HBW_MAX_OUTSTAND as completion address
 			 * value of 0 is set by the GC in the WR_COMP_WDATA
 			 */
+			uint32_t dma_operand:1;
+			/**<
+			 * Edma operand to patch from edma_operand_type_t
+			 */
 			uint32_t sig_inc_value:16;
 			/**<
 			 * Increment value to be added to previous threshold
 			 */
-			uint32_t virtual_sob_bitmap:8;
+			uint32_t virtual_sob_bitmap:7;
 			/**<
 			 * Virtual SOB bitmap indicating index which are valid
 			 * in the virtual_sob array
@@ -688,6 +739,10 @@ struct edma_wd_ctxt_t {
 	/**<
 	 * Virtual SOB array
 	 */
+	struct full_hbm_addr_ctxt_t weight_offset[GAUDI2_MAX_EDMA_COUNT];
+	/**<
+	 * hbm addr offset of tensor for patching
+	 */
 } __attribute__ ((aligned(4), __packed__));
 
 /**
@@ -697,6 +752,8 @@ struct edma_wd_ctxt_t {
  */
 struct edma_wd_ctxts_t {
 	struct edma_wd_ctxt_t edma_ctxt[WD_CTXT_COUNT];
+	struct full_hbm_addr_ctxt_t weight_base_address_ctxt[WD_CTXT_COUNT];
+	uint16_t expert_mapping_ctxt[EXPERT_MAPPING_CTXT_COUNT * EXPERT_MAPPING_ENTRY_COUNT];
 	/**<
 	 * array of contexts for EDMA
 	 */
@@ -867,7 +924,11 @@ struct tpc_wd_ctxt_t {
 	union {
 		uint32_t word2;
 		struct {
-			uint16_t reserved1;
+			uint16_t tensor_id: 4;
+			/**<
+			 * tpc operand to patch (0-15)
+			 */
+			uint16_t reserved1: 12;
 			/**<
 			 * reserved
 			 */
@@ -881,6 +942,10 @@ struct tpc_wd_ctxt_t {
 	/**<
 	 * Virtual SOB array
 	 */
+	struct full_hbm_addr_ctxt_t weight_offset;
+	/**<
+	 * hbm addr offset of tensor for patching
+	 */
 } __attribute__ ((aligned(4), __packed__));
 
 /**
@@ -890,6 +955,8 @@ struct tpc_wd_ctxt_t {
  */
 struct tpc_wd_ctxts_t {
 	struct tpc_wd_ctxt_t tpc_ctxt[WD_CTXT_COUNT];
+	struct full_hbm_addr_ctxt_t weight_base_address_ctxt[WD_CTXT_COUNT];
+	uint16_t expert_mapping_ctxt[EXPERT_MAPPING_CTXT_COUNT * EXPERT_MAPPING_ENTRY_COUNT];
 	/**<
 	 * Array of contexts for TPC
 	 */
@@ -1000,6 +1067,17 @@ struct eng_arc_cmd_static_desc_v2_t {
 	 */
 } __attribute__ ((aligned(4), __packed__));
 
+/**
+ * \enum    signaling_completion_type_t
+ * \brief   completion signal sent to sob by firmware
+ * \details completion signal sent to sob by firmware
+ */
+enum signaling_completion_type_t {
+	SIGNAL_TO_SYNC_SCHEME_SOB = 0x0,
+	SINGAL_TO_AUX_REG = 0x1,
+	SINGAL_COUNT = 0x2
+};
+
 /**
  * \struct  eng_arc_cmd_wd_fence_and_exec_t
  * \brief   Work distribution, fence and execute
@@ -1019,18 +1097,40 @@ struct eng_arc_cmd_wd_fence_and_exec_t {
 	 * Number of DMAs should complete before the execution can start.
 	 * Expected value is 1.
 	 */
-	uint32_t reserved:19;
+	uint32_t dma2_completion:3;
 	/**<
-	 * reserved
+	 * Number of DMAs that should complete before the execution can start.
+	 * This wait is for a DMA waiting on another DMA. The value can be 0 or more.
 	 */
 	uint32_t wd_ctxt_id:3;
 	/**<
 	 * a context number from 0 to max number of contexts that fw supports
 	 */
-	uint32_t reserved2:2;
+	uint32_t wd_ctxt2_id:3;
+	/**<
+	 * a context number from 0 to max number of weight_base_address contexts
+	 */
+	uint32_t patch_address:1;
+	/**<
+	 * Patch address before execution
+	 */
+	uint32_t signal_arc:1;
+	/**<
+	 * which sob to signal from signaling_completion_type_t
+	 */
+	uint32_t expert_mapping_idx: 6;
+	/**<
+	 * expert mapping index
+	 */
+	uint32_t conditional_activation:1;
+	/**<
+	 * conditional_activation
+	 */
+	uint32_t :6;
 	/**<
 	 * reserved
 	 */
+
 } __attribute__ ((aligned(4), __packed__));
 
 /**
@@ -1069,6 +1169,73 @@ struct eng_arc_cmd_sched_dma_t {
 	 */
 } __attribute__ ((aligned(4), __packed__));
 
+
+/**
+ * DMA type
+ */
+enum dma_type_t {
+	DMA_EXPERT_MAPPING_TABLE = 0x0,
+	DMA_HBM_TENSOR_ADDR = 0x1,
+	DMA_COUNT = 0x2
+};
+
+/**
+ * \struct  eng_arc_cmd_sched_dma_v2_t
+ * \brief   Schedule DMA version 2 to update GC context
+ * \details Initiate a DMA transfer to update expert mapping context.
+ */
+struct eng_arc_cmd_sched_dma_v2_t {
+	uint32_t cmd_type:4;
+	/**<
+	 * set to ECB_CMD_SCHED_DMA_V2
+	 */
+	uint32_t yield:1;
+	/**<
+	 * Yield ARC control to the other list (s/d) after execution
+	 */
+	uint32_t dma_completion:3;
+	/**<
+	 * Number of DMAs should complete before starting this DMA
+	 */
+	uint32_t addr_index:3;
+	/**<
+	 * Recipe base address register index to be used to generate
+	 * target address of 64 bits
+	 */
+	uint32_t size:8;
+	/**<
+	 * size of the buffer in bytes
+	 */
+	uint32_t dma_type:1;
+	/**<
+	 * What needs to be DMA'd, a value from dma_type_t
+	 * 0 - DMA_EXPERT_MAPPING_TABLE
+	 * 1 - DMA_HBM_TENSOR_ADDR
+	 */
+	uint32_t wait_for_eng:1;
+	/**<
+	 * Wait for a signal from the engine
+	 */
+	uint32_t expert_mapping_idx: 6;
+	/**<
+	 * expert mapping index
+	 */
+	uint32_t :2;
+	/**<
+	 * Reserved
+	 */
+	uint32_t wd_ctxt_id:3;
+	/**<
+	 * GC Context ID that needs to be updated
+	 * This is used to calculate Destination Address
+	 */
+	uint32_t addr_offset;
+	/**<
+	 * 32bit address offset into recipe base address
+	 */
+} __attribute__ ((aligned(4), __packed__));
+
+
 /**
  * \struct  eng_arc_cmd_sfg_t
  * \brief   Signal From Graph