diff --git a/flagcx/core/flagcx_hetero.cc b/flagcx/core/flagcx_hetero.cc deleted file mode 100644 index e5a9563..0000000 --- a/flagcx/core/flagcx_hetero.cc +++ /dev/null @@ -1,17 +0,0 @@ -#include "flagcx_hetero.h" -#include "adaptor.h" -#include "utils.h" - -struct flagcxCCLAdaptor flagcxHeteroAdaptor{ - "flagcx_Hetero", - LOADAPI(flagcxCCLAdaptor,getVersion, flagcxHeteroGetVersion), - LOADAPI(flagcxCCLAdaptor,getUniqueId, flagcxHeteroGetUniqueId), - LOADAPI(flagcxCCLAdaptor,commInitRank, flagcxHeteroCommInitRank), - LOADAPI(flagcxCCLAdaptor,commDestroy, flagcxHeteroCommDestroy), - LOADAPI(flagcxCCLAdaptor,commCount, flagcxHeteroCommCount), - LOADAPI(flagcxCCLAdaptor,commUserRank, flagcxHeteroCommUserRank), - LOADAPI(flagcxCCLAdaptor,send, flagcxHeteroSend), - LOADAPI(flagcxCCLAdaptor,recv, flagcxHeteroRecv), - LOADAPI(flagcxCCLAdaptor,groupStart, flagcxHeteroGroupStart), - LOADAPI(flagcxCCLAdaptor,groupEnd, flagcxHeteroGroupEnd) -}; diff --git a/flagcx/core/group.cc b/flagcx/core/group.cc index e1836b8..65709aa 100644 --- a/flagcx/core/group.cc +++ b/flagcx/core/group.cc @@ -14,6 +14,7 @@ #include "assert.h" #include "net.h" #include "adaptor.h" +#include "launch_kernel.h" __thread int flagcxGroupDepth = 0; __thread bool flagcxGroupJobAbortFlag = false; diff --git a/flagcx/core/hostGpuMemAlloc.h b/flagcx/core/hostGpuMemAlloc.h deleted file mode 100644 index fc8ddd4..0000000 --- a/flagcx/core/hostGpuMemAlloc.h +++ /dev/null @@ -1,363 +0,0 @@ -#ifndef HOST_GPU_MEMALLOC_H -#define HOST_GPU_MEMALLOC_H - -/* - * nvcc memAlloc.cu -o mem -lcuda -I /usr/local/cuda/include - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "flagcx.h" - -#ifdef CREATE_GPU_MEMALLOC_API -#define GPU_MEMALLOC_API_EXTERN -#else -#define GPU_MEMALLOC_API_EXTERN extern -#endif - -struct DIM3{ - unsigned int x; - unsigned int y; - unsigned int z; -}; - -struct hostLaunchArgs{ - volatile bool stopLaunch; - volatile bool retLaunch; -}; - -void cpuAsyncLaunch(void *args); -flagcxResult_t flagcxLaunchKernel(void *func, DIM3 grid, DIM3 block, void **args, size_t share_mem, void *stream, void *memHandle); - -GPU_MEMALLOC_API_EXTERN void **flagcxDevKernelFunc; - -/** - * @brief Initializes the specified GPU device and sets up the memory handle. - * - * This function initializes the specified GPU device and prepares the associated memory handle. - * By providing the device ID, use the device number specified by dev_id for subsequent operations, - * and the `memHandle` will store the memory handle associated with this device for subsequent memory management operations. - * - * @param[in] dev_id The device ID to initialize. This specifies the GPU device to be used. - * @param[out] memHandle Pointer to store the memory handle associated with the device. - * - * @return Returns 0 on success, or a non-zero error code on failure. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxCuInit)(int dev_id, void **memHandle); - -/** - * @brief Cleans up and releases resources associated with the specified GPU device. - * - * This function destroys the resources associated with the specified GPU device, identified by `dev`. - * It also handles the cleanup of the memory handle provided in `memHandle`. - * This is typically called when the GPU resources are no longer needed. - * - * @param[in] dev The device ID of the GPU whose resources are to be destroyed. - * @param[in] memHandle The memory handle associated with the GPU device that needs to be released. - * - * @return Returns 0 on success, or a non-zero error code on failure. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxCuDestroy)(int dev, void *memHandle); - -/** - * @brief Allocates a block of memory accessible to both RDMA and GPU. - * - * This function allocates a block of memory of the specified `size` and stores its address in the `ptr` pointer. - * The allocated memory can be accessed by both RDMA and GPU. The `memHandle` parameter is used to provide - * additional input and output handles. - * - * @param[out] ptr Pointer to store the allocated memory address. - * @param[in] size Size of the memory block to allocate, in bytes. - * @param[in,out] memHandle Handle for additional input and output parameters. - * - * @return Returns 0 on success, or a non-zero error code on failure. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxCuGdrMemAlloc)(void **ptr, size_t size, void *memHandle); - -/** - * @brief Frees a block of memory previously allocated and accessible to RDMA and GPU. - * - * This function releases a block of memory that was previously allocated using `flagcxCuGdrMemAlloc`. - * The `ptr` parameter points to the memory to be freed, and `memHandle` may be used for - * additional input and output handles associated with the memory block. - * - * @param[in] ptr Pointer to the memory block to be freed. - * @param[in,out] memHandle Handle for additional input and output parameters, if applicable. - * - * @return Returns 0 on success, or a non-zero error code on failure. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxCuGdrMemFree)(void *ptr, void *memHandle); - -/** - * @brief Creates a CUDA stream and associates it with a memory handle. - * - * This function creates a CUDA stream for asynchronous operations on the device - * and potentially associates it with a memory handle if required. The stream can be - * used to perform asynchronous tasks like memory copy, kernel launches, etc., in the GPU. - * - * @param[out] stream A pointer to the location where the created stream handle will be stored. - * This pointer will be set to the created CUDA stream. - * - * @param[in] memHandle A memory handle that can be associated with the stream for specific operations. - * If not used, this parameter can be NULL or ignored, depending on the implementation. - * - * @return int Returns 0 (flagcxSuccess) if the stream was created successfully, - * or an error code if there was a failure. - * - * @note The caller is responsible for destroying the created stream using `cudaStreamDestroy` - * when it is no longer needed to avoid resource leaks. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceCreateStream)(void **stream); - -/** - * @brief Destroys a previously created CUDA stream and optionally disassociates it from a memory handle. - * - * This function destroys a CUDA stream that was previously created, ensuring that all operations - * in the stream are completed before the stream is destroyed. If a memory handle was associated - * with the stream, the function may also handle the necessary cleanup or disassociation. - * - * @param[in] stream The CUDA stream to be destroyed. This should be a valid stream created - * by `flagcxDeviceCreateStream` or equivalent. - * - * @param[in] memHandle A memory handle that may be associated with the stream. This parameter can - * be used to handle any specific disassociation or cleanup if needed. - * It can be NULL if no specific memory handle is associated. - * - * @return int Returns 0 (flagcxSuccess) if the stream was successfully destroyed, - * or an error code if there was a failure. - * - * @note The function ensures that all tasks queued in the stream are completed before the - * stream is destroyed. The caller should ensure that the stream is no longer in use - * before calling this function. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceDestroyStream)(void *stream); - -/** - * @brief Synchronizes a CUDA stream, ensuring all tasks submitted to the stream are complete. - * - * This function blocks the host until all tasks currently queued in the specified CUDA stream - * have been completed. If a memory handle is associated with the stream, the function may also - * perform additional operations related to memory synchronization or cleanup. - * - * @param[in] stream The CUDA stream to be synchronized. This should be a valid stream created - * by `flagcxDeviceCreateStream` or equivalent. - * - * @param[in] memHandle A memory handle that may be associated with the stream. This parameter can - * be used for memory-related operations, such as synchronization or cleanup - * tasks. It can be NULL if no specific memory handle is associated. - * - * @return int Returns 0 (flagcxSuccess) if the stream was successfully synchronized, - * or an error code if there was a failure. - * - * @note This function will block the host until the specified stream has completed all pending tasks. - * Ensure that the stream has active tasks to synchronize before calling this function to avoid - * unnecessary blocking. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceStreamSynchronize)(void *stream); - -/** - * @brief Launches a CUDA kernel with the specified execution configuration. - * - * This function launches a CUDA kernel with the provided grid and block dimensions, - * shared memory size, and stream. The kernel function is executed with the specified - * arguments on the device. If a memory handle is provided, it may be used for additional - * memory management or synchronization tasks. - * - * @param[in] func The CUDA kernel function to be launched. This should be a pointer to - * a compiled device function. - * - * @param[in] block_x The number of threads in the x-dimension of each block. - * @param[in] block_y The number of threads in the y-dimension of each block. - * @param[in] block_z The number of threads in the z-dimension of each block. - * - * @param[in] grid_x The number of blocks in the x-dimension of the grid. - * @param[in] grid_y The number of blocks in the y-dimension of the grid. - * @param[in] grid_z The number of blocks in the z-dimension of the grid. - * - * @param[in] args An array of pointers to the arguments to be passed to the CUDA kernel. - * These arguments must match the signature of the kernel function. - * - * @param[in] share_mem The amount of dynamic shared memory in bytes that the kernel can use - * during execution. If no additional shared memory is needed, this can be 0. - * - * @param[in] stream The CUDA stream in which the kernel is to be launched. This allows - * for asynchronous execution. If no stream is provided, the default stream - * (0) will be used. - * - * @param[in] memHandle A memory handle that may be associated with the kernel launch for - * specific memory operations or optimizations. This parameter can be NULL - * if not used. - * - * @return void This function does not return a value. Any errors during the kernel - * launch should be handled by the caller or through CUDA's error handling - * mechanisms. - * - * @note The caller must ensure that the grid and block dimensions, as well as the kernel - * arguments, are correctly configured before launching the kernel. Improper configuration - * can result in undefined behavior or kernel launch failure. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*_flagcxLaunchKernel)(void *func, unsigned int block_x, unsigned int block_y, unsigned int block_z, unsigned int grid_x, unsigned int grid_y, unsigned int grid_z, void **args, size_t share_mem, void *stream, void *memHandle); - -/** - * @brief Allocates shared memory on the host side for use with CUDA operations. - * - * This function is used to allocate a block of shared memory on the host that can be - * utilized by CUDA kernels or other CUDA operations. The memory allocated is intended - * to be shared among threads within a block or across blocks, depending on the use case. - * - * @param[out] ptr A pointer to the allocated memory block. This pointer should be - * allocated with enough space to hold the requested 'size' of shared - * memory. The pointer will be set to the address of the allocated memory. - * - * @param[in] size The size in bytes of the shared memory block to be allocated. This - * size must be sufficient to accommodate the needs of the CUDA - * operations that will use this memory. - * - * @param[in] memHandle A memory handle that can be used to associate the allocated memory - * with specific memory operations or optimizations. This handle can - * be NULL if no specific memory operations are required. - * - * @return void This function does not return a value. The allocated memory is - * accessed through the 'ptr' parameter. Any errors during memory - * allocation should be handled by the caller. - * - * @note The caller is responsible for ensuring that the allocated memory is appropriately - * sized and aligned for the intended use. Additionally, the caller should manage - * the lifetime of the allocated memory, including freeing it when no longer needed. - * The 'memHandle' can be used for more advanced memory management if required. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxHostShareMemAlloc)(void **ptr, size_t size, void *memHandle); - -/** - * @brief Frees shared memory on the host side that was previously allocated for use with UCCL operations. - * - * This function is used to deallocate memory on the host that was allocated for shared use among UCCL operations. - * It is crucial to call this function to release the memory once it is no longer required to prevent memory leaks. - * - * @param[in] ptr A pointer to the memory block to be freed. This should be a valid pointer to the memory - * that was previously allocated using a corresponding UCCL memory allocation function. - * - * @param[in] memHandle A memory handle that was associated with the allocated memory. This handle is used to - * identify the specific memory block and ensure proper deallocation. It must be the - * same handle that was used when the memory was allocated. - * - * @return void This function does not return a value. The memory pointed to by 'ptr' will be freed. Any errors - * during memory deallocation should be handled by the caller. - * - * @note The caller must ensure that the memory is not accessed after it has been freed to avoid undefined behavior. - * It is also the caller's responsibility to track the memory handle associated with the allocated memory - * and pass it correctly to this function to ensure the correct memory block is freed. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxHostShareMemFree)(void *ptr, void *memHandle); -/** - * @brief Function pointer type for device synchronization with a memory handle in UCCL. - * - * This function pointer type defines a callback for synchronizing a UCCL device, where the synchronization - * operation is associated with a specific memory handle. This allows for more targeted synchronization, - * potentially improving performance by synchronizing only the operations related to a particular memory block. - * - * @param[in] memHandle A pointer to a memory handle that identifies the memory block or context for which - * synchronization is required. This handle should be obtained during the allocation or - * initialization of the memory block and should be passed to this function to ensure - * the correct synchronization context. - * - * @return void This function does not return any value. The action performed is the synchronization of the - * UCCL device for the memory context associated with the provided handle. Any errors during - * synchronization should be handled internally. - * - * @note The actual implementation of the function should be provided by the UCCL library or the user. It should - * handle all the necessary steps to synchronize the device for the specific memory context. The function - * pointer can be assigned to a specific synchronization function that matches this signature and is - * capable of handling the synchronization for the given memory handle. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceSynchronize)(); - -/** - * @brief Copies memory between two device buffers using a specific memory handle in UCCL. - * - * This function pointer defines a callback for copying memory from a source buffer to a destination buffer - * in the device, where the memory operation is associated with a specific memory handle. The use of the - * memory handle ensures the copy operation occurs within the correct memory context. - * - * @param[out] dst A pointer to the destination buffer in the device memory where the data will be copied to. - * @param[in] src A pointer to the source buffer in the device memory from which the data will be copied. - * @param[in] size The size of the data to copy, in bytes. - * @param[in] memHandle A pointer to a memory handle that identifies the memory block or context for which - * the memory copy is required. The handle should be obtained during the allocation or - * initialization of the memory block. - * - * @return void This function does not return any value. Any errors during the copy process should be handled - * internally. - * - * @note The actual implementation should ensure the correct and efficient copying of memory between the source - * and destination buffers using the provided memory handle. It should handle device-specific synchronization - * if necessary. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceMemcpy)(void *dst, void *src, size_t size, flagcxMemcpyType_t type, flagcxStream_t stream, void *args); - -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceMemset)(void *ptr, int value, size_t size, flagcxMemType_t type); -/** - * @brief Allocates device memory using a specific memory handle in UCCL. - * - * This function pointer defines a callback for allocating memory on the device, where the allocation operation - * is associated with a specific memory handle. The memory handle ensures the allocation is made in the correct - * memory context, potentially optimizing memory management. - * - * @param[out] ptr A double pointer to the memory location that will store the address of the allocated memory - * on the device. - * @param[in] size The size of the memory to allocate, in bytes. - * @param[in] memHandle A pointer to a memory handle that identifies the memory context for which the - * allocation is required. The handle should be obtained during initialization of - * the memory block. - * - * @return void This function does not return any value. Any errors during memory allocation should be handled - * internally. - * - * @note The actual implementation should ensure the correct and efficient allocation of memory within the - * specified memory context using the memory handle. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceMalloc)(void **ptr, size_t size, flagcxMemType_t type); - -/** - * @brief Frees device memory using a specific memory handle in UCCL. - * - * This function pointer defines a callback for freeing memory on the device, where the memory free operation - * is associated with a specific memory handle. The memory handle ensures the correct memory context is used - * for deallocation. - * - * @param[in,out] ptr A double pointer to the memory location on the device that should be freed. After the memory - * is freed, the pointer will be set to `NULL`. - * @param[in] memHandle A pointer to a memory handle that identifies the memory context for which the - * memory deallocation is required. The handle should be obtained during the - * initialization of the memory block. - * - * @return void This function does not return any value. Any errors during memory deallocation should be handled - * internally. - * - * @note The actual implementation should ensure that the memory is correctly freed and that any synchronization - * or cleanup tasks required by the memory context are handled. - */ -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceFree)(void *ptr, flagcxMemType_t type); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxTopoGetSystem)(void *topoArgs, void **system); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxTopoGetLocalNet)(int gpu, char *name); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxSetDevice)(int dev); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxGetDevice)(int *dev); -GPU_MEMALLOC_API_EXTERN flagcxResult_t *(*flagcxGetVendor)(char *vendor); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceStreamQuery)(void *stream); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxCopyArgsInit)(void **args); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxCopyArgsFree)(void *args); - -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceCreateEvent)(void **event, void *memHandle); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceEventQuery)(void *event, void *memHandle); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceEventBlock)(void *event, void *memHandle); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceDestroyEvent)(void *event, void *memHandle); -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceEventRecord)(void *event, void *stream, void *memHandle); - -GPU_MEMALLOC_API_EXTERN flagcxResult_t (*flagcxDeviceLaunchHostFunc)(void *stream, void (*fn)(void *), void *args); - -#endif diff --git a/flagcx/core/launch_kernel.cc b/flagcx/core/launch_kernel.cc new file mode 100644 index 0000000..da2f88b --- /dev/null +++ b/flagcx/core/launch_kernel.cc @@ -0,0 +1,7 @@ +#include "launch_kernel.h" + +void cpuAsyncLaunch(void *_args){ + struct hostLaunchArgs *args = (struct hostLaunchArgs *) _args; + while(!args->stopLaunch); + __atomic_store_n(&args->retLaunch, 1, __ATOMIC_RELAXED); +} diff --git a/flagcx/core/launch_kernel.h b/flagcx/core/launch_kernel.h new file mode 100644 index 0000000..bd623ba --- /dev/null +++ b/flagcx/core/launch_kernel.h @@ -0,0 +1,27 @@ +#ifndef FLAGCX_LAUNCH_KERNEL_H_ +#define FLAGCX_LAUNCH_KERNEL_H_ + +#include "topo.h" +#include "debug.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "adaptor.h" +#include "utils.h" +#include "param.h" + +struct hostLaunchArgs{ + volatile bool stopLaunch; + volatile bool retLaunch; +}; + +void cpuAsyncLaunch(void *_args); + +#endif + diff --git a/flagcx/core/proxy.h b/flagcx/core/proxy.h index 8d23555..be108fd 100644 --- a/flagcx/core/proxy.h +++ b/flagcx/core/proxy.h @@ -14,7 +14,7 @@ #include #include "info.h" #include "net.h" -#include "hostGpuMemAlloc.h" +#include "launch_kernel.h" enum flagcxProxyOpState { flagcxProxyOpNone, flagcxProxyOpReady, flagcxProxyOpProgress }; diff --git a/flagcx/core/topo.cc b/flagcx/core/topo.cc index 4348799..ef333db 100644 --- a/flagcx/core/topo.cc +++ b/flagcx/core/topo.cc @@ -12,7 +12,6 @@ #include #include #include "bootstrap.h" -#include "hostGpuMemAlloc.h" #define BUSID_SIZE (sizeof("0000:00:00.0")) #define BUSID_REDUCED_SIZE (sizeof("0000:00")) diff --git a/flagcx/core/topo.h b/flagcx/core/topo.h index 75d736b..a28e3c1 100644 --- a/flagcx/core/topo.h +++ b/flagcx/core/topo.h @@ -249,4 +249,14 @@ static int mirrorBits(int val, int pow2) { for (int b=1, mb=(pow2>>1); b>=1) if (val & b) mirror |= mb; return mirror; } + + +#ifdef CREATE_DEVICE_TOPO_API +#define DEVICE_TOPO_API_EXTERN +#else +#define DEVICE_TOPO_API_EXTERN extern +#endif + +DEVICE_TOPO_API_EXTERN flagcxResult_t (*flagcxTopoGetLocalNet)(int gpu, char *name); + #endif diff --git a/flagcx/core/transport.cc b/flagcx/core/transport.cc index 69d57c1..26d6d9d 100644 --- a/flagcx/core/transport.cc +++ b/flagcx/core/transport.cc @@ -5,7 +5,6 @@ #include "net.h" #include "topo.h" #include "adaptor.h" -#include "hostGpuMemAlloc.h" #define ENABLE_TIMER 0 #include "timer.h" diff --git a/flagcx/service/launch_kernel.cc b/flagcx/service/launch_kernel.cc deleted file mode 100644 index db4fc6d..0000000 --- a/flagcx/service/launch_kernel.cc +++ /dev/null @@ -1,110 +0,0 @@ -#define CREATE_GPU_MEMALLOC_API -#include "hostGpuMemAlloc.h" -#include "debug.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "adaptor.h" -#include "utils.h" -#include "param.h" - -using namespace std; - -struct flagcxDeviceAdaptor devRunTimeApi; - -void *dlsymCheck(void *handle, const char *funcName){ - void *funcPtr = dlsym(handle, funcName); - if(funcPtr == NULL) INFO(FLAGCX_INIT, "fail to load symbol %s", funcName); - return funcPtr; -} - -#define LOADSYMBOL(handle,api) do{ \ - api = (typeof(api)) dlsymCheck(handle, #api); \ -}while(0); - -flagcxResult_t loadDeviceSymbol(){ - void *libHandle = dlopen("./libmylib.so", RTLD_LAZY); - if(libHandle == nullptr){ - const char* useNet = flagcxGetEnv("FLAGCX_USENET"); - if(useNet == NULL){ - INFO(FLAGCX_INIT, "fail to open libmylib.so"); - return flagcxRemoteError; - } - return flagcxSuccess; - } - - LOADSYMBOL(libHandle, flagcxDevKernelFunc); - LOADSYMBOL(libHandle, flagcxCuInit); - LOADSYMBOL(libHandle, flagcxCuGdrMemAlloc); - LOADSYMBOL(libHandle, flagcxDeviceCreateStream); - LOADSYMBOL(libHandle, _flagcxLaunchKernel); - LOADSYMBOL(libHandle, flagcxDeviceStreamSynchronize); - LOADSYMBOL(libHandle, flagcxDeviceDestroyStream); - LOADSYMBOL(libHandle, flagcxCuGdrMemFree); - LOADSYMBOL(libHandle, flagcxCuDestroy); - LOADSYMBOL(libHandle, flagcxHostShareMemAlloc); - LOADSYMBOL(libHandle, flagcxHostShareMemFree); - LOADSYMBOL(libHandle, flagcxDeviceSynchronize); - LOADSYMBOL(libHandle, flagcxDeviceMemcpy); - LOADSYMBOL(libHandle, flagcxDeviceMemset); - LOADSYMBOL(libHandle, flagcxDeviceMalloc); - LOADSYMBOL(libHandle, flagcxDeviceFree); - LOADSYMBOL(libHandle, flagcxSetDevice); - LOADSYMBOL(libHandle, flagcxGetDevice); - LOADSYMBOL(libHandle, flagcxGetVendor); - LOADSYMBOL(libHandle, flagcxDeviceStreamQuery); - LOADSYMBOL(libHandle, flagcxCopyArgsInit); - LOADSYMBOL(libHandle, flagcxCopyArgsFree); - LOADSYMBOL(libHandle, flagcxDeviceCreateEvent); - LOADSYMBOL(libHandle, flagcxDeviceEventQuery); - LOADSYMBOL(libHandle, flagcxDeviceEventBlock); - LOADSYMBOL(libHandle, flagcxDeviceDestroyEvent); - LOADSYMBOL(libHandle, flagcxDeviceEventRecord); - LOADSYMBOL(libHandle, flagcxDeviceLaunchHostFunc); - LOADSYMBOL(libHandle, flagcxTopoGetLocalNet); - - struct flagcxDeviceAdaptor loadApi{ - "runTimeApi", - LOADAPI(flagcxDeviceAdaptor,deviceSynchronize, flagcxDeviceSynchronize), - LOADAPI(flagcxDeviceAdaptor,deviceMemcpy, flagcxDeviceMemcpy), - LOADAPI(flagcxDeviceAdaptor,deviceMemset, flagcxDeviceMemset), - LOADAPI(flagcxDeviceAdaptor,deviceMalloc, flagcxDeviceMalloc), - LOADAPI(flagcxDeviceAdaptor,deviceFree, flagcxDeviceFree), - LOADAPI(flagcxDeviceAdaptor,setDevice, flagcxSetDevice), - LOADAPI(flagcxDeviceAdaptor,getDevice, flagcxGetDevice), - LOADAPI(flagcxDeviceAdaptor,getVendor, flagcxGetVendor), - LOADAPI(flagcxDeviceAdaptor,memHandleInit, flagcxCuInit), - LOADAPI(flagcxDeviceAdaptor,memHandleDestroy, flagcxCuDestroy), - LOADAPI(flagcxDeviceAdaptor,gdrMemAlloc, flagcxCuGdrMemAlloc), - LOADAPI(flagcxDeviceAdaptor,gdrMemFree, flagcxCuGdrMemFree), - LOADAPI(flagcxDeviceAdaptor,hostShareMemAlloc, flagcxHostShareMemAlloc), - LOADAPI(flagcxDeviceAdaptor,hostShareMemFree, flagcxHostShareMemFree), - LOADAPI(flagcxDeviceAdaptor,streamCreate, flagcxDeviceCreateStream), - LOADAPI(flagcxDeviceAdaptor,streamDestroy, flagcxDeviceDestroyStream), - LOADAPI(flagcxDeviceAdaptor,streamSynchronize, flagcxDeviceStreamSynchronize), - LOADAPI(flagcxDeviceAdaptor,streamQuery, flagcxDeviceStreamQuery), - LOADAPI(flagcxDeviceAdaptor,launchKernel, _flagcxLaunchKernel), - LOADAPI(flagcxDeviceAdaptor,copyArgsInit, flagcxCopyArgsInit), - LOADAPI(flagcxDeviceAdaptor,copyArgsFree, flagcxCopyArgsFree), - LOADAPI(flagcxDeviceAdaptor,topoGetSystem, flagcxTopoGetSystem), - LOADAPI(flagcxDeviceAdaptor,launchHostFunc, flagcxDeviceLaunchHostFunc), - }; - devRunTimeApi = loadApi; - return flagcxSuccess; -} - -flagcxResult_t flagcxLaunchKernel(void *func, DIM3 grid, DIM3 block, void **args, size_t share_mem, void *stream, void *memHandle){ - return _flagcxLaunchKernel(func, block.x, block.y, block.z, grid.x, grid.y, grid.z, args, share_mem, stream, memHandle); -} - -void cpuAsyncLaunch(void *_args){ - struct hostLaunchArgs *args = (struct hostLaunchArgs *) _args; - while(!args->stopLaunch); - __atomic_store_n(&args->retLaunch, 1, __ATOMIC_RELAXED); -} diff --git a/flagcx/service/load_devapi.cc b/flagcx/service/load_devapi.cc new file mode 100644 index 0000000..9f0d65d --- /dev/null +++ b/flagcx/service/load_devapi.cc @@ -0,0 +1,45 @@ +#define CREATE_DEVICE_TOPO_API +#include "topo.h" +#include "debug.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "adaptor.h" +#include "utils.h" +#include "param.h" + +using namespace std; + +struct flagcxDeviceAdaptor devRunTimeApi; + +void *dlsymCheck(void *handle, const char *funcName){ + void *funcPtr = dlsym(handle, funcName); + if(funcPtr == NULL) INFO(FLAGCX_INIT, "fail to load symbol %s", funcName); + return funcPtr; +} + +#define LOADSYMBOL(handle,api) do{ \ + api = (typeof(api)) dlsymCheck(handle, #api); \ +}while(0); + +flagcxResult_t loadDeviceSymbol(){ + void *libHandle = dlopen("./libmylib.so", RTLD_LAZY); + if(libHandle == nullptr){ + const char* useNet = flagcxGetEnv("FLAGCX_USENET"); + if(useNet == NULL){ + INFO(FLAGCX_INIT, "fail to open libmylib.so"); + return flagcxRemoteError; + } + return flagcxSuccess; + } + + LOADSYMBOL(libHandle, flagcxTopoGetLocalNet); + return flagcxSuccess; +} +