Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

How do I use enqueue_kernel to calculate vec_add? #19

Open
ryansoq opened this issue Jun 28, 2016 · 0 comments
Open

How do I use enqueue_kernel to calculate vec_add? #19

ryansoq opened this issue Jun 28, 2016 · 0 comments

Comments

@ryansoq
Copy link

ryansoq commented Jun 28, 2016

Hi All,
I have a question.
How do I use enqueue_kernel ?

This is my environment :
CLOC-0.8
/opt/amd/bin/clc2 -version
LLVM version 3.2svn
/opt/amd/bin/opt -version
LLVM version 3.2svn
HSA-Runtime-AMD-release-v1.0

.cl code

void VectorAdd_child(__global int* a, __global int* b, __global int* res);

__kernel void vectoradd(__global int* a, __global int* b, __global int* res)
{
        int gid = get_global_id(0);
        //res[gid] = a[gid] + b[gid];
        queue_t defQ = get_default_queue();
        ndrange_t ndrange1 = ndrange_1D(256);
        void (^myblock)(void)=^{VectorAdd_child(a, b, res);};
        int err_ret = enqueue_kernel(defQ, 1, ndrange1, myblock);
}

void VectorAdd_child(__global int* a, __global int* b, __global int* res)
{
        int gid = get_global_id(0);
        res[gid] = a[gid] + b[gid];
}

host code

// ref libclc code

typedef struct _HsaAqlDispatchPacket {
    uint    mix;
    ushort  workgroup_size[3];
    ushort  reserved2;
    uint    grid_size[3];
    uint    private_segment_size_bytes;
    uint    group_segment_size_bytes;
    ulong   kernel_object_address;
    ulong   kernel_arg_address;
    ulong   reserved3;
    ulong   completion_signal;
} HsaAqlDispatchPacket;

typedef struct _AmdAqlWrap {
    uint state;
    uint enqueue_flags; 
    uint command_id;
    uint child_counter;
    ulong completion;
    ulong parent_wrap;
    ulong wait_list;
    uint wait_num;
    uint reserved[5];
    HsaAqlDispatchPacket aql;
} AmdAqlWrap;

typedef struct _AmdVQueueHeader {
    uint    aql_slot_num;
    uint    event_slot_num;
    ulong   event_slot_mask;
    ulong   event_slots;
    ulong   aql_slot_mask;
    uint    command_counter;
    uint    wait_size;
    uint    arg_size;
    uint    reserved0;
    ulong   kernel_table;
    uint    reserved[2];
} AmdVQueueHeader;

int main(int argc, char **argv) {
...
hsa_status_t err;
status_t build_err;

err = hsa_init();
check(Initializing the hsa runtime, err);

/* 
 * Iterate over the agents and pick the gpu agent using 
 * the find_gpu callback.
 */
hsa_agent_t device = 0;
err = hsa_iterate_agents(find_gpu, &device);
check(Calling hsa_iterate_agents, err);

err = (device == 0) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
check(Checking if the GPU device is non-zero, err);

/*
 * Query the name of the device.
 */
char name[64] = { 0 };
err = hsa_agent_get_info(device, HSA_AGENT_INFO_NAME, name);
check(Querying the device name, err);
printf("The device name is %s.\n", name);

/*
 * Query the maximum size of the queue.
 */
uint32_t queue_size = 0;
err = hsa_agent_get_info(device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
check(Querying the device maximum queue size, err);
printf("The maximum queue size is %u.\n", (unsigned int) queue_size);

/*
 * Create a queue using the maximum size.
 */
hsa_queue_t* commandQueue;
err = hsa_queue_create(device, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, &commandQueue);
check(Creating the queue, err);

/*
 * Load BRIG, encapsulated in an ELF container, into a BRIG module.
 */
hsa_ext_brig_module_t* brigModule;
char file_name[128] = "shader.brig";
build_err = create_brig_module_from_brig_file(file_name, &brigModule);
check_build(Creating the brig module from shader.brig, build_err);

/*
 * Create hsa program.
 */
hsa_ext_program_handle_t hsaProgram;
err = hsa_ext_program_create(&device, 1, HSA_EXT_BRIG_MACHINE_LARGE, HSA_EXT_BRIG_PROFILE_FULL, &hsaProgram);
check(Creating the hsa program, err);

/*
 * Add the BRIG module to hsa program.
 */
hsa_ext_brig_module_handle_t module;
err = hsa_ext_add_module(hsaProgram, brigModule, &module);
check(Adding the brig module to the program, err);

/* 
 * Construct finalization request list.
 */
hsa_ext_finalization_request_t finalization_request_list;
finalization_request_list.module = module;
finalization_request_list.program_call_convention = 0;
char kernel_name[128] = "&__OpenCL_vectoradd_kernel"; //&__OpenCL_%s_kernel
//char kernel_name[128] = "&__OpenCL___amd_blocks_func___vectoradd_block_invoke_kernel";
err = find_symbol_offset(brigModule, kernel_name, &finalization_request_list.symbol);
check(Finding the symbol offset for the kernel, err);

/*
 * Finalize the hsa program.
 */
err = hsa_ext_finalize_program(hsaProgram, device, 1, &finalization_request_list, NULL, NULL, 0, NULL, 0);
check(Finalizing the program, err);

/*
 * Destroy the brig module. The program was successfully created the kernel
 * symbol was found and the program was finalized, so it is no longer needed.
 */
 destroy_brig_module(brigModule);

/*
 * Get the hsa code descriptor address.
 */
hsa_ext_code_descriptor_t *hsaCodeDescriptor;
err = hsa_ext_query_kernel_descriptor_address(hsaProgram, module,    finalization_request_list.symbol, &hsaCodeDescriptor);
check(Querying the kernel descriptor address, err);

/*
 * Create a signal to wait for the dispatch to finish.
 */ 
hsa_signal_t signal;
err=hsa_signal_create(1, 0, NULL, &signal);
check(Creating a HSA signal, err);

/*
 * Initialize the dispatch packet.
 */
hsa_dispatch_packet_t aql;
memset(&aql, 0, sizeof(aql));

/*
 * Allocate and initialize the kernel arguments.
 */
const int DATA_SIZE = 128;

int   *a=(int*)malloc(sizeof(int)*DATA_SIZE);
int   *b=(int*)malloc(sizeof(int)*DATA_SIZE);
int *res=(int*)malloc(sizeof(int)*DATA_SIZE);

memset(a, 0, DATA_SIZE*sizeof(int));
//err=hsa_memory_register(a, DATA_SIZE*4);
//check(Registering argument memory for input parameter, err);

memset(b, 0, DATA_SIZE*sizeof(int));
//err=hsa_memory_register(b, DATA_SIZE*4);
//check(Registering argument memory for input parameter, err);

memset(res, 0, DATA_SIZE*sizeof(int));
//err=hsa_memory_register(res, DATA_SIZE*4);
//check(Registering argument memory for input parameter, err);
srand (time(NULL));
for(int i = 0; i < DATA_SIZE; i++) {
    a[i] = rand() % 10;
    b[i] = rand() % 10;
}

    /*
 * Setup the dispatch information.
 */
aql.completion_signal=signal;
aql.dimensions=(uint16_t)1;
aql.workgroup_size_x=(uint16_t)32;
aql.workgroup_size_y=(uint16_t)1;
aql.workgroup_size_z=(uint16_t)1;
aql.grid_size_x=DATA_SIZE;
aql.grid_size_y=1;
aql.grid_size_z=1;
aql.header.type=HSA_PACKET_TYPE_DISPATCH;
aql.header.acquire_fence_scope=2;
aql.header.release_fence_scope=2;
aql.header.barrier=1;
aql.group_segment_size=0;
aql.private_segment_size=0;

...


size_t kernel_arg_buffer_size = hsaCodeDescriptor->kernarg_segment_byte_size;
void* args __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) = malloc(hsaCodeDescriptor->kernarg_segment_byte_size) ;
memset(args, 0, kernel_arg_buffer_size);
void* args_pointer = args;
args_pointer += sizeof(void*) * (/*kernel->hsaArgCount*/9 - /*kernel->num_args*/3 - /* enque arg*/3);

  uint64_t   *pb=(uint64_t *)malloc(sizeof(uint64_t)*128);
  AmdVQueueHeader *vq =(AmdVQueueHeader *) malloc(sizeof(AmdVQueueHeader));
  AmdAqlWrap *aw = (AmdAqlWrap *)malloc(sizeof(AmdAqlWrap));

  memset(&pb, 0, sizeof(uint64_t)*128);
  memset(&vq, 0, sizeof(AmdVQueueHeader));
  memset(&aw, 0, sizeof(AmdAqlWrap));

  void* q1 = &pb;
  void* q2 = &vq;
  void* q3 = &aw;

   //set argument 
   //map  __vqueue_pointer, __aqlwrap_pointer

    memcpy(args_pointer, &q1, sizeof(void*));
    args_pointer += sizeof(void*);
    memcpy(args_pointer, &q2, sizeof(void*));
    args_pointer += sizeof(void*);
    memcpy(args_pointer, &q3, sizeof(void*));
    args_pointer += sizeof(void*);

    void* p1 = &vq;
    memcpy(args_pointer, &p1, sizeof(void*));
    args_pointer += sizeof(void*);

    void* p2 = &aw;
    memcpy(args_pointer, &p2, sizeof(void*));
    args_pointer += sizeof(void*);

    void* p3 = res;
    memcpy(args_pointer, &p3, sizeof(void*));
    args_pointer += sizeof(void*);

...

    printf("aw->aql.mix  : %d \n",(aw->aql).mix);
    printf("aw->aql.ws 0 : %hd \n",(aw->aql).workgroup_size[0]);
    printf("aw->aql.ws 1 : %hd \n",(aw->aql).workgroup_size[1]);
    printf("aw->aql.ws 2 : %hd \n",(aw->aql).workgroup_size[2]);
    printf("aw->aql.gs 0 : %d \n",(aw->aql).grid_size[0]);
    printf("aw->aql.gs 1 : %d \n",(aw->aql).grid_size[1]);
    printf("aw->aql.gs 2 : %d \n",(aw->aql).grid_size[2]);

// I get error data

.hsail

version 0:20140528:$full:$large;
extension "amd:gcn";
extension "IMAGE";

decl prog function &abort()();
...

prog kernel &__OpenCL_vectoradd_kernel(
        kernarg_u64 %__global_offset_0,
        kernarg_u64 %__global_offset_1,
        kernarg_u64 %__global_offset_2,
        kernarg_u64 %__printf_buffer,
        kernarg_u64 %__vqueue_pointer,
        kernarg_u64 %__aqlwrap_pointer,
        kernarg_u64 %a,
        kernarg_u64 %b,
        kernarg_u64 %res)
{

...
Thanks

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant