Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Optimize generator #1540

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions install.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env bash

# ########################################################################
# Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -61,7 +61,7 @@ supported_distro( )
printf "supported_distro(): \$ID must be set\n"
exit 2
fi

case "${ID}" in
ubuntu|centos|almalinux|rhel|fedora|sles|opensuse-leap|mariner|azurelinux)
true
Expand Down Expand Up @@ -323,7 +323,7 @@ install_blis()
#Download prebuilt AMD multithreaded blis
if [[ ! -e "./blis/lib/libblis.a" ]]; then
case "${ID}" in
centos|rhel|sles|opensuse-leap)
centos|rhel|sles|opensuse-leap|almalinux)
wget -nv -O blis.tar.gz https://github.com/amd/blis/releases/download/2.0/aocl-blis-mt-centos-2.0.tar.gz
;;
ubuntu)
Expand Down
4 changes: 2 additions & 2 deletions tensilelite/Tensile/Source/lib/source/TensorDescriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
*
* MIT License
*
* Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -111,7 +111,7 @@ namespace TensileLite

result << "), strides(";
streamJoin(result, m_strides, ", ");

result << ") )";
return result.str();
}

Expand Down
9 changes: 4 additions & 5 deletions tensilelite/Tensile/Utilities/tensile_generator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,16 +61,15 @@ To use the `tensile_config_generator.py` script, follow these steps:

5. Merge tune results:

MI308:
gfx942:

For cpx, use the gfx942_20cu folder; for spx, use the gfx942_80cu folder.
```
python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/ <tune result directory>/3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/{gfx942_20cu|gfx942_80cu}/{Equality|GridBased}/
python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_{cu count}/{Equality|GridBased}/ <tune result directory>/3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aquavanjaram/gfx942_{cu count}/{Equality|GridBased}/
```
MI210:
gfx90a:

```
python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/ <tune result directory>/3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/104CU/{Equality|GridBased}/
python3 ./tensilelite/Tensile/Utilities/merge.py --no_eff library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/{cu count}/{Equality|GridBased}/ <tune result directory>/3_LibraryLogic/ library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/aldebaran/{cu count}/{Equality|GridBased}/
```

6. Rebuild hipBLASLt with the merged results:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
################################################################################
#
# Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -31,6 +31,8 @@
import subprocess
import math
import numpy as np
import concurrent.futures


# Paths to the input and output files
parser = argparse.ArgumentParser(description="""Generate Tensile config file""")
Expand Down Expand Up @@ -88,26 +90,36 @@

CU_RE = r"Compute Unit:(?P<COMPUTE_UNIT>[\w ]+)"

res = subprocess.run("/opt/rocm/llvm/bin/offload-arch", stdout=subprocess.PIPE)
ArchitectureName = res.stdout.decode("utf-8").strip()
res = subprocess.run("rocminfo | grep Compute", stdout=subprocess.PIPE, shell=True, env={"ROCR_VISIBLE_DEVICES":"0"})
match = re.search(CU_RE, res.stdout.decode("utf-8").split('\n')[-2])
NUM_STAGES = args.num_stages
DIV_MI = 3 # 33.3%
MIN_MI = 5 # min 5 solutions
NONTEMPORALRATIO = 8
CU = 0

OFFLOAD_ARCH = "/opt/rocm/llvm/bin/offload-arch"
NUM_INST = "/sys/class/drm/card1/device/compute_partition_config/xcc/num_inst"

# Detect the GPU architecture (e.g. gfx942, gfx90a) via the ROCm
# offload-arch tool; abort early if the tool is missing so the user
# knows to hard-code ArchitectureName instead.
ArchitectureName = None
if os.path.exists(OFFLOAD_ARCH):
    res = subprocess.run(OFFLOAD_ARCH, stdout=subprocess.PIPE)
    ArchitectureName = res.stdout.decode("utf-8").strip()
else:
    # Fixed grammar: "please specific" -> "please specify".
    raise FileNotFoundError(f"{OFFLOAD_ARCH} not found, please specify ArchitectureName in the script.")

# Query the compute-unit count of device 0 from rocminfo. The second-to-last
# matching line holds the "Compute Unit:" field parsed by CU_RE.
res = subprocess.run("rocminfo | grep Compute", stdout=subprocess.PIPE, shell=True, env={"ROCR_VISIBLE_DEVICES":"0"})
match = re.search(CU_RE, res.stdout.decode("utf-8").split('\n')[-2])
if match:
    CU = int(match.group('COMPUTE_UNIT').strip())
else:
    raise RuntimeError("Failed to get compute unit from rocminfo")

# Determine the XCC (chiplet) count for gfx942 parts from the sysfs
# compute-partition configuration exposed by the amdgpu driver.
XCC = None
if ArchitectureName == 'gfx942':
    # BUG FIX: the existence check previously tested OFFLOAD_ARCH, but this
    # branch reads NUM_INST (and the error message names NUM_INST) — check
    # the file that is actually read.
    if os.path.exists(NUM_INST):
        # Read the sysfs node directly instead of shelling out to `cat`.
        with open(NUM_INST) as f:
            XCC = int(f.read().strip())
    else:
        # Fixed grammar: "please specific" -> "please specify".
        raise FileNotFoundError(f"{NUM_INST} not found, please specify XCC in the script.")
DeviceNames = ["Device 0049", "Device 0050"]
ScheduleName = "aquavanjaram"
elif ArchitectureName == 'gfx90a':
Expand All @@ -122,15 +134,15 @@
fp32_instructions = [[32,32,1,2], [32,32,2,1], [16,16,1,4], [16,16,4,1], [4,4,1,16]]
fp8_instructions = [[32,32,16,1], [16,16,32,1]]
else:
fp16_instructions = [[16,16,16,1]]
bf16_instructions = [[16,16,16,1],[32,32,8,1]]
tf32_instructions = [[16,16,8,1]]
fp32_instructions = [[16,16,4,1]]
fp16_instructions = [[16,16,16,1], [32,32,8,1]]
bf16_instructions = [[16,16,16,1], [32,32,8,1]]
tf32_instructions = [[16,16,8,1], [32,32,4,1]]
fp32_instructions = [[16,16,4,1], [32,32,2,1]]
fp8_instructions = [[16,16,32,1]]


HIPBLASLT_BENCH_BASE = (
r"(?P<CMD>\w+) --api_method c "
r"hipblaslt-bench --api_method (?P<API_METHOD>\w+) "
r"-m (?P<M>[\d ]+)"
r"-n (?P<N>[\d ]+)"
r"-k (?P<K>[\d ]+)"
Expand Down Expand Up @@ -354,11 +366,11 @@ def split_gemms_by_gpus(unique_gemms, gpus):
unique_gemms_subgroups[i%gpus] = [(k, v)]
return unique_gemms_subgroups

def calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters):
m_avg = m_sum / len(unique_gemms_subgroup)
n_avg = n_sum / len(unique_gemms_subgroup)
batch_avg = batch_sum / len(unique_gemms_subgroup)
k_avg = k_sum / len(unique_gemms_subgroup)
def calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, samples_num, iters):
    """Return the MinFlopsPerSync estimate for one group of sampled GEMMs.

    Each ``*_sum`` argument is the sum of that problem dimension over all
    sampled GEMMs; dividing by ``samples_num`` gives the average problem
    size. The estimate scales the average GEMM's multiply-add count
    (m*n*batch*k/2) by the enqueues per sync plus the benchmark iterations.
    """
    # Average each dimension over the number of sampled problems.
    m_avg, n_avg, batch_avg, k_avg = (
        total / samples_num for total in (m_sum, n_sum, batch_sum, k_sum)
    )

    return (ENQUEUES_PER_SYNC + iters) * m_avg * n_avg * batch_avg * k_avg / 2

Expand All @@ -367,8 +379,8 @@ def calculate_gsu(matmul_instruction, size):
mt1 = matmul_instruction[1] * matmul_instruction[6] * matmul_instruction[8]
return max(1, CU // (math.ceil(size[0] / mt0) * math.ceil(size[1] / mt1)))

def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, iters, groups, gsu_group):
MinFlopsPerSync = calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, iters)
def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, samples_num, iters, groups, gsu_group, matmul_instructions):
MinFlopsPerSync = calculate_min_flops(m_sum, n_sum, batch_sum, k_sum, samples_num, iters)
# Read the YAML file
with open(yaml_file, 'r') as f:
data = yaml.safe_load(f)
Expand Down Expand Up @@ -440,25 +452,36 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
# Write the updated YAML file
yaml_file = os.path.basename(yaml_file)
slices = yaml_file.split('.')
with open(slices[0]+'.'+str(gpu_idx)+'.'+slices[1], 'w') as f:
fname = slices[0]+'.'+str(gpu_idx)+'.'+slices[1]
with open(fname, 'w') as f:
yaml.dump(data, f, default_flow_style=None)
print(f"Dumped yaml to {fname}")


if args.hipblaslt_log and args.gridbase_config is None:
LibraryType = "Equality"
unique_gemms = {}
# Read problem sizes from the input file
with open(args.hipblaslt_log, 'r') as f:
for line in f:
lines = f.readlines()
def _extract_gemms(line):
    """Parse one hipblaslt-bench log line into a hashable GEMM key.

    Returns a ``(size_json, dtype_json)`` tuple of JSON strings (so callers
    can use it as a dict key), or ``None`` when the line does not match the
    bench pattern or its dtype cannot be determined.

    BUG FIX: the body contained a stray ``continue`` (leftover from the
    loop-based version) which is a SyntaxError inside a plain function;
    it is replaced by ``return None``.
    """
    match = match_pattern(line)
    if match:
        size = extract_problem_size(match)
        dtype = extract_dtype(match)
        if dtype is None:
            print(f"Can't find dtype for {line}, please contact hipblaslt expert")
            return None
        size_str = json.dumps(size)
        dtype_str = json.dumps(dtype)
        return (size_str, dtype_str)
    return None

with concurrent.futures.ProcessPoolExecutor(8) as executor:
results = executor.map(_extract_gemms, list(lines))
for res in results:
if res is not None:
(size_str, dtype_str) = res
if (size_str, dtype_str) in unique_gemms:
unique_gemms[(size_str, dtype_str)] += 1
else:
Expand All @@ -470,13 +493,14 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it

unique_gemms_subgroups = split_gemms_by_gpus(unique_gemms, args.gpus)

for gpu_idx, unique_gemms_subgroup in enumerate(unique_gemms_subgroups):
def _process_gemms(item):
gpu_idx, unique_gemms_subgroup = item
gemm_group = {}
gsu_group = {}
matmul_instructions = {}
groups = {}
if unique_gemms_subgroup is None:
continue
return None

m_sum = 0
n_sum = 0
Expand Down Expand Up @@ -556,8 +580,9 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
n_sum += original_size[1]
batch_sum += original_size[2]
k_sum += original_size[3]
samples_num = len(unique_gemms_subgroup)
return dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, samples_num, args.iters, groups, gsu_group, matmul_instructions)

dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, groups, gsu_group)

elif args.gridbase_config and args.hipblaslt_log is None:
LibraryType = "GridBased"
Expand Down Expand Up @@ -588,7 +613,8 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it

unique_gemms_subgroups = split_gemms_by_gpus(unique_gemms, args.gpus)

for gpu_idx, unique_gemms_subgroup in enumerate(unique_gemms_subgroups):
def _process_gemms(item):
gpu_idx, unique_gemms_subgroup = item
gemm_group = {}
matmul_instructions = {}
gsu_group = {}
Expand Down Expand Up @@ -641,5 +667,8 @@ def dump_yaml(gpu_idx, gemm_group, yaml_file, m_sum, n_sum, batch_sum, k_sum, it
n_sum += original_size[1]
batch_sum += original_size[2]
k_sum += original_size[3]
samples_num = len(unique_gemms_subgroup)
return dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, samples_num, args.iters, {}, gsu_group, matmul_instructions)

dump_yaml(gpu_idx, gemm_group, args.tensile_config, m_sum, n_sum, batch_sum, k_sum, args.iters, {}, gsu_group)
with concurrent.futures.ProcessPoolExecutor(args.gpus) as executor:
results = executor.map(_process_gemms, list(enumerate(unique_gemms_subgroups)))
Loading