Add reduction_unroll_factor to autotuning script (#3487)
This PR renames `unroll_factor` to `iteration_unroll_factor` and adds
`reduction_unroll_factor`. `reduction_unroll_factor` applies an unroll factor
on top of the vectorization factor for the inner reduction domain.
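
For intuition, a minimal sketch of the loop nesting the two factors imply for a single thread, written as plain Python with illustrative names (not the generated kernel): each step processes `reduction_unroll_factor` chunks of width `vectorize_factor`, so the two factors multiply into the per-thread tile handled per step.

# Illustrative sketch only: approximates the per-thread loop nesting implied by
# vectorize_factor and reduction_unroll_factor. The real scheduler interleaves
# threads and blocks and picks the memory layout; only the nesting and the
# tile-size arithmetic are the point here.
def per_thread_reduction_sketch(values, vectorize_factor=8, reduction_unroll_factor=2):
    tile = vectorize_factor * reduction_unroll_factor  # elements per thread per step
    acc = 0.0
    for start in range(0, len(values), tile):  # serial stand-in for the grid-stride loop
        for u in range(reduction_unroll_factor):  # unroll on top of vectorization
            lo = start + u * vectorize_factor
            acc += sum(values[lo : lo + vectorize_factor])  # one "vectorized" load + reduce
    return acc

With `vectorize_factor=8` and `reduction_unroll_factor=2`, each step covers 16 elements per thread instead of 8.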
rdspring1 authored and jacobhinkle committed Dec 16, 2024
1 parent 8882aa9 commit 36f49db
Showing 1 changed file with 43 additions and 10 deletions.
53 changes: 43 additions & 10 deletions doc/dev/python_scheduling/autotune_inner_reduction.py
@@ -72,8 +72,10 @@ class FUSION(Enum):
     class InnerReductionConfiguration:
         # The vectorization factor for inner reduction domain.
         vectorize_factor: int = 1
+        # The unroll factor for the inner reduction domain.
+        reduction_unroll_factor: int = 1
         # The unroll factor for the outer iteration domain.
-        unroll_factor: int = 1
+        iteration_unroll_factor: int = 1
         # The grid size for the outer iteration domain.
         # If grdim > 1, then godim corresponds with y axis of the grid.
         # Otherwise, it is the x axis of the grid.
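
For reference, a configuration carrying the new fields is built the same way the generator below builds one; a standalone sketch with arbitrary values (assumes the dataclass is in scope; in the script it is reached as `self.InnerReductionConfiguration`):

# Example only: arbitrary values mirroring the keyword arguments used later in
# generate_scheduler_configurations.
config = InnerReductionConfiguration(
    vectorize_factor=8,          # vectorization width for the inner reduction domain
    reduction_unroll_factor=2,   # unroll applied on top of vectorization
    iteration_unroll_factor=3,   # unroll of the outer iteration domain
)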
@@ -121,11 +123,16 @@ def convert_to_inner_reduction_params(self, scheduler_config, reduction_params):
         reduction_params.vectorize_inner_reduction = (
             scheduler_config.vectorize_factor > 1
         )
+        reduction_params.unroll_factor_top_of_vectorization = (
+            scheduler_config.reduction_unroll_factor
+        )

         if scheduler_config.bdimy > 1:
             reduction_params.block_dim_iter_dom = ParallelType.block_y

-        reduction_params.unroll_factor_iter_dom = scheduler_config.unroll_factor
+        reduction_params.unroll_factor_iter_dom = (
+            scheduler_config.iteration_unroll_factor
+        )

         gdimx = -1
         gdimy = -1
@@ -161,16 +168,27 @@ def convert_to_inner_reduction_params(self, scheduler_config, reduction_params):
     def generate_scheduler_configurations(self, input_shape):
         threads_per_cta_options = [128, 256, 512, 1024]
         vectorization_factor_options = [1, 2, 4, 8]
-        unroll_factor_options = list(range(1, 11))
+        reduction_unroll_factor_options = list(range(1, 6))
+        iteration_unroll_factor_options = list(range(1, 6))
         warp_size = 32

         num_iterations, num_reductions = input_shape

-        for threads_per_cta, vectorize_factor, unroll_factor in itertools.product(
-            threads_per_cta_options, vectorization_factor_options, unroll_factor_options
+        for (
+            threads_per_cta,
+            vectorize_factor,
+            reduction_unroll_factor,
+            iteration_unroll_factor,
+        ) in itertools.product(
+            threads_per_cta_options,
+            vectorization_factor_options,
+            reduction_unroll_factor_options,
+            iteration_unroll_factor_options,
         ):
             scheduler_config = self.InnerReductionConfiguration(
-                vectorize_factor=vectorize_factor, unroll_factor=unroll_factor
+                vectorize_factor=vectorize_factor,
+                reduction_unroll_factor=reduction_unroll_factor,
+                iteration_unroll_factor=iteration_unroll_factor,
             )
             scheduler_config.bdimx = min(
                 threads_per_cta,
@@ -184,20 +202,35 @@ def generate_scheduler_configurations(self, input_shape):
                 max(1, floor_div(threads_per_cta, scheduler_config.bdimx)),
             )
             scheduler_config.godim = ceil_div(
-                num_iterations, scheduler_config.bdimy * scheduler_config.unroll_factor
+                num_iterations, scheduler_config.bdimy * iteration_unroll_factor
             )

             # number of reduction elements not handled by a CTA
             remaining_reduction = ceil_div(
-                num_reductions,
-                (scheduler_config.bdimx * scheduler_config.vectorize_factor),
+                ceil_div(
+                    ceil_div(num_reductions, vectorize_factor), scheduler_config.bdimx
+                ),
+                reduction_unroll_factor,
             )

-            if unroll_factor == 1 and remaining_reduction > 1:
+            if iteration_unroll_factor == 1 and remaining_reduction > 1:
                 # all remaining reduction goes to grdim
                 scheduler_config.grdim = remaining_reduction
                 yield scheduler_config

+                # When iteration dim is small, there may be unused SMs. We need
+                # to shift work from block reduction to grid reduction to
+                # increase SM usage.
+                godim = scheduler_config.godim
+                grdim = 1
+                while (
+                    godim * grdim * 2 <= self.gpu_properties.multi_processor_count
+                    and (remaining_reduction / grdim) >= 2
+                ):
+                    grdim *= 2
+                scheduler_config.grdim = grdim
+                yield scheduler_config
+
             # grid stride across reduction iterDomain is 1
             scheduler_config.grdim = 1
             yield scheduler_config
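
To see what the reworked `remaining_reduction` formula and the new block-to-grid shift loop do, here is a worked example with arbitrary numbers; the SM count is an assumption for illustration only:

def ceil_div(a, b):
    return (a + b - 1) // b

# Arbitrary example values; the script derives these from the input shape, the
# candidate configuration, and the GPU properties.
num_reductions = 32768
vectorize_factor = 8
bdimx = 256
reduction_unroll_factor = 2
num_iterations = 8
bdimy = 1
iteration_unroll_factor = 1
multi_processor_count = 108  # assumed SM count, illustration only

# Reduction work left for the grid after vectorization, bdimx threads, and unroll:
# ceil(ceil(ceil(32768 / 8) / 256) / 2) = ceil(16 / 2) = 8
remaining_reduction = ceil_div(
    ceil_div(ceil_div(num_reductions, vectorize_factor), bdimx),
    reduction_unroll_factor,
)

godim = ceil_div(num_iterations, bdimy * iteration_unroll_factor)  # 8 CTAs along iteration

# Shift work from block reduction to grid reduction while SMs would sit idle.
grdim = 1
while godim * grdim * 2 <= multi_processor_count and remaining_reduction / grdim >= 2:
    grdim *= 2
print(remaining_reduction, godim, grdim)  # 8 8 8

With only 8 CTAs along the iteration dimension, `grdim` is doubled up to 8, so 64 CTAs are launched instead of 8 and far more SMs take part in the grid reduction.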
