Add reduction_unroll_factor to autotuning script #3487

Merged
9 commits, Dec 13, 2024
53 changes: 43 additions & 10 deletions doc/dev/python_scheduling/autotune_inner_reduction.py
@@ -72,8 +72,10 @@ class FUSION(Enum):
class InnerReductionConfiguration:
# The vectorization factor for inner reduction domain.
vectorize_factor: int = 1
# The unroll factor for the inner reduction domain.
reduction_unroll_factor: int = 1
# The unroll factor for the outer iteration domain.
unroll_factor: int = 1
iteration_unroll_factor: int = 1
# The grid size for the outer iteration domain.
# If grdim > 1, then godim corresponds with y axis of the grid.
# Otherwise, it is the x axis of the grid.
@@ -121,11 +123,16 @@ def convert_to_inner_reduction_params(self, scheduler_config, reduction_params):
reduction_params.vectorize_inner_reduction = (
scheduler_config.vectorize_factor > 1
)
reduction_params.unroll_factor_top_of_vectorization = (
scheduler_config.reduction_unroll_factor
)

if scheduler_config.bdimy > 1:
reduction_params.block_dim_iter_dom = ParallelType.block_y

reduction_params.unroll_factor_iter_dom = scheduler_config.unroll_factor
reduction_params.unroll_factor_iter_dom = (
scheduler_config.iteration_unroll_factor
)

gdimx = -1
gdimy = -1
@@ -161,16 +168,27 @@ def convert_to_inner_reduction_params(self, scheduler_config, reduction_params):
def generate_scheduler_configurations(self, input_shape):
threads_per_cta_options = [128, 256, 512, 1024]
vectorization_factor_options = [1, 2, 4, 8]
unroll_factor_options = list(range(1, 11))
reduction_unroll_factor_options = list(range(1, 6))
iteration_unroll_factor_options = list(range(1, 6))
warp_size = 32

num_iterations, num_reductions = input_shape

for threads_per_cta, vectorize_factor, unroll_factor in itertools.product(
threads_per_cta_options, vectorization_factor_options, unroll_factor_options
for (
threads_per_cta,
vectorize_factor,
reduction_unroll_factor,
iteration_unroll_factor,
) in itertools.product(
threads_per_cta_options,
vectorization_factor_options,
reduction_unroll_factor_options,
iteration_unroll_factor_options,
):
scheduler_config = self.InnerReductionConfiguration(
vectorize_factor=vectorize_factor, unroll_factor=unroll_factor
vectorize_factor=vectorize_factor,
reduction_unroll_factor=reduction_unroll_factor,
iteration_unroll_factor=iteration_unroll_factor,
)
scheduler_config.bdimx = min(
threads_per_cta,
@@ -184,20 +202,35 @@ def generate_scheduler_configurations(self, input_shape):
max(1, floor_div(threads_per_cta, scheduler_config.bdimx)),
)
scheduler_config.godim = ceil_div(
num_iterations, scheduler_config.bdimy * scheduler_config.unroll_factor
num_iterations, scheduler_config.bdimy * iteration_unroll_factor
)

# number of reduction elements not handled by a CTA
remaining_reduction = ceil_div(
num_reductions,
(scheduler_config.bdimx * scheduler_config.vectorize_factor),
ceil_div(
ceil_div(num_reductions, vectorize_factor), scheduler_config.bdimx
),
reduction_unroll_factor,
)

if unroll_factor == 1 and remaining_reduction > 1:
if iteration_unroll_factor == 1 and remaining_reduction > 1:
Collaborator

This looks strange to me. Why grdim = remaining_reduction? We can do serial reduction instead of grid reduction.
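
As a rough illustration of the two options being weighed here (hypothetical numbers, not from this PR), a leftover reduction factor of 8 can either be mapped to grdim as a grid reduction or kept as a per-thread serial loop:

  # Hypothetical example: 8 reduction elements remain after vectorization,
  # bdimx, and reduction_unroll_factor are applied.
  remaining_reduction = 8

  # Grid reduction: the leftover factor maps to grdim, so 8 blocks cooperate
  # and combine their partial results across the grid.
  grid_reduction_grdim = remaining_reduction  # grdim = 8

  # Serial reduction: grdim stays 1 and each thread iterates over the 8
  # leftover elements itself, with no cross-block communication.
  serial_reduction_grdim = 1  # serial loop trip count = 8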

Collaborator

nvFuser's default heuristic does:

  // When iteration dim is small, may have unused SMs, to increase SM usage
  // needs to shift from block reduction to grid reduction.
  int64_t grdim = 1;
  while (godim * grdim * 2 <= sm_count && getInnerRemainder() / grdim >= 2) {
    grdim *= 2;
  }

Collaborator Author
@rdspring1 Dec 13, 2024

From inner2dReductionHeuristic, I see this:

  // Cross grid reduction if we haven't hit our target blocks, and we have many
  // reduction elements.
  if ((godim < target_blocks && remainder_in_reduction >= 0) ||
      (remainder_in_reduction >= kEight)) {
    grdim = remainder_in_reduction;
  }

   // Try to do some cleanup of ragged waves on device
   { do_something }

   // Grid reductions do not support unrolling iteration dimension, revert if
   // set. Recalculate godim.
   { do_something }

Collaborator

Another approach is to add another search parameter, is_block_reduction: if it is true, we only use block reduction; if it is false, we do grid reduction.
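
A minimal sketch of how that extra search dimension could look (hypothetical, not part of this PR; names and placeholder values only loosely mirror the script's generator):

  import itertools

  threads_per_cta_options = [128, 256, 512, 1024]
  vectorization_factor_options = [1, 2, 4, 8]
  is_block_reduction_options = [False, True]

  def candidate_grdim(is_block_reduction, remaining_reduction, sm_count, godim):
      # Block reduction only: the whole reduction stays inside one CTA.
      if is_block_reduction or remaining_reduction <= 1:
          return 1
      # Grid reduction: shift work from block to grid until SMs are occupied,
      # mirroring the default heuristic quoted above.
      grdim = 1
      while godim * grdim * 2 <= sm_count and remaining_reduction / grdim >= 2:
          grdim *= 2
      return grdim

  for threads_per_cta, vectorize_factor, is_block_reduction in itertools.product(
      threads_per_cta_options,
      vectorization_factor_options,
      is_block_reduction_options,
  ):
      # Placeholder values stand in for the per-configuration quantities the
      # script derives from the input shape.
      grdim = candidate_grdim(
          is_block_reduction, remaining_reduction=8, sm_count=108, godim=16
      )
      print(threads_per_cta, vectorize_factor, is_block_reduction, grdim)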

# all remaining reduction goes to grdim
scheduler_config.grdim = remaining_reduction
yield scheduler_config

# When iteration dim is small, there may be unused SMs. We need
# to shift work from block reduction to grid reduction to
# increase SM usage.
godim = scheduler_config.godim
grdim = 1
while (
godim * grdim * 2 <= self.gpu_properties.multi_processor_count
and (remaining_reduction / grdim) >= 2
):
grdim *= 2
scheduler_config.grdim = grdim
yield scheduler_config

# grid stride across reduction iterDomain is 1
scheduler_config.grdim = 1
yield scheduler_config