From 61d944db17d504d660d41b5a7479d39d84d25654 Mon Sep 17 00:00:00 2001
From: ZhiyuLi-goog <lizhiyu@google.com>
Date: Fri, 8 Nov 2024 22:10:27 +0000
Subject: [PATCH 01/40] base

---
 MaxText/kernels/__init__.py          |   0
 MaxText/kernels/megablox/__init__.py |   0
 MaxText/kernels/megablox/common.py   |  63 +++
 MaxText/kernels/megablox/gmm.py      | 793 +++++++++++++++++++++++++++
 MaxText/kernels/megablox/ops.py      | 109 ++++
 5 files changed, 965 insertions(+)
 create mode 100644 MaxText/kernels/__init__.py
 create mode 100644 MaxText/kernels/megablox/__init__.py
 create mode 100644 MaxText/kernels/megablox/common.py
 create mode 100644 MaxText/kernels/megablox/gmm.py
 create mode 100644 MaxText/kernels/megablox/ops.py

diff --git a/MaxText/kernels/__init__.py b/MaxText/kernels/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/MaxText/kernels/megablox/__init__.py b/MaxText/kernels/megablox/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/MaxText/kernels/megablox/common.py b/MaxText/kernels/megablox/common.py
new file mode 100644
index 000000000..bd843cf46
--- /dev/null
+++ b/MaxText/kernels/megablox/common.py
@@ -0,0 +1,63 @@
+# Copyright 2024 The JAX Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Common utilities for GMM kernels."""
+
+import re
+
+import jax
+import jax.numpy as jnp
+
+
+def is_tpu() -> bool:
+  return "TPU" in jax.devices()[0].device_kind
+
+
+def tpu_kind() -> str:
+  """Query identification string for the currently attached TPU."""
+  return jax.devices()[0].device_kind
+
+
+_TPU_KIND_PATTERN = re.compile(r"TPU v(\d+)")
+
+
+def tpu_generation() -> int:
+  """Generation number of the currently attached TPU."""
+  if version := _TPU_KIND_PATTERN.match(tpu_kind()):
+    return int(version[1])
+  raise NotImplementedError("only TPU devices are supported")
+
+
+def supports_bfloat16_matmul() -> bool:
+  """Does the currently attached CPU support bfloat16 inputs?"""
+  return not is_tpu() or tpu_generation() >= 4
+
+
+def assert_is_supported_dtype(dtype: jnp.dtype) -> None:
+  if dtype != jnp.bfloat16 and dtype != jnp.float32:
+    raise ValueError(f"Expected bfloat16 or float32 array but got {dtype}.")
+
+
+def select_input_dtype(lhs: jnp.ndarray, rhs: jnp.ndarray) -> jnp.dtype:
+  """A type to which both input should be adapted to before dot product."""
+  # bf16xbf16 matmul is only supported since TPUv4 generation. In case of mixed
+  # input precision, we need to convert bf16 argument to fp32 beforehand.
+  if (
+      supports_bfloat16_matmul()
+      and lhs.dtype == jnp.bfloat16
+      and rhs.dtype == jnp.bfloat16
+  ):
+    return jnp.bfloat16
+  else:
+    return jnp.float32
diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
new file mode 100644
index 000000000..5c2f93859
--- /dev/null
+++ b/MaxText/kernels/megablox/gmm.py
@@ -0,0 +1,793 @@
+# Copyright 2024 The JAX Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Grouped matrix multiplication kernels for TPU written in Pallas."""
+
+from collections.abc import Callable
+import functools
+from typing import Any, Optional
+
+import jax
+from jax import lax
+from jax.experimental import pallas as pl
+from jax.experimental.pallas import tpu as pltpu
+from jax.experimental.pallas.ops.tpu.megablox import common
+import jax.numpy as jnp
+
+
+partial = functools.partial
+
+
+def _validate_args(
+    *,
+    lhs: jnp.ndarray,
+    rhs: jnp.ndarray,
+    group_sizes: jnp.ndarray,
+    expected_rhs_dims: int = 3,
+) -> tuple[jnp.ndarray, jnp.ndarray, jnp.dtype]:
+  """Validates the arguments for the gmm function."""
+  # Validate 'lhs'.
+  if lhs.ndim != 2:
+    raise ValueError(f"Expected 2-tensor for 'lhs' but got {lhs.ndim}-tensor.")
+  common.assert_is_supported_dtype(lhs.dtype)
+
+  # Validate 'rhs'.
+  if rhs.ndim != expected_rhs_dims:
+    raise ValueError(
+        f"Expected {expected_rhs_dims}-tensor for 'rhs' but got"
+        f" {rhs.ndim}-tensor."
+    )
+  common.assert_is_supported_dtype(rhs.dtype)
+
+  # Validate 'group_sizes'.
+  if group_sizes.dtype != jnp.int32:
+    raise ValueError(
+        f"Expected 32-bit integer 'group_sizes' but got {group_sizes.dtype}."
+    )
+
+  return lhs, group_sizes, common.select_input_dtype(lhs, rhs)
+
+
+def _calculate_num_tiles(x: int, tx: int) -> int:
+  tiles, rem = divmod(x, tx)
+  if rem:
+    raise ValueError(f"{x} must be divisible by x-dimension tile size ({tx}).")
+  return tiles
+
+
+def _calculate_irregular_num_tiles(x: int, tx: int) -> tuple[int, int]:
+  tiles, rem = divmod(x, tx)
+  if rem:
+    tiles += 1
+  return tiles, rem
+
+
+GroupMetadata = Any  # TODO(enriqueps): Clean this up and use a namedtuple
+
+
+def make_group_metadata(
+    *,
+    group_sizes: jnp.ndarray,
+    m: int,
+    tm: int,
+    start_group: jnp.ndarray,
+    num_nonzero_groups: int,
+    visit_empty_groups: bool = True,
+) -> GroupMetadata:
+  """Create the metadata needed for grouped matmul computation.
+
+  Args:
+    group_sizes: A 1d, jnp.ndarray with shape [num_groups] and jnp.int32 dtype.
+    m: The number of rows in lhs.
+    tm: The m-dimension tile size being used.
+    start_group: The group in group sizes to start computing from. This is
+      particularly useful for when rhs num_groups is sharded.
+    num_nonzero_groups: Number of groups in group sizes to compute on. Useful in
+      combination with group_offset.
+    visit_empty_groups: If True, do not squeeze tiles for empty groups out of
+      the metadata. This is necessary for tgmm, where we at least need to zero
+      the output for each group.
+
+  Returns:
+    tuple of:
+      group_offsets: A 1d, jnp.ndarray with shape [num_groups+1] and jnp.int32
+        dtype. group_offsets[i] indicates the row at which group [i] starts in
+        the lhs matrix and group_offsets[i-1] = m.
+      group_ids: A 1d, jnp.ndarray with shape [m_tiles + num_groups] and
+        jnp.int32 dtype. group_ids[i] indicates which group grid index 'i' will
+        work on.
+      m_tile_ids: A 1d, jnp.ndarray with shape [m_tiles + num_groups] and
+        jnp.int32. m_tile_ids[i] indicates which m-dimension tile grid index 'i'
+        will work on.
+    num_tiles: The number of m-dimension tiles to execute.
+  """
+  num_groups = group_sizes.shape[0]
+  end_group = start_group + num_nonzero_groups - 1
+
+  # Calculate the offset of each group, starting at zero. This metadata is
+  # similar to row offsets in a CSR matrix. The following properties hold:
+  #
+  # group_offsets.shape = [num_groups + 1]
+  # group_offsets[0] = 0
+  # group_offsets[num_groups] = m
+  #
+  # The row at which group 'i' starts is group_offsets[i].
+  group_ends = jnp.cumsum(group_sizes)
+  group_offsets = jnp.concatenate([jnp.zeros(1, dtype=jnp.int32), group_ends])
+
+  # Assign a group id to each grid index.
+  #
+  # If a group starts somewhere other than the start of a tile or ends somewhere
+  # other than the end of a tile we need to compute that full tile. Calculate
+  # the number of tiles for each group by rounding their end up to the nearest
+  # 'tm' and their start down to the nearest 'tm'.
+
+  # (1) Round the group_ends up to the nearest multiple of 'tm'.
+  #
+  # NOTE: This does not change group_offsets[num_groups], which is m
+  # (because we enforce m is divisible by tm).
+  rounded_group_ends = ((group_ends + tm - 1) // tm * tm).astype(jnp.int32)
+
+  # (2) Round the group_starts down to the nearest multiple of 'tm'.
+  group_starts = jnp.concatenate(
+      [jnp.zeros(1, dtype=jnp.int32), group_ends[:-1]]
+  )
+  rounded_group_starts = group_starts // tm * tm
+
+  # (3) Calculate the number of rows in each group.
+  #
+  # NOTE: Handle zero-sized groups as a special case. If the start for a
+  # zero-sized group is not divisible by 'tm' its start will be rounded down and
+  # its end will be rounded up such that its size will become 1 tile here.
+  rounded_group_sizes = rounded_group_ends - rounded_group_starts
+  rounded_group_sizes = jnp.where(group_sizes == 0, 0, rounded_group_sizes)
+
+  # (4) Convert the group sizes from units of rows to unit of 'tm' sized tiles.
+  #
+  # An m-dimension tile is 'owned' by group 'i' if the first row of the tile
+  # belongs to group 'i'. In addition to owned tiles, each group can have 0 or 1
+  # initial partial tiles if it's first row does not occur in the first row of a
+  # tile. The '0-th' group never has a partial tile because it always starts at
+  # the 0-th row.
+  #
+  # If no group has a partial tile, the total number of tiles is equal to
+  # 'm // tm'. If every group has a partial except the 0-th group, the total
+  # number of tiles is equal to 'm // tm + num_groups - 1'. Thus we know that
+  #
+  # tiles_m <= group_tiles.sum() <= tiles_m + num_groups - 1
+  #
+  # Where tiles_m = m // tm.
+  #
+  # NOTE: All group sizes are divisible by 'tm' because of the rounding in steps
+  # (1) and (2) so this division is exact.
+  group_tiles = rounded_group_sizes // tm
+
+  if visit_empty_groups:
+    # Insert one tile for empty groups.
+    group_tiles = jnp.where(group_sizes == 0, 1, group_tiles)
+
+  # Create the group ids for each grid index based on the tile counts for each
+  # group.
+  #
+  # NOTE: This repeat(...) will pad group_ids with the final group id if
+  # group_tiles.sum() < tiles_m + num_groups - 1. The kernel grid will be sized
+  # such that we only execute the necessary number of tiles.
+  tiles_m = _calculate_num_tiles(m, tm)
+  group_ids = jnp.repeat(
+      jnp.arange(num_groups, dtype=jnp.int32),
+      group_tiles,
+      total_repeat_length=tiles_m + num_groups - 1,
+  )
+
+  # Assign an m-dimension tile id to each grid index.
+  #
+  # NOTE: Output tiles can only be re-visited consecutively. The following
+  # procedure guarantees that m-dimension tile indices respect this.
+
+  # (1) Calculate how many times each m-dimension tile will be visited.
+  #
+  # Each tile is guaranteed to be visited once by the group that owns the tile.
+  # The remaining possible visits occur when a group starts inside of a tile at
+  # a position other than the first row. We can calculate which m-dimension tile
+  # each group starts in by floor-dividing its offset with `tm` and then count
+  # tile visits with a histogram.
+  #
+  # To avoid double counting tile visits from the group that owns the tile,
+  # filter these out by assigning their tile id to `tile_m` (one beyond the max)
+  # such that they're ignored by the subsequent histogram. Also filter out any
+  # group which is empty.
+  #
+  # TODO(tgale): Invert the 'partial_tile_mask' predicates to be more clear.
+  partial_tile_mask = jnp.logical_or(
+      (group_offsets[:-1] % tm) == 0, group_sizes == 0
+  )
+
+  # Explicitly enable tiles for zero sized groups, if specified. This covers
+  # zero sized groups that start on a tile-aligned row and those that do not.
+  if visit_empty_groups:
+    partial_tile_mask = jnp.where(group_sizes == 0, 0, partial_tile_mask)
+
+  partial_tile_ids = jnp.where(
+      partial_tile_mask, tiles_m, group_offsets[:-1] // tm
+  )
+
+  tile_visits = (
+      jnp.histogram(partial_tile_ids, bins=tiles_m, range=(0, tiles_m - 1))[0]
+      + 1
+  )
+
+  # Create the m-dimension tile ids for each grid index based on the visit
+  # counts for each tile.
+  m_tile_ids = jnp.repeat(
+      jnp.arange(tiles_m, dtype=jnp.int32),
+      tile_visits.astype(jnp.int32),
+      total_repeat_length=tiles_m + num_groups - 1,
+  )
+
+  # Account for sharding.
+  #
+  # Find the start of the groups owned by our shard and shift the group_ids and
+  # m_tile_ids s.t. the metadata for our tiles are at the front of the arrays.
+  #
+  # TODO(tgale): Move this offset into the kernel to avoid these rolls.
+  first_tile_in_shard = (group_ids < start_group).sum()
+  group_ids = jnp.roll(group_ids, shift=-first_tile_in_shard, axis=0)
+  m_tile_ids = jnp.roll(m_tile_ids, shift=-first_tile_in_shard, axis=0)
+
+  # Calculate the number of tiles we need to compute for our shard.
+  #
+  # Remove tile visits that belong to a group not in our shard.
+  iota = jnp.arange(num_groups, dtype=jnp.int32)
+  active_group_mask = jnp.logical_and(iota <= end_group, iota >= start_group)
+  group_tiles = jnp.where(active_group_mask, group_tiles, 0)
+  num_tiles = group_tiles.sum()
+  return (group_offsets, group_ids, m_tile_ids), num_tiles
+
+
+def _get_group_size(
+    *, grid_id: jnp.ndarray, group_metadata: GroupMetadata
+) -> jnp.ndarray:
+  """Calculate the number of rows in the current group."""
+  group_offsets, group_ids = group_metadata[:2]
+  group_id = group_ids[grid_id]
+  group_start = group_offsets[group_id]
+  group_end = group_offsets[group_id + 1]
+  return group_end - group_start
+
+
+def _get_store_mask(
+    *,
+    grid_id: jnp.ndarray,
+    group_metadata: GroupMetadata,
+    tm: int,
+    tn: int,
+) -> jnp.ndarray:
+  """Mask for rows that belong to the current group in the current tile."""
+  group_offsets, group_ids, m_tile_ids = group_metadata[:3]
+  group_id = group_ids[grid_id]
+  group_start = group_offsets[group_id]
+  group_end = group_offsets[group_id + 1]
+  m_id = m_tile_ids[grid_id] * tm
+  iota = jax.lax.broadcasted_iota(jnp.int32, (tm, tn), 0) + m_id
+  return jnp.logical_and(iota >= group_start, iota < group_end)
+
+
+def _zero_uninitialized_memory(
+    out: jnp.ndarray,
+    *,
+    start_group: jnp.ndarray,
+    num_nonzero_groups: int,
+    group_metadata: GroupMetadata,
+) -> jnp.ndarray:
+  """Zero out uninitialized memory from output."""
+  group_offsets = group_metadata[0]
+  group_start = group_offsets[start_group]
+  group_end = group_offsets[start_group + num_nonzero_groups]
+  valid_mask = jax.lax.broadcasted_iota(jnp.int32, (out.shape[0],), 0)
+  valid_mask = (valid_mask >= group_start) & (valid_mask < group_end)
+  return jnp.where(valid_mask[:, None], out, 0)
+
+
+LutFn = Callable[[int, int, int], Optional[tuple[int, int, int]]]
+
+
+@functools.partial(
+    jax.jit,
+    static_argnames=[
+        "preferred_element_type",
+        "tiling",
+        "transpose_rhs",
+        "interpret",
+    ],
+)
+def gmm(
+    lhs: jnp.ndarray,
+    rhs: jnp.ndarray,
+    group_sizes: jnp.ndarray,
+    preferred_element_type: jnp.dtype = jnp.float32,
+    tiling: tuple[int, int, int] | LutFn | None = (128, 128, 128),
+    group_offset: jnp.ndarray | None = None,
+    existing_out: jnp.ndarray | None = None,
+    transpose_rhs: bool = False,
+    interpret: bool = False,
+) -> jnp.ndarray:
+  """Compute lhs[sizes[i-1]:sizes[i], :] @ rhs for each group 'i'.
+
+  Args:
+    lhs: A 2d, jnp.ndarray with shape [m, k].
+    rhs: A 3d, jnp.ndarray with shape [num_groups, k, n].
+    group_sizes: A 1d, jnp.ndarray with shape [num_groups] and jnp.int32 dtype.
+    preferred_element_type: jnp.dtype, the element type for the output matrix.
+    tiling: 3-tuple of ints. The m, k and n-dimension tile sizes.
+    group_offset: The group in group sizes to start computing from. This is
+      particularly useful for when rhs num_groups is sharded.
+    existing_out: Existing output to write to.
+    transpose_rhs: True if the rhs needs to be transposed.
+    interpret: Whether or not to run the kernel in interpret mode, helpful for
+      testing and debugging.
+
+  Returns:
+    A 2d, jnp.ndarray with shape [m, n].
+  """
+
+  if existing_out is not None:
+    assert isinstance(existing_out, jax.Array)
+    expected_dtype = existing_out.dtype
+    if expected_dtype != preferred_element_type:
+      raise ValueError(
+          "Existing output dtype must match preferred_element_type."
+      )
+  if group_offset is None:
+    group_offset = jnp.array([0], dtype=jnp.int32)
+  else:
+    if group_offset.shape:
+      raise ValueError(
+          f"group_offset must be a ()-shaped array. Got: {group_offset.shape}."
+      )
+    group_offset = group_offset[None]
+  num_current_groups = rhs.shape[0]
+  num_total_groups = group_sizes.shape[0]
+  lhs, group_sizes, input_dtype = _validate_args(
+      lhs=lhs, rhs=rhs, group_sizes=group_sizes
+  )
+
+  # Gather shape information.
+  m, k, n = (lhs.shape[0], lhs.shape[1], rhs.shape[2])
+  if transpose_rhs:
+    n = rhs.shape[1]
+
+  # If tiling is callable, look up the problem dimensions in the LUT. If no tuned
+  # tile dimensions are available throw an error.
+  if callable(tiling):
+    tiling = tiling(m, k, n)
+
+  if tiling is None:
+    raise ValueError(f"No tuned tiling found for (m, k, n) = ({m}, {k}, {n})")
+
+  tm, tk, tn = tiling
+  tiles_k, k_rem = _calculate_irregular_num_tiles(k, tk)
+  tiles_n, n_rem = _calculate_irregular_num_tiles(n, tn)
+  del n_rem
+
+  # Create the metadata we need for computation.
+  group_metadata, num_active_tiles = make_group_metadata(  # pylint: disable=unbalanced-tuple-unpacking
+      group_sizes=group_sizes,
+      m=m,
+      tm=tm,
+      start_group=group_offset[0],
+      num_nonzero_groups=rhs.shape[0],
+      visit_empty_groups=False,
+  )
+
+  def kernel(
+      group_metadata,
+      group_offset,
+      lhs,
+      rhs,
+      existing_out,
+      out,
+      acc_scratch,
+  ):
+    group_offsets, group_ids, m_tile_ids = group_metadata
+    del group_offsets, group_ids, group_offset
+
+    grid_id = pl.program_id(1)
+    k_i = pl.program_id(2)
+
+    @pl.when(k_i == 0)
+    def _zero_acc():
+      acc_scratch[...] = jnp.zeros_like(acc_scratch)
+
+      if existing_out is not None:
+        prev_grid_id = jnp.where(grid_id > 0, grid_id - 1, 0)
+        is_first_processed_group = grid_id == 0
+        m_tile_changed = m_tile_ids[grid_id] != m_tile_ids[prev_grid_id]
+        first_time_seeing_out = jnp.logical_or(
+            is_first_processed_group, m_tile_changed
+        )
+
+        @pl.when(first_time_seeing_out)
+        def _init_out():
+          out[...] = existing_out[...]
+
+    def mask_k_rem(x, *, dim):
+      if k_rem == 0:
+        return x
+
+      orig_dtype = x.dtype
+      iota = lax.broadcasted_iota(jnp.int32, x.shape, dim)
+      x = x.astype(jnp.float32)
+      return jnp.where(iota < k_rem, x, 0).astype(orig_dtype)
+
+    def _store_accum():
+      mask = _get_store_mask(
+          grid_id=grid_id,
+          group_metadata=group_metadata,
+          tm=tm,
+          tn=tn,
+      )
+      to_store = acc_scratch[...]
+      out[...] = jax.lax.select(
+          mask[...], to_store, out[...].astype(jnp.float32)
+      ).astype(preferred_element_type)
+
+    def _accum(is_last_k_tile):
+      if is_last_k_tile:
+        mask_k_rem_lhs = partial(mask_k_rem, dim=1)
+        mask_k_rem_rhs = partial(mask_k_rem, dim=int(transpose_rhs))
+      else:
+        mask_k_rem_lhs = lambda x: x
+        mask_k_rem_rhs = lambda x: x
+
+      if transpose_rhs:
+        dot_general_dims = (((1,), (1,)), ((), ()))
+      else:
+        dot_general_dims = (((1,), (0,)), ((), ()))
+
+      loaded_lhs = lhs[...]
+      loaded_rhs = rhs[...]
+      acc_scratch[...] += lax.dot_general(
+          mask_k_rem_lhs(loaded_lhs).astype(input_dtype),
+          mask_k_rem_rhs(loaded_rhs).astype(input_dtype),
+          preferred_element_type=jnp.float32,
+          dimension_numbers=dot_general_dims,
+      )
+
+      if is_last_k_tile:
+        _store_accum()
+
+    lax.cond(
+        k_i == tiles_k - 1,
+        partial(_accum, True),
+        partial(_accum, False),
+    )
+
+  def lhs_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
+    # lhs is (m, k). Load the [tm, tk] matrix for this m-tile.
+    group_offsets, group_ids, m_tile_ids = group_metadata
+    del n_i, group_offsets, group_ids, group_offset
+    return m_tile_ids[grid_id], k_i
+
+  def rhs_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
+    # rhs is (num_groups, k, n). Load the [tk, tn] matrix based on the group id
+    # for this m-tile.
+    group_offsets, group_ids, m_tile_ids = group_metadata
+    del group_offsets, m_tile_ids
+    if transpose_rhs:
+      k_i, n_i = n_i, k_i
+
+    # NOTE: If we're working on only a shard of the rhs we need to adjust the
+    # group index we load from to account for this. The group_ids are in the
+    # "unsharded" domain.
+    return group_ids[grid_id] - group_offset[0], k_i, n_i
+
+  def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
+    # out is (m, n). Load the [tm, tn] matrix for this m-tile.
+    group_offsets, group_ids, m_tile_ids = group_metadata
+    del k_i, group_offsets, group_ids, group_offset
+    return m_tile_ids[grid_id], n_i
+
+  out_block_spec = pl.BlockSpec((tm, tn), out_transform_indices)
+  if existing_out is None:
+    in_out_block_spec: Any = None
+    input_output_aliases = {}
+  else:
+    in_out_block_spec = out_block_spec
+    input_output_aliases = {6: 0}
+
+  lhs_block_spec = pl.BlockSpec((tm, tk), lhs_transform_indices)
+  if transpose_rhs:
+    rhs_block_spec = pl.BlockSpec((None, tn, tk), rhs_transform_indices)
+  else:
+    rhs_block_spec = pl.BlockSpec((None, tk, tn), rhs_transform_indices)
+
+  lhs_bytes = lhs.size * lhs.itemsize
+  rhs_bytes = (k * n) * rhs.itemsize  # We don't read all of rhs
+  out_bytes = (m * n) * jnp.dtype(preferred_element_type).itemsize
+  max_active_tiles = group_metadata[1].size
+  bytes_accessed = (
+      (lhs_bytes * tiles_n) + (rhs_bytes * max_active_tiles) + out_bytes
+  )
+  flops = 2 * m * k * n
+  cost_estimate = pl.CostEstimate(
+      flops=flops, bytes_accessed=bytes_accessed, transcendentals=0
+  )
+  call_gmm = pl.pallas_call(
+      kernel,
+      out_shape=jax.ShapeDtypeStruct((m, n), preferred_element_type),
+      grid_spec=pltpu.PrefetchScalarGridSpec(
+          num_scalar_prefetch=2,
+          in_specs=[
+              lhs_block_spec,
+              rhs_block_spec,
+              in_out_block_spec,
+          ],
+          out_specs=out_block_spec,
+          grid=(tiles_n, num_active_tiles, tiles_k),
+          scratch_shapes=[pltpu.VMEM((tm, tn), jnp.float32)],
+      ),
+      input_output_aliases=input_output_aliases,
+      compiler_params=pltpu.TPUCompilerParams(
+              dimension_semantics=("parallel", "arbitrary", "arbitrary")),
+      interpret=interpret,
+      cost_estimate=cost_estimate,
+  )
+
+  out = call_gmm(
+      group_metadata,
+      group_offset,
+      lhs,
+      rhs,
+      existing_out,
+  )
+  if existing_out is None and num_current_groups < num_total_groups:
+    out = _zero_uninitialized_memory(
+        out,
+        start_group=group_offset[0],
+        num_nonzero_groups=rhs.shape[0],
+        group_metadata=group_metadata,
+    )
+  return out
+
+
+@functools.partial(
+    jax.jit,
+    static_argnames=[
+        "preferred_element_type",
+        "tiling",
+        "num_actual_groups",
+        "interpret",
+    ],
+)
+def tgmm(
+    lhs: jnp.ndarray,
+    rhs: jnp.ndarray,
+    group_sizes: jnp.ndarray,
+    preferred_element_type: jnp.dtype = jnp.float32,
+    tiling: tuple[int, int, int] | LutFn | None = (128, 128, 128),
+    group_offset: jnp.ndarray | None = None,
+    num_actual_groups: int | None = None,
+    existing_out: jnp.ndarray | None = None,
+    interpret: bool = False,
+) -> jnp.ndarray:
+  """Compute lhs[:, sizes[i-1]:sizes[i]] @ rhs[sizes[i-1]:sizes[i], :].
+
+  Args:
+    lhs: A 2d, jnp.ndarray with shape [k, m].
+    rhs: A 2d, jnp.ndarray with shape [m, n].
+    group_sizes: A 1d, jnp.ndarray with shape [num_groups] and jnp.int32 dtype.
+    preferred_element_type: jnp.dtype, the element type for the output matrix.
+    tiling: 3-tuple of ints. The m, k and n-dimension tile sizes.
+    group_offset: The group in group sizes to start computing from. This is
+      particularly useful for when rhs num_groups is sharded.
+    num_actual_groups: For when num_groups is sharded and we should only compute
+      the groups that are local, starting from group_offset.
+    existing_out: Existing output to write to.
+    interpret: Whether or not to run the kernel in interpret mode, helpful for
+      testing and debugging.
+
+  Returns:
+    A  3d, jnp.ndarray with shape [num_groups, k, n].
+  """
+  if group_offset is None:
+    group_offset = jnp.array([0], dtype=jnp.int32)
+  else:
+    group_offset = group_offset[None]
+  lhs, group_sizes, input_dtype = _validate_args(
+      lhs=lhs, rhs=rhs, group_sizes=group_sizes, expected_rhs_dims=2
+  )
+
+  # Gather shape information.
+  k, m, n = (lhs.shape[0], lhs.shape[1], rhs.shape[1])
+  num_groups = group_sizes.shape[0]
+  num_actual_groups = (
+      num_actual_groups if num_actual_groups is not None else num_groups
+  )
+
+  # If tiling is callable, look up the problem dimensions in the LUT. If no tuned
+  # tile dimensions are available throw an error.
+  if callable(tiling):
+    tiling = tiling(m, k, n)
+
+  if tiling is None:
+    raise ValueError(f"No tuned tiling found for (m, k, n) = ({m}, {k}, {n})")
+
+  tm, tk, tn = tiling
+  tiles_k, k_rem = _calculate_irregular_num_tiles(k, tk)
+  del k_rem
+  tiles_n, n_rem = _calculate_irregular_num_tiles(n, tn)
+  del n_rem
+
+  # Create the metadata we need for computation.
+  group_metadata, num_active_tiles = make_group_metadata(
+      group_sizes=group_sizes,
+      m=m,
+      tm=tm,
+      start_group=group_offset[0],
+      num_nonzero_groups=num_actual_groups,
+      visit_empty_groups=True,
+  )
+
+  def kernel(
+      group_metadata,
+      group_offset,
+      lhs,
+      rhs,
+      existing_out,
+      out,
+      acc_scratch,
+  ):
+    grid_id = pl.program_id(2)
+    group_offsets, group_ids, m_tile_ids = group_metadata
+    del group_offsets, group_offset, m_tile_ids
+
+    group = group_ids[grid_id]
+    prev_grid_id = jnp.where(grid_id > 0, grid_id - 1, 0)
+    prev_group = group_ids[prev_grid_id]
+
+    group_has_changed = jnp.logical_or(grid_id == 0, prev_group != group)
+
+    @pl.when(group_has_changed)
+    def _zero_acc():
+      acc_scratch[...] = jnp.zeros_like(acc_scratch)
+
+    # We'll only do computation if our group has a nonzero number of rows in it.
+    dont_skip = (
+        _get_group_size(grid_id=grid_id, group_metadata=group_metadata) > 0
+    )
+
+    @pl.when(dont_skip)
+    def _do():
+      rhs_mask = _get_store_mask(
+          grid_id=grid_id,
+          group_metadata=group_metadata,
+          tm=tm,
+          tn=tn,
+      )
+      lhs_mask = _get_store_mask(
+          grid_id=grid_id,
+          group_metadata=group_metadata,
+          tm=tm,
+          tn=tk,
+      )
+
+      loaded_lhs = lhs[...]
+      loaded_rhs = rhs[...]
+      loaded_lhs = lax.select(
+          lhs_mask[...],
+          loaded_lhs.astype(jnp.float32),
+          jnp.zeros_like(lhs, jnp.float32),
+      ).swapaxes(0, 1)
+      loaded_rhs = lax.select(
+          rhs_mask[...],
+          loaded_rhs.astype(jnp.float32),
+          jnp.zeros_like(rhs, jnp.float32),
+      )
+
+      acc_scratch[...] += lax.dot(
+          loaded_lhs.astype(input_dtype),
+          loaded_rhs.astype(input_dtype),
+          preferred_element_type=jnp.float32,
+      )
+
+    is_end_of_grid = grid_id == (pl.num_programs(2) - 1)
+    next_grid_id = jnp.where(is_end_of_grid, grid_id, grid_id + 1)
+    next_group = group_ids[next_grid_id]
+
+    group_is_changing = jnp.logical_or(is_end_of_grid, group != next_group)
+
+    @pl.when(group_is_changing)
+    def _store_accum():
+      to_store = acc_scratch[...]
+      if existing_out is not None:
+        to_store += existing_out[...].astype(jnp.float32)
+      out[...] = to_store.astype(preferred_element_type)
+
+  def lhs_transform_indices(n_i, k_i, grid_id, group_metadata, group_offset):
+    # lhs is (m, k). Load the [tm, tk] matrix for this m-tile.
+    group_offsets, group_ids, m_tile_ids = group_metadata
+    del n_i, group_offsets, group_ids, group_offset
+    return m_tile_ids[grid_id], k_i
+
+  def rhs_transform_indices(n_i, k_i, grid_id, group_metadata, group_offset):
+    # rhs is (m, n). Load the [tm, tn] matrix for this m-tile.
+    group_offsets, group_ids, m_tile_ids = group_metadata
+    del k_i, group_offsets, group_ids, group_offset
+    return m_tile_ids[grid_id], n_i
+
+  def out_transform_indices(n_i, k_i, grid_id, group_metadata, group_offset):
+    # out is (num_groups, k, n). Load the [tk, tn] matrix based on the group id
+    # for this m-tile.
+    group_offsets, group_ids, m_tile_ids = group_metadata
+    del group_offsets, m_tile_ids
+
+    # NOTE: If we're working on only a shard of the output we need to adjust the
+    # group index we load from to account for this. The group_ids are in the
+    # "unsharded" domain.
+    return group_ids[grid_id] - group_offset[0], k_i, n_i
+
+  out_block_spec = pl.BlockSpec((None, tk, tn), out_transform_indices)
+  if existing_out is None:
+    in_out_block_spec: Any = None
+    input_output_aliases = {}
+  else:
+    in_out_block_spec = out_block_spec
+    input_output_aliases = {6: 0}
+
+  lhs_block_spec = pl.BlockSpec((tm, tk), lhs_transform_indices)
+  rhs_block_spec = pl.BlockSpec((tm, tn), rhs_transform_indices)
+
+  lhs_bytes = lhs.size * lhs.itemsize
+  rhs_bytes = rhs.size * rhs.itemsize
+  out_bytewidth = jnp.dtype(preferred_element_type).itemsize
+  out_bytes = (num_actual_groups * k * n) * out_bytewidth
+  bytes_accessed = (
+      (lhs_bytes * tiles_n) + (rhs_bytes * tiles_k) + out_bytes
+  )
+  flops = 2 * m * k * n
+  cost_estimate = pl.CostEstimate(
+      flops=flops, bytes_accessed=bytes_accessed, transcendentals=0
+  )
+  lhs = lhs.swapaxes(0, 1)
+  call_gmm = pl.pallas_call(
+      kernel,
+      out_shape=jax.ShapeDtypeStruct(
+          (num_actual_groups, k, n), preferred_element_type
+      ),
+      grid_spec=pltpu.PrefetchScalarGridSpec(
+          num_scalar_prefetch=2,
+          in_specs=[
+              lhs_block_spec,
+              rhs_block_spec,
+              in_out_block_spec,
+          ],
+          out_specs=out_block_spec,
+          grid=(tiles_n, tiles_k, num_active_tiles),
+          scratch_shapes=[pltpu.VMEM((tk, tn), jnp.float32)],
+      ),
+      input_output_aliases=input_output_aliases,
+      compiler_params=pltpu.TPUCompilerParams(
+              dimension_semantics=("parallel", "arbitrary", "arbitrary")),
+      interpret=interpret,
+      cost_estimate=cost_estimate,
+  )
+
+  out = call_gmm(
+      group_metadata,
+      group_offset,
+      lhs,
+      rhs,
+      existing_out,
+  )
+  return out
diff --git a/MaxText/kernels/megablox/ops.py b/MaxText/kernels/megablox/ops.py
new file mode 100644
index 000000000..583dd2f92
--- /dev/null
+++ b/MaxText/kernels/megablox/ops.py
@@ -0,0 +1,109 @@
+# Copyright 2024 The JAX Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Grouped matrix multiplication operations with custom VJPs."""
+
+import jax
+from jax.experimental.pallas.ops.tpu.megablox import gmm as backend
+import jax.numpy as jnp
+
+
+gmm = jax.custom_vjp(
+    backend.gmm,
+    nondiff_argnums=(3, 4, 7, 8),
+)
+
+
+def _gmm_fwd(
+    lhs: jnp.ndarray,
+    rhs: jnp.ndarray,
+    group_sizes: jnp.ndarray,
+    preferred_element_type: jnp.dtype = jnp.float32,
+    tiling: tuple[int, int, int] = (128, 128, 128),
+    group_offset: jnp.ndarray | None = None,
+    existing_out: jnp.ndarray | None = None,
+    transpose_rhs: bool = False,
+    interpret: bool = False,
+) -> tuple[
+    jnp.ndarray,
+    tuple[
+        jnp.ndarray,
+        jnp.ndarray,
+        jnp.ndarray,
+        jnp.ndarray | None,
+        int,
+    ],
+]:
+  """Forward function for GMM VJP."""
+  out = backend.gmm(
+      lhs,
+      rhs,
+      group_sizes,
+      preferred_element_type,
+      tiling,
+      group_offset,
+      existing_out,
+      transpose_rhs=transpose_rhs,
+      interpret=interpret,
+  )
+  return out, (lhs, rhs, group_sizes, group_offset, rhs.shape[0])
+
+
+def _gmm_bwd(
+    preferred_element_type: jnp.dtype,
+    tiling: tuple[int, int, int],
+    transpose_rhs: bool,
+    interpret: bool,
+    residual: tuple[
+        jnp.ndarray,
+        jnp.ndarray,
+        jnp.ndarray,
+        jnp.ndarray | None,
+        int,
+    ],
+    grad: jnp.ndarray,
+) -> tuple[jnp.ndarray, jnp.ndarray, None, None, jnp.ndarray]:
+  """Backward function for throughput GMM VJP."""
+  del preferred_element_type
+  lhs, rhs, group_sizes, group_offset, num_actual_groups = residual
+  grad_lhs = backend.gmm(
+      grad,
+      rhs,
+      group_sizes,
+      lhs[0].dtype,
+      tiling,
+      group_offset,
+      transpose_rhs=not transpose_rhs,
+      interpret=interpret,
+  )
+  grad_rhs = backend.tgmm(
+      lhs.swapaxes(0, 1),
+      grad,
+      group_sizes,
+      rhs.dtype,
+      tiling,
+      group_offset,
+      num_actual_groups,
+      interpret=interpret,
+  )
+
+  # NOTE: If the rhs transposition is fused into the forward pass we need to
+  # return the transpose of the rhs gradient that we calculated above.
+  #
+  # TODO(tgale, enriqueps, apaske): Fuse this transposition into the tgmm.
+  grad_rhs = grad_rhs.swapaxes(1, 2) if transpose_rhs else grad_rhs
+  return grad_lhs, grad_rhs, None, None, grad
+
+
+gmm.defvjp(_gmm_fwd, _gmm_bwd)
\ No newline at end of file

From 3d25eb3db9e1b5795cb3e8b9f3de81bce3f53986 Mon Sep 17 00:00:00 2001
From: ZhiyuLi-goog <lizhiyu@google.com>
Date: Sat, 9 Nov 2024 01:57:27 +0000
Subject: [PATCH 02/40] fix import error in pallas megablox kernel

---
 MaxText/kernels/megablox/__init__.py | 1 +
 MaxText/kernels/megablox/gmm.py      | 2 +-
 MaxText/kernels/megablox/ops.py      | 2 +-
 MaxText/layers/linears.py            | 4 +++-
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/MaxText/kernels/megablox/__init__.py b/MaxText/kernels/megablox/__init__.py
index e69de29bb..7afffeef8 100644
--- a/MaxText/kernels/megablox/__init__.py
+++ b/MaxText/kernels/megablox/__init__.py
@@ -0,0 +1 @@
+from kernels.megablox.ops import gmm
\ No newline at end of file
diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
index 5c2f93859..5babebf83 100644
--- a/MaxText/kernels/megablox/gmm.py
+++ b/MaxText/kernels/megablox/gmm.py
@@ -22,7 +22,7 @@
 from jax import lax
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
-from jax.experimental.pallas.ops.tpu.megablox import common
+from kernels.megablox import common
 import jax.numpy as jnp
 
 
diff --git a/MaxText/kernels/megablox/ops.py b/MaxText/kernels/megablox/ops.py
index 583dd2f92..3b5a6873d 100644
--- a/MaxText/kernels/megablox/ops.py
+++ b/MaxText/kernels/megablox/ops.py
@@ -15,7 +15,7 @@
 """Grouped matrix multiplication operations with custom VJPs."""
 
 import jax
-from jax.experimental.pallas.ops.tpu.megablox import gmm as backend
+from kernels.megablox import gmm as backend
 import jax.numpy as jnp
 
 
diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 3d61f25d5..64e49fa42 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -32,7 +32,9 @@
 import max_logging
 
 try:
-  from jax.experimental.pallas.ops.tpu import megablox as mblx
+  # from jax.experimental.pallas.ops.tpu import megablox as mblx
+  from kernels import megablox as mblx
+
 except ImportError:
   max_logging.log("JAX megablox is available for TPU only.")
   pass

From 4a3d9ba2d0673f077e0ecab10cd5f33f9930cc98 Mon Sep 17 00:00:00 2001
From: ZhiyuLi-goog <lizhiyu@google.com>
Date: Sat, 9 Nov 2024 02:15:32 +0000
Subject: [PATCH 03/40] Quantize GMM kernel

---
 MaxText/kernels/megablox/gmm.py | 85 +++++++++++++++++++++++++++------
 MaxText/kernels/megablox/ops.py |  7 ++-
 MaxText/layers/linears.py       |  3 +-
 3 files changed, 78 insertions(+), 17 deletions(-)

diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
index 5babebf83..631a75cb1 100644
--- a/MaxText/kernels/megablox/gmm.py
+++ b/MaxText/kernels/megablox/gmm.py
@@ -15,6 +15,7 @@
 """Grouped matrix multiplication kernels for TPU written in Pallas."""
 
 from collections.abc import Callable
+import dataclasses
 import functools
 from typing import Any, Optional
 
@@ -25,7 +26,10 @@
 from kernels.megablox import common
 import jax.numpy as jnp
 
+from aqt.jax.v2 import pallas as aqt_pl
+from aqt.jax.v2 import aqt_tensor
 
+QTensor = aqt_tensor.QTensor
 partial = functools.partial
 
 
@@ -309,6 +313,7 @@ def _zero_uninitialized_memory(
         "tiling",
         "transpose_rhs",
         "interpret",
+        "quant"
     ],
 )
 def gmm(
@@ -321,6 +326,7 @@ def gmm(
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
+    quant: bool = False,
 ) -> jnp.ndarray:
   """Compute lhs[sizes[i-1]:sizes[i], :] @ rhs for each group 'i'.
 
@@ -336,6 +342,7 @@ def gmm(
     transpose_rhs: True if the rhs needs to be transposed.
     interpret: Whether or not to run the kernel in interpret mode, helpful for
       testing and debugging.
+    quant: Whether to quantize lhs and rhs.
 
   Returns:
     A 2d, jnp.ndarray with shape [m, n].
@@ -390,11 +397,18 @@ def gmm(
       visit_empty_groups=False,
   )
 
+  # We need to know contracting axis when we quantized lhs and rhs
+  # Thus move this code part outside of kernel.
+  if transpose_rhs:
+    dot_general_dims = (((1,), (1,)), ((), ()))
+  else:
+    dot_general_dims = (((1,), (0,)), ((), ()))
+
   def kernel(
       group_metadata,
       group_offset,
-      lhs,
-      rhs,
+      lhs: jax.Array | QTensor,
+      rhs: jax.Array | QTensor,
       existing_out,
       out,
       acc_scratch,
@@ -427,7 +441,10 @@ def mask_k_rem(x, *, dim):
 
       orig_dtype = x.dtype
       iota = lax.broadcasted_iota(jnp.int32, x.shape, dim)
-      x = x.astype(jnp.float32)
+      if not quant:
+        x = x.astype(jnp.float32)
+      else:
+        x = x.astype(jnp.int32)
       return jnp.where(iota < k_rem, x, 0).astype(orig_dtype)
 
     def _store_accum():
@@ -450,19 +467,39 @@ def _accum(is_last_k_tile):
         mask_k_rem_lhs = lambda x: x
         mask_k_rem_rhs = lambda x: x
 
-      if transpose_rhs:
-        dot_general_dims = (((1,), (1,)), ((), ()))
+      if isinstance(lhs, QTensor):
+        # loaded_lhs = aqt_pl.load_qtensor(lhs)
+        # Let qx: QTensor, qx = quant(x, 8 , ...)
+        # qx.dequant() == qx.qvalue * qx.scale ~= x
+        # Thus, setting qvalue to zero is equivalent to setting original tensor
+        # to zero.
+        qvalue = mask_k_rem_lhs(lhs.qvalue[...])
+        loaded_lhs = dataclasses.replace(lhs, qvalue=qvalue)
+        loaded_lhs = aqt_pl.load_qtensor(loaded_lhs)
       else:
-        dot_general_dims = (((1,), (0,)), ((), ()))
+        loaded_lhs = mask_k_rem_lhs(lhs[...]).astype(input_dtype)
 
-      loaded_lhs = lhs[...]
-      loaded_rhs = rhs[...]
-      acc_scratch[...] += lax.dot_general(
-          mask_k_rem_lhs(loaded_lhs).astype(input_dtype),
-          mask_k_rem_rhs(loaded_rhs).astype(input_dtype),
-          preferred_element_type=jnp.float32,
-          dimension_numbers=dot_general_dims,
-      )
+      if isinstance(rhs, QTensor):
+        qvalue = mask_k_rem_rhs(rhs.qvalue[...])
+        loaded_rhs = dataclasses.replace(rhs, qvalue=qvalue)
+        loaded_rhs = aqt_pl.load_qtensor(loaded_rhs)
+      else:
+        loaded_rhs = mask_k_rem_rhs(rhs[...]).astype(input_dtype)
+
+      if quant:
+        acc_scratch[...] += aqt_pl.dot_general(
+            loaded_lhs,
+            loaded_rhs,
+            preferred_element_type=jnp.float32,
+            dimension_numbers=dot_general_dims,
+        )
+      else:
+        acc_scratch[...] += lax.dot_general(
+            loaded_lhs,
+            loaded_rhs,
+            preferred_element_type=jnp.float32,
+            dimension_numbers=dot_general_dims,
+        )
 
       if is_last_k_tile:
         _store_accum()
@@ -505,6 +542,8 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
   else:
     in_out_block_spec = out_block_spec
     input_output_aliases = {6: 0}
+    if quant:
+      input_output_aliases = {8: 0}
 
   lhs_block_spec = pl.BlockSpec((tm, tk), lhs_transform_indices)
   if transpose_rhs:
@@ -523,7 +562,11 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
   cost_estimate = pl.CostEstimate(
       flops=flops, bytes_accessed=bytes_accessed, transcendentals=0
   )
-  call_gmm = pl.pallas_call(
+  if quant:
+    pallas_call_fn = aqt_pl.pallas_call
+  else:
+    pallas_call_fn = pl.pallas_call
+  call_gmm = pallas_call_fn(
       kernel,
       out_shape=jax.ShapeDtypeStruct((m, n), preferred_element_type),
       grid_spec=pltpu.PrefetchScalarGridSpec(
@@ -544,6 +587,16 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
       cost_estimate=cost_estimate,
   )
 
+  if quant:
+    lhs_contracting_axis, rhs_contracting_axis = dot_general_dims[0]
+    # Since block_spec.block_shape of rhs is None, the first axis is reduced
+    # inside kernel, e.g., if block_shape is (None, tn, tk) then a tensor of 
+    # shape (tn, tk) will be feteched inside kernel instead of (1, tn, tk).
+    # Therefore, we need to add one to rhs_contracting_axis.
+    rhs_contracting_axis = map(lambda x: x + 1, rhs_contracting_axis)
+    lhs = aqt_pl.quant(lhs, 8, lhs_contracting_axis)
+    rhs = aqt_pl.quant(rhs, 8, list(rhs_contracting_axis))
+
   out = call_gmm(
       group_metadata,
       group_offset,
@@ -568,6 +621,7 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
         "tiling",
         "num_actual_groups",
         "interpret",
+        "quant",
     ],
 )
 def tgmm(
@@ -580,6 +634,7 @@ def tgmm(
     num_actual_groups: int | None = None,
     existing_out: jnp.ndarray | None = None,
     interpret: bool = False,
+    quant: bool = False,
 ) -> jnp.ndarray:
   """Compute lhs[:, sizes[i-1]:sizes[i]] @ rhs[sizes[i-1]:sizes[i], :].
 
diff --git a/MaxText/kernels/megablox/ops.py b/MaxText/kernels/megablox/ops.py
index 3b5a6873d..265566047 100644
--- a/MaxText/kernels/megablox/ops.py
+++ b/MaxText/kernels/megablox/ops.py
@@ -21,7 +21,7 @@
 
 gmm = jax.custom_vjp(
     backend.gmm,
-    nondiff_argnums=(3, 4, 7, 8),
+    nondiff_argnums=(3, 4, 7, 8, 9),
 )
 
 
@@ -35,6 +35,7 @@ def _gmm_fwd(
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
+    quant: bool = False,
 ) -> tuple[
     jnp.ndarray,
     tuple[
@@ -56,6 +57,7 @@ def _gmm_fwd(
       existing_out,
       transpose_rhs=transpose_rhs,
       interpret=interpret,
+      quant=quant,
   )
   return out, (lhs, rhs, group_sizes, group_offset, rhs.shape[0])
 
@@ -65,6 +67,7 @@ def _gmm_bwd(
     tiling: tuple[int, int, int],
     transpose_rhs: bool,
     interpret: bool,
+    quant: bool,
     residual: tuple[
         jnp.ndarray,
         jnp.ndarray,
@@ -86,6 +89,7 @@ def _gmm_bwd(
       group_offset,
       transpose_rhs=not transpose_rhs,
       interpret=interpret,
+      quant=quant,
   )
   grad_rhs = backend.tgmm(
       lhs.swapaxes(0, 1),
@@ -96,6 +100,7 @@ def _gmm_bwd(
       group_offset,
       num_actual_groups,
       interpret=interpret,
+      quant=quant,
   )
 
   # NOTE: If the rhs transposition is fused into the forward pass we need to
diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 64e49fa42..3c49486bd 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -407,7 +407,8 @@ def gmm(inputs, kernel, group_sizes):
       inputs = inputs.astype(self.dtype)
       kernel = kernel.astype(self.dtype)
       output = mblx.gmm(
-          lhs=inputs, rhs=kernel, group_sizes=group_sizes, preferred_element_type=jnp.bfloat16, tiling=tile_size
+          lhs=inputs, rhs=kernel, group_sizes=group_sizes, preferred_element_type=jnp.bfloat16, tiling=tile_size,
+          quant=True if self.quant else False,
       )
 
       if hs_shape[0] % pad_length:

From 2ebe80e5ae517b3dd34606806b93ead74eea55dd Mon Sep 17 00:00:00 2001
From: ZhiyuLi-goog <lizhiyu@google.com>
Date: Tue, 12 Nov 2024 22:17:05 +0000
Subject: [PATCH 04/40] [MoE] fix typo

---
 MaxText/layers/linears.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 3c49486bd..0b58ba390 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -436,7 +436,7 @@ def wrapper(x, logits, w0, w1, wo):
       layer_w0 = gmm(x, w0, group_sizes)
       layer_w0 = checkpoint_name(layer_w0, "mlpwi_0")
       layer_w1 = gmm(x, w1, group_sizes)
-      layer_w1 = checkpoint_name(layer_w0, "mlpwi_1")
+      layer_w1 = checkpoint_name(layer_w1, "mlpwi_1")
       layer_act = _convert_to_activation_function(self.config.mlp_activations[0])(layer_w0)
       intermediate_layer = jnp.multiply(layer_act, layer_w1)
       intermediate_output = gmm(intermediate_layer, wo, group_sizes)

From 0e59567dac22c2d1a8b45aac6d88575c8f81e129 Mon Sep 17 00:00:00 2001
From: ZhiyuLi-goog <lizhiyu@google.com>
Date: Tue, 12 Nov 2024 22:17:32 +0000
Subject: [PATCH 05/40] walk around quantize_params

---
 MaxText/maxengine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/MaxText/maxengine.py b/MaxText/maxengine.py
index 1a604b044..02a30d21e 100644
--- a/MaxText/maxengine.py
+++ b/MaxText/maxengine.py
@@ -131,7 +131,8 @@ def load_params(self, *args, rng: Optional[jax.random.PRNGKey] = None, **kwargs)
     )
 
     if self.model.quant and not self.config.checkpoint_is_quantized:
-      params = self.quantize_params(state, rng3)
+      # params = self.quantize_params(state, rng3)
+      params = state.params
     else:
       params = state.params
     max_utils.print_mem_stats("After load_params")

From bd208edad68b66c5814d569f6e6f34af18fe795a Mon Sep 17 00:00:00 2001
From: ZhiyuLi-goog <lizhiyu@google.com>
Date: Tue, 12 Nov 2024 22:22:20 +0000
Subject: [PATCH 06/40] inferencing shape hack

---
 MaxText/layers/linears.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 0b58ba390..23f4e64fa 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -391,7 +391,14 @@ def unpermute(self, intermediate, sorted_selected_experts, weights):
           reshaped_weights.astype(jnp.float32),
           precision=matmul_precision,
       )
-    return output.reshape(-1, self.config.max_target_length, self.config.emb_dim // tensor_parallelism).astype(self.dtype)
+    updated_batch = int(self.config.per_device_batch_size * jax.device_count() // self.config.ici_fsdp_parallelism)
+    # inferencing hack
+    # prefill has BS =1 sequence length = max_prefill_length 
+    # decode has BS = B, sequence_length= 1
+    if output.shape[0] % updated_batch != 0:
+      updated_batch = 1
+
+    return output.reshape(updated_batch, -1, self.config.emb_dim // tensor_parallelism).astype(self.dtype)
 
   def megablox(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
     tile_size = (512, 1024, 1024)

From 57e238303133d0a2009d0a63524936dc4ceca92f Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Sat, 23 Nov 2024 05:24:10 +0000
Subject: [PATCH 07/40] Enable (1) checkpoint conversion for MoEBlock when
 megablox=True (2) load quantized weights from AqtEinsum to feed to gmm kernel
 later.

---
 MaxText/layers/linears.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 23f4e64fa..dadeaff46 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -18,6 +18,7 @@
 import operator
 from typing import Any, Callable, Iterable, Sequence, Tuple, Union, Optional
 
+import flax
 import flax.linen as nn
 import jax
 from jax import lax
@@ -30,6 +31,7 @@
 from jax.ad_checkpoint import checkpoint_name
 from jax.experimental import shard_map
 import max_logging
+import max_utils
 
 try:
   # from jax.experimental.pallas.ops.tpu import megablox as mblx
@@ -658,6 +660,21 @@ def __call__(self, inputs):
 
     if cfg.megablox:
       max_logging.log("Running MoE megablox implementation.")
+      # This is called only during tracing. This is to invoke creation of quantized tensor.
+      # After jit, this will become no-op and will not affect performance.
+      _ = self.dense_matmul(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
+
+      if quantizations.in_serve_mode(self.quant):
+        w0_kernel = self.variables['aqt']['AqtEinsum_0']['AqtDotGeneral_0']['qrhs']['frozen']
+        w1_kernel = self.variables['aqt']['AqtEinsum_1']['AqtDotGeneral_0']['qrhs']['frozen']
+        wo_kernel = self.variables['aqt']['AqtEinsum_2']['AqtDotGeneral_0']['qrhs']['frozen']
+
+        # Currently, megablox kernel does not accept QTensor as inputs. 
+        # Dequantizes before feeding it to megablox, as none of tesnsors are not quantized
+        # there will be no acceleration during serving. This is just a temporary solution.
+        w0_kernel = max_utils.unbox_logicallypartioned(w0_kernel).dequant()
+        w1_kernel = max_utils.unbox_logicallypartioned(w1_kernel).dequant()
+        wo_kernel = max_utils.unbox_logicallypartioned(wo_kernel).dequant()
       return self.megablox(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
     else:
       max_logging.log("Running MoE matmul implementation.")

From 9117c4093e398dfae45eae85122a7bcd55465a3c Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Sat, 23 Nov 2024 05:45:40 +0000
Subject: [PATCH 08/40] Fix lint error

---
 MaxText/kernels/megablox/__init__.py |  2 +-
 MaxText/kernels/megablox/common.py   |  6 +-
 MaxText/kernels/megablox/gmm.py      | 98 +++++++---------------------
 MaxText/kernels/megablox/ops.py      |  2 +-
 MaxText/layers/linears.py            | 16 +++--
 5 files changed, 37 insertions(+), 87 deletions(-)

diff --git a/MaxText/kernels/megablox/__init__.py b/MaxText/kernels/megablox/__init__.py
index 7afffeef8..431f56c9b 100644
--- a/MaxText/kernels/megablox/__init__.py
+++ b/MaxText/kernels/megablox/__init__.py
@@ -1 +1 @@
-from kernels.megablox.ops import gmm
\ No newline at end of file
+from kernels.megablox.ops import gmm
diff --git a/MaxText/kernels/megablox/common.py b/MaxText/kernels/megablox/common.py
index bd843cf46..f0854f85c 100644
--- a/MaxText/kernels/megablox/common.py
+++ b/MaxText/kernels/megablox/common.py
@@ -53,11 +53,7 @@ def select_input_dtype(lhs: jnp.ndarray, rhs: jnp.ndarray) -> jnp.dtype:
   """A type to which both input should be adapted to before dot product."""
   # bf16xbf16 matmul is only supported since TPUv4 generation. In case of mixed
   # input precision, we need to convert bf16 argument to fp32 beforehand.
-  if (
-      supports_bfloat16_matmul()
-      and lhs.dtype == jnp.bfloat16
-      and rhs.dtype == jnp.bfloat16
-  ):
+  if supports_bfloat16_matmul() and lhs.dtype == jnp.bfloat16 and rhs.dtype == jnp.bfloat16:
     return jnp.bfloat16
   else:
     return jnp.float32
diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
index 631a75cb1..0f7cdad5e 100644
--- a/MaxText/kernels/megablox/gmm.py
+++ b/MaxText/kernels/megablox/gmm.py
@@ -48,17 +48,12 @@ def _validate_args(
 
   # Validate 'rhs'.
   if rhs.ndim != expected_rhs_dims:
-    raise ValueError(
-        f"Expected {expected_rhs_dims}-tensor for 'rhs' but got"
-        f" {rhs.ndim}-tensor."
-    )
+    raise ValueError(f"Expected {expected_rhs_dims}-tensor for 'rhs' but got" f" {rhs.ndim}-tensor.")
   common.assert_is_supported_dtype(rhs.dtype)
 
   # Validate 'group_sizes'.
   if group_sizes.dtype != jnp.int32:
-    raise ValueError(
-        f"Expected 32-bit integer 'group_sizes' but got {group_sizes.dtype}."
-    )
+    raise ValueError(f"Expected 32-bit integer 'group_sizes' but got {group_sizes.dtype}.")
 
   return lhs, group_sizes, common.select_input_dtype(lhs, rhs)
 
@@ -144,9 +139,7 @@ def make_group_metadata(
   rounded_group_ends = ((group_ends + tm - 1) // tm * tm).astype(jnp.int32)
 
   # (2) Round the group_starts down to the nearest multiple of 'tm'.
-  group_starts = jnp.concatenate(
-      [jnp.zeros(1, dtype=jnp.int32), group_ends[:-1]]
-  )
+  group_starts = jnp.concatenate([jnp.zeros(1, dtype=jnp.int32), group_ends[:-1]])
   rounded_group_starts = group_starts // tm * tm
 
   # (3) Calculate the number of rows in each group.
@@ -213,23 +206,16 @@ def make_group_metadata(
   # group which is empty.
   #
   # TODO(tgale): Invert the 'partial_tile_mask' predicates to be more clear.
-  partial_tile_mask = jnp.logical_or(
-      (group_offsets[:-1] % tm) == 0, group_sizes == 0
-  )
+  partial_tile_mask = jnp.logical_or((group_offsets[:-1] % tm) == 0, group_sizes == 0)
 
   # Explicitly enable tiles for zero sized groups, if specified. This covers
   # zero sized groups that start on a tile-aligned row and those that do not.
   if visit_empty_groups:
     partial_tile_mask = jnp.where(group_sizes == 0, 0, partial_tile_mask)
 
-  partial_tile_ids = jnp.where(
-      partial_tile_mask, tiles_m, group_offsets[:-1] // tm
-  )
+  partial_tile_ids = jnp.where(partial_tile_mask, tiles_m, group_offsets[:-1] // tm)
 
-  tile_visits = (
-      jnp.histogram(partial_tile_ids, bins=tiles_m, range=(0, tiles_m - 1))[0]
-      + 1
-  )
+  tile_visits = jnp.histogram(partial_tile_ids, bins=tiles_m, range=(0, tiles_m - 1))[0] + 1
 
   # Create the m-dimension tile ids for each grid index based on the visit
   # counts for each tile.
@@ -259,9 +245,7 @@ def make_group_metadata(
   return (group_offsets, group_ids, m_tile_ids), num_tiles
 
 
-def _get_group_size(
-    *, grid_id: jnp.ndarray, group_metadata: GroupMetadata
-) -> jnp.ndarray:
+def _get_group_size(*, grid_id: jnp.ndarray, group_metadata: GroupMetadata) -> jnp.ndarray:
   """Calculate the number of rows in the current group."""
   group_offsets, group_ids = group_metadata[:2]
   group_id = group_ids[grid_id]
@@ -308,13 +292,7 @@ def _zero_uninitialized_memory(
 
 @functools.partial(
     jax.jit,
-    static_argnames=[
-        "preferred_element_type",
-        "tiling",
-        "transpose_rhs",
-        "interpret",
-        "quant"
-    ],
+    static_argnames=["preferred_element_type", "tiling", "transpose_rhs", "interpret", "quant"],
 )
 def gmm(
     lhs: jnp.ndarray,
@@ -352,22 +330,16 @@ def gmm(
     assert isinstance(existing_out, jax.Array)
     expected_dtype = existing_out.dtype
     if expected_dtype != preferred_element_type:
-      raise ValueError(
-          "Existing output dtype must match preferred_element_type."
-      )
+      raise ValueError("Existing output dtype must match preferred_element_type.")
   if group_offset is None:
     group_offset = jnp.array([0], dtype=jnp.int32)
   else:
     if group_offset.shape:
-      raise ValueError(
-          f"group_offset must be a ()-shaped array. Got: {group_offset.shape}."
-      )
+      raise ValueError(f"group_offset must be a ()-shaped array. Got: {group_offset.shape}.")
     group_offset = group_offset[None]
   num_current_groups = rhs.shape[0]
   num_total_groups = group_sizes.shape[0]
-  lhs, group_sizes, input_dtype = _validate_args(
-      lhs=lhs, rhs=rhs, group_sizes=group_sizes
-  )
+  lhs, group_sizes, input_dtype = _validate_args(lhs=lhs, rhs=rhs, group_sizes=group_sizes)
 
   # Gather shape information.
   m, k, n = (lhs.shape[0], lhs.shape[1], rhs.shape[2])
@@ -427,9 +399,7 @@ def _zero_acc():
         prev_grid_id = jnp.where(grid_id > 0, grid_id - 1, 0)
         is_first_processed_group = grid_id == 0
         m_tile_changed = m_tile_ids[grid_id] != m_tile_ids[prev_grid_id]
-        first_time_seeing_out = jnp.logical_or(
-            is_first_processed_group, m_tile_changed
-        )
+        first_time_seeing_out = jnp.logical_or(is_first_processed_group, m_tile_changed)
 
         @pl.when(first_time_seeing_out)
         def _init_out():
@@ -455,9 +425,7 @@ def _store_accum():
           tn=tn,
       )
       to_store = acc_scratch[...]
-      out[...] = jax.lax.select(
-          mask[...], to_store, out[...].astype(jnp.float32)
-      ).astype(preferred_element_type)
+      out[...] = jax.lax.select(mask[...], to_store, out[...].astype(jnp.float32)).astype(preferred_element_type)
 
     def _accum(is_last_k_tile):
       if is_last_k_tile:
@@ -555,13 +523,9 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
   rhs_bytes = (k * n) * rhs.itemsize  # We don't read all of rhs
   out_bytes = (m * n) * jnp.dtype(preferred_element_type).itemsize
   max_active_tiles = group_metadata[1].size
-  bytes_accessed = (
-      (lhs_bytes * tiles_n) + (rhs_bytes * max_active_tiles) + out_bytes
-  )
+  bytes_accessed = (lhs_bytes * tiles_n) + (rhs_bytes * max_active_tiles) + out_bytes
   flops = 2 * m * k * n
-  cost_estimate = pl.CostEstimate(
-      flops=flops, bytes_accessed=bytes_accessed, transcendentals=0
-  )
+  cost_estimate = pl.CostEstimate(flops=flops, bytes_accessed=bytes_accessed, transcendentals=0)
   if quant:
     pallas_call_fn = aqt_pl.pallas_call
   else:
@@ -581,8 +545,7 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
           scratch_shapes=[pltpu.VMEM((tm, tn), jnp.float32)],
       ),
       input_output_aliases=input_output_aliases,
-      compiler_params=pltpu.TPUCompilerParams(
-              dimension_semantics=("parallel", "arbitrary", "arbitrary")),
+      compiler_params=pltpu.TPUCompilerParams(dimension_semantics=("parallel", "arbitrary", "arbitrary")),
       interpret=interpret,
       cost_estimate=cost_estimate,
   )
@@ -590,7 +553,7 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
   if quant:
     lhs_contracting_axis, rhs_contracting_axis = dot_general_dims[0]
     # Since block_spec.block_shape of rhs is None, the first axis is reduced
-    # inside kernel, e.g., if block_shape is (None, tn, tk) then a tensor of 
+    # inside kernel, e.g., if block_shape is (None, tn, tk) then a tensor of
     # shape (tn, tk) will be feteched inside kernel instead of (1, tn, tk).
     # Therefore, we need to add one to rhs_contracting_axis.
     rhs_contracting_axis = map(lambda x: x + 1, rhs_contracting_axis)
@@ -659,16 +622,12 @@ def tgmm(
     group_offset = jnp.array([0], dtype=jnp.int32)
   else:
     group_offset = group_offset[None]
-  lhs, group_sizes, input_dtype = _validate_args(
-      lhs=lhs, rhs=rhs, group_sizes=group_sizes, expected_rhs_dims=2
-  )
+  lhs, group_sizes, input_dtype = _validate_args(lhs=lhs, rhs=rhs, group_sizes=group_sizes, expected_rhs_dims=2)
 
   # Gather shape information.
   k, m, n = (lhs.shape[0], lhs.shape[1], rhs.shape[1])
   num_groups = group_sizes.shape[0]
-  num_actual_groups = (
-      num_actual_groups if num_actual_groups is not None else num_groups
-  )
+  num_actual_groups = num_actual_groups if num_actual_groups is not None else num_groups
 
   # If tiling is callable, look up the problem dimensions in the LUT. If no tuned
   # tile dimensions are available throw an error.
@@ -718,9 +677,7 @@ def _zero_acc():
       acc_scratch[...] = jnp.zeros_like(acc_scratch)
 
     # We'll only do computation if our group has a nonzero number of rows in it.
-    dont_skip = (
-        _get_group_size(grid_id=grid_id, group_metadata=group_metadata) > 0
-    )
+    dont_skip = _get_group_size(grid_id=grid_id, group_metadata=group_metadata) > 0
 
     @pl.when(dont_skip)
     def _do():
@@ -807,19 +764,13 @@ def out_transform_indices(n_i, k_i, grid_id, group_metadata, group_offset):
   rhs_bytes = rhs.size * rhs.itemsize
   out_bytewidth = jnp.dtype(preferred_element_type).itemsize
   out_bytes = (num_actual_groups * k * n) * out_bytewidth
-  bytes_accessed = (
-      (lhs_bytes * tiles_n) + (rhs_bytes * tiles_k) + out_bytes
-  )
+  bytes_accessed = (lhs_bytes * tiles_n) + (rhs_bytes * tiles_k) + out_bytes
   flops = 2 * m * k * n
-  cost_estimate = pl.CostEstimate(
-      flops=flops, bytes_accessed=bytes_accessed, transcendentals=0
-  )
+  cost_estimate = pl.CostEstimate(flops=flops, bytes_accessed=bytes_accessed, transcendentals=0)
   lhs = lhs.swapaxes(0, 1)
   call_gmm = pl.pallas_call(
       kernel,
-      out_shape=jax.ShapeDtypeStruct(
-          (num_actual_groups, k, n), preferred_element_type
-      ),
+      out_shape=jax.ShapeDtypeStruct((num_actual_groups, k, n), preferred_element_type),
       grid_spec=pltpu.PrefetchScalarGridSpec(
           num_scalar_prefetch=2,
           in_specs=[
@@ -832,8 +783,7 @@ def out_transform_indices(n_i, k_i, grid_id, group_metadata, group_offset):
           scratch_shapes=[pltpu.VMEM((tk, tn), jnp.float32)],
       ),
       input_output_aliases=input_output_aliases,
-      compiler_params=pltpu.TPUCompilerParams(
-              dimension_semantics=("parallel", "arbitrary", "arbitrary")),
+      compiler_params=pltpu.TPUCompilerParams(dimension_semantics=("parallel", "arbitrary", "arbitrary")),
       interpret=interpret,
       cost_estimate=cost_estimate,
   )
diff --git a/MaxText/kernels/megablox/ops.py b/MaxText/kernels/megablox/ops.py
index 265566047..ba04fff54 100644
--- a/MaxText/kernels/megablox/ops.py
+++ b/MaxText/kernels/megablox/ops.py
@@ -111,4 +111,4 @@ def _gmm_bwd(
   return grad_lhs, grad_rhs, None, None, grad
 
 
-gmm.defvjp(_gmm_fwd, _gmm_bwd)
\ No newline at end of file
+gmm.defvjp(_gmm_fwd, _gmm_bwd)
diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index dadeaff46..2912a19d0 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -395,7 +395,7 @@ def unpermute(self, intermediate, sorted_selected_experts, weights):
       )
     updated_batch = int(self.config.per_device_batch_size * jax.device_count() // self.config.ici_fsdp_parallelism)
     # inferencing hack
-    # prefill has BS =1 sequence length = max_prefill_length 
+    # prefill has BS =1 sequence length = max_prefill_length
     # decode has BS = B, sequence_length= 1
     if output.shape[0] % updated_batch != 0:
       updated_batch = 1
@@ -416,7 +416,11 @@ def gmm(inputs, kernel, group_sizes):
       inputs = inputs.astype(self.dtype)
       kernel = kernel.astype(self.dtype)
       output = mblx.gmm(
-          lhs=inputs, rhs=kernel, group_sizes=group_sizes, preferred_element_type=jnp.bfloat16, tiling=tile_size,
+          lhs=inputs,
+          rhs=kernel,
+          group_sizes=group_sizes,
+          preferred_element_type=jnp.bfloat16,
+          tiling=tile_size,
           quant=True if self.quant else False,
       )
 
@@ -665,11 +669,11 @@ def __call__(self, inputs):
       _ = self.dense_matmul(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
 
       if quantizations.in_serve_mode(self.quant):
-        w0_kernel = self.variables['aqt']['AqtEinsum_0']['AqtDotGeneral_0']['qrhs']['frozen']
-        w1_kernel = self.variables['aqt']['AqtEinsum_1']['AqtDotGeneral_0']['qrhs']['frozen']
-        wo_kernel = self.variables['aqt']['AqtEinsum_2']['AqtDotGeneral_0']['qrhs']['frozen']
+        w0_kernel = self.variables["aqt"]["AqtEinsum_0"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
+        w1_kernel = self.variables["aqt"]["AqtEinsum_1"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
+        wo_kernel = self.variables["aqt"]["AqtEinsum_2"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
 
-        # Currently, megablox kernel does not accept QTensor as inputs. 
+        # Currently, megablox kernel does not accept QTensor as inputs.
         # Dequantizes before feeding it to megablox, as none of tesnsors are not quantized
         # there will be no acceleration during serving. This is just a temporary solution.
         w0_kernel = max_utils.unbox_logicallypartioned(w0_kernel).dequant()

From 35e243a0e6b2be616f0e36eb6aa82b737cd9a1b9 Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Mon, 25 Nov 2024 21:24:19 +0000
Subject: [PATCH 09/40] [rollback] quantize param when ckpt is not quantized.

---
 MaxText/maxengine.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/MaxText/maxengine.py b/MaxText/maxengine.py
index 02a30d21e..1a604b044 100644
--- a/MaxText/maxengine.py
+++ b/MaxText/maxengine.py
@@ -131,8 +131,7 @@ def load_params(self, *args, rng: Optional[jax.random.PRNGKey] = None, **kwargs)
     )
 
     if self.model.quant and not self.config.checkpoint_is_quantized:
-      # params = self.quantize_params(state, rng3)
-      params = state.params
+      params = self.quantize_params(state, rng3)
     else:
       params = state.params
     max_utils.print_mem_stats("After load_params")

From fa5d5f37a8678123df619cb947413488e27c922d Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Wed, 27 Nov 2024 22:02:42 +0000
Subject: [PATCH 10/40] Let `gmm` accept pre-quantized rhs (weight).

By accepting pre-quantized weight, quantized gmm does not need to
quantize weight for every iteration.
---
 MaxText/kernels/megablox/gmm.py | 11 ++++++++---
 MaxText/kernels/megablox/ops.py |  7 ++++---
 MaxText/layers/linears.py       | 34 +++++++++++++++++++++++----------
 3 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
index 0f7cdad5e..8e2a953a3 100644
--- a/MaxText/kernels/megablox/gmm.py
+++ b/MaxText/kernels/megablox/gmm.py
@@ -296,7 +296,7 @@ def _zero_uninitialized_memory(
 )
 def gmm(
     lhs: jnp.ndarray,
-    rhs: jnp.ndarray,
+    rhs: jnp.ndarray | QTensor,
     group_sizes: jnp.ndarray,
     preferred_element_type: jnp.dtype = jnp.float32,
     tiling: tuple[int, int, int] | LutFn | None = (128, 128, 128),
@@ -520,7 +520,11 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
     rhs_block_spec = pl.BlockSpec((None, tk, tn), rhs_transform_indices)
 
   lhs_bytes = lhs.size * lhs.itemsize
-  rhs_bytes = (k * n) * rhs.itemsize  # We don't read all of rhs
+  if isinstance(rhs, QTensor):
+    rhs_bytes = (k * n) * rhs.qvalue.itemsize # ignore scale factor as its size marginal.
+  else:
+    rhs_bytes = (k * n) * rhs.itemsize  # We don't read all of rhs
+
   out_bytes = (m * n) * jnp.dtype(preferred_element_type).itemsize
   max_active_tiles = group_metadata[1].size
   bytes_accessed = (lhs_bytes * tiles_n) + (rhs_bytes * max_active_tiles) + out_bytes
@@ -558,7 +562,8 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
     # Therefore, we need to add one to rhs_contracting_axis.
     rhs_contracting_axis = map(lambda x: x + 1, rhs_contracting_axis)
     lhs = aqt_pl.quant(lhs, 8, lhs_contracting_axis)
-    rhs = aqt_pl.quant(rhs, 8, list(rhs_contracting_axis))
+    if not isinstance(rhs, QTensor):
+      rhs = aqt_pl.quant(rhs, 8, list(rhs_contracting_axis))
 
   out = call_gmm(
       group_metadata,
diff --git a/MaxText/kernels/megablox/ops.py b/MaxText/kernels/megablox/ops.py
index ba04fff54..2fad31322 100644
--- a/MaxText/kernels/megablox/ops.py
+++ b/MaxText/kernels/megablox/ops.py
@@ -17,6 +17,7 @@
 import jax
 from kernels.megablox import gmm as backend
 import jax.numpy as jnp
+from aqt.jax.v2 import aqt_tensor
 
 
 gmm = jax.custom_vjp(
@@ -27,7 +28,7 @@
 
 def _gmm_fwd(
     lhs: jnp.ndarray,
-    rhs: jnp.ndarray,
+    rhs: jnp.ndarray | aqt_tensor.QTensor,
     group_sizes: jnp.ndarray,
     preferred_element_type: jnp.dtype = jnp.float32,
     tiling: tuple[int, int, int] = (128, 128, 128),
@@ -40,7 +41,7 @@ def _gmm_fwd(
     jnp.ndarray,
     tuple[
         jnp.ndarray,
-        jnp.ndarray,
+        jnp.ndarray | aqt_tensor.QTensor,
         jnp.ndarray,
         jnp.ndarray | None,
         int,
@@ -70,7 +71,7 @@ def _gmm_bwd(
     quant: bool,
     residual: tuple[
         jnp.ndarray,
-        jnp.ndarray,
+        jnp.ndarray | aqt_tensor.QTensor,
         jnp.ndarray,
         jnp.ndarray | None,
         int,
diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 2912a19d0..29747b106 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -32,6 +32,7 @@
 from jax.experimental import shard_map
 import max_logging
 import max_utils
+from aqt.jax.v2 import aqt_tensor
 
 try:
   # from jax.experimental.pallas.ops.tpu import megablox as mblx
@@ -52,7 +53,7 @@
 
 RMSNorm = normalizations.RMSNorm
 Quant = quantizations.AqtQuantization
-
+QTensor = aqt_tensor.QTensor
 
 def _convert_to_activation_function(fn_or_string: Union[str, Callable[..., Any]]) -> Callable[..., Any]:
   """Convert a string to an activation function."""
@@ -421,7 +422,7 @@ def gmm(inputs, kernel, group_sizes):
           group_sizes=group_sizes,
           preferred_element_type=jnp.bfloat16,
           tiling=tile_size,
-          quant=True if self.quant else False,
+          quant=True if self.quant else False
       )
 
       if hs_shape[0] % pad_length:
@@ -431,15 +432,28 @@ def gmm(inputs, kernel, group_sizes):
     # Currently, we only support data and tensor parallelism with Megablox.
     # We all gather the input activations over tensor parallelism to follow strategy
     # in https://parsa.epfl.ch/course-info/cs723/papers/Megatron.pdf.
+    input_partition_spec = nn.logical_to_mesh_axes(("activation_batch", None, None))
+    gate_logits_pspec = nn.logical_to_mesh_axes(("activation_batch", None, None))
+    w0_pspec = nn.logical_to_mesh_axes((None, None, "mlp"))
+    w1_pspec = nn.logical_to_mesh_axes((None, None, "mlp"))
+    wo_pspec = nn.logical_to_mesh_axes((None, "mlp", None))
+
+    if isinstance(w0_kernel, QTensor):
+      w0_pspec = aqt_tensor.partition_spec(w0_pspec, [1,], dtype=w0_kernel.dtype, use_bias=False)
+    if isinstance(w1_kernel, QTensor):
+      w1_pspec = aqt_tensor.partition_spec(w1_pspec, [1,], dtype=w1_kernel.dtype, use_bias=False)
+    if isinstance(wo_kernel, QTensor):
+      wo_pspec = aqt_tensor.partition_spec(wo_pspec, [1,], dtype=wo_kernel.dtype, use_bias=False)
+
     @functools.partial(
         shard_map.shard_map,
         mesh=self.mesh,
         in_specs=(
-            (nn.logical_to_mesh_axes(("activation_batch", None, None))),
-            (nn.logical_to_mesh_axes(("activation_batch", None, None))),
-            (nn.logical_to_mesh_axes((None, None, "mlp"))),
-            (nn.logical_to_mesh_axes((None, None, "mlp"))),
-            (nn.logical_to_mesh_axes((None, "mlp", None))),
+            input_partition_spec,
+            gate_logits_pspec,
+            w0_pspec,
+            w1_pspec,
+            wo_pspec
         ),
         out_specs=(nn.logical_to_mesh_axes(("activation_batch", None, "activation_embed"))),
         check_rep=False,
@@ -676,9 +690,9 @@ def __call__(self, inputs):
         # Currently, megablox kernel does not accept QTensor as inputs.
         # Dequantizes before feeding it to megablox, as none of tesnsors are not quantized
         # there will be no acceleration during serving. This is just a temporary solution.
-        w0_kernel = max_utils.unbox_logicallypartioned(w0_kernel).dequant()
-        w1_kernel = max_utils.unbox_logicallypartioned(w1_kernel).dequant()
-        wo_kernel = max_utils.unbox_logicallypartioned(wo_kernel).dequant()
+        w0_kernel = max_utils.unbox_logicallypartioned(w0_kernel)
+        w1_kernel = max_utils.unbox_logicallypartioned(w1_kernel)
+        wo_kernel = max_utils.unbox_logicallypartioned(wo_kernel)
       return self.megablox(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
     else:
       max_logging.log("Running MoE matmul implementation.")

From 11f1f432f0d764f8103a7b84dcbe8ebb784951d9 Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Wed, 4 Dec 2024 22:10:59 +0000
Subject: [PATCH 11/40] Support different quantization precision for lhs / rhs
 of megablox.

Megablox now supports:
- int8 quantization
- int8w quantization
- int4w quantization
---
 MaxText/kernels/megablox/gmm.py | 98 +++++++++++++++++++--------------
 MaxText/kernels/megablox/ops.py | 26 ++++-----
 MaxText/layers/linears.py       | 30 +++++++---
 3 files changed, 89 insertions(+), 65 deletions(-)

diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
index 8e2a953a3..3d0645bb5 100644
--- a/MaxText/kernels/megablox/gmm.py
+++ b/MaxText/kernels/megablox/gmm.py
@@ -17,14 +17,14 @@
 from collections.abc import Callable
 import dataclasses
 import functools
-from typing import Any, Optional
+from typing import Any, Optional, Literal
 
 import jax
+import jax.numpy as jnp
 from jax import lax
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
 from kernels.megablox import common
-import jax.numpy as jnp
 
 from aqt.jax.v2 import pallas as aqt_pl
 from aqt.jax.v2 import aqt_tensor
@@ -292,7 +292,14 @@ def _zero_uninitialized_memory(
 
 @functools.partial(
     jax.jit,
-    static_argnames=["preferred_element_type", "tiling", "transpose_rhs", "interpret", "quant"],
+    static_argnames=[
+        "preferred_element_type",
+        "tiling",
+        "transpose_rhs",
+        "interpret",
+        "lhs_quantize_dtype",
+        "rhs_quantize_dtype",
+    ],
 )
 def gmm(
     lhs: jnp.ndarray,
@@ -304,7 +311,8 @@ def gmm(
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
-    quant: bool = False,
+    lhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
+    rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
 ) -> jnp.ndarray:
   """Compute lhs[sizes[i-1]:sizes[i], :] @ rhs for each group 'i'.
 
@@ -320,12 +328,23 @@ def gmm(
     transpose_rhs: True if the rhs needs to be transposed.
     interpret: Whether or not to run the kernel in interpret mode, helpful for
       testing and debugging.
-    quant: Whether to quantize lhs and rhs.
+    lhs_quantize_dtype: Bit precision of lhs after quantization.
+    rhs_quantirhs_quantize_dtypeze_bits: Bit precision of rhs after quantization. If rhs (weight) is already quantized this must be samed with precision of the given weight.
 
   Returns:
     A 2d, jnp.ndarray with shape [m, n].
   """
 
+  if rhs_quantize_dtype is None and isinstance(rhs, QTensor):
+    raise ValueError("rhs_quantize_dtype is None, but quantized rhs is given.")
+
+  if rhs_quantize_dtype is not None and isinstance(rhs, QTensor):
+    # If weight is alreeady quantized check precision.
+    if rhs_quantize_dtype != rhs.qvalue.dtype:
+      raise ValueError(
+          f"{rhs_quantize_dtype=} and already given quantized {rhs.qvalue.dtype=} does not have the same precision"
+      )
+
   if existing_out is not None:
     assert isinstance(existing_out, jax.Array)
     expected_dtype = existing_out.dtype
@@ -405,13 +424,13 @@ def _zero_acc():
         def _init_out():
           out[...] = existing_out[...]
 
-    def mask_k_rem(x, *, dim):
+    def mask_k_rem(x, *, dim, quantize):
       if k_rem == 0:
         return x
 
       orig_dtype = x.dtype
       iota = lax.broadcasted_iota(jnp.int32, x.shape, dim)
-      if not quant:
+      if quantize is None:
         x = x.astype(jnp.float32)
       else:
         x = x.astype(jnp.int32)
@@ -429,8 +448,8 @@ def _store_accum():
 
     def _accum(is_last_k_tile):
       if is_last_k_tile:
-        mask_k_rem_lhs = partial(mask_k_rem, dim=1)
-        mask_k_rem_rhs = partial(mask_k_rem, dim=int(transpose_rhs))
+        mask_k_rem_lhs = partial(mask_k_rem, dim=1, quantize=lhs_quantize_dtype is not None)
+        mask_k_rem_rhs = partial(mask_k_rem, dim=int(transpose_rhs), quantize=rhs_quantize_dtype is not None)
       else:
         mask_k_rem_lhs = lambda x: x
         mask_k_rem_rhs = lambda x: x
@@ -454,21 +473,12 @@ def _accum(is_last_k_tile):
       else:
         loaded_rhs = mask_k_rem_rhs(rhs[...]).astype(input_dtype)
 
-      if quant:
-        acc_scratch[...] += aqt_pl.dot_general(
-            loaded_lhs,
-            loaded_rhs,
-            preferred_element_type=jnp.float32,
-            dimension_numbers=dot_general_dims,
-        )
-      else:
-        acc_scratch[...] += lax.dot_general(
-            loaded_lhs,
-            loaded_rhs,
-            preferred_element_type=jnp.float32,
-            dimension_numbers=dot_general_dims,
-        )
-
+      acc_scratch[...] += aqt_pl.dot_general(
+          loaded_lhs,
+          loaded_rhs,
+          preferred_element_type=jnp.float32,
+          dimension_numbers=dot_general_dims,
+      )
       if is_last_k_tile:
         _store_accum()
 
@@ -509,9 +519,13 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
     input_output_aliases = {}
   else:
     in_out_block_spec = out_block_spec
-    input_output_aliases = {6: 0}
-    if quant:
-      input_output_aliases = {8: 0}
+    num_inputs = 6
+    # adding one more input because of scale factor of quantized tensor.
+    if lhs_quantize_dtype is not None:
+      num_inputs += 1
+    if rhs_quantize_dtype is not None:
+      num_inputs += 1
+    input_output_aliases = {num_inputs: 0}
 
   lhs_block_spec = pl.BlockSpec((tm, tk), lhs_transform_indices)
   if transpose_rhs:
@@ -521,7 +535,7 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
 
   lhs_bytes = lhs.size * lhs.itemsize
   if isinstance(rhs, QTensor):
-    rhs_bytes = (k * n) * rhs.qvalue.itemsize # ignore scale factor as its size marginal.
+    rhs_bytes = (k * n) * rhs.qvalue.itemsize  # ignore scale factor as its size marginal.
   else:
     rhs_bytes = (k * n) * rhs.itemsize  # We don't read all of rhs
 
@@ -530,7 +544,7 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
   bytes_accessed = (lhs_bytes * tiles_n) + (rhs_bytes * max_active_tiles) + out_bytes
   flops = 2 * m * k * n
   cost_estimate = pl.CostEstimate(flops=flops, bytes_accessed=bytes_accessed, transcendentals=0)
-  if quant:
+  if lhs_quantize_dtype is not None or rhs_quantize_dtype is not None:
     pallas_call_fn = aqt_pl.pallas_call
   else:
     pallas_call_fn = pl.pallas_call
@@ -554,16 +568,20 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
       cost_estimate=cost_estimate,
   )
 
-  if quant:
-    lhs_contracting_axis, rhs_contracting_axis = dot_general_dims[0]
-    # Since block_spec.block_shape of rhs is None, the first axis is reduced
-    # inside kernel, e.g., if block_shape is (None, tn, tk) then a tensor of
-    # shape (tn, tk) will be feteched inside kernel instead of (1, tn, tk).
-    # Therefore, we need to add one to rhs_contracting_axis.
-    rhs_contracting_axis = map(lambda x: x + 1, rhs_contracting_axis)
-    lhs = aqt_pl.quant(lhs, 8, lhs_contracting_axis)
-    if not isinstance(rhs, QTensor):
-      rhs = aqt_pl.quant(rhs, 8, list(rhs_contracting_axis))
+  lhs_contracting_axis, rhs_contracting_axis = dot_general_dims[0]
+  # Since block_spec.block_shape of rhs is None, the first axis is reduced
+  # inside kernel, e.g., if block_shape is (None, tn, tk) then a tensor of
+  # shape (tn, tk) will be feteched inside kernel instead of (1, tn, tk).
+  # Therefore, we need to add one to rhs_contracting_axis.
+  rhs_contracting_axis = map(lambda x: x + 1, rhs_contracting_axis)
+
+  if lhs_quantize_dtype is not None:
+    lhs_quantize_bits = 4 if lhs_quantize_dtype == jnp.int4 else 8
+    lhs = aqt_pl.quant(lhs, lhs_quantize_bits, lhs_contracting_axis)
+
+  if not isinstance(rhs, QTensor) and rhs_quantize_dtype is not None:
+    rhs_quantize_bits = 4 if rhs_quantize_dtype == jnp.int4 else 8
+    rhs = aqt_pl.quant(rhs, rhs_quantize_bits, list(rhs_contracting_axis))
 
   out = call_gmm(
       group_metadata,
@@ -589,7 +607,6 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
         "tiling",
         "num_actual_groups",
         "interpret",
-        "quant",
     ],
 )
 def tgmm(
@@ -602,7 +619,6 @@ def tgmm(
     num_actual_groups: int | None = None,
     existing_out: jnp.ndarray | None = None,
     interpret: bool = False,
-    quant: bool = False,
 ) -> jnp.ndarray:
   """Compute lhs[:, sizes[i-1]:sizes[i]] @ rhs[sizes[i-1]:sizes[i], :].
 
diff --git a/MaxText/kernels/megablox/ops.py b/MaxText/kernels/megablox/ops.py
index 2fad31322..c8706c227 100644
--- a/MaxText/kernels/megablox/ops.py
+++ b/MaxText/kernels/megablox/ops.py
@@ -18,11 +18,11 @@
 from kernels.megablox import gmm as backend
 import jax.numpy as jnp
 from aqt.jax.v2 import aqt_tensor
-
+from typing import Literal
 
 gmm = jax.custom_vjp(
     backend.gmm,
-    nondiff_argnums=(3, 4, 7, 8, 9),
+    nondiff_argnums=(3, 4, 7, 8, 9, 10),
 )
 
 
@@ -36,7 +36,8 @@ def _gmm_fwd(
     existing_out: jnp.ndarray | None = None,
     transpose_rhs: bool = False,
     interpret: bool = False,
-    quant: bool = False,
+    lhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
+    rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
 ) -> tuple[
     jnp.ndarray,
     tuple[
@@ -58,7 +59,8 @@ def _gmm_fwd(
       existing_out,
       transpose_rhs=transpose_rhs,
       interpret=interpret,
-      quant=quant,
+      lhs_quantize_dtype=lhs_quantize_dtype,
+      rhs_quantize_dtype=rhs_quantize_dtype,
   )
   return out, (lhs, rhs, group_sizes, group_offset, rhs.shape[0])
 
@@ -68,7 +70,8 @@ def _gmm_bwd(
     tiling: tuple[int, int, int],
     transpose_rhs: bool,
     interpret: bool,
-    quant: bool,
+    lhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None,
+    rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None,
     residual: tuple[
         jnp.ndarray,
         jnp.ndarray | aqt_tensor.QTensor,
@@ -90,18 +93,11 @@ def _gmm_bwd(
       group_offset,
       transpose_rhs=not transpose_rhs,
       interpret=interpret,
-      quant=quant,
+      lhs_quantize_dtype=lhs_quantize_dtype,
+      rhs_quantize_dtype=rhs_quantize_dtype,
   )
   grad_rhs = backend.tgmm(
-      lhs.swapaxes(0, 1),
-      grad,
-      group_sizes,
-      rhs.dtype,
-      tiling,
-      group_offset,
-      num_actual_groups,
-      interpret=interpret,
-      quant=quant,
+      lhs.swapaxes(0, 1), grad, group_sizes, rhs.dtype, tiling, group_offset, num_actual_groups, interpret=interpret
   )
 
   # NOTE: If the rhs transposition is fused into the forward pass we need to
diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 29747b106..fcf140cbd 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -55,6 +55,7 @@
 Quant = quantizations.AqtQuantization
 QTensor = aqt_tensor.QTensor
 
+
 def _convert_to_activation_function(fn_or_string: Union[str, Callable[..., Any]]) -> Callable[..., Any]:
   """Convert a string to an activation function."""
   if fn_or_string == "linear":
@@ -416,15 +417,30 @@ def gmm(inputs, kernel, group_sizes):
 
       inputs = inputs.astype(self.dtype)
       kernel = kernel.astype(self.dtype)
+
+      quantization_config: str = self.config.quantization
+      match quantization_config:
+        case "int8":
+          lhs_quantize_dtype = jnp.int8
+          rhs_quantize_dtype = jnp.int8
+        case "int8w":
+          lhs_quantize_dtype = None
+          rhs_quantize_dtype = jnp.int8
+        case "int4w":
+          lhs_quantize_dtype = None
+          rhs_quantize_dtype = jnp.int4
+        case _:
+          lhs_quantize_dtype = None
+          rhs_quantize_dtype = None
       output = mblx.gmm(
           lhs=inputs,
           rhs=kernel,
           group_sizes=group_sizes,
           preferred_element_type=jnp.bfloat16,
           tiling=tile_size,
-          quant=True if self.quant else False
+          lhs_quantize_dtype=lhs_quantize_dtype,
+          rhs_quantize_dtype=rhs_quantize_dtype,
       )
-
       if hs_shape[0] % pad_length:
         output = output[: hs_shape[0]]
       return output
@@ -438,6 +454,7 @@ def gmm(inputs, kernel, group_sizes):
     w1_pspec = nn.logical_to_mesh_axes((None, None, "mlp"))
     wo_pspec = nn.logical_to_mesh_axes((None, "mlp", None))
 
+
     if isinstance(w0_kernel, QTensor):
       w0_pspec = aqt_tensor.partition_spec(w0_pspec, [1,], dtype=w0_kernel.dtype, use_bias=False)
     if isinstance(w1_kernel, QTensor):
@@ -445,16 +462,11 @@ def gmm(inputs, kernel, group_sizes):
     if isinstance(wo_kernel, QTensor):
       wo_pspec = aqt_tensor.partition_spec(wo_pspec, [1,], dtype=wo_kernel.dtype, use_bias=False)
 
+
     @functools.partial(
         shard_map.shard_map,
         mesh=self.mesh,
-        in_specs=(
-            input_partition_spec,
-            gate_logits_pspec,
-            w0_pspec,
-            w1_pspec,
-            wo_pspec
-        ),
+        in_specs=(input_partition_spec, gate_logits_pspec, w0_pspec, w1_pspec, wo_pspec),
         out_specs=(nn.logical_to_mesh_axes(("activation_batch", None, "activation_embed"))),
         check_rep=False,
     )

From 1844936bcb84835f9fc9a688e6d375e0588cd865 Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Thu, 5 Dec 2024 21:18:08 +0000
Subject: [PATCH 12/40] Fix license

---
 MaxText/kernels/megablox/__init__.py | 14 ++++++++++++++
 MaxText/kernels/megablox/common.py   |  2 +-
 MaxText/kernels/megablox/gmm.py      |  2 +-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/MaxText/kernels/megablox/__init__.py b/MaxText/kernels/megablox/__init__.py
index 431f56c9b..b432ac431 100644
--- a/MaxText/kernels/megablox/__init__.py
+++ b/MaxText/kernels/megablox/__init__.py
@@ -1 +1,15 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from kernels.megablox.ops import gmm
diff --git a/MaxText/kernels/megablox/common.py b/MaxText/kernels/megablox/common.py
index f0854f85c..51d80663e 100644
--- a/MaxText/kernels/megablox/common.py
+++ b/MaxText/kernels/megablox/common.py
@@ -1,4 +1,4 @@
-# Copyright 2024 The JAX Authors.
+# Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
index 3d0645bb5..3b389acf4 100644
--- a/MaxText/kernels/megablox/gmm.py
+++ b/MaxText/kernels/megablox/gmm.py
@@ -1,4 +1,4 @@
-# Copyright 2024 The JAX Authors.
+# Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 0d9db17c9e1eb7ab48967f2e88cfeefdcea6df5e Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Thu, 5 Dec 2024 21:19:16 +0000
Subject: [PATCH 13/40] Refactoring MoEBlock

---
 MaxText/layers/linears.py | 91 ++++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 44 deletions(-)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index fcf140cbd..6903b69e7 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -33,14 +33,7 @@
 import max_logging
 import max_utils
 from aqt.jax.v2 import aqt_tensor
-
-try:
-  # from jax.experimental.pallas.ops.tpu import megablox as mblx
-  from kernels import megablox as mblx
-
-except ImportError:
-  max_logging.log("JAX megablox is available for TPU only.")
-  pass
+from kernels import megablox as mblx
 
 Array = common_types.Array
 Config = common_types.Config
@@ -418,20 +411,22 @@ def gmm(inputs, kernel, group_sizes):
       inputs = inputs.astype(self.dtype)
       kernel = kernel.astype(self.dtype)
 
-      quantization_config: str = self.config.quantization
-      match quantization_config:
-        case "int8":
-          lhs_quantize_dtype = jnp.int8
-          rhs_quantize_dtype = jnp.int8
-        case "int8w":
-          lhs_quantize_dtype = None
-          rhs_quantize_dtype = jnp.int8
-        case "int4w":
-          lhs_quantize_dtype = None
-          rhs_quantize_dtype = jnp.int4
-        case _:
-          lhs_quantize_dtype = None
-          rhs_quantize_dtype = None
+      # 'int8' for dynamic range quantization using 8-bits
+      # 'int8w' for weight only quantization using 8-bits
+      # 'int4w' for weight only quantization using 4-bits
+      quantization_config = self.config.quantization
+      quantization_types = {
+          "int8": (jnp.int8, jnp.int8),
+          "int8w": (None, jnp.int8),
+          "int4w": (None, jnp.int4),
+      }
+      lhs_quantize_dtype, rhs_quantize_dtype = None, None
+      if quantization_config:
+        if quantization_config in quantization_types:
+          lhs_quantize_dtype, rhs_quantize_dtype = quantization_types[quantization_config]
+        else:
+          raise ValueError(f"{quantization_config=} is not yet supported in megablox.")
+
       output = mblx.gmm(
           lhs=inputs,
           rhs=kernel,
@@ -671,6 +666,33 @@ def dense_matmul(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
         )
       return output, None
 
+  def retrieve_quantized_weight(self, inputs, gate_logits) -> tuple[QTensor, QTensor, QTensor]:
+    # This is called only during tracing. This is to invoke creation of quantized tensor inside AqtEinsum.
+    # After jit, this will become no-op and will not affect performance.
+    _ = self.capped_dense(inputs, gate_logits)
+
+    w0_kernel = self.variables["aqt"]["AqtEinsum_0"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
+    w1_kernel = self.variables["aqt"]["AqtEinsum_1"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
+    wo_kernel = self.variables["aqt"]["AqtEinsum_2"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
+
+    w0_kernel = max_utils.unbox_logicallypartioned(w0_kernel)
+    w1_kernel = max_utils.unbox_logicallypartioned(w1_kernel)
+    wo_kernel = max_utils.unbox_logicallypartioned(wo_kernel)
+    return w0_kernel, w1_kernel, wo_kernel
+
+  def capped_dense(self, inputs, gate_logits):
+    cfg = self.config
+    w0_kernel, w1_kernel, wo_kernel = self.generate_kernels(cfg.num_experts, cfg.emb_dim, cfg.mlp_dim)
+    return self.dense_matmul(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
+
+  def no_capped_megablox(self, inputs, gate_logits):
+    if quantizations.in_serve_mode(self.quant):
+      w0_kernel, w1_kernel, wo_kernel = self.retrieve_quantized_weight(inputs, gate_logits)
+    else:
+      cfg = self.config
+      w0_kernel, w1_kernel, wo_kernel = self.generate_kernels(cfg.num_experts, cfg.emb_dim, cfg.mlp_dim)
+    return self.megablox(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
+
   @nn.compact
   def __call__(self, inputs):
     cfg = self.config
@@ -685,27 +707,8 @@ def __call__(self, inputs):
         name="gate",
         matmul_precision=self.config.matmul_precision,
     )(inputs)
-
-    w0_kernel, w1_kernel, wo_kernel = self.generate_kernels(cfg.num_experts, cfg.emb_dim, cfg.mlp_dim)
-
+    max_logging.log("Running MoE megablox implementation.")
     if cfg.megablox:
-      max_logging.log("Running MoE megablox implementation.")
-      # This is called only during tracing. This is to invoke creation of quantized tensor.
-      # After jit, this will become no-op and will not affect performance.
-      _ = self.dense_matmul(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
-
-      if quantizations.in_serve_mode(self.quant):
-        w0_kernel = self.variables["aqt"]["AqtEinsum_0"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
-        w1_kernel = self.variables["aqt"]["AqtEinsum_1"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
-        wo_kernel = self.variables["aqt"]["AqtEinsum_2"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
-
-        # Currently, megablox kernel does not accept QTensor as inputs.
-        # Dequantizes before feeding it to megablox, as none of tesnsors are not quantized
-        # there will be no acceleration during serving. This is just a temporary solution.
-        w0_kernel = max_utils.unbox_logicallypartioned(w0_kernel)
-        w1_kernel = max_utils.unbox_logicallypartioned(w1_kernel)
-        wo_kernel = max_utils.unbox_logicallypartioned(wo_kernel)
-      return self.megablox(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
+      return self.no_capped_megablox(inputs, gate_logits)
     else:
-      max_logging.log("Running MoE matmul implementation.")
-      return self.dense_matmul(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
+      return self.capped_dense(inputs, gate_logits)

From cd8b2d1bad0179c233367ec5dca88a768ca3c852 Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Thu, 5 Dec 2024 21:32:15 +0000
Subject: [PATCH 14/40] Rename in_out_block_spec for better readability

---
 MaxText/kernels/megablox/gmm.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
index 3b389acf4..440f1aab6 100644
--- a/MaxText/kernels/megablox/gmm.py
+++ b/MaxText/kernels/megablox/gmm.py
@@ -519,13 +519,13 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
     input_output_aliases = {}
   else:
     in_out_block_spec = out_block_spec
-    num_inputs = 6
+    existing_out_arg_index = 6
     # adding one more input because of scale factor of quantized tensor.
     if lhs_quantize_dtype is not None:
-      num_inputs += 1
+      existing_out_arg_index += 1
     if rhs_quantize_dtype is not None:
-      num_inputs += 1
-    input_output_aliases = {num_inputs: 0}
+      existing_out_arg_index += 1
+    input_output_aliases = {existing_out_arg_index: 0}
 
   lhs_block_spec = pl.BlockSpec((tm, tk), lhs_transform_indices)
   if transpose_rhs:

From cfd3b1565d79b643071b617c31807ba175b6524d Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Fri, 6 Dec 2024 05:44:39 +0000
Subject: [PATCH 15/40] Fix lint error & refactor retrieving quantized weight
 logic

---
 MaxText/kernels/megablox/__init__.py |  1 +
 MaxText/kernels/megablox/common.py   |  2 +-
 MaxText/kernels/megablox/gmm.py      |  5 +++-
 MaxText/kernels/megablox/ops.py      |  2 ++
 MaxText/kernels/ragged_attention.py  | 15 ++++++-----
 MaxText/layers/linears.py            | 37 +++++++++++-----------------
 6 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/MaxText/kernels/megablox/__init__.py b/MaxText/kernels/megablox/__init__.py
index b432ac431..5df55e489 100644
--- a/MaxText/kernels/megablox/__init__.py
+++ b/MaxText/kernels/megablox/__init__.py
@@ -11,5 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Megablox kernel"""
 
 from kernels.megablox.ops import gmm
diff --git a/MaxText/kernels/megablox/common.py b/MaxText/kernels/megablox/common.py
index 51d80663e..d11c80387 100644
--- a/MaxText/kernels/megablox/common.py
+++ b/MaxText/kernels/megablox/common.py
@@ -45,7 +45,7 @@ def supports_bfloat16_matmul() -> bool:
 
 
 def assert_is_supported_dtype(dtype: jnp.dtype) -> None:
-  if dtype != jnp.bfloat16 and dtype != jnp.float32:
+  if dtype not in (jnp.bfloat16, jnp.float32):
     raise ValueError(f"Expected bfloat16 or float32 array but got {dtype}.")
 
 
diff --git a/MaxText/kernels/megablox/gmm.py b/MaxText/kernels/megablox/gmm.py
index 440f1aab6..0c2a06fde 100644
--- a/MaxText/kernels/megablox/gmm.py
+++ b/MaxText/kernels/megablox/gmm.py
@@ -14,6 +14,8 @@
 
 """Grouped matrix multiplication kernels for TPU written in Pallas."""
 
+# pylint: disable=too-many-positional-arguments, unnecessary-lambda-assignment
+
 from collections.abc import Callable
 import dataclasses
 import functools
@@ -329,7 +331,8 @@ def gmm(
     interpret: Whether or not to run the kernel in interpret mode, helpful for
       testing and debugging.
     lhs_quantize_dtype: Bit precision of lhs after quantization.
-    rhs_quantirhs_quantize_dtypeze_bits: Bit precision of rhs after quantization. If rhs (weight) is already quantized this must be samed with precision of the given weight.
+    rhs_quantirhs_quantize_dtypeze_bits: Bit precision of rhs after quantization.
+      If rhs (weight) is already quantized this must be samed with precision of the given weight.
 
   Returns:
     A 2d, jnp.ndarray with shape [m, n].
diff --git a/MaxText/kernels/megablox/ops.py b/MaxText/kernels/megablox/ops.py
index c8706c227..822b4852f 100644
--- a/MaxText/kernels/megablox/ops.py
+++ b/MaxText/kernels/megablox/ops.py
@@ -14,6 +14,8 @@
 
 """Grouped matrix multiplication operations with custom VJPs."""
 
+# pylint: disable=too-many-positional-arguments
+
 import jax
 from kernels.megablox import gmm as backend
 import jax.numpy as jnp
diff --git a/MaxText/kernels/ragged_attention.py b/MaxText/kernels/ragged_attention.py
index 58ff3e215..aadcfaf5d 100644
--- a/MaxText/kernels/ragged_attention.py
+++ b/MaxText/kernels/ragged_attention.py
@@ -20,13 +20,12 @@
 
 import jax
 from jax import lax
+from jax.experimental import shard_map
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
 import jax.numpy as jnp
 import common_types
 
-from jax.experimental import shard_map
-
 
 BATCH = common_types.BATCH
 DEFAULT_MASK_VALUE = common_types.DEFAULT_MASK_VALUE
@@ -144,7 +143,7 @@ def reference_gqa(
   return o, logits_max, denominator
 
 
-def ragged_flash_attention_kernel(
+def ragged_flash_attention_kernel(  # pylint: disable=too-many-positional-arguments
     lengths_ref,
     q_ref,
     k_ref,
@@ -255,11 +254,11 @@ def compute_ragged_block_indices(b, i, lengths_ref):
           ],
           grid=(batch_size, seq_len // block_size),
       ),
-      compiler_params=dict(
-          mosaic=dict(
-              dimension_semantics=("parallel", "arbitrary"),
-          )
-      ),
+      compiler_params={
+          "mosaic": {
+              "dimension_semantics": ("parallel", "arbitrary"),
+          },
+      },
       out_shape=[
           jax.ShapeDtypeStruct((batch_size, num_heads, head_dim), jnp.float32),
           jax.ShapeDtypeStruct((batch_size, num_heads, head_dim), jnp.float32),
diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 6903b69e7..ae017f724 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -449,14 +449,12 @@ def gmm(inputs, kernel, group_sizes):
     w1_pspec = nn.logical_to_mesh_axes((None, None, "mlp"))
     wo_pspec = nn.logical_to_mesh_axes((None, "mlp", None))
 
-
     if isinstance(w0_kernel, QTensor):
-      w0_pspec = aqt_tensor.partition_spec(w0_pspec, [1,], dtype=w0_kernel.dtype, use_bias=False)
+      w0_pspec = aqt_tensor.partition_spec(w0_pspec, (1,), w0_kernel.dtype, use_bias=False)
     if isinstance(w1_kernel, QTensor):
-      w1_pspec = aqt_tensor.partition_spec(w1_pspec, [1,], dtype=w1_kernel.dtype, use_bias=False)
+      w1_pspec = aqt_tensor.partition_spec(w1_pspec, (1,), w1_kernel.dtype, use_bias=False)
     if isinstance(wo_kernel, QTensor):
-      wo_pspec = aqt_tensor.partition_spec(wo_pspec, [1,], dtype=wo_kernel.dtype, use_bias=False)
-
+      wo_pspec = aqt_tensor.partition_spec(wo_pspec, (1,), wo_kernel.dtype, use_bias=False)
 
     @functools.partial(
         shard_map.shard_map,
@@ -666,10 +664,12 @@ def dense_matmul(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
         )
       return output, None
 
-  def retrieve_quantized_weight(self, inputs, gate_logits) -> tuple[QTensor, QTensor, QTensor]:
+  def retrieve_quantized_weight(
+      self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel
+  ) -> tuple[QTensor, QTensor, QTensor]:
     # This is called only during tracing. This is to invoke creation of quantized tensor inside AqtEinsum.
     # After jit, this will become no-op and will not affect performance.
-    _ = self.capped_dense(inputs, gate_logits)
+    _ = self.dense_matmul(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
 
     w0_kernel = self.variables["aqt"]["AqtEinsum_0"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
     w1_kernel = self.variables["aqt"]["AqtEinsum_1"]["AqtDotGeneral_0"]["qrhs"]["frozen"]
@@ -680,19 +680,6 @@ def retrieve_quantized_weight(self, inputs, gate_logits) -> tuple[QTensor, QTens
     wo_kernel = max_utils.unbox_logicallypartioned(wo_kernel)
     return w0_kernel, w1_kernel, wo_kernel
 
-  def capped_dense(self, inputs, gate_logits):
-    cfg = self.config
-    w0_kernel, w1_kernel, wo_kernel = self.generate_kernels(cfg.num_experts, cfg.emb_dim, cfg.mlp_dim)
-    return self.dense_matmul(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
-
-  def no_capped_megablox(self, inputs, gate_logits):
-    if quantizations.in_serve_mode(self.quant):
-      w0_kernel, w1_kernel, wo_kernel = self.retrieve_quantized_weight(inputs, gate_logits)
-    else:
-      cfg = self.config
-      w0_kernel, w1_kernel, wo_kernel = self.generate_kernels(cfg.num_experts, cfg.emb_dim, cfg.mlp_dim)
-    return self.megablox(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
-
   @nn.compact
   def __call__(self, inputs):
     cfg = self.config
@@ -708,7 +695,13 @@ def __call__(self, inputs):
         matmul_precision=self.config.matmul_precision,
     )(inputs)
     max_logging.log("Running MoE megablox implementation.")
+    cfg = self.config
+    w0_kernel, w1_kernel, wo_kernel = self.generate_kernels(cfg.num_experts, cfg.emb_dim, cfg.mlp_dim)
     if cfg.megablox:
-      return self.no_capped_megablox(inputs, gate_logits)
+      if quantizations.in_serve_mode(self.quant):
+        w0_kernel, w1_kernel, wo_kernel = self.retrieve_quantized_weight(
+            inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel
+        )
+      return self.megablox(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
     else:
-      return self.capped_dense(inputs, gate_logits)
+      return self.dense_matmul(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)

From 5d510307268c10e912fdeffc9a92efb46a3b0541 Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Fri, 6 Dec 2024 06:17:00 +0000
Subject: [PATCH 16/40] fix lint error

---
 MaxText/kernels/megablox/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MaxText/kernels/megablox/ops.py b/MaxText/kernels/megablox/ops.py
index 822b4852f..1fce34b8a 100644
--- a/MaxText/kernels/megablox/ops.py
+++ b/MaxText/kernels/megablox/ops.py
@@ -17,8 +17,8 @@
 # pylint: disable=too-many-positional-arguments
 
 import jax
-from kernels.megablox import gmm as backend
 import jax.numpy as jnp
+from kernels.megablox import gmm as backend
 from aqt.jax.v2 import aqt_tensor
 from typing import Literal
 

From 325c4c38094b23a9cfffdb7779ba4b3b1671c558 Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Mon, 9 Dec 2024 04:30:26 +0000
Subject: [PATCH 17/40] Read lhs and rhs quantization dtype directly from
 DotGeneral

---
 MaxText/layers/linears.py | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index a144bbb13..6a4b1c3ed 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -411,21 +411,11 @@ def gmm(inputs, kernel, group_sizes):
       inputs = inputs.astype(self.dtype)
       kernel = kernel.astype(self.dtype)
 
-      # 'int8' for dynamic range quantization using 8-bits
-      # 'int8w' for weight only quantization using 8-bits
-      # 'int4w' for weight only quantization using 4-bits
-      quantization_config = self.config.quantization
-      quantization_types = {
-          "int8": (jnp.int8, jnp.int8),
-          "int8w": (None, jnp.int8),
-          "int4w": (None, jnp.int4),
-      }
       lhs_quantize_dtype, rhs_quantize_dtype = None, None
-      if quantization_config:
-        if quantization_config in quantization_types:
-          lhs_quantize_dtype, rhs_quantize_dtype = quantization_types[quantization_config]
-        else:
-          raise ValueError(f"{quantization_config=} is not yet supported in megablox.")
+      if self.quant is not None:
+        quant_dg = self.quant.quant_dg
+        lhs_quantize_dtype = quant_dg.fwd.dg_quantizer.lhs.numerics.get_dtype()
+        rhs_quantize_dtype = quant_dg.fwd.dg_quantizer.rhs.numerics.get_dtype()
 
       output = mblx.gmm(
           lhs=inputs,

From 95bb7c19142753121549e1ad2e765951e0604c7d Mon Sep 17 00:00:00 2001
From: Rissy Ran <ran.rissy@gmail.com>
Date: Tue, 26 Nov 2024 07:16:52 +0000
Subject: [PATCH 18/40] Fix MoE related tests

---
 MaxText/layers/linears.py                     |  1 +
 .../tpu/mixtral/8x22b/2_test_mixtral.sh       | 72 +------------------
 end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh | 12 ++--
 3 files changed, 9 insertions(+), 76 deletions(-)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 6a4b1c3ed..68f05df8f 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -629,6 +629,7 @@ def dense_matmul(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
         )
       return output, loss
     else:
+      top_k_weights /= top_k_weights.sum(-1, keepdims=True)
       weights = self.reshape_and_update_weights(top_k_weights, top_k_indices)
       inputs = nn.with_logical_constraint(inputs, ("activation_batch", "activation_length", "activation_embed"))
       with jax.named_scope("wi_0"):
diff --git a/end_to_end/tpu/mixtral/8x22b/2_test_mixtral.sh b/end_to_end/tpu/mixtral/8x22b/2_test_mixtral.sh
index 4bc7d2d3c..f0ca70cd4 100644
--- a/end_to_end/tpu/mixtral/8x22b/2_test_mixtral.sh
+++ b/end_to_end/tpu/mixtral/8x22b/2_test_mixtral.sh
@@ -12,9 +12,6 @@
 
 set -ex
 MODEL_VARIATION='8x22b'
-PREDICT_LEN=7
-ATOL=60.0
-RTOL=10.0
 
 if [ -z "${BASE_OUTPUT_PATH}" ]; then
     # Non-Googlers please remember to point BASE_OUTPUT_PATH to GCS buckets that you own, this script uses internal buckets for testing.
@@ -29,65 +26,7 @@ export SCANNED_CHECKPOINT=${BASE_OUTPUT_PATH}/${MODEL_VARIATION}/scanned_ckpt/0/
 
 export TOKENIZER_PATH=assets/tokenizer.mistral-v3
 
-# Run decoding with converted ckpt - matmul implementation
-python3 MaxText/decode.py MaxText/configs/base.yml \
-  load_parameters_path=${SCANNED_CHECKPOINT} run_name=scanned_decoding \
-  per_device_batch_size=1 model_name=mixtral-8x22b async_checkpointing=false \
-  tokenizer_path=${TOKENIZER_PATH} ici_tensor_parallelism=1 \
-  ici_fsdp_parallelism=-1 max_prefill_predict_length=64 max_target_length=64 \
-  prompt="[INST] I love to [/INST]" megablox=False weight_dtype=float16
- 
-# TODO(rdyro): add decoding test for megablox implementation
-#python3 MaxText/decode.py MaxText/configs/base.yml \
-#    load_parameters_path=${SCANNED_CHECKPOINT} run_name=scanned_decoding \
-#    per_device_batch_size=1 model_name=mixtral-8x22b async_checkpointing=false \
-#    tokenizer_path=${TOKENIZER_PATH} ici_tensor_parallelism=1 \
-#    ici_fsdp_parallelism=-1 max_prefill_predict_length=16 max_target_length=24 \
-#    prompt="[INST] I love to [/INST]" megablox=True weight_dtype=float16
- 
-# Test whether the forward pass logits match the golden logits - matmul implementation
-python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} \
-  load_parameters_path=${SCANNED_CHECKPOINT} run_name=matmul_forward_pass_test \
-  per_device_batch_size=1 model_name=mixtral-8x22b \
-  tokenizer_path=${TOKENIZER_PATH} ici_tensor_parallelism=1 \
-  ici_fsdp_parallelism=-1 max_prefill_predict_length=$PREDICT_LEN max_target_length=$PREDICT_LEN \
-  dataset_type=synthetic dtype=bfloat16 weight_dtype=float16 megablox=False \
-  --atol=$ATOL --rtol=$RTOL --token_size=$PREDICT_LEN
-  # TODO(rdyro): figure out the reason for numerical mismatch for some tokens
-
-python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} \
-  load_parameters_path=${SCANNED_CHECKPOINT} run_name=matmul_forward_pass_test \
-  per_device_batch_size=1 model_name=mixtral-8x22b \
-  tokenizer_path=${TOKENIZER_PATH} ici_tensor_parallelism=1 \
-  ici_fsdp_parallelism=-1 max_prefill_predict_length=$PREDICT_LEN max_target_length=$PREDICT_LEN \
-  dataset_type=synthetic dtype=bfloat16 weight_dtype=bfloat16 megablox=False \
-  --atol=$ATOL --rtol=$RTOL --token_size=$PREDICT_LEN
-  # TODO(rdyro): figure out the reason for numerical mismatch for some tokens
-
-# Test whether the forward pass logits match the golden logits - megablox implementation
-python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} \
-  load_parameters_path=${SCANNED_CHECKPOINT} run_name=megablox_forward_pass_test \
-  per_device_batch_size=1 model_name=mixtral-8x22b \
-  tokenizer_path=${TOKENIZER_PATH} ici_tensor_parallelism=1 \
-  ici_fsdp_parallelism=-1 max_prefill_predict_length=$PREDICT_LEN max_target_length=$PREDICT_LEN \
-  dataset_type=synthetic dtype=bfloat16 weight_dtype=bfloat16 megablox=True \
-  --atol=$ATOL --rtol=$RTOL --token_size=$PREDICT_LEN
-  # TODO(rdyro): figure out the reason for numerical mismatch for some tokens
-
-python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} \
-  load_parameters_path=${SCANNED_CHECKPOINT} run_name=megablox_forward_pass_test \
-  per_device_batch_size=1 model_name=mixtral-8x22b \
-  tokenizer_path=${TOKENIZER_PATH} ici_tensor_parallelism=1 \
-  ici_fsdp_parallelism=-1 max_prefill_predict_length=$PREDICT_LEN max_target_length=$PREDICT_LEN \
-  dataset_type=synthetic dtype=bfloat16 weight_dtype=float16 megablox=True \
-  --atol=$ATOL --rtol=$RTOL --token_size=$PREDICT_LEN
-  # TODO(rdyro): figure out the reason for numerical mismatch for some tokens
- 
-# training
+# TODO(ranran): enable the fine-tuning, decoding, and forward_pass_logit_checker tests once b/380148614 has been fixed
 
 # Run pre-training without load_parameters_path - megablox implementation
 python3 MaxText/train.py MaxText/configs/base.yml \
@@ -97,12 +36,3 @@ python3 MaxText/train.py MaxText/configs/base.yml \
   steps=5 max_target_length=1024 async_checkpointing=false \
   tokenizer_path=${TOKENIZER_PATH} attention=flash dtype=bfloat16 \
   weight_dtype=bfloat16 megablox=True
-
-# Run fine-tuning - megablox implementation
-python3 MaxText/train.py MaxText/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} dataset_path=${DATASET_PATH} \
-  load_parameters_path=${SCANNED_CHECKPOINT} run_name=fine_tuning \
-  per_device_batch_size=1 model_name=mixtral-8x22b ici_tensor_parallelism=1 \
-  ici_fsdp_parallelism=-1 steps=10 max_target_length=1024 \
-  async_checkpointing=false tokenizer_path=${TOKENIZER_PATH} checkpoint_period=100 \
-  attention=flash dtype=bfloat16 weight_dtype=bfloat16 megablox=False
diff --git a/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh b/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh
index 3574d46da..bc112677b 100644
--- a/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh
+++ b/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh
@@ -24,20 +24,22 @@ export DATASET_PATH=gs://maxtext-dataset
 # `SCANNED_CHECKPOINT` refers to the checkpoint that used for both `train.py` and `decode.py` 
 export SCANNED_CHECKPOINT=${BASE_OUTPUT_PATH}/${MODEL_VARIATION}/scanned_ckpt/0/items
 
+# `UNSCANNED_CHECKPOINT` refers to run decoding
+export UNSCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/unscanned_ckpt/checkpoints/0/items
+
 # Run decoding with converted ckpt - matmul implementation
 # TODO(ranran): add decoding test for megablox implementation
-python3 MaxText/decode.py MaxText/configs/base.yml load_parameters_path=${SCANNED_CHECKPOINT} run_name=scanned_decoding per_device_batch_size=1 model_name=mixtral-8x7b async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 ici_tensor_parallelism=4 ici_fsdp_parallelism=16 max_prefill_predict_length=11 max_target_length=24 prompt="[INST] I love to [/INST]" megablox=False
+python3 MaxText/decode.py MaxText/configs/base.yml load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=unscanned_decoding per_device_batch_size=1 model_name=mixtral-8x7b async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 ici_tensor_parallelism=4 ici_fsdp_parallelism=16 max_prefill_predict_length=11 max_target_length=24 prompt="[INST] I love to [/INST]" megablox=False scan_layers=false
 
 # Test whether the forward pass logits match the golden logits - matmul implementation
-python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${SCANNED_CHECKPOINT} run_name=matmul_forward_pass_test per_device_batch_size=8 model_name=mixtral-8x7b tokenizer_path=assets/tokenizer.mistral-v1 ici_tensor_parallelism=4 ici_fsdp_parallelism=16 max_prefill_predict_length=11 max_target_length=11 dataset_type=synthetic dtype=float32 megablox=False --atol=3 --rtol=1 --token_size=4
+python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=matmul_forward_pass_test per_device_batch_size=8 model_name=mixtral-8x7b tokenizer_path=assets/tokenizer.mistral-v1 ici_tensor_parallelism=4 ici_fsdp_parallelism=16 max_prefill_predict_length=11 max_target_length=11 dataset_type=synthetic dtype=float32 megablox=False scan_layers=false --atol=3 --rtol=1 --token_size=4
 
 # Test whether the forward pass logits match the golden logits - megablox implementation
-python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${SCANNED_CHECKPOINT} run_name=megablox_forward_pass_test per_device_batch_size=8 model_name=mixtral-8x7b tokenizer_path=assets/tokenizer.mistral-v1 ici_fsdp_parallelism=64 max_prefill_predict_length=11 max_target_length=11 dataset_type=synthetic dtype=bfloat16 weight_dtype=bfloat16 --atol=4 --rtol=1 --token_size=4
+# TODO(ranran): investigate the root cause of the excessive tolerance
+python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=megablox_forward_pass_test per_device_batch_size=8 model_name=mixtral-8x7b tokenizer_path=assets/tokenizer.mistral-v1 ici_fsdp_parallelism=64 max_prefill_predict_length=11 max_target_length=11 dataset_type=synthetic dtype=bfloat16 weight_dtype=bfloat16 scan_layers=false --atol=20 --rtol=10 --token_size=4
 
 # Run fine-tuning - megablox implementation
 python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} dataset_path=${DATASET_PATH} load_parameters_path=${SCANNED_CHECKPOINT} run_name=fine_tuning per_device_batch_size=8 model_name=mixtral-8x7b ici_fsdp_parallelism=64 steps=10 max_target_length=1024 async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 checkpoint_period=5 attention=flash dtype=bfloat16 weight_dtype=bfloat16
 
 # Run pre-training without load_parameters_path - megablox implementation
 python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} dataset_path=${DATASET_PATH} run_name=pre_training per_device_batch_size=8 enable_checkpointing=false model_name=mixtral-8x7b ici_fsdp_parallelism=64 steps=5 max_target_length=1024 async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 attention=flash dtype=bfloat16 weight_dtype=bfloat16
-
-# TODO(ranran): Run decoding with unscanned ckpt

From 7addc0f02c97ab6d4eeb657e99c80bcbeecfdfaa Mon Sep 17 00:00:00 2001
From: Xuefeng Gu <xfgu@google.com>
Date: Tue, 26 Nov 2024 21:20:21 +0000
Subject: [PATCH 19/40] Add checkpoint topology discovery for the Replicator
 Service

---
 MaxText/configs/base.yml |  6 ++++++
 MaxText/max_utils.py     | 43 ++++++++++++++++++++++++++++++++++++++++
 MaxText/pyconfig.py      | 27 ++++++++++---------------
 3 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/MaxText/configs/base.yml b/MaxText/configs/base.yml
index fdb99197f..5353686b6 100644
--- a/MaxText/configs/base.yml
+++ b/MaxText/configs/base.yml
@@ -208,6 +208,12 @@ local_checkpoint_directory: ""
 # It should be a positive number when and only when `enable_emergency_checkpoint` is True.
 local_checkpoint_period: 0
 
+# Whether or not to use emergency checkpoint with the replicator service.
+use_replicator_service: False
+
+# The interval to backup local checkpoints to the persistent storage.
+replicator_backup_interval_minutes: 0
+
 # Jax cache directory
 jax_cache_dir: "~/jax_cache"
 
diff --git a/MaxText/max_utils.py b/MaxText/max_utils.py
index 40fb8bbbb..550d22bf3 100644
--- a/MaxText/max_utils.py
+++ b/MaxText/max_utils.py
@@ -289,6 +289,34 @@ def initialize_jax_for_tpu_with_emergency_checkpointing(raw_keys):
         " coordinator_address to initialize JAX distributed runtime..."
     )
     jax.distributed.initialize(coordinator_address=coordinator_address, process_id=int(process_id))
+    if raw_keys["use_replicator_service"]:
+      REPLICATOR_FILE = "replicator.yaml"
+      TEMP_FILE = REPLICATOR_FILE + ".tmp"
+      replicator_file = epath.Path(raw_keys["local_checkpoint_directory"]) / REPLICATOR_FILE
+      temp_file = epath.Path(raw_keys["local_checkpoint_directory"]) / TEMP_FILE
+      num_slices = get_num_slices(raw_keys)
+      num_nodes = jax.process_count()
+      nodes_per_slice = num_nodes // num_slices
+      max_logging.log(f"num_slices: {num_slices}, num_nodes: {num_nodes}, nodes_per_slice: {nodes_per_slice}")
+      node_rank = jax.process_index()
+      peer_ranks = []
+      for i in range(num_slices):
+        peer = node_rank % nodes_per_slice + i * nodes_per_slice
+        if peer != node_rank:
+          peer_ranks.append(peer)
+      run_name = raw_keys["run_name"]
+      if run_name == "":
+        run_name = os.environ.get("JOBSET_NAME")  # using XPK default
+
+      replicator_yaml = f"""job-name: {run_name}
+      node-rank: {node_rank}
+      nodes: {num_nodes}
+      workers-per-node: 1
+      peer-ranks: {peer_ranks}
+      backup-interval-minutes: {raw_keys["replicator_backup_interval_minutes"]}"""
+
+      temp_file.write_text("\n".join([l.strip() for l in replicator_yaml.split("\n")]))
+      os.rename(temp_file, replicator_file)
   else:
     max_logging.log(
         "Initializing JAX distributed runtime without args when emergency checkpointing is"
@@ -319,6 +347,21 @@ def _retrieve_jax_init_info(raw_keys):
   return "", ""
 
 
+def get_num_slices(raw_keys):
+  """Calculate num_slices based on number of devices."""
+  if raw_keys["hardware"] == "cpu":
+    max_logging.log(" Setting num_slices=1 for CPU hardware type")
+    return 1
+  if int(raw_keys["compile_topology_num_slices"]) > 0:
+    return raw_keys["compile_topology_num_slices"]
+  else:
+    devices = jax.devices()
+    try:
+      return 1 + max(d.slice_index for d in devices)
+    except (ValueError, AttributeError):
+      return 1
+
+
 def is_cpu_backend(raw_keys):
   """Determine whether Maxtext is intended to run on a CPU backend."""
   return raw_keys["hardware"] == "cpu"
diff --git a/MaxText/pyconfig.py b/MaxText/pyconfig.py
index 5adc88199..fa80c9226 100644
--- a/MaxText/pyconfig.py
+++ b/MaxText/pyconfig.py
@@ -105,8 +105,16 @@ def validate_keys(keys):
     assert (
         keys["local_checkpoint_period"] > 0
     ), "A positive local checkpoint period must be specified when using emergency checkpoint"
+    if keys["use_replicator_service"]:
+      assert (
+          keys["replicator_backup_interval_minutes"] > 0
+      ), "Replicator service is enabled, the backup interval minutes must be positive"
   else:
-    max_logging.log("Not using emergency checkpoint, ignoring local_checkpoint_directory and local_checkpoint_period")
+    max_logging.log(
+        "Not using emergency checkpoint, ignoring local_checkpoint_directory, local_checkpoint_period,"
+        " use_replicator_service and replicator_backup_interval_minutes"
+    )
+
   if keys["num_experts"] > 1:
     validate_megablox_parallelism(keys)
 
@@ -388,7 +396,7 @@ def user_init(raw_keys):
         raw_keys["eval_per_device_batch_size"], raw_keys["expansion_factor_real_data"], get_num_target_devices(raw_keys), 1
     )
 
-    raw_keys["num_slices"] = get_num_slices(raw_keys)
+    raw_keys["num_slices"] = max_utils.get_num_slices(raw_keys)
     raw_keys["quantization_local_shard_count"] = get_quantization_local_shard_count(raw_keys)
 
     if using_pipeline_parallelism(raw_keys):
@@ -589,21 +597,6 @@ def get_num_target_devices(raw_keys):
     return len(jax.devices())
 
 
-def get_num_slices(raw_keys):
-  """Calculate num_slices based on number of devices."""
-  if raw_keys["hardware"] == "cpu":
-    max_logging.log(" Setting num_slices=1 for CPU hardware type")
-    return 1
-  if int(raw_keys["compile_topology_num_slices"]) > 0:
-    return raw_keys["compile_topology_num_slices"]
-  else:
-    devices = jax.devices()
-    try:
-      return 1 + max([d.slice_index for d in devices])
-    except:
-      return 1
-
-
 def get_quantization_local_shard_count(raw_keys):
   if raw_keys["quantization_local_shard_count"] == -1:
     return raw_keys["num_slices"]

From f6c7c5ad6997023bd052691499fe1de5d1e80202 Mon Sep 17 00:00:00 2001
From: Colin Gaffney <cpgaffney@google.com>
Date: Wed, 27 Nov 2024 10:09:21 -0800
Subject: [PATCH 20/40] Fix local restore by re-mapping device ids directly
 instead of inferring them from how process indexes changed across restarts
 with some false assumptions.

PiperOrigin-RevId: 700737164
---
 MaxText/max_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MaxText/max_utils.py b/MaxText/max_utils.py
index 550d22bf3..c74297ef3 100644
--- a/MaxText/max_utils.py
+++ b/MaxText/max_utils.py
@@ -325,6 +325,7 @@ def initialize_jax_for_tpu_with_emergency_checkpointing(raw_keys):
     jax.distributed.initialize()
 
   ocp.multihost.initialize_runtime_to_distributed_ids()
+  ocp.multihost.initialize_distributed_to_device_ids()
 
 
 def _retrieve_jax_init_info(raw_keys):

From dca3ee5ac8774e0db1fe03c4ba3693fe7b084af1 Mon Sep 17 00:00:00 2001
From: Zhihao Shan <zhihaoshan@google.com>
Date: Tue, 26 Nov 2024 16:40:55 -0800
Subject: [PATCH 21/40] Compact the number of variables for the prefill result
 cache to reduce Python layer overhead

---
 MaxText/configs/base.yml        |   3 +
 MaxText/maxengine.py            | 110 +++++++++++++++++++++++++++-----
 MaxText/tests/maxengine_test.py |  62 ++++++++++++++++++
 3 files changed, 159 insertions(+), 16 deletions(-)
 create mode 100644 MaxText/tests/maxengine_test.py

diff --git a/MaxText/configs/base.yml b/MaxText/configs/base.yml
index 5353686b6..8c22ac576 100644
--- a/MaxText/configs/base.yml
+++ b/MaxText/configs/base.yml
@@ -441,6 +441,9 @@ inference_microbenchmark_log_file_path: ""
 inference_metadata_file: "" # path to a json file
 enable_model_warmup: False
 
+# Stack prefill cache across the layer to reduce the
+# Python layer latency.
+stack_prefill_result_cache: False
 
 # KV Cache layout control
 # Logical layout: 0,1,2,3 ; CACHE_BATCH, CACHE_SEQUENCE, CACHE_HEADS, CACHE_KV
diff --git a/MaxText/maxengine.py b/MaxText/maxengine.py
index e1d4b0ca5..fca2fdb9e 100644
--- a/MaxText/maxengine.py
+++ b/MaxText/maxengine.py
@@ -131,12 +131,22 @@ def load_params(self, *args, rng: Optional[jax.random.PRNGKey] = None, **kwargs)
 
     self.prefill_kv_cache_annotations = max_utils.get_prefill_kv_cache_annotations(self.model, self.config, rng2, self._mesh)
     self.prefill_kv_cache_shardings = jax.tree_util.tree_map(
-        lambda x: jax.sharding.NamedSharding(self._mesh, x), self.prefill_kv_cache_annotations
+        lambda x: jax.sharding.NamedSharding(self._mesh, x),
+        self.prefill_kv_cache_annotations,
     )
 
+    if self.config.stack_prefill_result_cache:
+      # Add extra axis for the axis generated by the stack.
+      self.prefill_kv_cache_shardings = jax.tree_util.tree_map(
+          lambda x: jax.sharding.NamedSharding(self._mesh, jax.sharding.PartitionSpec(None, *x.spec)),
+          self.prefill_kv_cache_shardings,
+      )
+      self.prefill_kv_cache_shardings = self.prefill_kv_cache_shardings["decoder"]["layers_0"]
+
     self.kv_cache_annotations = max_utils.get_kv_cache_annotations(self.model, self.config, rng2, self._mesh)
     self.kv_cache_shardings = jax.tree_util.tree_map(
-        lambda x: jax.sharding.NamedSharding(self._mesh, x), self.kv_cache_annotations
+        lambda x: jax.sharding.NamedSharding(self._mesh, x),
+        self.kv_cache_annotations,
     )
 
     if self.model.quant and not self.config.checkpoint_is_quantized:
@@ -172,12 +182,40 @@ def model_apply(_p, _rng):
     params["aqt"] = new_vars["aqt"]
     params["params"] = quantizations.remove_quantized_params(state.params["params"], new_vars["aqt"])
     self.abstract_params = jax.tree_util.tree_map(
-        lambda x: jax.ShapeDtypeStruct(shape=x.shape, dtype=x.dtype, sharding=x.sharding), params
+        lambda x: jax.ShapeDtypeStruct(shape=x.shape, dtype=x.dtype, sharding=x.sharding),
+        params,
     )
     max_utils.save_quantized_checkpoint_if_configured(self.config, params)
     self.model.quant.quant_mode = quantizations.get_quant_mode("serve")
     return params
 
+  def _maybe_stack_prefill_result_cache(self, cache):
+    """Stack the caches across the layers."""
+    if not self.config.stack_prefill_result_cache:
+      return cache
+
+    layer_keys = []
+    for i in range(self.config.num_decoder_layers):
+      layer_keys.append(f"layers_{i}")
+
+    layer_cache = [cache["decoder"][layer_key] for layer_key in layer_keys]
+
+    return jax.tree.map(lambda *c: jnp.stack(c), *layer_cache)
+
+  def _maybe_unstack_prefill_result_cache(self, cache):
+    """Unstack the caches across the layers."""
+    if not self.config.stack_prefill_result_cache:
+      return cache
+
+    flat_cache, treedef = jax.tree.flatten(cache)
+    layer_cache = [jax.tree.unflatten(treedef, flat_cache_vars) for flat_cache_vars in zip(*flat_cache, strict=True)]
+    res_cache = {"decoder": {}}
+
+    for i in range(self.config.num_decoder_layers):
+      res_cache["decoder"][f"layers_{i}"] = layer_cache[i]
+
+    return res_cache
+
   @functools.partial(jax.jit, static_argnums=(0,))
   def prefill(
       self,
@@ -231,7 +269,9 @@ def prefill(
     next_pos = jnp.full((1, 1), true_length, dtype=jnp.int32)
     generated_tokens = jnp.zeros((1, 1), dtype=jnp.int32)
     selected_logits = jax.lax.dynamic_slice(
-        flat_logits, (0, true_length - 1, 0), (flat_logits.shape[0], 1, flat_logits.shape[2])
+        flat_logits,
+        (0, true_length - 1, 0),
+        (flat_logits.shape[0], 1, flat_logits.shape[2]),
     )
     selected_logits = jax.lax.with_sharding_constraint(selected_logits, self.replicated_sharding)
 
@@ -259,9 +299,12 @@ def prefill(
         samples_per_slot=1,
     )
 
+    cache = new_vars["cache"]
+    cache = self._maybe_stack_prefill_result_cache(cache)
+
     return {
         "logits": selected_logits,
-        "cache": new_vars["cache"],
+        "cache": cache,
         "next_pos": next_pos,
         "generated_tokens": generated_tokens,
         "tokens": first_generated_token,
@@ -346,9 +389,17 @@ def insert(
     """Insert into KV cache"""
     unboxed_prefix = max_utils.unbox_logicallypartioned(prefix)
 
+    unboxed_prefix["cache"] = self._maybe_unstack_prefill_result_cache(unboxed_prefix["cache"])
+
     def copy(path, partial_cache, full_cache, annotations):
       path_key = path[-1].key
-      if path_key in ["cache_ar_index", "cached_ar_key", "cached_ar_value", "cached_ar_key_scale", "cached_ar_value_scale"]:
+      if path_key in [
+          "cache_ar_index",
+          "cached_ar_key",
+          "cached_ar_value",
+          "cached_ar_key_scale",
+          "cached_ar_value_scale",
+      ]:
         return full_cache  # we don't even zero these out because we can mask them out.
 
       batch_idx = -1
@@ -388,12 +439,18 @@ def copy(path, partial_cache, full_cache, annotations):
         raise ValueError(f"We don't have a strategy for inserting {path_key}")
 
     inserted_cache = jax.tree_util.tree_map_with_path(
-        copy, unboxed_prefix["cache"], decode_state["cache"], self.kv_cache_annotations_named
+        copy,
+        unboxed_prefix["cache"],
+        decode_state["cache"],
+        self.kv_cache_annotations_named,
     )
     inserted_logits = jax.lax.dynamic_update_index_in_dim(decode_state["logits"], unboxed_prefix["logits"], slot, 0)
     inserted_next_pos = jax.lax.dynamic_update_index_in_dim(decode_state["next_pos"], unboxed_prefix["next_pos"], slot, 0)
     inserted_generated_tokens = jax.lax.dynamic_update_index_in_dim(
-        decode_state["generated_tokens"], unboxed_prefix["generated_tokens"], slot, 0
+        decode_state["generated_tokens"],
+        unboxed_prefix["generated_tokens"],
+        slot,
+        0,
     )
     inserted_tokens = jax.lax.dynamic_update_index_in_dim(decode_state["tokens"], unboxed_prefix["tokens"], slot, 0)
 
@@ -458,11 +515,26 @@ def init(abstract_params):
           mutable=["cache"],
       )
 
-      next_pos = jnp.zeros((int(self.config.per_device_batch_size * jax.device_count()), 1), dtype=jnp.int32)
-      generated_tokens = jnp.zeros((int(self.config.per_device_batch_size * jax.device_count()), 1), dtype=jnp.int32)
-      tokens = jnp.zeros((int(self.config.per_device_batch_size * jax.device_count()), 1), dtype=jnp.int32)
+      next_pos = jnp.zeros(
+          (int(self.config.per_device_batch_size * jax.device_count()), 1),
+          dtype=jnp.int32,
+      )
+      generated_tokens = jnp.zeros(
+          (int(self.config.per_device_batch_size * jax.device_count()), 1),
+          dtype=jnp.int32,
+      )
+      tokens = jnp.zeros(
+          (int(self.config.per_device_batch_size * jax.device_count()), 1),
+          dtype=jnp.int32,
+      )
       return {
-          "logits": jnp.zeros((int(self.config.per_device_batch_size * jax.device_count()), 1, self.config.vocab_size)),
+          "logits": jnp.zeros(
+              (
+                  int(self.config.per_device_batch_size * jax.device_count()),
+                  1,
+                  self.config.vocab_size,
+              )
+          ),
           "cache": cache["cache"],
           "next_pos": next_pos,
           "generated_tokens": generated_tokens,
@@ -477,7 +549,8 @@ def init(abstract_params):
       mesh_annotations = nn.logical_to_mesh(logical_annotations)
 
     shardings = jax.tree_util.tree_map(
-        lambda mesh_annotation: jax.sharding.NamedSharding(self._mesh, mesh_annotation), mesh_annotations
+        lambda mesh_annotation: jax.sharding.NamedSharding(self._mesh, mesh_annotation),
+        mesh_annotations,
     )
 
     @functools.partial(jax.jit, out_shardings=shardings)
@@ -519,16 +592,21 @@ def colocated_cpus(self) -> None:
     raise NotImplementedError
 
 
-def set_engine_vars_from_base_engine(engine: engine_api.Engine, base_engine: engine_api.Engine, rng: jax.random.PRNGKey):
+def set_engine_vars_from_base_engine(
+    engine: engine_api.Engine,
+    base_engine: engine_api.Engine,
+    rng: jax.random.PRNGKey,
+):
   """Set internal vars from base_engine, which has already loaded the checkpoint and has sharding,
   mesh, and kv cache related vars set.
   """
   engine.model.quant.quant_mode = base_engine.model.quant.quant_mode
   engine.state_mesh_annotations = base_engine.state_mesh_annotations
   engine.abstract_params = base_engine.abstract_params
-  engine.kv_cache_annotations = max_utils.get_kv_cache_annotations(engine.model, engine.config, rng, engine._mesh)  # pylint: disable=protected-access
+  engine.kv_cache_annotations = max_utils.get_kv_cache_annotations(engine.model, engine.config, rng, engine.mesh)  # pylint: disable=protected-access
   engine.kv_cache_shardings = jax.tree_util.tree_map(
-      lambda x: jax.sharding.NamedSharding(engine._mesh, x), engine.kv_cache_annotations  # pylint: disable=protected-access
+      lambda x: jax.sharding.NamedSharding(engine.mesh, x),
+      engine.kv_cache_annotations,  # pylint: disable=protected-access
   )
 
 
diff --git a/MaxText/tests/maxengine_test.py b/MaxText/tests/maxengine_test.py
new file mode 100644
index 000000000..c59e11b3e
--- /dev/null
+++ b/MaxText/tests/maxengine_test.py
@@ -0,0 +1,62 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+""" Tests for the maxengine """
+
+import jax
+from jax import numpy as jnp
+import numpy as np
+import unittest
+import pyconfig
+from maxengine import MaxEngine
+
+
+class MaxEngineTest(unittest.TestCase):
+  """Tests for MaxEngine."""
+
+  # TODO: add unit test for the MaxEngine.
+
+  def test_stack_and_unstack_prefill_cache(self):
+    pyconfig.initialize(
+        [None, "configs/base.yml"],
+        enable_checkpointing=False,
+        stack_prefill_result_cache=True,
+    )
+    config = pyconfig.config
+    engine = MaxEngine(config, jax.devices())
+    num_layers = engine.config.num_decoder_layers
+    input = {
+        "decoder": {},
+    }
+    for i in range(num_layers):
+      input["decoder"][f"layers_{i}"] = {
+          "a": jnp.ones((1, 10)),
+          "b": jnp.ones((1, 9)),
+      }
+
+    expected_stacked = {
+        "a": jnp.ones((num_layers, 1, 10)),
+        "b": jnp.ones((num_layers, 1, 9)),
+    }
+    got_stacked = engine._maybe_stack_prefill_result_cache(input)
+    jax.tree.map(np.testing.assert_array_equal, got_stacked, expected_stacked)
+
+    got_unstacked = engine._maybe_unstack_prefill_result_cache(got_stacked)
+    jax.tree.map(np.testing.assert_array_equal, got_unstacked, input)
+
+
+if __name__ == "__main__":
+  unittest.main()

From c993cf278b6eb47443655653d9bf5c9aa7a26ba4 Mon Sep 17 00:00:00 2001
From: Rissy Ran <ran.rissy@gmail.com>
Date: Wed, 27 Nov 2024 19:37:11 +0000
Subject: [PATCH 22/40] add more MoE tests

---
 end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh b/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh
index bc112677b..fa6acdd35 100644
--- a/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh
+++ b/end_to_end/tpu/mixtral/8x7b/2_test_mixtral.sh
@@ -31,15 +31,21 @@ export UNSCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/unscanned_ckpt/checkpoints/0/item
 # TODO(ranran): add decoding test for megablox implementation
 python3 MaxText/decode.py MaxText/configs/base.yml load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=unscanned_decoding per_device_batch_size=1 model_name=mixtral-8x7b async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 ici_tensor_parallelism=4 ici_fsdp_parallelism=16 max_prefill_predict_length=11 max_target_length=24 prompt="[INST] I love to [/INST]" megablox=False scan_layers=false
 
+# Run decoding with converted ckpt - dropping implementation
+python3 MaxText/decode.py MaxText/configs/base.yml load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=unscanned_decoding per_device_batch_size=1 model_name=mixtral-8x7b async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 ici_tensor_parallelism=4 ici_fsdp_parallelism=16 max_prefill_predict_length=11 max_target_length=24 prompt="[INST] I love to [/INST]" megablox=False scan_layers=false capacity_factor=1.25
+
 # Test whether the forward pass logits match the golden logits - matmul implementation
-python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=matmul_forward_pass_test per_device_batch_size=8 model_name=mixtral-8x7b tokenizer_path=assets/tokenizer.mistral-v1 ici_tensor_parallelism=4 ici_fsdp_parallelism=16 max_prefill_predict_length=11 max_target_length=11 dataset_type=synthetic dtype=float32 megablox=False scan_layers=false --atol=3 --rtol=1 --token_size=4
+python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=matmul_forward_pass_test per_device_batch_size=4 model_name=mixtral-8x7b tokenizer_path=assets/tokenizer.mistral-v1 ici_tensor_parallelism=4 ici_fsdp_parallelism=16 max_prefill_predict_length=11 max_target_length=11 dataset_type=synthetic dtype=float32 megablox=False scan_layers=false --atol=3 --rtol=1 --token_size=4
 
 # Test whether the forward pass logits match the golden logits - megablox implementation
 # TODO(ranran): investigate the root cause of the excessive tolerance
-python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=megablox_forward_pass_test per_device_batch_size=8 model_name=mixtral-8x7b tokenizer_path=assets/tokenizer.mistral-v1 ici_fsdp_parallelism=64 max_prefill_predict_length=11 max_target_length=11 dataset_type=synthetic dtype=bfloat16 weight_dtype=bfloat16 scan_layers=false --atol=20 --rtol=10 --token_size=4
+python3 MaxText/tests/forward_pass_logit_checker.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} run_name=megablox_forward_pass_test per_device_batch_size=4 model_name=mixtral-8x7b tokenizer_path=assets/tokenizer.mistral-v1 ici_fsdp_parallelism=64 max_prefill_predict_length=11 max_target_length=11 dataset_type=synthetic dtype=bfloat16 weight_dtype=bfloat16 scan_layers=false --atol=20 --rtol=10 --token_size=4
+
+# Run pre-training - megablox implementation
+python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} dataset_path=${DATASET_PATH} run_name=megablox_pre_training per_device_batch_size=4 enable_checkpointing=false model_name=mixtral-8x7b ici_fsdp_parallelism=64 steps=5 max_target_length=1024 async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 attention=flash dtype=bfloat16 weight_dtype=bfloat16
 
-# Run fine-tuning - megablox implementation
-python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} dataset_path=${DATASET_PATH} load_parameters_path=${SCANNED_CHECKPOINT} run_name=fine_tuning per_device_batch_size=8 model_name=mixtral-8x7b ici_fsdp_parallelism=64 steps=10 max_target_length=1024 async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 checkpoint_period=5 attention=flash dtype=bfloat16 weight_dtype=bfloat16
+# Run pre-training - matmul implementation
+python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} dataset_path=${DATASET_PATH} run_name=matmul_pre_training per_device_batch_size=4 enable_checkpointing=false model_name=mixtral-8x7b ici_fsdp_parallelism=64 steps=5 max_target_length=1024 async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 attention=flash dtype=bfloat16 weight_dtype=bfloat16 megablox=False
 
-# Run pre-training without load_parameters_path - megablox implementation
-python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} dataset_path=${DATASET_PATH} run_name=pre_training per_device_batch_size=8 enable_checkpointing=false model_name=mixtral-8x7b ici_fsdp_parallelism=64 steps=5 max_target_length=1024 async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 attention=flash dtype=bfloat16 weight_dtype=bfloat16
+# Run pre-training - dropping implementation
+python3 MaxText/train.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} dataset_path=${DATASET_PATH} run_name=dropping_pre_training per_device_batch_size=4 enable_checkpointing=false model_name=mixtral-8x7b ici_fsdp_parallelism=64 steps=5 max_target_length=1024 async_checkpointing=false tokenizer_path=assets/tokenizer.mistral-v1 attention=flash dtype=bfloat16 weight_dtype=bfloat16 megablox=False capacity_factor=1

From 86d0b546ac65e9ab01191168091bef0be3de6d6f Mon Sep 17 00:00:00 2001
From: aireenmei <aireenmei@gmail.com>
Date: Wed, 27 Nov 2024 23:33:43 +0000
Subject: [PATCH 23/40] update setup_gcsfuse for better perf

---
 getting_started/Data_Input_Pipeline.md |  4 +++-
 setup_gcsfuse.sh                       | 18 +++++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/getting_started/Data_Input_Pipeline.md b/getting_started/Data_Input_Pipeline.md
index fb53e63ee..84351325b 100644
--- a/getting_started/Data_Input_Pipeline.md
+++ b/getting_started/Data_Input_Pipeline.md
@@ -96,7 +96,9 @@ In HF or TFDS data pipeline, global shuffle is performed by a shuffle buffer wit
 1. Dataset needs to be in a format that supports random access. The default format is [ArrayRecord](https://github.com/google/array_record). For converting a dataset into ArrayRecord, see [instructions](https://github.com/google/array_record/tree/main/beam). Additionally, other random accessible data sources can be supported via a custom data source class ([docs](https://github.com/google/grain/blob/main/docs/data_sources.md)).
 2. ArrayRecord dataset, when hosted on GCS bucket, can only be read through [Cloud Storage FUSE](https://cloud.google.com/storage/docs/gcs-fuse). The installation of Cloud Storage FUSE is included in [setup.sh](https://github.com/google/maxtext/blob/main/setup.sh). User then needs to mount the GCS bucket to a local path for each worker, using the script [setup_gcsfuse.sh](https://github.com/google/maxtext/blob/main/setup_gcsfuse.sh). The script configs some parameters for the mount.
 ```
-bash setup_gcsfuse.sh DATASET_GCS_BUCKET=$BUCKET_NAME MOUNT_PATH=$MOUNT_PATH
+bash setup_gcsfuse.sh DATASET_GCS_BUCKET=$BUCKET_NAME MOUNT_PATH=$MOUNT_PATH [FILE_PATH=$MOUNT_PATH/my_dataset]
+# FILE_PATH is optional, when provided, the script runs "ls -R" for pre-filling the metadata cache
+# https://cloud.google.com/storage/docs/cloud-storage-fuse/performance#improve-first-time-reads
 ```
 3. Set `dataset_type=grain` and set `grain_train_files` to match the ArrayRecord files via a local path since the bucket has been mounted.
 4. Tune `grain_worker_count` for performance. This parameter controls the number of child process used by Grain (more details in [behind_the_scene](https://github.com/google/grain/blob/main/docs/behind_the_scenes.md), [code](https://github.com/google/grain/blob/main/grain/_src/python/grain_pool.py)). If you use a large number of workers, please check your config for gcsfuse in [setup_gcsfuse.sh](https://github.com/google/maxtext/blob/main/setup_gcsfuse.sh) to avoid gcsfuse throttling.
diff --git a/setup_gcsfuse.sh b/setup_gcsfuse.sh
index 53e3baa7f..40c2066b7 100644
--- a/setup_gcsfuse.sh
+++ b/setup_gcsfuse.sh
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 # Description:
-# bash setup_gcsfuse.sh DATASET_GCS_BUCKET=maxtext-dataset MOUNT_PATH=dataset
+# bash setup_gcsfuse.sh DATASET_GCS_BUCKET=maxtext-dataset MOUNT_PATH=/tmp/gcsfuse FILE_PATH=/tmp/gcsfuse/my_dataset
 
 set -e
 
@@ -44,9 +44,13 @@ fi
 mkdir -p $MOUNT_PATH
 
 # see https://cloud.google.com/storage/docs/gcsfuse-cli for all configurable options of gcsfuse CLI
-# Grain uses _PROCESS_MANAGEMENT_MAX_THREADS = 64 (https://github.com/google/grain/blob/main/grain/_src/python/grain_pool.py)
-# Please make sure max-conns-per-host > grain_worker_count * _PROCESS_MANAGEMENT_MAX_THREADS
-
-gcsfuse -o ro --implicit-dirs --http-client-timeout=5s --max-conns-per-host=2000 \
-        --debug_fuse_errors --debug_fuse --debug_gcs --debug_invariants --debug_mutex \
-        --log-file=$HOME/gcsfuse.json "$DATASET_GCS_BUCKET" "$MOUNT_PATH"
+TIMESTAMP=$(date +%Y%m%d-%H%M)
+gcsfuse -o ro --implicit-dirs --log-severity=debug \
+        --type-cache-max-size-mb=-1 --stat-cache-max-size-mb=-1 --kernel-list-cache-ttl-secs=-1 --metadata-cache-ttl-secs=-1 \
+        --log-file=$HOME/gcsfuse_$TIMESTAMP.json "$DATASET_GCS_BUCKET" "$MOUNT_PATH"
+
+# Use ls to prefill the metadata cache: https://cloud.google.com/storage/docs/cloud-storage-fuse/performance#improve-first-time-reads
+if [[ ! -z ${FILE_PATH} ]] ; then 
+  FILE_COUNT=$(ls -R $FILE_PATH | wc -l)
+  echo $FILE_COUNT files found in $FILE_PATH
+fi

From 346218a27988e4aefcba00dffe2e8b40a2a33e96 Mon Sep 17 00:00:00 2001
From: gobbleturk <mattdavidow@google.com>
Date: Mon, 2 Dec 2024 18:23:45 +0000
Subject: [PATCH 24/40] Assert multiple slices available when requesting DCN
 parallelisms

---
 MaxText/pyconfig.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/MaxText/pyconfig.py b/MaxText/pyconfig.py
index fa80c9226..a05dd0690 100644
--- a/MaxText/pyconfig.py
+++ b/MaxText/pyconfig.py
@@ -115,6 +115,7 @@ def validate_keys(keys):
         " use_replicator_service and replicator_backup_interval_minutes"
     )
 
+  validate_multiple_slices(keys)
   if keys["num_experts"] > 1:
     validate_megablox_parallelism(keys)
 
@@ -488,6 +489,27 @@ def update_model_vars(base_config_path, raw_keys, config_name: str):
     return updated_keys
 
 
+def validate_multiple_slices(raw_keys):
+  if (
+      math.fabs(
+          math.prod(
+              [
+                  raw_keys["dcn_data_parallelism"],
+                  raw_keys["dcn_pipeline_parallelism"],
+                  raw_keys["dcn_fsdp_parallelism"],
+                  raw_keys["dcn_fsdp_transpose_parallelism"],
+                  raw_keys["dcn_sequence_parallelism"],
+                  raw_keys["dcn_tensor_parallelism"],
+                  raw_keys["dcn_expert_parallelism"],
+                  raw_keys["dcn_autoregressive_parallelism"],
+              ]
+          )
+      )
+      > 1
+  ):
+    assert raw_keys["num_slices"] > 1, "DCN parallelism requested but only one slice available."
+
+
 def validate_megablox_parallelism(raw_keys):
   if raw_keys["megablox"] and (
       using_sequence_parallelism(raw_keys) or using_pipeline_parallelism(raw_keys) or using_expert_parallelism(raw_keys)

From f6c38f1be55409fd68bea426b264190e80bc91bd Mon Sep 17 00:00:00 2001
From: Raymond Zou <raymondzou@google.com>
Date: Tue, 5 Nov 2024 08:11:50 +0000
Subject: [PATCH 25/40] Add llama 3.1 70b config

---
 benchmarks/benchmark_runner.py               |  1 +
 benchmarks/maxtext_trillium_model_configs.py | 44 ++++++++++++++++++++
 benchmarks/xla_flags_library.py              |  3 +-
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_runner.py b/benchmarks/benchmark_runner.py
index f7b859da0..909c36276 100644
--- a/benchmarks/benchmark_runner.py
+++ b/benchmarks/benchmark_runner.py
@@ -93,6 +93,7 @@ def add_shared_arguments(custom_parser: argparse.ArgumentParser):
           'mixtral_8x7b_dropless',
           'gemma2_9b_8192',
           'gemma2_27b_8192',
+          'llama3_1_70b_129024',
       ],
       default='llama2_70b_4096',
       help=(
diff --git a/benchmarks/maxtext_trillium_model_configs.py b/benchmarks/maxtext_trillium_model_configs.py
index 7a28effde..b3606ec8f 100644
--- a/benchmarks/maxtext_trillium_model_configs.py
+++ b/benchmarks/maxtext_trillium_model_configs.py
@@ -412,6 +412,49 @@ class MaxTextModel:
     ),
 )
 
+llama3_1_70b_129024 = MaxTextModel(
+    model_name="llama3_1-70b-129024",
+    model_type="llama3.1-70b",
+    tuning_params={
+        "per_device_batch_size": 0.125,
+        "ici_fsdp_parallelism": -1,
+        "ici_sequence_parallelism": 8,
+        "remat_policy": "custom",
+        "decoder_layer_input": "offload",
+        "out_proj": "offload",
+        "query_proj": "offload",
+        "key_proj": "offload",
+        "value_proj": "offload",
+        "max_target_length": 129024,
+        "attention": "flash",
+        "use_iota_embed": True,
+        "dataset_path": "gs://max-datasets-rogue",
+        "dataset_type": "synthetic",
+        "enable_checkpointing": False,
+        "sa_block_q": 2048,
+        "sa_block_kv": 2048,
+        "sa_block_kv_compute": 2048,
+        "sa_block_q_dkv": 2048,
+        "sa_block_kv_dkv": 2048,
+        "sa_block_kv_dkv_compute": 2048,
+        "sa_block_q_dq": 2048,
+        "sa_block_kv_dq": 2048,
+        "sa_use_fused_bwd_kernel": True,
+        "profiler": "xplane",
+        "skip_first_n_steps_for_profiler": 10,
+        "profiler_steps": 5,
+        "allow_split_physical_axes": True,
+        "custom_mesh": "hybrid_ring_32x8",
+    },
+    xla_flags=(
+        xla_flags_library.DENSE_VMEM_LIMIT_FLAG
+        + xla_flags_library.LAYOUT_FOR_ALL_REDUCE_SCATTER
+        + xla_flags_library.DATA_PARALLEL_OVERLAP
+        + xla_flags_library.ENABLE_SPARSECORE_OFFLOADING_FOR_1D_ALL_GATHER
+        + xla_flags_library.HOST_OFFLOAD_FLAGS
+    ),
+)
+
 mixtral_8x7b_dropless = MaxTextModel(
     model_name="mixtral-8x7b",
     model_type="mixtral-8x7b",
@@ -576,6 +619,7 @@ class MaxTextModel:
     llama3_8b_8192,  # Not Optimizied yet
     llama3_70b_8192,  # Not Optimizied yet
     llama3_1_405b_8192_fsdp_dcn,
+    llama3_1_70b_129024,
     mixtral_8x7b_dropped,
     mixtral_8x7b_dropped_int8,
     mixtral_8x7b_dropless,
diff --git a/benchmarks/xla_flags_library.py b/benchmarks/xla_flags_library.py
index 35d03bdf1..705e838d1 100644
--- a/benchmarks/xla_flags_library.py
+++ b/benchmarks/xla_flags_library.py
@@ -51,7 +51,7 @@
 
 #Only ready for 1D All-Gather but should support 2D soon, and
 # hopefully All-Reduce soon.
-ENABLE_SPARECORE_OFFLOADING_FOR_1D_ALL_GATHER = (
+ENABLE_SPARSECORE_OFFLOADING_FOR_1D_ALL_GATHER = (
     " --xla_sc_disable_megacore_partitioning=true"
     " --xla_tpu_enable_async_collective_fusion_fuse_all_gather=false"
     " --xla_tpu_enable_all_gather_offload_tracing=true"
@@ -59,6 +59,7 @@
     " --xla_tpu_enable_sparse_core_collective_offload_all_gather=true"
     " --xla_sc_enable_instruction_fusion=false"
     " --xla_sc_disjoint_spmem=false"
+    " --2a886c8_chip_config_name=megachip_tccontrol"
     # Interesting flags to try:
     # " --xla_tpu_enable_offloading_gather_to_sparsecore=true"
     # " --xla_tpu_enable_offloading_reduce_to_sparsecore=true"

From 7a4a44e88a639c4a9dfad2202b49a7ab4d465641 Mon Sep 17 00:00:00 2001
From: Xuefeng Gu <xfgu@google.com>
Date: Tue, 3 Dec 2024 17:41:49 +0000
Subject: [PATCH 26/40] Update replicator.yaml to include framework and
 num_slices information

---
 MaxText/max_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MaxText/max_utils.py b/MaxText/max_utils.py
index c74297ef3..0076ce67b 100644
--- a/MaxText/max_utils.py
+++ b/MaxText/max_utils.py
@@ -309,6 +309,8 @@ def initialize_jax_for_tpu_with_emergency_checkpointing(raw_keys):
         run_name = os.environ.get("JOBSET_NAME")  # using XPK default
 
       replicator_yaml = f"""job-name: {run_name}
+      framework: orbax
+      assume-data-parallelism: {num_slices}
       node-rank: {node_rank}
       nodes: {num_nodes}
       workers-per-node: 1

From e3823c0d05eda6317c87bb2badaee0790cfcdcad Mon Sep 17 00:00:00 2001
From: Branden Vandermoon <bvandermoon@google.com>
Date: Wed, 4 Dec 2024 00:05:06 +0000
Subject: [PATCH 27/40] Support JAX_VERSION for nightly mode on GPU

---
 docker_build_dependency_image.sh |  2 ++
 setup.sh                         | 14 ++++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/docker_build_dependency_image.sh b/docker_build_dependency_image.sh
index efbbe9f0e..1277726f8 100644
--- a/docker_build_dependency_image.sh
+++ b/docker_build_dependency_image.sh
@@ -19,6 +19,8 @@
 # bash docker_build_dependency_image.sh DEVICE={{gpu|tpu}} MODE=stable_stack BASEIMAGE={{JAX_STABLE_STACK BASEIMAGE FROM ARTIFACT REGISTRY}}
 # bash docker_build_dependency_image.sh MODE=nightly
 # bash docker_build_dependency_image.sh MODE=stable JAX_VERSION=0.4.13
+# Nightly build with JAX_VERSION for GPUs. Available versions listed at https://storage.googleapis.com/jax-releases/jax_nightly_releases.html:
+# bash docker_build_dependency_image.sh DEVICE=gpu MODE=nightly JAX_VERSION=0.4.36.dev20241109 # Note: this sets both jax-nightly and jaxlib-nightly 
 
 # Enable "exit immediately if any command fails" option
 set -e
diff --git a/setup.sh b/setup.sh
index bb3d0919a..4b4ba090e 100644
--- a/setup.sh
+++ b/setup.sh
@@ -22,6 +22,7 @@
 # You have the option to provide a LIBTPU_GCS_PATH that points to a libtpu.so provided to you by Google.
 # In libtpu-only MODE, the LIBTPU_GCS_PATH is mandatory.
 # For MODE=stable you may additionally specify JAX_VERSION, e.g. JAX_VERSION=0.4.13
+# For DEVICE=gpu, you may also specify JAX_VERSION when MODE=nightly, e.g. JAX_VERSION=0.4.36.dev20241109
 
 
 # Enable "exit immediately if any command fails" option
@@ -59,8 +60,8 @@ if [[ $LIBTPU_GCS_PATH == NONE ]]; then
   unset LIBTPU_GCS_PATH
 fi
 
-if [[ -n $JAX_VERSION && ! ($MODE == "stable" || -z $MODE) ]]; then
-     echo -e "\n\nError: You can only specify a JAX_VERSION with stable mode.\n\n"
+if [[ -n $JAX_VERSION && ! ($MODE == "stable" || -z $MODE || ($MODE == "nightly" && $DEVICE == "gpu")) ]]; then
+     echo -e "\n\nError: You can only specify a JAX_VERSION with stable mode (plus nightly mode on GPU).\n\n"
      exit 1
 fi
 
@@ -157,9 +158,14 @@ elif [[ "$MODE" == "stable" || ! -v MODE ]]; then
 elif [[ $MODE == "nightly" ]]; then
 # Nightly mode
     if [[ $DEVICE == "gpu" ]]; then
-        echo "Installing jax-nightly, jaxlib-nightly"
         # Install jax-nightly
-        pip install -U --pre jax jaxlib jax-cuda12-plugin[with_cuda] jax-cuda12-pjrt -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
+        if [[ -n "$JAX_VERSION" ]]; then
+            echo "Installing jax-nightly, jaxlib-nightly ${JAX_VERSION}"
+            pip install -U --pre jax==${JAX_VERSION} jaxlib==${JAX_VERSION} jax-cuda12-plugin[with_cuda] jax-cuda12-pjrt -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
+        else
+            echo "Installing latest jax-nightly, jaxlib-nightly"
+            pip install -U --pre jax jaxlib jax-cuda12-plugin[with_cuda] jax-cuda12-pjrt -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
+        fi
         # Install Transformer Engine
         export NVTE_FRAMEWORK=jax
         pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@stable

From e17fcc6fd87d3af4cc86789a2d6965e6fc948e34 Mon Sep 17 00:00:00 2001
From: Luke Baumann <lukebaumann@google.com>
Date: Tue, 3 Dec 2024 18:46:12 +0000
Subject: [PATCH 28/40] Added a custom_wheel mode for building the dependency
 image.

This mode is the same as nightly except that after nightly is installed, any file in `maxtext/*.whl` is forcefully reinstalled.
---
 docker_build_dependency_image.sh | 15 +++++++++++++++
 maxtext_custom_wheels.Dockerfile |  6 ++++++
 2 files changed, 21 insertions(+)
 create mode 100644 maxtext_custom_wheels.Dockerfile

diff --git a/docker_build_dependency_image.sh b/docker_build_dependency_image.sh
index 1277726f8..ce3a5d3c3 100644
--- a/docker_build_dependency_image.sh
+++ b/docker_build_dependency_image.sh
@@ -21,6 +21,11 @@
 # bash docker_build_dependency_image.sh MODE=stable JAX_VERSION=0.4.13
 # Nightly build with JAX_VERSION for GPUs. Available versions listed at https://storage.googleapis.com/jax-releases/jax_nightly_releases.html:
 # bash docker_build_dependency_image.sh DEVICE=gpu MODE=nightly JAX_VERSION=0.4.36.dev20241109 # Note: this sets both jax-nightly and jaxlib-nightly 
+# MODE=custom_wheels is the same as nightly except that it reinstalls any
+# additional wheels that are present in the maxtext directory.
+# The main use case is to install custom jax or jaxlib wheels but it also
+# works with any custom wheels.
+# bash docker_build_dependency_image.sh MODE=custom_wheels
 
 # Enable "exit immediately if any command fails" option
 set -e
@@ -48,6 +53,11 @@ fi
 if [[ -z ${MODE} ]]; then
   export MODE=stable
   echo "Default MODE=${MODE}"
+elif [[ ${MODE} == "custom_wheels" ]] ; then
+  export MODE=nightly
+  export CUSTOM_JAX=1
+else
+  export CUSTOM_JAX=0
 fi
 
 if [[ -z ${DEVICE} ]]; then
@@ -98,6 +108,11 @@ else
   docker build --network host --build-arg CUSTOM_LIBTPU=true -f ./maxtext_libtpu_path.Dockerfile -t ${LOCAL_IMAGE_NAME} .
 fi
 
+if [[ ${CUSTOM_JAX} -eq 1 ]] ; then
+  echo "Installing custom jax and jaxlib"
+  docker build --network host -f ./maxtext_custom_wheels.Dockerfile -t ${LOCAL_IMAGE_NAME} .
+fi
+
 echo ""
 echo "*************************"
 echo ""
diff --git a/maxtext_custom_wheels.Dockerfile b/maxtext_custom_wheels.Dockerfile
new file mode 100644
index 000000000..e27ad7dd0
--- /dev/null
+++ b/maxtext_custom_wheels.Dockerfile
@@ -0,0 +1,6 @@
+ARG BASEIMAGE=maxtext_base_image
+FROM $BASEIMAGE
+
+# Requires wheels be in /deps. This means any custom wheels should be placed
+# in the maxtext directory.
+RUN python3 -m pip install --force-reinstall /deps/*.whl

From 4cdc15de48737757a9109590c4acbc9186242e29 Mon Sep 17 00:00:00 2001
From: gobbleturk <mattdavidow@google.com>
Date: Wed, 4 Dec 2024 17:16:15 +0000
Subject: [PATCH 29/40] clean up pipeline config setting in its own method

---
 MaxText/pyconfig.py | 70 ++++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 33 deletions(-)

diff --git a/MaxText/pyconfig.py b/MaxText/pyconfig.py
index a05dd0690..d2dea9fcd 100644
--- a/MaxText/pyconfig.py
+++ b/MaxText/pyconfig.py
@@ -399,39 +399,7 @@ def user_init(raw_keys):
 
     raw_keys["num_slices"] = max_utils.get_num_slices(raw_keys)
     raw_keys["quantization_local_shard_count"] = get_quantization_local_shard_count(raw_keys)
-
-    if using_pipeline_parallelism(raw_keys):
-      raw_keys["using_pipeline_parallelism"] = True
-      num_stages = int(raw_keys["ici_pipeline_parallelism"] * raw_keys["dcn_pipeline_parallelism"])
-      if raw_keys["num_pipeline_repeats"] == -1:
-        num_pipeline_repeats, remainder = divmod(
-            raw_keys["num_decoder_layers"], num_stages * raw_keys["num_layers_per_pipeline_stage"]
-        )
-        assert (
-            not remainder
-        ), f"The number of layers per stage ({raw_keys['num_layers_per_pipeline_stage']}) times the number of stages ({num_stages}) must divide the number of decoder layers ({raw_keys['num_decoder_layers']}) "
-        raw_keys["num_pipeline_repeats"] = num_pipeline_repeats
-      assert (
-          num_stages * raw_keys["num_pipeline_repeats"] * raw_keys["num_layers_per_pipeline_stage"]
-          == raw_keys["num_decoder_layers"]
-      ), f"The product of pipeline stages ({num_stages}), repeats ({raw_keys['num_pipeline_repeats']}), and layers per stage ({raw_keys['num_layers_per_pipeline_stage']}) must be equal to the number of layers ({raw_keys['num_decoder_layers']})"
-      if raw_keys["num_pipeline_microbatches"] == -1:
-        if raw_keys["pipeline_delay_activation_forwarding"]:
-          raw_keys["num_pipeline_microbatches"] = 2 * num_stages
-        else:
-          raw_keys["num_pipeline_microbatches"] = num_stages
-      assert (
-          raw_keys["num_pipeline_microbatches"] % num_stages == 0
-      ), f"The number of microbatches ({raw_keys['num_pipeline_microbatches']}) must be divisible by the number of stages ({num_stages})"
-      assert (
-          raw_keys["micro_batch_size_to_train_on"] % raw_keys["num_pipeline_microbatches"] == 0
-      ), f"The batch size ({raw_keys['micro_batch_size_to_train_on']}) must be divisible by the number of microbatches ({raw_keys['num_pipeline_microbatches']})"
-      if raw_keys["pipeline_delay_activation_forwarding"]:
-        assert (
-            raw_keys["num_pipeline_microbatches"] >= 2 * num_stages
-        ), f"Delayed activation forwarding requires at least 2 * num_stages microbatches, but {num_stages} stages are used with {raw_keys['num_pipeline_microbatches']} microbatches"
-    else:
-      raw_keys["using_pipeline_parallelism"] = False
+    raw_keys = set_and_validate_pipeline_config(raw_keys)
 
     if raw_keys["dataset_type"] == "c4_mlperf":
       raw_keys["add_bos"] = False
@@ -510,6 +478,42 @@ def validate_multiple_slices(raw_keys):
     assert raw_keys["num_slices"] > 1, "DCN parallelism requested but only one slice available."
 
 
+def set_and_validate_pipeline_config(raw_keys):
+  if using_pipeline_parallelism(raw_keys):
+    raw_keys["using_pipeline_parallelism"] = True
+    num_stages = int(raw_keys["ici_pipeline_parallelism"] * raw_keys["dcn_pipeline_parallelism"])
+    if raw_keys["num_pipeline_repeats"] == -1:
+      num_pipeline_repeats, remainder = divmod(
+          raw_keys["num_decoder_layers"], num_stages * raw_keys["num_layers_per_pipeline_stage"]
+      )
+      assert (
+          not remainder
+      ), f"The number of layers per stage ({raw_keys['num_layers_per_pipeline_stage']}) times the number of stages ({num_stages}) must divide the number of decoder layers ({raw_keys['num_decoder_layers']}) "
+      raw_keys["num_pipeline_repeats"] = num_pipeline_repeats
+    assert (
+        num_stages * raw_keys["num_pipeline_repeats"] * raw_keys["num_layers_per_pipeline_stage"]
+        == raw_keys["num_decoder_layers"]
+    ), f"The product of pipeline stages ({num_stages}), repeats ({raw_keys['num_pipeline_repeats']}), and layers per stage ({raw_keys['num_layers_per_pipeline_stage']}) must be equal to the number of layers ({raw_keys['num_decoder_layers']})"
+    if raw_keys["num_pipeline_microbatches"] == -1:
+      if raw_keys["pipeline_delay_activation_forwarding"]:
+        raw_keys["num_pipeline_microbatches"] = 2 * num_stages
+      else:
+        raw_keys["num_pipeline_microbatches"] = num_stages
+    assert (
+        raw_keys["num_pipeline_microbatches"] % num_stages == 0
+    ), f"The number of microbatches ({raw_keys['num_pipeline_microbatches']}) must be divisible by the number of stages ({num_stages})"
+    assert (
+        raw_keys["micro_batch_size_to_train_on"] % raw_keys["num_pipeline_microbatches"] == 0
+    ), f"The batch size ({raw_keys['micro_batch_size_to_train_on']}) must be divisible by the number of microbatches ({raw_keys['num_pipeline_microbatches']})"
+    if raw_keys["pipeline_delay_activation_forwarding"]:
+      assert (
+          raw_keys["num_pipeline_microbatches"] >= 2 * num_stages
+      ), f"Delayed activation forwarding requires at least 2 * num_stages microbatches, but {num_stages} stages are used with {raw_keys['num_pipeline_microbatches']} microbatches"
+  else:
+    raw_keys["using_pipeline_parallelism"] = False
+  return raw_keys
+
+
 def validate_megablox_parallelism(raw_keys):
   if raw_keys["megablox"] and (
       using_sequence_parallelism(raw_keys) or using_pipeline_parallelism(raw_keys) or using_expert_parallelism(raw_keys)

From 7390f561fe29acd1d01c708ff4e426e1226e814a Mon Sep 17 00:00:00 2001
From: Vijaya <mailvijayasingh@gmail.com>
Date: Wed, 4 Dec 2024 21:16:59 +0000
Subject: [PATCH 30/40] Fixes for dropping

---
 MaxText/configs/base.yml  |  4 ++
 MaxText/layers/linears.py | 80 ++++++++++++++++++++++++++++++---------
 MaxText/pyconfig.py       |  7 ++++
 3 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/MaxText/configs/base.yml b/MaxText/configs/base.yml
index 8c22ac576..1950a1182 100644
--- a/MaxText/configs/base.yml
+++ b/MaxText/configs/base.yml
@@ -94,6 +94,10 @@ kv_quant_dtype: "int8"
 checkpoint_is_quantized: False # Set to True if reading from a saved aqt quantized checkpoint
 # Saves params quantized on fly at following path
 save_quantized_params_path: ""
+#Used to configure the mode in which model is called
+# when left as is, corresponds to training
+# accepted values are "inference" 
+model_call_mode: ""
 
 # Shard the range finding operation for quantization. By default this is set to number of slices.
 quantization_local_shard_count: -1
diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 68f05df8f..d06da67e3 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -35,6 +35,7 @@
 from aqt.jax.v2 import aqt_tensor
 from kernels import megablox as mblx
 
+
 Array = common_types.Array
 Config = common_types.Config
 DType = common_types.DType
@@ -48,6 +49,16 @@
 Quant = quantizations.AqtQuantization
 QTensor = aqt_tensor.QTensor
 
+DISPATCH = "dispatch"
+COMBINE = "combine"
+
+
+def _get_model_call_mode(config):
+  if config.model_cal_mode == "inference":
+    return "inference"
+  else:
+    return None
+
 
 def _convert_to_activation_function(fn_or_string: Union[str, Callable[..., Any]]) -> Callable[..., Any]:
   """Convert a string to an activation function."""
@@ -148,7 +159,10 @@ def compute_dot_general(inputs, kernel, axis, contract_ind):
     output = compute_dot_general(inputs, kernel, axis, contract_ind)
 
     if self.use_bias:
-      bias_axes, bias_shape = self.kernel_axes[-len(features) :], kernel_shape[-len(features) :]
+      bias_axes, bias_shape = (
+          self.kernel_axes[-len(features) :],
+          kernel_shape[-len(features) :],
+      )
       bias = self.param(
           "bias",
           nn.with_logical_partitioning(bias_init, bias_axes),
@@ -378,7 +392,8 @@ def unpermute(self, intermediate, sorted_selected_experts, weights):
     reshaped_weights = jnp.reshape(weights, (-1, self.num_experts_per_tok))
     tensor_parallelism = self.config.ici_tensor_parallelism * self.config.dcn_tensor_parallelism
     reshaped_intermediate = jnp.reshape(
-        unsort_intermediate, (-1, self.num_experts_per_tok, self.config.emb_dim // tensor_parallelism)
+        unsort_intermediate,
+        (-1, self.num_experts_per_tok, self.config.emb_dim // tensor_parallelism),
     )
     with jax.named_scope("weight_sum"):
       matmul_precision = lax.Precision(self.config.matmul_precision)
@@ -475,7 +490,11 @@ def reshape_and_update_weights(self, weights, indices):
     # input of weights & indices: (batch_size, seq_len, num_experts_per_tok)
     # output of updated weights: (batch_size, seq_len, num_experts)
     update_weights = jnp.zeros((weights.shape[0], weights.shape[1], self.num_experts), dtype=self.dtype)
-    index_update = (jnp.arange(weights.shape[0])[:, None, None], jnp.arange(weights.shape[1])[:, None], indices)
+    index_update = (
+        jnp.arange(weights.shape[0])[:, None, None],
+        jnp.arange(weights.shape[1])[:, None],
+        indices,
+    )
     update_weights = update_weights.at[index_update].set(weights)
     return update_weights
 
@@ -483,7 +502,13 @@ def generate_masks(self, top_k_indices, softmax_probs):
     # calculate expert_capacity = (tokens_per_batch / num_experts) * capacity_factor
     batch_size, seq_len, _ = top_k_indices.shape
     tokens_per_batch = seq_len * self.num_experts_per_tok
-    expert_capacity_per_batch = int((tokens_per_batch / self.num_experts) * self.config.capacity_factor)
+    # this is to avoid expert_capacity_per_batch = 0
+    expert_capacity_per_batch = int(
+        max(
+            math.ceil(tokens_per_batch / self.num_experts) * self.config.capacity_factor,
+            self.config.capacity_factor,
+        )
+    )
     max_logging.log(f"Applying potential token dropping with a batch expert_capacity of {expert_capacity_per_batch}")
 
     # calculate expert mask and drop tokens if needed
@@ -501,7 +526,8 @@ def generate_masks(self, top_k_indices, softmax_probs):
     expert_mask_fused = nn.with_logical_constraint(expert_mask_fused, ("activation_batch", None, None))
     expert_token_count_fused = jnp.cumsum(expert_mask_fused, axis=1)
     expert_token_count = jnp.reshape(
-        expert_token_count_fused, ((batch_size, seq_len, self.num_experts_per_tok, self.num_experts))
+        expert_token_count_fused,
+        ((batch_size, seq_len, self.num_experts_per_tok, self.num_experts)),
     )
     expert_token_count = nn.with_logical_constraint(
         expert_token_count, ("activation_batch", "activation_length", None, None)
@@ -515,11 +541,14 @@ def generate_masks(self, top_k_indices, softmax_probs):
     # calculate token position in expert capacity dimension
     expert_token_position_fused = expert_mask_fused * expert_token_count_fused
     expert_token_position = jnp.reshape(
-        expert_token_position_fused, (batch_size, seq_len, self.num_experts_per_tok, self.num_experts)
+        expert_token_position_fused,
+        (batch_size, seq_len, self.num_experts_per_tok, self.num_experts),
     )
     combined_expert_token_position = jnp.sum(expert_token_position, axis=2) * combined_expert_mask
     expert_token_position_in_capacity = jax.nn.one_hot(
-        combined_expert_token_position, num_classes=expert_capacity_per_batch + 1, dtype=jnp.int32
+        combined_expert_token_position,
+        num_classes=expert_capacity_per_batch + 1,
+        dtype=jnp.int32,
     )
 
     # shape of combine_mask is (batch_size, seq_len, num_experts, expert_capacity_per_batch + 1),
@@ -540,7 +569,13 @@ def load_balance_loss(self, top_k_indices, logits):
     loss = jnp.mean(density * density_prob) * (self.num_experts**2) * self.config.load_balance_loss_weight
     return loss
 
-  def get_einsum(self, rhs_mesh_axes: Tuple[Optional[str], ...] = ()):
+  def get_einsum(self, rhs_mesh_axes: Tuple[Optional[str], ...] = (), einsum_name=None):
+
+    # the check is to prevent aqteinsum as einsum op for dispatch and combine einsums in ase when capacity_factor > 0
+    # this is necessary to load pre-quantized weights in case of inference
+    if self.config.model_call_mode == "inference" and (einsum_name == DISPATCH or einsum_name == COMBINE):
+      return jnp.einsum
+
     if self.quant:
 
       def aqt_einsum(*args, **kwargs):
@@ -580,11 +615,12 @@ def dense_matmul(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
       loss = self.load_balance_loss(top_k_indices, softmax_probs)
       inputs = nn.with_logical_constraint(inputs, ("activation_batch", "activation_length", "activation_embed"))
       with jax.named_scope("dispatch"):
-        dispatch = self.get_einsum(rhs_mesh_axes=mask_axes)(
+        dispatch = self.get_einsum(rhs_mesh_axes=mask_axes, einsum_name=DISPATCH)(
             "BSM,BSEC -> EBCM", inputs, dispatch_mask, precision=matmul_precision
         )
         dispatch = nn.with_logical_constraint(
-            dispatch, ("activation_exp", "activation_batch_no_exp", None, "activation_embed")
+            dispatch,
+            ("activation_exp", "activation_batch_no_exp", None, "activation_embed"),
         )
       with jax.named_scope("wi_0"):
         w0_kernel_axes = ("exp", None, "mlp")
@@ -595,7 +631,8 @@ def dense_matmul(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
         if self.config.activations_in_float32:
           layer_w0 = layer_w0.astype(jnp.float32)
         layer_w0 = nn.with_logical_constraint(
-            layer_w0, ("activation_exp", "activation_batch_no_exp", None, "activation_mlp")
+            layer_w0,
+            ("activation_exp", "activation_batch_no_exp", None, "activation_mlp"),
         )
         layer_w0 = checkpoint_name(layer_w0, "mlpwi_0")
       with jax.named_scope("wi_1"):
@@ -607,7 +644,8 @@ def dense_matmul(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
         if self.config.activations_in_float32:
           layer_w1 = layer_w1.astype(jnp.float32)
         layer_w1 = nn.with_logical_constraint(
-            layer_w1, ("activation_exp", "activation_batch_no_exp", None, "activation_mlp")
+            layer_w1,
+            ("activation_exp", "activation_batch_no_exp", None, "activation_mlp"),
         )
         layer_w1 = checkpoint_name(layer_w1, "mlpwi_1")
       layer_w0_act = _convert_to_activation_function(self.config.mlp_activations[0])(layer_w0)
@@ -619,13 +657,17 @@ def dense_matmul(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
             "EBCH,EHM -> EBCM", layer_multiply, wo_kernel, precision=matmul_precision
         )
         intermediate_layer = nn.with_logical_constraint(
-            intermediate_layer, ("activation_exp", "activation_batch_no_exp", None, "activation_embed")
+            intermediate_layer,
+            ("activation_exp", "activation_batch_no_exp", None, "activation_embed"),
         )
         intermediate_layer = checkpoint_name(intermediate_layer, "mlpwo")
       with jax.named_scope("combine"):
         # Matmul & element wise operation
-        output = self.get_einsum(rhs_mesh_axes=mask_axes)(
-            "EBCM,BSEC -> BSM", intermediate_layer, combine_mask, precision=matmul_precision
+        output = self.get_einsum(rhs_mesh_axes=mask_axes, einsum_name=COMBINE)(
+            "EBCM,BSEC -> BSM",
+            intermediate_layer,
+            combine_mask,
+            precision=matmul_precision,
         )
       return output, loss
     else:
@@ -650,9 +692,11 @@ def dense_matmul(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
         )
         intermediate_layer = checkpoint_name(intermediate_layer, "mlpwo")
       with jax.named_scope("w_sum"):
-        output = jnp.einsum("BSEM,BSE -> BSM", intermediate_layer.astype(jnp.float32), weights.astype(jnp.float32)).astype(
-            self.dtype
-        )
+        output = jnp.einsum(
+            "BSEM,BSE -> BSM",
+            intermediate_layer.astype(jnp.float32),
+            weights.astype(jnp.float32),
+        ).astype(self.dtype)
       return output, None
 
   def retrieve_quantized_weight(
diff --git a/MaxText/pyconfig.py b/MaxText/pyconfig.py
index d2dea9fcd..8e55999e0 100644
--- a/MaxText/pyconfig.py
+++ b/MaxText/pyconfig.py
@@ -85,12 +85,19 @@ def validate_profiler_type(s: str) -> None:
     raise ValueError("Invalid profiler type was passed. Valid options ", valid_profiler_types)
 
 
+def validate_model_call_mode(s: str) -> None:
+  valid_model_call_modes = ("", "inference")
+  if s not in valid_model_call_modes:  # currently supported attention
+    raise ValueError(f"Invalid model call mode {s}. Valid options are {valid_model_call_modes}")
+
+
 def validate_keys(keys):
   validate_attention_kernel(keys["attention"])
   validate_attention_type(keys["attention_type"])
   validate_profiler_type(keys["profiler"])
   validate_compute_axis_order(keys["compute_axis_order"])
   validate_kv_quant_axis(keys["kv_quant_axis"], keys["quantize_kvcache"])
+  validate_model_call_mode(keys["model_call_mode"])
 
   assert (keys["load_parameters_path"] == "" and keys["load_full_state_path"] == "") or keys[
       "enable_checkpointing"

From 58315478e5dddad1c76c0aff355ea3a755d768df Mon Sep 17 00:00:00 2001
From: Mohit Khatwani <mohitkhatwani@google.com>
Date: Wed, 4 Dec 2024 22:43:18 +0000
Subject: [PATCH 31/40] Fix setup_training_state

---
 MaxText/convert_gpt3_ckpt_from_paxml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MaxText/convert_gpt3_ckpt_from_paxml.py b/MaxText/convert_gpt3_ckpt_from_paxml.py
index 3ec57f8a2..0f6d6111c 100644
--- a/MaxText/convert_gpt3_ckpt_from_paxml.py
+++ b/MaxText/convert_gpt3_ckpt_from_paxml.py
@@ -107,7 +107,7 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
       cfg.checkpoint_period,
   )
 
-  state, _, _ = max_utils.setup_training_state(model, None, tx, cfg, init_rng, mesh, checkpoint_manager)
+  state, _, _, _ = max_utils.setup_training_state(model, None, tx, cfg, init_rng, mesh, checkpoint_manager)
   max_logging.log("start")
   check_memory()
 

From 90430e1d09ce5bd2ed93304f8fdc9a790675f07a Mon Sep 17 00:00:00 2001
From: Jake Harmon <jakeharmon@google.com>
Date: Thu, 5 Dec 2024 04:17:01 -0800
Subject: [PATCH 32/40] point to new jax github location in documentation

PiperOrigin-RevId: 703063210
---
 MaxText/layers/attentions.py     | 2 +-
 benchmarks/maxtext_xpk_runner.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/MaxText/layers/attentions.py b/MaxText/layers/attentions.py
index 57f7afce0..315ef1ff1 100644
--- a/MaxText/layers/attentions.py
+++ b/MaxText/layers/attentions.py
@@ -170,7 +170,7 @@ def check_attention_inputs(self, query: Array, key: Array | KVTensor, value: Arr
     assert query.shape[-1] == key.shape[-1], "q, k depths must match."
 
   # Following Pallas MHA Flash Attention Reference.
-  # https://github.com/google/jax/blob/main/jax/experimental/pallas/ops/tpu/flash_attention.py
+  # https://github.com/jax-ml/jax/blob/main/jax/experimental/pallas/ops/tpu/flash_attention.py
   # This mask models (1) separate sequences (decoder_segment_ids) and (2) causality
   def generate_attention_mask(self, query, key, decoder_segment_ids: Array | None, model_mode: str) -> Array | None:
     mask = None
diff --git a/benchmarks/maxtext_xpk_runner.py b/benchmarks/maxtext_xpk_runner.py
index 6957cd1b3..8c728da3e 100644
--- a/benchmarks/maxtext_xpk_runner.py
+++ b/benchmarks/maxtext_xpk_runner.py
@@ -294,7 +294,7 @@ def build_user_command(
       # f'python3 -m pip install google-cloud-aiplatform==v1.61.0 &&'
       # f'pip install -U "jax[tpu]==0.4.30" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html &&'
       # f' pip install https://storage.googleapis.com/jax-releases/nightly/nocuda/jaxlib-0.4.27.dev20240501-cp310-cp310-manylinux2014_x86_64.whl &&'
-      # f' pip install git+https://github.com/google/jax.git@57bfe81260545556ec22509347f7ced112496200 &&'
+      # f' pip install git+https://github.com/jax-ml/jax.git@57bfe81260545556ec22509347f7ced112496200 &&'
       f' {install_libtpu_cmd}'
       # f' mv libtpu.so /lib/ &&'
       # f' export TPU_LIBRARY_PATH=$PWD/libtpu.so &&'

From 2d1c51aaaf8b143b74eefeaa31056424442dc29b Mon Sep 17 00:00:00 2001
From: michelle-yooh <yooh@google.com>
Date: Tue, 3 Dec 2024 22:16:05 +0000
Subject: [PATCH 33/40] Change awk regex command to capture the coordinate
 address properly

---
 gpu_multi_process_run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu_multi_process_run.sh b/gpu_multi_process_run.sh
index f93c982d7..99d03b0f2 100644
--- a/gpu_multi_process_run.sh
+++ b/gpu_multi_process_run.sh
@@ -124,7 +124,7 @@ resolve_coordinator_ip() {
   echo "Coordinator Address $JAX_COORDINATOR_ADDRESS"
 
   while [[ "$coordinator_found" = false && $lookup_attempt -le $max_coordinator_lookups ]]; do
-    coordinator_ip_address=$(nslookup "$JAX_COORDINATOR_ADDRESS" 2>/dev/null | awk '/^Address: / { print $2 }' | head -n 1)
+    coordinator_ip_address=$(nslookup "$JAX_COORDINATOR_ADDRESS" 2>/dev/null | awk '/Address: / { print $2 }' | head -n 1)
     if [[ -n "$coordinator_ip_address" ]]; then
       coordinator_found=true
       echo "Coordinator IP address: $coordinator_ip_address"

From 53a6abe895960ca5534d0b00c509aa027be2aacf Mon Sep 17 00:00:00 2001
From: Pate Motter <patemotter@google.com>
Date: Fri, 6 Dec 2024 21:55:02 +0000
Subject: [PATCH 34/40] Fixes non-hashable error in ragged attn.

---
 MaxText/kernels/ragged_attention.py | 66 ++++++++++++++---------------
 1 file changed, 31 insertions(+), 35 deletions(-)

diff --git a/MaxText/kernels/ragged_attention.py b/MaxText/kernels/ragged_attention.py
index aadcfaf5d..20cd292e8 100644
--- a/MaxText/kernels/ragged_attention.py
+++ b/MaxText/kernels/ragged_attention.py
@@ -24,6 +24,7 @@
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
 import jax.numpy as jnp
+import numpy as np
 import common_types
 
 
@@ -32,6 +33,27 @@
 shard_map = shard_map.shard_map
 
 
+def get_mha_cost_estimate(shape_dtype):
+  """Get cost estimate for MHA based on static shape information."""
+  batch_size, _, num_heads, head_dim = shape_dtype[0].shape
+  seq_len = shape_dtype[1].shape[1]
+
+  # Approximate flops calculation for attention
+  # fmt: off
+  flops = batch_size * num_heads * seq_len * (
+    2 * head_dim +  # QK multiplication
+    seq_len +       # softmax
+    2 * head_dim    # V multiplication
+  )
+  # fmt: on
+
+  return pl.CostEstimate(
+      flops=flops,
+      transcendentals=batch_size * num_heads * seq_len,
+      bytes_accessed=sum(np.prod(s.shape) * s.dtype.itemsize for s in shape_dtype),
+  )
+
+
 @functools.partial(jax.jit, static_argnames=["mask_value"])
 def reference_mqa(
     q: jax.Array,
@@ -301,22 +323,8 @@ def ragged_mha(
     max logit ([batch_size, num_heads, 1]) and softmax denominator ([batch_size,
     num_heads, 1]).
   """
-  cost_analysis = (
-      reference_mha.lower(
-          query,
-          key,
-          value,
-          lengths,
-          mask_value=mask_value,
-      )
-      .compile()
-      .cost_analysis()[0]
-  )
-  cost_estimate = pl.CostEstimate(
-      flops=int(cost_analysis["flops"]),
-      transcendentals=int(cost_analysis["transcendentals"]),
-      bytes_accessed=int(cost_analysis["bytes accessed"]),
-  )
+  shape_dtype = (query, key, value, lengths)
+  cost_estimate = get_mha_cost_estimate(shape_dtype)
 
   query = jnp.swapaxes(query, 1, 2)
   key = jnp.swapaxes(key, 1, 2)
@@ -369,28 +377,16 @@ def ragged_gqa(
     max logit ([batch_size, num_heads, 1]) and softmax denominator ([batch_size,
     num_heads, 1]).
   """
-  cost_analysis = (
-      reference_gqa.lower(
-          jnp.squeeze(query),
-          jnp.swapaxes(key, 1, 2),
-          jnp.swapaxes(value, 1, 2),
-          lengths,
-          mask_value=mask_value,
-      )
-      .compile()
-      .cost_analysis()[0]
-  )
-  cost_estimate = pl.CostEstimate(
-      flops=int(cost_analysis["flops"]),
-      transcendentals=int(cost_analysis["transcendentals"]),
-      bytes_accessed=int(cost_analysis["bytes accessed"]),
-  )
+  shape_dtype = (query, key, value, lengths)
+  cost_estimate = get_mha_cost_estimate(shape_dtype)
+
   batch_size, _, num_heads_q, head_dim = query.shape
   _, _, num_heads_kv, _ = key.shape
 
-  query = query.reshape(batch_size, num_heads_kv, num_heads_q // num_heads_kv, head_dim)  # (b, n_kv, n_q // n_kv, d)
-  key = jnp.swapaxes(key, 1, 2)  # (b, n_kv, s, d)
-  value = jnp.swapaxes(value, 1, 2)  # (b, n_kv, s, d)
+  query = query.reshape(batch_size, num_heads_kv, num_heads_q // num_heads_kv, head_dim)
+  key = jnp.swapaxes(key, 1, 2)
+  value = jnp.swapaxes(value, 1, 2)
+
   o, m, l = jax.vmap(
       functools.partial(
           ragged_mqa,

From 4546d68dba111a028ea3ae3b74cf5d8ebec3b490 Mon Sep 17 00:00:00 2001
From: Branden Vandermoon <bvandermoon@google.com>
Date: Wed, 4 Dec 2024 20:54:54 +0000
Subject: [PATCH 35/40] Add new remat policy for save_dot_except_mlp with
 context

---
 MaxText/configs/base.yml | 4 +++-
 MaxText/layers/models.py | 9 +++++++++
 MaxText/pyconfig.py      | 1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/MaxText/configs/base.yml b/MaxText/configs/base.yml
index 1950a1182..de1a570d1 100644
--- a/MaxText/configs/base.yml
+++ b/MaxText/configs/base.yml
@@ -159,12 +159,14 @@ set_remat_policy_on_pipeline_iterations: True
 set_remat_policy_on_layers_per_stage: False
 
 
-# Choose 'remat_policy' between 'minimal', 'save_dot_except_mlpwi', 'save_dot_except_mlp', 'save_qkv_proj', 'qkv_proj_offloaded', 'custom' 'minimal_offloaded', 'save_out_proj' and 'full'.
+# Choose 'remat_policy' between 'minimal', 'save_dot_with_context_except_mlp', 'save_dot_except_mlpwi', 'save_dot_except_mlp',
+# 'save_qkv_proj', 'qkv_proj_offloaded', 'custom' 'minimal_offloaded', 'save_out_proj' and 'full'.
 # These options offer a trade-off between speed (fastest to slowest) and HBM usage (highest to lowest)
 remat_policy: 'full'
 # If custom_save_offload remat_policy is chosen, you can select tensors from the following list to offload on host memory, rematerialize or save on device memory.
 # Pick one of these options for following tensors: ['remat','device','offload']
 decoder_layer_input: 'device' # this tensor cannot be rematerialized - it serves as periodic checkpoints that act as the remat start points
+context: 'remat' # From https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/jax/attention.py#L581-L583
 mlpwi: 'remat'
 mlpwi_0: 'remat'
 mlpwi_1: 'remat'
diff --git a/MaxText/layers/models.py b/MaxText/layers/models.py
index 5ad1893e7..4c2046c1f 100644
--- a/MaxText/layers/models.py
+++ b/MaxText/layers/models.py
@@ -302,6 +302,15 @@ def __call__(
     if cfg.remat_policy != "none":
       if cfg.remat_policy == "minimal":
         policy = jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims
+      elif cfg.remat_policy == "save_dot_with_context_except_mlp":
+        policy = jax.checkpoint_policies.save_only_these_names(
+            "query_proj",
+            "value_proj",
+            "key_proj",
+            "qkv_proj",
+            "context",
+            "out_proj",
+        )
       elif cfg.remat_policy == "save_dot_except_mlpwi":
         policy = jax.checkpoint_policies.save_only_these_names(
             "query_proj",
diff --git a/MaxText/pyconfig.py b/MaxText/pyconfig.py
index 8e55999e0..f35734c46 100644
--- a/MaxText/pyconfig.py
+++ b/MaxText/pyconfig.py
@@ -200,6 +200,7 @@ def validate_and_assign_remat_tensors(keys):
   # list of allowed tensors for custom remat policy
   tensors = [
       "decoder_layer_input",
+      "context",
       "mlpwi",
       "mlpwi_0",
       "mlpwi_1",

From 945ee3daee28784076515ea1955ebe319420664f Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Mon, 9 Dec 2024 18:08:29 +0000
Subject: [PATCH 36/40] Fix moe logging to differentiate dense and megablox
 runs

---
 MaxText/layers/linears.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index d06da67e3..e369639cc 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -729,14 +729,15 @@ def __call__(self, inputs):
         name="gate",
         matmul_precision=self.config.matmul_precision,
     )(inputs)
-    max_logging.log("Running MoE megablox implementation.")
     cfg = self.config
     w0_kernel, w1_kernel, wo_kernel = self.generate_kernels(cfg.num_experts, cfg.emb_dim, cfg.mlp_dim)
     if cfg.megablox:
+      max_logging.log("Running MoE megablox implementation.")
       if quantizations.in_serve_mode(self.quant):
         w0_kernel, w1_kernel, wo_kernel = self.retrieve_quantized_weight(
             inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel
         )
       return self.megablox(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)
     else:
+      max_logging.log("Running MoE matmul implementation.")
       return self.dense_matmul(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)

From 2b62fe85b67f3fc6be96aa594b994733cbc44f02 Mon Sep 17 00:00:00 2001
From: ZhiyuLi-goog <lizhiyu@google.com>
Date: Tue, 10 Dec 2024 11:21:10 -0800
Subject: [PATCH 37/40] update sharding

---
 MaxText/layers/linears.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index e369639cc..82adcd0f3 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -391,6 +391,9 @@ def unpermute(self, intermediate, sorted_selected_experts, weights):
     unsort_intermediate = jnp.take(intermediate, indices=jnp.argsort(sorted_selected_experts), axis=0)
     reshaped_weights = jnp.reshape(weights, (-1, self.num_experts_per_tok))
     tensor_parallelism = self.config.ici_tensor_parallelism * self.config.dcn_tensor_parallelism
+    data_parallelism = self.config.ici_data_parallelism * self.config.dcn_data_parallelism
+    fsdp_parallelism = self.config.ici_fsdp_parallelism * self.config.dcn_fsdp_parallelism
+    batch_sharding = data_parallelism * fsdp_parallelism
     reshaped_intermediate = jnp.reshape(
         unsort_intermediate,
         (-1, self.num_experts_per_tok, self.config.emb_dim // tensor_parallelism),
@@ -403,7 +406,7 @@ def unpermute(self, intermediate, sorted_selected_experts, weights):
           reshaped_weights.astype(jnp.float32),
           precision=matmul_precision,
       )
-    updated_batch = int(self.config.per_device_batch_size * jax.device_count() // self.config.ici_fsdp_parallelism)
+    updated_batch = int(self.config.per_device_batch_size * jax.device_count() // batch_sharding)
     # inferencing hack
     # prefill has BS =1 sequence length = max_prefill_length
     # decode has BS = B, sequence_length= 1

From 742019f51b7cbd4ea45cd0fa35e4d63d9dd3cd91 Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Thu, 12 Dec 2024 23:01:47 +0000
Subject: [PATCH 38/40] Reshape based on the original input shape

---
 MaxText/layers/linears.py | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 82adcd0f3..2fa6f42bf 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -385,18 +385,14 @@ def permute(self, inputs, gate_logits):
     group_size = jnp.bincount(flatten_selected_experts, length=self.num_experts)
     return sorted_inputs, sorted_selected_experts, weights, group_size
 
-  def unpermute(self, intermediate, sorted_selected_experts, weights):
+  def unpermute(self, intermediate, sorted_selected_experts, weights, batch_size: int, sequence_length: int):
     """Unpermute tokens to original order and combine weights."""
 
     unsort_intermediate = jnp.take(intermediate, indices=jnp.argsort(sorted_selected_experts), axis=0)
     reshaped_weights = jnp.reshape(weights, (-1, self.num_experts_per_tok))
-    tensor_parallelism = self.config.ici_tensor_parallelism * self.config.dcn_tensor_parallelism
-    data_parallelism = self.config.ici_data_parallelism * self.config.dcn_data_parallelism
-    fsdp_parallelism = self.config.ici_fsdp_parallelism * self.config.dcn_fsdp_parallelism
-    batch_sharding = data_parallelism * fsdp_parallelism
     reshaped_intermediate = jnp.reshape(
         unsort_intermediate,
-        (-1, self.num_experts_per_tok, self.config.emb_dim // tensor_parallelism),
+        (reshaped_weights.shape[0], self.num_experts_per_tok, -1),
     )
     with jax.named_scope("weight_sum"):
       matmul_precision = lax.Precision(self.config.matmul_precision)
@@ -406,14 +402,7 @@ def unpermute(self, intermediate, sorted_selected_experts, weights):
           reshaped_weights.astype(jnp.float32),
           precision=matmul_precision,
       )
-    updated_batch = int(self.config.per_device_batch_size * jax.device_count() // batch_sharding)
-    # inferencing hack
-    # prefill has BS =1 sequence length = max_prefill_length
-    # decode has BS = B, sequence_length= 1
-    if output.shape[0] % updated_batch != 0:
-      updated_batch = 1
-
-    return output.reshape(updated_batch, -1, self.config.emb_dim // tensor_parallelism).astype(self.dtype)
+    return output.reshape(batch_size, sequence_length, -1).astype(self.dtype)
 
   def megablox(self, inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel):
     tile_size = (512, 1024, 1024)
@@ -472,6 +461,7 @@ def gmm(inputs, kernel, group_sizes):
         check_rep=False,
     )
     def wrapper(x, logits, w0, w1, wo):
+      batch_size, sequence_length, _ = x.shape
       x, sorted_selected_experts, weights, group_sizes = self.permute(x, logits)
       layer_w0 = gmm(x, w0, group_sizes)
       layer_w0 = checkpoint_name(layer_w0, "mlpwi_0")
@@ -484,7 +474,9 @@ def wrapper(x, logits, w0, w1, wo):
       tensor_parallelism = self.config.ici_tensor_parallelism * self.config.dcn_tensor_parallelism
       if tensor_parallelism > 1:
         intermediate_output = jax.lax.psum_scatter(intermediate_output, "tensor", scatter_dimension=1, tiled=True)
-      output = self.unpermute(intermediate_output, sorted_selected_experts, weights)
+      output = self.unpermute(
+          intermediate_output, sorted_selected_experts, weights, batch_size=batch_size, sequence_length=sequence_length
+      )
       return output, None
 
     return wrapper(inputs, gate_logits, w0_kernel, w1_kernel, wo_kernel)

From 91b84f69cd875491e79b37b87743033fc0c8613b Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Sat, 14 Dec 2024 01:20:36 +0000
Subject: [PATCH 39/40] Resolve math import error

---
 MaxText/layers/linears.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 2fa6f42bf..7cae1be96 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -30,6 +30,7 @@
 import numpy as np
 from jax.ad_checkpoint import checkpoint_name
 from jax.experimental import shard_map
+import math
 import max_logging
 import max_utils
 from aqt.jax.v2 import aqt_tensor

From b0f21f98966f0df6ad984df20b25af9dd1f75c89 Mon Sep 17 00:00:00 2001
From: Wonpyo Park <wppark.pio@gmail.com>
Date: Sat, 14 Dec 2024 01:35:32 +0000
Subject: [PATCH 40/40] fix lint error

---
 MaxText/layers/linears.py | 1 +
 MaxText/pyconfig.py       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/MaxText/layers/linears.py b/MaxText/layers/linears.py
index 6d24c3fec..7a622734d 100644
--- a/MaxText/layers/linears.py
+++ b/MaxText/layers/linears.py
@@ -60,6 +60,7 @@ def _get_model_call_mode(config):
   else:
     return None
 
+
 DISPATCH = "dispatch"
 COMBINE = "combine"
 
diff --git a/MaxText/pyconfig.py b/MaxText/pyconfig.py
index 2c60fcd39..c6af8b591 100644
--- a/MaxText/pyconfig.py
+++ b/MaxText/pyconfig.py
@@ -471,7 +471,7 @@ def update_model_vars(base_config_path, raw_keys, config_name: str):
       raw_keys = validate_and_update_keys(raw_keys, model_vars, config_name)
     return updated_keys
 
-  
+
 def create_parallelisms_list(raw_keys):
   ici_parallelism = [
       raw_keys["ici_data_parallelism"],