[Model] Use tanh approximation of GeLU in Gemma MLP (#2106)
jeethu authored Apr 8, 2024
1 parent cc8b747 commit 95d268b
Showing 1 changed file with 2 additions and 2 deletions.
python/mlc_llm/model/gemma/gemma_model.py (2 additions, 2 deletions)
@@ -39,7 +39,7 @@ class GemmaConfig(ConfigBase):  # pylint: disable=too-many-instance-attributes
     kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
 
     def __post_init__(self):
-        if self.hidden_act != "gelu":
+        if self.hidden_act not in ("gelu", "gelu_pytorch_tanh"):
             raise ValueError("Only GeLU is supported as the activation for gemma.")
         if self.attention_bias:
             raise ValueError('Only "False" attention_bias is supported for gemma')
@@ -115,7 +115,7 @@ def __init__(self, config: GemmaConfig):
     def forward(self, x: Tensor):
        concat_x1_x2 = self.gate_up_proj(x)
        x1, x2 = op.split(concat_x1_x2, 2, axis=-1)
-        return self.down_proj(op.gelu(x1) * x2)
+        return self.down_proj(op.gelu(x1, approximate="tanh") * x2)
 
 
 class GemmaAttention(nn.Module):  # pylint: disable=too-many-instance-attributes
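Context for the change: "gelu_pytorch_tanh" is the Hugging Face / PyTorch name for the tanh-approximated GeLU (nn.GELU(approximate="tanh")), which is what Gemma configs specify, so the MLP now selects the same variant via op.gelu(x1, approximate="tanh"). The sketch below is a plain NumPy illustration, not MLC's implementation; the function names gelu_exact and gelu_tanh are made up for this example.

# Sketch (not MLC code): exact erf-based GeLU vs. the tanh approximation.
import math

import numpy as np


def gelu_exact(x: np.ndarray) -> np.ndarray:
    # Exact GeLU: x * Phi(x), with Phi the standard normal CDF (via erf).
    return 0.5 * x * (1.0 + np.vectorize(math.erf)(x / math.sqrt(2.0)))


def gelu_tanh(x: np.ndarray) -> np.ndarray:
    # Tanh approximation, matching PyTorch's GELU(approximate="tanh"),
    # i.e. the "gelu_pytorch_tanh" activation name accepted above.
    return 0.5 * x * (1.0 + np.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)))


x = np.linspace(-4.0, 4.0, 17)
print(np.max(np.abs(gelu_exact(x) - gelu_tanh(x))))  # small, roughly a few 1e-4

The two forms differ only slightly, but matching the variant the model was trained with avoids a small, systematic numerical mismatch at inference time, which is presumably the motivation for this commit.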
