From a7d91c39318d78b89ad73d7cac58e9039316c20a Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 27 May 2026 11:11:04 +0800
Subject: [PATCH 01/20] feat(models): add Hy-MT2-30B-A3B SFT support

Add a dedicated ``HyMT2ForCausalLM`` module under
``components/models/hy_mt2`` for tencent/Hy-MT2-30B-A3B (translation MoE,
30B total / 3B activated). The on-disk checkpoint shares
``architectures: ["HYV3ForCausalLM"]`` and ``model_type: "hy_v3"`` with
Tencent's older Hy3-preview, but the two models differ substantially in
sizing (48 layers vs 80, 128 experts vs 192, GQA 32/4 vs 64/8,
hidden=2048 vs 4096, rms_norm_eps=1e-5 vs 1e-6) and in three flags that
the existing ``hy_v3`` module either hard-codes or does not handle:
``moe_router_use_sigmoid`` (made configurable here),
``enable_lm_head_fp32`` (in-model fp32 upcast fallback when the YAML
does not set ``lm_head_precision``), and ``expert_hidden_dim`` (synonym
of ``moe_intermediate_size``; preferred when both are present).

The new module is intentionally independent of ``components/models/hy_v3``
so the Hy3-preview recipes are unaffected. The example YAML at
``examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml`` instantiates the
new class via a fully-qualified ``_target_`` instead of going through the
NeMoAutoModel registry, avoiding the architecture-string collision.

Files:
  nemo_automodel/components/models/hy_mt2/{__init__,config,layers,model,state_dict_adapter}.py
  examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
  tests/unit_tests/models/hy_mt2/test_hy_mt2_{config,layers,model,state_dict_adapter}.py

EP / TP / DP / FSDP2 wire up through the standard MoE stack
(``MoEFSDPSyncMixin`` + ``components/moe``). EP must divide ``num_experts``
= 128; the example uses ``ep_size: 8`` (16 experts per rank) on an
8xH100 node.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 .../hy_mt2/hy_mt2_30b_a3b_sft.yaml            | 150 +++++++
 .../components/models/hy_mt2/__init__.py      |  17 +
 .../components/models/hy_mt2/config.py        | 129 ++++++
 .../components/models/hy_mt2/layers.py        | 147 ++++++
 .../components/models/hy_mt2/model.py         | 418 ++++++++++++++++++
 .../models/hy_mt2/state_dict_adapter.py       | 163 +++++++
 tests/unit_tests/models/hy_mt2/__init__.py    |   0
 .../models/hy_mt2/test_hy_mt2_config.py       | 165 +++++++
 .../models/hy_mt2/test_hy_mt2_layers.py       | 164 +++++++
 .../models/hy_mt2/test_hy_mt2_model.py        | 285 ++++++++++++
 .../hy_mt2/test_hy_mt2_state_dict_adapter.py  | 299 +++++++++++++
 11 files changed, 1937 insertions(+)
 create mode 100644 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
 create mode 100644 nemo_automodel/components/models/hy_mt2/__init__.py
 create mode 100644 nemo_automodel/components/models/hy_mt2/config.py
 create mode 100644 nemo_automodel/components/models/hy_mt2/layers.py
 create mode 100644 nemo_automodel/components/models/hy_mt2/model.py
 create mode 100644 nemo_automodel/components/models/hy_mt2/state_dict_adapter.py
 create mode 100644 tests/unit_tests/models/hy_mt2/__init__.py
 create mode 100644 tests/unit_tests/models/hy_mt2/test_hy_mt2_config.py
 create mode 100644 tests/unit_tests/models/hy_mt2/test_hy_mt2_layers.py
 create mode 100644 tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py
 create mode 100644 tests/unit_tests/models/hy_mt2/test_hy_mt2_state_dict_adapter.py

diff --git a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
new file mode 100644
index 0000000000..9f2b71d8d9
--- /dev/null
+++ b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
@@ -0,0 +1,150 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SFT recipe for tencent/Hy-MT2-30B-A3B (translation MoE, 30B total / 3B activated).
+#
+# Architecture (from config.json): 48 layers (layer 0 dense), 128 experts top-8
+# + 1 shared expert, sigmoid routing with bias, GQA 32/4, hidden=2048,
+# moe_intermediate=expert_hidden=768, dense intermediate=6912,
+# vocab=120832, 256K context, rope_theta=11158840, qk_norm.
+#
+# Hardware target: 8 GPUs (80 GB+ each) for full SFT with EP8 + FSDP2.
+#   automodel finetune llm -c examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml \
+#     --nproc-per-node 8
+#
+# EP size must divide num_experts (128). ep_size=8 -> 16 experts per rank.
+# Other valid EP sizes: 1, 2, 4, 16, 32, 64, 128.
+#
+# Note: the on-disk checkpoint declares ``architectures: ["HYV3ForCausalLM"]``,
+# so NeMoAutoModel's registry would dispatch to ``components/models/hy_v3``.
+# This recipe deliberately bypasses the registry and instantiates the new
+# ``HyMT2ForCausalLM`` directly via ``_target_``, isolating Hy-MT2 logic
+# (sigmoid-routing flag, ``enable_lm_head_fp32``, expert_hidden_dim) from
+# the existing Hy3-preview support.
+
+recipe: TrainFinetuneRecipeForNextTokenPrediction
+
+step_scheduler:
+  global_batch_size: 64
+  local_batch_size: 1
+  ckpt_every_steps: 500
+  val_every_steps: 500
+  num_epochs: 1
+  max_steps: 100
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 30
+
+rng:
+  _target_: nemo_automodel.components.training.rng.StatefulRNG
+  seed: 1111
+  ranked: true
+
+model:
+  _target_: nemo_automodel.components.models.hy_mt2.model.HyMT2ForCausalLM.from_pretrained
+  pretrained_model_name_or_path: tencent/Hy-MT2-30B-A3B
+  torch_dtype: bfloat16
+  backend:
+    _target_: nemo_automodel.components.models.common.BackendConfig
+    attn: te
+    linear: torch
+    rms_norm: torch_fp32
+    experts: torch_mm
+    dispatcher: torch
+    fake_balanced_gate: false
+    gate_precision: float32
+    enable_hf_state_dict_adapter: true
+    enable_fsdp_optimizations: true
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: /tmp/checkpoints/hy_mt2_30b_a3b/
+  model_save_format: safetensors
+  save_consolidated: true
+
+distributed:
+  strategy: fsdp2
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1
+  # Expert parallelism: 128 experts / 8 ranks = 16 experts per rank.
+  # dp_size is derived as ``world_size // (tp_size * cp_size * pp_size * ep_size)``
+  # i.e. 1 with ep_size=8 on an 8-GPU node -- experts shard across the
+  # full node and the remaining (non-expert) weights replicate.
+  ep_size: 8
+
+  sequence_parallel: false
+  activation_checkpointing: true
+
+  moe:
+    reshard_after_forward: false
+    wrap_outer_model: false
+    # HF reference upcasts the lm_head to fp32 (``enable_lm_head_fp32: true``).
+    # The MoE parallelizer handles this via MixedPrecisionPolicy when set
+    # here; HyMT2ForCausalLM also has an in-model fp32 fallback if this is
+    # left unset.
+    lm_head_precision: float32
+
+loss_fn:
+  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
+
+dataset:
+  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
+  path_or_dataset: rowan/hellaswag
+  split: train
+  tokenizer:
+    _target_: transformers.AutoTokenizer.from_pretrained
+    pretrained_model_name_or_path: tencent/Hy-MT2-30B-A3B
+    trust_remote_code: true
+
+packed_sequence:
+  packed_sequence_size: 0
+
+dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.utils.default_collater
+    pad_seq_len_divisible: 64
+  shuffle: true
+
+validation_dataset:
+  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
+  path_or_dataset: rowan/hellaswag
+  split: validation
+  num_samples_limit: 64
+  tokenizer:
+    _target_: transformers.AutoTokenizer.from_pretrained
+    pretrained_model_name_or_path: tencent/Hy-MT2-30B-A3B
+    trust_remote_code: true
+
+validation_dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.utils.default_collater
+    pad_seq_len_divisible: 64
+  shuffle: false
+  drop_last: true
+
+optimizer:
+  _target_: torch.optim.AdamW
+  betas: [0.9, 0.95]
+  eps: 1e-8
+  lr: 1e-5
+  weight_decay: 0.0
+
+# Uncomment for W&B logging
+# wandb:
+#   project: hy_mt2-30b-a3b-sft
+#   name: hy_mt2_30b_a3b_sft
diff --git a/nemo_automodel/components/models/hy_mt2/__init__.py b/nemo_automodel/components/models/hy_mt2/__init__.py
new file mode 100644
index 0000000000..cab3ef8fa7
--- /dev/null
+++ b/nemo_automodel/components/models/hy_mt2/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM
+
+__all__ = ["HyMT2ForCausalLM"]
diff --git a/nemo_automodel/components/models/hy_mt2/config.py b/nemo_automodel/components/models/hy_mt2/config.py
new file mode 100644
index 0000000000..9578a77f56
--- /dev/null
+++ b/nemo_automodel/components/models/hy_mt2/config.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from transformers import PretrainedConfig
+
+
+class HyMT2Config(PretrainedConfig):
+    """Configuration class for Tencent Hy-MT2-30B-A3B (translation MoE).
+
+    Architecture (from tencent/Hy-MT2-30B-A3B config.json):
+      - 48 transformer layers; layer 0 is dense, layers 1-47 are MoE
+      - MoE: 128 routed experts + 1 shared expert, top-8 activated
+      - Sigmoid routing with expert-bias correction (e_score_correction_bias)
+        and router_scaling_factor = 2.826
+      - route_norm = True (normalize top-k routing weights)
+      - GQA: 32 Q heads, 4 KV heads, head_dim=128, hidden_size=2048
+      - Per-head Q/K RMSNorm before RoPE (qk_norm)
+      - 256K context, rope_theta=11158840
+      - vocab_size=120832, dense intermediate_size=6912, moe_intermediate_size=768
+      - expert_hidden_dim follows moe_intermediate_size unless passed explicitly
+      - enable_lm_head_fp32 = True (HF reference upcasts lm_head to fp32)
+
+    Note: the on-disk HF checkpoint declares ``model_type: "hy_v3"`` and
+    ``architectures: ["HYV3ForCausalLM"]``, so NeMo AutoModel's existing ``HYV3Config``
+    wins ``AutoConfig.from_pretrained``. This class is provided for tests and for
+    standalone instantiation; the model code in ``model.py`` is duck-typed against
+    ``config.<field>`` and works with either config class.
+    """
+
+    model_type = "hy_mt2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size: int = 120832,
+        hidden_size: int = 2048,
+        intermediate_size: int = 6912,
+        moe_intermediate_size: int = 768,
+        expert_hidden_dim: int | None = None,  # None -> follow moe_intermediate_size
+        num_hidden_layers: int = 48,
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 4,
+        head_dim: int = 128,
+        # MoE routing
+        num_experts: int = 128,
+        num_shared_experts: int = 1,
+        num_experts_per_tok: int = 8,
+        router_scaling_factor: float = 2.826,
+        route_norm: bool = True,
+        moe_router_enable_expert_bias: bool = True,
+        moe_router_use_sigmoid: bool = True,
+        # Dense layers
+        first_k_dense_replace: int = 1,
+        # Position encoding
+        max_position_embeddings: int = 262144,
+        rope_theta: float = 11158840.0,
+        rope_scaling: dict | None = None,
+        # Norm / attention
+        rms_norm_eps: float = 1e-5,
+        qk_norm: bool = True,
+        attention_bias: bool = False,
+        hidden_act: str = "silu",
+        # FP32 upcast hints (mirroring HF config). NeMo AutoModel wires
+        # ``enable_lm_head_fp32`` either via the YAML ``lm_head_precision: float32``
+        # (preferred, handled by the MoE parallelizer) or via the in-model
+        # cast in ``HyMT2ForCausalLM.forward`` when ``lm_head_precision`` is
+        # unset.
+        enable_lm_head_fp32: bool = True,
+        enable_attention_fp32_softmax: bool = False,
+        enable_moe_fp32_combine: bool = False,
+        # Standard options
+        use_cache: bool = True,
+        pad_token_id: int | None = 120002,
+        bos_token_id: int = 120000,
+        eos_token_id: int = 120025,
+        tie_word_embeddings: bool = False,
+        torch_dtype: str = "bfloat16",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.expert_hidden_dim = moe_intermediate_size if expert_hidden_dim is None else expert_hidden_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.num_experts = num_experts
+        self.num_shared_experts = num_shared_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.router_scaling_factor = router_scaling_factor
+        self.route_norm = route_norm
+        self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
+        self.moe_router_use_sigmoid = moe_router_use_sigmoid
+        self.first_k_dense_replace = first_k_dense_replace
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.rms_norm_eps = rms_norm_eps
+        self.qk_norm = qk_norm
+        self.attention_bias = attention_bias
+        self.hidden_act = hidden_act
+        self.enable_lm_head_fp32 = enable_lm_head_fp32
+        self.enable_attention_fp32_softmax = enable_attention_fp32_softmax
+        self.enable_moe_fp32_combine = enable_moe_fp32_combine
+        self.torch_dtype = torch_dtype
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            use_cache=use_cache,
+            **kwargs,
+        )
diff --git a/nemo_automodel/components/models/hy_mt2/layers.py b/nemo_automodel/components/models/hy_mt2/layers.py
new file mode 100644
index 0000000000..3450ea8338
--- /dev/null
+++ b/nemo_automodel/components/models/hy_mt2/layers.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any
+
+import torch
+from torch import nn
+
+from nemo_automodel.components.attention.utils import (
+    initialize_attn_module_and_func,
+    postprocess_output_for_attn,
+    preprocess_args_and_kwargs_for_attn,
+)
+from nemo_automodel.components.models.common import (
+    BackendConfig,
+    initialize_linear_module,
+    initialize_rms_norm_module,
+)
+from nemo_automodel.components.models.gpt_oss.rope_utils import apply_rotary_emb_qk
+
+
+class HyMT2Attention(nn.Module):
+    """Hy-MT2-30B-A3B attention: GQA, per-head Q/K RMSNorm, and RoPE.
+
+    Differences vs. the existing Hy3-preview ``HYV3Attention``:
+      - ``qk_norm`` is gated by ``config.qk_norm`` (defaults to True). For
+        Hy-MT2-30B-A3B this is always True; the flag is here so the same
+        module can also be reused for non-qk-norm variants without code
+        edits.
+      - Dimensions follow Hy-MT2-30B-A3B: 32 Q heads / 4 KV heads,
+        head_dim=128, hidden_size=2048.
+    """
+
+    def __init__(self, config: Any, backend: BackendConfig):
+        super().__init__()
+        self.backend = backend
+
+        self.num_heads = config.num_attention_heads
+        self.num_kv_heads = config.num_key_value_heads
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads)
+        self.qk_norm_enabled = bool(getattr(config, "qk_norm", True))  # a missing flag counts as enabled
+
+        attention_bias = getattr(config, "attention_bias", False)  # shared by all four projections
+
+        self.q_proj = initialize_linear_module(
+            backend.linear, config.hidden_size, self.num_heads * self.head_dim, attention_bias
+        )
+        self.k_proj = initialize_linear_module(
+            backend.linear, config.hidden_size, self.num_kv_heads * self.head_dim, attention_bias
+        )
+        self.v_proj = initialize_linear_module(
+            backend.linear, config.hidden_size, self.num_kv_heads * self.head_dim, attention_bias
+        )
+        self.o_proj = initialize_linear_module(
+            backend.linear, self.num_heads * self.head_dim, config.hidden_size, attention_bias
+        )
+
+        if self.qk_norm_enabled:
+            self.q_norm = initialize_rms_norm_module(backend.rms_norm, self.head_dim, eps=config.rms_norm_eps)
+            self.k_norm = initialize_rms_norm_module(backend.rms_norm, self.head_dim, eps=config.rms_norm_eps)
+        else:
+            self.q_norm = None
+            self.k_norm = None
+
+        softmax_scale = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+        self.attn_module, self.attn_func = initialize_attn_module_and_func(
+            attn_impl=backend.attn,
+            num_attention_heads=self.num_heads,
+            num_qk_channels=self.head_dim,
+            num_v_channels=self.head_dim,
+            softmax_scale=softmax_scale,
+            num_gqa_groups=self.num_kv_heads,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        *,
+        freqs_cis: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        **attn_kwargs: Any,
+    ) -> torch.Tensor:
+        if len(x.shape) == 2:  # 2-D input is handled as the "thd" layout
+            qkv_format = "thd"
+            num_tokens = x.shape[0]
+        else:  # any other input is unpacked as three dims and handled as "bshd"
+            qkv_format = "bshd"
+            bsz, seqlen, _ = x.size()
+
+        q = self.q_proj(x)
+        k = self.k_proj(x)
+        v = self.v_proj(x)
+
+        if qkv_format == "thd":
+            q = q.view(num_tokens, self.num_heads, self.head_dim)
+            k = k.view(num_tokens, self.num_kv_heads, self.head_dim)
+            v = v.view(num_tokens, self.num_kv_heads, self.head_dim)
+        else:
+            q = q.view(bsz, seqlen, self.num_heads, self.head_dim)
+            k = k.view(bsz, seqlen, self.num_kv_heads, self.head_dim)
+            v = v.view(bsz, seqlen, self.num_kv_heads, self.head_dim)
+
+        if self.qk_norm_enabled:  # q/k norms (built with size head_dim) run before the rotary embedding
+            q = self.q_norm(q)
+            k = self.k_norm(k)
+
+        q, k = apply_rotary_emb_qk(
+            q,
+            k,
+            freqs_cis,
+            format=qkv_format,
+            rope_fusion=self.backend.rope_fusion,
+            cu_seqlens=attn_kwargs.get("cu_seqlens", None),
+            cp_size=attn_kwargs.get("cp_size", 1),
+            cp_rank=attn_kwargs.get("cp_rank", 0),
+        )
+
+        q, k, v, _attn_kwargs = preprocess_args_and_kwargs_for_attn(
+            q, k, v, attention_mask, self.backend.attn, **attn_kwargs
+        )
+        out = self.attn_func(q, k, v, **_attn_kwargs)
+        out = postprocess_output_for_attn(out, self.backend.attn)
+
+        flatten_dim = 2 if qkv_format == "bshd" else 1  # presumably merges (heads, head_dim) -- TODO confirm
+        out = self.o_proj(out.flatten(flatten_dim))
+        return out
+
+    def init_weights(self, buffer_device: torch.device, init_std: float = 0.02):  # buffer_device is unused here
+        for linear in (self.q_proj, self.k_proj, self.v_proj, self.o_proj):
+            nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
+            if hasattr(linear, "bias") and linear.bias is not None:
+                nn.init.zeros_(linear.bias)
+        if self.q_norm is not None:
+            self.q_norm.reset_parameters()
+        if self.k_norm is not None:
+            self.k_norm.reset_parameters()
diff --git a/nemo_automodel/components/models/hy_mt2/model.py b/nemo_automodel/components/models/hy_mt2/model.py
new file mode 100644
index 0000000000..db2d85db8e
--- /dev/null
+++ b/nemo_automodel/components/models/hy_mt2/model.py
@@ -0,0 +1,418 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""HyMT2ForCausalLM — Tencent Hy-MT2-30B-A3B (translation MoE) SFT support.
+
+Architecture (from tencent/Hy-MT2-30B-A3B config.json):
+  - 48 transformer layers; layer 0 is dense, layers 1-47 are MoE
+  - MoE: 128 routed experts + 1 shared expert, top-8 activated
+  - Sigmoid routing with expert-bias correction, router_scaling_factor=2.826
+  - route_norm = True (normalize top-k weights to sum to 1)
+  - GQA: 32 Q heads, 4 KV heads, head_dim=128, hidden_size=2048
+  - Per-head Q/K RMSNorm (qk_norm=True) before RoPE
+  - 256K context, rope_theta=11158840
+  - dense intermediate_size=6912, moe_intermediate_size=expert_hidden_dim=768
+  - vocab_size=120832
+  - enable_lm_head_fp32 = True (HF reference upcasts lm_head to fp32)
+
+Notes vs. ``components/models/hy_v3`` (Hy3-preview 295B):
+  - Smaller everywhere (48L / 128 experts / 32+4 heads / hidden=2048).
+  - Adds an in-model ``enable_lm_head_fp32`` fallback (applies when the
+    YAML's ``lm_head_precision`` is not set). The preferred path is to set
+    ``distributed.moe.lm_head_precision: float32`` in the YAML, which the
+    MoE parallelizer handles via ``MixedPrecisionPolicy``.
+  - ``score_func`` is driven by ``config.moe_router_use_sigmoid`` instead
+    of being hard-coded.
+"""
+
+from typing import Any
+
+import torch
+import torch.nn as nn
+
+from nemo_automodel.components.models.common import (
+    BackendConfig,
+    get_rope_config,
+    initialize_linear_module,
+    initialize_rms_norm_module,
+)
+from nemo_automodel.components.models.common.hf_checkpointing_mixin import HFCheckpointingMixin
+from nemo_automodel.components.models.common.utils import cast_model_to_dtype
+from nemo_automodel.components.models.gpt_oss.rope_utils import RotaryEmbedding, position_ids_to_freqs_cis
+from nemo_automodel.components.models.hy_mt2.layers import HyMT2Attention
+from nemo_automodel.components.models.hy_mt2.state_dict_adapter import HyMT2StateDictAdapter
+from nemo_automodel.components.moe.config import MoEConfig
+from nemo_automodel.components.moe.fsdp_mixin import MoEFSDPSyncMixin
+from nemo_automodel.components.moe.layers import MLP, MoE
+from nemo_automodel.components.utils.model_utils import squeeze_input_for_thd
+from nemo_automodel.shared.utils import dtype_from_str as get_dtype
+
+
+def _resolve_score_func(config: Any) -> str:
+    """Pick the gate ``score_func`` name from ``config.moe_router_use_sigmoid``.
+
+    A truthy flag yields "sigmoid"; a falsy flag yields "softmax". When the
+    attribute is missing the flag is treated as True (the Hy-MT2 default).
+    The "*_with_bias" score-function names are never produced here; only
+    the two plain names above can be returned.
+    """
+    if getattr(config, "moe_router_use_sigmoid", True):
+        return "sigmoid"
+    return "softmax"
+
+
+class Block(nn.Module):
+    """Single Hy-MT2 transformer block: attention + (dense MLP | MoE) + residual norms."""
+
+    def __init__(self, layer_idx: int, config: Any, moe_config: MoEConfig, backend: BackendConfig):
+        super().__init__()
+        self.self_attn = HyMT2Attention(config, backend)
+
+        first_k_dense = getattr(config, "first_k_dense_replace", 1)
+        if layer_idx < first_k_dense:  # the first ``first_k_dense`` layers get a dense MLP
+            self.mlp = MLP(config.hidden_size, config.intermediate_size, backend.linear)
+        else:  # every later layer gets an MoE
+            self.mlp = MoE(moe_config, backend)
+
+        self.input_layernorm = initialize_rms_norm_module(backend.rms_norm, config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = initialize_rms_norm_module(
+            backend.rms_norm, config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.layer_idx = layer_idx
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        *,
+        freqs_cis: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        padding_mask: torch.Tensor | None = None,
+        **attn_kwargs: Any,
+    ) -> torch.Tensor:
+        if attention_mask is not None and padding_mask is None:
+            padding_mask = attention_mask.bool().logical_not()  # True where attention_mask is zero/False
+
+        attn_out = self.self_attn(
+            x=self.input_layernorm(x),
+            freqs_cis=freqs_cis,
+            attention_mask=attention_mask,
+            **attn_kwargs,
+        )
+        x = x + attn_out  # residual connection around attention
+
+        mlp_out = self._mlp(x=self.post_attention_layernorm(x), padding_mask=padding_mask)
+        x = x + mlp_out  # residual connection around the MLP / MoE
+        return x
+
+    def _mlp(self, x: torch.Tensor, padding_mask: torch.Tensor | None) -> torch.Tensor:
+        if isinstance(self.mlp, MLP):
+            return self.mlp(x)  # the dense MLP is called without the padding mask
+        assert isinstance(self.mlp, MoE)
+        return self.mlp(x, padding_mask)
+
+    def init_weights(self, buffer_device: torch.device):
+        for norm in (self.input_layernorm, self.post_attention_layernorm):
+            norm.reset_parameters()
+        self.self_attn.init_weights(buffer_device)
+        self.mlp.init_weights(buffer_device)
+
+
+class HyMT2Model(nn.Module):
+    """Hy-MT2 backbone: token embeddings + transformer blocks + final RMSNorm.
+
+    The MoE / dense split is governed by ``config.first_k_dense_replace``
+    (layer 0 dense, the rest MoE for the published Hy-MT2-30B-A3B). The
+    MoE configuration is assembled from the HF config fields and forwarded
+    to every MoE-bearing ``Block``.
+    """
+
+    def __init__(
+        self,
+        config: Any,
+        backend: BackendConfig,
+        *,
+        moe_config: MoEConfig | None = None,
+        moe_overrides: dict | None = None,
+    ):
+        super().__init__()
+        self.backend = backend
+        self.config = config
+        if moe_config is not None and moe_overrides is not None:
+            raise ValueError("Cannot pass both moe_config and moe_overrides.")
+
+        # ``expert_hidden_dim`` and ``moe_intermediate_size`` are synonyms in
+        # the on-disk config. Prefer ``expert_hidden_dim`` when present
+        # (matches the field name used by the HF reference for the expert MLP
+        # hidden dim); fall back to ``moe_intermediate_size`` otherwise.
+        moe_inter = getattr(config, "expert_hidden_dim", None)
+        if moe_inter is None:
+            moe_inter = config.moe_intermediate_size
+
+        moe_defaults = dict(
+            dim=config.hidden_size,
+            inter_dim=config.intermediate_size,
+            moe_inter_dim=moe_inter,
+            n_routed_experts=config.num_experts,
+            n_shared_experts=getattr(config, "num_shared_experts", 0),
+            n_activated_experts=config.num_experts_per_tok,
+            n_expert_groups=0,
+            n_limited_groups=0,
+            train_gate=True,
+            gate_bias_update_factor=0.0,
+            score_func=_resolve_score_func(config),
+            route_scale=getattr(config, "router_scaling_factor", 1.0),
+            aux_loss_coeff=0.0,
+            norm_topk_prob=getattr(config, "route_norm", True),
+            expert_bias=False,
+            router_bias=False,
+            expert_activation="swiglu",
+            softmax_before_topk=False,
+            # Ensures e_score_correction_bias buffer is created so HF
+            # checkpoints with ``expert_bias`` load cleanly.
+            force_e_score_correction_bias=getattr(config, "moe_router_enable_expert_bias", False),
+        )
+        if moe_overrides:
+            moe_defaults.update(moe_overrides)
+        self.moe_config = moe_config or MoEConfig(**moe_defaults)
+
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.hidden_size, dtype=get_dtype(config.torch_dtype, torch.bfloat16)
+        )
+        self.layers = torch.nn.ModuleDict()
+        for layer_id in range(config.num_hidden_layers):
+            self.layers[str(layer_id)] = Block(layer_id, config, self.moe_config, backend)
+        self.norm = initialize_rms_norm_module(backend.rms_norm, config.hidden_size, eps=config.rms_norm_eps)
+
+        self.max_seq_len = config.max_position_embeddings
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+
+        base, rope_scaling, _ = get_rope_config(config)
+
+        self.rotary_emb = RotaryEmbedding(
+            head_dim=self.head_dim,
+            base=base,
+            dtype=torch.float32,
+            initial_context_length=rope_scaling.get("original_max_position_embeddings", 4096),
+            scaling_factor=rope_scaling.get("factor", 1.0),
+            ntk_alpha=rope_scaling.get("beta_slow", 1.0),
+            ntk_beta=rope_scaling.get("beta_fast", 32.0),
+            device=torch.device(f"cuda:{torch.cuda.current_device()}")
+            if torch.cuda.is_available()
+            else torch.device("cpu"),
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        *,
+        position_ids: torch.Tensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        padding_mask: torch.Tensor | None = None,
+        **attn_kwargs: Any,
+    ) -> torch.Tensor:
+        """Embed ``input_ids`` (if ``embed_tokens`` is set), run every block, then apply ``norm`` if set."""
+        if position_ids is None:  # default: arange over dim 1, expanded across dim 0
+            seq_positions = torch.arange(0, input_ids.shape[1], device=input_ids.device)
+            position_ids = seq_positions.unsqueeze(0).expand(input_ids.shape[0], -1)
+
+        freqs_cis = position_ids_to_freqs_cis(
+            self.rotary_emb,
+            position_ids,
+            qkv_format=attn_kwargs.get("qkv_format", "bshd"),
+            for_fused_rope=self.backend.rope_fusion,
+            cp_size=attn_kwargs.get("cp_size", 1),
+        )
+
+        h = self.embed_tokens(input_ids) if self.embed_tokens is not None else input_ids
+
+        for layer in self.layers.values():
+            h = layer(
+                x=h,
+                freqs_cis=freqs_cis,
+                attention_mask=attention_mask,
+                padding_mask=padding_mask,
+                **attn_kwargs,
+            )
+
+        # Explicit ``is not None`` (not module truthiness), consistent with ``init_weights``.
+        return self.norm(h) if self.norm is not None else h
+
+    @torch.no_grad()
+    def init_weights(self, buffer_device: torch.device | None = None) -> None:
+        """Re-initialize embeddings, final norm and every block; defaults to the current CUDA device, else CPU."""
+        if buffer_device is None:
+            on_cuda = torch.cuda.is_available()
+            buffer_device = torch.device(f"cuda:{torch.cuda.current_device()}" if on_cuda else "cpu")
+        with buffer_device:
+            if self.embed_tokens is not None:
+                nn.init.normal_(self.embed_tokens.weight)
+            if self.norm is not None:
+                self.norm.reset_parameters()
+            self.rotary_emb.device = buffer_device
+
+        # Blocks are initialized outside the device context; each receives the device explicitly.
+        for block in self.layers.values():
+            if block is None:
+                continue
+            block.init_weights(buffer_device=buffer_device)
+
+
+class HyMT2ForCausalLM(HFCheckpointingMixin, nn.Module, MoEFSDPSyncMixin):
+    """Hy-MT2-30B-A3B causal-LM wrapper.
+
+    Mixes in ``MoEFSDPSyncMixin`` so EP / FSDP2 expert-gradient sync works
+    out of the box (set ``distributed.ep_size`` in the YAML; must divide
+    ``num_experts``=128). The ``HFCheckpointingMixin`` provides
+    ``from_pretrained`` / ``save_pretrained`` over the HF safetensors layout.
+    """
+
+    @classmethod
+    def from_config(
+        cls,
+        config: Any,
+        moe_config: MoEConfig | None = None,
+        backend: BackendConfig | None = None,
+        **kwargs,
+    ):
+        return cls(config, moe_config, backend, **kwargs)  # thin alias for the constructor
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str,
+        *model_args,
+        **kwargs,
+    ):
+        from transformers import AutoConfig
+
+        # The on-disk Hy-MT2 checkpoint declares ``model_type: hy_v3`` so
+        # ``AutoConfig`` returns a ``HYV3Config`` instance; the model code is
+        # duck-typed against the (matching) field names. Only the config is
+        # read in this method: the module is built but no weights are loaded here.
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=False)
+        return cls.from_config(config, *model_args, **kwargs)
+
+    def __init__(
+        self,
+        config: Any,
+        moe_config: MoEConfig | None = None,
+        backend: BackendConfig | None = None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.config = config
+        self.backend = backend or BackendConfig()  # fall back to a default BackendConfig
+        moe_overrides = kwargs.pop("moe_overrides", None)  # other kwargs are accepted but unused
+        self.model = HyMT2Model(config, backend=self.backend, moe_config=moe_config, moe_overrides=moe_overrides)
+        self.lm_head = initialize_linear_module(self.backend.linear, config.hidden_size, config.vocab_size, bias=False)
+        # In-model fp32 fallback for the lm_head matmul. The preferred wiring
+        # is the YAML ``distributed.moe.lm_head_precision: float32``, which
+        # the MoE parallelizer enables via ``MixedPrecisionPolicy``. When that
+        # path is not used, ``enable_lm_head_fp32`` in the model config still
+        # triggers the in-forward upcast.
+        self._enable_lm_head_fp32 = bool(getattr(config, "enable_lm_head_fp32", False))
+        if self.backend.enable_hf_state_dict_adapter:
+            self.state_dict_adapter = HyMT2StateDictAdapter(
+                self.config,
+                self.model.moe_config,
+                self.backend,
+                dtype=get_dtype(config.torch_dtype, torch.bfloat16),
+            )
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens  # may be None; HyMT2Model guards against that
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head  # may be None; ``forward`` guards against that
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        *,
+        position_ids: torch.Tensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        padding_mask: torch.Tensor | None = None,
+        **attn_kwargs: Any,
+    ) -> torch.Tensor:
+        if "qkv_format" in attn_kwargs and attn_kwargs["qkv_format"] == "thd":
+            input_ids, position_ids, padding_mask, attn_kwargs = squeeze_input_for_thd(
+                input_ids, position_ids, padding_mask, attn_kwargs
+            )
+            attention_mask = None  # the mask is dropped on the THD path
+
+        hidden = self.model(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            padding_mask=padding_mask,
+            **attn_kwargs,
+        )
+
+        if self.lm_head is None:
+            logits = hidden  # no head: the hidden states are returned as-is
+        elif self._enable_lm_head_fp32 and hidden.dtype != torch.float32:
+            # Upcast input to fp32 for the lm_head matmul, then cast logits
+            # back to the model dtype so downstream loss / sampling code is
+            # not surprised by an fp32 tensor. Matches the HF reference's
+            # ``enable_lm_head_fp32`` behavior.
+            original_dtype = hidden.dtype  # NOTE(review): lm_head weights must accept fp32 input — TODO confirm
+            logits = self.lm_head(hidden.float()).to(original_dtype)
+        else:
+            logits = self.lm_head(hidden)
+
+        if "qkv_format" in attn_kwargs and attn_kwargs["qkv_format"] == "thd":
+            logits = logits.unsqueeze(0)  # presumably restores the dim squeezed above — TODO confirm
+        return logits
+
+    def update_moe_gate_bias(self) -> None:
+        with torch.no_grad():
+            for block in self.model.layers.values():
+                if isinstance(block.mlp, MoE) and block.mlp.gate.bias_update_factor > 0:  # MoE blocks only
+                    block.mlp.gate.update_bias()
+
+    @torch.no_grad()
+    def initialize_weights(
+        self, buffer_device: torch.device | None = None, dtype: torch.dtype = torch.bfloat16
+    ) -> None:
+        if buffer_device is None:
+            buffer_device = (
+                torch.device(f"cuda:{torch.cuda.current_device()}")
+                if torch.cuda.is_available()
+                else torch.device("cpu")
+            )
+        with buffer_device:
+            self.model.init_weights(buffer_device=buffer_device)
+            final_out_std = self.config.hidden_size**-0.5  # i.e. 1 / sqrt(hidden_size)
+            cutoff_factor = 3  # truncation bounds are +/- 3 * std
+            if self.lm_head is not None:
+                nn.init.trunc_normal_(
+                    self.lm_head.weight,
+                    mean=0.0,
+                    std=final_out_std,
+                    a=-cutoff_factor * final_out_std,
+                    b=cutoff_factor * final_out_std,
+                )
+
+        cast_model_to_dtype(self, dtype)  # runs after init, outside the device context
+        with buffer_device:
+            self.model.rotary_emb.device = buffer_device  # set again after the dtype cast
+
+
+ModelClass = HyMT2ForCausalLM  # module-level alias; presumably read by model-loading code — TODO confirm
diff --git a/nemo_automodel/components/models/hy_mt2/state_dict_adapter.py b/nemo_automodel/components/models/hy_mt2/state_dict_adapter.py
new file mode 100644
index 0000000000..7d98b4dd4e
--- /dev/null
+++ b/nemo_automodel/components/models/hy_mt2/state_dict_adapter.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""State dict conversion between the on-disk tencent/Hy-MT2-30B-A3B HF
+checkpoint and Automodel's native (grouped-experts) format.
+
+The on-disk key layout is identical to tencent/Hy3-preview because both
+share ``model_type: "hy_v3"`` and ``architectures: ["HYV3ForCausalLM"]``:
+
+  model.layers.{L}.mlp.expert_bias                                       # [n_experts]
+  model.layers.{L}.mlp.router.gate.weight                                # [n_experts, hidden]
+  model.layers.{L}.mlp.experts.{E}.gate_proj.weight                      # [moe_inter, hidden]
+  model.layers.{L}.mlp.experts.{E}.up_proj.weight                        # [moe_inter, hidden]
+  model.layers.{L}.mlp.experts.{E}.down_proj.weight                      # [hidden, moe_inter]
+  model.layers.{L}.mlp.shared_mlp.{gate,up,down}_proj.weight             # shared expert
+
+Automodel native:
+
+  model.layers.{L}.mlp.gate.e_score_correction_bias                      # [n_local]
+  model.layers.{L}.mlp.gate.weight                                       # [n_experts, hidden]
+  model.layers.{L}.mlp.experts.gate_and_up_projs                         # grouped
+  model.layers.{L}.mlp.experts.down_projs                                # grouped
+  model.layers.{L}.mlp.shared_experts.{gate,up,down}_proj.weight
+
+This adapter handles three on-disk-specific renames plus per-expert
+split/merge (via ``MoESplitExpertsStateDictMixin``). It is functionally a
+clone of ``HYV3StateDictAdapter``; kept separate so future Hy-MT2-only
+key changes (e.g. an MTP / aux-head extension that Hy-MT2 ships but
+Hy3-preview does not) can be added here without affecting Hy3-preview.
+"""
+
+import logging
+import re
+from typing import Any, Optional
+
+import torch
+from torch.distributed.device_mesh import DeviceMesh
+
+from nemo_automodel.components.checkpoint.state_dict_adapter import StateDictAdapter
+from nemo_automodel.components.models.common import BackendConfig
+from nemo_automodel.components.moe.config import MoEConfig
+from nemo_automodel.components.moe.state_dict_mixin import MoESplitExpertsStateDictMixin
+
+logger = logging.getLogger(__name__)
+
+
+_NATIVE_TO_HF_RENAMES: tuple[tuple[re.Pattern[str], str], ...] = (  # (pattern, replacement): native -> HF
+    (re.compile(r"\.mlp\.gate\.e_score_correction_bias$"), ".mlp.expert_bias"),
+    (re.compile(r"\.mlp\.gate\.weight$"), ".mlp.router.gate.weight"),
+    (re.compile(r"\.mlp\.shared_experts\."), ".mlp.shared_mlp."),  # no ``$`` anchor: matches mid-key
+)
+_HF_TO_NATIVE_RENAMES: tuple[tuple[re.Pattern[str], str], ...] = (  # inverse table: HF -> native
+    (re.compile(r"\.mlp\.expert_bias$"), ".mlp.gate.e_score_correction_bias"),
+    (re.compile(r"\.mlp\.router\.gate\.weight$"), ".mlp.gate.weight"),
+    (re.compile(r"\.mlp\.shared_mlp\."), ".mlp.shared_experts."),  # no ``$`` anchor: matches mid-key
+)
+
+
+class HyMT2StateDictAdapter(MoESplitExpertsStateDictMixin, StateDictAdapter):
+    """Bridges Automodel native (grouped experts) and on-disk Hy-MT2 HF format.
+
+    Key renames come from the module-level rename tables; expert split /
+    merge is delegated to the ``MoESplitExpertsStateDictMixin`` helpers.
+    """
+
+    def __init__(
+        self,
+        config: Any,
+        moe_config: MoEConfig,
+        backend: BackendConfig,
+        dtype: torch.dtype = torch.bfloat16,
+    ):
+        """Record the conversion settings; nothing else is done here."""
+        self.config = config
+        self.moe_config = moe_config
+        self.backend = backend
+        self.dtype = dtype
+        self._uses_model_prefix = True
+
+    @staticmethod
+    def _first_matching_rename(key: str, table: tuple[tuple[re.Pattern[str], str], ...]) -> str:
+        """Return *key* rewritten by the first rule in *table* that substitutes, else *key* as-is."""
+        # Only the first rule that fires is applied; later rules are not tried.
+        for pattern, replacement in table:
+            candidate, hits = pattern.subn(replacement, key)
+            if hits:
+                return candidate
+        return key
+
+    def to_hf(
+        self,
+        state_dict: dict[str, Any],
+        exclude_key_regex: Optional[str] = None,
+        **kwargs,
+    ) -> dict[str, Any]:
+        """Native -> on-disk Hy-MT2 HF: per-expert split + name renames."""
+        converted: dict[str, Any] = {}
+        for native_key, value in self._to_hf_w_split_experts(state_dict).items():
+            hf_key = self._first_matching_rename(native_key, _NATIVE_TO_HF_RENAMES)
+            # The exclusion pattern is tested with ``re.match`` against the renamed key.
+            if exclude_key_regex and re.match(exclude_key_regex, hf_key):
+                continue
+            converted[hf_key] = value
+        return converted
+
+    def from_hf(
+        self,
+        hf_state_dict: dict[str, Any],
+        device_mesh: Optional[DeviceMesh] = None,
+        **kwargs,
+    ) -> dict[str, Any]:
+        """On-disk Hy-MT2 HF -> native: filter MTP, rename, then merge experts."""
+        native_named: dict[str, Any] = {
+            self._first_matching_rename(hf_key, _HF_TO_NATIVE_RENAMES): value
+            for hf_key, value in hf_state_dict.items()
+            if not self._is_mtp_key(hf_key)
+        }
+        return self._from_hf_w_merged_experts(native_named, device_mesh)
+
+    def convert_single_tensor_to_hf(
+        self,
+        fqn: str,
+        tensor: Any,
+        **kwargs,
+    ) -> list[tuple[str, Any]]:
+        """Per-tensor variant of ``to_hf`` for streaming-save code paths."""
+        exclude_key_regex = kwargs.get("exclude_key_regex", None)
+
+        pairs = self._convert_single_merged_expert_to_hf_split_experts(fqn, tensor, **kwargs)
+        if pairs is None:
+            pairs = [(fqn, tensor)]
+
+        converted: list[tuple[str, Any]] = []
+        for key, value in pairs:
+            hf_key = self._first_matching_rename(key, _NATIVE_TO_HF_RENAMES)
+            if exclude_key_regex and re.match(exclude_key_regex, hf_key):
+                continue
+            converted.append((hf_key, value))
+        return converted
+
+    def _is_mtp_key(self, key: str) -> bool:
+        """Return True if *key* belongs to an MTP layer (index >= num_hidden_layers).
+
+        Hy-MT2-30B-A3B does not appear to ship MTP layers in its public
+        checkpoint, but the filter is kept as a defensive no-op so the
+        adapter remains symmetric with ``HYV3StateDictAdapter``.
+        """
+        layer_count = getattr(self.config, "num_hidden_layers", 48)
+        match = re.match(r"(?:model\.)?layers\.(\d+)\.", key)
+        if match is None:
+            return False
+        return int(match.group(1)) >= layer_count
diff --git a/tests/unit_tests/models/hy_mt2/__init__.py b/tests/unit_tests/models/hy_mt2/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_config.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_config.py
new file mode 100644
index 0000000000..2b30ae20b9
--- /dev/null
+++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_config.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for ``HyMT2Config``."""
+
+from transformers import PretrainedConfig
+
+from nemo_automodel.components.models.hy_mt2.config import HyMT2Config
+
+
+class TestDefaults:
+    """Checks on a default-constructed ``HyMT2Config``."""
+
+    def test_model_type(self):
+        assert HyMT2Config.model_type == "hy_mt2"
+
+    def test_inherits_pretrained_config(self):
+        assert isinstance(HyMT2Config(), PretrainedConfig)
+
+    def test_default_attributes_match_30b_a3b(self):
+        defaults = HyMT2Config()
+        # Expected values mirror the published Hy-MT2-30B-A3B config.json.
+        assert defaults.vocab_size == 120832
+        assert defaults.hidden_size == 2048
+        assert defaults.intermediate_size == 6912
+        assert defaults.moe_intermediate_size == 768
+        assert defaults.expert_hidden_dim == 768
+        assert defaults.num_hidden_layers == 48
+        assert defaults.num_attention_heads == 32
+        assert defaults.num_key_value_heads == 4
+        assert defaults.head_dim == 128
+        assert defaults.num_experts == 128
+        assert defaults.num_shared_experts == 1
+        assert defaults.num_experts_per_tok == 8
+        assert defaults.first_k_dense_replace == 1
+        assert defaults.max_position_embeddings == 262144
+        assert defaults.rope_theta == 11158840.0
+        assert defaults.rms_norm_eps == 1e-5
+        assert defaults.attention_bias is False
+        assert defaults.hidden_act == "silu"
+        assert defaults.qk_norm is True
+        assert defaults.route_norm is True
+        assert defaults.router_scaling_factor == 2.826
+        assert defaults.moe_router_use_sigmoid is True
+        assert defaults.moe_router_enable_expert_bias is True
+        assert defaults.enable_lm_head_fp32 is True
+        assert defaults.enable_attention_fp32_softmax is False
+        assert defaults.enable_moe_fp32_combine is False
+        assert defaults.tie_word_embeddings is False
+        # Newer transformers auto-coerce ``torch_dtype`` in PretrainedConfig
+        # (deprecated in favour of ``dtype``), so accept the string we set or
+        # whatever form the base class normalizes it to.
+        assert defaults.torch_dtype in ("bfloat16", None) or str(defaults.torch_dtype).endswith("bfloat16")
+
+    def test_default_token_ids(self):
+        defaults = HyMT2Config()
+        token_ids = (defaults.pad_token_id, defaults.bos_token_id, defaults.eos_token_id)
+        assert token_ids == (120002, 120000, 120025)
+
+    def test_keys_to_ignore_at_inference(self):
+        assert HyMT2Config.keys_to_ignore_at_inference == ["past_key_values"]
+
+
+class TestOverrides:
+    """Constructor keyword overrides and pass-through kwargs."""
+
+    def test_override_attention_dims(self):
+        overridden = HyMT2Config(num_attention_heads=8, num_key_value_heads=2, head_dim=64, hidden_size=512)
+        assert overridden.num_attention_heads == 8
+        assert overridden.num_key_value_heads == 2
+        assert overridden.head_dim == 64
+        assert overridden.hidden_size == 512
+
+    def test_override_moe_routing(self):
+        overridden = HyMT2Config(
+            num_experts=64,
+            num_experts_per_tok=4,
+            num_shared_experts=2,
+            router_scaling_factor=1.5,
+            route_norm=False,
+        )
+        assert overridden.num_experts == 64
+        assert overridden.num_experts_per_tok == 4
+        assert overridden.num_shared_experts == 2
+        assert overridden.router_scaling_factor == 1.5
+        assert overridden.route_norm is False
+
+    def test_override_router_flavor(self):
+        overridden = HyMT2Config(moe_router_use_sigmoid=False, moe_router_enable_expert_bias=False)
+        assert overridden.moe_router_use_sigmoid is False
+        assert overridden.moe_router_enable_expert_bias is False
+
+    def test_truncated_layer_count(self):
+        assert HyMT2Config(num_hidden_layers=4).num_hidden_layers == 4
+
+    def test_first_k_dense_replace(self):
+        assert HyMT2Config(first_k_dense_replace=3).first_k_dense_replace == 3
+
+    def test_rope_overrides(self):
+        overridden = HyMT2Config(rope_theta=500000.0, max_position_embeddings=4096)
+        assert overridden.rope_theta == 500000.0
+        assert overridden.max_position_embeddings == 4096
+
+    def test_rope_scaling_dict(self):
+        scaling = {"factor": 8.0, "rope_type": "yarn"}
+        overridden = HyMT2Config(rope_scaling=scaling)
+        assert overridden.rope_scaling == scaling
+
+    def test_qk_norm_override(self):
+        overridden = HyMT2Config(qk_norm=False)
+        assert overridden.qk_norm is False
+
+    def test_lm_head_fp32_override(self):
+        overridden = HyMT2Config(enable_lm_head_fp32=False)
+        assert overridden.enable_lm_head_fp32 is False
+
+    def test_expert_hidden_dim_override(self):
+        overridden = HyMT2Config(expert_hidden_dim=1024)
+        assert overridden.expert_hidden_dim == 1024
+
+    def test_token_ids(self):
+        overridden = HyMT2Config(pad_token_id=0, bos_token_id=10, eos_token_id=11)
+        assert overridden.pad_token_id == 0
+        assert overridden.bos_token_id == 10
+        assert overridden.eos_token_id == 11
+
+    def test_super_init_kwargs_accepted(self):
+        # PretrainedConfig kwargs must flow through the constructor without raising.
+        HyMT2Config(use_cache=False, tie_word_embeddings=True)
+
+    def test_extra_kwargs_pass_through_super_init(self):
+        # Newer transformers no longer attach arbitrary **kwargs as fields;
+        # construction itself should still succeed.
+        overridden = HyMT2Config(custom_field="abc")
+        assert isinstance(overridden, HyMT2Config)
+
+
+class TestSerialization:
+    def test_to_dict_round_trip(self):
+        original = HyMT2Config(num_hidden_layers=4, num_experts=8, hidden_size=256)
+        dumped = original.to_dict()
+        assert dumped["model_type"] == "hy_mt2"
+        assert dumped["num_hidden_layers"] == 4
+        assert dumped["num_experts"] == 8
+
+        ctor_kwargs = {name: value for name, value in dumped.items() if name != "model_type"}
+        rebuilt = HyMT2Config(**ctor_kwargs)
+        assert rebuilt.num_hidden_layers == 4
+        assert rebuilt.num_experts == 8
+        assert rebuilt.hidden_size == 256
+
+    def test_model_type_class_attribute_not_overridden_by_instance(self):
+        assert HyMT2Config().model_type == "hy_mt2"
+        assert HyMT2Config.model_type == "hy_mt2"
diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_layers.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_layers.py
new file mode 100644
index 0000000000..ee5f014053
--- /dev/null
+++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_layers.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for ``HyMT2Attention``."""
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from nemo_automodel.components.models.common import BackendConfig
+from nemo_automodel.components.models.hy_mt2.config import HyMT2Config
+from nemo_automodel.components.models.hy_mt2.layers import HyMT2Attention
+
+pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")  # module-wide skip
+
+
+HIDDEN = 64  # hidden_size of the fixture config
+N_HEADS = 8  # num_attention_heads of the fixture config
+N_KV = 2  # num_key_value_heads of the fixture config
+HEAD_DIM = 16  # head_dim of the fixture config
+
+
+@pytest.fixture
+def device():
+    return torch.device("cuda", torch.cuda.current_device())  # the current CUDA device
+
+
+@pytest.fixture
+def config():
+    """Tiny single-layer ``HyMT2Config`` shared by the attention tests."""
+    attention_dims = {"num_attention_heads": N_HEADS, "num_key_value_heads": N_KV, "head_dim": HEAD_DIM}
+    return HyMT2Config(
+        vocab_size=128,
+        hidden_size=HIDDEN,
+        intermediate_size=128,
+        moe_intermediate_size=64,
+        num_hidden_layers=1,
+        max_position_embeddings=128,
+        rope_theta=10000.0,
+        rms_norm_eps=1e-5,
+        **attention_dims,
+    )
+
+
+@pytest.fixture
+def sdpa_backend():
+    """``BackendConfig`` with SDPA attention, no rope fusion and no HF state-dict adapter."""
+    # Every pluggable implementation slot is set to the "torch" option.
+    torch_impls = {"linear": "torch", "rms_norm": "torch", "experts": "torch", "dispatcher": "torch"}
+    return BackendConfig(
+        attn="sdpa",
+        fake_balanced_gate=False,
+        enable_hf_state_dict_adapter=False,
+        rope_fusion=False,
+        **torch_impls,
+    )
+
+
+def _make_freqs_cis(seq_len: int, device: torch.device) -> torch.Tensor:
+    """Return an all-zero ``(1, seq_len, HEAD_DIM)`` stand-in for ``freqs_cis`` (``bshd`` layout) on *device*."""
+    return torch.zeros((1, seq_len, HEAD_DIM), device=device)
+
+
+class TestInit:
+    """Construction-time attributes of ``HyMT2Attention``."""
+
+    def test_module_attributes(self, config, sdpa_backend):
+        module = HyMT2Attention(config, backend=sdpa_backend)
+        assert module.num_heads == N_HEADS
+        assert module.num_kv_heads == N_KV
+        assert module.head_dim == HEAD_DIM
+        assert module.backend is sdpa_backend
+        assert module.qk_norm_enabled is True
+
+    def test_projection_shapes(self, config, sdpa_backend):
+        module = HyMT2Attention(config, backend=sdpa_backend)
+        assert module.q_proj.weight.shape == (N_HEADS * HEAD_DIM, HIDDEN)
+        assert module.k_proj.weight.shape == (N_KV * HEAD_DIM, HIDDEN)
+        assert module.v_proj.weight.shape == (N_KV * HEAD_DIM, HIDDEN)
+        assert module.o_proj.weight.shape == (HIDDEN, N_HEADS * HEAD_DIM)
+
+    def test_q_k_norm_per_head_dim_when_enabled(self, config, sdpa_backend):
+        module = HyMT2Attention(config, backend=sdpa_backend)
+        assert module.q_norm is not None
+        assert module.k_norm is not None
+        assert module.q_norm.weight.shape == (HEAD_DIM,)
+        assert module.k_norm.weight.shape == (HEAD_DIM,)
+
+    def test_qk_norm_disabled_when_config_flag_false(self, config, sdpa_backend):
+        config.qk_norm = False
+        module = HyMT2Attention(config, backend=sdpa_backend)
+        assert module.qk_norm_enabled is False
+        assert module.q_norm is None
+        assert module.k_norm is None
+
+    def test_no_attention_bias_by_default(self, config, sdpa_backend):
+        module = HyMT2Attention(config, backend=sdpa_backend)
+        for proj in (module.q_proj, module.k_proj, module.v_proj, module.o_proj):
+            assert proj.bias is None
+
+
+class TestForward:
+    """Forward-pass smoke tests for ``HyMT2Attention`` on CUDA."""
+
+    def test_output_shape_bshd(self, config, sdpa_backend, device):
+        module = HyMT2Attention(config, backend=sdpa_backend).to(device)
+        bsz, seqlen = 2, 4
+        hidden_in = torch.randn(bsz, seqlen, HIDDEN, device=device, dtype=torch.bfloat16)
+        rope = _make_freqs_cis(seqlen, device)
+
+        out = module(hidden_in, freqs_cis=rope)
+        assert out.shape == (bsz, seqlen, HIDDEN)
+
+    def test_calls_q_k_v_o_projections(self, config, sdpa_backend, device):
+        module = HyMT2Attention(config, backend=sdpa_backend).to(device)
+        hidden_in = torch.randn(1, 3, HIDDEN, device=device, dtype=torch.bfloat16)
+        rope = _make_freqs_cis(3, device)
+        with (
+            patch.object(module.q_proj, "forward", wraps=module.q_proj.forward) as q_spy,
+            patch.object(module.k_proj, "forward", wraps=module.k_proj.forward) as k_spy,
+            patch.object(module.v_proj, "forward", wraps=module.v_proj.forward) as v_spy,
+            patch.object(module.o_proj, "forward", wraps=module.o_proj.forward) as o_spy,
+        ):
+            module(hidden_in, freqs_cis=rope)
+        for spy in (q_spy, k_spy, v_spy, o_spy):
+            spy.assert_called_once()
+
+    def test_forward_skips_norms_when_qk_norm_disabled(self, config, sdpa_backend, device):
+        config.qk_norm = False
+        module = HyMT2Attention(config, backend=sdpa_backend).to(device)
+        hidden_in = torch.randn(1, 3, HIDDEN, device=device, dtype=torch.bfloat16)
+        rope = _make_freqs_cis(3, device)
+        out = module(hidden_in, freqs_cis=rope)
+        assert out.shape == hidden_in.shape
+
+
+class TestInitWeights:
+    def test_resets_norms_and_linears_when_qk_norm_enabled(self, config, sdpa_backend, device):
+        module = HyMT2Attention(config, backend=sdpa_backend).to(device)
+        with (
+            patch.object(module.q_norm, "reset_parameters") as q_reset,
+            patch.object(module.k_norm, "reset_parameters") as k_reset,
+        ):
+            module.init_weights(buffer_device=device, init_std=0.01)
+        q_reset.assert_called_once()
+        k_reset.assert_called_once()
+
+    def test_init_weights_no_qk_norm(self, config, sdpa_backend, device):
+        config.qk_norm = False
+        module = HyMT2Attention(config, backend=sdpa_backend).to(device)
+        # q_norm / k_norm are None here; the call must still succeed.
+        module.init_weights(buffer_device=device, init_std=0.01)
diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py
new file mode 100644
index 0000000000..19e0a7975b
--- /dev/null
+++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py
@@ -0,0 +1,285 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for the Hy-MT2 Block / HyMT2Model / HyMT2ForCausalLM layers."""
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from nemo_automodel.components.models.common import BackendConfig
+from nemo_automodel.components.models.hy_mt2.config import HyMT2Config
+from nemo_automodel.components.models.hy_mt2.model import (
+    Block,
+    HyMT2ForCausalLM,
+    HyMT2Model,
+    ModelClass,
+    _resolve_score_func,
+)
+from nemo_automodel.components.moe.config import MoEConfig
+from nemo_automodel.components.moe.layers import MLP, MoE
+
+pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+
+
+HIDDEN = 64
+INTER = 128
+MOE_INTER = 64
+N_HEADS = 8
+N_KV = 2
+HEAD_DIM = 16
+N_EXPERTS = 4
+
+
+@pytest.fixture
+def device():
+    return torch.device(f"cuda:{torch.cuda.current_device()}")
+
+
+@pytest.fixture
+def config():
+    return HyMT2Config(
+        vocab_size=128,
+        hidden_size=HIDDEN,
+        intermediate_size=INTER,
+        moe_intermediate_size=MOE_INTER,
+        expert_hidden_dim=MOE_INTER,
+        num_hidden_layers=2,
+        num_attention_heads=N_HEADS,
+        num_key_value_heads=N_KV,
+        head_dim=HEAD_DIM,
+        num_experts=N_EXPERTS,
+        num_experts_per_tok=2,
+        num_shared_experts=1,
+        first_k_dense_replace=1,
+        max_position_embeddings=128,
+        rope_theta=10000.0,
+        rms_norm_eps=1e-5,
+        router_scaling_factor=2.826,
+        route_norm=True,
+        moe_router_use_sigmoid=True,
+        moe_router_enable_expert_bias=True,
+        enable_lm_head_fp32=True,
+    )
+
+
+@pytest.fixture
+def backend_config():
+    return BackendConfig(
+        linear="torch",
+        attn="sdpa",
+        rms_norm="torch",
+        experts="torch",
+        dispatcher="torch",
+        fake_balanced_gate=False,
+        gate_precision="float32",
+        rope_fusion=False,
+        enable_hf_state_dict_adapter=False,
+        enable_fsdp_optimizations=False,
+    )
+
+
+@pytest.fixture
+def moe_config(config):
+    return MoEConfig(
+        dim=config.hidden_size,
+        inter_dim=config.intermediate_size,
+        moe_inter_dim=config.moe_intermediate_size,
+        n_routed_experts=config.num_experts,
+        n_shared_experts=config.num_shared_experts,
+        n_activated_experts=config.num_experts_per_tok,
+        n_expert_groups=0,
+        n_limited_groups=0,
+        train_gate=True,
+        gate_bias_update_factor=0.0,
+        score_func="sigmoid",
+        route_scale=config.router_scaling_factor,
+        aux_loss_coeff=0.0,
+        norm_topk_prob=True,
+        expert_bias=False,
+        router_bias=False,
+        expert_activation="swiglu",
+        softmax_before_topk=False,
+        force_e_score_correction_bias=True,
+    )
+
+
+class TestResolveScoreFunc:
+    def test_default_is_sigmoid(self):
+        # Config without the flag falls back to sigmoid (Hy-MT2 default).
+        class _NoFlag:
+            pass
+
+        assert _resolve_score_func(_NoFlag()) == "sigmoid"
+
+    def test_true_maps_to_sigmoid(self):
+        class _Cfg:
+            moe_router_use_sigmoid = True
+
+        assert _resolve_score_func(_Cfg()) == "sigmoid"
+
+    def test_false_maps_to_softmax(self):
+        class _Cfg:
+            moe_router_use_sigmoid = False
+
+        assert _resolve_score_func(_Cfg()) == "softmax"
+
+
+class TestBlock:
+    def test_dense_layer_uses_mlp_when_idx_below_first_k_dense(self, config, moe_config, backend_config):
+        config.first_k_dense_replace = 1
+        block = Block(layer_idx=0, config=config, moe_config=moe_config, backend=backend_config)
+        assert isinstance(block.mlp, MLP)
+
+    def test_moe_layer_uses_moe_when_idx_at_or_above_first_k_dense(self, config, moe_config, backend_config):
+        config.first_k_dense_replace = 1
+        block = Block(layer_idx=1, config=config, moe_config=moe_config, backend=backend_config)
+        assert isinstance(block.mlp, MoE)
+
+    def test_block_has_required_submodules(self, config, moe_config, backend_config):
+        block = Block(layer_idx=1, config=config, moe_config=moe_config, backend=backend_config)
+        assert hasattr(block, "self_attn")
+        assert hasattr(block, "mlp")
+        assert hasattr(block, "input_layernorm")
+        assert hasattr(block, "post_attention_layernorm")
+        assert block.layer_idx == 1
+
+
+class TestHyMT2Model:
+    def test_construction_sets_components(self, config, backend_config):
+        model = HyMT2Model(config, backend=backend_config)
+        assert len(model.layers) == config.num_hidden_layers
+        assert model.embed_tokens.num_embeddings == config.vocab_size
+        assert model.norm is not None
+        assert model.rotary_emb.head_dim == config.head_dim
+        assert isinstance(model.moe_config, MoEConfig)
+
+    def test_dense_then_moe_layer_structure(self, config, backend_config):
+        config.first_k_dense_replace = 1
+        config.num_hidden_layers = 3
+        model = HyMT2Model(config, backend=backend_config)
+        assert isinstance(model.layers["0"].mlp, MLP)
+        assert isinstance(model.layers["1"].mlp, MoE)
+        assert isinstance(model.layers["2"].mlp, MoE)
+
+    def test_moe_config_inferred_from_config(self, config, backend_config):
+        model = HyMT2Model(config, backend=backend_config)
+        mc = model.moe_config
+        assert mc.dim == config.hidden_size
+        assert mc.moe_inter_dim == config.moe_intermediate_size
+        assert mc.n_routed_experts == config.num_experts
+        assert mc.n_activated_experts == config.num_experts_per_tok
+        assert mc.n_shared_experts == config.num_shared_experts
+        assert mc.score_func == "sigmoid"  # because moe_router_use_sigmoid=True
+        assert mc.expert_activation == "swiglu"
+        assert mc.route_scale == config.router_scaling_factor
+        assert mc.norm_topk_prob is True  # because route_norm=True
+        assert mc.force_e_score_correction_bias is True
+
+    def test_score_func_follows_use_sigmoid_flag(self, config, backend_config):
+        config.moe_router_use_sigmoid = False
+        model = HyMT2Model(config, backend=backend_config)
+        assert model.moe_config.score_func == "softmax"
+
+    def test_expert_hidden_dim_preferred_over_moe_intermediate(self, config, backend_config):
+        # When both are set, expert_hidden_dim wins for the expert MLP dim.
+        config.expert_hidden_dim = 32
+        config.moe_intermediate_size = 999  # would be wrong if used
+        model = HyMT2Model(config, backend=backend_config)
+        assert model.moe_config.moe_inter_dim == 32
+
+    def test_moe_overrides_take_effect(self, config, backend_config):
+        model = HyMT2Model(config, backend=backend_config, moe_overrides={"score_func": "softmax", "route_scale": 1.5})
+        assert model.moe_config.score_func == "softmax"
+        assert model.moe_config.route_scale == 1.5
+
+    def test_explicit_moe_config_and_overrides_conflict(self, config, backend_config, moe_config):
+        with pytest.raises(ValueError, match="Cannot pass both"):
+            HyMT2Model(config, backend=backend_config, moe_config=moe_config, moe_overrides={"score_func": "softmax"})
+
+
+class TestHyMT2ForCausalLM:
+    def test_model_class_alias(self):
+        assert ModelClass is HyMT2ForCausalLM
+
+    def test_construction(self, config, backend_config):
+        model = HyMT2ForCausalLM(config, backend=backend_config)
+        assert hasattr(model, "model")
+        assert hasattr(model, "lm_head")
+        assert model.config is config
+        assert model._enable_lm_head_fp32 is True
+
+    def test_enable_lm_head_fp32_default_false_without_config_flag(self, backend_config):
+        # When the config does not declare the flag, default to False.
+        class _Cfg:
+            vocab_size = 32
+            hidden_size = HIDDEN
+            intermediate_size = INTER
+            moe_intermediate_size = MOE_INTER
+            num_hidden_layers = 1
+            num_attention_heads = N_HEADS
+            num_key_value_heads = N_KV
+            head_dim = HEAD_DIM
+            num_experts = N_EXPERTS
+            num_experts_per_tok = 2
+            num_shared_experts = 1
+            first_k_dense_replace = 1
+            max_position_embeddings = 128
+            rope_theta = 10000.0
+            rms_norm_eps = 1e-5
+            torch_dtype = "bfloat16"
+            attention_bias = False
+            qk_norm = True
+            route_norm = False
+            router_scaling_factor = 1.0
+            moe_router_enable_expert_bias = False
+            moe_router_use_sigmoid = True
+
+        model = HyMT2ForCausalLM(_Cfg(), backend=backend_config)
+        assert model._enable_lm_head_fp32 is False
+
+    def test_lm_head_fp32_casts_back_to_input_dtype(self, config, backend_config, device):
+        model = HyMT2ForCausalLM(config, backend=backend_config).to(device).to(torch.bfloat16)
+        # Mock the inner backbone so we can control the dtype of its output.
+        bf16_hidden = torch.randn(1, 4, HIDDEN, device=device, dtype=torch.bfloat16)
+        with patch.object(model.model, "forward", return_value=bf16_hidden):
+            input_ids = torch.randint(0, config.vocab_size, (1, 4), device=device)
+            logits = model(input_ids)
+        # Output logits dtype must match the input hidden dtype, not fp32.
+        assert logits.dtype == torch.bfloat16
+
+    def test_lm_head_no_upcast_when_disabled(self, config, backend_config, device):
+        config.enable_lm_head_fp32 = False
+        model = HyMT2ForCausalLM(config, backend=backend_config).to(device).to(torch.bfloat16)
+        bf16_hidden = torch.randn(1, 4, HIDDEN, device=device, dtype=torch.bfloat16)
+        with patch.object(model.model, "forward", return_value=bf16_hidden):
+            input_ids = torch.randint(0, config.vocab_size, (1, 4), device=device)
+            logits = model(input_ids)
+        assert logits.dtype == torch.bfloat16
+
+    def test_get_set_input_embeddings(self, config, backend_config):
+        model = HyMT2ForCausalLM(config, backend=backend_config)
+        emb = model.get_input_embeddings()
+        assert emb is model.model.embed_tokens
+        new_emb = torch.nn.Embedding(8, HIDDEN)
+        model.set_input_embeddings(new_emb)
+        assert model.get_input_embeddings() is new_emb
+
+    def test_get_set_output_embeddings(self, config, backend_config):
+        model = HyMT2ForCausalLM(config, backend=backend_config)
+        assert model.get_output_embeddings() is model.lm_head
+        new_head = torch.nn.Linear(HIDDEN, 8, bias=False)
+        model.set_output_embeddings(new_head)
+        assert model.get_output_embeddings() is new_head
diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_state_dict_adapter.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_state_dict_adapter.py
new file mode 100644
index 0000000000..facaab67f1
--- /dev/null
+++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_state_dict_adapter.py
@@ -0,0 +1,299 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for ``HyMT2StateDictAdapter``.
+
+Covers the rename tables, per-expert split/merge inherited from
+``MoESplitExpertsStateDictMixin``, and the defensive MTP-layer filter.
+"""
+
+from unittest.mock import Mock
+
+import pytest
+import torch
+
+from nemo_automodel.components.models.common import BackendConfig
+from nemo_automodel.components.models.hy_mt2.state_dict_adapter import (
+    _HF_TO_NATIVE_RENAMES,
+    _NATIVE_TO_HF_RENAMES,
+    HyMT2StateDictAdapter,
+)
+from nemo_automodel.components.moe.config import MoEConfig
+
+N_EXPERTS = 4
+HIDDEN = 16
+MOE_INTER = 8
+NUM_LAYERS = 2  # layer 0 dense, layer 1 MoE
+
+
+@pytest.fixture
+def config():
+    cfg = Mock()
+    cfg.num_hidden_layers = NUM_LAYERS
+    cfg.hidden_size = HIDDEN
+    cfg.intermediate_size = 32
+    cfg.moe_intermediate_size = MOE_INTER
+    cfg.expert_hidden_dim = MOE_INTER
+    cfg.num_attention_heads = 4
+    cfg.num_key_value_heads = 2
+    cfg.num_experts = N_EXPERTS
+    cfg.num_experts_per_tok = 2
+    cfg.num_shared_experts = 1
+    cfg.first_k_dense_replace = 1
+    return cfg
+
+
+@pytest.fixture
+def moe_config():
+    return MoEConfig(
+        dim=HIDDEN,
+        inter_dim=32,
+        moe_inter_dim=MOE_INTER,
+        n_routed_experts=N_EXPERTS,
+        n_shared_experts=1,
+        n_activated_experts=2,
+        n_expert_groups=0,
+        n_limited_groups=0,
+        train_gate=True,
+        gate_bias_update_factor=0.0,
+        score_func="sigmoid",
+        route_scale=2.826,
+        aux_loss_coeff=0.0,
+        norm_topk_prob=True,
+        expert_bias=False,
+        router_bias=False,
+        expert_activation="swiglu",
+        softmax_before_topk=False,
+        force_e_score_correction_bias=True,
+    )
+
+
+@pytest.fixture
+def backend_config():
+    return BackendConfig(
+        linear="torch",
+        attn="sdpa",
+        rms_norm="torch",
+        experts="torch",
+        dispatcher="torch",
+        fake_balanced_gate=False,
+        enable_hf_state_dict_adapter=False,
+    )
+
+
+@pytest.fixture
+def adapter(config, moe_config, backend_config):
+    return HyMT2StateDictAdapter(config=config, moe_config=moe_config, backend=backend_config, dtype=torch.float32)
+
+
+def _make_disk_state_dict(*, with_mtp: bool = False):
+    """Synthesize an on-disk Hy-MT2 (== Hy3-preview key layout) state dict."""
+    sd: dict[str, torch.Tensor] = {
+        "model.embed_tokens.weight": torch.randn(32, HIDDEN),
+        "model.norm.weight": torch.randn(HIDDEN),
+        "lm_head.weight": torch.randn(32, HIDDEN),
+        # Layer 0: dense
+        "model.layers.0.input_layernorm.weight": torch.randn(HIDDEN),
+        "model.layers.0.post_attention_layernorm.weight": torch.randn(HIDDEN),
+        "model.layers.0.self_attn.q_proj.weight": torch.randn(HIDDEN, HIDDEN),
+        "model.layers.0.self_attn.k_proj.weight": torch.randn(HIDDEN // 2, HIDDEN),
+        "model.layers.0.self_attn.v_proj.weight": torch.randn(HIDDEN // 2, HIDDEN),
+        "model.layers.0.self_attn.o_proj.weight": torch.randn(HIDDEN, HIDDEN),
+        "model.layers.0.mlp.gate_proj.weight": torch.randn(32, HIDDEN),
+        "model.layers.0.mlp.up_proj.weight": torch.randn(32, HIDDEN),
+        "model.layers.0.mlp.down_proj.weight": torch.randn(HIDDEN, 32),
+        # Layer 1: MoE with on-disk Tencent-internal names
+        "model.layers.1.input_layernorm.weight": torch.randn(HIDDEN),
+        "model.layers.1.post_attention_layernorm.weight": torch.randn(HIDDEN),
+        "model.layers.1.self_attn.q_proj.weight": torch.randn(HIDDEN, HIDDEN),
+        "model.layers.1.self_attn.k_proj.weight": torch.randn(HIDDEN // 2, HIDDEN),
+        "model.layers.1.self_attn.v_proj.weight": torch.randn(HIDDEN // 2, HIDDEN),
+        "model.layers.1.self_attn.o_proj.weight": torch.randn(HIDDEN, HIDDEN),
+        "model.layers.1.mlp.router.gate.weight": torch.randn(N_EXPERTS, HIDDEN),
+        "model.layers.1.mlp.expert_bias": torch.randn(N_EXPERTS),
+        "model.layers.1.mlp.shared_mlp.gate_proj.weight": torch.randn(MOE_INTER, HIDDEN),
+        "model.layers.1.mlp.shared_mlp.up_proj.weight": torch.randn(MOE_INTER, HIDDEN),
+        "model.layers.1.mlp.shared_mlp.down_proj.weight": torch.randn(HIDDEN, MOE_INTER),
+    }
+    for e in range(N_EXPERTS):
+        sd[f"model.layers.1.mlp.experts.{e}.gate_proj.weight"] = torch.randn(MOE_INTER, HIDDEN)
+        sd[f"model.layers.1.mlp.experts.{e}.up_proj.weight"] = torch.randn(MOE_INTER, HIDDEN)
+        sd[f"model.layers.1.mlp.experts.{e}.down_proj.weight"] = torch.randn(HIDDEN, MOE_INTER)
+    if with_mtp:
+        sd[f"model.layers.{NUM_LAYERS}.input_layernorm.weight"] = torch.randn(HIDDEN)
+        sd[f"model.layers.{NUM_LAYERS}.mlp.expert_bias"] = torch.randn(N_EXPERTS)
+    return sd
+
+
+class TestInitialization:
+    def test_attributes_set(self, config, moe_config, backend_config):
+        a = HyMT2StateDictAdapter(config=config, moe_config=moe_config, backend=backend_config, dtype=torch.float16)
+        assert a.config is config
+        assert a.moe_config is moe_config
+        assert a.backend is backend_config
+        assert a.dtype == torch.float16
+        assert a._uses_model_prefix is True
+
+    def test_default_dtype_is_bfloat16(self, config, moe_config, backend_config):
+        a = HyMT2StateDictAdapter(config=config, moe_config=moe_config, backend=backend_config)
+        assert a.dtype == torch.bfloat16
+
+    def test_inherits_split_experts_mixin(self, adapter):
+        from nemo_automodel.components.moe.state_dict_mixin import MoESplitExpertsStateDictMixin
+
+        assert isinstance(adapter, MoESplitExpertsStateDictMixin)
+
+
+class TestRenameTables:
+    """Each rename pattern must be reversible: native -> hf -> native."""
+
+    @pytest.mark.parametrize(
+        "native, hf",
+        [
+            ("model.layers.5.mlp.gate.e_score_correction_bias", "model.layers.5.mlp.expert_bias"),
+            ("model.layers.5.mlp.gate.weight", "model.layers.5.mlp.router.gate.weight"),
+            ("model.layers.5.mlp.shared_experts.gate_proj.weight", "model.layers.5.mlp.shared_mlp.gate_proj.weight"),
+            ("model.layers.5.mlp.shared_experts.up_proj.weight", "model.layers.5.mlp.shared_mlp.up_proj.weight"),
+            ("model.layers.5.mlp.shared_experts.down_proj.weight", "model.layers.5.mlp.shared_mlp.down_proj.weight"),
+        ],
+    )
+    def test_round_trip(self, native, hf):
+        nk = native
+        for pat, repl in _NATIVE_TO_HF_RENAMES:
+            nk, n = pat.subn(repl, nk)
+            if n:
+                break
+        assert nk == hf
+
+        hk = hf
+        for pat, repl in _HF_TO_NATIVE_RENAMES:
+            hk, n = pat.subn(repl, hk)
+            if n:
+                break
+        assert hk == native
+
+    def test_unrelated_keys_pass_through(self):
+        """Renames must not touch attention, embed, lm_head, layernorm, or dense MLP keys."""
+        for k in (
+            "model.embed_tokens.weight",
+            "lm_head.weight",
+            "model.layers.0.self_attn.q_proj.weight",
+            "model.layers.0.input_layernorm.weight",
+            "model.layers.0.mlp.gate_proj.weight",  # dense MLP gate_proj must NOT match
+            "model.norm.weight",
+        ):
+            for tab in (_NATIVE_TO_HF_RENAMES, _HF_TO_NATIVE_RENAMES):
+                v = k
+                for pat, repl in tab:
+                    v, n = pat.subn(repl, v)
+                    if n:
+                        break
+                assert v == k, f"{k} unexpectedly renamed to {v}"
+
+
+class TestFromHF:
+    def test_renames_router_gate(self, adapter):
+        native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None)
+        assert "model.layers.1.mlp.gate.weight" in native
+        assert "model.layers.1.mlp.router.gate.weight" not in native
+
+    def test_renames_expert_bias_to_gate_bias(self, adapter):
+        native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None)
+        assert "model.layers.1.mlp.gate.e_score_correction_bias" in native
+        assert "model.layers.1.mlp.expert_bias" not in native
+
+    def test_renames_shared_mlp_to_shared_experts(self, adapter):
+        native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None)
+        for proj in ("gate_proj", "up_proj", "down_proj"):
+            assert f"model.layers.1.mlp.shared_experts.{proj}.weight" in native
+            assert f"model.layers.1.mlp.shared_mlp.{proj}.weight" not in native
+
+    def test_merges_experts_into_grouped_form(self, adapter):
+        native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None)
+        for e in range(N_EXPERTS):
+            for proj in ("gate_proj", "up_proj", "down_proj"):
+                assert f"model.layers.1.mlp.experts.{e}.{proj}.weight" not in native
+        assert "model.layers.1.mlp.experts.gate_and_up_projs" in native
+        assert "model.layers.1.mlp.experts.down_projs" in native
+
+    def test_merged_shapes_are_native_layout(self, adapter):
+        native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None)
+        assert tuple(native["model.layers.1.mlp.experts.gate_and_up_projs"].shape) == (
+            N_EXPERTS,
+            HIDDEN,
+            2 * MOE_INTER,
+        )
+        assert tuple(native["model.layers.1.mlp.experts.down_projs"].shape) == (
+            N_EXPERTS,
+            MOE_INTER,
+            HIDDEN,
+        )
+
+    def test_drops_mtp_layer_keys(self, adapter):
+        hf = _make_disk_state_dict(with_mtp=True)
+        assert any(k.startswith(f"model.layers.{NUM_LAYERS}.") for k in hf)
+        native = adapter.from_hf(hf, device_mesh=None)
+        assert not any(k.startswith(f"model.layers.{NUM_LAYERS}.") for k in native)
+
+
+class TestToHF:
+    def test_renames_native_back_to_on_disk(self, adapter):
+        # Build a minimal native state dict; reuse from_hf to produce it.
+        native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None)
+        hf = adapter.to_hf(native)
+        # On-disk renames present after to_hf
+        assert "model.layers.1.mlp.router.gate.weight" in hf
+        assert "model.layers.1.mlp.expert_bias" in hf
+        for proj in ("gate_proj", "up_proj", "down_proj"):
+            assert f"model.layers.1.mlp.shared_mlp.{proj}.weight" in hf
+        # Native-only names must be gone.
+        assert "model.layers.1.mlp.gate.weight" not in hf
+        assert "model.layers.1.mlp.gate.e_score_correction_bias" not in hf
+
+    def test_splits_grouped_experts_to_per_expert(self, adapter):
+        native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None)
+        hf = adapter.to_hf(native)
+        # Per-expert keys re-appear after splitting.
+        for e in range(N_EXPERTS):
+            for proj in ("gate_proj", "up_proj", "down_proj"):
+                assert f"model.layers.1.mlp.experts.{e}.{proj}.weight" in hf
+        # Grouped keys gone.
+        assert "model.layers.1.mlp.experts.gate_and_up_projs" not in hf
+        assert "model.layers.1.mlp.experts.down_projs" not in hf
+
+    def test_round_trip_preserves_per_expert_weights(self, adapter):
+        """A full disk -> native -> disk round-trip preserves expert weights."""
+        disk = _make_disk_state_dict()
+        native = adapter.from_hf(disk, device_mesh=None)
+        round_tripped = adapter.to_hf(native)
+        for e in range(N_EXPERTS):
+            for proj in ("gate_proj", "up_proj", "down_proj"):
+                key = f"model.layers.1.mlp.experts.{e}.{proj}.weight"
+                assert key in round_tripped
+                assert torch.allclose(round_tripped[key].to(disk[key].dtype), disk[key])
+
+
+class TestMTPFilter:
+    def test_filters_layer_at_num_hidden(self, adapter):
+        assert adapter._is_mtp_key(f"model.layers.{NUM_LAYERS}.foo") is True
+        assert adapter._is_mtp_key(f"layers.{NUM_LAYERS}.foo") is True
+
+    def test_does_not_filter_in_range_layers(self, adapter):
+        assert adapter._is_mtp_key("model.layers.0.foo") is False
+        assert adapter._is_mtp_key(f"model.layers.{NUM_LAYERS - 1}.foo") is False
+
+    def test_does_not_filter_non_layer_keys(self, adapter):
+        assert adapter._is_mtp_key("model.embed_tokens.weight") is False
+        assert adapter._is_mtp_key("lm_head.weight") is False
+        assert adapter._is_mtp_key("model.norm.weight") is False

From a21e01422236dfd6874e777923df65a31c4ea87b Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 27 May 2026 11:25:55 +0800
Subject: [PATCH 02/20] feat(models): add Hy-MT2 config-shape dispatcher and
 fp32 lm_head fix

Two refinements on top of a7d91c39:

1. **Config-shape dispatcher** in ``_transformers/model_init.py``: when
   ``architectures: ["HYV3ForCausalLM"]`` is paired with the Hy-MT2-30B-A3B
   config fingerprint (hidden=2048, 48 layers, 128 experts,
   expert_hidden_dim=768, ``enable_lm_head_fp32`` present), resolve to
   ``HyMT2ForCausalLM`` instead of the default ``HYV3ForCausalLM``. Hy3-preview
   (hidden=4096, 80 layers, 192 experts) still resolves to ``HYV3ForCausalLM``.
   Two tests in ``test_model_init.py`` lock this dispatch in.

2. **lm_head fp32 dtype fix** in ``HyMT2ForCausalLM.forward``: when
   ``enable_lm_head_fp32`` is on, the upcast path called
   ``self.lm_head(hidden.float())``, which fails because ``lm_head.weight``
   stays in bf16 after ``cast_model_to_dtype`` while the input is fp32.
   Replace it with an explicit
   ``F.linear(hidden.float(), self.lm_head.weight.float(), bias)`` (``bias`` is
   ``self.lm_head.bias.float()`` when a bias exists, else ``None``) so all
   operands are fp32; the result is cast back to the original dtype.

The example YAML keeps its fully-qualified ``HyMT2ForCausalLM`` target (only
the header comment is reworded); combined with (1) the checkpoint can also be
loaded via ``NeMoAutoModelForCausalLM``, which gives users both an explicit
and an auto-dispatch path.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 .../hy_mt2/hy_mt2_30b_a3b_sft.yaml            | 14 ++++----
 nemo_automodel/_transformers/model_init.py    | 18 ++++++++++
 .../components/models/hy_mt2/model.py         |  8 ++++-
 .../_transformers/test_model_init.py          | 36 +++++++++++++++++++
 4 files changed, 69 insertions(+), 7 deletions(-)

diff --git a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
index 9f2b71d8d9..cb26198ed6 100644
--- a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
+++ b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
@@ -26,12 +26,14 @@
 # EP size must divide num_experts (128). ep_size=8 -> 16 experts per rank.
 # Other valid EP sizes: 1, 2, 4, 16, 32, 64, 128.
 #
-# Note: the on-disk checkpoint declares ``architectures: ["HYV3ForCausalLM"]``,
-# so NeMoAutoModel's registry would dispatch to ``components/models/hy_v3``.
-# This recipe deliberately bypasses the registry and instantiates the new
-# ``HyMT2ForCausalLM`` directly via ``_target_``, isolating Hy-MT2 logic
-# (sigmoid-routing flag, ``enable_lm_head_fp32``, expert_hidden_dim) from
-# the existing Hy3-preview support.
+# Note: the on-disk checkpoint declares ``architectures: ["HYV3ForCausalLM"]``
+# and ``model_type: "hy_v3"``, which NeMoAutoModel's string-keyed registry maps
+# to ``components/models/hy_v3`` (Hy3-preview). This recipe deliberately
+# bypasses the registry by instantiating ``HyMT2ForCausalLM`` via a
+# fully-qualified ``_target_`` -- this keeps the Hy-MT2-specific logic
+# (``moe_router_use_sigmoid`` dispatch, ``enable_lm_head_fp32`` in-forward
+# upcast, ``expert_hidden_dim`` preference) isolated from the existing
+# Hy3-preview support without registry surgery.
 
 recipe: TrainFinetuneRecipeForNextTokenPrediction
 
diff --git a/nemo_automodel/_transformers/model_init.py b/nemo_automodel/_transformers/model_init.py
index a351920582..de12748498 100644
--- a/nemo_automodel/_transformers/model_init.py
+++ b/nemo_automodel/_transformers/model_init.py
@@ -214,6 +214,19 @@ def _is_config_compatible_with_custom_model(arch_name: str, config) -> bool:
     return True
 
 
+def _is_hy_mt2_config(config) -> bool:
+    """Return whether a ``hy_v3`` config describes Tencent Hy-MT2-30B-A3B."""
+    return (
+        getattr(config, "model_type", None) == "hy_v3"
+        and getattr(config, "hidden_size", None) == 2048
+        and getattr(config, "num_hidden_layers", None) == 48
+        and getattr(config, "num_experts", None) == 128
+        and getattr(config, "expert_hidden_dim", None) == 768
+        and getattr(config, "moe_intermediate_size", None) == 768
+        and hasattr(config, "enable_lm_head_fp32")
+    )
+
+
 def _resolve_custom_model_cls_for_config(config):
     """Resolve the custom model class for *config*, if the config is compatible."""
     architectures = get_architectures(config)
@@ -221,6 +234,11 @@ def _resolve_custom_model_cls_for_config(config):
         return None
 
     arch_name = architectures[0]
+    if arch_name == "HYV3ForCausalLM" and _is_hy_mt2_config(config):
+        from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM
+
+        return HyMT2ForCausalLM
+
     if not ModelRegistry.has_custom_model(arch_name):
         return None
 
diff --git a/nemo_automodel/components/models/hy_mt2/model.py b/nemo_automodel/components/models/hy_mt2/model.py
index db2d85db8e..8af9a979ef 100644
--- a/nemo_automodel/components/models/hy_mt2/model.py
+++ b/nemo_automodel/components/models/hy_mt2/model.py
@@ -40,6 +40,7 @@
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 from nemo_automodel.components.models.common import (
     BackendConfig,
@@ -373,7 +374,12 @@ def forward(
             # not surprised by an fp32 tensor. Matches the HF reference's
             # ``enable_lm_head_fp32`` behavior.
             original_dtype = hidden.dtype
-            logits = self.lm_head(hidden.float()).to(original_dtype)
+            lm_head_bias = getattr(self.lm_head, "bias", None)
+            logits = F.linear(
+                hidden.float(),
+                self.lm_head.weight.float(),
+                lm_head_bias.float() if lm_head_bias is not None else None,
+            ).to(original_dtype)
         else:
             logits = self.lm_head(hidden)
 
diff --git a/tests/unit_tests/_transformers/test_model_init.py b/tests/unit_tests/_transformers/test_model_init.py
index 5318719ad5..4ece5371d1 100644
--- a/tests/unit_tests/_transformers/test_model_init.py
+++ b/tests/unit_tests/_transformers/test_model_init.py
@@ -15,6 +15,7 @@
 """Tests for nested config override handling in get_hf_config and _consume_config_overrides."""
 
 import os
+from types import SimpleNamespace
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -27,6 +28,7 @@
     _init_model,
     _load_config_with_layer_types_fix,
     _propagate_torch_dtype_to_subconfigs,
+    _resolve_custom_model_cls_for_config,
     _resolve_model_dir,
     _setup_bnb_loading_kwargs,
     _stream_load_bnb_weights,
@@ -37,6 +39,40 @@
 from nemo_automodel.components.models.common.utils import BackendConfig
 
 
+class TestHyMT2ModelResolution:
+    """Hy-MT2 shares HYV3ForCausalLM metadata but needs its own implementation."""
+
+    def test_hy_mt2_config_resolves_to_hy_mt2_model(self):
+        from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM
+
+        config = SimpleNamespace(
+            architectures=["HYV3ForCausalLM"],
+            model_type="hy_v3",
+            hidden_size=2048,
+            num_hidden_layers=48,
+            num_experts=128,
+            expert_hidden_dim=768,
+            moe_intermediate_size=768,
+            enable_lm_head_fp32=True,
+        )
+
+        assert _resolve_custom_model_cls_for_config(config) is HyMT2ForCausalLM
+
+    def test_hy_v3_config_still_resolves_to_hy_v3_model(self):
+        from nemo_automodel.components.models.hy_v3.model import HYV3ForCausalLM
+
+        config = SimpleNamespace(
+            architectures=["HYV3ForCausalLM"],
+            model_type="hy_v3",
+            hidden_size=4096,
+            num_hidden_layers=80,
+            num_experts=192,
+            moe_intermediate_size=1536,
+        )
+
+        assert _resolve_custom_model_cls_for_config(config) is HYV3ForCausalLM
+
+
 class TestConsumeConfigOverridesNestedDict:
     """Nested dict overrides should be deep-merged into sub-config objects."""
 

From 97492ff4c96885346f5ee773e625eaf4511b9324 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 27 May 2026 11:35:02 +0800
Subject: [PATCH 03/20] docs(examples): fix Hy-MT2 launch command in YAML
 comment

The header comment showed ``automodel finetune llm -c <yaml> ...`` which is
not the real CLI signature -- ``nemo_automodel/cli/app.py:76-81`` takes the
YAML path as the first positional argument, so the previous form silently
treated ``finetune`` as the config path and failed with FileNotFoundError
on ``./finetune``. Update the comment to match the actual usage:

    automodel <config.yaml> --nproc-per-node 8

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
index cb26198ed6..663d36ef7a 100644
--- a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
+++ b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
@@ -20,8 +20,7 @@
 # vocab=120832, 256K context, rope_theta=11158840, qk_norm.
 #
 # Hardware target: 8 GPUs (80 GB+ each) for full SFT with EP8 + FSDP2.
-#   automodel finetune llm -c examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml \
-#     --nproc-per-node 8
+#   automodel examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml --nproc-per-node 8
 #
 # EP size must divide num_experts (128). ep_size=8 -> 16 experts per rank.
 # Other valid EP sizes: 1, 2, 4, 16, 32, 64, 128.

From 3e8c3c0a46eb8d54fd0d884e21fc57f8750c024c Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 27 May 2026 12:04:49 +0800
Subject: [PATCH 04/20] fix(models): use nn.Linear (DTensor-aware) on Hy-MT2
 lm_head fp32 path

The in-model ``enable_lm_head_fp32`` path called ``F.linear`` directly with
``self.lm_head.weight.float()``. Under FSDP2 the lm_head weight is a
DTensor, and ``F.linear`` does not handle DTensor redistribution -- the
hidden state is a plain torch.Tensor, so the matmul crashes with::

    RuntimeError: aten.mm.default got mixed torch.Tensor and DTensor,
    need to convert all torch.Tensor to DTensor before calling
    distributed operators!

Drop the explicit ``F.linear`` and rely on ``self.lm_head(...)`` instead;
``nn.Linear.forward`` is DTensor-aware and will redistribute the input as
needed. To avoid the dtype mismatch that originally motivated the manual
weight upcast (fp32 input vs. bf16 weight), only upcast the input when
``lm_head.weight`` has already been promoted to fp32 -- which is exactly
what the YAML's ``distributed.moe.lm_head_precision: float32`` path does
via the MoE parallelizer's ``MixedPrecisionPolicy``. If the weight is still
in the model dtype, fall through to the standard ``self.lm_head(hidden)``
path; no fp32 upcast happens in that case.

Also drop the now-unused ``torch.nn.functional`` import and update the
unit tests to validate the new condition (weight promoted -> upcast
runs; weight not promoted -> fall through).

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 .../components/models/hy_mt2/model.py         | 22 +++++++--------
 .../models/hy_mt2/test_hy_mt2_model.py        | 27 ++++++++++++++++---
 2 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/nemo_automodel/components/models/hy_mt2/model.py b/nemo_automodel/components/models/hy_mt2/model.py
index 8af9a979ef..72b7f015f8 100644
--- a/nemo_automodel/components/models/hy_mt2/model.py
+++ b/nemo_automodel/components/models/hy_mt2/model.py
@@ -40,7 +40,6 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 
 from nemo_automodel.components.models.common import (
     BackendConfig,
@@ -368,18 +367,17 @@ def forward(
 
         if self.lm_head is None:
             logits = hidden
-        elif self._enable_lm_head_fp32 and hidden.dtype != torch.float32:
-            # Upcast input to fp32 for the lm_head matmul, then cast logits
-            # back to the model dtype so downstream loss / sampling code is
-            # not surprised by an fp32 tensor. Matches the HF reference's
-            # ``enable_lm_head_fp32`` behavior.
+        elif self._enable_lm_head_fp32 and self.lm_head.weight.dtype == torch.float32 and hidden.dtype != torch.float32:
+            # The MoE parallelizer (``distributed.moe.lm_head_precision:
+            # float32`` in the YAML) has already promoted ``lm_head.weight`` to
+            # fp32. Feed it fp32 input via ``nn.Linear`` -- which is
+            # DTensor-aware under FSDP2 -- and cast logits back to the input
+            # dtype. We must NOT use ``F.linear`` directly with a manually
+            # ``.float()``-ed weight here, because that bypasses nn.Linear's
+            # DTensor redistribution and crashes with
+            # "got mixed torch.Tensor and DTensor".
             original_dtype = hidden.dtype
-            lm_head_bias = self.lm_head.bias if getattr(self.lm_head, "bias", None) is not None else None
-            logits = F.linear(
-                hidden.float(),
-                self.lm_head.weight.float(),
-                lm_head_bias.float() if lm_head_bias is not None else None,
-            ).to(original_dtype)
+            logits = self.lm_head(hidden.float()).to(original_dtype)
         else:
             logits = self.lm_head(hidden)
 
diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py
index 19e0a7975b..20705d6d3c 100644
--- a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py
+++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py
@@ -250,14 +250,35 @@ class _Cfg:
         model = HyMT2ForCausalLM(_Cfg(), backend=backend_config)
         assert model._enable_lm_head_fp32 is False
 
-    def test_lm_head_fp32_casts_back_to_input_dtype(self, config, backend_config, device):
+    def test_lm_head_fp32_upcast_when_weight_promoted(self, config, backend_config, device):
+        """When the parallelizer has promoted lm_head.weight to fp32 (the
+        ``distributed.moe.lm_head_precision: float32`` path), the in-model
+        fallback feeds the bf16 hidden state up to fp32, runs lm_head, and
+        casts logits back to bf16."""
         model = HyMT2ForCausalLM(config, backend=backend_config).to(device).to(torch.bfloat16)
-        # Mock the inner backbone so we can control the dtype of its output.
+        # Simulate the parallelizer's promotion of lm_head.weight to fp32.
+        model.lm_head = model.lm_head.to(torch.float32)
+        bf16_hidden = torch.randn(1, 4, HIDDEN, device=device, dtype=torch.bfloat16)
+        with patch.object(model.model, "forward", return_value=bf16_hidden):
+            input_ids = torch.randint(0, config.vocab_size, (1, 4), device=device)
+            logits = model(input_ids)
+        # Output dtype must be the input dtype (bf16), not fp32.
+        assert logits.dtype == torch.bfloat16
+
+    def test_lm_head_no_upcast_when_weight_is_bf16(self, config, backend_config, device):
+        """If the parallelizer did NOT promote lm_head.weight, the model must
+        fall through to ``self.lm_head(hidden)`` without trying to upcast,
+        to avoid the dtype mismatch (fp32 input vs bf16 weight) and the
+        ``F.linear`` DTensor mixing crash that the prior implementation hit."""
+        model = HyMT2ForCausalLM(config, backend=backend_config).to(device).to(torch.bfloat16)
+        # lm_head.weight is bf16 (no promotion). Even though enable_lm_head_fp32
+        # is True on the config, the in-model path must NOT activate.
+        assert model.lm_head.weight.dtype == torch.bfloat16
+        assert model._enable_lm_head_fp32 is True
         bf16_hidden = torch.randn(1, 4, HIDDEN, device=device, dtype=torch.bfloat16)
         with patch.object(model.model, "forward", return_value=bf16_hidden):
             input_ids = torch.randint(0, config.vocab_size, (1, 4), device=device)
             logits = model(input_ids)
-        # Output logits dtype must match the input hidden dtype, not fp32.
         assert logits.dtype == torch.bfloat16
 
     def test_lm_head_no_upcast_when_disabled(self, config, backend_config, device):

From c9945eb16c298359cb8dd2b0a8dc9eb9526a5e38 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 27 May 2026 12:20:40 +0800
Subject: [PATCH 05/20] fix(examples): route Hy-MT2 example via NeMoAutoModel
 for weight loading

The fully-qualified ``_target_: HyMT2ForCausalLM.from_pretrained`` path
bypasses ``_transformers/model_init.py``, which is where the HF
safetensors loader actually runs. Our class method only invokes
``AutoConfig.from_pretrained`` and ``cls.from_config(...)`` -- the
returned model has the right architecture but random weights, so SFT
starts at ``loss ~= ln(vocab) = 11.7`` instead of starting from the
pre-trained weights.

Switch the YAML back to ``NeMoAutoModelForCausalLM.from_pretrained``.
The config-shape dispatcher added in a21e0142 will still route this to
``HyMT2ForCausalLM`` (hidden=2048 + 48 layers + 128 experts +
``enable_lm_head_fp32``), and the standard NeMoAutoModel loader pipeline
will then stream the safetensors through ``HyMT2StateDictAdapter`` into
the FSDP2 / EP-sharded parameters.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 .../llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
index 663d36ef7a..26ff83caa8 100644
--- a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
+++ b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
@@ -26,13 +26,14 @@
 # Other valid EP sizes: 1, 2, 4, 16, 32, 64, 128.
 #
 # Note: the on-disk checkpoint declares ``architectures: ["HYV3ForCausalLM"]``
-# and ``model_type: "hy_v3"``, which NeMoAutoModel's string-keyed registry maps
-# to ``components/models/hy_v3`` (Hy3-preview). This recipe deliberately
-# bypasses the registry by instantiating ``HyMT2ForCausalLM`` via a
-# fully-qualified ``_target_`` -- this keeps the Hy-MT2-specific logic
-# (``moe_router_use_sigmoid`` dispatch, ``enable_lm_head_fp32`` in-forward
-# upcast, ``expert_hidden_dim`` preference) isolated from the existing
-# Hy3-preview support without registry surgery.
+# and ``model_type: "hy_v3"``. NeMoAutoModel's model resolver
+# (``_transformers/model_init.py``) detects the Hy-MT2-30B-A3B config
+# fingerprint (hidden=2048, 48 layers, 128 experts, ``enable_lm_head_fp32``)
+# and dispatches to ``HyMT2ForCausalLM`` instead of the default
+# ``HYV3ForCausalLM``. Going through ``NeMoAutoModelForCausalLM`` here is
+# important: it runs the full HF safetensors loader, while a fully-qualified
+# ``_target_: HyMT2ForCausalLM.from_pretrained`` would only construct the
+# architecture with random weights.
 
 recipe: TrainFinetuneRecipeForNextTokenPrediction
 
@@ -54,7 +55,7 @@ rng:
   ranked: true
 
 model:
-  _target_: nemo_automodel.components.models.hy_mt2.model.HyMT2ForCausalLM.from_pretrained
+  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
   pretrained_model_name_or_path: tencent/Hy-MT2-30B-A3B
   torch_dtype: bfloat16
   backend:

From 4965a993b745e4ee1d8b8a76253827a6201fb8e2 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 27 May 2026 16:09:07 +0800
Subject: [PATCH 06/20] fix(transformers): register HyMT2ForCausalLM in
 MODEL_ARCH_MAPPING

The L0 unit test ``test_all_model_folders_registered_in_auto_map``
scans every ``components/models/*/model.py`` and asserts the architecture
has a matching ``MODEL_ARCH_MAPPING`` entry. ``hy_mt2`` was missing
because its dispatch is done via the config-shape detector in
``_transformers/model_init.py`` rather than the string-keyed registry,
so the meta-test flagged it.

Add the registry entry so the meta-test passes. HF checkpoints declare
``architectures: ["HYV3ForCausalLM"]``, so this new key will only be hit
if a user explicitly writes ``architectures: ["HyMT2ForCausalLM"]`` in
their config -- the standard config-shape dispatch path is unchanged.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 nemo_automodel/_transformers/registry.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/nemo_automodel/_transformers/registry.py b/nemo_automodel/_transformers/registry.py
index b1160ffe1b..16b25272ee 100644
--- a/nemo_automodel/_transformers/registry.py
+++ b/nemo_automodel/_transformers/registry.py
@@ -164,6 +164,10 @@
             "HYV3ForCausalLM",
             ("nemo_automodel.components.models.hy_v3.model", "HYV3ForCausalLM"),
         ),
+        (
+            "HyMT2ForCausalLM",
+            ("nemo_automodel.components.models.hy_mt2.model", "HyMT2ForCausalLM"),
+        ),
         (
             "Qwen2ForCausalLM",
             ("nemo_automodel.components.models.qwen2.model", "Qwen2ForCausalLM"),

From 1234d1b4c3e180ee643b5b1053ffacd3ab06006e Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 27 May 2026 18:27:08 +0800
Subject: [PATCH 07/20] docs(model-coverage): add Hy-MT2 model card

Adds the Hy-MT2-30B-A3B model page under docs/model-coverage/llm/tencent/
(legacy Sphinx tree) and fern/versions/nightly/pages/model-coverage/llm/tencent/
(nightly Fern tree), plus the matching nightly.yml sidebar entry.

Satisfies test_every_registered_arch_has_model_coverage_doc, which scans
docs/model-coverage/*.md for every architecture registered in
MODEL_ARCH_MAPPING and was failing on the new HyMT2ForCausalLM entry.

The v0.4 frozen tree and the latest/v0.4 alias YAMLs are intentionally not
touched -- this is nightly drift, not a back-port.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 docs/model-coverage/llm/tencent/hy-mt2.md     | 63 +++++++++++++++++
 fern/versions/nightly.yml                     |  2 +
 .../model-coverage/llm/tencent/hy-mt2.mdx     | 67 +++++++++++++++++++
 3 files changed, 132 insertions(+)
 create mode 100644 docs/model-coverage/llm/tencent/hy-mt2.md
 create mode 100644 fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx

diff --git a/docs/model-coverage/llm/tencent/hy-mt2.md b/docs/model-coverage/llm/tencent/hy-mt2.md
new file mode 100644
index 0000000000..62794a8f00
--- /dev/null
+++ b/docs/model-coverage/llm/tencent/hy-mt2.md
@@ -0,0 +1,63 @@
+# Hy-MT2 (Hunyuan-MT2)
+
+[Hy-MT2-30B-A3B](https://huggingface.co/tencent/Hy-MT2-30B-A3B) is Tencent's translation Mixture-of-Experts language model with 30B total parameters and 3B activated per token. It features 48 transformer layers (layer 0 dense, layers 1–47 MoE), 128 routed experts plus 1 shared expert with top-8 sigmoid routing, Grouped Query Attention (32 Q / 4 KV heads), per-head QK RMSNorm, RoPE, and an in-forward fp32 upcast on the language-model head (`enable_lm_head_fp32`). It supports a 256K context window.
+
+:::{card}
+| | |
+|---|---|
+| **Task** | Text Generation (MoE, translation) |
+| **Architecture** | `HyMT2ForCausalLM` |
+| **Parameters** | 30B total / 3B activated |
+| **HF Org** | [tencent](https://huggingface.co/tencent) |
+:::
+
+## Available Models
+
+- **Hy-MT2-30B-A3B**: 30B total, top-8 routed experts (out of 128) activated per token, plus 1 shared expert
+
+## Architectures
+
+- `HyMT2ForCausalLM`
+
+## Example HF Models
+
+| Model | HF ID |
+|---|---|
+| Hy-MT2-30B-A3B | [`tencent/Hy-MT2-30B-A3B`](https://huggingface.co/tencent/Hy-MT2-30B-A3B) |
+
+## Example Recipes
+
+| Recipe | Description |
+|---|---|
+| {download}`hy_mt2_30b_a3b_sft.yaml <../../../../examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml>` | SFT — Hy-MT2-30B-A3B with FSDP2 + EP8 + fp32 LM head |
+
+## Try with NeMo AutoModel
+
+**1. Install** ([NeMo AutoModel](../../../guides/installation.md)):
+
+```bash
+pip install nemo-automodel
+```
+
+**2. Clone the repo** to get the example recipes:
+
+```bash
+git clone https://github.com/NVIDIA-NeMo/Automodel.git
+cd Automodel
+```
+
+**3. Run the recipe** from inside the repo:
+
+```bash
+automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
+```
+
+See the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
+
+## Fine-Tuning
+
+See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
+
+## Hugging Face Model Cards
+
+- [tencent/Hy-MT2-30B-A3B](https://huggingface.co/tencent/Hy-MT2-30B-A3B)
diff --git a/fern/versions/nightly.yml b/fern/versions/nightly.yml
index 2881fe7cd5..e89c2240d4 100644
--- a/fern/versions/nightly.yml
+++ b/fern/versions/nightly.yml
@@ -154,6 +154,8 @@ navigation:
             path: ./nightly/pages/model-coverage/llm/parasail-ai/gritlm.mdx
           - page: "Hy3-preview"
             path: ./nightly/pages/model-coverage/llm/tencent/hy3.mdx
+          - page: "Hy-MT2"
+            path: ./nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
           - page: "MiMo-V2-Flash"
             path: ./nightly/pages/model-coverage/llm/xiaomimimo/mimo-v2-flash.mdx
           - page: "Ling 2.0"
diff --git a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
new file mode 100644
index 0000000000..87020ebe55
--- /dev/null
+++ b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
@@ -0,0 +1,67 @@
+---
+title: "Hy-MT2 (Hunyuan-MT2)"
+description: ""
+---
+[Hy-MT2-30B-A3B](https://huggingface.co/tencent/Hy-MT2-30B-A3B) is Tencent's translation Mixture-of-Experts language model with 30B total parameters and 3B activated per token. It features 48 transformer layers (layer 0 dense, layers 1–47 MoE), 128 routed experts plus 1 shared expert with top-8 sigmoid routing, Grouped Query Attention (32 Q / 4 KV heads), per-head QK RMSNorm, RoPE, and an in-forward fp32 upcast on the language-model head (`enable_lm_head_fp32`). It supports a 256K context window.
+
+<Info>
+
+| | |
+|---|---|
+| **Task** | Text Generation (MoE, translation) |
+| **Architecture** | `HyMT2ForCausalLM` |
+| **Parameters** | 30B total / 3B activated |
+| **HF Org** | [tencent](https://huggingface.co/tencent) |
+
+</Info>
+
+## Available Models
+
+- **Hy-MT2-30B-A3B**: 30B total, top-8 routed experts (out of 128) activated per token, plus 1 shared expert
+
+## Architectures
+
+- `HyMT2ForCausalLM`
+
+## Example HF Models
+
+| Model | HF ID |
+|---|---|
+| Hy-MT2-30B-A3B | [`tencent/Hy-MT2-30B-A3B`](https://huggingface.co/tencent/Hy-MT2-30B-A3B) |
+
+## Example Recipes
+
+| Recipe | Description |
+|---|---|
+| [hy_mt2_30b_a3b_sft.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml) | SFT — Hy-MT2-30B-A3B with FSDP2 + EP8 + fp32 LM head |
+
+## Try with NeMo AutoModel
+
+**1. Install** ([NeMo AutoModel](/get-started/installation)):
+
+```bash
+pip install nemo-automodel
+```
+
+**2. Clone the repo** to get the example recipes:
+
+```bash
+git clone https://github.com/NVIDIA-NeMo/Automodel.git
+cd Automodel
+```
+
+**3. Run the recipe** from inside the repo:
+
+```bash
+automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
+```
+
+See the [NeMo AutoModel Installation Guide](/get-started/installation) and [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft).
+
+## Fine-Tuning
+
+See the [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft) and the [Large MoE Fine-Tuning Guide](/recipes-e2e-examples/large-moe-fine-tuning).
+
+## Hugging Face Model Cards
+
+- [tencent/Hy-MT2-30B-A3B](https://huggingface.co/tencent/Hy-MT2-30B-A3B)

From 577231139d50789d2394f89e2b87484564524b66 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 27 May 2026 18:55:34 +0800
Subject: [PATCH 08/20] docs(model-coverage): wire Hy-MT2 into the LLM toctree

Sphinx --fail-on-warning treated 'document isn't included in any
toctree' as an error after the previous commit added hy-mt2.md
without registering it in docs/model-coverage/llm/index.md.

Adds the row to the LLM coverage table and the entry to the hidden
toctree, alongside the existing Hy3-preview entry.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 docs/model-coverage/llm/index.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md
index 2a81e3cbde..d43dd12bb1 100644
--- a/docs/model-coverage/llm/index.md
+++ b/docs/model-coverage/llm/index.md
@@ -73,6 +73,7 @@ NeMo AutoModel supports the [AutoModelForCausalLM](https://huggingface.co/transf
 | Stepfun AI | [Step-3.5](stepfun-ai/step-3-5.md) | `Step3p5ForCausalLM` |
 | Parasail AI | [GritLM](parasail-ai/gritlm.md) | `GritLM` |
 | Tencent | [Hy3-preview](tencent/hy3.md) | `HYV3ForCausalLM` |
+| Tencent | [Hy-MT2](tencent/hy-mt2.md) | `HyMT2ForCausalLM` |
 | Xiaomi MiMo | [MiMo-V2-Flash](xiaomimimo/mimo-v2-flash.md) | `MiMoV2FlashForCausalLM` |
 | inclusionAI | [Ling 2.0](inclusionai/ling-2.md) | `BailingMoeV2ForCausalLM` |
 
@@ -146,6 +147,7 @@ stabilityai/stablelm
 stepfun-ai/step-3-5
 parasail-ai/gritlm
 tencent/hy3
+tencent/hy-mt2
 xiaomimimo/mimo-v2-flash
 inclusionai/ling-2
 ```

From 4e6e6e8f8328b52e73c400e9cfb9c6b1ee45e8b3 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Wed, 27 May 2026 19:22:47 +0800
Subject: [PATCH 09/20] refactor(models): concentrate Hy-MT2 dispatch logic
 inside hy_mt2 module

Moves the ``_is_hy_mt2_config`` fingerprint predicate from
``_transformers/model_init.py`` into a new
``components/models/hy_mt2/dispatch.py`` module, and migrates the
matching dispatcher tests from ``tests/.../_transformers/test_model_init.py``
to ``tests/.../models/hy_mt2/test_dispatch.py``.

The auto-resolver in model_init.py now keeps only a 4-line shim that
imports ``is_hy_mt2_config`` from the model package when the architecture
name matches HYV3ForCausalLM, so Hy-MT2-specific knowledge (which
hidden_size, layer count, expert count, etc. identify the checkpoint)
lives entirely inside ``components/models/hy_mt2/`` rather than leaking
into shared code.

No behavior change: same fingerprint fields, same dispatch outcome, the
existing Hy3-preview path is untouched.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 nemo_automodel/_transformers/model_init.py    | 22 ++---
 .../components/models/hy_mt2/dispatch.py      | 37 ++++++++
 .../_transformers/test_model_init.py          | 36 --------
 .../unit_tests/models/hy_mt2/test_dispatch.py | 84 +++++++++++++++++++
 4 files changed, 127 insertions(+), 52 deletions(-)
 create mode 100644 nemo_automodel/components/models/hy_mt2/dispatch.py
 create mode 100644 tests/unit_tests/models/hy_mt2/test_dispatch.py

diff --git a/nemo_automodel/_transformers/model_init.py b/nemo_automodel/_transformers/model_init.py
index de12748498..7280bceae7 100644
--- a/nemo_automodel/_transformers/model_init.py
+++ b/nemo_automodel/_transformers/model_init.py
@@ -214,19 +214,6 @@ def _is_config_compatible_with_custom_model(arch_name: str, config) -> bool:
     return True
 
 
-def _is_hy_mt2_config(config) -> bool:
-    """Return whether a ``hy_v3`` config describes Tencent Hy-MT2-30B-A3B."""
-    return (
-        getattr(config, "model_type", None) == "hy_v3"
-        and getattr(config, "hidden_size", None) == 2048
-        and getattr(config, "num_hidden_layers", None) == 48
-        and getattr(config, "num_experts", None) == 128
-        and getattr(config, "expert_hidden_dim", None) == 768
-        and getattr(config, "moe_intermediate_size", None) == 768
-        and hasattr(config, "enable_lm_head_fp32")
-    )
-
-
 def _resolve_custom_model_cls_for_config(config):
     """Resolve the custom model class for *config*, if the config is compatible."""
     architectures = get_architectures(config)
@@ -234,10 +221,13 @@ def _resolve_custom_model_cls_for_config(config):
         return None
 
     arch_name = architectures[0]
-    if arch_name == "HYV3ForCausalLM" and _is_hy_mt2_config(config):
-        from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM
+    if arch_name == "HYV3ForCausalLM":
+        from nemo_automodel.components.models.hy_mt2.dispatch import is_hy_mt2_config
+
+        if is_hy_mt2_config(config):
+            from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM
 
-        return HyMT2ForCausalLM
+            return HyMT2ForCausalLM
 
     if not ModelRegistry.has_custom_model(arch_name):
         return None
diff --git a/nemo_automodel/components/models/hy_mt2/dispatch.py b/nemo_automodel/components/models/hy_mt2/dispatch.py
new file mode 100644
index 0000000000..097fc316ab
--- /dev/null
+++ b/nemo_automodel/components/models/hy_mt2/dispatch.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Config-shape fingerprint that distinguishes Hy-MT2-30B-A3B from Hy3-preview.
+
+Tencent ships both checkpoints with ``architectures: ["HYV3ForCausalLM"]`` and
+``model_type: "hy_v3"`` even though the two models differ substantially
+(48 vs 80 layers, 128 vs 192 experts, hidden 2048 vs 4096, etc.). The
+auto-resolver in ``_transformers/model_init.py`` looks up the fingerprint here
+so all Hy-MT2-specific knowledge stays inside this module.
+"""
+
+from typing import Any
+
+
+def is_hy_mt2_config(config: Any) -> bool:
+    """Return whether *config* describes Tencent's Hy-MT2-30B-A3B checkpoint."""
+    return (
+        getattr(config, "model_type", None) == "hy_v3"
+        and getattr(config, "hidden_size", None) == 2048
+        and getattr(config, "num_hidden_layers", None) == 48
+        and getattr(config, "num_experts", None) == 128
+        and getattr(config, "expert_hidden_dim", None) == 768
+        and getattr(config, "moe_intermediate_size", None) == 768
+        and hasattr(config, "enable_lm_head_fp32")
+    )
diff --git a/tests/unit_tests/_transformers/test_model_init.py b/tests/unit_tests/_transformers/test_model_init.py
index 4ece5371d1..5318719ad5 100644
--- a/tests/unit_tests/_transformers/test_model_init.py
+++ b/tests/unit_tests/_transformers/test_model_init.py
@@ -15,7 +15,6 @@
 """Tests for nested config override handling in get_hf_config and _consume_config_overrides."""
 
 import os
-from types import SimpleNamespace
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -28,7 +27,6 @@
     _init_model,
     _load_config_with_layer_types_fix,
     _propagate_torch_dtype_to_subconfigs,
-    _resolve_custom_model_cls_for_config,
     _resolve_model_dir,
     _setup_bnb_loading_kwargs,
     _stream_load_bnb_weights,
@@ -39,40 +37,6 @@
 from nemo_automodel.components.models.common.utils import BackendConfig
 
 
-class TestHyMT2ModelResolution:
-    """Hy-MT2 shares HYV3ForCausalLM metadata but needs its own implementation."""
-
-    def test_hy_mt2_config_resolves_to_hy_mt2_model(self):
-        from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM
-
-        config = SimpleNamespace(
-            architectures=["HYV3ForCausalLM"],
-            model_type="hy_v3",
-            hidden_size=2048,
-            num_hidden_layers=48,
-            num_experts=128,
-            expert_hidden_dim=768,
-            moe_intermediate_size=768,
-            enable_lm_head_fp32=True,
-        )
-
-        assert _resolve_custom_model_cls_for_config(config) is HyMT2ForCausalLM
-
-    def test_hy_v3_config_still_resolves_to_hy_v3_model(self):
-        from nemo_automodel.components.models.hy_v3.model import HYV3ForCausalLM
-
-        config = SimpleNamespace(
-            architectures=["HYV3ForCausalLM"],
-            model_type="hy_v3",
-            hidden_size=4096,
-            num_hidden_layers=80,
-            num_experts=192,
-            moe_intermediate_size=1536,
-        )
-
-        assert _resolve_custom_model_cls_for_config(config) is HYV3ForCausalLM
-
-
 class TestConsumeConfigOverridesNestedDict:
     """Nested dict overrides should be deep-merged into sub-config objects."""
 
diff --git a/tests/unit_tests/models/hy_mt2/test_dispatch.py b/tests/unit_tests/models/hy_mt2/test_dispatch.py
new file mode 100644
index 0000000000..00f93aa5fd
--- /dev/null
+++ b/tests/unit_tests/models/hy_mt2/test_dispatch.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the Hy-MT2 config-shape fingerprint and its routing through the
+shared ``_resolve_custom_model_cls_for_config`` entry point."""
+
+from types import SimpleNamespace
+
+from nemo_automodel._transformers.model_init import _resolve_custom_model_cls_for_config
+from nemo_automodel.components.models.hy_mt2.dispatch import is_hy_mt2_config
+
+
+def _hy_mt2_config() -> SimpleNamespace:
+    return SimpleNamespace(
+        architectures=["HYV3ForCausalLM"],
+        model_type="hy_v3",
+        hidden_size=2048,
+        num_hidden_layers=48,
+        num_experts=128,
+        expert_hidden_dim=768,
+        moe_intermediate_size=768,
+        enable_lm_head_fp32=True,
+    )
+
+
+def _hy3_preview_config() -> SimpleNamespace:
+    return SimpleNamespace(
+        architectures=["HYV3ForCausalLM"],
+        model_type="hy_v3",
+        hidden_size=4096,
+        num_hidden_layers=80,
+        num_experts=192,
+        moe_intermediate_size=1536,
+    )
+
+
+class TestIsHyMT2Config:
+    """Direct tests of the fingerprint predicate."""
+
+    def test_hy_mt2_fingerprint_matches(self):
+        assert is_hy_mt2_config(_hy_mt2_config())
+
+    def test_hy3_preview_fingerprint_does_not_match(self):
+        assert not is_hy_mt2_config(_hy3_preview_config())
+
+    def test_missing_enable_lm_head_fp32_does_not_match(self):
+        config = _hy_mt2_config()
+        del config.enable_lm_head_fp32
+        assert not is_hy_mt2_config(config)
+
+    def test_wrong_hidden_size_does_not_match(self):
+        config = _hy_mt2_config()
+        config.hidden_size = 4096
+        assert not is_hy_mt2_config(config)
+
+    def test_non_hy_v3_model_type_does_not_match(self):
+        config = _hy_mt2_config()
+        config.model_type = "llama"
+        assert not is_hy_mt2_config(config)
+
+
+class TestHyMT2ModelResolution:
+    """Hy-MT2 shares ``HYV3ForCausalLM`` metadata but needs its own implementation."""
+
+    def test_hy_mt2_config_resolves_to_hy_mt2_model(self):
+        from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM
+
+        assert _resolve_custom_model_cls_for_config(_hy_mt2_config()) is HyMT2ForCausalLM
+
+    def test_hy_v3_config_still_resolves_to_hy_v3_model(self):
+        from nemo_automodel.components.models.hy_v3.model import HYV3ForCausalLM
+
+        assert _resolve_custom_model_cls_for_config(_hy3_preview_config()) is HYV3ForCausalLM

From ef924493ca4e4a75c73ac70f695d5bdc33e4629a Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:16:37 -0700
Subject: [PATCH 10/20] Update
 fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 .../nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
index 87020ebe55..248170aeec 100644
--- a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
+++ b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
@@ -56,7 +56,7 @@ cd Automodel
 automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
 ```
 
-See the [NeMo AutoModel Installation Guide](/get-started/installation) and [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft).
+Refer to the [NeMo AutoModel Installation Guide](/get-started/installation) and [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft).
 
 ## Fine-Tuning
 

From ef897a5bafe2d2442a91fae36a8748949235f6a5 Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:17:00 -0700
Subject: [PATCH 11/20] Update
 fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 .../nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
index 248170aeec..d4325accb8 100644
--- a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
+++ b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
@@ -58,7 +58,7 @@ automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yam
 
 Refer to the [NeMo AutoModel Installation Guide](/get-started/installation) and [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft).
 
-## Fine-Tuning
+## Fine-Tune the Model
 
 See the [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft) and the [Large MoE Fine-Tuning Guide](/recipes-e2e-examples/large-moe-fine-tuning).
 

From 4b2e09083eb13ca6b3ef87075a7cec7313cc4404 Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:17:26 -0700
Subject: [PATCH 12/20] Update
 fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 .../nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
index d4325accb8..d4479b75f0 100644
--- a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
+++ b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx
@@ -60,7 +60,7 @@ Refer to the [NeMo AutoModel Installation Guide](/get-started/installation) and
 
 ## Fine-Tune the Model
 
-See the [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft) and the [Large MoE Fine-Tuning Guide](/recipes-e2e-examples/large-moe-fine-tuning).
+Refer to the [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft) and the [Large MoE Fine-Tuning Guide](/recipes-e2e-examples/large-moe-fine-tuning).
 
 ## Hugging Face Model Cards
 

From f7576b10219e4dc9d496eea35510b95f23c1c0fe Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:17:52 -0700
Subject: [PATCH 13/20] Update docs/model-coverage/llm/index.md

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 docs/model-coverage/llm/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md
index d43dd12bb1..6d1a6aa7c7 100644
--- a/docs/model-coverage/llm/index.md
+++ b/docs/model-coverage/llm/index.md
@@ -11,7 +11,7 @@ To run LLMs with NeMo AutoModel, make sure you're using NeMo container version [
 pip3 install --upgrade git+git@github.com:NVIDIA-NeMo/AutoModel.git
 ```
 
-For other installation options (e.g., uv), see the [NeMo AutoModel Installation Guide](../../guides/installation.md).
+For other installation options (for example, uv), refer to the [NeMo AutoModel Installation Guide](../../guides/installation.md).
 
 ## Supported Models
 

From 8990d20456cb25834c48a5cedaf92951c540a336 Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:18:09 -0700
Subject: [PATCH 14/20] Update docs/model-coverage/llm/index.md

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 docs/model-coverage/llm/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md
index 6d1a6aa7c7..16a0108a85 100644
--- a/docs/model-coverage/llm/index.md
+++ b/docs/model-coverage/llm/index.md
@@ -87,7 +87,7 @@ The models listed above can be fine-tuned using NeMo AutoModel. We support two p
 See the [Fine-Tuning Guide](../../guides/llm/finetune.md) to learn how to apply both methods to your data.
 
 :::{tip}
-In these guides, we use the `SQuAD v1.1` dataset for demonstration purposes, but you can use your own data. Update the recipe YAML `dataset` / `validation_dataset` sections accordingly. See [LLM datasets](../../guides/llm/dataset.md) and [dataset overview](../../guides/dataset-overview.md).
+In these guides, the examples use the `SQuAD v1.1` dataset for demonstration purposes, but you can use your own data. Update the recipe YAML `dataset` / `validation_dataset` sections accordingly. Refer to [LLM datasets](../../guides/llm/dataset.md) and [dataset overview](../../guides/dataset-overview.md).
 :::
 
 ```{toctree}

From c1a29dc5b3720b8de7f3c13af7dc02fe6a319726 Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:18:31 -0700
Subject: [PATCH 15/20] Update docs/model-coverage/llm/index.md

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 docs/model-coverage/llm/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md
index 16a0108a85..6933fbd5c9 100644
--- a/docs/model-coverage/llm/index.md
+++ b/docs/model-coverage/llm/index.md
@@ -77,7 +77,7 @@ NeMo AutoModel supports the [AutoModelForCausalLM](https://huggingface.co/transf
 | Xiaomi MiMo | [MiMo-V2-Flash](xiaomimimo/mimo-v2-flash.md) | `MiMoV2FlashForCausalLM` |
 | inclusionAI | [Ling 2.0](inclusionai/ling-2.md) | `BailingMoeV2ForCausalLM` |
 
-## Fine-Tuning LLMs with NeMo AutoModel
+## Fine-Tune LLMs with NeMo AutoModel
 
 The models listed above can be fine-tuned using NeMo AutoModel. We support two primary fine-tuning approaches:
 

From 521fe22ab8e25680e99d258acb420e32181284d7 Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:18:42 -0700
Subject: [PATCH 16/20] Update docs/model-coverage/llm/index.md

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 docs/model-coverage/llm/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md
index 6933fbd5c9..d499ecb08c 100644
--- a/docs/model-coverage/llm/index.md
+++ b/docs/model-coverage/llm/index.md
@@ -84,7 +84,7 @@ The models listed above can be fine-tuned using NeMo AutoModel. We support two p
 1. **Parameter-Efficient Fine-Tuning (PEFT)**: Updates only a small subset of parameters (typically <1%) using techniques like Low-Rank Adaptation (LoRA).
 2. **Supervised Fine-Tuning (SFT)**: Updates all or most model parameters for deeper adaptation.
 
-See the [Fine-Tuning Guide](../../guides/llm/finetune.md) to learn how to apply both methods to your data.
+Refer to the [Fine-Tuning Guide](../../guides/llm/finetune.md) to learn how to apply both methods to your data.
 
 :::{tip}
 In these guides, the examples use the `SQuAD v1.1` dataset for demonstration purposes, but you can use your own data. Update the recipe YAML `dataset` / `validation_dataset` sections accordingly. Refer to [LLM datasets](../../guides/llm/dataset.md) and [dataset overview](../../guides/dataset-overview.md).

From 83b053b7824d189ab176f657446e0f7a5e695390 Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:18:55 -0700
Subject: [PATCH 17/20] Update docs/model-coverage/llm/tencent/hy-mt2.md

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 docs/model-coverage/llm/tencent/hy-mt2.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/model-coverage/llm/tencent/hy-mt2.md b/docs/model-coverage/llm/tencent/hy-mt2.md
index 62794a8f00..a82a2bba5b 100644
--- a/docs/model-coverage/llm/tencent/hy-mt2.md
+++ b/docs/model-coverage/llm/tencent/hy-mt2.md
@@ -52,7 +52,7 @@ cd Automodel
 automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml
 ```
 
-See the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
+Refer to the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
 
 ## Fine-Tuning
 

From bb41ada4b90bdd633edbd075db98c6eed9c29f1e Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:19:08 -0700
Subject: [PATCH 18/20] Update docs/model-coverage/llm/tencent/hy-mt2.md

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 docs/model-coverage/llm/tencent/hy-mt2.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/model-coverage/llm/tencent/hy-mt2.md b/docs/model-coverage/llm/tencent/hy-mt2.md
index a82a2bba5b..e91177456e 100644
--- a/docs/model-coverage/llm/tencent/hy-mt2.md
+++ b/docs/model-coverage/llm/tencent/hy-mt2.md
@@ -54,7 +54,7 @@ automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yam
 
 Refer to the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
 
-## Fine-Tuning
+## Fine-Tune the Model
 
 See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
 

From 460dab9e7cac9e783c456dfdfb4b9250603ffab3 Mon Sep 17 00:00:00 2001
From: Huiying <willwin.lee@gmail.com>
Date: Wed, 27 May 2026 13:19:22 -0700
Subject: [PATCH 19/20] Update docs/model-coverage/llm/tencent/hy-mt2.md

Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com>

Signed-off-by: Huiying <willwin.lee@gmail.com>
---
 docs/model-coverage/llm/tencent/hy-mt2.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/model-coverage/llm/tencent/hy-mt2.md b/docs/model-coverage/llm/tencent/hy-mt2.md
index e91177456e..b7db2ad92f 100644
--- a/docs/model-coverage/llm/tencent/hy-mt2.md
+++ b/docs/model-coverage/llm/tencent/hy-mt2.md
@@ -56,7 +56,7 @@ Refer to the [NeMo AutoModel Installation Guide](../../../guides/installation.md
 
 ## Fine-Tune the Model
 
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
+Refer to the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
 
 ## Hugging Face Model Cards
 

From 7b9e32c2325976ef215ebcc8bf59a3f5e98a7fff Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Thu, 28 May 2026 14:11:10 +0800
Subject: [PATCH 20/20] fix(tests): declare rope_parameters on minimal _Cfg
 mock in hy_mt2 test

`test_enable_lm_head_fp32_default_false_without_config_flag` constructs
`HyMT2ForCausalLM(_Cfg(), ...)` with a bare mock class to verify that
the flag defaults to ``False`` when the config does not declare it. The
bare mock skips ``PretrainedConfig.__init__``, which is what normally
synthesizes ``rope_parameters`` from ``rope_theta``. As a result,
``get_rope_config`` (called during model construction) raised
``AttributeError: '_Cfg' object has no attribute 'rope_parameters'``
on GPU CI.

Add the field to the mock with the same shape ``PretrainedConfig``
would produce. The CPU test suite cannot trigger this (the whole
``TestHyMT2ForCausalLM`` class is CUDA-gated), so the regression was
only visible on the L0_Unit_Tests_GPU job.

Signed-off-by: khazic <khazzz1c@gmail.com>
---
 tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py
index 20705d6d3c..04f18c59ad 100644
--- a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py
+++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py
@@ -238,6 +238,10 @@ class _Cfg:
             first_k_dense_replace = 1
             max_position_embeddings = 128
             rope_theta = 10000.0
+            # PretrainedConfig populates ``rope_parameters`` from ``rope_theta``
+            # in its ``__init__``; this bare mock skips that, so declare it
+            # explicitly to match what ``get_rope_config`` reads.
+            rope_parameters = {"rope_theta": 10000.0, "rope_type": "default"}
             rms_norm_eps = 1e-5
             torch_dtype = "bfloat16"
             attention_bias = False