From a7d91c39318d78b89ad73d7cac58e9039316c20a Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 27 May 2026 11:11:04 +0800 Subject: [PATCH 01/20] feat(models): add Hy-MT2-30B-A3B SFT support Add a dedicated ``HyMT2ForCausalLM`` module under ``components/models/hy_mt2`` for tencent/Hy-MT2-30B-A3B (translation MoE, 30B total / 3B activated). The on-disk checkpoint shares ``architectures: ["HYV3ForCausalLM"]`` and ``model_type: "hy_v3"`` with Tencent's older Hy3-preview, but the two models differ substantially in sizing (48 layers vs 80, 128 experts vs 192, GQA 32/4 vs 64/8, hidden=2048 vs 4096, rms_norm_eps=1e-5 vs 1e-6) and in three flags that the existing ``hy_v3`` module either hard-codes or does not handle: ``moe_router_use_sigmoid`` (made configurable here), ``enable_lm_head_fp32`` (in-model fp32 upcast fallback when the YAML does not set ``lm_head_precision``), and ``expert_hidden_dim`` (synonym of ``moe_intermediate_size`` preferred when both are present). The new module is intentionally independent of ``components/models/hy_v3`` so the Hy3-preview recipes are unaffected. The example YAML at ``examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml`` instantiates the new class via a fully-qualified ``_target_`` instead of going through the NeMoAutoModel registry, avoiding the architecture-string collision. Files: nemo_automodel/components/models/hy_mt2/{__init__,config,layers,model,state_dict_adapter}.py examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml tests/unit_tests/models/hy_mt2/test_hy_mt2_{config,layers,model,state_dict_adapter}.py EP / TP / DP / FSDP2 wire up through the standard MoE stack (``MoEFSDPSyncMixin`` + ``components/moe``). EP must divide ``num_experts`` = 128; the example uses ``ep_size: 8`` (16 experts per rank) on an 8xH100 node. 
Signed-off-by: khazic --- .../hy_mt2/hy_mt2_30b_a3b_sft.yaml | 150 +++++++ .../components/models/hy_mt2/__init__.py | 17 + .../components/models/hy_mt2/config.py | 129 ++++++ .../components/models/hy_mt2/layers.py | 147 ++++++ .../components/models/hy_mt2/model.py | 418 ++++++++++++++++++ .../models/hy_mt2/state_dict_adapter.py | 163 +++++++ tests/unit_tests/models/hy_mt2/__init__.py | 0 .../models/hy_mt2/test_hy_mt2_config.py | 165 +++++++ .../models/hy_mt2/test_hy_mt2_layers.py | 164 +++++++ .../models/hy_mt2/test_hy_mt2_model.py | 285 ++++++++++++ .../hy_mt2/test_hy_mt2_state_dict_adapter.py | 299 +++++++++++++ 11 files changed, 1937 insertions(+) create mode 100644 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml create mode 100644 nemo_automodel/components/models/hy_mt2/__init__.py create mode 100644 nemo_automodel/components/models/hy_mt2/config.py create mode 100644 nemo_automodel/components/models/hy_mt2/layers.py create mode 100644 nemo_automodel/components/models/hy_mt2/model.py create mode 100644 nemo_automodel/components/models/hy_mt2/state_dict_adapter.py create mode 100644 tests/unit_tests/models/hy_mt2/__init__.py create mode 100644 tests/unit_tests/models/hy_mt2/test_hy_mt2_config.py create mode 100644 tests/unit_tests/models/hy_mt2/test_hy_mt2_layers.py create mode 100644 tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py create mode 100644 tests/unit_tests/models/hy_mt2/test_hy_mt2_state_dict_adapter.py diff --git a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml new file mode 100644 index 0000000000..9f2b71d8d9 --- /dev/null +++ b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml @@ -0,0 +1,150 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# SFT recipe for tencent/Hy-MT2-30B-A3B (translation MoE, 30B total / 3B activated). +# +# Architecture (from config.json): 48 layers (layer 0 dense), 128 experts top-8 +# + 1 shared expert, sigmoid routing with bias, GQA 32/4, hidden=2048, +# moe_intermediate=expert_hidden=768, dense intermediate=6912, +# vocab=120832, 256K context, rope_theta=11158840, qk_norm. +# +# Hardware target: 8 GPUs (80 GB+ each) for full SFT with EP8 + FSDP2. +# automodel finetune llm -c examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml \ +# --nproc-per-node 8 +# +# EP size must divide num_experts (128). ep_size=8 -> 16 experts per rank. +# Other valid EP sizes: 1, 2, 4, 16, 32, 64, 128. +# +# Note: the on-disk checkpoint declares ``architectures: ["HYV3ForCausalLM"]``, +# so NeMoAutoModel's registry would dispatch to ``components/models/hy_v3``. +# This recipe deliberately bypasses the registry and instantiates the new +# ``HyMT2ForCausalLM`` directly via ``_target_``, isolating Hy-MT2 logic +# (sigmoid-routing flag, ``enable_lm_head_fp32``, expert_hidden_dim) from +# the existing Hy3-preview support. 
+ +recipe: TrainFinetuneRecipeForNextTokenPrediction + +step_scheduler: + global_batch_size: 64 + local_batch_size: 1 + ckpt_every_steps: 500 + val_every_steps: 500 + num_epochs: 1 + max_steps: 100 + +dist_env: + backend: nccl + timeout_minutes: 30 + +rng: + _target_: nemo_automodel.components.training.rng.StatefulRNG + seed: 1111 + ranked: true + +model: + _target_: nemo_automodel.components.models.hy_mt2.model.HyMT2ForCausalLM.from_pretrained + pretrained_model_name_or_path: tencent/Hy-MT2-30B-A3B + torch_dtype: bfloat16 + backend: + _target_: nemo_automodel.components.models.common.BackendConfig + attn: te + linear: torch + rms_norm: torch_fp32 + experts: torch_mm + dispatcher: torch + fake_balanced_gate: false + gate_precision: float32 + enable_hf_state_dict_adapter: true + enable_fsdp_optimizations: true + +checkpoint: + enabled: true + checkpoint_dir: /tmp/checkpoints/hy_mt2_30b_a3b/ + model_save_format: safetensors + save_consolidated: true + +distributed: + strategy: fsdp2 + tp_size: 1 + cp_size: 1 + pp_size: 1 + # Expert parallelism: 128 experts / 8 ranks = 16 experts per rank. + # dp_size is derived as ``world_size // (tp_size * cp_size * pp_size * ep_size)`` + # i.e. 1 with ep_size=8 on an 8-GPU node -- experts shard across the + # full node and the remaining (non-expert) weights replicate. + ep_size: 8 + + sequence_parallel: false + activation_checkpointing: true + + moe: + reshard_after_forward: false + wrap_outer_model: false + # HF reference upcasts the lm_head to fp32 (``enable_lm_head_fp32: true``). + # The MoE parallelizer handles this via MixedPrecisionPolicy when set + # here; HyMT2ForCausalLM also has an in-model fp32 fallback if this is + # left unset. 
+ lm_head_precision: float32 + +loss_fn: + _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy + +dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: train + tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained + pretrained_model_name_or_path: tencent/Hy-MT2-30B-A3B + trust_remote_code: true + +packed_sequence: + packed_sequence_size: 0 + +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 + shuffle: true + +validation_dataset: + _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag + path_or_dataset: rowan/hellaswag + split: validation + num_samples_limit: 64 + tokenizer: + _target_: transformers.AutoTokenizer.from_pretrained + pretrained_model_name_or_path: tencent/Hy-MT2-30B-A3B + trust_remote_code: true + +validation_dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + collate_fn: + _target_: nemo_automodel.components.datasets.utils.default_collater + pad_seq_len_divisible: 64 + shuffle: false + drop_last: true + +optimizer: + _target_: torch.optim.AdamW + betas: [0.9, 0.95] + eps: 1e-8 + lr: 1e-5 + weight_decay: 0.0 + +# Uncomment for W&B logging +# wandb: +# project: hy_mt2-30b-a3b-sft +# name: hy_mt2_30b_a3b_sft diff --git a/nemo_automodel/components/models/hy_mt2/__init__.py b/nemo_automodel/components/models/hy_mt2/__init__.py new file mode 100644 index 0000000000..cab3ef8fa7 --- /dev/null +++ b/nemo_automodel/components/models/hy_mt2/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM + +__all__ = ["HyMT2ForCausalLM"] diff --git a/nemo_automodel/components/models/hy_mt2/config.py b/nemo_automodel/components/models/hy_mt2/config.py new file mode 100644 index 0000000000..9578a77f56 --- /dev/null +++ b/nemo_automodel/components/models/hy_mt2/config.py @@ -0,0 +1,129 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from transformers import PretrainedConfig + + +class HyMT2Config(PretrainedConfig): + """Configuration class for Tencent Hy-MT2-30B-A3B (translation MoE). 
+ + Architecture (from tencent/Hy-MT2-30B-A3B config.json): + - 48 transformer layers; layer 0 is dense, layers 1-47 are MoE + - MoE: 128 routed experts + 1 shared expert, top-8 activated + - Sigmoid routing with expert-bias correction (e_score_correction_bias) + and router_scaling_factor = 2.826 + - route_norm = True (normalize top-k routing weights) + - GQA: 32 Q heads, 4 KV heads, head_dim=128, hidden_size=2048 + - Per-head Q/K RMSNorm before RoPE (qk_norm) + - 256K context, rope_theta=11158840 + - vocab_size=120832, dense intermediate_size=6912, moe_intermediate_size=768 + - enable_lm_head_fp32 = True (HF reference upcasts lm_head to fp32) + + Note: the on-disk HF checkpoint declares ``model_type: "hy_v3"`` and + ``architectures: ["HYV3ForCausalLM"]``. NeMo AutoModel's existing + ``HYV3Config`` therefore wins ``AutoConfig.from_pretrained``. This class + is provided for tests and for standalone instantiation; the model code in + ``model.py`` is duck-typed against ``config`` attribute names and works with either + config class. 
+ """ + + model_type = "hy_mt2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size: int = 120832, + hidden_size: int = 2048, + intermediate_size: int = 6912, + moe_intermediate_size: int = 768, + expert_hidden_dim: int = 768, + num_hidden_layers: int = 48, + num_attention_heads: int = 32, + num_key_value_heads: int = 4, + head_dim: int = 128, + # MoE routing + num_experts: int = 128, + num_shared_experts: int = 1, + num_experts_per_tok: int = 8, + router_scaling_factor: float = 2.826, + route_norm: bool = True, + moe_router_enable_expert_bias: bool = True, + moe_router_use_sigmoid: bool = True, + # Dense layers + first_k_dense_replace: int = 1, + # Position encoding + max_position_embeddings: int = 262144, + rope_theta: float = 11158840.0, + rope_scaling: dict | None = None, + # Norm / attention + rms_norm_eps: float = 1e-5, + qk_norm: bool = True, + attention_bias: bool = False, + hidden_act: str = "silu", + # FP32 upcast hints (mirroring HF config). NeMo AutoModel wires + # ``enable_lm_head_fp32`` either via the YAML ``lm_head_precision: float32`` + # (preferred, handled by the MoE parallelizer) or via the in-model + # cast in ``HyMT2ForCausalLM.forward`` when ``lm_head_precision`` is + # unset. 
+ enable_lm_head_fp32: bool = True, + enable_attention_fp32_softmax: bool = False, + enable_moe_fp32_combine: bool = False, + # Standard options + use_cache: bool = True, + pad_token_id: int | None = 120002, + bos_token_id: int = 120000, + eos_token_id: int = 120025, + tie_word_embeddings: bool = False, + torch_dtype: str = "bfloat16", + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.expert_hidden_dim = expert_hidden_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.head_dim = head_dim + self.num_experts = num_experts + self.num_shared_experts = num_shared_experts + self.num_experts_per_tok = num_experts_per_tok + self.router_scaling_factor = router_scaling_factor + self.route_norm = route_norm + self.moe_router_enable_expert_bias = moe_router_enable_expert_bias + self.moe_router_use_sigmoid = moe_router_use_sigmoid + self.first_k_dense_replace = first_k_dense_replace + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.rms_norm_eps = rms_norm_eps + self.qk_norm = qk_norm + self.attention_bias = attention_bias + self.hidden_act = hidden_act + self.enable_lm_head_fp32 = enable_lm_head_fp32 + self.enable_attention_fp32_softmax = enable_attention_fp32_softmax + self.enable_moe_fp32_combine = enable_moe_fp32_combine + self.torch_dtype = torch_dtype + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + use_cache=use_cache, + **kwargs, + ) diff --git a/nemo_automodel/components/models/hy_mt2/layers.py b/nemo_automodel/components/models/hy_mt2/layers.py new file mode 100644 index 0000000000..3450ea8338 --- /dev/null +++ 
b/nemo_automodel/components/models/hy_mt2/layers.py @@ -0,0 +1,147 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any + +import torch +from torch import nn + +from nemo_automodel.components.attention.utils import ( + initialize_attn_module_and_func, + postprocess_output_for_attn, + preprocess_args_and_kwargs_for_attn, +) +from nemo_automodel.components.models.common import ( + BackendConfig, + initialize_linear_module, + initialize_rms_norm_module, +) +from nemo_automodel.components.models.gpt_oss.rope_utils import apply_rotary_emb_qk + + +class HyMT2Attention(nn.Module): + """Hy-MT2-30B-A3B attention: GQA, per-head Q/K RMSNorm, and RoPE. + + Differences vs. the existing Hy3-preview ``HYV3Attention``: + - ``qk_norm`` is gated by ``config.qk_norm`` (defaults to True). For + Hy-MT2-30B-A3B this is always True; the flag is here so the same + module can also be reused for non-qk-norm variants without code + edits. + - Dimensions follow Hy-MT2-30B-A3B: 32 Q heads / 4 KV heads, + head_dim=128, hidden_size=2048. 
+ """ + + def __init__(self, config: Any, backend: BackendConfig): + super().__init__() + self.backend = backend + + self.num_heads = config.num_attention_heads + self.num_kv_heads = config.num_key_value_heads + self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads) + self.qk_norm_enabled = bool(getattr(config, "qk_norm", True)) + + attention_bias = getattr(config, "attention_bias", False) + + self.q_proj = initialize_linear_module( + backend.linear, config.hidden_size, self.num_heads * self.head_dim, attention_bias + ) + self.k_proj = initialize_linear_module( + backend.linear, config.hidden_size, self.num_kv_heads * self.head_dim, attention_bias + ) + self.v_proj = initialize_linear_module( + backend.linear, config.hidden_size, self.num_kv_heads * self.head_dim, attention_bias + ) + self.o_proj = initialize_linear_module( + backend.linear, self.num_heads * self.head_dim, config.hidden_size, attention_bias + ) + + if self.qk_norm_enabled: + self.q_norm = initialize_rms_norm_module(backend.rms_norm, self.head_dim, eps=config.rms_norm_eps) + self.k_norm = initialize_rms_norm_module(backend.rms_norm, self.head_dim, eps=config.rms_norm_eps) + else: + self.q_norm = None + self.k_norm = None + + softmax_scale = self.head_dim**-0.5 + self.attn_module, self.attn_func = initialize_attn_module_and_func( + attn_impl=backend.attn, + num_attention_heads=self.num_heads, + num_qk_channels=self.head_dim, + num_v_channels=self.head_dim, + softmax_scale=softmax_scale, + num_gqa_groups=self.num_kv_heads, + ) + + def forward( + self, + x: torch.Tensor, + *, + freqs_cis: torch.Tensor, + attention_mask: torch.Tensor | None = None, + **attn_kwargs: Any, + ) -> torch.Tensor: + if len(x.shape) == 2: + qkv_format = "thd" + num_tokens = x.shape[0] + else: + qkv_format = "bshd" + bsz, seqlen, _ = x.size() + + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + + if qkv_format == "thd": + q = q.view(num_tokens, self.num_heads, self.head_dim) + k = 
k.view(num_tokens, self.num_kv_heads, self.head_dim) + v = v.view(num_tokens, self.num_kv_heads, self.head_dim) + else: + q = q.view(bsz, seqlen, self.num_heads, self.head_dim) + k = k.view(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = v.view(bsz, seqlen, self.num_kv_heads, self.head_dim) + + if self.qk_norm_enabled: + q = self.q_norm(q) + k = self.k_norm(k) + + q, k = apply_rotary_emb_qk( + q, + k, + freqs_cis, + format=qkv_format, + rope_fusion=self.backend.rope_fusion, + cu_seqlens=attn_kwargs.get("cu_seqlens", None), + cp_size=attn_kwargs.get("cp_size", 1), + cp_rank=attn_kwargs.get("cp_rank", 0), + ) + + q, k, v, _attn_kwargs = preprocess_args_and_kwargs_for_attn( + q, k, v, attention_mask, self.backend.attn, **attn_kwargs + ) + out = self.attn_func(q, k, v, **_attn_kwargs) + out = postprocess_output_for_attn(out, self.backend.attn) + + flatten_dim = 2 if qkv_format == "bshd" else 1 + out = self.o_proj(out.flatten(flatten_dim)) + return out + + def init_weights(self, buffer_device: torch.device, init_std: float = 0.02): + for linear in (self.q_proj, self.k_proj, self.v_proj, self.o_proj): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + if hasattr(linear, "bias") and linear.bias is not None: + nn.init.zeros_(linear.bias) + if self.q_norm is not None: + self.q_norm.reset_parameters() + if self.k_norm is not None: + self.k_norm.reset_parameters() diff --git a/nemo_automodel/components/models/hy_mt2/model.py b/nemo_automodel/components/models/hy_mt2/model.py new file mode 100644 index 0000000000..db2d85db8e --- /dev/null +++ b/nemo_automodel/components/models/hy_mt2/model.py @@ -0,0 +1,418 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""HyMT2ForCausalLM — Tencent Hy-MT2-30B-A3B (translation MoE) SFT support. + +Architecture (from tencent/Hy-MT2-30B-A3B config.json): + - 48 transformer layers; layer 0 is dense, layers 1-47 are MoE + - MoE: 128 routed experts + 1 shared expert, top-8 activated + - Sigmoid routing with expert-bias correction, router_scaling_factor=2.826 + - route_norm = True (normalize top-k weights to sum to 1) + - GQA: 32 Q heads, 4 KV heads, head_dim=128, hidden_size=2048 + - Per-head Q/K RMSNorm (qk_norm=True) before RoPE + - 256K context, rope_theta=11158840 + - dense intermediate_size=6912, moe_intermediate_size=expert_hidden_dim=768 + - vocab_size=120832 + - enable_lm_head_fp32 = True (HF reference upcasts lm_head to fp32) + +Notes vs. ``components/models/hy_v3`` (Hy3-preview 295B): + - Smaller everywhere (48L / 128 experts / 32+4 heads / hidden=2048). + - Adds an in-model ``enable_lm_head_fp32`` fallback (applies when the + YAML's ``lm_head_precision`` is not set). The preferred path is to set + ``distributed.moe.lm_head_precision: float32`` in the YAML, which the + MoE parallelizer handles via ``MixedPrecisionPolicy``. + - ``score_func`` is driven by ``config.moe_router_use_sigmoid`` instead + of being hard-coded. 
+""" + +from typing import Any + +import torch +import torch.nn as nn + +from nemo_automodel.components.models.common import ( + BackendConfig, + get_rope_config, + initialize_linear_module, + initialize_rms_norm_module, +) +from nemo_automodel.components.models.common.hf_checkpointing_mixin import HFCheckpointingMixin +from nemo_automodel.components.models.common.utils import cast_model_to_dtype +from nemo_automodel.components.models.gpt_oss.rope_utils import RotaryEmbedding, position_ids_to_freqs_cis +from nemo_automodel.components.models.hy_mt2.layers import HyMT2Attention +from nemo_automodel.components.models.hy_mt2.state_dict_adapter import HyMT2StateDictAdapter +from nemo_automodel.components.moe.config import MoEConfig +from nemo_automodel.components.moe.fsdp_mixin import MoEFSDPSyncMixin +from nemo_automodel.components.moe.layers import MLP, MoE +from nemo_automodel.components.utils.model_utils import squeeze_input_for_thd +from nemo_automodel.shared.utils import dtype_from_str as get_dtype + + +def _resolve_score_func(config: Any) -> str: + """Map ``config.moe_router_use_sigmoid`` to a gate ``score_func`` name. + + Returns "sigmoid" when the flag is True (Hy-MT2 default) and "softmax" + otherwise. The bias-aware variants ("sigmoid_with_bias" / + "softmax_with_bias") are selected at the gate level by the presence of + ``e_score_correction_bias`` plus expert-group routing, which Hy-MT2 does + not use (n_expert_groups=0). 
+ """ + use_sigmoid = bool(getattr(config, "moe_router_use_sigmoid", True)) + return "sigmoid" if use_sigmoid else "softmax" + + +class Block(nn.Module): + """Single Hy-MT2 transformer block: attention + (dense MLP | MoE) + residual norms.""" + + def __init__(self, layer_idx: int, config: Any, moe_config: MoEConfig, backend: BackendConfig): + super().__init__() + self.self_attn = HyMT2Attention(config, backend) + + first_k_dense = getattr(config, "first_k_dense_replace", 1) + if layer_idx < first_k_dense: + self.mlp = MLP(config.hidden_size, config.intermediate_size, backend.linear) + else: + self.mlp = MoE(moe_config, backend) + + self.input_layernorm = initialize_rms_norm_module(backend.rms_norm, config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = initialize_rms_norm_module( + backend.rms_norm, config.hidden_size, eps=config.rms_norm_eps + ) + self.layer_idx = layer_idx + + def forward( + self, + x: torch.Tensor, + *, + freqs_cis: torch.Tensor, + attention_mask: torch.Tensor | None = None, + padding_mask: torch.Tensor | None = None, + **attn_kwargs: Any, + ) -> torch.Tensor: + if attention_mask is not None and padding_mask is None: + padding_mask = attention_mask.bool().logical_not() + + attn_out = self.self_attn( + x=self.input_layernorm(x), + freqs_cis=freqs_cis, + attention_mask=attention_mask, + **attn_kwargs, + ) + x = x + attn_out + + mlp_out = self._mlp(x=self.post_attention_layernorm(x), padding_mask=padding_mask) + x = x + mlp_out + return x + + def _mlp(self, x: torch.Tensor, padding_mask: torch.Tensor | None) -> torch.Tensor: + if isinstance(self.mlp, MLP): + return self.mlp(x) + assert isinstance(self.mlp, MoE) + return self.mlp(x, padding_mask) + + def init_weights(self, buffer_device: torch.device): + for norm in (self.input_layernorm, self.post_attention_layernorm): + norm.reset_parameters() + self.self_attn.init_weights(buffer_device) + self.mlp.init_weights(buffer_device) + + +class HyMT2Model(nn.Module): + """Hy-MT2 
backbone: token embeddings + transformer blocks + final RMSNorm. + + The MoE / dense split is governed by ``config.first_k_dense_replace`` + (layer 0 dense, the rest MoE for the published Hy-MT2-30B-A3B). The + MoE configuration is assembled from the HF config fields and forwarded + to every MoE-bearing ``Block``. + """ + + def __init__( + self, + config: Any, + backend: BackendConfig, + *, + moe_config: MoEConfig | None = None, + moe_overrides: dict | None = None, + ): + super().__init__() + self.backend = backend + self.config = config + if moe_config is not None and moe_overrides is not None: + raise ValueError("Cannot pass both moe_config and moe_overrides.") + + # ``expert_hidden_dim`` and ``moe_intermediate_size`` are synonyms in + # the on-disk config. Prefer ``expert_hidden_dim`` when present + # (matches the field name used by the HF reference for the expert MLP + # hidden dim); fall back to ``moe_intermediate_size`` otherwise. + moe_inter = getattr(config, "expert_hidden_dim", None) + if moe_inter is None: + moe_inter = config.moe_intermediate_size + + moe_defaults = dict( + dim=config.hidden_size, + inter_dim=config.intermediate_size, + moe_inter_dim=moe_inter, + n_routed_experts=config.num_experts, + n_shared_experts=getattr(config, "num_shared_experts", 0), + n_activated_experts=config.num_experts_per_tok, + n_expert_groups=0, + n_limited_groups=0, + train_gate=True, + gate_bias_update_factor=0.0, + score_func=_resolve_score_func(config), + route_scale=getattr(config, "router_scaling_factor", 1.0), + aux_loss_coeff=0.0, + norm_topk_prob=getattr(config, "route_norm", True), + expert_bias=False, + router_bias=False, + expert_activation="swiglu", + softmax_before_topk=False, + # Ensures e_score_correction_bias buffer is created so HF + # checkpoints with ``expert_bias`` load cleanly. 
+ force_e_score_correction_bias=getattr(config, "moe_router_enable_expert_bias", False), + ) + if moe_overrides: + moe_defaults.update(moe_overrides) + self.moe_config = moe_config or MoEConfig(**moe_defaults) + + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, dtype=get_dtype(config.torch_dtype, torch.bfloat16) + ) + self.layers = torch.nn.ModuleDict() + for layer_id in range(config.num_hidden_layers): + self.layers[str(layer_id)] = Block(layer_id, config, self.moe_config, backend) + self.norm = initialize_rms_norm_module(backend.rms_norm, config.hidden_size, eps=config.rms_norm_eps) + + self.max_seq_len = config.max_position_embeddings + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + + base, rope_scaling, _ = get_rope_config(config) + + self.rotary_emb = RotaryEmbedding( + head_dim=self.head_dim, + base=base, + dtype=torch.float32, + initial_context_length=rope_scaling.get("original_max_position_embeddings", 4096), + scaling_factor=rope_scaling.get("factor", 1.0), + ntk_alpha=rope_scaling.get("beta_slow", 1.0), + ntk_beta=rope_scaling.get("beta_fast", 32.0), + device=torch.device(f"cuda:{torch.cuda.current_device()}") + if torch.cuda.is_available() + else torch.device("cpu"), + ) + + def forward( + self, + input_ids: torch.Tensor, + *, + position_ids: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + padding_mask: torch.Tensor | None = None, + **attn_kwargs: Any, + ) -> torch.Tensor: + if position_ids is None: + position_ids = ( + torch.arange(0, input_ids.shape[1], device=input_ids.device).unsqueeze(0).expand(input_ids.shape[0], -1) + ) + + freqs_cis = position_ids_to_freqs_cis( + self.rotary_emb, + position_ids, + qkv_format=attn_kwargs.get("qkv_format", "bshd"), + for_fused_rope=self.backend.rope_fusion, + cp_size=attn_kwargs.get("cp_size", 1), + ) + + h = self.embed_tokens(input_ids) if self.embed_tokens is not None else input_ids + + for layer in 
self.layers.values(): + h = layer( + x=h, + freqs_cis=freqs_cis, + attention_mask=attention_mask, + padding_mask=padding_mask, + **attn_kwargs, + ) + + h = self.norm(h) if self.norm else h + return h + + @torch.no_grad() + def init_weights(self, buffer_device: torch.device | None = None) -> None: + if buffer_device is None: + buffer_device = ( + torch.device(f"cuda:{torch.cuda.current_device()}") + if torch.cuda.is_available() + else torch.device("cpu") + ) + with buffer_device: + if self.embed_tokens is not None: + nn.init.normal_(self.embed_tokens.weight) + if self.norm is not None: + self.norm.reset_parameters() + self.rotary_emb.device = buffer_device + + for layer in self.layers.values(): + if layer is not None: + layer.init_weights(buffer_device=buffer_device) + + +class HyMT2ForCausalLM(HFCheckpointingMixin, nn.Module, MoEFSDPSyncMixin): + """Hy-MT2-30B-A3B causal-LM wrapper. + + Mixes in ``MoEFSDPSyncMixin`` so EP / FSDP2 expert-gradient sync works + out of the box (set ``distributed.ep_size`` in the YAML; must divide + ``num_experts``=128). The ``HFCheckpointingMixin`` provides + ``from_pretrained`` / ``save_pretrained`` over the HF safetensors layout. + """ + + @classmethod + def from_config( + cls, + config: Any, + moe_config: MoEConfig | None = None, + backend: BackendConfig | None = None, + **kwargs, + ): + return cls(config, moe_config, backend, **kwargs) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + *model_args, + **kwargs, + ): + from transformers import AutoConfig + + # The on-disk Hy-MT2 checkpoint declares ``model_type: hy_v3`` so + # ``AutoConfig`` returns a ``HYV3Config`` instance. Our model code + # is duck-typed against the field names (which match) so this works + # transparently. 
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=False)
+        return cls.from_config(config, *model_args, **kwargs)
+
+    def __init__(
+        self,
+        config: Any,
+        moe_config: MoEConfig | None = None,
+        backend: BackendConfig | None = None,
+        **kwargs,
+    ):
+        """Build the backbone, the LM head and (optionally) the HF state-dict adapter.
+
+        Args:
+            config: Model config. ``hidden_size``, ``vocab_size``, ``torch_dtype`` and the
+                optional ``enable_lm_head_fp32`` flag are read here; the object is also
+                handed to ``HyMT2Model``.
+            moe_config: Optional MoE config forwarded to ``HyMT2Model``.
+            backend: Backend selection; a default ``BackendConfig()`` is used when omitted.
+            **kwargs: Only ``moe_overrides`` is consumed (forwarded to ``HyMT2Model``).
+        """
+        super().__init__()
+        self.config = config
+        self.backend = backend or BackendConfig()
+        moe_overrides = kwargs.pop("moe_overrides", None)
+        self.model = HyMT2Model(config, backend=self.backend, moe_config=moe_config, moe_overrides=moe_overrides)
+        self.lm_head = initialize_linear_module(self.backend.linear, config.hidden_size, config.vocab_size, bias=False)
+        # In-model fp32 fallback for the lm_head matmul. The preferred wiring
+        # is the YAML ``distributed.moe.lm_head_precision: float32``, which
+        # the MoE parallelizer enables via ``MixedPrecisionPolicy``. When that
+        # path is not used, ``enable_lm_head_fp32`` in the model config still
+        # triggers the in-forward upcast.
+        self._enable_lm_head_fp32 = bool(getattr(config, "enable_lm_head_fp32", False))
+        if self.backend.enable_hf_state_dict_adapter:
+            self.state_dict_adapter = HyMT2StateDictAdapter(
+                self.config,
+                self.model.moe_config,
+                self.backend,
+                dtype=get_dtype(config.torch_dtype, torch.bfloat16),
+            )
+
+    def get_input_embeddings(self):
+        """Return the backbone's token-embedding module."""
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Replace the backbone's token-embedding module with *value*."""
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        """Return the LM head."""
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the LM head with *new_embeddings*."""
+        self.lm_head = new_embeddings
+
+    def _lm_head_in_fp32(self, hidden: torch.Tensor) -> torch.Tensor:
+        """Apply ``lm_head`` to *hidden* upcast to fp32 and return the resulting logits."""
+        head = self.lm_head
+        weight = getattr(head, "weight", None)
+        # A plain ``nn.Linear`` whose weight was cast to a lower precision rejects an
+        # fp32 input (the matmul requires matching dtypes), so run the projection
+        # functionally on an fp32 copy of the weight. The exact-type checks leave
+        # wrapped / sharded / custom heads on the original module call so their own
+        # input handling is not bypassed. NOTE: this makes a temporary fp32 weight copy.
+        if (
+            type(head) is torch.nn.Linear
+            and type(weight) is torch.nn.Parameter
+            and weight.dtype != torch.float32
+        ):
+            bias = None if head.bias is None else head.bias.float()
+            return torch.nn.functional.linear(hidden.float(), weight.float(), bias)
+        return head(hidden.float())
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        *,
+        position_ids: torch.Tensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        padding_mask: torch.Tensor | None = None,
+        **attn_kwargs: Any,
+    ) -> torch.Tensor:
+        """Run the backbone and project its output to vocabulary logits.
+
+        When ``attn_kwargs["qkv_format"] == "thd"`` the inputs are first passed through
+        ``squeeze_input_for_thd``, ``attention_mask`` is dropped, and a leading dimension
+        of size 1 is added back to the logits before returning.
+
+        Returns:
+            The logits, or the backbone output unchanged when ``lm_head`` is ``None``.
+        """
+        if attn_kwargs.get("qkv_format") == "thd":
+            input_ids, position_ids, padding_mask, attn_kwargs = squeeze_input_for_thd(
+                input_ids, position_ids, padding_mask, attn_kwargs
+            )
+            attention_mask = None
+
+        hidden = self.model(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            padding_mask=padding_mask,
+            **attn_kwargs,
+        )
+
+        if self.lm_head is None:
+            logits = hidden
+        elif self._enable_lm_head_fp32 and hidden.dtype != torch.float32:
+            # Do the lm_head matmul in fp32, then cast logits back to the
+            # model dtype so downstream loss / sampling code is not surprised
+            # by an fp32 tensor. Matches the HF reference's
+            # ``enable_lm_head_fp32`` behavior.
+            original_dtype = hidden.dtype
+            logits = self._lm_head_in_fp32(hidden).to(original_dtype)
+        else:
+            logits = self.lm_head(hidden)
+
+        # Re-read the flag: ``squeeze_input_for_thd`` returned a new ``attn_kwargs``.
+        if attn_kwargs.get("qkv_format") == "thd":
+            logits = logits.unsqueeze(0)
+        return logits
+
+    def update_moe_gate_bias(self) -> None:
+        """Call ``gate.update_bias()`` on every MoE block whose gate has ``bias_update_factor > 0``."""
+        with torch.no_grad():
+            for block in self.model.layers.values():
+                if isinstance(block.mlp, MoE) and block.mlp.gate.bias_update_factor > 0:
+                    block.mlp.gate.update_bias()
+
+    @torch.no_grad()
+    def initialize_weights(
+        self, buffer_device: torch.device | None = None, dtype: torch.dtype = torch.bfloat16
+    ) -> None:
+        """Initialize backbone and LM-head weights, then cast the whole model to *dtype*.
+
+        Args:
+            buffer_device: Device used as the context for initialization; defaults to the
+                current CUDA device when CUDA is available, otherwise CPU.
+            dtype: Target dtype passed to ``cast_model_to_dtype``.
+        """
+        if buffer_device is None:
+            buffer_device = (
+                torch.device(f"cuda:{torch.cuda.current_device()}")
+                if torch.cuda.is_available()
+                else torch.device("cpu")
+            )
+        with buffer_device:
+            self.model.init_weights(buffer_device=buffer_device)
+            # LM head: truncated normal with std = hidden_size ** -0.5, clipped at 3 std.
+            final_out_std = self.config.hidden_size**-0.5
+            cutoff_factor = 3
+            if self.lm_head is not None:
+                nn.init.trunc_normal_(
+                    self.lm_head.weight,
+                    mean=0.0,
+                    std=final_out_std,
+                    a=-cutoff_factor * final_out_std,
+                    b=cutoff_factor * final_out_std,
+                )
+
+        cast_model_to_dtype(self, dtype)
+        with buffer_device:
+            self.model.rotary_emb.device = buffer_device
+
+
+ModelClass = HyMT2ForCausalLM
diff --git a/nemo_automodel/components/models/hy_mt2/state_dict_adapter.py b/nemo_automodel/components/models/hy_mt2/state_dict_adapter.py
new file mode 100644
index 0000000000..7d98b4dd4e
--- /dev/null
+++
b/nemo_automodel/components/models/hy_mt2/state_dict_adapter.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""State dict conversion between the on-disk tencent/Hy-MT2-30B-A3B HF
+checkpoint and Automodel's native (grouped-experts) format.
+
+The on-disk key layout is identical to tencent/Hy3-preview because both
+share ``model_type: "hy_v3"`` and ``architectures: ["HYV3ForCausalLM"]``:
+
+    model.layers.{L}.mlp.expert_bias  # [n_experts]
+    model.layers.{L}.mlp.router.gate.weight  # [n_experts, hidden]
+    model.layers.{L}.mlp.experts.{E}.gate_proj.weight  # [moe_inter, hidden]
+    model.layers.{L}.mlp.experts.{E}.up_proj.weight  # [moe_inter, hidden]
+    model.layers.{L}.mlp.experts.{E}.down_proj.weight  # [hidden, moe_inter]
+    model.layers.{L}.mlp.shared_mlp.{gate,up,down}_proj.weight  # shared expert
+
+Automodel native:
+
+    model.layers.{L}.mlp.gate.e_score_correction_bias  # [n_local]
+    model.layers.{L}.mlp.gate.weight  # [n_experts, hidden]
+    model.layers.{L}.mlp.experts.gate_and_up_projs  # grouped
+    model.layers.{L}.mlp.experts.down_projs  # grouped
+    model.layers.{L}.mlp.shared_experts.{gate,up,down}_proj.weight
+
+This adapter handles three on-disk-specific renames plus per-expert
+split/merge (via ``MoESplitExpertsStateDictMixin``). It is functionally a
+clone of ``HYV3StateDictAdapter``; kept separate so future Hy-MT2-only
+key changes (e.g. an MTP / aux-head extension that Hy-MT2 ships but
+Hy3-preview does not) can be added here without affecting Hy3-preview.
+"""
+
+import logging
+import re
+from typing import Any, Optional
+
+import torch
+from torch.distributed.device_mesh import DeviceMesh
+
+from nemo_automodel.components.checkpoint.state_dict_adapter import StateDictAdapter
+from nemo_automodel.components.models.common import BackendConfig
+from nemo_automodel.components.moe.config import MoEConfig
+from nemo_automodel.components.moe.state_dict_mixin import MoESplitExpertsStateDictMixin
+
+logger = logging.getLogger(__name__)
+
+
+_NATIVE_TO_HF_RENAMES: tuple[tuple[re.Pattern[str], str], ...] = (
+    (re.compile(r"\.mlp\.gate\.e_score_correction_bias$"), ".mlp.expert_bias"),
+    (re.compile(r"\.mlp\.gate\.weight$"), ".mlp.router.gate.weight"),
+    (re.compile(r"\.mlp\.shared_experts\."), ".mlp.shared_mlp."),
+)
+_HF_TO_NATIVE_RENAMES: tuple[tuple[re.Pattern[str], str], ...] = (
+    (re.compile(r"\.mlp\.expert_bias$"), ".mlp.gate.e_score_correction_bias"),
+    (re.compile(r"\.mlp\.router\.gate\.weight$"), ".mlp.gate.weight"),
+    (re.compile(r"\.mlp\.shared_mlp\."), ".mlp.shared_experts."),
+)
+
+
+def _rename_key(key: str, renames: tuple[tuple[re.Pattern[str], str], ...]) -> str:
+    """Return *key* rewritten by the first rule in *renames* that matches it.
+
+    Each rule is a ``(compiled pattern, replacement)`` pair; scanning stops at the
+    first rule that substitutes, and a key matching no rule is returned unchanged.
+    """
+    for pattern, replacement in renames:
+        renamed, hits = pattern.subn(replacement, key)
+        if hits:
+            return renamed
+    return key
+
+
+class HyMT2StateDictAdapter(MoESplitExpertsStateDictMixin, StateDictAdapter):
+    """Bridges Automodel native (grouped experts) and on-disk Hy-MT2 HF format."""
+
+    def __init__(
+        self,
+        config: Any,
+        moe_config: MoEConfig,
+        backend: BackendConfig,
+        dtype: torch.dtype = torch.bfloat16,
+    ):
+        """Store the model config, MoE config, backend config and dtype on the adapter."""
+        self.config = config
+        self.moe_config = moe_config
+        self.backend = backend
+        self.dtype = dtype
+        # Not read in this class; presumably consumed by the split-experts mixin -- TODO confirm.
+        self._uses_model_prefix = True
+
+    def to_hf(
+        self,
+        state_dict: dict[str, Any],
+        exclude_key_regex: Optional[str] = None,
+        **kwargs,
+    ) -> dict[str, Any]:
+        """Native -> on-disk Hy-MT2 HF: per-expert split + name renames.
+
+        A key is dropped when its *renamed* form matches ``exclude_key_regex``
+        under ``re.match`` (anchored at the start of the key).
+        """
+        hf_split: dict[str, Any] = self._to_hf_w_split_experts(state_dict)
+
+        out: dict[str, Any] = {}
+        for k, v in hf_split.items():
+            new_k = _rename_key(k, _NATIVE_TO_HF_RENAMES)
+            if exclude_key_regex and re.match(exclude_key_regex, new_k):
+                continue
+            out[new_k] = v
+        return out
+
+    def from_hf(
+        self,
+        hf_state_dict: dict[str, Any],
+        device_mesh: Optional[DeviceMesh] = None,
+        **kwargs,
+    ) -> dict[str, Any]:
+        """On-disk Hy-MT2 HF -> native: filter MTP, rename, then merge experts."""
+        renamed: dict[str, Any] = {
+            _rename_key(k, _HF_TO_NATIVE_RENAMES): v
+            for k, v in hf_state_dict.items()
+            if not self._is_mtp_key(k)
+        }
+        return self._from_hf_w_merged_experts(renamed, device_mesh)
+
+    def convert_single_tensor_to_hf(
+        self,
+        fqn: str,
+        tensor: Any,
+        **kwargs,
+    ) -> list[tuple[str, Any]]:
+        """Per-tensor variant of ``to_hf`` for streaming-save code paths."""
+        exclude_key_regex = kwargs.get("exclude_key_regex", None)
+
+        expert_split = self._convert_single_merged_expert_to_hf_split_experts(fqn, tensor, **kwargs)
+        # No split returned (presumably not a grouped-expert tensor): keep the single pair.
+        pairs = expert_split if expert_split is not None else [(fqn, tensor)]
+
+        out: list[tuple[str, Any]] = []
+        for k, v in pairs:
+            new_k = _rename_key(k, _NATIVE_TO_HF_RENAMES)
+            if exclude_key_regex and re.match(exclude_key_regex, new_k):
+                continue
+            out.append((new_k, v))
+        return out
+
+    def _is_mtp_key(self, key: str) -> bool:
+        """Return True if *key* belongs to an MTP layer (index >= num_hidden_layers).
+
+        Hy-MT2-30B-A3B does not appear to ship MTP layers in its public
+        checkpoint, but the filter is kept as a defensive no-op so the
+        adapter remains symmetric with ``HYV3StateDictAdapter``.
+        """
+        num_hidden = getattr(self.config, "num_hidden_layers", 48)
+        m = re.match(r"(?:model\.)?layers\.(\d+)\.", key)
+        return bool(m and int(m.group(1)) >= num_hidden)
diff --git a/tests/unit_tests/models/hy_mt2/__init__.py b/tests/unit_tests/models/hy_mt2/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_config.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_config.py
new file mode 100644
index 0000000000..2b30ae20b9
--- /dev/null
+++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_config.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for ``HyMT2Config``."""
+
+from transformers import PretrainedConfig
+
+from nemo_automodel.components.models.hy_mt2.config import HyMT2Config
+
+
+class TestDefaults:
+    def test_model_type(self):
+        assert HyMT2Config.model_type == "hy_mt2"
+
+    def test_inherits_pretrained_config(self):
+        cfg = HyMT2Config()
+        assert isinstance(cfg, PretrainedConfig)
+
+    def test_default_attributes_match_30b_a3b(self):
+        cfg = HyMT2Config()
+        # Architecture defaults from the published Hy-MT2-30B-A3B config.json.
+ assert cfg.vocab_size == 120832 + assert cfg.hidden_size == 2048 + assert cfg.intermediate_size == 6912 + assert cfg.moe_intermediate_size == 768 + assert cfg.expert_hidden_dim == 768 + assert cfg.num_hidden_layers == 48 + assert cfg.num_attention_heads == 32 + assert cfg.num_key_value_heads == 4 + assert cfg.head_dim == 128 + assert cfg.num_experts == 128 + assert cfg.num_shared_experts == 1 + assert cfg.num_experts_per_tok == 8 + assert cfg.first_k_dense_replace == 1 + assert cfg.max_position_embeddings == 262144 + assert cfg.rope_theta == 11158840.0 + assert cfg.rms_norm_eps == 1e-5 + assert cfg.attention_bias is False + assert cfg.hidden_act == "silu" + assert cfg.qk_norm is True + assert cfg.route_norm is True + assert cfg.router_scaling_factor == 2.826 + assert cfg.moe_router_use_sigmoid is True + assert cfg.moe_router_enable_expert_bias is True + assert cfg.enable_lm_head_fp32 is True + assert cfg.enable_attention_fp32_softmax is False + assert cfg.enable_moe_fp32_combine is False + assert cfg.tie_word_embeddings is False + # torch_dtype is auto-coerced by PretrainedConfig in newer transformers + # (deprecated -> dtype); accept either the string we set or whatever + # the base class normalizes to. 
+ assert cfg.torch_dtype in ("bfloat16", None) or str(cfg.torch_dtype).endswith("bfloat16") + + def test_default_token_ids(self): + cfg = HyMT2Config() + assert cfg.pad_token_id == 120002 + assert cfg.bos_token_id == 120000 + assert cfg.eos_token_id == 120025 + + def test_keys_to_ignore_at_inference(self): + assert HyMT2Config.keys_to_ignore_at_inference == ["past_key_values"] + + +class TestOverrides: + def test_override_attention_dims(self): + cfg = HyMT2Config(num_attention_heads=8, num_key_value_heads=2, head_dim=64, hidden_size=512) + assert cfg.num_attention_heads == 8 + assert cfg.num_key_value_heads == 2 + assert cfg.head_dim == 64 + assert cfg.hidden_size == 512 + + def test_override_moe_routing(self): + cfg = HyMT2Config( + num_experts=64, + num_experts_per_tok=4, + num_shared_experts=2, + router_scaling_factor=1.5, + route_norm=False, + ) + assert cfg.num_experts == 64 + assert cfg.num_experts_per_tok == 4 + assert cfg.num_shared_experts == 2 + assert cfg.router_scaling_factor == 1.5 + assert cfg.route_norm is False + + def test_override_router_flavor(self): + cfg = HyMT2Config(moe_router_use_sigmoid=False, moe_router_enable_expert_bias=False) + assert cfg.moe_router_use_sigmoid is False + assert cfg.moe_router_enable_expert_bias is False + + def test_truncated_layer_count(self): + cfg = HyMT2Config(num_hidden_layers=4) + assert cfg.num_hidden_layers == 4 + + def test_first_k_dense_replace(self): + cfg = HyMT2Config(first_k_dense_replace=3) + assert cfg.first_k_dense_replace == 3 + + def test_rope_overrides(self): + cfg = HyMT2Config(rope_theta=500000.0, max_position_embeddings=4096) + assert cfg.rope_theta == 500000.0 + assert cfg.max_position_embeddings == 4096 + + def test_rope_scaling_dict(self): + scaling = {"factor": 8.0, "rope_type": "yarn"} + cfg = HyMT2Config(rope_scaling=scaling) + assert cfg.rope_scaling == scaling + + def test_qk_norm_override(self): + cfg = HyMT2Config(qk_norm=False) + assert cfg.qk_norm is False + + def 
test_lm_head_fp32_override(self): + cfg = HyMT2Config(enable_lm_head_fp32=False) + assert cfg.enable_lm_head_fp32 is False + + def test_expert_hidden_dim_override(self): + cfg = HyMT2Config(expert_hidden_dim=1024) + assert cfg.expert_hidden_dim == 1024 + + def test_token_ids(self): + cfg = HyMT2Config(pad_token_id=0, bos_token_id=10, eos_token_id=11) + assert cfg.pad_token_id == 0 + assert cfg.bos_token_id == 10 + assert cfg.eos_token_id == 11 + + def test_super_init_kwargs_accepted(self): + # Verify PretrainedConfig kwargs flow through without raising. + HyMT2Config(use_cache=False, tie_word_embeddings=True) + + def test_extra_kwargs_pass_through_super_init(self): + # PretrainedConfig **kwargs in newer transformers no longer attaches + # arbitrary fields; the call should still succeed. + cfg = HyMT2Config(custom_field="abc") + assert isinstance(cfg, HyMT2Config) + + +class TestSerialization: + def test_to_dict_round_trip(self): + cfg = HyMT2Config(num_hidden_layers=4, num_experts=8, hidden_size=256) + d = cfg.to_dict() + assert d["model_type"] == "hy_mt2" + assert d["num_hidden_layers"] == 4 + assert d["num_experts"] == 8 + + rebuilt = HyMT2Config(**{k: v for k, v in d.items() if k != "model_type"}) + assert rebuilt.num_hidden_layers == 4 + assert rebuilt.num_experts == 8 + assert rebuilt.hidden_size == 256 + + def test_model_type_class_attribute_not_overridden_by_instance(self): + cfg = HyMT2Config() + assert cfg.model_type == "hy_mt2" + assert HyMT2Config.model_type == "hy_mt2" diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_layers.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_layers.py new file mode 100644 index 0000000000..ee5f014053 --- /dev/null +++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_layers.py @@ -0,0 +1,164 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for ``HyMT2Attention``.""" + +from unittest.mock import patch + +import pytest +import torch + +from nemo_automodel.components.models.common import BackendConfig +from nemo_automodel.components.models.hy_mt2.config import HyMT2Config +from nemo_automodel.components.models.hy_mt2.layers import HyMT2Attention + +pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + + +HIDDEN = 64 +N_HEADS = 8 +N_KV = 2 +HEAD_DIM = 16 + + +@pytest.fixture +def device(): + return torch.device(f"cuda:{torch.cuda.current_device()}") + + +@pytest.fixture +def config(): + return HyMT2Config( + vocab_size=128, + hidden_size=HIDDEN, + intermediate_size=128, + moe_intermediate_size=64, + num_hidden_layers=1, + num_attention_heads=N_HEADS, + num_key_value_heads=N_KV, + head_dim=HEAD_DIM, + max_position_embeddings=128, + rope_theta=10000.0, + rms_norm_eps=1e-5, + ) + + +@pytest.fixture +def sdpa_backend(): + return BackendConfig( + linear="torch", + attn="sdpa", + rms_norm="torch", + experts="torch", + dispatcher="torch", + fake_balanced_gate=False, + enable_hf_state_dict_adapter=False, + rope_fusion=False, + ) + + +def _make_freqs_cis(seq_len: int, device: torch.device) -> torch.Tensor: + """Synthesize a freqs_cis tensor matching ``apply_rotary_emb_qk(format='bshd')``.""" + return torch.zeros(1, seq_len, HEAD_DIM, device=device) + + +class TestInit: + def test_module_attributes(self, config, sdpa_backend): + attn = HyMT2Attention(config, backend=sdpa_backend) + assert attn.num_heads == N_HEADS + assert attn.num_kv_heads == N_KV + 
assert attn.head_dim == HEAD_DIM + assert attn.backend is sdpa_backend + assert attn.qk_norm_enabled is True + + def test_projection_shapes(self, config, sdpa_backend): + attn = HyMT2Attention(config, backend=sdpa_backend) + assert attn.q_proj.weight.shape == (N_HEADS * HEAD_DIM, HIDDEN) + assert attn.k_proj.weight.shape == (N_KV * HEAD_DIM, HIDDEN) + assert attn.v_proj.weight.shape == (N_KV * HEAD_DIM, HIDDEN) + assert attn.o_proj.weight.shape == (HIDDEN, N_HEADS * HEAD_DIM) + + def test_q_k_norm_per_head_dim_when_enabled(self, config, sdpa_backend): + attn = HyMT2Attention(config, backend=sdpa_backend) + assert attn.q_norm is not None + assert attn.k_norm is not None + assert attn.q_norm.weight.shape == (HEAD_DIM,) + assert attn.k_norm.weight.shape == (HEAD_DIM,) + + def test_qk_norm_disabled_when_config_flag_false(self, config, sdpa_backend): + config.qk_norm = False + attn = HyMT2Attention(config, backend=sdpa_backend) + assert attn.qk_norm_enabled is False + assert attn.q_norm is None + assert attn.k_norm is None + + def test_no_attention_bias_by_default(self, config, sdpa_backend): + attn = HyMT2Attention(config, backend=sdpa_backend) + assert attn.q_proj.bias is None + assert attn.k_proj.bias is None + assert attn.v_proj.bias is None + assert attn.o_proj.bias is None + + +class TestForward: + def test_output_shape_bshd(self, config, sdpa_backend, device): + attn = HyMT2Attention(config, backend=sdpa_backend).to(device) + bsz, seqlen = 2, 4 + x = torch.randn(bsz, seqlen, HIDDEN, device=device, dtype=torch.bfloat16) + freqs = _make_freqs_cis(seqlen, device) + + out = attn(x, freqs_cis=freqs) + assert out.shape == (bsz, seqlen, HIDDEN) + + def test_calls_q_k_v_o_projections(self, config, sdpa_backend, device): + attn = HyMT2Attention(config, backend=sdpa_backend).to(device) + x = torch.randn(1, 3, HIDDEN, device=device, dtype=torch.bfloat16) + freqs = _make_freqs_cis(3, device) + with ( + patch.object(attn.q_proj, "forward", wraps=attn.q_proj.forward) as q, + 
patch.object(attn.k_proj, "forward", wraps=attn.k_proj.forward) as k, + patch.object(attn.v_proj, "forward", wraps=attn.v_proj.forward) as v, + patch.object(attn.o_proj, "forward", wraps=attn.o_proj.forward) as o, + ): + attn(x, freqs_cis=freqs) + q.assert_called_once() + k.assert_called_once() + v.assert_called_once() + o.assert_called_once() + + def test_forward_skips_norms_when_qk_norm_disabled(self, config, sdpa_backend, device): + config.qk_norm = False + attn = HyMT2Attention(config, backend=sdpa_backend).to(device) + x = torch.randn(1, 3, HIDDEN, device=device, dtype=torch.bfloat16) + freqs = _make_freqs_cis(3, device) + out = attn(x, freqs_cis=freqs) + assert out.shape == x.shape + + +class TestInitWeights: + def test_resets_norms_and_linears_when_qk_norm_enabled(self, config, sdpa_backend, device): + attn = HyMT2Attention(config, backend=sdpa_backend).to(device) + with ( + patch.object(attn.q_norm, "reset_parameters") as qn, + patch.object(attn.k_norm, "reset_parameters") as kn, + ): + attn.init_weights(buffer_device=device, init_std=0.01) + qn.assert_called_once() + kn.assert_called_once() + + def test_init_weights_no_qk_norm(self, config, sdpa_backend, device): + config.qk_norm = False + attn = HyMT2Attention(config, backend=sdpa_backend).to(device) + # Should not raise even though q_norm / k_norm are None. + attn.init_weights(buffer_device=device, init_std=0.01) diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py new file mode 100644 index 0000000000..19e0a7975b --- /dev/null +++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py @@ -0,0 +1,285 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for the Hy-MT2 Block / HyMT2Model / HyMT2ForCausalLM layers.""" + +from unittest.mock import patch + +import pytest +import torch + +from nemo_automodel.components.models.common import BackendConfig +from nemo_automodel.components.models.hy_mt2.config import HyMT2Config +from nemo_automodel.components.models.hy_mt2.model import ( + Block, + HyMT2ForCausalLM, + HyMT2Model, + ModelClass, + _resolve_score_func, +) +from nemo_automodel.components.moe.config import MoEConfig +from nemo_automodel.components.moe.layers import MLP, MoE + +pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + + +HIDDEN = 64 +INTER = 128 +MOE_INTER = 64 +N_HEADS = 8 +N_KV = 2 +HEAD_DIM = 16 +N_EXPERTS = 4 + + +@pytest.fixture +def device(): + return torch.device(f"cuda:{torch.cuda.current_device()}") + + +@pytest.fixture +def config(): + return HyMT2Config( + vocab_size=128, + hidden_size=HIDDEN, + intermediate_size=INTER, + moe_intermediate_size=MOE_INTER, + expert_hidden_dim=MOE_INTER, + num_hidden_layers=2, + num_attention_heads=N_HEADS, + num_key_value_heads=N_KV, + head_dim=HEAD_DIM, + num_experts=N_EXPERTS, + num_experts_per_tok=2, + num_shared_experts=1, + first_k_dense_replace=1, + max_position_embeddings=128, + rope_theta=10000.0, + rms_norm_eps=1e-5, + router_scaling_factor=2.826, + route_norm=True, + moe_router_use_sigmoid=True, + moe_router_enable_expert_bias=True, + enable_lm_head_fp32=True, + ) + + +@pytest.fixture +def backend_config(): + return BackendConfig( + linear="torch", + attn="sdpa", + rms_norm="torch", 
+ experts="torch", + dispatcher="torch", + fake_balanced_gate=False, + gate_precision="float32", + rope_fusion=False, + enable_hf_state_dict_adapter=False, + enable_fsdp_optimizations=False, + ) + + +@pytest.fixture +def moe_config(config): + return MoEConfig( + dim=config.hidden_size, + inter_dim=config.intermediate_size, + moe_inter_dim=config.moe_intermediate_size, + n_routed_experts=config.num_experts, + n_shared_experts=config.num_shared_experts, + n_activated_experts=config.num_experts_per_tok, + n_expert_groups=0, + n_limited_groups=0, + train_gate=True, + gate_bias_update_factor=0.0, + score_func="sigmoid", + route_scale=config.router_scaling_factor, + aux_loss_coeff=0.0, + norm_topk_prob=True, + expert_bias=False, + router_bias=False, + expert_activation="swiglu", + softmax_before_topk=False, + force_e_score_correction_bias=True, + ) + + +class TestResolveScoreFunc: + def test_default_is_sigmoid(self): + # Config without the flag falls back to sigmoid (Hy-MT2 default). + class _NoFlag: + pass + + assert _resolve_score_func(_NoFlag()) == "sigmoid" + + def test_true_maps_to_sigmoid(self): + class _Cfg: + moe_router_use_sigmoid = True + + assert _resolve_score_func(_Cfg()) == "sigmoid" + + def test_false_maps_to_softmax(self): + class _Cfg: + moe_router_use_sigmoid = False + + assert _resolve_score_func(_Cfg()) == "softmax" + + +class TestBlock: + def test_dense_layer_uses_mlp_when_idx_below_first_k_dense(self, config, moe_config, backend_config): + config.first_k_dense_replace = 1 + block = Block(layer_idx=0, config=config, moe_config=moe_config, backend=backend_config) + assert isinstance(block.mlp, MLP) + + def test_moe_layer_uses_moe_when_idx_at_or_above_first_k_dense(self, config, moe_config, backend_config): + config.first_k_dense_replace = 1 + block = Block(layer_idx=1, config=config, moe_config=moe_config, backend=backend_config) + assert isinstance(block.mlp, MoE) + + def test_block_has_required_submodules(self, config, moe_config, backend_config): + 
block = Block(layer_idx=1, config=config, moe_config=moe_config, backend=backend_config) + assert hasattr(block, "self_attn") + assert hasattr(block, "mlp") + assert hasattr(block, "input_layernorm") + assert hasattr(block, "post_attention_layernorm") + assert block.layer_idx == 1 + + +class TestHyMT2Model: + def test_construction_sets_components(self, config, backend_config): + model = HyMT2Model(config, backend=backend_config) + assert len(model.layers) == config.num_hidden_layers + assert model.embed_tokens.num_embeddings == config.vocab_size + assert model.norm is not None + assert model.rotary_emb.head_dim == config.head_dim + assert isinstance(model.moe_config, MoEConfig) + + def test_dense_then_moe_layer_structure(self, config, backend_config): + config.first_k_dense_replace = 1 + config.num_hidden_layers = 3 + model = HyMT2Model(config, backend=backend_config) + assert isinstance(model.layers["0"].mlp, MLP) + assert isinstance(model.layers["1"].mlp, MoE) + assert isinstance(model.layers["2"].mlp, MoE) + + def test_moe_config_inferred_from_config(self, config, backend_config): + model = HyMT2Model(config, backend=backend_config) + mc = model.moe_config + assert mc.dim == config.hidden_size + assert mc.moe_inter_dim == config.moe_intermediate_size + assert mc.n_routed_experts == config.num_experts + assert mc.n_activated_experts == config.num_experts_per_tok + assert mc.n_shared_experts == config.num_shared_experts + assert mc.score_func == "sigmoid" # because moe_router_use_sigmoid=True + assert mc.expert_activation == "swiglu" + assert mc.route_scale == config.router_scaling_factor + assert mc.norm_topk_prob is True # because route_norm=True + assert mc.force_e_score_correction_bias is True + + def test_score_func_follows_use_sigmoid_flag(self, config, backend_config): + config.moe_router_use_sigmoid = False + model = HyMT2Model(config, backend=backend_config) + assert model.moe_config.score_func == "softmax" + + def 
test_expert_hidden_dim_preferred_over_moe_intermediate(self, config, backend_config): + # When both are set, expert_hidden_dim wins for the expert MLP dim. + config.expert_hidden_dim = 32 + config.moe_intermediate_size = 999 # would be wrong if used + model = HyMT2Model(config, backend=backend_config) + assert model.moe_config.moe_inter_dim == 32 + + def test_moe_overrides_take_effect(self, config, backend_config): + model = HyMT2Model(config, backend=backend_config, moe_overrides={"score_func": "softmax", "route_scale": 1.5}) + assert model.moe_config.score_func == "softmax" + assert model.moe_config.route_scale == 1.5 + + def test_explicit_moe_config_and_overrides_conflict(self, config, backend_config, moe_config): + with pytest.raises(ValueError, match="Cannot pass both"): + HyMT2Model(config, backend=backend_config, moe_config=moe_config, moe_overrides={"score_func": "softmax"}) + + +class TestHyMT2ForCausalLM: + def test_model_class_alias(self): + assert ModelClass is HyMT2ForCausalLM + + def test_construction(self, config, backend_config): + model = HyMT2ForCausalLM(config, backend=backend_config) + assert hasattr(model, "model") + assert hasattr(model, "lm_head") + assert model.config is config + assert model._enable_lm_head_fp32 is True + + def test_enable_lm_head_fp32_default_false_without_config_flag(self, backend_config): + # When the config does not declare the flag, default to False. 
+ class _Cfg: + vocab_size = 32 + hidden_size = HIDDEN + intermediate_size = INTER + moe_intermediate_size = MOE_INTER + num_hidden_layers = 1 + num_attention_heads = N_HEADS + num_key_value_heads = N_KV + head_dim = HEAD_DIM + num_experts = N_EXPERTS + num_experts_per_tok = 2 + num_shared_experts = 1 + first_k_dense_replace = 1 + max_position_embeddings = 128 + rope_theta = 10000.0 + rms_norm_eps = 1e-5 + torch_dtype = "bfloat16" + attention_bias = False + qk_norm = True + route_norm = False + router_scaling_factor = 1.0 + moe_router_enable_expert_bias = False + moe_router_use_sigmoid = True + + model = HyMT2ForCausalLM(_Cfg(), backend=backend_config) + assert model._enable_lm_head_fp32 is False + + def test_lm_head_fp32_casts_back_to_input_dtype(self, config, backend_config, device): + model = HyMT2ForCausalLM(config, backend=backend_config).to(device).to(torch.bfloat16) + # Mock the inner backbone so we can control the dtype of its output. + bf16_hidden = torch.randn(1, 4, HIDDEN, device=device, dtype=torch.bfloat16) + with patch.object(model.model, "forward", return_value=bf16_hidden): + input_ids = torch.randint(0, config.vocab_size, (1, 4), device=device) + logits = model(input_ids) + # Output logits dtype must match the input hidden dtype, not fp32. 
+ assert logits.dtype == torch.bfloat16 + + def test_lm_head_no_upcast_when_disabled(self, config, backend_config, device): + config.enable_lm_head_fp32 = False + model = HyMT2ForCausalLM(config, backend=backend_config).to(device).to(torch.bfloat16) + bf16_hidden = torch.randn(1, 4, HIDDEN, device=device, dtype=torch.bfloat16) + with patch.object(model.model, "forward", return_value=bf16_hidden): + input_ids = torch.randint(0, config.vocab_size, (1, 4), device=device) + logits = model(input_ids) + assert logits.dtype == torch.bfloat16 + + def test_get_set_input_embeddings(self, config, backend_config): + model = HyMT2ForCausalLM(config, backend=backend_config) + emb = model.get_input_embeddings() + assert emb is model.model.embed_tokens + new_emb = torch.nn.Embedding(8, HIDDEN) + model.set_input_embeddings(new_emb) + assert model.get_input_embeddings() is new_emb + + def test_get_set_output_embeddings(self, config, backend_config): + model = HyMT2ForCausalLM(config, backend=backend_config) + assert model.get_output_embeddings() is model.lm_head + new_head = torch.nn.Linear(HIDDEN, 8, bias=False) + model.set_output_embeddings(new_head) + assert model.get_output_embeddings() is new_head diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_state_dict_adapter.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_state_dict_adapter.py new file mode 100644 index 0000000000..facaab67f1 --- /dev/null +++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_state_dict_adapter.py @@ -0,0 +1,299 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for ``HyMT2StateDictAdapter``. + +Covers the rename tables, per-expert split/merge inherited from +``MoESplitExpertsStateDictMixin``, and the defensive MTP-layer filter. +""" + +from unittest.mock import Mock + +import pytest +import torch + +from nemo_automodel.components.models.common import BackendConfig +from nemo_automodel.components.models.hy_mt2.state_dict_adapter import ( + _HF_TO_NATIVE_RENAMES, + _NATIVE_TO_HF_RENAMES, + HyMT2StateDictAdapter, +) +from nemo_automodel.components.moe.config import MoEConfig + +N_EXPERTS = 4 +HIDDEN = 16 +MOE_INTER = 8 +NUM_LAYERS = 2 # layer 0 dense, layer 1 MoE + + +@pytest.fixture +def config(): + cfg = Mock() + cfg.num_hidden_layers = NUM_LAYERS + cfg.hidden_size = HIDDEN + cfg.intermediate_size = 32 + cfg.moe_intermediate_size = MOE_INTER + cfg.expert_hidden_dim = MOE_INTER + cfg.num_attention_heads = 4 + cfg.num_key_value_heads = 2 + cfg.num_experts = N_EXPERTS + cfg.num_experts_per_tok = 2 + cfg.num_shared_experts = 1 + cfg.first_k_dense_replace = 1 + return cfg + + +@pytest.fixture +def moe_config(): + return MoEConfig( + dim=HIDDEN, + inter_dim=32, + moe_inter_dim=MOE_INTER, + n_routed_experts=N_EXPERTS, + n_shared_experts=1, + n_activated_experts=2, + n_expert_groups=0, + n_limited_groups=0, + train_gate=True, + gate_bias_update_factor=0.0, + score_func="sigmoid", + route_scale=2.826, + aux_loss_coeff=0.0, + norm_topk_prob=True, + expert_bias=False, + router_bias=False, + expert_activation="swiglu", + softmax_before_topk=False, + force_e_score_correction_bias=True, + ) + + +@pytest.fixture 
+def backend_config(): + return BackendConfig( + linear="torch", + attn="sdpa", + rms_norm="torch", + experts="torch", + dispatcher="torch", + fake_balanced_gate=False, + enable_hf_state_dict_adapter=False, + ) + + +@pytest.fixture +def adapter(config, moe_config, backend_config): + return HyMT2StateDictAdapter(config=config, moe_config=moe_config, backend=backend_config, dtype=torch.float32) + + +def _make_disk_state_dict(*, with_mtp: bool = False): + """Synthesize an on-disk Hy-MT2 (== Hy3-preview key layout) state dict.""" + sd: dict[str, torch.Tensor] = { + "model.embed_tokens.weight": torch.randn(32, HIDDEN), + "model.norm.weight": torch.randn(HIDDEN), + "lm_head.weight": torch.randn(32, HIDDEN), + # Layer 0: dense + "model.layers.0.input_layernorm.weight": torch.randn(HIDDEN), + "model.layers.0.post_attention_layernorm.weight": torch.randn(HIDDEN), + "model.layers.0.self_attn.q_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.0.self_attn.k_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.0.self_attn.v_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.0.self_attn.o_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.0.mlp.gate_proj.weight": torch.randn(32, HIDDEN), + "model.layers.0.mlp.up_proj.weight": torch.randn(32, HIDDEN), + "model.layers.0.mlp.down_proj.weight": torch.randn(HIDDEN, 32), + # Layer 1: MoE with on-disk Tencent-internal names + "model.layers.1.input_layernorm.weight": torch.randn(HIDDEN), + "model.layers.1.post_attention_layernorm.weight": torch.randn(HIDDEN), + "model.layers.1.self_attn.q_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.1.self_attn.k_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.1.self_attn.v_proj.weight": torch.randn(HIDDEN // 2, HIDDEN), + "model.layers.1.self_attn.o_proj.weight": torch.randn(HIDDEN, HIDDEN), + "model.layers.1.mlp.router.gate.weight": torch.randn(N_EXPERTS, HIDDEN), + "model.layers.1.mlp.expert_bias": torch.randn(N_EXPERTS), + 
"model.layers.1.mlp.shared_mlp.gate_proj.weight": torch.randn(MOE_INTER, HIDDEN), + "model.layers.1.mlp.shared_mlp.up_proj.weight": torch.randn(MOE_INTER, HIDDEN), + "model.layers.1.mlp.shared_mlp.down_proj.weight": torch.randn(HIDDEN, MOE_INTER), + } + for e in range(N_EXPERTS): + sd[f"model.layers.1.mlp.experts.{e}.gate_proj.weight"] = torch.randn(MOE_INTER, HIDDEN) + sd[f"model.layers.1.mlp.experts.{e}.up_proj.weight"] = torch.randn(MOE_INTER, HIDDEN) + sd[f"model.layers.1.mlp.experts.{e}.down_proj.weight"] = torch.randn(HIDDEN, MOE_INTER) + if with_mtp: + sd[f"model.layers.{NUM_LAYERS}.input_layernorm.weight"] = torch.randn(HIDDEN) + sd[f"model.layers.{NUM_LAYERS}.mlp.expert_bias"] = torch.randn(N_EXPERTS) + return sd + + +class TestInitialization: + def test_attributes_set(self, config, moe_config, backend_config): + a = HyMT2StateDictAdapter(config=config, moe_config=moe_config, backend=backend_config, dtype=torch.float16) + assert a.config is config + assert a.moe_config is moe_config + assert a.backend is backend_config + assert a.dtype == torch.float16 + assert a._uses_model_prefix is True + + def test_default_dtype_is_bfloat16(self, config, moe_config, backend_config): + a = HyMT2StateDictAdapter(config=config, moe_config=moe_config, backend=backend_config) + assert a.dtype == torch.bfloat16 + + def test_inherits_split_experts_mixin(self, adapter): + from nemo_automodel.components.moe.state_dict_mixin import MoESplitExpertsStateDictMixin + + assert isinstance(adapter, MoESplitExpertsStateDictMixin) + + +class TestRenameTables: + """Each rename pattern must be reversible: native -> hf -> native.""" + + @pytest.mark.parametrize( + "native, hf", + [ + ("model.layers.5.mlp.gate.e_score_correction_bias", "model.layers.5.mlp.expert_bias"), + ("model.layers.5.mlp.gate.weight", "model.layers.5.mlp.router.gate.weight"), + ("model.layers.5.mlp.shared_experts.gate_proj.weight", "model.layers.5.mlp.shared_mlp.gate_proj.weight"), + 
("model.layers.5.mlp.shared_experts.up_proj.weight", "model.layers.5.mlp.shared_mlp.up_proj.weight"), + ("model.layers.5.mlp.shared_experts.down_proj.weight", "model.layers.5.mlp.shared_mlp.down_proj.weight"), + ], + ) + def test_round_trip(self, native, hf): + nk = native + for pat, repl in _NATIVE_TO_HF_RENAMES: + nk, n = pat.subn(repl, nk) + if n: + break + assert nk == hf + + hk = hf + for pat, repl in _HF_TO_NATIVE_RENAMES: + hk, n = pat.subn(repl, hk) + if n: + break + assert hk == native + + def test_unrelated_keys_pass_through(self): + """Renames must not touch attention, embed, lm_head, layernorm, or dense MLP keys.""" + for k in ( + "model.embed_tokens.weight", + "lm_head.weight", + "model.layers.0.self_attn.q_proj.weight", + "model.layers.0.input_layernorm.weight", + "model.layers.0.mlp.gate_proj.weight", # dense MLP gate_proj must NOT match + "model.norm.weight", + ): + for tab in (_NATIVE_TO_HF_RENAMES, _HF_TO_NATIVE_RENAMES): + v = k + for pat, repl in tab: + v, n = pat.subn(repl, v) + if n: + break + assert v == k, f"{k} unexpectedly renamed to {v}" + + +class TestFromHF: + def test_renames_router_gate(self, adapter): + native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None) + assert "model.layers.1.mlp.gate.weight" in native + assert "model.layers.1.mlp.router.gate.weight" not in native + + def test_renames_expert_bias_to_gate_bias(self, adapter): + native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None) + assert "model.layers.1.mlp.gate.e_score_correction_bias" in native + assert "model.layers.1.mlp.expert_bias" not in native + + def test_renames_shared_mlp_to_shared_experts(self, adapter): + native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None) + for proj in ("gate_proj", "up_proj", "down_proj"): + assert f"model.layers.1.mlp.shared_experts.{proj}.weight" in native + assert f"model.layers.1.mlp.shared_mlp.{proj}.weight" not in native + + def test_merges_experts_into_grouped_form(self, adapter): + native = 
adapter.from_hf(_make_disk_state_dict(), device_mesh=None) + for e in range(N_EXPERTS): + for proj in ("gate_proj", "up_proj", "down_proj"): + assert f"model.layers.1.mlp.experts.{e}.{proj}.weight" not in native + assert "model.layers.1.mlp.experts.gate_and_up_projs" in native + assert "model.layers.1.mlp.experts.down_projs" in native + + def test_merged_shapes_are_native_layout(self, adapter): + native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None) + assert tuple(native["model.layers.1.mlp.experts.gate_and_up_projs"].shape) == ( + N_EXPERTS, + HIDDEN, + 2 * MOE_INTER, + ) + assert tuple(native["model.layers.1.mlp.experts.down_projs"].shape) == ( + N_EXPERTS, + MOE_INTER, + HIDDEN, + ) + + def test_drops_mtp_layer_keys(self, adapter): + hf = _make_disk_state_dict(with_mtp=True) + assert any(k.startswith(f"model.layers.{NUM_LAYERS}.") for k in hf) + native = adapter.from_hf(hf, device_mesh=None) + assert not any(k.startswith(f"model.layers.{NUM_LAYERS}.") for k in native) + + +class TestToHF: + def test_renames_native_back_to_on_disk(self, adapter): + # Build a minimal native state dict; reuse from_hf to produce it. + native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None) + hf = adapter.to_hf(native) + # On-disk renames present after to_hf + assert "model.layers.1.mlp.router.gate.weight" in hf + assert "model.layers.1.mlp.expert_bias" in hf + for proj in ("gate_proj", "up_proj", "down_proj"): + assert f"model.layers.1.mlp.shared_mlp.{proj}.weight" in hf + # Native-only names must be gone. + assert "model.layers.1.mlp.gate.weight" not in hf + assert "model.layers.1.mlp.gate.e_score_correction_bias" not in hf + + def test_splits_grouped_experts_to_per_expert(self, adapter): + native = adapter.from_hf(_make_disk_state_dict(), device_mesh=None) + hf = adapter.to_hf(native) + # Per-expert keys re-appear after splitting. 
+ for e in range(N_EXPERTS): + for proj in ("gate_proj", "up_proj", "down_proj"): + assert f"model.layers.1.mlp.experts.{e}.{proj}.weight" in hf + # Grouped keys gone. + assert "model.layers.1.mlp.experts.gate_and_up_projs" not in hf + assert "model.layers.1.mlp.experts.down_projs" not in hf + + def test_round_trip_preserves_per_expert_weights(self, adapter): + """A full disk -> native -> disk round-trip preserves expert weights.""" + disk = _make_disk_state_dict() + native = adapter.from_hf(disk, device_mesh=None) + round_tripped = adapter.to_hf(native) + for e in range(N_EXPERTS): + for proj in ("gate_proj", "up_proj", "down_proj"): + key = f"model.layers.1.mlp.experts.{e}.{proj}.weight" + assert key in round_tripped + assert torch.allclose(round_tripped[key].to(disk[key].dtype), disk[key]) + + +class TestMTPFilter: + def test_filters_layer_at_num_hidden(self, adapter): + assert adapter._is_mtp_key(f"model.layers.{NUM_LAYERS}.foo") is True + assert adapter._is_mtp_key(f"layers.{NUM_LAYERS}.foo") is True + + def test_does_not_filter_in_range_layers(self, adapter): + assert adapter._is_mtp_key("model.layers.0.foo") is False + assert adapter._is_mtp_key(f"model.layers.{NUM_LAYERS - 1}.foo") is False + + def test_does_not_filter_non_layer_keys(self, adapter): + assert adapter._is_mtp_key("model.embed_tokens.weight") is False + assert adapter._is_mtp_key("lm_head.weight") is False + assert adapter._is_mtp_key("model.norm.weight") is False From a21e01422236dfd6874e777923df65a31c4ea87b Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 27 May 2026 11:25:55 +0800 Subject: [PATCH 02/20] feat(models): add Hy-MT2 config-shape dispatcher and fp32 lm_head fix Two refinements on top of a7d91c39: 1. 
**Config-shape dispatcher** in ``_transformers/model_init.py``: when ``architectures: ["HYV3ForCausalLM"]`` is paired with the Hy-MT2-30B-A3B config fingerprint (hidden=2048, 48 layers, 128 experts, expert_hidden_dim=768, ``enable_lm_head_fp32`` present), resolve to ``HyMT2ForCausalLM`` instead of the default ``HYV3ForCausalLM``. Hy3-preview (hidden=4096, 80 layers, 192 experts) still resolves to ``HYV3ForCausalLM``. Two tests in ``test_model_init.py`` lock this dispatch in. 2. **lm_head fp32 dtype fix** in ``HyMT2ForCausalLM.forward``: when ``enable_lm_head_fp32`` is on, the upcast path was calling ``self.lm_head(hidden.float())`` which would fail because ``lm_head.weight`` stays in bf16 after ``cast_model_to_dtype``. Replace with an explicit ``F.linear(hidden.float(), self.lm_head.weight.float(), bias.float() | None)`` so both operands are fp32; the result is cast back to the original dtype. The example YAML now uses the fully-qualified ``HyMT2ForCausalLM`` target; combined with (1) it can also be loaded via ``NeMoAutoModelForCausalLM``, which gives users both an explicit and an auto-dispatch path. Signed-off-by: khazic --- .../hy_mt2/hy_mt2_30b_a3b_sft.yaml | 14 ++++---- nemo_automodel/_transformers/model_init.py | 18 ++++++++++ .../components/models/hy_mt2/model.py | 8 ++++- .../_transformers/test_model_init.py | 36 +++++++++++++++++++ 4 files changed, 69 insertions(+), 7 deletions(-) diff --git a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml index 9f2b71d8d9..cb26198ed6 100644 --- a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml +++ b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml @@ -26,12 +26,14 @@ # EP size must divide num_experts (128). ep_size=8 -> 16 experts per rank. # Other valid EP sizes: 1, 2, 4, 16, 32, 64, 128. # -# Note: the on-disk checkpoint declares ``architectures: ["HYV3ForCausalLM"]``, -# so NeMoAutoModel's registry would dispatch to ``components/models/hy_v3``. 
-# This recipe deliberately bypasses the registry and instantiates the new -# ``HyMT2ForCausalLM`` directly via ``_target_``, isolating Hy-MT2 logic -# (sigmoid-routing flag, ``enable_lm_head_fp32``, expert_hidden_dim) from -# the existing Hy3-preview support. +# Note: the on-disk checkpoint declares ``architectures: ["HYV3ForCausalLM"]`` +# and ``model_type: "hy_v3"``, which NeMoAutoModel's string-keyed registry maps +# to ``components/models/hy_v3`` (Hy3-preview). This recipe deliberately +# bypasses the registry by instantiating ``HyMT2ForCausalLM`` via a +# fully-qualified ``_target_`` -- this keeps the Hy-MT2-specific logic +# (``moe_router_use_sigmoid`` dispatch, ``enable_lm_head_fp32`` in-forward +# upcast, ``expert_hidden_dim`` preference) isolated from the existing +# Hy3-preview support without registry surgery. recipe: TrainFinetuneRecipeForNextTokenPrediction diff --git a/nemo_automodel/_transformers/model_init.py b/nemo_automodel/_transformers/model_init.py index a351920582..de12748498 100644 --- a/nemo_automodel/_transformers/model_init.py +++ b/nemo_automodel/_transformers/model_init.py @@ -214,6 +214,19 @@ def _is_config_compatible_with_custom_model(arch_name: str, config) -> bool: return True +def _is_hy_mt2_config(config) -> bool: + """Return whether a ``hy_v3`` config describes Tencent Hy-MT2-30B-A3B.""" + return ( + getattr(config, "model_type", None) == "hy_v3" + and getattr(config, "hidden_size", None) == 2048 + and getattr(config, "num_hidden_layers", None) == 48 + and getattr(config, "num_experts", None) == 128 + and getattr(config, "expert_hidden_dim", None) == 768 + and getattr(config, "moe_intermediate_size", None) == 768 + and hasattr(config, "enable_lm_head_fp32") + ) + + def _resolve_custom_model_cls_for_config(config): """Resolve the custom model class for *config*, if the config is compatible.""" architectures = get_architectures(config) @@ -221,6 +234,11 @@ def _resolve_custom_model_cls_for_config(config): return None arch_name = 
architectures[0] + if arch_name == "HYV3ForCausalLM" and _is_hy_mt2_config(config): + from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM + + return HyMT2ForCausalLM + if not ModelRegistry.has_custom_model(arch_name): return None diff --git a/nemo_automodel/components/models/hy_mt2/model.py b/nemo_automodel/components/models/hy_mt2/model.py index db2d85db8e..8af9a979ef 100644 --- a/nemo_automodel/components/models/hy_mt2/model.py +++ b/nemo_automodel/components/models/hy_mt2/model.py @@ -40,6 +40,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F from nemo_automodel.components.models.common import ( BackendConfig, @@ -373,7 +374,12 @@ def forward( # not surprised by an fp32 tensor. Matches the HF reference's # ``enable_lm_head_fp32`` behavior. original_dtype = hidden.dtype - logits = self.lm_head(hidden.float()).to(original_dtype) + lm_head_bias = self.lm_head.bias if getattr(self.lm_head, "bias", None) is not None else None + logits = F.linear( + hidden.float(), + self.lm_head.weight.float(), + lm_head_bias.float() if lm_head_bias is not None else None, + ).to(original_dtype) else: logits = self.lm_head(hidden) diff --git a/tests/unit_tests/_transformers/test_model_init.py b/tests/unit_tests/_transformers/test_model_init.py index 5318719ad5..4ece5371d1 100644 --- a/tests/unit_tests/_transformers/test_model_init.py +++ b/tests/unit_tests/_transformers/test_model_init.py @@ -15,6 +15,7 @@ """Tests for nested config override handling in get_hf_config and _consume_config_overrides.""" import os +from types import SimpleNamespace from unittest.mock import MagicMock, patch import pytest @@ -27,6 +28,7 @@ _init_model, _load_config_with_layer_types_fix, _propagate_torch_dtype_to_subconfigs, + _resolve_custom_model_cls_for_config, _resolve_model_dir, _setup_bnb_loading_kwargs, _stream_load_bnb_weights, @@ -37,6 +39,40 @@ from nemo_automodel.components.models.common.utils import BackendConfig +class TestHyMT2ModelResolution: + 
"""Hy-MT2 shares HYV3ForCausalLM metadata but needs its own implementation.""" + + def test_hy_mt2_config_resolves_to_hy_mt2_model(self): + from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM + + config = SimpleNamespace( + architectures=["HYV3ForCausalLM"], + model_type="hy_v3", + hidden_size=2048, + num_hidden_layers=48, + num_experts=128, + expert_hidden_dim=768, + moe_intermediate_size=768, + enable_lm_head_fp32=True, + ) + + assert _resolve_custom_model_cls_for_config(config) is HyMT2ForCausalLM + + def test_hy_v3_config_still_resolves_to_hy_v3_model(self): + from nemo_automodel.components.models.hy_v3.model import HYV3ForCausalLM + + config = SimpleNamespace( + architectures=["HYV3ForCausalLM"], + model_type="hy_v3", + hidden_size=4096, + num_hidden_layers=80, + num_experts=192, + moe_intermediate_size=1536, + ) + + assert _resolve_custom_model_cls_for_config(config) is HYV3ForCausalLM + + class TestConsumeConfigOverridesNestedDict: """Nested dict overrides should be deep-merged into sub-config objects.""" From 97492ff4c96885346f5ee773e625eaf4511b9324 Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 27 May 2026 11:35:02 +0800 Subject: [PATCH 03/20] docs(examples): fix Hy-MT2 launch command in YAML comment The header comment showed ``automodel finetune llm -c ...`` which is not the real CLI signature -- ``nemo_automodel/cli/app.py:76-81`` takes the YAML path as the first positional argument, so the previous form silently treated ``finetune`` as the config path and failed with FileNotFoundError on ``./finetune``. 
Update the comment to match the actual usage: automodel examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml --nproc-per-node 8 Signed-off-by: khazic --- examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml index cb26198ed6..663d36ef7a 100644 --- a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml +++ b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml @@ -20,8 +20,7 @@ # vocab=120832, 256K context, rope_theta=11158840, qk_norm. # # Hardware target: 8 GPUs (80 GB+ each) for full SFT with EP8 + FSDP2. -# automodel finetune llm -c examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml \ -# --nproc-per-node 8 +# automodel examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml --nproc-per-node 8 # # EP size must divide num_experts (128). ep_size=8 -> 16 experts per rank. # Other valid EP sizes: 1, 2, 4, 16, 32, 64, 128. From 3e8c3c0a46eb8d54fd0d884e21fc57f8750c024c Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 27 May 2026 12:04:49 +0800 Subject: [PATCH 04/20] fix(models): use nn.Linear (DTensor-aware) on Hy-MT2 lm_head fp32 path The in-model ``enable_lm_head_fp32`` path called ``F.linear`` directly with ``self.lm_head.weight.float()``. Under FSDP2 the lm_head weight is a DTensor, and ``F.linear`` does not handle DTensor redistribution -- the hidden state is a plain torch.Tensor, so the matmul crashes with:: RuntimeError: aten.mm.default got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! Drop the explicit ``F.linear`` and rely on ``self.lm_head(...)`` instead; ``nn.Linear.forward`` is DTensor-aware and will redistribute the input as needed. To avoid the original dtype-mismatch motivation for the manual upcast (fp32 input vs.
bf16 weight), only upcast when ``lm_head.weight`` has already been promoted to fp32 -- which is exactly what the YAML's ``distributed.moe.lm_head_precision: float32`` path does via the MoE parallelizer's ``MixedPrecisionPolicy``. If the weight is still in the model dtype, fall through to the standard ``self.lm_head(hidden)`` path. Also drop the now-unused ``torch.nn.functional`` import and update the unit tests to validate the new condition (weight promoted -> upcast runs; weight not promoted -> fall through). Signed-off-by: khazic --- .../components/models/hy_mt2/model.py | 22 +++++++-------- .../models/hy_mt2/test_hy_mt2_model.py | 27 ++++++++++++++++--- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/nemo_automodel/components/models/hy_mt2/model.py b/nemo_automodel/components/models/hy_mt2/model.py index 8af9a979ef..72b7f015f8 100644 --- a/nemo_automodel/components/models/hy_mt2/model.py +++ b/nemo_automodel/components/models/hy_mt2/model.py @@ -40,7 +40,6 @@ import torch import torch.nn as nn -import torch.nn.functional as F from nemo_automodel.components.models.common import ( BackendConfig, @@ -368,18 +367,17 @@ def forward( if self.lm_head is None: logits = hidden - elif self._enable_lm_head_fp32 and hidden.dtype != torch.float32: - # Upcast input to fp32 for the lm_head matmul, then cast logits - # back to the model dtype so downstream loss / sampling code is - # not surprised by an fp32 tensor. Matches the HF reference's - # ``enable_lm_head_fp32`` behavior. + elif self._enable_lm_head_fp32 and self.lm_head.weight.dtype == torch.float32 and hidden.dtype != torch.float32: + # The MoE parallelizer (``distributed.moe.lm_head_precision: + # float32`` in the YAML) has already promoted ``lm_head.weight`` to + # fp32. Feed it fp32 input via ``nn.Linear`` -- which is + # DTensor-aware under FSDP2 -- and cast logits back to the input + # dtype. 
We must NOT use ``F.linear`` directly with a manually + # ``.float()``-ed weight here, because that bypasses nn.Linear's + # DTensor redistribution and crashes with + # "got mixed torch.Tensor and DTensor". original_dtype = hidden.dtype - lm_head_bias = self.lm_head.bias if getattr(self.lm_head, "bias", None) is not None else None - logits = F.linear( - hidden.float(), - self.lm_head.weight.float(), - lm_head_bias.float() if lm_head_bias is not None else None, - ).to(original_dtype) + logits = self.lm_head(hidden.float()).to(original_dtype) else: logits = self.lm_head(hidden) diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py index 19e0a7975b..20705d6d3c 100644 --- a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py +++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py @@ -250,14 +250,35 @@ class _Cfg: model = HyMT2ForCausalLM(_Cfg(), backend=backend_config) assert model._enable_lm_head_fp32 is False - def test_lm_head_fp32_casts_back_to_input_dtype(self, config, backend_config, device): + def test_lm_head_fp32_upcast_when_weight_promoted(self, config, backend_config, device): + """When the parallelizer has promoted lm_head.weight to fp32 (the + ``distributed.moe.lm_head_precision: float32`` path), the in-model + fallback feeds the bf16 hidden state up to fp32, runs lm_head, and + casts logits back to bf16.""" model = HyMT2ForCausalLM(config, backend=backend_config).to(device).to(torch.bfloat16) - # Mock the inner backbone so we can control the dtype of its output. + # Simulate the parallelizer's promotion of lm_head.weight to fp32. + model.lm_head = model.lm_head.to(torch.float32) + bf16_hidden = torch.randn(1, 4, HIDDEN, device=device, dtype=torch.bfloat16) + with patch.object(model.model, "forward", return_value=bf16_hidden): + input_ids = torch.randint(0, config.vocab_size, (1, 4), device=device) + logits = model(input_ids) + # Output dtype must be the input dtype (bf16), not fp32. 
+ assert logits.dtype == torch.bfloat16 + + def test_lm_head_no_upcast_when_weight_is_bf16(self, config, backend_config, device): + """If the parallelizer did NOT promote lm_head.weight, the model must + fall through to ``self.lm_head(hidden)`` without trying to upcast, + to avoid the dtype mismatch (fp32 input vs bf16 weight) and the + ``F.linear`` DTensor mixing crash that the prior implementation hit.""" + model = HyMT2ForCausalLM(config, backend=backend_config).to(device).to(torch.bfloat16) + # lm_head.weight is bf16 (no promotion). Even though enable_lm_head_fp32 + # is True on the config, the in-model path must NOT activate. + assert model.lm_head.weight.dtype == torch.bfloat16 + assert model._enable_lm_head_fp32 is True bf16_hidden = torch.randn(1, 4, HIDDEN, device=device, dtype=torch.bfloat16) with patch.object(model.model, "forward", return_value=bf16_hidden): input_ids = torch.randint(0, config.vocab_size, (1, 4), device=device) logits = model(input_ids) - # Output logits dtype must match the input hidden dtype, not fp32. assert logits.dtype == torch.bfloat16 def test_lm_head_no_upcast_when_disabled(self, config, backend_config, device): From c9945eb16c298359cb8dd2b0a8dc9eb9526a5e38 Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 27 May 2026 12:20:40 +0800 Subject: [PATCH 05/20] fix(examples): route Hy-MT2 example via NeMoAutoModel for weight loading The fully-qualified ``_target_: HyMT2ForCausalLM.from_pretrained`` path bypasses ``_transformers/model_init.py``, which is where the HF safetensors loader actually runs. Our class method only invokes ``AutoConfig.from_pretrained`` and ``cls.from_config(...)`` -- the returned model has the right architecture but random weights, so SFT starts at ``loss ~= ln(vocab) = 11.7`` instead of the loaded pre-trained weights. Switch the YAML back to ``NeMoAutoModelForCausalLM.from_pretrained``. 
The config-shape dispatcher added in a21e0142 will still route this to ``HyMT2ForCausalLM`` (hidden=2048 + 48 layers + 128 experts + ``enable_lm_head_fp32``), and the standard NeMoAutoModel loader pipeline will then stream the safetensors through ``HyMT2StateDictAdapter`` into the FSDP2 / EP-sharded parameters. Signed-off-by: khazic --- .../llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml index 663d36ef7a..26ff83caa8 100644 --- a/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml +++ b/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml @@ -26,13 +26,14 @@ # Other valid EP sizes: 1, 2, 4, 16, 32, 64, 128. # # Note: the on-disk checkpoint declares ``architectures: ["HYV3ForCausalLM"]`` -# and ``model_type: "hy_v3"``, which NeMoAutoModel's string-keyed registry maps -# to ``components/models/hy_v3`` (Hy3-preview). This recipe deliberately -# bypasses the registry by instantiating ``HyMT2ForCausalLM`` via a -# fully-qualified ``_target_`` -- this keeps the Hy-MT2-specific logic -# (``moe_router_use_sigmoid`` dispatch, ``enable_lm_head_fp32`` in-forward -# upcast, ``expert_hidden_dim`` preference) isolated from the existing -# Hy3-preview support without registry surgery. +# and ``model_type: "hy_v3"``. NeMoAutoModel's model resolver +# (``_transformers/model_init.py``) detects the Hy-MT2-30B-A3B config +# fingerprint (hidden=2048, 48 layers, 128 experts, ``enable_lm_head_fp32``) +# and dispatches to ``HyMT2ForCausalLM`` instead of the default +# ``HYV3ForCausalLM``. Going through ``NeMoAutoModelForCausalLM`` here is +# important: it runs the full HF safetensors loader, while a fully-qualified +# ``_target_: HyMT2ForCausalLM.from_pretrained`` would only construct the +# architecture with random weights. 
recipe: TrainFinetuneRecipeForNextTokenPrediction @@ -54,7 +55,7 @@ rng: ranked: true model: - _target_: nemo_automodel.components.models.hy_mt2.model.HyMT2ForCausalLM.from_pretrained + _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained pretrained_model_name_or_path: tencent/Hy-MT2-30B-A3B torch_dtype: bfloat16 backend: From 4965a993b745e4ee1d8b8a76253827a6201fb8e2 Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 27 May 2026 16:09:07 +0800 Subject: [PATCH 06/20] fix(transformers): register HyMT2ForCausalLM in MODEL_ARCH_MAPPING The L0 unit test ``test_all_model_folders_registered_in_auto_map`` scans every ``components/models/*/model.py`` and asserts the architecture has a matching ``MODEL_ARCH_MAPPING`` entry. ``hy_mt2`` was missing because its dispatch is done via the config-shape detector in ``_transformers/model_init.py`` rather than the string-keyed registry, so the meta-test flagged it. Add the registry entry so the meta-test passes. HF checkpoints declare ``architectures: ["HYV3ForCausalLM"]``, so this new key will only be hit if a user explicitly writes ``architectures: ["HyMT2ForCausalLM"]`` in their config -- the standard config-shape dispatch path is unchanged.
Signed-off-by: khazic --- nemo_automodel/_transformers/registry.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nemo_automodel/_transformers/registry.py b/nemo_automodel/_transformers/registry.py index b1160ffe1b..16b25272ee 100644 --- a/nemo_automodel/_transformers/registry.py +++ b/nemo_automodel/_transformers/registry.py @@ -164,6 +164,10 @@ "HYV3ForCausalLM", ("nemo_automodel.components.models.hy_v3.model", "HYV3ForCausalLM"), ), + ( + "HyMT2ForCausalLM", + ("nemo_automodel.components.models.hy_mt2.model", "HyMT2ForCausalLM"), + ), ( "Qwen2ForCausalLM", ("nemo_automodel.components.models.qwen2.model", "Qwen2ForCausalLM"), From 1234d1b4c3e180ee643b5b1053ffacd3ab06006e Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 27 May 2026 18:27:08 +0800 Subject: [PATCH 07/20] docs(model-coverage): add Hy-MT2 model card Adds the Hy-MT2-30B-A3B model page under docs/model-coverage/llm/tencent/ (legacy Sphinx tree) and fern/versions/nightly/pages/model-coverage/llm/tencent/ (nightly Fern tree), plus the matching nightly.yml sidebar entry. Satisfies test_every_registered_arch_has_model_coverage_doc, which scans docs/model-coverage/*.md for every architecture registered in MODEL_ARCH_MAPPING and was failing on the new HyMT2ForCausalLM entry. v0.4 frozen tree and latest/v0.4 alias YAMLs are intentionally not touched - this is nightly drift, not a back-port. 
Signed-off-by: khazic --- docs/model-coverage/llm/tencent/hy-mt2.md | 63 +++++++++++++++++ fern/versions/nightly.yml | 2 + .../model-coverage/llm/tencent/hy-mt2.mdx | 67 +++++++++++++++++++ 3 files changed, 132 insertions(+) create mode 100644 docs/model-coverage/llm/tencent/hy-mt2.md create mode 100644 fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx diff --git a/docs/model-coverage/llm/tencent/hy-mt2.md b/docs/model-coverage/llm/tencent/hy-mt2.md new file mode 100644 index 0000000000..62794a8f00 --- /dev/null +++ b/docs/model-coverage/llm/tencent/hy-mt2.md @@ -0,0 +1,63 @@ +# Hy-MT2 (Hunyuan-MT2) + +[Hy-MT2-30B-A3B](https://huggingface.co/tencent/Hy-MT2-30B-A3B) is Tencent's translation Mixture-of-Experts language model with 30B total parameters and 3B activated per token. It features 48 transformer layers (layer 0 dense, layers 1–47 MoE), 128 routed experts plus 1 shared expert with top-8 sigmoid routing, Grouped Query Attention (32 Q / 4 KV heads), per-head QK RMSNorm, RoPE, and an in-forward fp32 upcast on the language-model head (`enable_lm_head_fp32`). It supports a 256K context window. + +:::{card} +| | | +|---|---| +| **Task** | Text Generation (MoE, translation) | +| **Architecture** | `HyMT2ForCausalLM` | +| **Parameters** | 30B total / 3B activated | +| **HF Org** | [tencent](https://huggingface.co/tencent) | +::: + +## Available Models + +- **Hy-MT2-30B-A3B**: 30B total, top-8 routed experts (out of 128) activated per token, plus 1 shared expert + +## Architectures + +- `HyMT2ForCausalLM` + +## Example HF Models + +| Model | HF ID | +|---|---| +| Hy-MT2-30B-A3B | [`tencent/Hy-MT2-30B-A3B`](https://huggingface.co/tencent/Hy-MT2-30B-A3B) | + +## Example Recipes + +| Recipe | Description | +|---|---| +| {download}`hy_mt2_30b_a3b_sft.yaml <../../../../examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml>` | SFT — Hy-MT2-30B-A3B with FSDP2 + EP8 + fp32 LM head | + +## Try with NeMo AutoModel + +**1. 
Install** ([NeMo AutoModel](../../../guides/installation.md)): + +```bash +pip install nemo-automodel +``` + +**2. Clone the repo** to get the example recipes: + +```bash +git clone https://github.com/NVIDIA-NeMo/Automodel.git +cd Automodel +``` + +**3. Run the recipe** from inside the repo: + +```bash +automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml +``` + +See the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md). + +## Fine-Tuning + +See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md). + +## Hugging Face Model Cards + +- [tencent/Hy-MT2-30B-A3B](https://huggingface.co/tencent/Hy-MT2-30B-A3B) diff --git a/fern/versions/nightly.yml b/fern/versions/nightly.yml index 2881fe7cd5..e89c2240d4 100644 --- a/fern/versions/nightly.yml +++ b/fern/versions/nightly.yml @@ -154,6 +154,8 @@ navigation: path: ./nightly/pages/model-coverage/llm/parasail-ai/gritlm.mdx - page: "Hy3-preview" path: ./nightly/pages/model-coverage/llm/tencent/hy3.mdx + - page: "Hy-MT2" + path: ./nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx - page: "MiMo-V2-Flash" path: ./nightly/pages/model-coverage/llm/xiaomimimo/mimo-v2-flash.mdx - page: "Ling 2.0" diff --git a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx new file mode 100644 index 0000000000..87020ebe55 --- /dev/null +++ b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx @@ -0,0 +1,67 @@ +--- +title: "Hy-MT2 (Hunyuan-MT2)" +description: "" +--- +[Hy-MT2-30B-A3B](https://huggingface.co/tencent/Hy-MT2-30B-A3B) is Tencent's translation Mixture-of-Experts language model with 30B total parameters and 3B activated per token. 
It features 48 transformer layers (layer 0 dense, layers 1–47 MoE), 128 routed experts plus 1 shared expert with top-8 sigmoid routing, Grouped Query Attention (32 Q / 4 KV heads), per-head QK RMSNorm, RoPE, and an in-forward fp32 upcast on the language-model head (`enable_lm_head_fp32`). It supports a 256K context window. + + + +| | | +|---|---| +| **Task** | Text Generation (MoE, translation) | +| **Architecture** | `HyMT2ForCausalLM` | +| **Parameters** | 30B total / 3B activated | +| **HF Org** | [tencent](https://huggingface.co/tencent) | + + + +## Available Models + +- **Hy-MT2-30B-A3B**: 30B total, top-8 routed experts (out of 128) activated per token, plus 1 shared expert + +## Architectures + +- `HyMT2ForCausalLM` + +## Example HF Models + +| Model | HF ID | +|---|---| +| Hy-MT2-30B-A3B | [`tencent/Hy-MT2-30B-A3B`](https://huggingface.co/tencent/Hy-MT2-30B-A3B) | + +## Example Recipes + +| Recipe | Description | +|---|---| +| [hy_mt2_30b_a3b_sft.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml) | SFT — Hy-MT2-30B-A3B with FSDP2 + EP8 + fp32 LM head | + +## Try with NeMo AutoModel + +**1. Install** ([NeMo AutoModel](/get-started/installation)): + +```bash +pip install nemo-automodel +``` + +**2. Clone the repo** to get the example recipes: + +```bash +git clone https://github.com/NVIDIA-NeMo/Automodel.git +cd Automodel +``` + +**3. Run the recipe** from inside the repo: + +```bash +automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml +``` + +See the [NeMo AutoModel Installation Guide](/get-started/installation) and [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft). + +## Fine-Tuning + +See the [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft) and the [Large MoE Fine-Tuning Guide](/recipes-e2e-examples/large-moe-fine-tuning). 
+ +## Hugging Face Model Cards + +- [tencent/Hy-MT2-30B-A3B](https://huggingface.co/tencent/Hy-MT2-30B-A3B) From 577231139d50789d2394f89e2b87484564524b66 Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 27 May 2026 18:55:34 +0800 Subject: [PATCH 08/20] docs(model-coverage): wire Hy-MT2 into the LLM toctree Sphinx --fail-on-warning treated 'document isn't included in any toctree' as an error after the previous commit added hy-mt2.md without registering it in docs/model-coverage/llm/index.md. Adds the row to the LLM coverage table and the entry to the hidden toctree, alongside the existing Hy3-preview entry. Signed-off-by: khazic --- docs/model-coverage/llm/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md index 2a81e3cbde..d43dd12bb1 100644 --- a/docs/model-coverage/llm/index.md +++ b/docs/model-coverage/llm/index.md @@ -73,6 +73,7 @@ NeMo AutoModel supports the [AutoModelForCausalLM](https://huggingface.co/transf | Stepfun AI | [Step-3.5](stepfun-ai/step-3-5.md) | `Step3p5ForCausalLM` | | Parasail AI | [GritLM](parasail-ai/gritlm.md) | `GritLM` | | Tencent | [Hy3-preview](tencent/hy3.md) | `HYV3ForCausalLM` | +| Tencent | [Hy-MT2](tencent/hy-mt2.md) | `HyMT2ForCausalLM` | | Xiaomi MiMo | [MiMo-V2-Flash](xiaomimimo/mimo-v2-flash.md) | `MiMoV2FlashForCausalLM` | | inclusionAI | [Ling 2.0](inclusionai/ling-2.md) | `BailingMoeV2ForCausalLM` | @@ -146,6 +147,7 @@ stabilityai/stablelm stepfun-ai/step-3-5 parasail-ai/gritlm tencent/hy3 +tencent/hy-mt2 xiaomimimo/mimo-v2-flash inclusionai/ling-2 ``` From 4e6e6e8f8328b52e73c400e9cfb9c6b1ee45e8b3 Mon Sep 17 00:00:00 2001 From: khazic Date: Wed, 27 May 2026 19:22:47 +0800 Subject: [PATCH 09/20] refactor(models): concentrate Hy-MT2 dispatch logic inside hy_mt2 module Moves the ``_is_hy_mt2_config`` fingerprint predicate from ``_transformers/model_init.py`` into a new ``components/models/hy_mt2/dispatch.py`` module, and migrates the matching 
dispatcher tests from ``tests/.../_transformers/test_model_init.py`` to ``tests/.../models/hy_mt2/test_dispatch.py``. The auto-resolver in model_init.py now keeps only a 4-line shim that imports ``is_hy_mt2_config`` from the model package when the architecture name matches HYV3ForCausalLM, so Hy-MT2-specific knowledge (which hidden_size, layer count, expert count, etc. identify the checkpoint) lives entirely inside ``components/models/hy_mt2/`` rather than leaking into shared code. No behavior change: same fingerprint fields, same dispatch outcome, the existing Hy3-preview path is untouched. Signed-off-by: khazic --- nemo_automodel/_transformers/model_init.py | 22 ++--- .../components/models/hy_mt2/dispatch.py | 37 ++++++++ .../_transformers/test_model_init.py | 36 -------- .../unit_tests/models/hy_mt2/test_dispatch.py | 84 +++++++++++++++++++ 4 files changed, 127 insertions(+), 52 deletions(-) create mode 100644 nemo_automodel/components/models/hy_mt2/dispatch.py create mode 100644 tests/unit_tests/models/hy_mt2/test_dispatch.py diff --git a/nemo_automodel/_transformers/model_init.py b/nemo_automodel/_transformers/model_init.py index de12748498..7280bceae7 100644 --- a/nemo_automodel/_transformers/model_init.py +++ b/nemo_automodel/_transformers/model_init.py @@ -214,19 +214,6 @@ def _is_config_compatible_with_custom_model(arch_name: str, config) -> bool: return True -def _is_hy_mt2_config(config) -> bool: - """Return whether a ``hy_v3`` config describes Tencent Hy-MT2-30B-A3B.""" - return ( - getattr(config, "model_type", None) == "hy_v3" - and getattr(config, "hidden_size", None) == 2048 - and getattr(config, "num_hidden_layers", None) == 48 - and getattr(config, "num_experts", None) == 128 - and getattr(config, "expert_hidden_dim", None) == 768 - and getattr(config, "moe_intermediate_size", None) == 768 - and hasattr(config, "enable_lm_head_fp32") - ) - - def _resolve_custom_model_cls_for_config(config): """Resolve the custom model class for *config*, if the 
config is compatible.""" architectures = get_architectures(config) @@ -234,10 +221,13 @@ def _resolve_custom_model_cls_for_config(config): return None arch_name = architectures[0] - if arch_name == "HYV3ForCausalLM" and _is_hy_mt2_config(config): - from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM + if arch_name == "HYV3ForCausalLM": + from nemo_automodel.components.models.hy_mt2.dispatch import is_hy_mt2_config + + if is_hy_mt2_config(config): + from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM - return HyMT2ForCausalLM + return HyMT2ForCausalLM if not ModelRegistry.has_custom_model(arch_name): return None diff --git a/nemo_automodel/components/models/hy_mt2/dispatch.py b/nemo_automodel/components/models/hy_mt2/dispatch.py new file mode 100644 index 0000000000..097fc316ab --- /dev/null +++ b/nemo_automodel/components/models/hy_mt2/dispatch.py @@ -0,0 +1,37 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Config-shape fingerprint that distinguishes Hy-MT2-30B-A3B from Hy3-preview. + +Tencent ships both checkpoints with ``architectures: ["HYV3ForCausalLM"]`` and +``model_type: "hy_v3"`` even though the two models differ substantially +(48 vs 80 layers, 128 vs 192 experts, hidden 2048 vs 4096, etc.). The +auto-resolver in ``_transformers/model_init.py`` looks up the fingerprint here +so all Hy-MT2-specific knowledge stays inside this module. 
+""" + +from typing import Any + + +def is_hy_mt2_config(config: Any) -> bool: + """Return whether *config* describes Tencent's Hy-MT2-30B-A3B checkpoint.""" + return ( + getattr(config, "model_type", None) == "hy_v3" + and getattr(config, "hidden_size", None) == 2048 + and getattr(config, "num_hidden_layers", None) == 48 + and getattr(config, "num_experts", None) == 128 + and getattr(config, "expert_hidden_dim", None) == 768 + and getattr(config, "moe_intermediate_size", None) == 768 + and hasattr(config, "enable_lm_head_fp32") + ) diff --git a/tests/unit_tests/_transformers/test_model_init.py b/tests/unit_tests/_transformers/test_model_init.py index 4ece5371d1..5318719ad5 100644 --- a/tests/unit_tests/_transformers/test_model_init.py +++ b/tests/unit_tests/_transformers/test_model_init.py @@ -15,7 +15,6 @@ """Tests for nested config override handling in get_hf_config and _consume_config_overrides.""" import os -from types import SimpleNamespace from unittest.mock import MagicMock, patch import pytest @@ -28,7 +27,6 @@ _init_model, _load_config_with_layer_types_fix, _propagate_torch_dtype_to_subconfigs, - _resolve_custom_model_cls_for_config, _resolve_model_dir, _setup_bnb_loading_kwargs, _stream_load_bnb_weights, @@ -39,40 +37,6 @@ from nemo_automodel.components.models.common.utils import BackendConfig -class TestHyMT2ModelResolution: - """Hy-MT2 shares HYV3ForCausalLM metadata but needs its own implementation.""" - - def test_hy_mt2_config_resolves_to_hy_mt2_model(self): - from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM - - config = SimpleNamespace( - architectures=["HYV3ForCausalLM"], - model_type="hy_v3", - hidden_size=2048, - num_hidden_layers=48, - num_experts=128, - expert_hidden_dim=768, - moe_intermediate_size=768, - enable_lm_head_fp32=True, - ) - - assert _resolve_custom_model_cls_for_config(config) is HyMT2ForCausalLM - - def test_hy_v3_config_still_resolves_to_hy_v3_model(self): - from 
nemo_automodel.components.models.hy_v3.model import HYV3ForCausalLM - - config = SimpleNamespace( - architectures=["HYV3ForCausalLM"], - model_type="hy_v3", - hidden_size=4096, - num_hidden_layers=80, - num_experts=192, - moe_intermediate_size=1536, - ) - - assert _resolve_custom_model_cls_for_config(config) is HYV3ForCausalLM - - class TestConsumeConfigOverridesNestedDict: """Nested dict overrides should be deep-merged into sub-config objects.""" diff --git a/tests/unit_tests/models/hy_mt2/test_dispatch.py b/tests/unit_tests/models/hy_mt2/test_dispatch.py new file mode 100644 index 0000000000..00f93aa5fd --- /dev/null +++ b/tests/unit_tests/models/hy_mt2/test_dispatch.py @@ -0,0 +1,84 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for the Hy-MT2 config-shape fingerprint and its routing through the +shared ``_resolve_custom_model_cls_for_config`` entry point.""" + +from types import SimpleNamespace + +from nemo_automodel._transformers.model_init import _resolve_custom_model_cls_for_config +from nemo_automodel.components.models.hy_mt2.dispatch import is_hy_mt2_config + + +def _hy_mt2_config() -> SimpleNamespace: + return SimpleNamespace( + architectures=["HYV3ForCausalLM"], + model_type="hy_v3", + hidden_size=2048, + num_hidden_layers=48, + num_experts=128, + expert_hidden_dim=768, + moe_intermediate_size=768, + enable_lm_head_fp32=True, + ) + + +def _hy3_preview_config() -> SimpleNamespace: + return SimpleNamespace( + architectures=["HYV3ForCausalLM"], + model_type="hy_v3", + hidden_size=4096, + num_hidden_layers=80, + num_experts=192, + moe_intermediate_size=1536, + ) + + +class TestIsHyMT2Config: + """Direct tests of the fingerprint predicate.""" + + def test_hy_mt2_fingerprint_matches(self): + assert is_hy_mt2_config(_hy_mt2_config()) + + def test_hy3_preview_fingerprint_does_not_match(self): + assert not is_hy_mt2_config(_hy3_preview_config()) + + def test_missing_enable_lm_head_fp32_does_not_match(self): + config = _hy_mt2_config() + del config.enable_lm_head_fp32 + assert not is_hy_mt2_config(config) + + def test_wrong_hidden_size_does_not_match(self): + config = _hy_mt2_config() + config.hidden_size = 4096 + assert not is_hy_mt2_config(config) + + def test_non_hy_v3_model_type_does_not_match(self): + config = _hy_mt2_config() + config.model_type = "llama" + assert not is_hy_mt2_config(config) + + +class TestHyMT2ModelResolution: + """Hy-MT2 shares ``HYV3ForCausalLM`` metadata but needs its own implementation.""" + + def test_hy_mt2_config_resolves_to_hy_mt2_model(self): + from nemo_automodel.components.models.hy_mt2.model import HyMT2ForCausalLM + + assert _resolve_custom_model_cls_for_config(_hy_mt2_config()) is HyMT2ForCausalLM + + def 
test_hy_v3_config_still_resolves_to_hy_v3_model(self): + from nemo_automodel.components.models.hy_v3.model import HYV3ForCausalLM + + assert _resolve_custom_model_cls_for_config(_hy3_preview_config()) is HYV3ForCausalLM From ef924493ca4e4a75c73ac70f695d5bdc33e4629a Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:16:37 -0700 Subject: [PATCH 10/20] Update fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- .../nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx index 87020ebe55..248170aeec 100644 --- a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx +++ b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx @@ -56,7 +56,7 @@ cd Automodel automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml ``` -See the [NeMo AutoModel Installation Guide](/get-started/installation) and [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft). +Refer to the [NeMo AutoModel Installation Guide](/get-started/installation) and [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft). 
## Fine-Tuning From ef897a5bafe2d2442a91fae36a8748949235f6a5 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:17:00 -0700 Subject: [PATCH 11/20] Update fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- .../nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx index 248170aeec..d4325accb8 100644 --- a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx +++ b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx @@ -58,7 +58,7 @@ automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yam Refer to the [NeMo AutoModel Installation Guide](/get-started/installation) and [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft). -## Fine-Tuning +## Fine-Tune the Model See the [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft) and the [Large MoE Fine-Tuning Guide](/recipes-e2e-examples/large-moe-fine-tuning). 
From 4b2e09083eb13ca6b3ef87075a7cec7313cc4404 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:17:26 -0700 Subject: [PATCH 12/20] Update fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- .../nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx index d4325accb8..d4479b75f0 100644 --- a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx +++ b/fern/versions/nightly/pages/model-coverage/llm/tencent/hy-mt2.mdx @@ -60,7 +60,7 @@ Refer to the [NeMo AutoModel Installation Guide](/get-started/installation) and ## Fine-Tune the Model -See the [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft) and the [Large MoE Fine-Tuning Guide](/recipes-e2e-examples/large-moe-fine-tuning). +Refer to the [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft) and the [Large MoE Fine-Tuning Guide](/recipes-e2e-examples/large-moe-fine-tuning). 
## Hugging Face Model Cards From f7576b10219e4dc9d496eea35510b95f23c1c0fe Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:17:52 -0700 Subject: [PATCH 13/20] Update docs/model-coverage/llm/index.md Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- docs/model-coverage/llm/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md index d43dd12bb1..6d1a6aa7c7 100644 --- a/docs/model-coverage/llm/index.md +++ b/docs/model-coverage/llm/index.md @@ -11,7 +11,7 @@ To run LLMs with NeMo AutoModel, make sure you're using NeMo container version [ pip3 install --upgrade git+git@github.com:NVIDIA-NeMo/AutoModel.git ``` -For other installation options (e.g., uv), see the [NeMo AutoModel Installation Guide](../../guides/installation.md). +For other installation options (for example, uv), refer to the [NeMo AutoModel Installation Guide](../../guides/installation.md). ## Supported Models From 8990d20456cb25834c48a5cedaf92951c540a336 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:18:09 -0700 Subject: [PATCH 14/20] Update docs/model-coverage/llm/index.md Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- docs/model-coverage/llm/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md index 6d1a6aa7c7..16a0108a85 100644 --- a/docs/model-coverage/llm/index.md +++ b/docs/model-coverage/llm/index.md @@ -87,7 +87,7 @@ The models listed above can be fine-tuned using NeMo AutoModel. We support two p See the [Fine-Tuning Guide](../../guides/llm/finetune.md) to learn how to apply both methods to your data. :::{tip} -In these guides, we use the `SQuAD v1.1` dataset for demonstration purposes, but you can use your own data. Update the recipe YAML `dataset` / `validation_dataset` sections accordingly. 
See [LLM datasets](../../guides/llm/dataset.md) and [dataset overview](../../guides/dataset-overview.md). +In these guides, the examples use the `SQuAD v1.1` dataset for demonstration purposes, but you can use your own data. Update the recipe YAML `dataset` / `validation_dataset` sections accordingly. Refer to [LLM datasets](../../guides/llm/dataset.md) and [dataset overview](../../guides/dataset-overview.md). ::: ```{toctree} From c1a29dc5b3720b8de7f3c13af7dc02fe6a319726 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:18:31 -0700 Subject: [PATCH 15/20] Update docs/model-coverage/llm/index.md Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- docs/model-coverage/llm/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md index 16a0108a85..6933fbd5c9 100644 --- a/docs/model-coverage/llm/index.md +++ b/docs/model-coverage/llm/index.md @@ -77,7 +77,7 @@ NeMo AutoModel supports the [AutoModelForCausalLM](https://huggingface.co/transf | Xiaomi MiMo | [MiMo-V2-Flash](xiaomimimo/mimo-v2-flash.md) | `MiMoV2FlashForCausalLM` | | inclusionAI | [Ling 2.0](inclusionai/ling-2.md) | `BailingMoeV2ForCausalLM` | -## Fine-Tuning LLMs with NeMo AutoModel +## Fine-Tune LLMs with NeMo AutoModel The models listed above can be fine-tuned using NeMo AutoModel. 
We support two primary fine-tuning approaches: From 521fe22ab8e25680e99d258acb420e32181284d7 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:18:42 -0700 Subject: [PATCH 16/20] Update docs/model-coverage/llm/index.md Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- docs/model-coverage/llm/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md index 6933fbd5c9..d499ecb08c 100644 --- a/docs/model-coverage/llm/index.md +++ b/docs/model-coverage/llm/index.md @@ -84,7 +84,7 @@ The models listed above can be fine-tuned using NeMo AutoModel. We support two p 1. **Parameter-Efficient Fine-Tuning (PEFT)**: Updates only a small subset of parameters (typically <1%) using techniques like Low-Rank Adaptation (LoRA). 2. **Supervised Fine-Tuning (SFT)**: Updates all or most model parameters for deeper adaptation. -See the [Fine-Tuning Guide](../../guides/llm/finetune.md) to learn how to apply both methods to your data. +Refer to the [Fine-Tuning Guide](../../guides/llm/finetune.md) to learn how to apply both methods to your data. :::{tip} In these guides, the examples use the `SQuAD v1.1` dataset for demonstration purposes, but you can use your own data. Update the recipe YAML `dataset` / `validation_dataset` sections accordingly. Refer to [LLM datasets](../../guides/llm/dataset.md) and [dataset overview](../../guides/dataset-overview.md). 
From 83b053b7824d189ab176f657446e0f7a5e695390 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:18:55 -0700 Subject: [PATCH 17/20] Update docs/model-coverage/llm/tencent/hy-mt2.md Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- docs/model-coverage/llm/tencent/hy-mt2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/tencent/hy-mt2.md b/docs/model-coverage/llm/tencent/hy-mt2.md index 62794a8f00..a82a2bba5b 100644 --- a/docs/model-coverage/llm/tencent/hy-mt2.md +++ b/docs/model-coverage/llm/tencent/hy-mt2.md @@ -52,7 +52,7 @@ cd Automodel automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yaml ``` -See the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md). +Refer to the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md). ## Fine-Tuning From bb41ada4b90bdd633edbd075db98c6eed9c29f1e Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:19:08 -0700 Subject: [PATCH 18/20] Update docs/model-coverage/llm/tencent/hy-mt2.md Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- docs/model-coverage/llm/tencent/hy-mt2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/tencent/hy-mt2.md b/docs/model-coverage/llm/tencent/hy-mt2.md index a82a2bba5b..e91177456e 100644 --- a/docs/model-coverage/llm/tencent/hy-mt2.md +++ b/docs/model-coverage/llm/tencent/hy-mt2.md @@ -54,7 +54,7 @@ automodel --nproc-per-node=8 examples/llm_finetune/hy_mt2/hy_mt2_30b_a3b_sft.yam Refer to the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md). 
-## Fine-Tuning +## Fine-Tune the Model See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md). From 460dab9e7cac9e783c456dfdfb4b9250603ffab3 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 27 May 2026 13:19:22 -0700 Subject: [PATCH 19/20] Update docs/model-coverage/llm/tencent/hy-mt2.md Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Huiying --- docs/model-coverage/llm/tencent/hy-mt2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/model-coverage/llm/tencent/hy-mt2.md b/docs/model-coverage/llm/tencent/hy-mt2.md index e91177456e..b7db2ad92f 100644 --- a/docs/model-coverage/llm/tencent/hy-mt2.md +++ b/docs/model-coverage/llm/tencent/hy-mt2.md @@ -56,7 +56,7 @@ Refer to the [NeMo AutoModel Installation Guide](../../../guides/installation.md ## Fine-Tune the Model -See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md). +Refer to the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md). ## Hugging Face Model Cards From 7b9e32c2325976ef215ebcc8bf59a3f5e98a7fff Mon Sep 17 00:00:00 2001 From: khazic Date: Thu, 28 May 2026 14:11:10 +0800 Subject: [PATCH 20/20] fix(tests): declare rope_parameters on minimal _Cfg mock in hy_mt2 test `test_enable_lm_head_fp32_default_false_without_config_flag` constructs `HyMT2ForCausalLM(_Cfg(), ...)` with a bare mock class to verify that the flag defaults to ``False`` when the config does not declare it. The bare mock skips ``PretrainedConfig.__init__``, which is what normally synthesizes ``rope_parameters`` from ``rope_theta``. As a result, ``get_rope_config`` (called during model construction) raised ``AttributeError: '_Cfg' object has no attribute 'rope_parameters'`` on GPU CI. 
Add the field to the mock with the same shape ``PretrainedConfig`` would produce. The CPU test suite cannot trigger this (the whole ``TestHyMT2ForCausalLM`` class is CUDA-gated), so the regression was only visible on the L0_Unit_Tests_GPU job. Signed-off-by: khazic --- tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py index 20705d6d3c..04f18c59ad 100644 --- a/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py +++ b/tests/unit_tests/models/hy_mt2/test_hy_mt2_model.py @@ -238,6 +238,10 @@ class _Cfg: first_k_dense_replace = 1 max_position_embeddings = 128 rope_theta = 10000.0 + # PretrainedConfig populates ``rope_parameters`` from ``rope_theta`` + # in its ``__init__``; this bare mock skips that, so declare it + # explicitly to match what ``get_rope_config`` reads. + rope_parameters = {"rope_theta": 10000.0, "rope_type": "default"} rms_norm_eps = 1e-5 torch_dtype = "bfloat16" attention_bias = False