ChinChangYang · ChinChangYang · Jan 19, 2026 · Jan 19, 2026 · Jan 19, 2026
diff --git a/src/builder/MILBuilder.cpp b/src/builder/MILBuilder.cpp
@@ -1981,8 +1981,8 @@ void MILBuilder::buildPolicyHead(CoreML::Specification::MILSpec::Block* block,
     std::string p1_activated = genVarName("policy_p1_act");
     addBatchNormActivationOps(block, p1_biased, ph.p1_bn, ph.p1_activation, mask, p1_activated);
 
-    // P2 conv -> policy output (match Python name)
-    // Use _fp16 suffix only for mixed precision (FP16 compute with FP32 I/O)
+    // P2 conv -> policy output
+    // Mixed precision uses _fp16 suffix for this intermediate op; cast ops later rename to base name
     policy_out = (m_use_fp16 && !m_use_fp16_io) ? "policy_p2_conv_fp16" : "policy_p2_conv";
     addConvOp(block, p1_activated, ph.p2_conv, policy_out);
 
@@ -2006,11 +2006,13 @@ void MILBuilder::buildPolicyHead(CoreML::Specification::MILSpec::Block* block,
             pass_activated = pass_biased;
         }
 
-        pass_out = (m_use_fp16 && !m_use_fp16_io) ? "policy_pass_fp16" : "policy_pass";  // Match Python naming
+        // Mixed precision: _fp16 intermediate, cast ops rename to base name
+        pass_out = (m_use_fp16 && !m_use_fp16_io) ? "policy_pass_fp16" : "policy_pass";
         addMatMulOp(block, pass_activated, *ph.gpool_to_pass_mul2, pass_out);
     } else {
         // Pre-v15: single layer pass
-        pass_out = (m_use_fp16 && !m_use_fp16_io) ? "policy_pass_fp16" : "policy_pass";  // pre-v15 name
+        // Mixed precision: _fp16 intermediate, cast ops rename to base name (pre-v15)
+        pass_out = (m_use_fp16 && !m_use_fp16_io) ? "policy_pass_fp16" : "policy_pass";
         addMatMulOp(block, g1_pooled, ph.gpool_to_pass_mul, pass_out);
     }
 }
@@ -2051,15 +2053,18 @@ void MILBuilder::buildValueHead(CoreML::Specification::MILSpec::Block* block,
         v2 = v2_bias;
     }
 
-    // V3: linear -> value output (fused matmul+bias -> linear) (match Python name)
+    // V3: linear -> value output (fused matmul+bias -> linear)
+    // Mixed precision: _fp16 intermediate, cast ops rename to base name
     value_out = (m_use_fp16 && !m_use_fp16_io) ? "value_v3_bias_fp16" : "value_v3_bias";
     addLinearOp(block, v2, vh.v3_mul, vh.v3_bias, value_out);
 
-    // SV3: linear -> score value output (fused matmul+bias -> linear) (match Python name)
+    // SV3: linear -> score value output (fused matmul+bias -> linear)
+    // Mixed precision: _fp16 intermediate, cast ops rename to base name
     score_value_out = (m_use_fp16 && !m_use_fp16_io) ? "value_sv3_bias_fp16" : "value_sv3_bias";
     addLinearOp(block, v2, vh.sv3_mul, vh.sv3_bias, score_value_out);
 
-    // Ownership conv (match Python name)
+    // Ownership conv
+    // Mixed precision: _fp16 intermediate, cast ops rename to base name
     ownership_out = (m_use_fp16 && !m_use_fp16_io) ? "value_ownership_conv_fp16" : "value_ownership_conv";
     addConvOp(block, v1, vh.v_ownership_conv, ownership_out);
 }

diff --git a/tests/test_cpp_vs_python.py b/tests/test_cpp_vs_python.py
@@ -1004,3 +1004,135 @@ def test_fp16_io_with_dynamic_batch(
             for name, value in outputs.items():
                 assert not np.isnan(value).any(), f"Output '{name}' contains NaN values"
                 assert not np.isinf(value).any(), f"Output '{name}' contains Inf values"
+
+
+class TestCppMixedVsPureFP16:
+    """Self-validation tests comparing mixed precision and pure FP16 I/O modes.
+
+    Since the Python coremltools KataGo converter does not support float16_io mode,
+    cross-validation against Python is not possible. Instead, these tests validate
+    that the pure FP16 I/O mode produces inference results equivalent to the
+    mixed precision mode (which has been validated against Python).
+
+    Both modes produce outputs with the same names (e.g., policy_p2_conv, value_v3_bias),
+    allowing direct comparison without name mapping.
+    """
+
+    @pytest.mark.parametrize(
+        "model_name",
+        [
+            "g170-b6c96-s175395328-d26788732.bin.gz",       # Smaller model
+            "b5c192nbt-distilled.bin.gz",                   # Distilled model (human SL with metadata)
+        ],
+    )
+    @pytest.mark.parametrize("board_size", [9, 19])
+    def test_mixed_vs_pure_fp16_inference(
+        self,
+        model_name: str,
+        board_size: int,
+        katago2coreml_exe: Path,
+        all_test_models: dict,
+        temp_output_dir: Path,
+    ):
+        """Test that pure FP16 I/O and mixed precision produce equivalent inference results.
+
+        This test validates that the pure FP16 I/O mode produces results equivalent to
+        the mixed precision mode (which has already been validated against Python).
+
+        Output names and shapes must match exactly between the two models; values
+        must agree within rtol/atol chosen for FP16 precision.
+
+        Args:
+            model_name: Name of the KataGo model file
+            board_size: Board size (9 or 19)
+            katago2coreml_exe: Path to C++ CLI tool (fixture)
+            all_test_models: Dict mapping model names to paths (fixture)
+            temp_output_dir: Temporary directory for outputs (fixture)
+        """
+        import platform
+        if platform.machine() != "arm64":
+            pytest.skip("Core ML inference only available on Apple Silicon")
+
+        import numpy as np
+
+        # Skip if model not available
+        if model_name not in all_test_models:
+            pytest.skip(f"Model not available: {model_name}")
+
+        model_path = all_test_models[model_name]
+
+        # Convert with mixed precision (float16=True, float16_io=False)
+        mixed_output = temp_output_dir / f"mixed_fp16_{board_size}x{board_size}.mlpackage"
+        convert_with_cpp(
+            katago2coreml_exe,
+            model_path,
+            mixed_output,
+            board_size,
+            optimize_mask=False,
+            float16=True,
+            float16_io=False,
+        )
+
+        # Convert with pure FP16 I/O (float16=True, float16_io=True)
+        pure_output = temp_output_dir / f"pure_fp16_io_{board_size}x{board_size}.mlpackage"
+        convert_with_cpp(
+            katago2coreml_exe,
+            model_path,
+            pure_output,
+            board_size,
+            optimize_mask=False,
+            float16=True,
+            float16_io=True,
+        )
+
+        # Load models
+        import coremltools as ct
+        mixed_model = ct.models.MLModel(str(mixed_output))
+        pure_model = ct.models.MLModel(str(pure_output))
+
+        # Check for meta_input
+        mixed_spec = mixed_model.get_spec()
+        has_meta_input = "meta_input" in [inp.name for inp in mixed_spec.description.input]
+
+        # Generate FP32 inputs (CoreML auto-converts for FP16 I/O model)
+        inputs = create_batched_inputs(1, board_size, has_meta_input)
+
+        # Run inference on both models
+        mixed_outputs = mixed_model.predict(inputs)
+        pure_outputs = pure_model.predict(inputs)
+
+        # Compare outputs using relative tolerance (rtol) and absolute tolerance (atol)
+        # FP16 has ~3 decimal digits of precision, so 1% relative tolerance is appropriate
+        rtol, atol = 1e-2, 1e-2
+        failed_outputs = []
+
+        # Both models must expose the same output names. Report names that exist
+        # only in the mixed model; the loop below reports the opposite direction.
+        for name in sorted(set(mixed_outputs) - set(pure_outputs)):
+            failed_outputs.append(f"Missing output key in pure FP16 model: {name}")
+
+        for name in pure_outputs.keys():
+            if name not in mixed_outputs:
+                failed_outputs.append(f"Missing output key in mixed model: {name}")
+                continue
+
+            pure_val = np.asarray(pure_outputs[name])
+            mixed_val = np.asarray(mixed_outputs[name])
+
+            # np.allclose broadcasts, so mismatched shapes could compare equal or raise.
+            if pure_val.shape != mixed_val.shape:
+                failed_outputs.append(
+                    f"Output '{name}' shape mismatch: pure={pure_val.shape}, "
+                    f"mixed={mixed_val.shape}"
+                )
+                continue
+
+            if not np.allclose(pure_val, mixed_val, rtol=rtol, atol=atol):
+                max_diff = np.max(np.abs(pure_val - mixed_val))
+                failed_outputs.append(
+                    f"Output '{name}' not close: max_diff={max_diff:.6f}, "
+                    f"rtol={rtol}, atol={atol}"
+                )
+
+        if failed_outputs:
+            pytest.fail("\n".join(failed_outputs))