diff --git a/.github/workflows/fern-docs-ci.yml b/.github/workflows/fern-docs-ci.yml
index ecc67c1187..89bb98d0c9 100644
--- a/.github/workflows/fern-docs-ci.yml
+++ b/.github/workflows/fern-docs-ci.yml
@@ -25,7 +25,9 @@ on:
     branches:
       - "pull-request/[0-9]+"
     paths:
-      - 'fern/**'
+      # MDX pages now live directly under docs/, outside docs/fern/, so watch
+      # the whole tree, as publish-fern-docs.yml does.
+      - 'docs/**'
       - '.github/workflows/fern-docs-ci.yml'
 
 permissions:
@@ -47,5 +49,5 @@ jobs:
         run: npm install -g fern-api
 
       - name: Validate Fern configuration
-        working-directory: ./fern
+        working-directory: ./docs/fern
         run: fern check
diff --git a/.github/workflows/fern-docs-preview-build.yml b/.github/workflows/fern-docs-preview-build.yml
index 5423f7cccc..e7ee714f9f 100644
--- a/.github/workflows/fern-docs-preview-build.yml
+++ b/.github/workflows/fern-docs-preview-build.yml
@@ -15,7 +15,7 @@
 
 # Workflow 1 of 2 for Fern doc previews.
 #
-# Collects the fern/ sources and PR metadata from the (possibly untrusted) PR
+# Collects the docs/ sources and PR metadata from the (possibly untrusted) PR
 # branch and uploads them as an artifact. No secrets are used here, so this is
 # safe to run on fork PRs via the regular pull_request trigger.
 #
@@ -27,7 +27,9 @@ name: "Preview Fern Docs: Build"
 on:
   pull_request:
     paths:
-      - 'fern/**'
+      # MDX pages now live directly under docs/, outside docs/fern/, so a
+      # page-only PR must still produce a preview.
+      - 'docs/**'
       - '.github/workflows/fern-docs-preview-build.yml'
 
 permissions:
@@ -58,6 +60,8 @@ jobs:
         with:
           name: fern-preview
+          # Ship all of docs/, not just docs/fern/: the MDX pages live beside the
+          # Fern config, and the preview job only sees this artifact.
           path: |
-            fern/
+            docs/
             preview-metadata/
           retention-days: 1
diff --git a/.github/workflows/fern-docs-preview-comment.yml b/.github/workflows/fern-docs-preview-comment.yml
index 411bb53940..1bc40f6cf4 100644
--- a/.github/workflows/fern-docs-preview-comment.yml
+++ b/.github/workflows/fern-docs-preview-comment.yml
@@ -16,7 +16,7 @@
 # Workflow 2 of 2 for Fern doc previews.
 #
 # Triggered by workflow_run after "Preview Fern Docs: Build" completes.
-# Downloads the fern/ artifact, builds a preview with DOCS_FERN_TOKEN, and
+# Downloads the fern-preview artifact, builds a preview with DOCS_FERN_TOKEN, and
 # posts a stable :herb: comment on the PR. This workflow never checks out the
 # PR branch directly, keeping secrets isolated from untrusted code.
 #
@@ -65,7 +65,7 @@ jobs:
         env:
           FERN_TOKEN: ${{ secrets.DOCS_FERN_TOKEN }}
           HEAD_REF: ${{ steps.metadata.outputs.head_ref }}
-        working-directory: ./fern
+        working-directory: ./docs/fern
         run: |
           OUTPUT=$(fern generate --docs --preview --id "$HEAD_REF" 2>&1)
           echo "$OUTPUT"
diff --git a/.github/workflows/publish-fern-docs.yml b/.github/workflows/publish-fern-docs.yml
index 4204e55e55..cfec75784e 100644
--- a/.github/workflows/publish-fern-docs.yml
+++ b/.github/workflows/publish-fern-docs.yml
@@ -29,7 +29,6 @@ on:
       - main
     paths:
       - 'docs/**'
-      - 'fern/**'
     tags:
       - 'docs/v*'
   workflow_dispatch: {}
@@ -59,5 +58,5 @@ jobs:
       - name: Publish Docs
         env:
           FERN_TOKEN: ${{ secrets.DOCS_FERN_TOKEN }}
-        working-directory: ./fern
+        working-directory: ./docs/fern
         run: fern generate --docs
diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml
index c4c70a872e..3fc0287468 100644
--- a/.github/workflows/release-freeze.yml
+++ b/.github/workflows/release-freeze.yml
@@ -49,74 +49,9 @@ jobs:
       SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
       PAT: ${{ secrets.PAT }}
 
-  bump-docs-versions:
-    needs: [code-freeze]
-    runs-on: ubuntu-latest
-    env:
-      VERSION: ${{ needs.code-freeze.outputs.release-branch }}
-      DRY_RUN: ${{ inputs.dry-run }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v6
-        with:
-          ref: main
-          token: ${{ secrets.PAT }}
-
-      - name: Configure git
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "github-actions[bot]@users.noreply.github.com"
-
-      - name: Update versions1.json
-        run: |
-          version_number=$(echo ${VERSION} | sed 's/^r//')
-
-          # Remove (latest) tag from previous latest entry, insert new version after nightly
-          jq --arg ver "$version_number" '
-            map(if .name then del(.name) else . end)
-            | [.[0]] + [{"name": ($ver + " (latest)"), "version": $ver, "url": "https://docs.nvidia.com/nemo/automodel/latest/"}] + .[1:]
-          ' docs/versions1.json > tmp.json && mv tmp.json docs/versions1.json
-
-      - name: Commit changes
-        id: commit-versions1
-        run: |
-          git add docs/versions1.json
-          git commit -m "[bot]: Update docs-versions after code-freeze for ${VERSION}"
-          commit_hash=$(git rev-parse HEAD)
-          echo "commit_hash=${commit_hash}" | tee -a $GITHUB_OUTPUT
-
-          if [[ "$DRY_RUN" != "true" ]]; then
-            git push
-          fi
-
-      - name: Switch to release branch
-        run: |
-          git fetch origin ${{ needs.code-freeze.outputs.release-branch }}
-          git checkout ${{ needs.code-freeze.outputs.release-branch }}
-
-      - name: Cherry-pick docs-versions commit
-        run: |
-          git cherry-pick ${{ steps.commit-versions1.outputs.commit_hash }}
-
-      - name: Update project.json
-        run: |
-          version_number=$(echo ${VERSION} | sed 's/^r//')
-
-          jq --arg ver "$version_number" \
-            '. = {"version": $ver, "name": "nemo-automodel"}' \
-            docs/project.json > tmp.json && mv tmp.json docs/project.json
-
-      - name: Update conf.py
-        run: |
-          version_number=$(echo ${VERSION} | sed 's/^r//')
-          sed -i "s/release = .*/release = \"${version_number}\"/" docs/conf.py
-
-      - name: Commit changes
-        run: |
-          git add docs/project.json
-          git add docs/conf.py
-          git commit -m "Bump docs version to ${VERSION}"
-
-          if [[ "$DRY_RUN" != "true" ]]; then
-            git push
-          fi
+  # NOTE: the `bump-docs-versions` job was removed. The Sphinx-era files it
+  # edited (docs/versions1.json, docs/project.json, docs/conf.py) no longer
+  # exist; Fern keeps version metadata under `versions:` in docs/fern/docs.yml
+  # and in docs/fern/versions/*.yml. The Fern equivalent is to add a version
+  # pin to those files and push a docs/v<X>.<Y>.0 tag, which triggers
+  # publish-fern-docs.yml; do this by hand at release cut, not every code freeze.
diff --git a/.gitignore b/.gitignore
index 298ad67da4..7cdf3b9db2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -186,6 +186,6 @@ skypilot_jobs/
 
 # Fern: generated library reference (regenerated by `fern docs md generate`
 # during preview / publish — not committed)
-fern/product-docs/
+docs/fern/product-docs/
 
 training_logs/
diff --git a/AGENTS.md b/AGENTS.md
index 210643377e..688dc7272e 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -218,7 +218,7 @@ file gives step-by-step instructions an AI agent can follow.
 | 7 | build-and-dependency     | `build-and-dependency`     | Container setup, uv package management, environment variables, CLI usage |
 | 8 | cicd                     | `cicd`                     | Commit/PR workflow, CI trigger mechanism, failure investigation |
 | 9 | testing                  | `testing`                  | Unit and functional test layout, tier semantics (L0/L1/L2), adding tests |
-| 10 | fern-docs               | `fern-docs`                | Maintain the Fern docs site under `fern/` — pages, slugs, redirects, version aliases, library reference |
+| 10 | fern-docs               | `fern-docs`                | Maintain the Fern docs site: MDX content under `docs/`, Fern infrastructure under `docs/fern/` — pages, slugs, redirects, version aliases, library reference |
 
 **Always read the relevant `SKILL.md` before starting any task it covers —
 skills are mandatory context, not optional background reading.**
diff --git a/docs/about/index.md b/docs/about/index.md
deleted file mode 100644
index 40dc5b5974..0000000000
--- a/docs/about/index.md
+++ /dev/null
@@ -1,101 +0,0 @@
----
-description: "Overview of NeMo AutoModel, a PyTorch DTensor-native SPMD library with optimized model implementations and a Hugging Face-compatible API for training, fine-tuning, and as an accelerated backend for other frameworks"
-categories: ["getting-started"]
-tags: ["overview", "spmd", "dtensor", "distributed", "getting-started"]
-personas: ["machine-learning-engineer", "researcher", "devops"]
-difficulty: "beginner"
-content_type: "concept"
----
-
-(about-overview)=
-
-# About NeMo AutoModel
-
-NeMo AutoModel is a PyTorch DTensor-native SPMD (Single Program, Multiple Data) open-source library under [NVIDIA NeMo Framework](https://github.com/NVIDIA-NeMo). It provides **optimized model implementations** with a **Hugging Face-compatible API**, so any model on the Hub works out of the box with no checkpoint conversion. On top of that, it ships ready-made **recipes** for training and fine-tuning LLMs and VLMs at scale.
-
-Because AutoModel exposes the same Autoclass interface as `transformers`, it can also be used as a **drop-in accelerated backend for other libraries** -- reinforcement learning frameworks, evaluation harnesses, or any codebase that loads Hugging Face models.
-
-## Target Users
-
-- **Machine learning engineers**: Fine-tune and pre-train LLMs and VLMs at scale with minimal boilerplate.
-- **Researchers**: Rapidly prototype with hackable, linear training scripts and YAML-driven configuration.
-- **Library and framework authors**: Use AutoModel's optimized model implementations as a drop-in replacement for `transformers` to accelerate RL, alignment, evaluation, or any downstream workflow.
-
-## How It Works
-
-NeMo AutoModel is built around two core ideas: **recipes** and **components**.
-
-- **Recipes** are executable Python scripts paired with YAML configs. Each recipe defines an end-to-end workflow -- model loading, data preparation, training loop, and checkpointing -- and can be launched with a single command.
-- **Components** are modular, self-contained building blocks (datasets, optimizers, loss functions, distribution strategies) that recipes compose together. Swap any component by changing a `_target_` field in your YAML.
-
-This design means the training loop is always visible and hackable -- no hidden abstractions. You configure parallelism, precision, and scaling through config, not code changes.
-
-### SPMD and DTensor
-
-NeMo AutoModel uses PyTorch's native SPMD (Single Program, Multiple Data) model with DTensor and DeviceMesh:
-
-- **One program, any scale**: The same training script runs on 1 GPU or 1000+ by changing the mesh configuration.
-- **Parallelism is configuration**: Mix tensor, sequence, pipeline, and data parallelism by editing placements -- no model rewrites.
-- **Decoupled concerns**: Model code stays pure PyTorch; the parallel strategy lives in config.
-
-### Key Technologies
-
-- **FSDP2 and MegatronFSDP**: Memory-efficient sharded data parallelism for large-scale training, including Hybrid Sharding (HSDP).
-- **Pipeline Parallelism**: Torch-native pipelining composable with FSDP2 and DTensor for 3D parallelism.
-- **Custom CUDA Kernels**: Fused attention, TransformerEngine, DeepEP, and FlexAttn for optimized throughput.
-- **FP8 Mixed Precision**: FP8 training via torchao for supported models.
-- **Distributed Checkpointing (DCP)**: Sharded SafeTensors checkpoints with merge and reshard utilities, interoperable with Hugging Face.
-
-## Hugging Face Integration
-
-NeMo AutoModel builds on top of `transformers` rather than replacing it:
-
-- Load any `AutoModelForCausalLM` or `AutoModelForImageTextToText` model directly from the Hub.
-- Use Hugging Face tokenizers, datasets, and chat templates as-is.
-- Checkpoints stay in the native Hugging Face format -- no conversion step before or after training.
-- New models released on the Hub get day-0 support because AutoModel tracks the latest `transformers` version.
-
-See the [Hugging Face API Compatibility](../guides/huggingface-api-compatibility.md) guide and [Model Coverage](../model-coverage/overview.md) for details.
-
-## Optimized Model Implementations
-
-AutoModel ships optimized implementations for supported architectures (fused attention, TransformerEngine layers, DeepEP for MoE routing, FlexAttn) while keeping the standard `transformers` API surface. This means:
-
-- **Faster training and inference** with no code changes -- load a model the same way you would with `transformers` and get accelerated kernels automatically.
-- **No checkpoint conversion** -- weights are loaded from and saved to the native Hugging Face format.
-- **Day-0 model support** -- because AutoModel builds on `transformers`, newly released models on the Hub work immediately. Optimized kernels are added incrementally for popular architectures.
-
-## Use as a Library
-
-NeMo AutoModel is not limited to its built-in training recipes. Because it implements the Hugging Face `AutoModel` API, any library or framework that loads models through `transformers` can swap in AutoModel to get optimized performance:
-
-- **Reinforcement learning** (e.g., TRL, OpenRLHF) -- replace the policy or reference model with an AutoModel instance for faster rollouts and gradient steps.
-- **Evaluation and benchmarking** -- plug into lm-evaluation-harness or custom eval loops with no API changes.
-- **Custom training loops** -- import individual components (optimizers, loss functions, distributed strategies) without using recipes at all.
-
-```python
-from nemo_automodel import NeMoAutoModelForCausalLM
-
-model = NeMoAutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
-```
-
-The returned model is a standard `nn.Module` with the same forward signature as the `transformers` equivalent, so it works anywhere a Hugging Face model is expected.
-
-## What's Next
-
-::::{grid} 1 2 2 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`zap;1.5em;sd-mr-1` Key Features and Concepts
-:link: about-key-features
-:link-type: ref
-Explore the main features, supported workflows, and core concepts.
-:::
-
-:::{grid-item-card} {octicon}`rocket;1.5em;sd-mr-1` Quickstart
-:link: ../index
-:link-type: doc
-Jump to the quickstart table to find the right guide for your task.
-:::
-
-::::
diff --git a/fern/versions/nightly/pages/about/index.mdx b/docs/about/index.mdx
similarity index 100%
rename from fern/versions/nightly/pages/about/index.mdx
rename to docs/about/index.mdx
diff --git a/docs/about/key-features.md b/docs/about/key-features.md
deleted file mode 100644
index 3f8b43168b..0000000000
--- a/docs/about/key-features.md
+++ /dev/null
@@ -1,188 +0,0 @@
----
-description: "Key features and core concepts of NeMo AutoModel for scalable LLM and VLM training with Hugging Face integration"
-categories: ["concepts-architecture"]
-tags: ["features", "benchmarks", "parallelism", "peft", "distributed", "recipes", "components"]
-personas: ["machine-learning-engineer", "researcher", "devops"]
-difficulty: "beginner"
-content_type: "concept"
----
-
-(about-key-features)=
-
-# Key Features and Concepts
-
-NeMo AutoModel provides GPU-accelerated, `transformers`-compatible training for LLMs and VLMs. It combines Hugging Face's model ecosystem with NVIDIA's optimized training stack, delivering high throughput without sacrificing ease of use.
-
-## Why NeMo AutoModel?
-
-- **Hugging Face native**: Train any model from the Hub with no checkpoint conversion -- day-0 support for new releases.
-- **High performance**: Custom CUDA kernels (TransformerEngine, DeepEP, FlexAttn) deliver up to 279 TFLOPs/sec/GPU.
-- **Any scale**: The same recipe runs on 1 GPU or across hundreds of nodes -- parallelism is configuration, not code.
-- **Hackable**: Linear training scripts with YAML config. No hidden trainer abstractions.
-- **Open source**: Apache 2.0 licensed, NVIDIA-supported, and actively maintained.
-
-### Performance Highlights
-
-| Model | GPUs | TFLOPs/sec/GPU | Tokens/sec/GPU | Optimizations |
-|-------|-----:|---------------:|---------------:|---------------|
-| DeepSeek V3 671B | 256 | 250 | 1,002 | TE + DeepEP |
-| GPT-OSS 20B | 8 | 279 | 13,058 | TE + DeepEP + FlexAttn |
-| Qwen3 MoE 30B | 8 | 212 | 11,842 | TE + DeepEP |
-
-See the [full benchmark results](../performance-summary.md) for configuration details and more models.
-
----
-
-## Training Workflows
-
-NeMo AutoModel supports a range of training tasks across LLM and VLM modalities.
-
-::::{grid} 1 2 2 3
-:gutter: 2
-
-:::{grid-item-card} {octicon}`mortar-board;1.5em;sd-mr-1` Supervised Fine-Tuning (SFT)
-:link: ../guides/llm/finetune
-:link-type: doc
-Full-parameter fine-tuning for task-specific adaptation.
-:::
-
-:::{grid-item-card} {octicon}`cpu;1.5em;sd-mr-1` PEFT (LoRA / QLoRA)
-:link: ../guides/llm/finetune
-:link-type: doc
-Memory-efficient fine-tuning by updating only low-rank adapter weights.
-:::
-
-:::{grid-item-card} {octicon}`iterations;1.5em;sd-mr-1` Pre-Training
-:link: ../guides/llm/pretraining
-:link-type: doc
-Train models from scratch on large-scale datasets.
-:::
-
-:::{grid-item-card} {octicon}`dependabot;1.5em;sd-mr-1` Knowledge Distillation
-:link: ../guides/llm/knowledge-distillation
-:link-type: doc
-Transfer knowledge from a large teacher to a smaller student model.
-:::
-
-:::{grid-item-card} {octicon}`tools;1.5em;sd-mr-1` Tool Calling
-:link: ../guides/llm/toolcalling
-:link-type: doc
-Fine-tune models for structured function calling with tool schemas.
-:::
-
-:::{grid-item-card} {octicon}`meter;1.5em;sd-mr-1` Quantization-Aware Training
-:link: ../guides/quantization-aware-training
-:link-type: doc
-Train with quantization for deployment-ready models.
-:::
-
-::::
-
----
-
-## Parallelism and Scaling
-
-NeMo AutoModel leverages PyTorch-native parallelism strategies to scale training from a single GPU to multi-node clusters.
-
-::::{grid} 1 2 2 2
-:gutter: 2
-
-:::{grid-item-card} {octicon}`git-merge;1.5em;sd-mr-1` FSDP2
-Fully Sharded Data Parallelism with DTensor for memory-efficient distributed training. Supports Hybrid Sharding (HSDP) for multi-node.
-:::
-
-:::{grid-item-card} {octicon}`git-merge;1.5em;sd-mr-1` Pipeline Parallelism
-Torch-native pipelining composable with FSDP2 and DTensor for 3D parallelism.
-:::
-
-:::{grid-item-card} {octicon}`zap;1.5em;sd-mr-1` FP8 Mixed Precision
-FP8 training via torchao for reduced memory and higher throughput on supported models.
-:::
-
-:::{grid-item-card} {octicon}`server;1.5em;sd-mr-1` Multi-Node with SLURM
-Add a `slurm:` section to any YAML config and launch with the `automodel` CLI. See the [Cluster guide](../launcher/slurm.md).
-:::
-
-::::
-
----
-
-## Core Concepts
-
-### Recipes
-
-Recipes are executable Python scripts paired with YAML configuration files. Each recipe defines a complete training workflow:
-
-1. **Load** a model and tokenizer from Hugging Face (via `_target_` in YAML)
-2. **Prepare** a dataset with the appropriate collator and chat template
-3. **Train** with a configurable loop (gradient accumulation, validation, logging)
-4. **Checkpoint** using Distributed Checkpoint (DCP) with SafeTensors output
-
-```yaml
-recipe:
-  _target_: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-```
-
-Override any field from the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --step_scheduler.local_batch_size 16
-```
-
-### Components
-
-Components are modular, self-contained building blocks that recipes assemble:
-
-| Component | Purpose |
-|-----------|---------|
-| `datasets/` | LLM and VLM datasets with collators, tokenization, and chat templates |
-| `distributed/` | FSDP2, MegatronFSDP, tensor/sequence/pipeline parallelism |
-| `_peft/` | LoRA and QLoRA implementations |
-| `attention/` | Fused attention, rotary embeddings, FlexAttn |
-| `checkpoint/` | DCP save/load with SafeTensors output |
-| `moe/` | Mixture of Experts routing and DeepEP integration |
-| `optim/` | Optimizers and LR schedulers |
-| `loss/` | Cross-entropy, linear cross-entropy, KD loss |
-| `launcher/` | SLURM and interactive job launch |
-
-Each component can be used independently and has no cross-module imports.
-
-### The `automodel` CLI
-
-The CLI simplifies job launch across environments:
-
-```bash
-# Single-node interactive
-automodel config.yaml
-
-# Multi-node SLURM batch
-sbatch my_cluster.sub  # copy slurm.sub, edit CONFIG & SBATCH directives, then submit
-```
-
-See the [Local Workstation](../launcher/local-workstation.md) and [Cluster](../launcher/slurm.md) guides.
-
----
-
-## Checkpointing
-
-NeMo AutoModel writes Distributed Checkpoints (DCP) with SafeTensors shards. Checkpoints carry partition metadata to:
-
-- **Merge** into a single Hugging Face-compatible checkpoint for inference or sharing.
-- **Reshard** when loading onto a different mesh or topology.
-- **Resume** training from any checkpoint without manual intervention.
-
-See the [Checkpointing guide](../guides/checkpointing.md) for details.
-
-## Experiment Tracking
-
-NeMo AutoModel integrates with MLflow and Weights & Biases for experiment tracking, metric logging, and artifact management. See the [Experiment Tracking guide](../guides/mlflow-logging.md).
diff --git a/fern/versions/nightly/pages/about/key-features.mdx b/docs/about/key-features.mdx
similarity index 100%
rename from fern/versions/nightly/pages/about/key-features.mdx
rename to docs/about/key-features.mdx
diff --git a/docs/announcements.md b/docs/announcements.md
deleted file mode 100644
index 77310123ee..0000000000
--- a/docs/announcements.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Announcements
-
-See also the [Model Coverage Release Log](model-coverage/overview.md#release-log) for newly supported models across releases.
-
-- [Accelerating Large-Scale Mixture-of-Experts Training in PyTorch with NeMo Automodel](https://github.com/NVIDIA-NeMo/Automodel/discussions/777)
-- [Challenges in Enabling PyTorch Native Pipeline Parallelism for Hugging Face Transformer Models](https://github.com/NVIDIA-NeMo/Automodel/discussions/589)
-- [Google Gemma 3n: Efficient Multimodal Fine-tuning Made Simple](https://github.com/NVIDIA-NeMo/Automodel/discussions/494)
-- [Fine-tune Hugging Face Models Instantly with Day-0 Support with NVIDIA NeMo AutoModel](https://github.com/NVIDIA-NeMo/Automodel/discussions/477)
diff --git a/fern/versions/nightly/pages/announcements.mdx b/docs/announcements.mdx
similarity index 100%
rename from fern/versions/nightly/pages/announcements.mdx
rename to docs/announcements.mdx
diff --git a/fern/versions/nightly/pages/api-reference/index.mdx b/docs/api-reference/index.mdx
similarity index 100%
rename from fern/versions/nightly/pages/api-reference/index.mdx
rename to docs/api-reference/index.mdx
diff --git a/docs/autodoc2_docstrings_parser.py b/docs/autodoc2_docstrings_parser.py
deleted file mode 100644
index 1f8e1bdbfd..0000000000
--- a/docs/autodoc2_docstrings_parser.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# flake8: noqa
-# pylint: skip-file
-from docutils import nodes
-from myst_parser.parsers.sphinx_ import MystParser
-from sphinx.ext.napoleon.docstring import GoogleDocstring
-
-
-class NapoleonParser(MystParser):
-    def parse(self, input_string: str, document: nodes.document) -> None:
-        # Get the Sphinx configuration
-        config = document.settings.env.config
-
-        # Process with Google style
-        google_parsed = str(GoogleDocstring(input_string, config))
-
-        return super().parse(google_parsed, document)
-
-
-Parser = NapoleonParser
diff --git a/docs/breaking-changes.md b/docs/breaking-changes.md
deleted file mode 100644
index 0c7f376731..0000000000
--- a/docs/breaking-changes.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# Breaking Changes
-
-## 0.4.0 · 26.04
-
-### CLI Signature Change
-
-**Before:**
-
-```
-automodel <command> <domain> -c <config.yaml>
-```
-
-**After:**
-
-```
-automodel <config.yaml> [--nproc-per-node N] [--overrides ...]
-```
-
-A short alias `am` is also available:
-
-```
-am <config.yaml> [--nproc-per-node N] [--overrides ...]
-```
-
-The positional `<command>` and `<domain>` arguments have been removed. The recipe
-class is now specified inside the YAML config via the `recipe._target_` key.
-
-### YAML Config: New Required `recipe` Section
-
-All YAML configs now require a top-level `recipe:` key:
-
-```yaml
-recipe:
-  _target_: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-```
-
-Configs without this key will produce an error with guidance on which target to add.
-
-#### Available Recipe Targets
-
-| Use Case | `_target_` |
-|---|---|
-| LLM fine-tuning / pre-training | `nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction` |
-| VLM fine-tuning | `nemo_automodel.recipes.vlm.finetune.FinetuneRecipeForVLM` |
-| Knowledge distillation | `nemo_automodel.recipes.llm.kd.KnowledgeDistillationRecipeForNextTokenPrediction` |
-| Benchmarking | `nemo_automodel.recipes.llm.benchmark.BenchmarkingRecipeForNextTokenPrediction` |
-| Sequence classification | `nemo_automodel.recipes.llm.train_seq_cls.TrainFinetuneRecipeForSequenceClassification` |
-| Biencoder training | `nemo_automodel.recipes.biencoder.train_biencoder.TrainBiencoderRecipe` |
-
-### Launcher Configuration Moved to YAML
-
-Multi-node launch settings (Kubernetes, NeMo-Run) are now configured
-entirely within the YAML config file rather than through CLI arguments.
-
-| Launcher | YAML section |
-|---|---|
-| Kubernetes | `k8s:` |
-| NeMo-Run | `nemo_run:` |
-
-If none of these sections are present the job runs locally (interactive mode).
-
-### SLURM: Script-Based Submission
-
-The `slurm:` YAML section and all related fields have been removed.  SLURM
-jobs are now submitted with `sbatch` directly, using a self-contained sbatch
-script.  Copy the reference template and adapt it to your cluster:
-
-```bash
-cp slurm.sub my_cluster.sub
-# Edit CONFIG, #SBATCH directives, container, mounts, etc.
-sbatch my_cluster.sub
-```
-
-The script runs `torchrun -m nemo_automodel.cli.app` on each node, which
-detects the distributed environment and executes the recipe in-process.
-All cluster-specific configuration lives in the sbatch script where you can
-see and edit it directly.
-
-### Lightweight CLI-Only Install
-
-A new `automodel[cli]` install extra is available for login nodes or environments
-where you only need to submit jobs (SLURM, k8s, NeMo-Run) without running
-training locally:
-
-```
-pip install nemo-automodel[cli]
-```
-
-This installs only `pyyaml` -- no PyTorch, no CUDA dependencies. It is enough
-to submit jobs via SLURM or Kubernetes. If you also need NeMo-Run, install it
-separately (`pip install nemo-run`). If you try to run a local/interactive job
-with the CLI-only install, you will get a clear error message with instructions
-to install the full package.
-
-### CLI Module Lives Inside the Package
-
-The CLI entry-point lives at `nemo_automodel/cli/app.py` and is registered as
-the `automodel` / `am` console entry-points. A thin convenience wrapper
-(`app.py`) at the repository root is available for running from a source
-checkout but is **not** installed as part of the package.
-
-### Example Wrapper Scripts Deprecated
-
-The Python wrapper scripts in `examples/` (for example, `examples/llm_finetune/finetune.py`)
-are deprecated. They now print a deprecation warning and delegate to the recipe
-directly. Use `automodel <config.yaml>` instead.
diff --git a/fern/versions/nightly/pages/breaking-changes.mdx b/docs/breaking-changes.mdx
similarity index 100%
rename from fern/versions/nightly/pages/breaking-changes.mdx
rename to docs/breaking-changes.mdx
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index ad9e002958..0000000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Configuration file for the Sphinx documentation builder.
-#
-# For the full list of built-in configuration values, see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Project information -----------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-
-import datetime
-import os
-import sys
-
-# flake8: noqa
-# pylint: skip-file
-
-# Embed a build timestamp so every docs build produces unique content,
-# ensuring Akamai ECCU revalidate detects changed ETags on S3.
-build_timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-project = "NeMo-AutoModel"
-copyright = "2026, NVIDIA Corporation"
-author = "NVIDIA Corporation"
-release = "nightly"
-
-# -- General configuration ---------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
-
-extensions = [
-    "myst_parser",  # For our markdown docs
-    "autodoc2",  # Generates API docs
-    "sphinx.ext.viewcode",  # For adding a link to view source code in docs
-    "sphinx.ext.doctest",  # Allows testing in docstrings
-    "sphinx.ext.napoleon",  # For google style docstrings
-    "sphinx_copybutton",  # For copy button in code blocks
-    "sphinx_design",  # For grid layout and card components
-]
-
-templates_path = ["_templates"]
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "documentation.md"]
-
-# -- Options for MyST Parser (Markdown) --------------------------------------
-# MyST Parser settings
-myst_enable_extensions = [
-    "dollarmath",  # Enables dollar math for inline math
-    "amsmath",  # Enables LaTeX math for display mode
-    "colon_fence",  # Enables code blocks using ::: delimiters instead of ```
-    "deflist",  # Supports definition lists with term: definition format
-    "fieldlist",  # Enables field lists for metadata like :author: Name
-    "tasklist",  # Adds support for GitHub-style task lists with [ ] and [x]
-]
-myst_heading_anchors = 5  # Generates anchor links for headings up to level 5
-
-# -- Options for Autodoc2 ---------------------------------------------------
-sys.path.insert(0, os.path.abspath(".."))
-
-autodoc2_packages = [
-    "../nemo_automodel",  # Path to your package relative to conf.py
-]
-autodoc2_render_plugin = "myst"  # Use MyST for rendering docstrings
-autodoc2_output_dir = "apidocs"  # Output directory for autodoc2 (relative to docs/)
-# This is a workaround that uses the parser located in autodoc2_docstrings_parser.py to allow autodoc2 to
-# render google style docstrings.
-# Related Issue: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33
-autodoc2_docstring_parser_regexes = [
-    (r".*", "docs.autodoc2_docstrings_parser"),
-]
-# Exclude specific modules from autodoc2 generation
-autodoc2_skip_module_regexes = [
-    r"nemo_automodel\.package_info",  # Exclude top-level package info file
-]
-
-# Suppress build warnings that arise from generated files (harmless)
-suppress_warnings = [
-    "myst.header",  # Skip warnings about heading level starting at H2 in generated docs
-    "autodoc2.dup_item",  # Skip duplicate item warnings from autodoc2 analysis
-]
-
-# -- Options for HTML output -------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
-
-html_theme = "nvidia_sphinx_theme"
-# Add the docs render/build date to the footer on every page.
-# The NVIDIA theme includes a "last-updated" footer component that shows
-# Sphinx's `last_updated` value when this is set.
-html_last_updated_fmt = "%Y-%m-%d"
-html_theme_options = {
-    "icon_links": [
-        {
-            "name": "GitHub",
-            "url": "https://github.com/NVIDIA-NeMo/Automodel/",
-            "icon": "fa-brands fa-github",
-        }
-    ],
-    "switcher": {
-        "json_url": "../versions1.json",
-        "version_match": release,
-    },
-    "extra_head": {
-        f"""
-    <meta name="build-timestamp" content="{build_timestamp}">
-    <script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" ></script>
-    """
-    },
-    "extra_footer": {
-        """
-    <script type="text/javascript">if (typeof _satellite !== "undefined") {_satellite.pageBottom();}</script>
-    """
-    },
-}
-html_extra_path = ["project.json", "versions1.json"]
-
-# Github links are now getting rate limited from the Github Actions
-linkcheck_ignore = [
-    ".*github\\.com.*",
-    ".*githubusercontent\\.com.*",
-    ".*huggingface\\.co.*",  # Gated model pages require authentication; non-checkable from CI
-    ".*llama\\.com.*",  # Returns 400 to automated crawlers
-    ".*ai\\.meta\\.com.*",  # Returns 400 to automated crawlers
-]
diff --git a/docs/documentation.md b/docs/documentation.md
deleted file mode 100644
index ba556497b0..0000000000
--- a/docs/documentation.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# Documentation Development
-
-- [Documentation Development](#documentation-development)
-  - [Build the Documentation](#build-the-documentation)
-  - [Live Building](#live-building)
-  - [Documentation Version](#documentation-version)
-
-
-## Build the Documentation
-
-The following sections describe how to set up and build the NeMo Automodel documentation.
-
-Switch to the documentation source folder and generate HTML output.
-
-```sh
-cd docs/
-uv run sphinx-build . _build/html
-```
-
-* The resulting HTML files are generated in a `_build/html` folder that is created under the project `docs/` folder.
-* The generated python API docs are placed in `apidocs` under the `docs/` folder.
-
-## Live Building
-
-When writing documentation, it can be helpful to serve the documentation and have it update live while you edit.
-
-To do so, run:
-
-```sh
-cd docs/
-uv run sphinx-autobuild . _build/html --port 12345 --host 0.0.0.0
-```
-
-Open a web browser and go to `http://0.0.0.0:12345` to view the output.
-
-## Documentation Version
-
-The three files below control the version switcher. Before you attempt to publish a new version of the documentation, update these files to match the latest version numbers.
-
-* docs/versions1.json
-* docs/project.json
-* docs/conf.py
diff --git a/fern/versions/nightly/pages/documentation.mdx b/docs/documentation.mdx
similarity index 100%
rename from fern/versions/nightly/pages/documentation.mdx
rename to docs/documentation.mdx
diff --git a/fern/Makefile b/docs/fern/Makefile
similarity index 95%
rename from fern/Makefile
rename to docs/fern/Makefile
index e2dfbb57c3..21a19361d3 100644
--- a/fern/Makefile
+++ b/docs/fern/Makefile
@@ -1,9 +1,9 @@
 # NeMo AutoModel — Fern docs convenience targets.
-# Runs from this directory (fern/). Invoke as:
+# Runs from this directory (docs/fern/). Invoke as:
 #
-#     cd fern && make docs
+#     cd docs/fern && make docs
 #     # or from anywhere in the repo:
-#     make -C fern docs
+#     make -C docs/fern docs
 #
 # CI workflows under `.github/workflows/fern-docs-*.yml` are the source of
 # truth for the published pipeline; these targets just mirror the
@@ -24,8 +24,8 @@ help:
 	@echo "NeMo AutoModel — Fern docs Make targets"
 	@echo "========================================"
 	@echo ""
-	@echo "Run from this directory ('cd fern && make <target>')"
-	@echo "or from anywhere in the repo ('make -C fern <target>')."
+	@echo "Run from this directory ('cd docs/fern && make <target>')"
+	@echo "or from anywhere in the repo ('make -C docs/fern <target>')."
 	@echo ""
 	@echo "  make docs-login             FIRST-TIME SETUP — provision Fern account + CLI auth"
 	@echo "  make docs                   Generate library reference and start Fern dev server"
diff --git a/fern/README.md b/docs/fern/README.md
similarity index 63%
rename from fern/README.md
rename to docs/fern/README.md
index 51426de57f..ff3bfe9271 100644
--- a/fern/README.md
+++ b/docs/fern/README.md
@@ -1,8 +1,10 @@
 # NeMo AutoModel — Fern Docs
 
-This directory holds the Fern MDX source for the NeMo AutoModel documentation site at **[docs.nvidia.com/nemo/automodel](https://docs.nvidia.com/nemo/automodel)**.
+This directory holds the Fern build infrastructure (config, repo-specific components, frozen version snapshots) for the NeMo AutoModel documentation site at **[docs.nvidia.com/nemo/automodel](https://docs.nvidia.com/nemo/automodel)**.
 
-The legacy Sphinx tree under `../docs/` remains in place for reference until the Fern site is fully validated; new pages and edits should land here.
+**The MDX content lives one level up, in `docs/` itself** — nightly pages sit alongside this `docs/fern/` directory, either directly in `docs/` or in its subdirectories (e.g. `docs/index.mdx`, `docs/guides/llm/finetune.mdx`). Fern reads those files via relative `path: ../../<...>.mdx` entries in `docs/fern/versions/nightly.yml`.
+
+NVIDIA branding (logos, favicon, footer, fonts, NVIDIA-green CSS, OneTrust JS) comes from the central control repo at **[NVIDIA/fern-components](https://github.com/NVIDIA/fern-components)** via `global-theme: nvidia` in `docs.yml` — no logos or theme CSS are vendored locally.
 
 ## Quick links
 
@@ -10,8 +12,8 @@ The legacy Sphinx tree under `../docs/` remains in place for reference until the
 |---|---|
 | Published site | https://docs.nvidia.com/nemo/automodel |
 | Fern dashboard | https://dashboard.buildwithfern.com (NVIDIA org) |
-| Skill for agents | [`../skills/fern-docs/SKILL.md`](../skills/fern-docs/SKILL.md) |
-| CI workflows | [`../.github/workflows/fern-docs-*.yml`](../.github/workflows/) |
+| Skill for agents | [`../../skills/fern-docs/SKILL.md`](../../skills/fern-docs/SKILL.md) |
+| CI workflows | [`../../.github/workflows/fern-docs-*.yml`](../../.github/workflows/) |
 | Make targets | [`./Makefile`](./Makefile) |
 
 ## Quickstart
@@ -19,8 +21,8 @@ The legacy Sphinx tree under `../docs/` remains in place for reference until the
 First time on this machine:
 
 ```bash
-# All Make targets live in fern/Makefile — run them from this directory
-# (`cd fern && make <target>`) or from anywhere with `make -C fern <target>`.
+# All Make targets live in docs/fern/Makefile — run them from this directory
+# (`cd docs/fern && make <target>`) or from anywhere with `make -C docs/fern <target>`.
 
 # 1. Install the Fern CLI globally (one-time)
 npm install -g fern-api
@@ -28,7 +30,7 @@ npm install -g fern-api
 
 # 2. Provision your Fern account + CLI auth (one-time per machine).
 #    Walks you through the dashboard sign-in step before running `fern login`.
-cd fern && make docs-login
+cd docs/fern && make docs-login
 
 # 3. Build the API library reference and start the local dev server
 make docs           # http://localhost:3002
@@ -55,34 +57,36 @@ make docs-check
 ## Layout
 
 ```
-fern/
-├── fern.config.json          # Fern CLI pin (4.62.4+) and org slug
-├── docs.yml                  # Site config: instances, versions, redirects, libraries, theme
-├── main.css                  # NVIDIA-green theme overrides
-├── assets/                   # Logos and shared SVGs
-├── components/               # BadgeLinks.tsx, Tag.tsx, CustomFooter.tsx
-├── versions/
-│   ├── nightly.yml           # Nav for the bleeding-edge tree — paths point at ./nightly/pages/
-│   ├── nightly/pages/        # Bleeding-edge MDX content (edited on every PR)
-│   ├── v0.4.yml              # Nav for the frozen 0.4.0 GA snapshot — paths point at ./v0.4/pages/
-│   ├── v0.4/pages/           # Frozen 0.4.0 content (back-ports only)
-│   └── latest.yml            # GA alias — paths point at ./v0.4/pages/; bumps to ./v0.5/pages/ at next GA cut
-└── product-docs/             # GENERATED Python API reference (gitignored — `make docs` regenerates)
+docs/                            ← nightly MDX lives here (parent of fern/)
+├── index.mdx, breaking-changes.mdx, release-notes.mdx, ...
+├── about/, guides/, model-coverage/, launcher/, api-reference/
+├── *.png / *.jpg                ← page-scoped images
+└── fern/                        ← THIS DIRECTORY
+    ├── fern.config.json         # Fern CLI pin (5.29.0+) and org slug
+    ├── docs.yml                 # Site config: instances, versions, redirects, libraries, global-theme: nvidia
+    ├── components/              # BadgeLinks.tsx, Tag.tsx (repo-specific only;
+    │                            #   NVIDIA-branded footer/logo/CSS ship via global-theme)
+    ├── versions/
+    │   ├── nightly.yml          # Nav for nightly — paths point at ../../<path>.mdx (up into docs/)
+    │   ├── v0.4.yml             # Nav for the frozen 0.4.0 GA snapshot — paths at ./v0.4/pages/
+    │   ├── v0.4/pages/          # Frozen 0.4.0 content (back-ports only; never edited from nightly)
+    │   └── latest.yml           # GA alias — paths at ./v0.4/pages/ today; repointed at next GA cut
+    └── product-docs/            # GENERATED Python API reference (gitignored — `make docs` regenerates)
 ```
 
 ```
 File path                                                  Published URL
 ─────────────────────────────────────────────────────────  ─────────────────────────────────────────────────
-fern/versions/nightly/pages/get-started/installation.mdx   docs.nvidia.com/nemo/automodel/nightly/get-started/installation
-fern/versions/v0.4/pages/get-started/installation.mdx      docs.nvidia.com/nemo/automodel/v0.4/get-started/installation
+docs/guides/installation.mdx                               docs.nvidia.com/nemo/automodel/nightly/get-started/installation
+docs/fern/versions/v0.4/pages/get-started/installation.mdx docs.nvidia.com/nemo/automodel/v0.4/get-started/installation
                                                            docs.nvidia.com/nemo/automodel/latest/get-started/installation  (latest mounts v0.4 content)
 ```
 
-`nightly/pages/` and `v0.4/pages/` are **separate, independent content trees**. `nightly/` is the bleeding-edge tree edited on every PR; `v0.4/` is the frozen 0.4.0 release snapshot, only changed via deliberate back-port. `latest.yml` mounts `./v0.4/pages/` so `/latest/...` URLs serve the current GA — at the next GA cut, `latest.yml` repoints at the new train. Today the two trees are byte-for-byte identical (we just shipped 0.4.0); they'll diverge as nightly accumulates post-release edits.
+The **`docs/` top-level tree IS the nightly tree** — every PR lands there. The **`docs/fern/versions/v0.4/pages/` tree is a frozen 0.4.0 release snapshot**, only changed via deliberate back-port. `latest.yml` mounts `./v0.4/pages/` so `/latest/...` URLs serve the current GA — at the next GA cut, `latest.yml` repoints at the new train. Today the two trees are byte-for-byte identical (we just shipped 0.4.0); they'll diverge as nightly accumulates post-release edits.
 
 ## Local development
 
-From this directory (`cd fern` first, or use `make -C fern <target>` from anywhere):
+From this directory (`cd docs/fern` first, or use `make -C docs/fern <target>` from anywhere):
 
 ```bash
 make docs           # `fern docs md generate` + `fern docs dev` → http://localhost:3002
@@ -93,7 +97,7 @@ make docs-publish   # trigger the `Publish Fern Docs` workflow on origin/main
 
 For first-time-on-this-machine setup, see the [Quickstart](#quickstart) above — `make docs-login` walks through dashboard provisioning + `fern login` together.
 
-`fern docs md generate` (run by `make docs`) populates `fern/product-docs/` from the `nemo_automodel` package source declared in the `libraries:` block of `docs.yml`. Without it, a cold `fern docs dev` will fail with `Folder not found: ./product-docs/...`. Re-run only when the upstream Python source changes — for prose-only iteration, `cd fern && fern docs dev` alone is enough.
+`fern docs md generate` (run by `make docs`) populates `docs/fern/product-docs/` from the `nemo_automodel` package source declared in the `libraries:` block of `docs.yml`. Without it, a cold `fern docs dev` will fail with `Folder not found: ./product-docs/...`. Re-run only when the upstream Python source changes — for prose-only iteration, `cd docs/fern && fern docs dev` alone is enough.
 
 ## Sidebar fidelity rule
 
@@ -123,7 +127,8 @@ Use the bundled custom components in `components/`:
 |---|---|---|
 | `<BadgeLinks ... />` | Header badge rows on landing pages (PyPI, license, GitHub, …) | `import { BadgeLinks } from "@/components/BadgeLinks";` |
 | `<Tag variant="...">label</Tag>` | Card chips ("start here", "5 min", etc.) | `import { Tag } from "@/components/Tag";` |
-| `<CustomFooter />` | Wired in `docs.yml` `footer:`; **required** for NVIDIA legal/privacy compliance | (auto) |
+
+The shared NVIDIA `<CustomFooter />` (privacy / Do Not Sell / etc.) ships from the `nvidia` global theme — wired automatically, **not** authored in this repo.
 
 Standard Fern components are also available — `<Note>`, `<Tip>`, `<Info>`, `<Warning>`, `<Cards>` / `<Card>`, etc. Don't use GitHub `> [!NOTE]` syntax — it does not render in MDX.
 
@@ -156,28 +161,28 @@ Repository source paths like `examples/llm_finetune/foo.yaml` or `nemo_automodel
 | `Latest` | `latest` | `stable` | `./versions/latest.yml` |
 | `0.4.0 · 26.04` | `v0.4` | `stable` | `./versions/v0.4.yml` |
 
-**`nightly` is the bleeding-edge tree** — every PR lands here, and (once wired up) the daily build publishes from here. **`v0.4` is the frozen 0.4.0 GA snapshot** with its own copy of every page; it only changes via deliberate back-ports from nightly. `latest.yml` mounts the current GA's content (today: `./v0.4/pages/...`).
+**`nightly` reads the MDX directly from `docs/`** (via `path: ../../<...>.mdx` in `nightly.yml`) — every PR lands there, and (once wired up) the daily build publishes from that tree. **`v0.4` is the frozen 0.4.0 GA snapshot** with its own copy of every page under `docs/fern/versions/v0.4/pages/`; it only changes via deliberate back-ports from nightly. `latest.yml` mounts the current GA's content (today: `./v0.4/pages/...`).
 
 When the next GA cuts (e.g. `v0.5`):
 
-1. `cp -r versions/nightly versions/v0.5` — fresh frozen snapshot of nightly at release time
-2. `cp versions/nightly.yml versions/v0.5.yml`, then sed `./nightly/` → `./v0.5/` in the new file
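+1. `mkdir -p versions/v0.5/pages && rsync -a --exclude='/fern/' ../ versions/v0.5/pages/` (run from `docs/fern/`; the exclude keeps `docs/fern/` itself out of the copy) — fresh frozen snapshot of nightly at release time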
+2. `cp versions/nightly.yml versions/v0.5.yml`, then sed `../../` → `./v0.5/pages/` in the new file
 3. Repoint `versions/latest.yml` at the new GA: `cp versions/v0.5.yml versions/latest.yml`
 4. Add the new frozen-pin entry to `docs.yml` `versions:` (`display-name: "0.5.0"`, `slug: v0.5`, `availability: stable`); keep `v0.4` per support policy
-5. `versions/nightly/pages/` keeps moving forward as the bleeding-edge tree; `versions/v0.4/pages/` and `versions/v0.5/pages/` are now both frozen
+5. `docs/` keeps moving forward as the nightly tree; `versions/v0.4/pages/` and `versions/v0.5/pages/` are both frozen
 
 ## CI and publishing
 
 | Workflow | Trigger | Purpose |
 |---|---|---|
 | `fern-docs-ci.yml` | `push: pull-request/[0-9]+` (FW-CI mirror) | `fern check` on PRs |
-| `fern-docs-preview-build.yml` | `pull_request` | Untrusted half: collect `fern/` artifact (no secrets) |
+| `fern-docs-preview-build.yml` | `pull_request` | Untrusted half: collect `docs/fern/` artifact (no secrets) |
 | `fern-docs-preview-comment.yml` | `workflow_run` after build | Trusted half: build preview with `DOCS_FERN_TOKEN`, post 🌿 comment |
-| `publish-fern-docs.yml` | push to `main` (`fern/**`), `docs/v*` tag, or manual | Publish to docs.nvidia.com/nemo/automodel |
+| `publish-fern-docs.yml` | push to `main` (`docs/**`), `docs/v*` tag, or manual | Publish to docs.nvidia.com/nemo/automodel |
 
 Required org secret: **`DOCS_FERN_TOKEN`** (already wired for the existing `build-docs.yml`).
 
-PRs that touch `fern/**` get an automatic preview URL posted as a 🌿 comment.
+PRs that touch `docs/fern/**` (the `paths:` filter in `fern-docs-preview-build.yml`) get an automatic preview URL posted as a 🌿 comment.
 
 ## Commits
 
@@ -187,19 +192,19 @@ DCO sign-off is required:
 git commit -s -m "docs: <add|update|remove> <page-title>"
 ```
 
-PR titles follow Conventional Commits (e.g. `docs(fern): add gemma4 fine-tuning guide`) — see [`AGENTS.md`](../AGENTS.md) for the full convention.
+PR titles follow Conventional Commits (e.g. `docs(fern): add gemma4 fine-tuning guide`) — see [`AGENTS.md`](../../AGENTS.md) for the full convention.
 
 ## Troubleshooting
 
 | Symptom | Fix |
 |---|---|
-| `fern check` YAML error | 2-space indent; `- page:` inside `contents:`; `path:` is relative to the version YAML |
+| `fern check` YAML error | 2-space indent; `- page:` inside `contents:`; `path:` is relative to the version YAML (so nightly paths reach back up via `../../`) |
 | Page 404 in preview | `slug:` collision in the same section, or missing `slug:` override (default slugifies the long display title) |
 | `Folder not found: ./product-docs/...` in `fern docs dev` | Run `make docs` once; library generation populates `product-docs/` |
 | `[ERR_PNPM_IGNORED_BUILDS]` on first `fern docs dev` | pnpm 10+ blocks esbuild's postinstall — `pnpm config set onlyBuiltDependencies '["esbuild"]' --location global`, then `rm -rf ~/.fern/app-preview` and retry |
 | Broken-link warning for version-agnostic path | `fern docs broken-links` false-positives on links without a version slug; the URLMap-based `validate_fern_internal_links.py` is authoritative |
 | `JSX expressions must have one parent element` | Wrap multi-element JSX in `<>...</>` or a `<div>` |
-| Card badges have no spacing | Use `<Tag>` (NeMo AutoModel landing pattern), not raw HTML; spacing is in `main.css` |
+| Card badges have no spacing | Use `<Tag>` (NeMo AutoModel landing pattern), not raw HTML; spacing comes from the `nvidia` global theme's CSS |
 | Old Sphinx URL breaks | Add a `redirects:` entry in `docs.yml` |
 | `<basepath>/<version>/index.html` 404s but deep paths work | `:path*` does not match the empty-path case ([NVIDIA-NeMo/Curator#1938](https://github.com/NVIDIA-NeMo/Curator/pull/1938)). Each version-root `index.html` needs its own explicit redirect rule — slot before the `:path*/index.html` catch-all |
 
diff --git a/fern/components/BadgeLinks.tsx b/docs/fern/components/BadgeLinks.tsx
similarity index 100%
rename from fern/components/BadgeLinks.tsx
rename to docs/fern/components/BadgeLinks.tsx
diff --git a/fern/components/Tag.tsx b/docs/fern/components/Tag.tsx
similarity index 95%
rename from fern/components/Tag.tsx
rename to docs/fern/components/Tag.tsx
index d7c7b5d78e..1bface0b58 100644
--- a/fern/components/Tag.tsx
+++ b/docs/fern/components/Tag.tsx
@@ -10,7 +10,7 @@
  * their visual cues after migration. Used by `convert_myst_to_fern.py` when
  * converting `{bdg-primary}\`text\`` to `<Tag variant="primary">text</Tag>`.
  *
- * Copy to your repo's `fern/components/` together with `BadgeLinks.tsx`.
+ * Copy to your repo's `docs/fern/components/` together with `BadgeLinks.tsx`.
  */
 export type TagVariant =
   | "primary"
diff --git a/fern/docs.yml b/docs/fern/docs.yml
similarity index 88%
rename from fern/docs.yml
rename to docs/fern/docs.yml
index 0480240298..ea0784e18f 100644
--- a/fern/docs.yml
+++ b/docs/fern/docs.yml
@@ -2,36 +2,20 @@ instances:
 - url: https://nemo-automodel.docs.buildwithfern.com/nemo/automodel
   custom-domain: docs.nvidia.com/nemo/automodel
 title: NVIDIA NeMo AutoModel
-footer: ./components/CustomFooter.tsx
-layout:
-  searchbar-placement: header
-  page-width: 1376px
-  sidebar-width: 248px
-  content-width: 812px
-  tabs-placement: header
-  hide-feedback: true
-colors:
-  accentPrimary:
-    dark: '#76B900'
-    light: '#76B900'
-  background:
-    light: '#FFFFFF'
-    dark: '#000000'
-theme:
-  page-actions: toolbar
-  footer-nav: minimal
+
+# Inherit NVIDIA branding (footer, colors, layout, fonts, NVIDIA logos+favicon,
+# OneTrust JS, theme CSS) from the central control repo at
+# https://github.com/NVIDIA/fern-components. At publish time Fern fetches the
+# `nvidia` theme from the registry and merges it on top of this file, so the
+# theme-owned fields (footer / layout / colors / theme / logo SVGs / favicon /
+# js / css) intentionally do NOT appear below.
+global-theme: nvidia
+
+# Override only the `right-text` slot of the theme's logo; SVG paths, height,
+# href stay inherited from the global theme. Same pattern as NeMo Curator.
 logo:
-  dark: ./assets/NVIDIA_dark.svg
-  light: ./assets/NVIDIA_light.svg
-  height: 20
-  href: /
   right-text: NeMo AutoModel
-favicon: ./assets/NVIDIA_symbol.svg
-js:
-- url: https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js
-  strategy: beforeInteractive
-css:
-- ./main.css
+
 navbar-links:
 - type: github
   value: https://github.com/NVIDIA-NeMo/Automodel
diff --git a/fern/fern.config.json b/docs/fern/fern.config.json
similarity index 59%
rename from fern/fern.config.json
rename to docs/fern/fern.config.json
index 3d47d15e31..aacfdddc82 100644
--- a/fern/fern.config.json
+++ b/docs/fern/fern.config.json
@@ -1,4 +1,4 @@
 {
   "organization": "nvidia",
-  "version": "4.62.4"
+  "version": "5.29.0"
 }
diff --git a/fern/versions/latest.yml b/docs/fern/versions/latest.yml
similarity index 100%
rename from fern/versions/latest.yml
rename to docs/fern/versions/latest.yml
diff --git a/docs/fern/versions/nightly.yml b/docs/fern/versions/nightly.yml
new file mode 100644
index 0000000000..5849938bb7
--- /dev/null
+++ b/docs/fern/versions/nightly.yml
@@ -0,0 +1,375 @@
+navigation:
+  - section: "Get Started"
+    contents:
+      - page: "About NeMo AutoModel"
+        path: ../../about/index.mdx
+        slug: about
+      - page: "Key Features and Concepts"
+        path: ../../about/key-features.mdx
+        slug: key-features
+      - page: "Install NeMo AutoModel"
+        path: ../../guides/installation.mdx
+        slug: installation
+      - page: "YAML Configuration"
+        path: ../../guides/configuration.mdx
+        slug: configuration
+      - page: "🤗 Transformers API Compatibility"
+        path: ../../guides/huggingface-api-compatibility.mdx
+        slug: hf-compatibility
+      - page: "Repository Structure"
+        path: ../../repository-structure.mdx
+        slug: repo-structure
+      - page: "Release Notes"
+        path: ../../release-notes.mdx
+        slug: release-notes
+  - section: "Announcements"
+    contents:
+      - page: "Announcements"
+        path: ../../announcements.mdx
+  - section: "NeMo AutoModel Performance"
+    slug: performance
+    contents:
+      - page: "Performance Summary"
+        path: ../../performance-summary.mdx
+        slug: performance-summary
+  - section: "Model Coverage"
+    contents:
+      - page: "Model Coverage Overview"
+        path: ../../model-coverage/overview.mdx
+        slug: overview
+      - page: "Model Release Log"
+        path: ../../model-coverage/latest-models.mdx
+        slug: release-log
+      - section: "Large Language Models (LLMs)"
+        slug: large-language-models
+        contents:
+          - page: "Overview"
+            path: ../../model-coverage/llm/index.mdx
+            slug: overview
+          - page: "Llama"
+            path: ../../model-coverage/llm/meta/llama.mdx
+          - page: "Gemma"
+            path: ../../model-coverage/llm/google/gemma.mdx
+          - page: "Qwen2"
+            path: ../../model-coverage/llm/qwen/qwen2.mdx
+          - page: "Qwen2 MoE"
+            path: ../../model-coverage/llm/qwen/qwen2-moe.mdx
+          - page: "Qwen3"
+            path: ../../model-coverage/llm/qwen/qwen3.mdx
+          - page: "Qwen3 MoE"
+            path: ../../model-coverage/llm/qwen/qwen3-moe.mdx
+          - page: "Qwen3-Next"
+            path: ../../model-coverage/llm/qwen/qwen3-next.mdx
+          - page: "ERNIE 4.5"
+            path: ../../model-coverage/llm/baidu/ernie4-5.mdx
+          - page: "DeepSeek"
+            path: ../../model-coverage/llm/deepseek-ai/deepseek.mdx
+          - page: "DeepSeek-V3"
+            path: ../../model-coverage/llm/deepseek-ai/deepseek-v3.mdx
+          - page: "DeepSeek-V4 Flash"
+            path: ../../model-coverage/llm/deepseek-ai/dsv4-flash.mdx
+          - page: "Mistral"
+            path: ../../model-coverage/llm/mistralai/mistral.mdx
+          - page: "Mixtral"
+            path: ../../model-coverage/llm/mistralai/mixtral.mdx
+          - page: "Ministral3 / Devstral"
+            path: ../../model-coverage/llm/mistralai/ministral3.mdx
+          - page: "Phi"
+            path: ../../model-coverage/llm/microsoft/phi.mdx
+          - page: "Phi-3 / Phi-4"
+            path: ../../model-coverage/llm/microsoft/phi3.mdx
+          - page: "Phi-3-Small"
+            path: ../../model-coverage/llm/microsoft/phi3-small.mdx
+          - page: "Nemotron / Minitron"
+            path: ../../model-coverage/llm/nvidia/nemotron.mdx
+          - page: "Nemotron-H"
+            path: ../../model-coverage/llm/nvidia/nemotron-h.mdx
+          - page: "Nemotron-Flash"
+            path: ../../model-coverage/llm/nvidia/nemotron-flash.mdx
+          - page: "Nemotron-Super (Llama-3.3-Nemotron-Super-49B)"
+            path: ../../model-coverage/llm/nvidia/nemotron-super.mdx
+          - page: "ChatGLM"
+            path: ../../model-coverage/llm/thudm/chatglm.mdx
+          - page: "GLM-4"
+            path: ../../model-coverage/llm/thudm/glm4.mdx
+          - page: "GLM-4 MoE (GLM-4.5 / GLM-4.7)"
+            path: ../../model-coverage/llm/thudm/glm4-moe.mdx
+          - page: "GLM-5 MoE (DSA)"
+            path: ../../model-coverage/llm/thudm/glm5-moe-dsa.mdx
+          - page: "Granite"
+            path: ../../model-coverage/llm/ibm/granite.mdx
+          - page: "Granite MoE"
+            path: ../../model-coverage/llm/ibm/granite-moe.mdx
+          - page: "Bamba"
+            path: ../../model-coverage/llm/ibm/bamba.mdx
+          - page: "OLMo"
+            path: ../../model-coverage/llm/allenai/olmo.mdx
+          - page: "OLMo2"
+            path: ../../model-coverage/llm/allenai/olmo2.mdx
+          - page: "OLMoE"
+            path: ../../model-coverage/llm/allenai/olmoe.mdx
+          - page: "GPT-OSS"
+            path: ../../model-coverage/llm/openai/gpt-oss.mdx
+          - page: "GPT-2"
+            path: ../../model-coverage/llm/openai/gpt2.mdx
+          - page: "GPT-J"
+            path: ../../model-coverage/llm/eleutherai/gpt-j.mdx
+          - page: "GPT-NeoX / Pythia"
+            path: ../../model-coverage/llm/eleutherai/gpt-neox.mdx
+          - page: "StarCoder"
+            path: ../../model-coverage/llm/bigcode/starcoder.mdx
+          - page: "StarCoder2"
+            path: ../../model-coverage/llm/bigcode/starcoder2.mdx
+          - page: "Aquila / Aquila2"
+            path: ../../model-coverage/llm/baai/aquila.mdx
+          - page: "Baichuan / Baichuan2"
+            path: ../../model-coverage/llm/baichuan-inc/baichuan.mdx
+          - page: "Command-R"
+            path: ../../model-coverage/llm/cohere/command-r.mdx
+          - page: "Falcon"
+            path: ../../model-coverage/llm/tiiuae/falcon.mdx
+          - page: "EXAONE"
+            path: ../../model-coverage/llm/lgai-exaone/exaone.mdx
+          - page: "InternLM"
+            path: ../../model-coverage/llm/internlm/internlm.mdx
+          - page: "Jais"
+            path: ../../model-coverage/llm/inceptionai/jais.mdx
+          - page: "MiniMax-M2"
+            path: ../../model-coverage/llm/minimax/minimax-m2.mdx
+          - page: "MiniCPM"
+            path: ../../model-coverage/llm/openbmb/minicpm.mdx
+          - page: "Moonlight"
+            path: ../../model-coverage/llm/moonshotai/moonlight.mdx
+          - page: "Seed (ByteDance)"
+            path: ../../model-coverage/llm/bytedance-seed/seed.mdx
+          - page: "Solar Pro"
+            path: ../../model-coverage/llm/upstage/solar.mdx
+          - page: "Orion"
+            path: ../../model-coverage/llm/orionstar/orion.mdx
+          - page: "StableLM"
+            path: ../../model-coverage/llm/stabilityai/stablelm.mdx
+          - page: "Step-3.5"
+            path: ../../model-coverage/llm/stepfun-ai/step-3-5.mdx
+          - page: "GritLM"
+            path: ../../model-coverage/llm/parasail-ai/gritlm.mdx
+          - page: "Hy3-preview"
+            path: ../../model-coverage/llm/tencent/hy3.mdx
+          - page: "MiMo-V2-Flash"
+            path: ../../model-coverage/llm/xiaomimimo/mimo-v2-flash.mdx
+          - page: "Ling 2.0"
+            path: ../../model-coverage/llm/inclusionai/ling-2.mdx
+      - section: "Vision Language Models (VLMs)"
+        slug: vision-language-models
+        contents:
+          - page: "Overview"
+            path: ../../model-coverage/vlm/index.mdx
+            slug: overview
+          - page: "Kimi-VL"
+            path: ../../model-coverage/vlm/moonshotai/kimi-vl.mdx
+          - page: "Gemma 3 VL / Gemma 3n"
+            path: ../../model-coverage/vlm/google/gemma3-vl.mdx
+          - page: "Gemma 4"
+            path: ../../model-coverage/vlm/google/gemma4.mdx
+          - page: "Qwen2.5-VL"
+            path: ../../model-coverage/vlm/qwen/qwen2-5-vl.mdx
+          - page: "Qwen3-VL / Qwen3-VL-MoE"
+            path: ../../model-coverage/vlm/qwen/qwen3-vl.mdx
+          - page: "Qwen3.5-VL"
+            path: ../../model-coverage/vlm/qwen/qwen3-5-vl.mdx
+          - page: "Nemotron-Parse"
+            path: ../../model-coverage/vlm/nvidia/nemotron-parse.mdx
+          - page: "Ministral3 VL"
+            path: ../../model-coverage/vlm/mistralai/ministral3-vl.mdx
+          - page: "Mistral Medium 3.5"
+            path: ../../model-coverage/vlm/mistralai/mistral-medium-3-5.mdx
+          - page: "Mistral-Small-4"
+            path: ../../model-coverage/vlm/mistralai/mistral-small-4.mdx
+          - page: "InternVL"
+            path: ../../model-coverage/vlm/internlm/internvl.mdx
+          - page: "Llama 4"
+            path: ../../model-coverage/vlm/meta/llama4.mdx
+          - page: "LLaVA-OneVision"
+            path: ../../model-coverage/vlm/lmms-lab/llava-onevision.mdx
+          - page: "SmolVLM"
+            path: ../../model-coverage/vlm/huggingface/smolvlm.mdx
+          - page: "LLaVA"
+            path: ../../model-coverage/vlm/llava-hf/llava.mdx
+      - section: "Omni Models"
+        slug: omni
+        contents:
+          - page: "Overview"
+            path: ../../model-coverage/omni/index.mdx
+            slug: overview
+          - page: "Qwen3-Omni"
+            path: ../../model-coverage/omni/qwen/qwen3-omni.mdx
+          - page: "Phi-4-multimodal"
+            path: ../../model-coverage/omni/microsoft/phi4-multimodal.mdx
+          - page: "Nemotron-Omni"
+            path: ../../model-coverage/omni/nvidia/nemotron-omni.mdx
+      - section: "Diffusion Models"
+        slug: diffusion
+        contents:
+          - page: "Overview"
+            path: ../../model-coverage/diffusion/index.mdx
+            slug: overview
+          - page: "Wan 2.1 T2V"
+            path: ../../model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx
+          - page: "FLUX.1-dev"
+            path: ../../model-coverage/diffusion/black-forest-labs/flux.mdx
+          - page: "HunyuanVideo 1.5"
+            path: ../../model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx
+          - page: "Qwen-Image"
+            path: ../../model-coverage/diffusion/qwen/qwen-image.mdx
+      - section: "Embedding Models"
+        slug: embedding-models
+        contents:
+          - page: "Overview"
+            path: ../../model-coverage/embedding/index.mdx
+            slug: overview
+          - page: "Llama (Bidirectional)"
+            path: ../../model-coverage/embedding/nvidia/llama-bidirectional.mdx
+            slug: llama-bidirectional
+          - page: "Ministral3 (Bidirectional)"
+            path: ../../model-coverage/embedding/mistralai/ministral3-bidirectional.mdx
+            slug: ministral3-bidirectional
+      - section: "Reranking Models"
+        slug: reranking-models
+        contents:
+          - page: "Overview"
+            path: ../../model-coverage/reranker/index.mdx
+            slug: overview
+          - page: "Llama (Bidirectional)"
+            path: ../../model-coverage/reranker/nvidia/llama-bidirectional.mdx
+            slug: llama-bidirectional
+  - section: "Recipes & E2E Examples"
+    contents:
+      - page: "Recipes and End-to-End Examples"
+        path: ../../guides/overview.mdx
+        slug: overview
+      - page: "Supervised Fine-Tuning (SFT) and Parameter-Efficient Fine-Tuning (PEFT) with NeMo AutoModel"
+        path: ../../guides/llm/finetune.mdx
+        slug: sft-peft
+      - page: "Function Calling with NeMo AutoModel using FunctionGemma"
+        path: ../../guides/llm/toolcalling.mdx
+        slug: function-calling
+      - page: "Knowledge Distillation with NeMo AutoModel"
+        path: ../../guides/llm/knowledge-distillation.mdx
+        slug: knowledge-distillation
+      - page: "Fine-Tune Large MoE LLMs"
+        path: ../../guides/llm/large-moe-finetune.mdx
+        slug: large-moe-fine-tuning
+      - page: "DeepSeek V4 Flash"
+        path: ../../guides/llm/dsv4-flash.mdx
+        slug: deepseek-v4-flash
+      - page: "Hy3-preview"
+        path: ../../guides/llm/hy3.mdx
+        slug: hy3-preview
+      - page: "Pretraining Megatron Core Datasets with NeMo AutoModel"
+        path: ../../guides/llm/pretraining.mdx
+        slug: pretraining
+      - page: "LLM Pre-Training with NeMo AutoModel"
+        path: ../../guides/llm/nanogpt-pretraining.mdx
+        slug: nanogpt-pretraining
+      - page: "Sequence Classification (SFT/PEFT) with NeMo AutoModel"
+        path: ../../guides/llm/sequence-classification.mdx
+        slug: sequence-classification
+      - page: "Fine-Tune Gemma 3 and Gemma 3n"
+        path: ../../guides/omni/gemma3-3n.mdx
+        slug: gemma-3-3n
+      - page: "Fine-Tuning Gemma 4 31B on CORD-v2 Receipts — End-to-End Guide"
+        path: ../../guides/vlm/gemma4.mdx
+        slug: gemma-4
+      - page: "Fine-Tune Qwen3.5-VL"
+        path: ../../guides/vlm/qwen3-5.mdx
+        slug: qwen3-5-vl
+      - page: "Nemotron-Omni"
+        path: ../../guides/vlm/nemotron-omni.mdx
+        slug: nemotron-omni
+      - page: "Mistral Medium 3.5 VL"
+        path: ../../guides/vlm/mistral-medium-3-5.mdx
+        slug: mistral-medium-3-5
+      - page: "Fine-Tune Qwen3-Omni for ASR"
+        path: ../../guides/audio/qwen3-omni-asr.mdx
+        slug: qwen3-omni-asr
+      - page: "Diffusion Model Fine-Tuning with NeMo AutoModel"
+        path: ../../guides/diffusion/finetune.mdx
+        slug: diffusion-fine-tuning
+      - page: "dLLM Fine-Tuning"
+        path: ../../guides/dllm/finetune.mdx
+        slug: dllm-fine-tuning
+      - page: "Quantization-Aware Training (QAT) in NeMo AutoModel"
+        path: ../../guides/quantization-aware-training.mdx
+        slug: qat
+      - page: "Model Training on Databricks"
+        path: ../../guides/llm/databricks.mdx
+        slug: databricks
+  - section: "Datasets"
+    contents:
+      - page: "Dataset Overview: LLM, VLM, and Retrieval Datasets in NeMo AutoModel"
+        path: ../../guides/dataset-overview.mdx
+        slug: overview
+      - page: "Integrate Your Own Text Dataset"
+        path: ../../guides/llm/dataset.mdx
+        slug: text-dataset
+      - page: "Retrieval Dataset (Embedding Fine-tuning)"
+        path: ../../guides/llm/retrieval-dataset.mdx
+        slug: retrieval-dataset
+      - page: "Use the ColumnMappedTextInstructionDataset"
+        path: ../../guides/llm/column-mapped-text-instruction-dataset.mdx
+        slug: columnmapped-dataset
+      - page: "Use the ColumnMappedTextInstructionIterableDataset (Streaming)"
+        path: ../../guides/llm/column-mapped-text-instruction-iterable-dataset.mdx
+        slug: columnmapped-iterable
+      - page: "Integrate Your Own Multi-Modal Dataset"
+        path: ../../guides/vlm/dataset.mdx
+        slug: multi-modal-dataset
+      - page: "Diffusion Dataset Preparation"
+        path: ../../guides/diffusion/dataset.mdx
+        slug: diffusion-dataset
+  - section: "Job Launchers"
+    contents:
+      - page: "Job Launchers"
+        path: ../../launcher/overview.mdx
+        slug: overview
+      - page: "Run on Your Local Workstation"
+        path: ../../launcher/local-workstation.mdx
+        slug: local-workstation
+      - page: "Run on a Cluster"
+        path: ../../launcher/slurm.mdx
+        slug: slurm-cluster
+      - page: "Run with NeMo Run"
+        path: ../../launcher/nemo-run.mdx
+        slug: nemo-run
+      - page: "Run on Any Cloud with SkyPilot"
+        path: ../../launcher/skypilot.mdx
+        slug: skypilot
+      - page: "SkyPilot k8s"
+        path: ../../launcher/skypilot-kubernetes.mdx
+        slug: skypilot-k8s
+  - section: "Development"
+    contents:
+      - page: "Checkpointing in NeMo AutoModel"
+        path: ../../guides/checkpointing.mdx
+        slug: checkpointing
+      - page: "Gradient (Activation) Checkpointing in NeMo AutoModel"
+        path: ../../guides/gradient-checkpointing.mdx
+        slug: gradient-checkpointing
+      - page: "Pipeline Parallelism with AutoPipeline"
+        path: ../../guides/pipelining.mdx
+        slug: pipeline-parallelism
+      - page: "FP8 Training in NeMo AutoModel"
+        path: ../../guides/fp8-training.mdx
+        slug: fp8-training
+      - page: "MLflow Logging in NeMo AutoModel"
+        path: ../../guides/mlflow-logging.mdx
+        slug: mlflow-logging
+      - page: "Breaking Changes"
+        path: ../../breaking-changes.mdx
+        slug: breaking-changes
+      - section: "API Reference"
+        slug: api-reference
+        contents:
+          - page: "Overview"
+            path: ../../api-reference/index.mdx
+            slug: overview
+          - folder: ../product-docs/nemo-automodel/Full-Library-Reference
diff --git a/fern/versions/v0.4.yml b/docs/fern/versions/v0.4.yml
similarity index 100%
rename from fern/versions/v0.4.yml
rename to docs/fern/versions/v0.4.yml
diff --git a/fern/versions/v0.4/pages/about/index.mdx b/docs/fern/versions/v0.4/pages/about/index.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/about/index.mdx
rename to docs/fern/versions/v0.4/pages/about/index.mdx
diff --git a/fern/versions/v0.4/pages/about/key-features.mdx b/docs/fern/versions/v0.4/pages/about/key-features.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/about/key-features.mdx
rename to docs/fern/versions/v0.4/pages/about/key-features.mdx
diff --git a/fern/versions/v0.4/pages/announcements.mdx b/docs/fern/versions/v0.4/pages/announcements.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/announcements.mdx
rename to docs/fern/versions/v0.4/pages/announcements.mdx
diff --git a/fern/versions/v0.4/pages/api-reference/index.mdx b/docs/fern/versions/v0.4/pages/api-reference/index.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/api-reference/index.mdx
rename to docs/fern/versions/v0.4/pages/api-reference/index.mdx
diff --git a/fern/versions/nightly/pages/automodel_diagram.png b/docs/fern/versions/v0.4/pages/automodel_diagram.png
similarity index 100%
rename from fern/versions/nightly/pages/automodel_diagram.png
rename to docs/fern/versions/v0.4/pages/automodel_diagram.png
diff --git a/fern/versions/v0.4/pages/breaking-changes.mdx b/docs/fern/versions/v0.4/pages/breaking-changes.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/breaking-changes.mdx
rename to docs/fern/versions/v0.4/pages/breaking-changes.mdx
diff --git a/fern/versions/v0.4/pages/documentation.mdx b/docs/fern/versions/v0.4/pages/documentation.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/documentation.mdx
rename to docs/fern/versions/v0.4/pages/documentation.mdx
diff --git a/fern/versions/v0.4/pages/guides/checkpointing.mdx b/docs/fern/versions/v0.4/pages/guides/checkpointing.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/checkpointing.mdx
rename to docs/fern/versions/v0.4/pages/guides/checkpointing.mdx
diff --git a/fern/versions/nightly/pages/guides/configuration.mdx b/docs/fern/versions/v0.4/pages/guides/configuration.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/configuration.mdx
rename to docs/fern/versions/v0.4/pages/guides/configuration.mdx
diff --git a/fern/versions/nightly/pages/guides/dataset-overview.mdx b/docs/fern/versions/v0.4/pages/guides/dataset-overview.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/dataset-overview.mdx
rename to docs/fern/versions/v0.4/pages/guides/dataset-overview.mdx
diff --git a/fern/versions/nightly/pages/guides/diffusion/dataset.mdx b/docs/fern/versions/v0.4/pages/guides/diffusion/dataset.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/diffusion/dataset.mdx
rename to docs/fern/versions/v0.4/pages/guides/diffusion/dataset.mdx
diff --git a/fern/versions/nightly/pages/guides/diffusion/finetune.mdx b/docs/fern/versions/v0.4/pages/guides/diffusion/finetune.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/diffusion/finetune.mdx
rename to docs/fern/versions/v0.4/pages/guides/diffusion/finetune.mdx
diff --git a/fern/versions/nightly/pages/guides/dllm/finetune.mdx b/docs/fern/versions/v0.4/pages/guides/dllm/finetune.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/dllm/finetune.mdx
rename to docs/fern/versions/v0.4/pages/guides/dllm/finetune.mdx
diff --git a/fern/versions/nightly/pages/guides/fp8-training.mdx b/docs/fern/versions/v0.4/pages/guides/fp8-training.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/fp8-training.mdx
rename to docs/fern/versions/v0.4/pages/guides/fp8-training.mdx
diff --git a/fern/versions/nightly/pages/guides/fp8_convergence.jpg b/docs/fern/versions/v0.4/pages/guides/fp8_convergence.jpg
similarity index 100%
rename from fern/versions/nightly/pages/guides/fp8_convergence.jpg
rename to docs/fern/versions/v0.4/pages/guides/fp8_convergence.jpg
diff --git a/fern/versions/nightly/pages/guides/gradient-checkpointing.mdx b/docs/fern/versions/v0.4/pages/guides/gradient-checkpointing.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/gradient-checkpointing.mdx
rename to docs/fern/versions/v0.4/pages/guides/gradient-checkpointing.mdx
diff --git a/fern/versions/nightly/pages/guides/huggingface-api-compatibility.mdx b/docs/fern/versions/v0.4/pages/guides/huggingface-api-compatibility.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/huggingface-api-compatibility.mdx
rename to docs/fern/versions/v0.4/pages/guides/huggingface-api-compatibility.mdx
diff --git a/fern/versions/nightly/pages/guides/installation.mdx b/docs/fern/versions/v0.4/pages/guides/installation.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/installation.mdx
rename to docs/fern/versions/v0.4/pages/guides/installation.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/column-mapped-text-instruction-dataset.mdx b/docs/fern/versions/v0.4/pages/guides/llm/column-mapped-text-instruction-dataset.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/column-mapped-text-instruction-dataset.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/column-mapped-text-instruction-dataset.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/column-mapped-text-instruction-iterable-dataset.mdx b/docs/fern/versions/v0.4/pages/guides/llm/column-mapped-text-instruction-iterable-dataset.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/column-mapped-text-instruction-iterable-dataset.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/column-mapped-text-instruction-iterable-dataset.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/databricks-gpu-metrics-multi.png b/docs/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-multi.png
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/databricks-gpu-metrics-multi.png
rename to docs/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-multi.png
diff --git a/fern/versions/nightly/pages/guides/llm/databricks-gpu-metrics-single.png b/docs/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-single.png
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/databricks-gpu-metrics-single.png
rename to docs/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-single.png
diff --git a/fern/versions/nightly/pages/guides/llm/databricks.mdx b/docs/fern/versions/v0.4/pages/guides/llm/databricks.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/databricks.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/databricks.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/dataset.mdx b/docs/fern/versions/v0.4/pages/guides/llm/dataset.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/dataset.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/dataset.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/dsv4-flash.mdx b/docs/fern/versions/v0.4/pages/guides/llm/dsv4-flash.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/dsv4-flash.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/dsv4-flash.mdx
diff --git a/fern/versions/v0.4/pages/guides/llm/finetune.mdx b/docs/fern/versions/v0.4/pages/guides/llm/finetune.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/finetune.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/finetune.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/functiongemma-peft-loss.png b/docs/fern/versions/v0.4/pages/guides/llm/functiongemma-peft-loss.png
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/functiongemma-peft-loss.png
rename to docs/fern/versions/v0.4/pages/guides/llm/functiongemma-peft-loss.png
diff --git a/fern/versions/nightly/pages/guides/llm/functiongemma-sft-loss.png b/docs/fern/versions/v0.4/pages/guides/llm/functiongemma-sft-loss.png
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/functiongemma-sft-loss.png
rename to docs/fern/versions/v0.4/pages/guides/llm/functiongemma-sft-loss.png
diff --git a/fern/versions/nightly/pages/guides/llm/gpt2_loss.png b/docs/fern/versions/v0.4/pages/guides/llm/gpt2_loss.png
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/gpt2_loss.png
rename to docs/fern/versions/v0.4/pages/guides/llm/gpt2_loss.png
diff --git a/fern/versions/v0.4/pages/guides/llm/hy3.mdx b/docs/fern/versions/v0.4/pages/guides/llm/hy3.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/hy3.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/hy3.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/knowledge-distillation.mdx b/docs/fern/versions/v0.4/pages/guides/llm/knowledge-distillation.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/knowledge-distillation.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/knowledge-distillation.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/large-moe-finetune.mdx b/docs/fern/versions/v0.4/pages/guides/llm/large-moe-finetune.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/large-moe-finetune.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/large-moe-finetune.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/nanogpt-pretraining.mdx b/docs/fern/versions/v0.4/pages/guides/llm/nanogpt-pretraining.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/nanogpt-pretraining.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/nanogpt-pretraining.mdx
diff --git a/fern/versions/v0.4/pages/guides/llm/pretraining.mdx b/docs/fern/versions/v0.4/pages/guides/llm/pretraining.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/pretraining.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/pretraining.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/retrieval-dataset.mdx b/docs/fern/versions/v0.4/pages/guides/llm/retrieval-dataset.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/retrieval-dataset.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/retrieval-dataset.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/sequence-classification.mdx b/docs/fern/versions/v0.4/pages/guides/llm/sequence-classification.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/sequence-classification.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/sequence-classification.mdx
diff --git a/fern/versions/nightly/pages/guides/llm/toolcalling.mdx b/docs/fern/versions/v0.4/pages/guides/llm/toolcalling.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/toolcalling.mdx
rename to docs/fern/versions/v0.4/pages/guides/llm/toolcalling.mdx
diff --git a/fern/versions/nightly/pages/guides/mlflow-logging.mdx b/docs/fern/versions/v0.4/pages/guides/mlflow-logging.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/mlflow-logging.mdx
rename to docs/fern/versions/v0.4/pages/guides/mlflow-logging.mdx
diff --git a/fern/versions/nightly/pages/guides/omni/gemma3-3n.mdx b/docs/fern/versions/v0.4/pages/guides/omni/gemma3-3n.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/omni/gemma3-3n.mdx
rename to docs/fern/versions/v0.4/pages/guides/omni/gemma3-3n.mdx
diff --git a/fern/versions/nightly/pages/guides/omni/medpix.jpg b/docs/fern/versions/v0.4/pages/guides/omni/medpix.jpg
similarity index 100%
rename from fern/versions/nightly/pages/guides/omni/medpix.jpg
rename to docs/fern/versions/v0.4/pages/guides/omni/medpix.jpg
diff --git a/fern/versions/nightly/pages/guides/omni/medpix_peft.jpg b/docs/fern/versions/v0.4/pages/guides/omni/medpix_peft.jpg
similarity index 100%
rename from fern/versions/nightly/pages/guides/omni/medpix_peft.jpg
rename to docs/fern/versions/v0.4/pages/guides/omni/medpix_peft.jpg
diff --git a/fern/versions/v0.4/pages/guides/overview.mdx b/docs/fern/versions/v0.4/pages/guides/overview.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/overview.mdx
rename to docs/fern/versions/v0.4/pages/guides/overview.mdx
diff --git a/fern/versions/v0.4/pages/guides/pipelining.mdx b/docs/fern/versions/v0.4/pages/guides/pipelining.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/pipelining.mdx
rename to docs/fern/versions/v0.4/pages/guides/pipelining.mdx
diff --git a/fern/versions/nightly/pages/guides/quantization-aware-training.mdx b/docs/fern/versions/v0.4/pages/guides/quantization-aware-training.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/quantization-aware-training.mdx
rename to docs/fern/versions/v0.4/pages/guides/quantization-aware-training.mdx
diff --git a/fern/versions/nightly/pages/guides/vlm/dataset.mdx b/docs/fern/versions/v0.4/pages/guides/vlm/dataset.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/vlm/dataset.mdx
rename to docs/fern/versions/v0.4/pages/guides/vlm/dataset.mdx
diff --git a/fern/versions/nightly/pages/guides/vlm/gemma4.mdx b/docs/fern/versions/v0.4/pages/guides/vlm/gemma4.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/vlm/gemma4.mdx
rename to docs/fern/versions/v0.4/pages/guides/vlm/gemma4.mdx
diff --git a/fern/versions/nightly/pages/guides/vlm/mistral-medium-3-5.mdx b/docs/fern/versions/v0.4/pages/guides/vlm/mistral-medium-3-5.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/vlm/mistral-medium-3-5.mdx
rename to docs/fern/versions/v0.4/pages/guides/vlm/mistral-medium-3-5.mdx
diff --git a/fern/versions/nightly/pages/guides/vlm/mistralm35.png b/docs/fern/versions/v0.4/pages/guides/vlm/mistralm35.png
similarity index 100%
rename from fern/versions/nightly/pages/guides/vlm/mistralm35.png
rename to docs/fern/versions/v0.4/pages/guides/vlm/mistralm35.png
diff --git a/fern/versions/nightly/pages/guides/vlm/nemotron-omni.mdx b/docs/fern/versions/v0.4/pages/guides/vlm/nemotron-omni.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/vlm/nemotron-omni.mdx
rename to docs/fern/versions/v0.4/pages/guides/vlm/nemotron-omni.mdx
diff --git a/fern/versions/nightly/pages/guides/vlm/qwen3-5.mdx b/docs/fern/versions/v0.4/pages/guides/vlm/qwen3-5.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/vlm/qwen3-5.mdx
rename to docs/fern/versions/v0.4/pages/guides/vlm/qwen3-5.mdx
diff --git a/fern/versions/nightly/pages/guides/vlm/qwen3_5.png b/docs/fern/versions/v0.4/pages/guides/vlm/qwen3_5.png
similarity index 100%
rename from fern/versions/nightly/pages/guides/vlm/qwen3_5.png
rename to docs/fern/versions/v0.4/pages/guides/vlm/qwen3_5.png
diff --git a/fern/versions/nightly/pages/guides/vlm/qwen3_5scores.png b/docs/fern/versions/v0.4/pages/guides/vlm/qwen3_5scores.png
similarity index 100%
rename from fern/versions/nightly/pages/guides/vlm/qwen3_5scores.png
rename to docs/fern/versions/v0.4/pages/guides/vlm/qwen3_5scores.png
diff --git a/fern/versions/v0.4/pages/index.mdx b/docs/fern/versions/v0.4/pages/index.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/index.mdx
rename to docs/fern/versions/v0.4/pages/index.mdx
diff --git a/fern/versions/nightly/pages/launcher/local-workstation.mdx b/docs/fern/versions/v0.4/pages/launcher/local-workstation.mdx
similarity index 100%
rename from fern/versions/nightly/pages/launcher/local-workstation.mdx
rename to docs/fern/versions/v0.4/pages/launcher/local-workstation.mdx
diff --git a/fern/versions/v0.4/pages/launcher/nemo-run.mdx b/docs/fern/versions/v0.4/pages/launcher/nemo-run.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/launcher/nemo-run.mdx
rename to docs/fern/versions/v0.4/pages/launcher/nemo-run.mdx
diff --git a/fern/versions/nightly/pages/launcher/overview.mdx b/docs/fern/versions/v0.4/pages/launcher/overview.mdx
similarity index 100%
rename from fern/versions/nightly/pages/launcher/overview.mdx
rename to docs/fern/versions/v0.4/pages/launcher/overview.mdx
diff --git a/fern/versions/nightly/pages/launcher/skypilot-kubernetes.mdx b/docs/fern/versions/v0.4/pages/launcher/skypilot-kubernetes.mdx
similarity index 100%
rename from fern/versions/nightly/pages/launcher/skypilot-kubernetes.mdx
rename to docs/fern/versions/v0.4/pages/launcher/skypilot-kubernetes.mdx
diff --git a/fern/versions/nightly/pages/launcher/skypilot.mdx b/docs/fern/versions/v0.4/pages/launcher/skypilot.mdx
similarity index 100%
rename from fern/versions/nightly/pages/launcher/skypilot.mdx
rename to docs/fern/versions/v0.4/pages/launcher/skypilot.mdx
diff --git a/fern/versions/nightly/pages/launcher/slurm.mdx b/docs/fern/versions/v0.4/pages/launcher/slurm.mdx
similarity index 100%
rename from fern/versions/nightly/pages/launcher/slurm.mdx
rename to docs/fern/versions/v0.4/pages/launcher/slurm.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/diffusion/black-forest-labs/flux.mdx b/docs/fern/versions/v0.4/pages/model-coverage/diffusion/black-forest-labs/flux.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/diffusion/black-forest-labs/flux.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/diffusion/black-forest-labs/flux.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx b/docs/fern/versions/v0.4/pages/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/diffusion/index.mdx b/docs/fern/versions/v0.4/pages/model-coverage/diffusion/index.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/diffusion/index.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/diffusion/index.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/diffusion/qwen/qwen-image.mdx b/docs/fern/versions/v0.4/pages/model-coverage/diffusion/qwen/qwen-image.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/diffusion/qwen/qwen-image.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/diffusion/qwen/qwen-image.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx b/docs/fern/versions/v0.4/pages/model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/latest-models.mdx b/docs/fern/versions/v0.4/pages/model-coverage/latest-models.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/latest-models.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/latest-models.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/allenai/olmo.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/allenai/olmo.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/allenai/olmo.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/allenai/olmo.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/allenai/olmo2.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/allenai/olmo2.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/allenai/olmo2.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/allenai/olmo2.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/allenai/olmoe.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/allenai/olmoe.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/allenai/olmoe.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/allenai/olmoe.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/baai/aquila.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/baai/aquila.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/baai/aquila.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/baai/aquila.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/baichuan-inc/baichuan.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/baichuan-inc/baichuan.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/baichuan-inc/baichuan.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/baichuan-inc/baichuan.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/bigcode/starcoder.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/bigcode/starcoder.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/bigcode/starcoder.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/bigcode/starcoder.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/bigcode/starcoder2.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/bigcode/starcoder2.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/bigcode/starcoder2.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/bigcode/starcoder2.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/bytedance-seed/seed.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/bytedance-seed/seed.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/bytedance-seed/seed.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/bytedance-seed/seed.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/cohere/command-r.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/cohere/command-r.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/cohere/command-r.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/cohere/command-r.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/deepseek-ai/deepseek-v3.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/deepseek-v3.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/deepseek-ai/deepseek-v3.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/deepseek-v3.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/deepseek-ai/deepseek.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/deepseek.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/deepseek-ai/deepseek.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/deepseek.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/deepseek-ai/dsv4-flash.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/dsv4-flash.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/deepseek-ai/dsv4-flash.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/dsv4-flash.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/eleutherai/gpt-j.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/eleutherai/gpt-j.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/eleutherai/gpt-j.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/eleutherai/gpt-j.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/eleutherai/gpt-neox.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/eleutherai/gpt-neox.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/eleutherai/gpt-neox.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/eleutherai/gpt-neox.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/google/gemma.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/google/gemma.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/google/gemma.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/google/gemma.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/ibm/bamba.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/ibm/bamba.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/ibm/bamba.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/ibm/bamba.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/ibm/granite-moe.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/ibm/granite-moe.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/ibm/granite-moe.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/ibm/granite-moe.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/ibm/granite.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/ibm/granite.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/ibm/granite.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/ibm/granite.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/inceptionai/jais.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/inceptionai/jais.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/inceptionai/jais.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/inceptionai/jais.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/index.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/index.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/index.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/index.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/internlm/internlm.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/internlm/internlm.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/internlm/internlm.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/internlm/internlm.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/lgai-exaone/exaone.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/lgai-exaone/exaone.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/lgai-exaone/exaone.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/lgai-exaone/exaone.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/meta/llama.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/meta/llama.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/meta/llama.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/meta/llama.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/microsoft/phi.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/microsoft/phi.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/microsoft/phi3-small.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi3-small.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/microsoft/phi3-small.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi3-small.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/microsoft/phi3.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi3.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/microsoft/phi3.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi3.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/minimax/minimax-m2.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/minimax/minimax-m2.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/minimax/minimax-m2.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/minimax/minimax-m2.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/mistralai/ministral3.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/mistralai/ministral3.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/mistralai/ministral3.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/mistralai/ministral3.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/mistralai/mistral.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/mistralai/mistral.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/mistralai/mistral.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/mistralai/mistral.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/mistralai/mixtral.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/mistralai/mixtral.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/mistralai/mixtral.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/mistralai/mixtral.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/moonshotai/moonlight.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/moonshotai/moonlight.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/moonshotai/moonlight.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/moonshotai/moonlight.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/nvidia/nemotron-flash.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-flash.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/nvidia/nemotron-flash.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-flash.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/nvidia/nemotron-h.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-h.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/nvidia/nemotron-h.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-h.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/nvidia/nemotron-super.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-super.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/nvidia/nemotron-super.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-super.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/nvidia/nemotron.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/nvidia/nemotron.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/openai/gpt-oss.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/openai/gpt-oss.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/openai/gpt-oss.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/openai/gpt-oss.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/openai/gpt2.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/openai/gpt2.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/openai/gpt2.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/openai/gpt2.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/openbmb/minicpm.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/openbmb/minicpm.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/openbmb/minicpm.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/openbmb/minicpm.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/orionstar/orion.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/orionstar/orion.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/orionstar/orion.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/orionstar/orion.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/parasail-ai/gritlm.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/parasail-ai/gritlm.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/parasail-ai/gritlm.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/parasail-ai/gritlm.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/qwen/qwen2-moe.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen2-moe.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/qwen/qwen2-moe.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen2-moe.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/qwen/qwen2.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen2.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/qwen/qwen2.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen2.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/qwen/qwen3-moe.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3-moe.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/qwen/qwen3-moe.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3-moe.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/qwen/qwen3-next.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3-next.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/qwen/qwen3-next.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3-next.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/qwen/qwen3.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/qwen/qwen3.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/stabilityai/stablelm.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/stabilityai/stablelm.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/stabilityai/stablelm.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/stabilityai/stablelm.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/stepfun-ai/step-3-5.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/stepfun-ai/step-3-5.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/stepfun-ai/step-3-5.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/stepfun-ai/step-3-5.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/tencent/hy3.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/tencent/hy3.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/tencent/hy3.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/tencent/hy3.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/thudm/chatglm.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/thudm/chatglm.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/thudm/chatglm.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/thudm/chatglm.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/thudm/glm4-moe.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/thudm/glm4-moe.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/thudm/glm4-moe.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/thudm/glm4-moe.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/thudm/glm4.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/thudm/glm4.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/thudm/glm4.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/thudm/glm4.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/thudm/glm5-moe-dsa.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/thudm/glm5-moe-dsa.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/thudm/glm5-moe-dsa.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/thudm/glm5-moe-dsa.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/tiiuae/falcon.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/tiiuae/falcon.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/tiiuae/falcon.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/tiiuae/falcon.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/llm/upstage/solar.mdx b/docs/fern/versions/v0.4/pages/model-coverage/llm/upstage/solar.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/upstage/solar.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/llm/upstage/solar.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/omni/index.mdx b/docs/fern/versions/v0.4/pages/model-coverage/omni/index.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/omni/index.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/omni/index.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/omni/microsoft/phi4-multimodal.mdx b/docs/fern/versions/v0.4/pages/model-coverage/omni/microsoft/phi4-multimodal.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/omni/microsoft/phi4-multimodal.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/omni/microsoft/phi4-multimodal.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/omni/nvidia/nemotron-omni.mdx b/docs/fern/versions/v0.4/pages/model-coverage/omni/nvidia/nemotron-omni.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/omni/nvidia/nemotron-omni.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/omni/nvidia/nemotron-omni.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/omni/qwen/qwen3-omni.mdx b/docs/fern/versions/v0.4/pages/model-coverage/omni/qwen/qwen3-omni.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/omni/qwen/qwen3-omni.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/omni/qwen/qwen3-omni.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/overview.mdx b/docs/fern/versions/v0.4/pages/model-coverage/overview.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/overview.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/overview.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/troubleshooting.mdx b/docs/fern/versions/v0.4/pages/model-coverage/troubleshooting.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/troubleshooting.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/troubleshooting.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/google/gemma3-vl.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/google/gemma3-vl.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/google/gemma3-vl.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/google/gemma3-vl.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/google/gemma4.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/google/gemma4.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/google/gemma4.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/google/gemma4.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/huggingface/smolvlm.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/huggingface/smolvlm.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/huggingface/smolvlm.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/huggingface/smolvlm.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/index.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/index.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/index.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/index.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/internlm/internvl.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/internlm/internvl.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/internlm/internvl.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/internlm/internvl.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/llava-hf/llava.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/llava-hf/llava.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/llava-hf/llava.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/llava-hf/llava.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/lmms-lab/llava-onevision.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/lmms-lab/llava-onevision.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/lmms-lab/llava-onevision.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/lmms-lab/llava-onevision.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/meta/llama4.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/meta/llama4.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/meta/llama4.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/meta/llama4.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/mistralai/ministral3-vl.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/mistralai/ministral3-vl.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/mistralai/ministral3-vl.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/mistralai/ministral3-vl.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/mistralai/mistral-medium-3-5.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/mistralai/mistral-medium-3-5.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/mistralai/mistral-medium-3-5.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/mistralai/mistral-medium-3-5.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/mistralai/mistral-small-4.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/mistralai/mistral-small-4.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/mistralai/mistral-small-4.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/mistralai/mistral-small-4.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/moonshotai/kimi-vl.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/moonshotai/kimi-vl.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/moonshotai/kimi-vl.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/moonshotai/kimi-vl.mdx
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/nvidia/nemotron-parse.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/nvidia/nemotron-parse.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/nvidia/nemotron-parse.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/nvidia/nemotron-parse.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/qwen/qwen2-5-vl.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen2-5-vl.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/qwen/qwen2-5-vl.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen2-5-vl.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/qwen/qwen3-5-vl.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen3-5-vl.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/qwen/qwen3-5-vl.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen3-5-vl.mdx
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/qwen/qwen3-vl.mdx b/docs/fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen3-vl.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/qwen/qwen3-vl.mdx
rename to docs/fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen3-vl.mdx
diff --git a/fern/versions/nightly/pages/performance-summary.mdx b/docs/fern/versions/v0.4/pages/performance-summary.mdx
similarity index 100%
rename from fern/versions/nightly/pages/performance-summary.mdx
rename to docs/fern/versions/v0.4/pages/performance-summary.mdx
diff --git a/fern/versions/nightly/pages/repository-structure.mdx b/docs/fern/versions/v0.4/pages/repository-structure.mdx
similarity index 100%
rename from fern/versions/nightly/pages/repository-structure.mdx
rename to docs/fern/versions/v0.4/pages/repository-structure.mdx
diff --git a/docs/guides/audio/qwen3-omni-asr.md b/docs/guides/audio/qwen3-omni-asr.mdx
similarity index 98%
rename from docs/guides/audio/qwen3-omni-asr.md
rename to docs/guides/audio/qwen3-omni-asr.mdx
index 196ba7f3ca..0191e7f662 100644
--- a/docs/guides/audio/qwen3-omni-asr.md
+++ b/docs/guides/audio/qwen3-omni-asr.mdx
@@ -1,4 +1,7 @@
-# Fine-Tune Qwen3-Omni for ASR
+---
+title: "Fine-Tune Qwen3-Omni for ASR"
+description: "End-to-end ASR fine-tuning of Qwen3-Omni-30B on Hugging Face audio datasets with NeMo AutoModel."
+---
 
 End-to-end ASR fine-tuning of `Qwen/Qwen3-Omni-30B-A3B-Instruct` on a
 Hugging Face audio dataset, using the NeMo AutoModel VLM training stack. The
diff --git a/docs/guides/checkpointing.md b/docs/guides/checkpointing.md
deleted file mode 100644
index a575925252..0000000000
--- a/docs/guides/checkpointing.md
+++ /dev/null
@@ -1,344 +0,0 @@
-# Checkpointing
-
-## Introduction
-
-During machine-learning experiments, the model-training routine regularly saves checkpoints. A checkpoint is a complete snapshot of a run that includes model weights, optimizer states, and other metadata required to resume training exactly where it left off. Writing these snapshots at regular intervals lets you recover quickly from crashes or pauses without losing progress.
-
-NeMo Automodel checkpoints capture the complete state of a distributed training run across multiple GPUs or nodes. This reduces memory overhead, improves GPU utilization, and allows training to be resumed with a different parallelism strategy.
-
-NeMo Automodel writes checkpoints in two formats: [Hugging Face Safetensors](https://github.com/safetensors/safetensors) and [PyTorch Distributed Checkpointing (DCP)](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html). It also supports two layouts:
-
-- **Consolidated Checkpoints**: The complete model state is saved as a Hugging Face-compatible bundle, typically in a single file or a compact set of files with an index. Because tensors are not split across GPUs (unsharded), tools like Hugging Face, vLLM, and SGLang can load these checkpoints directly.
-
-- **Sharded Checkpoints**: During distributed training with parameter sharing, each GPU holds a subset (or "shard") of the full state, such as model weights and optimizer states. When checkpointing, each GPU writes its own shard independently without reconstructing the full model state.
-
-We provide an overview of the different types of available checkpoint formats in the table below.
-
-Task | Model domain  | DCP (sharded) | Safetensors (sharded) | Safetensors (consolidated) |
------|----------------------|:-----------:|:-------------------:|:------------------------:|
-SFT  | LLM                  | ✅          | ✅                   | ✅                      |
-SFT  | VLM                  | ✅          | ✅                   | ✅                      |
-PEFT | LLM / VLM            | 🚧          | 🚧                   | ✅                      | 
-
-
-Changing between output formats can be done seamlessly through the recipe's `yaml` configuration file:
-```yaml
-checkpoint:
-    ...
-    model_save_format: safetensors # Format for saving (torch_save or safetensors)
-    save_consolidated: true # Change to false if you want to save sharded checkpoints.
-    ...
-```
-> **Note:** For optimal compatibility with the Hugging Face ecosystem, including downstream tools such as vLLM and SGLang, we recommend using the checkpoint configuration provided above.
-
-::: {note}
-The optimizer states are _always_ saved in DCP (`.distcp` extension) format.
-:::
-
-## Checkpoint Symbolic Links
-
-NeMo Automodel automatically creates symbolic links in the checkpoint directory to provide convenient access to important checkpoints:
-
-- **LATEST**: Points to the most recently saved checkpoint. This is useful for resuming training from the last saved state.
-- **LOWEST_VAL**: Points to the checkpoint with the lowest validation score/loss. This provides easy access to the best-performing checkpoint based on validation metrics, making it ideal for model evaluation or deployment.
-
-These symbolic links eliminate the need to manually track checkpoint names or search through directories to find the best model. When validation is enabled in your training run, both links are automatically maintained and updated as training progresses.
-
-## Safetensors
-To ensure seamless integration with the Hugging Face ecosystem, NeMo Automodel saves checkpoints in the [Safetensors](https://github.com/safetensors/safetensors) format. Safetensors is a memory-safe, zero-copy alternative to Python's pickle (PyTorch .bin), natively supported by Hugging Face Transformers, offering both safety and performance advantages over Python pickle-based approaches.
-
-### Key Benefits:
-- **Native Hugging Face Compatibility**: Checkpoints can be loaded directly into Hugging Face-compatible tools, including vLLM, SGLang, and others.
-- **Memory Safety and Speed**: The Safetensors format prohibits saving serialized Python code, ensuring memory safety, and supports zero-copy loading for improved performance.
-- **Optional Consolidation**: Sharded checkpoints can be merged into a standard Hugging Face model format for easier downstream use.
-
-**Most importantly**, this format offers the added advantage of optionally consolidating multiple shards into a complete Hugging Face format model.
-
-### Example
-
-The following command runs the LLM fine-tuning recipe on two GPUs and saves the resulting checkpoint in the Safetensors format:
-```bash
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --step_scheduler.ckpt_every_steps 20 \
-    --checkpoint.model_save_format safetensors \
-    --checkpoint.save_consolidated True
-```
-
-::: {note}
-In the above command we used the [`llama3_2_1b_squad.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/492add84a2b9d495946fe211c28973cd00051f3e/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml) config as a running example, adjust as necessary to your case.
-More config examples can be found in our [`examples/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples) directory.
-:::
-
-If you're running on a single GPU, you can run:
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --step_scheduler.ckpt_every_steps 20 \
-    --checkpoint.model_save_format safetensors \
-    --checkpoint.save_consolidated True
-```
-
-After running for a few seconds, the standard output should be:
-```
-...
-> Saving checkpoint to checkpoints/epoch_0_step_20
-...
-```
-
-The `checkpoints/` should have the following contents:
-```
-checkpoints/
-├── LATEST -> epoch_0_step_20
-├── LOWEST_VAL -> epoch_0_step_20
-└── epoch_0_step_20
-   ├── model
-   │   ├── consolidated
-   │   │   ├── config.json
-   │   │   ├── generation_config.json
-   │   │   ├── model-00001-of-00001.safetensors
-   │   │   ├── model.safetensors.index.json
-   │   │   ├── special_tokens_map.json
-   │   │   ├── tokenizer.json
-   │   │   └── tokenizer_config.json
-   │   ├── shard-00001-model-00001-of-00001.safetensors
-   │   └── shard-00002-model-00001-of-00001.safetensors
-   └── optim
-       ├── __0_0.distcp
-       └── __1_0.distcp
-...
-```
-
-The `epoch_0_step_20/` directory stores the full training state from step `20` of the first epoch, including both the model and optimizer states.
-
-We can load and run the consolidated checkpoint using the Hugging Face Transformers API directly:
-```python
-import torch
-from transformers import pipeline
-
-model_id = "checkpoints/epoch_0_step_20/model/consolidated/"
-pipe = pipeline(
-    "text-generation", 
-    model=model_id, 
-    torch_dtype=torch.bfloat16, 
-    device_map="auto",
-)
-
-print(pipe("The key to life is"))
-
->>> [{'generated_text': 'The key to life is to be happy. The key to happiness is to be kind. The key to kindness is to be'}]
-```
-
-Although this example uses the Hugging Face Transformers API, the `consolidated/` checkpoint is compatible with any Hugging Face-compatible tool, such as vLLM, SGLang, and others.
-
-
-## PEFT
-When training with Parameter-Efficient Fine-Tuning (PEFT) techniques, only a small subset of model weights are updated — the rest of the model remains frozen. This dramatically reduces the size of the checkpoint, often to just a few megabytes.
-
-### Why Consolidated Checkpoints?
-Because the PEFT state is so lightweight, sharded checkpointing adds unnecessary overhead. Instead, NeMo Automodel automatically saves a single, consolidated Hugging Face–compatible checkpoint when using PEFT. This makes it:
-
-- easier to manage and share (just the adapters),
-- compatible with Hugging Face Transformers out of the box,
-- ideal for deployment and downstream evaluation.
-
-### Example: PEFT Fine-Tuning on Two GPUs
-
-To fine-tune a model using PEFT and save a Hugging Face–ready checkpoint:
-```bash
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_hellaswag_peft.yaml --step_scheduler.ckpt_every_steps 20 --checkpoint.model_save_format safetensors
-```
-
-After training, you'll get a compact, consolidated Safetensors checkpoint that can be loaded directly with Hugging Face tools:
-
-```
-checkpoints/
-├── LATEST -> epoch_0_step_20
-├── LOWEST_VAL -> epoch_0_step_20
-├── epoch_0_step_20
-│   ├── config.yaml
-│   ├── dataloader
-│   │   ├── dataloader_dp_rank_0.pt
-│   │   └── dataloader_dp_rank_1.pt
-│   ├── losses.json
-│   ├── model
-│   │   ├── adapter_config.json
-│   │   ├── adapter_model.safetensors
-│   │   ├── automodel_peft_config.json
-│   │   ├── special_tokens_map.json
-│   │   ├── tokenizer.json
-│   │   └── tokenizer_config.json
-│   ├── optim
-│   │   ├── __0_0.distcp
-│   │   └── __1_0.distcp
-│   ├── rng
-│   │   ├── rng_dp_rank_0.pt
-│   │   └── rng_dp_rank_1.pt
-│   └── step_scheduler.pt
-├── training.jsonl
-└── validation.jsonl
-```
-
-The example below showcases the direct compatibility of NeMo Automodel with Hugging Face and PEFT:
-```python
-from peft import AutoPeftModelForCausalLM
-from transformers import AutoTokenizer
-
-checkpoint_path = "checkpoints/epoch_0_step_20/model/"
-model = AutoPeftModelForCausalLM.from_pretrained(checkpoint_path)
-tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
-
-model = model.to("cuda")
-model.eval()
-inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors="pt")
-
-outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=50)
-print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])
-
->>> Preheat the oven to 350 degrees and place the cookie dough in a large bowl. Roll the dough into 1-inch balls and place them on a cookie sheet. Bake the cookies for 10 minutes. While the cookies are baking, melt the chocolate chips in the microwave for 30 seconds.
-```
-
-## PyTorch DCP
-NeMo Automodel also offers native PyTorch DCP checkpointing support (`.distcp` extension). Similar to Safetensors, it also provides the same features of load-time resharding and parallel saving.
-
-As a simple example, we can run the following command to launch the training recipe on two GPUs.
-```bash
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --step_scheduler.ckpt_every_steps 20 \
-    --checkpoint.model_save_format torch_save
-
-...
-> Saving checkpoint to checkpoints/epoch_0_step_20
-...
-```
-After 20 steps, the following checkpoint will be saved:
-
-```
-checkpoints/
-├── LATEST -> epoch_0_step_20
-├── LOWEST_VAL -> epoch_0_step_20
-└── epoch_0_step_20
-   ├── config.yaml
-   ├── dataloader
-   │   ├── dataloader_dp_rank_0.pt
-   │   └── dataloader_dp_rank_1.pt
-   ├── losses.json
-   ├── model
-   │   ├── __0_0.distcp
-   │   └── __1_0.distcp
-   └── optim
-       ├── __0_0.distcp
-       └── __1_0.distcp
-...
-```
-
-If you rerun the script, NeMo Automodel automatically detects and restores the most recent checkpoint.
-```bash
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --step_scheduler.ckpt_every_steps 20 \
-    --checkpoint.model_save_format torch_save
-
-...
-> Loading checkpoint from checkpoints/epoch_0_step_20
-...
-```
-
-## Saving Checkpoints When Using Docker
-
-When training inside a Docker container (see [Installation Guide](installation.md)), any files written to the container's filesystem are lost when the container exits (especially with `--rm`). To keep your checkpoints, you must **bind-mount a host directory** to the checkpoint path before starting the container:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v "$(pwd)"/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:25.11.00
-```
-
-You can also set a custom checkpoint directory via the YAML config or CLI override:
-```yaml
-checkpoint:
-  checkpoint_dir: /mnt/shared/my_checkpoints
-```
-```bash
-# Or via CLI override:
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --checkpoint.checkpoint_dir /mnt/shared/my_checkpoints
-```
-
-When using a custom path, make sure the corresponding host directory is mounted into the container with `-v`.
-
-::: {tip}
-Mount additional host directories for datasets and the Hugging Face model cache to avoid re-downloading large models across container restarts. See the [Installation Guide](installation.md) for a complete `docker run` example with all recommended mounts.
-:::
-
-## Asynchronous Checkpointing
-
-NeMo Automodel can write checkpoints asynchronously to reduce training stalls caused by I/O. When enabled, checkpoint writes are scheduled in the background using PyTorch Distributed Checkpointing's async API while training continues.
-
-- **Enable** (YAML):
-  ```yaml
-  checkpoint:
-    is_async: true
-  ```
-- **Enable** (CLI): add `--checkpoint.is_async True` to your run command.
-- **Requirements**: PyTorch ≥ 2.9.0. If an older version is detected, async mode is automatically disabled.
-- **Behavior**: At most one checkpoint uploads at a time; the next save waits for the previous upload to finish. The `LATEST` symlink is updated after the async save completes (may be deferred until the next save call). During PEFT, adapter model files are written synchronously on rank 0; optimizer states can still use async.
-
-## Advanced Usage: Save Additional States
-You can also save additional states in NeMo Automodel. By default, we also automatically checkpoint the `dataloader`, `rng`, and `step_scheduler` states which are necessary to resume training accurately. In full, a Safetensors consolidated checkpoint will look like this:
-
-```
-checkpoints/
-├── LATEST -> epoch_0_step_20
-├── LOWEST_VAL -> epoch_0_step_20
-├── epoch_0_step_20
-│   ├── config.yaml
-│   ├── dataloader
-│   │   ├── dataloader_dp_rank_0.pt
-│   │   └── dataloader_dp_rank_1.pt
-│   ├── losses.json
-│   ├── model
-│   │   ├── consolidated
-│   │   │   ├── config.json
-│   │   │   ├── generation_config.json
-│   │   │   ├── model-00001-of-00001.safetensors
-│   │   │   ├── model.safetensors.index.json
-│   │   │   ├── special_tokens_map.json
-│   │   │   ├── tokenizer.json
-│   │   │   └── tokenizer_config.json
-│   │   ├── shard-00001-model-00001-of-00001.safetensors
-│   │   └── shard-00002-model-00001-of-00001.safetensors
-│   ├── optim
-│   │   ├── __0_0.distcp
-│   │   └── __1_0.distcp
-│   ├── rng
-│   │   ├── rng_dp_rank_0.pt
-│   │   └── rng_dp_rank_1.pt
-│   └── step_scheduler.pt
-├── training.jsonl
-└── validation.jsonl
-```
-
-If you want to define a new state to be checkpointed in the recipe, the easiest way is to create a new attribute in the recipe class (defined using `self.` inside the recipe). Just make sure that the new attribute uses both the `load_state_dict` and `state_dict` methods.
-
-Here is an example of what it might look like:
-
-```python
-
-class NewState:
-
-    def __init__(self, ...):
-        self.state_value = ...
-        self.another_value = ...
-        ...
-    
-    def state_dict(self) -> dict[str, Any]:
-        return {
-            "<some state you're tracking>": self.state_value,
-            "<another state you're tracking>": self.another_value,
-        }
-    
-    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
-        self.state_value = state_dict["<some state you're tracking>"]
-        self.another_value = state_dict["<another state you're tracking>"]
-```
-
-Inside your recipe class, define the new state as an instance attribute using `self.new_state = NewState(...)`.
diff --git a/fern/versions/nightly/pages/guides/checkpointing.mdx b/docs/guides/checkpointing.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/checkpointing.mdx
rename to docs/guides/checkpointing.mdx
diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md
deleted file mode 100644
index 66ffa67fad..0000000000
--- a/docs/guides/configuration.md
+++ /dev/null
@@ -1,123 +0,0 @@
-# YAML Configuration
-
-NeMo AutoModel recipes are configured with YAML. Under the hood, YAML is parsed into a `ConfigNode` which:
-
-- Translates common scalar strings into typed Python values (e.g., `"10"` → `10`).
-- Resolves `_target_` (and `*_fn`) into Python callables/classes.
-- Supports environment variable interpolation inside YAML strings.
-- Tries to make config printing safe by preserving original placeholders (to avoid leaking secrets).
-
-
-## Load Model and Dataset Configs
-
-Most recipes load the YAML using `nemo_automodel.components.config.loader.load_yaml_config()`, which returns a `ConfigNode`.
-
-Within a `ConfigNode`:
-
-- Nested dicts become nested `ConfigNode` objects.
-- Lists are recursively wrapped.
-- Scalars are translated with `translate_value()` when they are YAML strings.
-
-### Typed Scalar Translation (`translate_value`)
-
-Only **strings** are translated. Examples:
-
-- `"123"` → `123`
-- `"3.14"` → `3.14`
-- `"true"` / `"false"` → `True` / `False`
-- `"None"` / `"none"` → `None`
-
-YAML-native types (like `step_size: 10` without quotes) are already typed by the YAML parser and remain unchanged.
-
-
-## Use `_target_` for Instantiation
-
-Any mapping containing a `_target_` key can be instantiated using `ConfigNode.instantiate()`:
-
-```yaml
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-```
-
-There is also support for resolving callables from:
-
-- **Dotted paths**: `pkg.module.symbol`
-- **Local file paths**: `/abs/path/to/file.py:symbol`
-
-### Safety and Policy
-
-By default, resolving targets is restricted:
-
-- Imports are allowed from common safe prefixes (e.g. `nemo_automodel`, `torch`, `transformers`, …).
-- Accessing private or dunder attributes is blocked by default.
-- Loading out-of-tree user code can be enabled with `NEMO_ENABLE_USER_MODULES=1` or by calling `set_enable_user_modules(True)`.
-
-## Distributed Section (Strategy-Based)
-
-The `distributed:` section is **not** instantiated using `_target_`. Recipes parse it with a fixed schema: use `strategy: fsdp2`, `strategy: ddp`, or `strategy: megatron_fsdp`, plus optional parallelism sizes (`dp_size`, `tp_size`, `pp_size`, etc.) and strategy-specific options. When pipeline parallelism is enabled (`pp_size > 1`), add a `pipeline:` subsection with options such as `pp_schedule`, `pp_microbatch_size`, and `layers_per_stage`. See the [Pipelining](pipelining.md) guide and recipe example configs for full examples.
-
-
-## Interpolate Environment Variables in YAML
-
-NeMo AutoModel supports env var interpolation inside YAML **string values**.
-
-### Supported Forms
-
-- **Braced**:
-  - `${VAR}`
-  - `${VAR,default}`
-  - `${var.dot.var}` (dots are treated as part of the env var name)
-- **Dollar**:
-  - `$VAR`
-  - `$var.dot.var`
-- **Back-compat**:
-  - `${oc.env:VAR}`
-  - `${oc.env:VAR,default}`
-
-### Interpolation Behavior
-
-- Interpolation happens when values are wrapped into a `ConfigNode`.
-- If a referenced env var is **missing** and **no default** is provided, config loading raises a `KeyError`.
-- Defaults are supported only in the braced forms; the first comma separates the variable name from the default: `${VAR,default_value}`.
-
-### Example (Databricks Delta)
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset.ColumnMappedTextInstructionIterableDataset
-  path_or_dataset_id: delta://catalog.schema.training_data
-  delta_storage_options:
-    DATABRICKS_HOST: ${DATABRICKS_HOST}
-    DATABRICKS_TOKEN: ${DATABRICKS_TOKEN}
-    DATABRICKS_HTTP_PATH: ${DATABRICKS_HTTP_PATH}
-```
-
-
-## Prevent Secret Leakage in Logs
-
-When an env var placeholder is resolved, the config keeps the original placeholder in an internal `._orig_value` field for **safe printing**:
-
-- `str(cfg)` / `repr(cfg)` prints placeholders (e.g. `${DATABRICKS_TOKEN}`), not resolved secrets.
-- `cfg.to_yaml_dict(use_orig_values=True, redact_sensitive=True)` is the recommended way to produce a loggable YAML dict.
-
-:::{important}
-Printing a **leaf value** (for example, `print(cfg.dataset.delta_storage_options.DATABRICKS_TOKEN)`) outputs the resolved secret. Instead, print the full config or use a redacted YAML dict.
-:::
-
-
-## Configure Slurm
-
-SLURM jobs are submitted with `sbatch` directly — no YAML section needed.
-Copy the reference script, edit `CONFIG` and cluster settings, then submit:
-
-```bash
-cp slurm.sub my_cluster.sub
-vim my_cluster.sub
-sbatch my_cluster.sub
-```
-
-All cluster-specific configuration (SBATCH directives, container image, mounts,
-secrets, environment variables) lives in your sbatch script.  See
-[Run on a Cluster](../launcher/slurm.md) for full examples.
-
diff --git a/fern/versions/v0.4/pages/guides/configuration.mdx b/docs/guides/configuration.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/configuration.mdx
rename to docs/guides/configuration.mdx
diff --git a/docs/guides/dataset-overview.md b/docs/guides/dataset-overview.md
deleted file mode 100644
index 00dfed81a6..0000000000
--- a/docs/guides/dataset-overview.md
+++ /dev/null
@@ -1,613 +0,0 @@
-# Dataset Overview: LLM, VLM, and Retrieval Datasets
-
-This page summarizes the datasets supported in NeMo AutoModel for LLM, VLM, and retrieval training and shows how to plug in your own datasets using Python functions or the YAML `_target_` mechanism.
-
-- See also: [LLM datasets](llm/dataset.md), [VLM datasets](vlm/dataset.md), and [Retrieval dataset](llm/retrieval-dataset.md) for deeper, task-specific guides.
-
-- If a dataset you need is missing, please open a [GitHub issue](https://github.com/NVIDIA-NeMo/Automodel/issues) with a short description and example schema so we can prioritize support.
----
-
-## LLM Datasets
-
-NeMo AutoModel supports several common patterns for language modeling and instruction tuning.
-### HellaSwag (Completion SFT)
-- Wrapper: `nemo_automodel.components.datasets.llm.hellaswag.HellaSwag`
-- Use case: single-turn completion-style SFT where a prompt (ctx) is followed by a gold continuation (ending)
-- Key args: `path_or_dataset`, `split`, `num_samples_limit`
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
-  path_or_dataset: rowan/hellaswag
-  split: train
-```
-
-### SQuAD-Style Question Answering (QA) (Instruction SFT)
-- Factory: `nemo_automodel.components.datasets.llm.squad.make_squad_dataset`
-- Use case: instruction/QA tuning with either prompt-and-answer formatting or chat-template formatting
-:::{note}
-- If the tokenizer has a chat template and you want answer-only loss, you must provide `start_of_turn_token`.
-- Optional `seq_length` can be used for padding/truncation.
-:::
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  split: train
-  dataset_name: rajpurkar/squad
-  start_of_turn_token: "<|assistant|>"
-```
-
-- **ColumnMappedTextInstructionDataset (generic instruction SFT)**
-  - Class: `nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset`
-  - Use case: quickly adapt instruction datasets by mapping your schema's columns to `context`, `question`, `answer`
-  - Sources: local JSON/JSONL or Hugging Face Hub dataset ID
-  - Notes:
-    - For tokenizers with chat templates and answer-only loss, you may set `answer_only_loss_mask: true` and provide `start_of_turn_token`.
-    - A streaming variant is available for large datasets (see the [Streaming Datasets](#streaming-datasets) section below).
-    - Map-style, non-streaming dataset (supports `len(ds)` and `ds[i]`)
-    - For streaming (including Delta Lake / Databricks), use `ColumnMappedTextInstructionIterableDataset`
-  - Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset
-  path_or_dataset_id: Muennighoff/natural-instructions
-  split: train
-  column_mapping:
-    context: definition
-    question: inputs
-    answer: targets
-  answer_only_loss_mask: true
-  start_of_turn_token: "<|assistant|>"
-```
-See the detailed guide, [Column-Mapped Text Instruction Dataset](llm/column-mapped-text-instruction-dataset.md), for more information.
-
-- **ChatDataset (multi-turn conversations and tool calling)**
-  - Class: `nemo_automodel.components.datasets.llm.ChatDataset`
-  - Use case: multi-turn conversations and tool calling in OpenAI chat format
-  - Sources: local JSON/JSONL or Hugging Face Hub dataset ID
-  - Key args:
-    - `path_or_dataset_id`: path to local file(s) or Hugging Face dataset ID
-    - `tokenizer`: tokenizer instance (required; must have chat template support)
-    - `split`: dataset split (e.g., "train", "validation")
-    - `name`: dataset configuration/subset name
-    - `seq_length`: maximum sequence length for padding/truncation
-    - `padding`: padding strategy ("do_not_pad", "max_length", etc.)
-    - `truncation`: truncation strategy ("do_not_truncate", "longest_first", etc.)
-    - `start_of_turn_token`: token marking assistant response start (for answer-only loss)
-    - `chat_template`: optional override for tokenizer's chat template
-    - `skip_invalid_samples`: if `true`, skip malformed JSONL lines when reading local files (a warning logs the number of skipped lines); the default, `false`, fails fast on a bad line
-  - Notes:
-    - Requires a tokenizer with chat template support
-    - Supports both single-turn and multi-turn tool calling
-    - Tool definitions are provided in a `tools` field at the conversation level
-    - Tool calls appear in assistant messages via `tool_calls` field
-    - Tool responses use the `tool` role
-### ChatDataset (Multi-Turn Conversations and Tool Calling)
-- Class: `nemo_automodel.components.datasets.llm.ChatDataset`
-- Use case: multi-turn conversations and tool calling in OpenAI chat format
-- Sources: local JSON/JSONL or Hugging Face Hub dataset ID
-- Key args:
-  - `path_or_dataset_id`: path to local file(s) or Hugging Face dataset ID
-  - `tokenizer`: tokenizer instance (required; must have chat template support)
-  - `split`: dataset split (e.g., "train", "validation")
-  - `name`: dataset configuration/subset name
-  - `seq_length`: maximum sequence length for padding/truncation
-  - `padding`: padding strategy ("do_not_pad", "max_length", etc.)
-  - `truncation`: truncation strategy ("do_not_truncate", "longest_first", etc.)
-  - `start_of_turn_token`: token marking assistant response start (for answer-only loss)
-  - `chat_template`: optional override for tokenizer's chat template
-  - `mask_reasoning_content`: optionally exclude rendered `reasoning_content` tokens from loss
-  - `skip_invalid_samples`: if `true`, skip malformed JSONL lines when reading local files (a warning logs the number of skipped lines); the default, `false`, fails fast on a bad line
-:::{note}
-- Requires a tokenizer with chat template support
-- Supports both single-turn and multi-turn tool calling
-- Assistant messages may also include `reasoning_content` for structured reasoning traces
-- Tool definitions are provided in a `tools` field at the conversation level
-- Tool calls appear in assistant messages through the `tool_calls` field
-- Tool responses use the `tool` role and must include `tool_call_id`
-- If your dataset contains `reasoning_content`, your chat template must render it explicitly or it will be dropped
-- For multi-turn tool-calling datasets, prefer chat templates that use `{% generation %}` blocks so assistant-turn loss masking is exact
-- Set `mask_reasoning_content: true` if you want to train on the final assistant answer while excluding rendered reasoning traces from loss
-- Set `skip_invalid_samples: true` for noisy local JSONL so lines that are not valid JSON are skipped instead of failing the load
-:::
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.ChatDataset
-  path_or_dataset_id: Salesforce/xlam-function-calling-60k
-  split: train
-  tokenizer:
-    _target_: transformers.AutoTokenizer.from_pretrained
-    pretrained_model_name_or_path: google/functiongemma-270m-it
-  seq_length: 2048
-  start_of_turn_token: "<start_of_turn>"
-  mask_reasoning_content: false
-  skip_invalid_samples: false
-```
-  - Expected data format (OpenAI messages format):
-```json
-{
-  "messages": [
-    {
-      "role": "system",
-      "content": "You are a helpful assistant."
-    },
-    {
-      "role": "user",
-      "content": "What's the weather in Seattle and should I bring an umbrella?"
-    },
-    {
-      "role": "assistant",
-      "reasoning_content": "The user wants weather info and advice. I should call get_weather first, then decide whether an umbrella is needed.",
-      "content": "",
-      "tool_calls": [
-        {
-          "id": "call_1",
-          "type": "function",
-          "function": {
-            "name": "get_weather",
-            "arguments": "{\"city\": \"Seattle\"}"
-          }
-        }
-      ]
-    },
-    {
-      "role": "tool",
-      "tool_call_id": "call_1",
-      "content": "{\"temperature\": 55, \"condition\": \"rain\", \"precipitation_chance\": 0.85}"
-    },
-    {
-      "role": "assistant",
-      "reasoning_content": "It is raining with a high precipitation chance, so I should recommend bringing an umbrella.",
-      "content": "It's currently 55 degrees F and raining in Seattle with an 85% chance of continued precipitation. Yes, definitely bring an umbrella."
-    }
-  ],
-  "tools": [
-    {
-      "type": "function",
-      "function": {
-        "name": "get_weather",
-        "description": "Get current weather for a city",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "city": {"type": "string"}
-          },
-          "required": ["city"]
-        }
-      }
-    }
-  ]
-}
-```
-  - Template requirement example for `reasoning_content`:
-```jinja
-{%- if message.reasoning_content %}
-{% generation %}
-{{ "<think>\n" + message.reasoning_content + "\n</think>\n" }}
-{% endgeneration %}
-{%- endif %}
-{% generation %}
-{{ message.content }}
-{% endgeneration %}
-```
-  - For single-turn tool calling (one tool call per conversation), omit the tool response and final assistant message:
-```json
-{
-  "messages": [
-    {
-      "role": "user",
-      "content": "Book a table for two at 7pm in Seattle."
-    },
-    {
-      "role": "assistant",
-      "content": "",
-      "tool_calls": [
-        {
-          "id": "call_1",
-          "type": "function",
-          "function": {
-            "name": "book_table",
-            "arguments": "{\"party_size\": 2, \"time\": \"19:00\", \"city\": \"Seattle\"}"
-          }
-        }
-      ]
-    }
-  ],
-  "tools": [
-    {
-      "type": "function",
-      "function": {
-        "name": "book_table",
-        "description": "Book a restaurant table",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "party_size": {"type": "integer"},
-            "time": {"type": "string"},
-            "city": {"type": "string"}
-          }
-        }
-      }
-    }
-  ]
-}
-```
-See the [Function Calling guide](llm/toolcalling.md) for an end-to-end example with FunctionGemma.
-For a small reasoning-style chat SFT starting point, see [qwen2_5_0p5b_instruct_fineproofs_chat.yaml](../../examples/llm_finetune/qwen/qwen2_5_0p5b_instruct_fineproofs_chat.yaml).
-
-### Retrieval (Embedding Fine-Tuning)
-- Factory: `nemo_automodel.components.datasets.llm.make_retrieval_dataset`
-- Collator: `nemo_automodel.components.datasets.llm.BiEncoderCollator`
-- Use case: embedding model fine-tuning with (query, positive doc, negative docs) contrastive learning
-- Supported schemas:
-  - Corpus-ID JSON (Merlin/NeMo-retriever style)
-  - Inline-text JSONL (e.g., `{"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]}`)
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset
-  data_dir_list: /abs/path/to/train.jsonl
-  data_type: train
-  n_passages: 5
-collate_fn:
-  _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator
-  q_max_len: 512
-  p_max_len: 512
-```
-See the detailed guide, [Retrieval dataset](llm/retrieval-dataset.md), for more information.
-
-### NanoGPT Binary Shards (Pretraining)
-- Class: `nemo_automodel.components.datasets.llm.nanogpt_dataset.NanogptDataset`
-- Use case: token-level LM pretraining over `.bin` shards produced by NanoGPT-style preprocessors (supports legacy and current formats)
-:::{note}
-- Streams contiguous `seq_len` slices, supports optional BOS alignment and `.bos.idx` sidecar files
-- Related tool: `tools/nanogpt_data_processor.py`
-:::
-
-### Megatron (Pretraining; Interoperable With Pre-Tokenized Megatron Data)
-- Class: `nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining`
-- Use case: large-scale LM pretraining over Megatron-LM formatted tokenized corpora
-- Interoperability: If your corpus has already been tokenized/indexed for Megatron (i.e., `.bin`/`.idx` pairs), you can point AutoModel to those assets directly. No re-tokenization required.
-- Key args: `paths` (single path, glob, weighted list, or per-split dict), `seq_length`, `tokenizer`, `split`, `index_mapping_dir`, `splits_to_build`
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining
-  paths: /abs/path/to/processed_data_*_text_document*  # glob or explicit list
-  index_mapping_dir: /abs/path/to/mapping_dir
-  tokenizer:
-    _target_: transformers.AutoTokenizer.from_pretrained
-    pretrained_model_name_or_path: openai-community/gpt2
-  seq_length: 1024
-  split: "0.99, 0.01, 0.00"  # train, validation, test
-  splits_to_build: "train"
-```
-See the detailed [pretraining guide](llm/pretraining.md), which uses MegatronPretraining data.
-
-## Streaming Datasets
-
-Streaming datasets enable processing very large datasets without loading them entirely into memory. This is particularly useful when working with datasets that exceed available RAM or when you want to start training immediately without waiting for the full dataset to download.
-
-### What Are Streaming Datasets?
-
-Streaming datasets load and process data incrementally, one batch at a time, rather than loading the entire dataset into memory upfront. This approach:
-
-- **Reduces memory footprint**: Only the current batch resides in memory
-- **Enables training on massive datasets**: Process terabyte-scale datasets on machines with limited RAM
-- **Faster startup**: Begin training immediately without waiting for full dataset download
-- **Efficient for remote datasets**: Stream directly from Hugging Face Hub without local storage
-
-### When to Use Streaming
-
-Use streaming mode when:
-
-- Your dataset is very large (hundreds of GB or TB)
-- Available memory is limited compared to dataset size
-- You want to start training quickly without downloading the full dataset
-- You're experimenting with a subset of a large dataset
-
-Avoid streaming when:
-
-- Your dataset is small enough to fit comfortably in memory
-- You need random access to samples (e.g., for certain sampling strategies)
-- You need to know the exact dataset length upfront
-- Training requires multiple passes with different orderings
-
-### How to Enable Streaming
-
-For `ColumnMappedTextInstructionDataset`, use the streaming variant by changing the class to `ColumnMappedTextInstructionIterableDataset`:
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset.ColumnMappedTextInstructionIterableDataset
-  path_or_dataset_id: Muennighoff/natural-instructions
-  split: train
-  column_mapping:
-    context: definition
-    question: inputs
-    answer: targets
-  answer_only_loss_mask: true
-  start_of_turn_token: "<|assistant|>"
-```
-
-For Hugging Face datasets loaded directly, set `streaming=True`:
-
-```python
-from datasets import load_dataset
-
-# Non-streaming (loads entire dataset into memory)
-dataset = load_dataset("large-dataset/corpus", split="train", streaming=False)
-
-# Streaming (loads data incrementally)
-dataset = load_dataset("large-dataset/corpus", split="train", streaming=True)
-```
-
-### Streaming Limitations
-
-When using streaming datasets, be aware of these limitations:
-
-1. **No random access**: You cannot use `dataset[index]` to access specific samples. Streaming datasets only support iteration.
-
-2. **No length information**: The `len(dataset)` operation is not available. You cannot determine the total number of samples upfront.
-
-3. **Single-pass iteration**: Each iteration consumes the stream. To iterate multiple times, you need to recreate the dataset or use the `repeat_on_exhaustion` parameter.
-
-4. **Limited shuffling**: Shuffling is done with a buffer (not the entire dataset), which may not provide perfect randomization.
-
-### Distributed Training with Streaming
-
-Streaming datasets support distributed training through sharding:
-
-```python
-from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset import (
-    ColumnMappedTextInstructionIterableDataset
-)
-
-dataset = ColumnMappedTextInstructionIterableDataset(
-    path_or_dataset_id="large-dataset/corpus",
-    column_mapping={"question": "input", "answer": "output"},
-    tokenizer=tokenizer,
-)
-
-# Shard the dataset across workers
-dataset = dataset.shard(num_shards=8, index=worker_id)
-
-# Enable shuffling with a buffer
-dataset = dataset.shuffle(buffer_size=10000, seed=42)
-
-# Set epoch for deterministic shuffling across epochs
-dataset.set_epoch(epoch_num)
-```
-
-### Performance Considerations
-
-**Memory vs. Speed Trade-offs**:
-- Streaming reduces memory usage but may be slower than in-memory datasets
-- Network latency can impact streaming performance for remote datasets
-- Use local caching when repeatedly accessing the same remote dataset
-
-**Buffer Size for Shuffling**:
-- Larger buffers provide better randomization but use more memory
-- A buffer size of 10,000-100,000 samples is typically a good balance
-- For perfect shuffling, you need a buffer size equal to the dataset size (defeating the purpose of streaming)
-
-**Prefetching**:
-- Most streaming implementations prefetch data in the background
-- This helps hide network latency and keeps GPUs busy
-- Adjust prefetch settings based on your network speed and batch size
-
-### Example: Streaming a Large Dataset
-
-Here's a complete example of using streaming for a large instruction-tuning dataset:
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset.ColumnMappedTextInstructionIterableDataset
-  path_or_dataset_id: HuggingFaceH4/ultrachat_200k
-  split: train_sft
-  column_mapping:
-    question: prompt
-    answer: completion
-  answer_only_loss_mask: true
-  start_of_turn_token: "<|assistant|>"
-  repeat_on_exhaustion: true  # Automatically restart when stream ends
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  batch_size: 4
-  num_workers: 4
-```
-
-This configuration:
-- Streams the dataset without loading it fully into memory
-- Automatically repeats when the stream is exhausted
-- Uses multiple workers for efficient data loading
-- Applies answer-only loss masking during tokenization
-
-## Packed Sequence Support
-To reduce padding and improve throughput with variable-length sequences:
-```yaml
-packed_sequence:
-  packed_sequence_size: 8192   # > 0 enables packing
-  split_across_pack: false
-```
-Use a collator that pads to an FP8-friendly multiple when training with FP8:
-```yaml
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.utils.default_collater
-    pad_seq_len_divisible: 16
-```
-
----
-
-## VLM Datasets (Vision/Audio + Language)
-VLM datasets are represented as conversations (message lists) that combine text with images or audio and are processed with the model's `AutoProcessor.apply_chat_template` and a suitable collate function.
-
-Built-in dataset makers (return lists of `conversation` dicts):
-- **RDR items**: `nemo_automodel.components.datasets.vlm.datasets.make_rdr_dataset` (HF: `quintend/rdr-items`)
-- **CORD-V2 receipts (Consolidated Receipt Dataset for Post-OCR Parsing)**: `nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset` (HF: `naver-clova-ix/cord-v2`)
-- **MedPix-VQA (Medical Pixel Question Answering)**: `nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset`
-- **CommonVoice 17 (CV17) (audio)**: `nemo_automodel.components.datasets.vlm.datasets.make_cv17_dataset`
-
-
-Each example follows the conversation schema expected by `apply_chat_template`, e.g.:
-```python
-{
-  "conversation": [
-    {
-      "role": "user",
-      "content": [
-        {"type": "image", "image": example_image},
-        {"type": "text",  "text":  "Describe this image."}
-      ]
-    },
-    {
-      "role": "assistant",
-      "content": [{"type": "text", "text": ground_truth_text}]
-    }
-  ]
-}
-```
-
-### Custom Chat Template
-By default, VLM fine-tuning uses the chat template built into the model's `AutoProcessor`. To override it, add `chat_template` under `dataset:` in your YAML config:
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
-  split: train
-  chat_template: "{% for msg in messages %}{{ msg.role }}: {{ msg.content }}\n{% endfor %}"
-```
-
-`chat_template` accepts a Jinja template string, a path to a `.jinja` file, or a path to a JSON file containing a `chat_template` key. The override is applied to both the processor and its tokenizer before dataset instantiation.
-
-### Collate Functions
-- `nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn`
-- `nemo_automodel.components.datasets.vlm.collate_fns.qwen2_5_collate_fn` (Qwen2.5 VL)
-- `nemo_automodel.components.datasets.vlm.collate_fns.phi4_mm_collate_fn` (audio)
-
-Select in your YAML:
-```yaml
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  batch_size: 1
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.vlm.collate_fns.qwen2_5_collate_fn
-```
-If you want answer-only loss masking, provide a model-appropriate `start_of_response_token` to the collate function.
-
-See [Gemma-3n](omni/gemma3-3n.md) and [VLM dataset](vlm/dataset.md) for end-to-end examples.
-
----
-
-## Diffusion Datasets
-
-Diffusion models don't train directly on raw images or videos. Instead, the data is first encoded into a compact numerical representation called a latent — this is what the model actually learns from. Text captions are similarly converted into text embeddings that the model uses as conditioning.
-
-This encoding is done once during preprocessing, and the results are saved as cache files (`.meta`). Training then reads these cache files directly, which is significantly faster than re-encoding on every step.
-
-The built-in preprocessing tool ([`tools/diffusion/preprocessing_multiprocess.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/diffusion/preprocessing_multiprocess.py)) handles this conversion. It uses a VAE (Variational Autoencoder) to encode visual data and a text encoder for captions, grouping outputs into resolution-bucketed directories compatible with the multiresolution dataloader.
-
-### Dataloader Builders
-
-- **Video (T2V)**: `nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader` — for Wan 2.1 and HunyuanVideo
-- **Image (T2I)**: `nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader` — for FLUX.1-dev
-
-### Example YAML (Video Dataloader)
-
-```yaml
-data:
-  dataloader:
-    _target_: nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader
-    cache_dir: /path/to/processed_meta
-    model_type: wan
-    base_resolution: [512, 512]
-    dynamic_batch_size: false
-    shuffle: true
-    drop_last: false
-    num_workers: 0
-```
-
-See the [Diffusion Dataset Preparation](diffusion/dataset.md) guide for full preprocessing instructions and configuration details.
-
----
-
-## Bring Your Own Dataset
-You can integrate custom datasets with zero code changes to NeMo AutoModel by using `_target_` in YAML. There are several approaches:
-
-### Point to an Existing Class or Function (Dotted Path)
-- LLM example (class):
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
-  path_or_dataset: rowan/hellaswag
-  split: train
-```
-- LLM example (factory function):
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  split: train
-  dataset_name: rajpurkar/squad
-```
-- VLM example (factory function):
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
-  split: train
-```
-
-### Point to a Local Python File and Function
-```yaml
-dataset:
-  _target_: /abs/path/to/my_custom_dataset.py:build_my_dataset
-  some_arg: 123
-  split: train
-```
-Where `build_my_dataset` returns either a `datasets.Dataset` or a list/iterator of conversation dicts (for VLM).
-
-### Use ColumnMappedTextInstructionDataset for Most Instruction Datasets (LLM)
-- Ideal when your data has columns like `instruction`, `input`, or `output` but with arbitrary names
-- Supports local JSON/JSONL and HF Hub
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset
-  path_or_dataset_id: /abs/path/to/*.jsonl  # or org/repo on HF
-  column_mapping:
-    context: definition
-    question: inputs
-    answer: targets
-  answer_only_loss_mask: true
-  start_of_turn_token: "<|assistant|>"
-```
-
-### Implement a Minimal Custom Class Pattern (LLM Completion)
-If you prefer Python, implement `get_context` and `get_target` and reuse the built-in preprocessor:
-```python
-from datasets import load_dataset
-from nemo_automodel.components.datasets.utils import SFTSingleTurnPreprocessor
-
-class MyCompletionDataset:
-    def __init__(self, path_or_dataset, tokenizer, split="train"):
-        raw_ds = load_dataset(path_or_dataset, split=split)
-        self.dataset = SFTSingleTurnPreprocessor(tokenizer).process(raw_ds, self)
-
-    def get_context(self, examples):
-        return examples["my_context_field"]
-
-    def get_target(self, examples):
-        return examples["my_target_field"]
-```
-Then reference your class with `_target_` in YAML.
-
-### Important Considerations
-- **Chat templates**: If your tokenizer has a chat template and you want answer-only loss, provide the correct `start_of_turn_token` (LLM) or `start_of_response_token` (VLM collate functions).
-- **Padding for FP8**: If training with FP8, set `pad_seq_len_divisible: 16` in your collate function to align sequence lengths.
-- **Packed sequences**: Prefer packed sequences for throughput when fine-tuning LLMs on variable-length corpora.
-- **Validation**: You can define a separate `validation_dataset` and `validation_dataloader` block mirroring your training config.
-
-For detailed, end-to-end recipes, browse the example configs under [examples/llm_finetune/](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune), [examples/llm_pretrain/](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_pretrain), and [examples/vlm_finetune/](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/vlm_finetune).
diff --git a/fern/versions/v0.4/pages/guides/dataset-overview.mdx b/docs/guides/dataset-overview.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/dataset-overview.mdx
rename to docs/guides/dataset-overview.mdx
diff --git a/docs/guides/diffusion/dataset.md b/docs/guides/diffusion/dataset.md
deleted file mode 100644
index 22bcd9c908..0000000000
--- a/docs/guides/diffusion/dataset.md
+++ /dev/null
@@ -1,207 +0,0 @@
-(diffusion-dataset)=
-
-# Diffusion Dataset Preparation
-
-## Introduction
-
-Diffusion model training in NeMo AutoModel requires pre-encoded `.meta` files rather than raw images or videos. During preprocessing, a VAE encodes visual data into latent representations and a text encoder produces text embeddings. These are saved as `.meta` files so that training operates entirely in latent space, avoiding the need to load heavy encoder models during training.
-
-## Input Data Format
-
-### Images
-
-Place your images in a directory. Supported formats: `jpg`, `jpeg`, `png`, `webp`, `bmp`.
-
-Captions can be provided in several formats:
-- **Sidecar JSON** (default for images): A `.json` file alongside each image with a caption field
-- **JSONL**: A `.jsonl` file with `internvl` or `usr` caption fields
-
-### Videos
-
-Place your videos in a directory. Supported formats: `mp4`, `avi`, `mov`, `mkv`, `webm`.
-
-Captions can be provided in several formats:
-- **Sidecar JSON** (`--caption_format sidecar`): A `.json` file alongside each video
-- **meta.json** (`--caption_format meta_json`): A single `meta.json` manifest in the video directory
-- **JSONL** (`--caption_format jsonl`): A `.jsonl` file with captions
-
-If no caption is found for a sample, the filename (with underscores replaced by spaces) is used as a fallback.
-
-## Preprocessing
-
-NeMo AutoModel includes a unified preprocessing tool at [`tools/diffusion/preprocessing_multiprocess.py`](../../../tools/diffusion/preprocessing_multiprocess.py) that encodes raw images and videos into cache files compatible with the multiresolution dataloader. It uses model-specific processors from `tools/diffusion/processors/` to handle VAE encoding, text embedding, and cache data formatting for each supported model.
-
-The tool automatically distributes work across all available GPUs using multiprocessing, with one worker per GPU.
-
-### Available Processors
-
-| Processor | Media Type | Model |
-|-----------|-----------|-------|
-| `flux` | Image | FLUX.1-dev |
-| `wan` | Video | Wan 2.1 |
-| `hunyuan` | Video | HunyuanVideo 1.5 |
-
-You can list all registered processors with:
-
-```bash
-python -m tools.diffusion.preprocessing_multiprocess --list_processors
-```
-
-### Image Preprocessing (FLUX)
-
-```bash
-python -m tools.diffusion.preprocessing_multiprocess image \
-  --image_dir /path/to/images \
-  --output_dir /path/to/cache \
-  --processor flux \
-  --resolution_preset 512p
-```
-
-### Video Preprocessing (Wan 2.1)
-
-**Video mode** (encodes the full video as a single sample, recommended for training):
-
-```bash
-python -m tools.diffusion.preprocessing_multiprocess video \
-  --video_dir /path/to/videos \
-  --output_dir /path/to/cache \
-  --processor wan \
-  --resolution_preset 512p \
-  --caption_format sidecar
-```
-
-**Frames mode** (extracts evenly-spaced frames, each becomes a separate sample):
-
-```bash
-python -m tools.diffusion.preprocessing_multiprocess video \
-  --video_dir /path/to/videos \
-  --output_dir /path/to/cache \
-  --processor wan \
-  --mode frames \
-  --num_frames 40 \
-  --resolution_preset 512p
-```
-
-### Video Preprocessing (HunyuanVideo)
-
-```bash
-python -m tools.diffusion.preprocessing_multiprocess video \
-  --video_dir /path/to/videos \
-  --output_dir /path/to/cache \
-  --processor hunyuan \
-  --target_frames 121 \
-  --caption_format meta_json
-```
-
-### Key Arguments
-
-**Common arguments:**
-
-| Argument | Description |
-|----------|-------------|
-| `--processor` | Processor name (`flux`, `wan`, `hunyuan`) |
-| `--model_name` | HuggingFace model name (uses processor default if omitted) |
-| `--output_dir` | Output directory for cached data |
-| `--shard_size` | Number of samples per metadata shard (default: 10000) |
-
-**Image-specific arguments:**
-
-| Argument | Description |
-|----------|-------------|
-| `--image_dir` | Input image directory |
-| `--resolution_preset` | Resolution preset: `256p`, `512p`, `768p`, `1024p`, `1536p` |
-| `--max_pixels` | Custom pixel budget (alternative to preset) |
-| `--caption_field` | Caption field in JSONL files (`internvl` or `usr`) |
-| `--verify` | Verify latents can be decoded back |
-
-**Video-specific arguments:**
-
-| Argument | Description |
-|----------|-------------|
-| `--video_dir` | Input video directory |
-| `--mode` | `video` (full video) or `frames` (extract evenly-spaced frames) |
-| `--num_frames` | Number of frames to extract in `frames` mode |
-| `--target_frames` | Target frame count (for example, 121 to satisfy HunyuanVideo's 4n+1 frame-count constraint) |
-| `--resolution_preset` | Resolution preset for bucketing |
-| `--height` / `--width` | Explicit target size (disables bucketing) |
-| `--resize_mode` | Interpolation: `bilinear`, `bicubic`, `nearest`, `area`, `lanczos` |
-| `--center_crop` / `--no_center_crop` | Enable/disable center cropping (default: enabled) |
-| `--caption_format` | Caption source: `sidecar`, `meta_json`, `jsonl` |
-| `--caption_field` | Field name for captions (default: `caption`) |
-| `--output_format` | Output format: `meta` (pickle) or `pt` (torch.save) |
-
-## Output Format
-
-The preprocessing tool produces a cache directory organized by resolution bucket:
-
-```
-/path/to/cache/
-├── 512x512/
-│   ├── <hash1>.meta
-│   ├── <hash2>.meta
-│   └── ...
-├── 832x480/
-│   └── ...
-├── metadata.json          # Global config (processor, model, total items)
-└── metadata_shard_0000.json  # Per-sample metadata (paths, resolutions, captions)
-```
-
-Each cache file (`.meta` or `.pt`) contains:
-
-- **Encoded latents** — VAE latent representations of the image or video
-- **Text embeddings** — Pre-computed from the model's text encoder
-- **First frame** — Reference image for image-to-video conditioning (video mode only)
-- **Image embeddings** — For models that support i2v conditioning (video mode only)
-- **Metadata** — Original and bucket resolutions, caption, source path
-
-## Multiresolution Bucketing
-
-NeMo AutoModel supports multiresolution training through bucketed sampling. This groups samples by their spatial resolution so that each batch contains samples of the same size, avoiding padding waste.
-
-During preprocessing, the `--resolution_preset` argument controls the pixel budget used for bucketing. Available presets: `256p`, `512p`, `768p`, `1024p`, `1536p`. Alternatively, use `--max_pixels` for a custom pixel budget, or `--height`/`--width` to disable bucketing and use a fixed resolution.
-
-During training, the dataloader uses these key configuration parameters:
-
-- `base_resolution`: The target resolution used for bucketing (for example, `[512, 512]`)
-- The `SequentialBucketSampler` groups samples by resolution bucket
-- `dynamic_batch_size`: When `true`, adjusts batch size per resolution bucket to maintain constant memory usage
-
-## YAML Configuration
-
-### Video Dataloader (Wan 2.1 / HunyuanVideo)
-
-Used for text-to-video models. Set `model_type` to match your model (`wan` or `hunyuan`):
-
-```yaml
-data:
-  dataloader:
-    _target_: nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader
-    cache_dir: /path/to/processed_meta
-    model_type: wan          # or "hunyuan"
-    base_resolution: [512, 512]
-    dynamic_batch_size: false
-    shuffle: true
-    drop_last: false
-    num_workers: 0
-```
-
-### Image Dataloader (FLUX)
-
-Used for text-to-image models:
-
-```yaml
-data:
-  dataloader:
-    _target_: nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader
-    cache_dir: /path/to/processed_meta
-    train_text_encoder: false
-    num_workers: 0
-    base_resolution: [512, 512]
-    dynamic_batch_size: false
-    shuffle: true
-    drop_last: false
-```
-
-:::{tip}
-Supported image resolutions for FLUX include `[256, 256]`, `[512, 512]`, and `[1024, 1024]`. While a 1:1 aspect ratio is currently used as a proxy for the closest image size, the implementation is designed to support multiple aspect ratios.
-:::
diff --git a/fern/versions/v0.4/pages/guides/diffusion/dataset.mdx b/docs/guides/diffusion/dataset.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/diffusion/dataset.mdx
rename to docs/guides/diffusion/dataset.mdx
diff --git a/docs/guides/diffusion/finetune.md b/docs/guides/diffusion/finetune.md
deleted file mode 100644
index e4a789b48b..0000000000
--- a/docs/guides/diffusion/finetune.md
+++ /dev/null
@@ -1,392 +0,0 @@
-(diffusion-finetune)=
-
-# Diffusion Model Fine-Tuning
-
-## Introduction
-
-Diffusion models generate images and videos by learning to reverse a noise process — starting from random noise and iteratively refining it into coherent visual output guided by a text prompt. Pretrained diffusion models (like [FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) for images or [Wan 2.1](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) for video) produce impressive general-purpose results, but they know nothing about your particular visual domain, style, or subject matter. Fine-tuning bridges that gap — you adapt the model on your own data so it produces outputs that match your requirements, without the cost of training from scratch.
-
-Under the hood, NeMo AutoModel uses [flow matching](https://arxiv.org/abs/2210.02747), a modern generative framework that learns to transform noise into data by regressing a velocity field along straight interpolation paths. It integrates with [Hugging Face Diffusers](https://huggingface.co/docs/diffusers) to provide distributed fine-tuning for text-to-image and text-to-video models. This guide walks you through the process end-to-end — from installation through training and inference — using [Wan 2.1 T2V 1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) as a running example.
-
-### Workflow Overview
-
-```text
-┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐
-│ 1. Install   │--->│ 2. Prepare   │--->│ 3. Configure │--->│  4. Train    │--->│ 5. Generate  │
-│              │    │    Data      │    │              │    │              │    │              │
-│ pip install  │    │ Encode to    │    │ YAML recipe  │    │ torchrun     │    │ Run inference│
-│ or Docker    │    │ .meta files  │    │              │    │              │    │ with ckpt    │
-└──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘
-```
-
-| Step | Section | What You Do |
-|------|---------|-------------|
-| **1. Install** | [Install NeMo AutoModel](#install-nemo-automodel) | Install the package via pip or Docker |
-| **2. Prepare Data** | [Prepare Your Dataset](#prepare-your-dataset) | Encode raw images/videos into `.meta` latent files |
-| **3. Configure** | [Configure Your Training Recipe](#configure-your-training-recipe) | Write a YAML config specifying model, data, and training settings |
-| **4. Train** | [Fine-Tune the Model](#fine-tune-the-model) | Launch training with `torchrun` on a single node |
-| **4b. Multi-Node** | [Multi-Node Training](#multi-node-training) | Scale training across multiple nodes |
-| **5. Generate** | [Generation / Inference](#generation--inference) | Run inference using the fine-tuned checkpoint |
-
-For model-specific configuration (FLUX.1-dev, HunyuanVideo), see [Model-Specific Notes](#model-specific-notes).
-
-### Supported Models
-
-| Model | HF Model ID | Task | Parameters | Example Config |
-|-------|-------------|------|------------|----------------|
-| Wan 2.1 T2V 1.3B | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | Text-to-Video | 1.3B | [wan2_1_t2v_flow.yaml](../../../examples/diffusion/finetune/wan2_1_t2v_flow.yaml) |
-| FLUX.1-dev | `black-forest-labs/FLUX.1-dev` | Text-to-Image | 12B | [flux_t2i_flow.yaml](../../../examples/diffusion/finetune/flux_t2i_flow.yaml) |
-| HunyuanVideo 1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v` | Text-to-Video | — | [hunyuan_t2v_flow.yaml](../../../examples/diffusion/finetune/hunyuan_t2v_flow.yaml) |
-
-All models use FSDP2 for distributed training and flow matching for loss computation.
-
-## Install NeMo AutoModel
-
-```bash
-pip3 install nemo-automodel
-```
-
-Alternatively, if you run into dependency or driver issues, use the pre-built Docker container:
-
-```bash
-docker pull nvcr.io/nvidia/nemo-automodel:26.02.00
-docker run --gpus all -it --rm --shm-size=8g nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-:::{important}
-**Docker users:** Checkpoints are lost when the container exits unless you bind-mount the checkpoint directory to the host. See [Install with NeMo Docker Container](../installation.md#install-with-nemo-docker-container) and [Saving Checkpoints When Using Docker](../checkpointing.md#saving-checkpoints-when-using-docker).
-:::
-
-For the full set of installation methods, see the [installation guide](../installation.md).
-
-## Prepare Your Dataset
-
-Diffusion models operate in latent space — a compressed representation of visual data — rather than directly on raw images or videos. To avoid re-encoding data on every training step, the preprocessing
-pipeline encodes all inputs ahead of time and saves them as `.meta` files.
-
-Each `.meta` file contains:
-- Latent representations produced by a VAE (Variational Autoencoder) from the raw visual data
-- Text embeddings produced by a text encoder from the associated captions/prompts
-
-Fine-tuning then operates entirely on these pre-encoded `.meta` files, which is significantly faster than encoding on the fly.
-
-Preprocess your data using the built-in tool at [`tools/diffusion/preprocessing_multiprocess.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/diffusion/preprocessing_multiprocess.py). The script provides `image` and `video` subcommands:
-
-**Video preprocessing (using Wan 2.1 as a running example):**
-```bash
-python -m tools.diffusion.preprocessing_multiprocess video \
-    --video_dir /data/videos \
-    --output_dir /cache \
-    --processor wan \
-    --resolution_preset 512p \
-    --caption_format sidecar
-```
-
-**Image preprocessing (FLUX):**
-```bash
-python -m tools.diffusion.preprocessing_multiprocess image \
-    --image_dir /data/images \
-    --output_dir /cache \
-    --processor flux
-```
-
-**Video preprocessing (HunyuanVideo):**
-```bash
-python -m tools.diffusion.preprocessing_multiprocess video \
-    --video_dir /data/videos \
-    --output_dir /cache \
-    --processor hunyuan \
-    --target_frames 121 \
-    --caption_format meta_json
-```
-
-For the full set of arguments and input format details, see the [Diffusion Dataset Preparation](dataset.md) guide.
-
-## Configure Your Training Recipe
-
-Fine-tuning is driven by two components:
-
-1. A recipe script (e.g., [`train.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/diffusion/train.py)) — the Python entry point that orchestrates the training loop: loading the model, building the dataloader, running forward/backward passes, computing the flow matching loss, checkpointing, and logging.
-2. A YAML configuration file — a text file in YAML format that specifies all settings the recipe uses: which model to fine-tune, where the data lives, optimizer hyperparameters, parallelism strategy, etc.
-   You customize training by editing this file rather than modifying code, allowing you to scale from one GPU to hundreds seamlessly.
-
-Below is the annotated [wan2_1_t2v_flow.yaml](../../../examples/diffusion/finetune/wan2_1_t2v_flow.yaml), with each section explained:
-
-```yaml
-seed: 42
-
-# Weights & Biases experiment tracking
-wandb:
-  project: wan-t2v-flow-matching
-  mode: online
-  name: wan2_1_t2v_fm_v2
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 30
-
-# Model configuration
-# pretrained_model_name_or_path: Hugging Face model ID
-# mode: "finetune" loads pretrained weights and adapts them to your dataset
-model:
-  pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers
-  mode: finetune
-
-# Training schedule
-step_scheduler:
-  global_batch_size: 8       # Effective batch size across all GPUs
-  local_batch_size: 1        # Per-GPU batch size (gradient accumulation = global/local/num_gpus)
-  ckpt_every_steps: 1000     # Checkpoint frequency
-  num_epochs: 100
-  log_every: 2               # Log metrics every N steps
-
-# Data: uses pre-encoded .meta files
-data:
-  dataloader:
-    _target_: nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader
-    cache_dir: PATH_TO_YOUR_DATA
-    model_type: wan # "wan" for Wan 2.1, "hunyuan" for HunyuanVideo
-    base_resolution: [512, 512]
-    dynamic_batch_size: false
-    shuffle: true
-    drop_last: false
-    num_workers: 0
-
-# Optimizer
-optim:
-  learning_rate: 5e-6
-  optimizer:
-    weight_decay: 0.01
-    betas: [0.9, 0.999]
-
-# Learning rate scheduler
-lr_scheduler:
-  lr_decay_style: cosine
-  lr_warmup_steps: 0
-  min_lr: 1e-6
-
-# Flow matching configuration
-flow_matching:
-  adapter_type: "simple"          # Model-specific adapter (simple, flux, hunyuan)
-  adapter_kwargs: {}
-  timestep_sampling: "uniform"    # How timesteps are sampled during training
-  logit_mean: 0.0
-  logit_std: 1.0
-  flow_shift: 3.0                # Shifts the flow schedule
-  mix_uniform_ratio: 0.1
-  sigma_min: 0.0
-  sigma_max: 1.0
-  num_train_timesteps: 1000
-  i2v_prob: 0.3                  # Probability of image-to-video conditioning
-  use_loss_weighting: true
-  log_interval: 100
-  summary_log_interval: 10
-
-# FSDP2 distributed training
-fsdp:
-  tp_size: 1      # Tensor parallelism
-  cp_size: 1      # Context parallelism
-  pp_size: 1      # Pipeline parallelism
-  dp_replicate_size: 1
-  dp_size: 8      # Data parallelism (number of GPUs)
-
-# Checkpointing
-checkpoint:
-  enabled: true
-  checkpoint_dir: PATH_TO_YOUR_CKPT_DIR
-  model_save_format: torch_save
-  save_consolidated: false
-  restore_from: null
-```
-
-### Config Field Reference
-
-| Section | Required? | What to Change |
-|---------|-----------|----------------|
-| `model` | Yes | Set `pretrained_model_name_or_path` to the Hugging Face model ID. Set `mode: finetune`. |
-| `step_scheduler` | Yes | `global_batch_size` is the effective batch size across all GPUs. `ckpt_every_steps` controls checkpoint frequency. |
-| `data` | Yes | Set `cache_dir` to the path containing your preprocessed `.meta` files. Change `model_type` and `_target_` for different models (see [Model-Specific Notes](#model-specific-notes)). |
-| `optim` | Yes | `learning_rate: 5e-6` is a good default for fine-tuning. |
-| `flow_matching` | Yes | `adapter_type` must match the model (`simple` for Wan, `flux` for FLUX, `hunyuan` for HunyuanVideo). |
-| `fsdp` | Yes | Set `dp_size` to the number of GPUs on your node. |
-| `checkpoint` | Recommended | Set `checkpoint_dir` to a persistent path, especially in Docker. |
-| `wandb` | Optional | Configure to enable Weights & Biases logging. |
-
-(fine-tune-the-model)=
-## Fine-Tune the Model
-
-Launch fine-tuning with `torchrun`:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/wan2_1_t2v_flow.yaml
-```
-
-Adjust `--nproc-per-node` to match the number of GPUs on your node, and ensure `fsdp.dp_size` in the YAML matches.
-
-(multi-node-training)=
-## Multi-Node Training
-
-When a single node doesn't provide enough GPUs or memory for your workload, you can scale training across multiple nodes. NeMo AutoModel handles multi-node distributed training through `torchrun` rendezvous and FSDP2 — the same recipe script works on one node or many.
-
-### YAML Configuration Changes
-
-The main change is in the `fsdp` section. Set `dp_size` to the **total number of GPUs across all nodes**, and optionally increase `dp_replicate_size` for gradient replication across nodes.
-
-For example, to train on 2 nodes with 8 GPUs each (16 GPUs total):
-
-```yaml
-fsdp:
-  tp_size: 1
-  cp_size: 1
-  pp_size: 1
-  dp_replicate_size: 2   # Replicate across 2 nodes for robustness
-  dp_size: 16             # Total GPUs: 2 nodes × 8 GPUs
-```
-
-A complete multi-node config is provided at [wan2_1_t2v_flow_multinode.yaml](../../../examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml).
-
-### Launch with torchrun
-
-Run the following command on **each node**, setting `NODE_RANK` to `0` on the first node, `1` on the second, and so on:
-
-```bash
-export MASTER_ADDR=node0.hostname   # hostname or IP of the first node
-export MASTER_PORT=29500
-export NODE_RANK=0                  # 0 on master, 1 on second node, etc.
-
-torchrun \
-  --nnodes=2 \
-  --nproc-per-node=8 \
-  --node_rank=${NODE_RANK} \
-  --rdzv_backend=c10d \
-  --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml
-```
-
-(model-specific-notes)=
-## Model-Specific Notes
-
-Use the table below to pick the right model for your use case:
-
-| Use Case | Model | Why Choose It |
-|----------|-------|---------------|
-| **Video generation on limited hardware** | [Wan 2.1 T2V 1.3B](#wan-21-t2v-13b) | Smallest model (1.3B params) — fast iteration, fits on a single A100 40GB |
-| **High-quality image generation** | [FLUX.1-dev](#flux1-dev-text-to-image) | State-of-the-art text-to-image with 12B params and guidance-based control |
-| **High-quality video generation** | [HunyuanVideo 1.5](#hunyuanvideo-15) | Larger video model with condition-latent support for richer motion and detail |
-
-### Wan 2.1 T2V 1.3B
-
-- **Adapter type**: `simple`
-- **Dataloader**: `build_video_multiresolution_dataloader` with `model_type: wan`
-- **Config**: [wan2_1_t2v_flow.yaml](../../../examples/diffusion/finetune/wan2_1_t2v_flow.yaml)
-
-### FLUX.1-dev (Text-to-Image)
-
-- **Adapter type**: `flux`
-- **Dataloader**: `build_text_to_image_multiresolution_dataloader`
-- **Key differences**:
-  - Uses `pipeline_spec` to specify the transformer architecture:
-    ```yaml
-    model:
-      pipeline_spec:
-        transformer_cls: "FluxTransformer2DModel"
-        subfolder: "transformer"
-        load_full_pipeline: false
-    ```
-  - Requires `guidance_scale` in adapter kwargs:
-    ```yaml
-    flow_matching:
-      adapter_type: "flux"
-      adapter_kwargs:
-        guidance_scale: 3.5
-        use_guidance_embeds: true
-    ```
-  - Uses `logit_normal` timestep sampling instead of `uniform`
-- **Config**: [flux_t2i_flow.yaml](../../../examples/diffusion/finetune/flux_t2i_flow.yaml)
-
-### HunyuanVideo 1.5
-
-- **Adapter type**: `hunyuan`
-- **Dataloader**: `build_video_multiresolution_dataloader` with `model_type: hunyuan`
-- **Key differences**:
-  - Requires `activation_checkpointing: true` in FSDP config due to model size
-  - Uses condition latents in adapter kwargs:
-    ```yaml
-    flow_matching:
-      adapter_type: "hunyuan"
-      adapter_kwargs:
-        use_condition_latents: true
-        default_image_embed_shape: [729, 1152]
-    ```
-  - Uses `logit_normal` timestep sampling
-- **Config**: [hunyuan_t2v_flow.yaml](../../../examples/diffusion/finetune/hunyuan_t2v_flow.yaml)
-
-## Generation / Inference
-
-Once training is complete, you can use the model to generate images or videos from text prompts. This step is called inference: whereas training is where the model learns from data, inference is where it produces new outputs.
-
-In diffusion models, generation works by starting from random noise and iteratively denoising it, guided by your text prompt, until a clean image or video emerges.
-
-The generation script ([`generate.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/diffusion/generate/generate.py)) handles this: it loads your model weights (pretrained or fine-tuned), configures the diffusion sampler, and produces outputs for one or more prompts.
-
-**Single-GPU (Wan 2.1 1.3B):**
-```bash
-python examples/diffusion/generate/generate.py \
-  -c examples/diffusion/generate/configs/generate_wan.yaml
-```
-
-**Multi-GPU (Wan 2.1 1.3B):**
-
-Wan 2.1 supports tensor parallelism for inference, which shards the transformer across GPUs to reduce per-GPU memory. Pass the `distributed` config via CLI overrides:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/generate/generate.py \
-  -c examples/diffusion/generate/configs/generate_wan.yaml \
-  --distributed.backend nccl \
-  --distributed.parallel_scheme.transformer.tp_size 8
-```
-
-**With a fine-tuned checkpoint:**
-```bash
-python examples/diffusion/generate/generate.py \
-  -c examples/diffusion/generate/configs/generate_wan.yaml \
-  --model.checkpoint ./checkpoints/step_1000 \
-  --inference.prompts '["A dog running on a beach"]'
-```
-
-**FLUX image generation:**
-```bash
-python examples/diffusion/generate/generate.py \
-  -c examples/diffusion/generate/configs/generate_flux.yaml
-```
-
-**HunyuanVideo:**
-```bash
-python examples/diffusion/generate/generate.py \
-  -c examples/diffusion/generate/configs/generate_hunyuan.yaml
-```
-
-### Available Generation Configs
-
-| Config | Model | Output | GPUs |
-|--------|-------|--------|------|
-| [`generate_wan.yaml`](../../../examples/diffusion/generate/configs/generate_wan.yaml) | Wan 2.1 1.3B | Video | 1 |
-| [`generate_flux.yaml`](../../../examples/diffusion/generate/configs/generate_flux.yaml) | FLUX.1-dev | Image | 1 |
-| [`generate_hunyuan.yaml`](../../../examples/diffusion/generate/configs/generate_hunyuan.yaml) | HunyuanVideo | Video | 1 |
-
-:::{note}
-You can use `--model.checkpoint ./checkpoints/LATEST` to automatically load the most recent checkpoint.
-:::
-
-## Hardware Requirements
-
-| Component | Minimum | Recommended |
-|-----------|---------|-------------|
-| GPU | A100 40GB | A100 80GB / H100 |
-| GPUs | 4 | 8 |
-| RAM | 128 GB | 256 GB+ |
-| Storage | 500 GB SSD | 2 TB NVMe |
diff --git a/fern/versions/v0.4/pages/guides/diffusion/finetune.mdx b/docs/guides/diffusion/finetune.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/diffusion/finetune.mdx
rename to docs/guides/diffusion/finetune.mdx
diff --git a/docs/guides/dllm/finetune.md b/docs/guides/dllm/finetune.md
deleted file mode 100644
index f23d2937c7..0000000000
--- a/docs/guides/dllm/finetune.md
+++ /dev/null
@@ -1,122 +0,0 @@
-
-# Diffusion Language Model (dLLM) Fine-Tuning and Generation with NeMo AutoModel
-
-## Introduction
-
-Diffusion language models (dLLMs) generate text by iteratively denoising masked tokens, rather than generating one token at a time left-to-right like autoregressive (AR) models. Starting from a sequence of `[MASK]` tokens, the model progressively unmasks the most confident positions over multiple denoising steps until the full response is revealed.
-
-This approach enables **parallel token generation** and **bidirectional attention**, which gives the model more context for each prediction compared to AR models.
-
-NeMo AutoModel currently supports the following dLLM model family:
-
-- **LLaDA (MDLM)** — Bidirectional masked diffusion. The model receives corrupted tokens and predicts the clean token at each masked position.
-
-### Workflow Overview
-
-```text
-┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐
-│  1. Install  │--->│ 2. Configure │--->│   3. Train   │--->│ 4. Generate  │
-│              │    │    YAML      │    │              │    │              │
-│ pip install  │    │  Recipe +    │    │  torchrun    │    │  Run dLLM    │
-│ or Docker    │    │  dLLM config │    │              │    │  inference   │
-└──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘
-```
-
-| Step | Section | What You Do |
-|------|---------|-------------|
-| **1. Install** | [Install NeMo AutoModel](#install-nemo-automodel) | Install the package via pip or Docker |
-| **2. Configure** | [Configure Your Training Recipe](#configure-your-training-recipe) | Write a YAML config specifying model, data, dLLM mode, and training settings |
-| **3. Train** | [Fine-Tune the Model](#fine-tune-the-model) | Launch training with `torchrun` |
-| **4. Generate** | [Generation / Inference](#generation--inference) | Generate text from a fine-tuned checkpoint |
-
-### Supported Models
-
-| Model Family | dLLM Mode | Loss | Inference | Example Config |
-|---|---|---|---|---|
-| LLaDA | `mdlm` | MDLM cross-entropy | Block-by-block, full-forward (no KV cache) | [llada_sft.yaml](../../../examples/dllm_sft/llada_sft.yaml) |
-
-## Install NeMo AutoModel
-
-```bash
-pip3 install nemo-automodel
-```
-
-Alternatively, use the pre-built Docker container:
-
-```bash
-docker pull nvcr.io/nvidia/nemo-automodel:26.02.00
-docker run --gpus all -it --rm --shm-size=8g nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-For the full set of installation methods, see the [installation guide](../installation.md).
-
-## Configure Your Training Recipe
-
-dLLM fine-tuning is driven by:
-
-1. A **recipe script** ([`train_ft.py`](../../../nemo_automodel/recipes/dllm/train_ft.py)) — orchestrates the training loop with dLLM-specific corruption, loss, and batch handling.
-2. A **YAML configuration file** — specifies the model, data, optimizer, dLLM-specific settings, and distributed training strategy.
-
-The recipe uses a **strategy pattern** to handle differences between model families. The `dllm.mode` field in the YAML selects the strategy:
-
-| Mode | Strategy | Description |
-|------|----------|-------------|
-| `mdlm` | `MDLMStrategy` | LLaDA-style: model receives corrupted tokens, MDLM cross-entropy loss |
-
-### LLaDA Configuration
-
-See [llada_sft.yaml](../../../examples/dllm_sft/llada_sft.yaml) for the full working config. The key dLLM-specific sections are:
-
-```yaml
-model:
-  pretrained_model_name_or_path: GSAI-ML/LLaDA-8B-Base
-  torch_dtype: float32
-  trust_remote_code: true
-
-dllm:
-  mode: mdlm
-  mask_token_id: 126336       # LLaDA mask token
-  eps: 0.001                  # Minimum corruption ratio
-
-dataset:
-  unshifted: true             # Required for dLLM training
-```
-
-### Key dLLM Config Fields
-
-| Field | Description |
-|-------|-------------|
-| `dllm.mode` | Training strategy (`mdlm`) |
-| `dllm.mask_token_id` | Token ID used for masking (`126336` for LLaDA) |
-| `dllm.eps` | Minimum corruption ratio to avoid zero-corruption samples |
-| `dataset.unshifted` | Must be `true` for dLLM — disables the autoregressive input/target shift |
-
-## Fine-Tune the Model
-
-```bash
-torchrun --nproc-per-node=8 \
-    nemo_automodel/recipes/dllm/train_ft.py \
-    -c examples/dllm_sft/llada_sft.yaml
-```
-
-## Generation / Inference
-
-The generation script ([`generate.py`](../../../examples/dllm_generate/generate.py)) supports chat, raw, and infilling modes for LLaDA checkpoints.
-
-### LLaDA Generation
-
-```bash
-python examples/dllm_generate/generate.py \
-    --checkpoint <path> \
-    --prompt "Explain what a neural network is."
-```
-
-### Generation Parameters
-
-| Parameter | Description | Default |
-|-----------|-------------|---------|
-| `--steps` | Number of denoising steps | 128 |
-| `--max_new_tokens` | Maximum tokens to generate | 128 |
-| `--block_size` | Tokens per denoising block | 32 |
-| `--temperature` | Gumbel noise temperature (0 = greedy) | 0.0 |
-| `--remasking` | Confidence scoring strategy for selecting which positions to unmask | `low_confidence` |
diff --git a/fern/versions/v0.4/pages/guides/dllm/finetune.mdx b/docs/guides/dllm/finetune.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/dllm/finetune.mdx
rename to docs/guides/dllm/finetune.mdx
diff --git a/docs/guides/fp8-training.md b/docs/guides/fp8-training.md
deleted file mode 100644
index 7d5e901f4d..0000000000
--- a/docs/guides/fp8-training.md
+++ /dev/null
@@ -1,149 +0,0 @@
-# FP8 Training
-
-NeMo AutoModel supports FP8 quantization using [TorchAO](https://github.com/pytorch/ao) and `torch.compile` to accelerate training on compatible hardware.
-
-FP8 (8-bit floating point) quantization can provide substantial speedups for models where the majority of GEMMs are sufficiently large. The speedup from using FP8 tensor cores must outweigh the overhead of dynamic quantization.
-
-### Requirements for FP8 Training in NeMo AutoModel
-
-To enable FP8 training in NeMo AutoModel, the following hardware and software requirements must be met:
-
-- **Hardware**:  
-  An NVIDIA H100 GPU or newer is required. These GPUs feature FP8 tensor cores that accelerate training.
-
-- **Software**:  
-  The TorchAO library must be installed.
-
-- **Configuration**:  
-  Both `torch.compile` and `fp8` must be enabled in your training configuration.  
-  **Important**: `torch.compile` is essential for achieving meaningful speedup with TorchAO FP8 training.
-
-## Install TorchAO
-
-Make sure you have TorchAO installed. Follow the [installation guide](https://github.com/pytorch/ao?tab=readme-ov-file#-installation) for TorchAO.
-
-## Usage
-
-### Configure FP8
-
-To enable FP8 quantization with `torch.compile`, you need both FP8 and compilation enabled in your configuration:
-
-```yaml
-# Enable torch.compile (required for FP8 speedup)
-compile:
-  enabled: true
-  mode: "default"
-  fullgraph: false
-  dynamic: false
-
-# Enable FP8 quantization
-fp8:
-  enabled: true
-  recipe_name: tensorwise
-  enable_fsdp_float8_all_gather: true
-  precompute_float8_dynamic_scale_for_fsdp: true
-  force_recompute_fp8_weight_in_bwd: true
-  filter_fqns: ["lm_head"]
-  emulate: false
-```
-
-### FP8 Config Parameters
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `recipe_name` | str | None | FP8 recipe: "tensorwise", "rowwise", or "rowwise_with_gw_hp" |
-| `enable_fsdp_float8_all_gather` | bool | False | Enable FP8 all-gather in FSDP for bandwidth savings |
-| `force_recompute_fp8_weight_in_bwd` | bool | False | Force recomputation of FP8 weights in backward pass |
-| `precompute_float8_dynamic_scale_for_fsdp` | bool | False | Precompute FP8 scales for FSDP optimization |
-| `filter_fqns` | list[str] | [] | Module names to exclude from FP8 conversion |
-| `emulate` | bool | False | Use emulation instead of hardware acceleration |
-
-### Scaling Strategies
-
-#### Tensorwise Scaling (Default)
-- Single scale per tensor
-- Good performance, moderate accuracy
-- Recommended for most use cases
-
-
-#### Rowwise Scaling
-- Scale per row for better accuracy
-- Slower than tensorwise
-- Better numerical stability
-
-
-For more on scaling strategies, refer to the [TorchAO FP8 documentation](https://github.com/pytorch/ao/tree/main/torchao/float8).
-
-## Filter Modules
-
-You can exclude specific modules from FP8 conversion using `filter_fqns`:
-
-```yaml
-fp8:
-  enabled: true
-  recipe_name: tensorwise
-  filter_fqns: ["lm_head"]  # Skip these modules
-```
-
-### Speed and Convergence
-
-FP8 quantization provides measurable performance improvements while maintaining model convergence:
-
-- **Speed**: Over 1.2x training speedup on 8xH100 with tensorwise scaling.
-- **Convergence**: FP8 training achieves loss parity with BF16 training.
-- **Memory**: FP8 training achieves memory usage on par with the BF16 baseline.
-
-```{image} fp8_convergence.jpg
-:alt: FP8 Convergence Comparison
-:class: bg-primary
-:width: 600px
-:align: center
-```
-
-*Figure: Loss curves comparing FP8 tensorwise scaling + torch.compile vs. BF16 + torch.compile training on 8xH100 with 8k sequence length, demonstrating virtually identical convergence behavior with 1.24x speedup*
-
-## Ready-to-Use Recipes
-We provide FP8 training configs for popular models:
-
-- **Llama**: [Llama 3.1 8B](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_1/llama3_1_8b_hellaswag_fp8.yaml)
-- **Mistral**: [Mistral 7B](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/mistral/mistral_7b_hellaswag_fp8.yaml), [Mistral Nemo 2407](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/mistral/mistral_nemo_2407_hellaswag_fp8.yaml)
-- **Qwen**: [Qwen 2.5 7B](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/qwen/qwen2_5_7b_hellaswag_fp8.yaml)
-- **Phi**: [Phi 4](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/phi/phi_4_hellaswag_fp8.yaml)
-
-Check out our [examples directory](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune) for more recipes and configurations.
-
-To run any of these FP8 training recipes, use the following command:
-
-```bash
-automodel --nproc-per-node=8 <path-to-config.yaml>
-```
-
-For example, to train Llama 3.1 8B with FP8:
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_1/llama3_1_8b_hellaswag_fp8.yaml
-```
-
-
-## Performance Considerations
-
-FP8 requires specific conditions to be effective:
-- Input tensors must have dimensions divisible by 16 
-- Use compatible hardware (H100+)
-- Train with `torch.compile`
-
-FP8 works best when the majority of GEMM operations are sufficiently large such that the speedup achieved by using FP8 tensor cores is greater than the overhead of dynamic quantization.
-
-### Ideal Conditions for FP8 Performance
-
-- Linear layers are large and compute-intensive
-- The model consists of fewer small operations and more large matrix multiplications
-- You have modern (H100+) hardware optimized for FP8 acceleration
-- Moderate numerical precision is acceptable and slight approximations won't affect outcomes
-
-
-
-## References
-
-- [TorchAO FP8 Documentation](https://github.com/pytorch/ao/tree/main/torchao/float8)
-- [FP8 Performance Benchmarks](https://github.com/pytorch/ao/tree/main/torchao/float8#performance)
-- [NVIDIA FP8 Primer](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html) 
diff --git a/fern/versions/v0.4/pages/guides/fp8-training.mdx b/docs/guides/fp8-training.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/fp8-training.mdx
rename to docs/guides/fp8-training.mdx
diff --git a/docs/guides/gradient-checkpointing.md b/docs/guides/gradient-checkpointing.md
deleted file mode 100644
index 58156b8453..0000000000
--- a/docs/guides/gradient-checkpointing.md
+++ /dev/null
@@ -1,85 +0,0 @@
-# Gradient (Activation) Checkpointing
-
-Gradient checkpointing, also called _activation checkpointing_, trades a little extra compute for a **large reduction in GPU memory** by recomputing intermediate activations during the backward pass instead of storing them.  
-It is especially powerful when combined with memory-efficient loss functions (e.g., Linear-Cut Cross-Entropy) and parameter sharding using FSDP.
-
-## Enable Gradient Checkpointing
-
-### Configure in YAML
-Add the `activation_checkpointing: true` flag under your distributed strategy.  
-Example (snippet):
-
-```yaml
-# examples/llm_finetune/llama3_2/llama_3_2_1b_my_finetune.yaml
-...
-
-# FSDP2 (use strategy name; optional parallelism sizes)
-distributed:
-  strategy: fsdp2
-  activation_checkpointing: true
-  # dp_size: null
-  # tp_size: 1
-  # cp_size: 1
-  ...
-```
-
-### Configure Programmatically
-```python
-from nemo_automodel.components.distributed.config import FSDP2Config
-from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager
-
-config = FSDP2Config(activation_checkpointing=True)
-# device_mesh is created elsewhere (e.g. by the recipe via setup_distributed)
-manager = FSDP2Manager(config, device_mesh=device_mesh, moe_mesh=moe_mesh)
-model = manager.parallelize(model)
-```
-
-## Combine with Linear-Cut Cross-Entropy (LC-CE)
-
-Linear-Cut Cross-Entropy (LC-CE) reduces the hidden-state memory required to compute the loss by calculating the softmax on the fly, thus avoiding the need to allocate memory for the logits.
-It is already available using `nemo_automodel.components.loss.linear_ce.FusedLinearCrossEntropy` and can be enabled in recipes by using the following:
-
-```yaml
-model:
-  ...
-  output_hidden_states: true
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.linear_ce.FusedLinearCrossEntropy
-```
-
-LC-CE and gradient checkpointing target **different memory hot-spots** (output layer vs. transformer blocks) so their benefits stack almost linearly.
-
-## Example Memory Savings (H100-80GB, Llama-3.2-1B)
-| Technique | Max GPU Mem (GB) | Δ vs Baseline |
-|-----------|-----------------|---------------|
-| Baseline | 53.03 | - |
-| + FSDP (dp_size=8) | 47.59 | ↓ 10 % |
-| + Gradient Checkpointing | 33.06 | ↓ 38 % |
-| + LC-CE | 7.30 | ↓ 86 % |
-| **FSDP + LC-CE + Checkpointing** | **7.30** | **↓ 86 %** |
-
-:::{note}
-- Measurements taken with local batch size = 8, sequence len = 2048, AdamW, PyTorch 2.8.
-- Peak memory reported by `torch.cuda.max_memory_allocated()` averaged across DP ranks.
-- Expect ±5 % variance depending on exact model, sequence length and GPU architecture.
-:::
-
-## Performance Considerations
-1. **Extra compute**: Each checkpointed segment is recomputed once during the backward pass. In practice, the wall-clock overhead is ≈5-10% for transformer models.
-2. **Throughput vs. Batch Size**: The goal is usually to _increase batch size_ or _sequence length_ while keeping throughput constant.
-
-## Verify It Works
-Run your training script and inspect the peak memory:
-```bash
-
-# If running on 8x GPUs
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama_3_2_1b_my_finetune.yaml
-
-# If running on 1x GPU
-automodel examples/llm_finetune/llama3_2/llama_3_2_1b_my_finetune.yaml
-```
-If we run with the above settings (activation ckpt = on, lc-ce = on, fsdp = on), look for a log line similar to:
-```
-... | mem 7.30 GiB | ...
-```
\ No newline at end of file
diff --git a/fern/versions/v0.4/pages/guides/gradient-checkpointing.mdx b/docs/guides/gradient-checkpointing.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/gradient-checkpointing.mdx
rename to docs/guides/gradient-checkpointing.mdx
diff --git a/docs/guides/huggingface-api-compatibility.md b/docs/guides/huggingface-api-compatibility.md
deleted file mode 100644
index 3b990a66dc..0000000000
--- a/docs/guides/huggingface-api-compatibility.md
+++ /dev/null
@@ -1,229 +0,0 @@
-# 🤗 Transformers API Compatibility
-
-NeMo AutoModel is built to work with the 🤗 Hugging Face ecosystem.
-In practice, compatibility comes in two layers:
-
-- **API compatibility**: for many workflows, you can keep your existing `transformers` code and swap in NeMo AutoModel “drop-in” wrappers (`NeMoAutoModel*`, `NeMoAutoTokenizer`) with minimal changes.
-- **Artifact compatibility**: NeMo AutoModel produces **Hugging Face-compatible checkpoints** (config + tokenizer + safetensors) that can be loaded by Hugging Face Transformers and downstream tools (vLLM, SGLang, etc.).
-
-This page summarizes what "HF compatibility" means in NeMo AutoModel, calls out differences you should be aware of, and provides side-by-side examples.
-
-## Transformers Version Compatibility: v4 and v5
-
-### Transformers v4 (Current Default)
-
-NeMo AutoModel currently pins Hugging Face Transformers to the **v4** major line (see `pyproject.toml`, currently `transformers<=4.57.5`).
-
-This means:
-
-- NeMo AutoModel is primarily tested and released against **Transformers v4.x**
-- New model releases on the Hugging Face Hub that require a newer Transformers may require upgrading NeMo AutoModel as well (similar to upgrading `transformers` directly)
-
-### Transformers v5 (Forward-Compatibility and Checkpoint Interoperability)
-
-Transformers **v5** introduces breaking changes across some internal utilities (e.g., cache APIs) and adds/reshapes tokenizer backends for some model families.
-
-NeMo AutoModel addresses this in three complementary ways:
-
-- **Forward-compatibility shims**: NeMo AutoModel includes small compatibility patches to smooth over known API differences across Transformers releases (for example, cache utility method names). The built-in recipes apply these patches automatically.
-- **Backports where needed**: for some model families, NeMo AutoModel may vendor/backport Hugging Face code that originated in the v5 development line so users can run those models while staying on a pinned v4 dependency.
-- **Stable artifact format**: NeMo AutoModel checkpoints are written in Hugging Face-compatible `save_pretrained` layouts (config + tokenizer + safetensors). These artifacts are designed to be loadable by both Transformers **v4** and **v5** (and non-Transformers tools that consume HF-style model repos).
-
-:::{note}
-If you are running Transformers v5 in another environment, you can still use NeMo AutoModel-produced consolidated checkpoints with Transformers' standard loading APIs. For details on the checkpoint layouts, see [checkpointing](checkpointing.md).
-:::
-
-## Drop-In Compatibility and Key Differences
-
-### Drop-In (Same Mental Model as Transformers)
-
-- **Load by model ID or local path**: `from_pretrained(...)`
-- **Standard HF config objects**: `AutoConfig` / `config.json`
-- **Tokenizers**: standard `PreTrainedTokenizerBase` behavior, including `__call__` to create tensors and `decode`/`batch_decode`
-- **Generation**: `model.generate(...)` and the usual generation kwargs
-
-### Differences (Where NeMo AutoModel Adds Value or Has Constraints)
-
-- **Performance features**: NeMo AutoModel can automatically apply optional kernel patches/optimizations (e.g., SDPA selection, Liger kernels, DeepEP, etc.) while keeping the public model API the same.
-- **Distributed training stack**: NeMo AutoModel's recipes/CLI are designed for multi-GPU/multi-node fine-tuning with PyTorch-native distributed features (FSDP2, pipeline parallelism, etc.).
-- **CUDA expectation**: NeMo AutoModel's `NeMoAutoModel*` wrappers are primarily optimized for NVIDIA GPU workflows, and offer support for CPU workflows as well.
-
-:::{important}
-`NeMoAutoModelForCausalLM.from_pretrained(...)` currently assumes CUDA is available (it uses `torch.cuda.current_device()` internally). If you need CPU-only inference, use Hugging Face `transformers` directly.
-:::
-
-## API Mapping (Transformers and NeMo AutoModel)
-
-### API Name Mapping
-
-:::{raw} html
-<table>
-  <thead>
-    <tr>
-      <th style="width: 45%;">🤗 Hugging Face (<code>transformers</code>)</th>
-      <th style="width: 45%;">NeMo AutoModel (<code>nemo_automodel</code>)</th>
-      <th style="width: 10%;">Status</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td><code>transformers.AutoModelForCausalLM</code></td>
-      <td><code>nemo_automodel.NeMoAutoModelForCausalLM</code></td>
-      <td>✅</td>
-    </tr>
-    <tr>
-      <td><code>transformers.AutoModelForImageTextToText</code></td>
-      <td><code>nemo_automodel.NeMoAutoModelForImageTextToText</code></td>
-      <td>✅</td>
-    </tr>
-    <tr>
-      <td><code>transformers.AutoModelForSequenceClassification</code></td>
-      <td><code>nemo_automodel.NeMoAutoModelForSequenceClassification</code></td>
-      <td>✅</td>
-    </tr>
-    <tr>
-      <td><code>transformers.AutoModelForTextToWaveform</code></td>
-      <td><code>nemo_automodel.NeMoAutoModelForTextToWaveform</code></td>
-      <td>✅</td>
-    </tr>
-    <tr>
-      <td><code>transformers.AutoTokenizer.from_pretrained(...)</code></td>
-      <td><code>nemo_automodel.NeMoAutoTokenizer.from_pretrained(...)</code></td>
-      <td>✅</td>
-    </tr>
-    <tr>
-      <td><code>model.generate(...)</code></td>
-      <td><code>model.generate(...)</code></td>
-      <td>🚧</td>
-    </tr>
-    <tr>
-      <td><code>model.save_pretrained(path)</code></td>
-      <td><code>model.save_pretrained(path, checkpointer=...)</code></td>
-      <td>🚧</td>
-    </tr>
-  </tbody>
-</table>
-:::
-
-## Side-by-Side Examples
-
-### Load a Model and Tokenizer (Transformers v4)
-
-:::{raw} html
-<table>
-  <thead>
-    <tr>
-      <th style="width: 50%;">🤗 Hugging Face (<code>transformers</code>)</th>
-      <th style="width: 50%;">NeMo AutoModel (<code>nemo_automodel</code>)</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td style="vertical-align: top;">
-        <div class="highlight"><pre><code>import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_id = "gpt2"
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    torch_dtype=torch.bfloat16,
-)</code></pre></div>
-      </td>
-      <td style="vertical-align: top;">
-        <div class="highlight"><pre><code>import torch
-from nemo_automodel import NeMoAutoModelForCausalLM, NeMoAutoTokenizer
-
-model_id = "gpt2"
-
-tokenizer = NeMoAutoTokenizer.from_pretrained(model_id)
-model = NeMoAutoModelForCausalLM.from_pretrained(
-    model_id,
-    torch_dtype=torch.bfloat16,
-)</code></pre></div>
-      </td>
-    </tr>
-  </tbody>
-</table>
-:::
-
-### Text Generation
-
-This snippet assumes you already have a `model` and `tokenizer` (see the loading snippet above).
-
-:::{raw} html
-<table>
-  <thead>
-    <tr>
-      <th style="width: 50%;">🤗 Hugging Face (<code>transformers</code>)</th>
-      <th style="width: 50%;">NeMo AutoModel (<code>nemo_automodel</code>)</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td style="vertical-align: top; padding-top: 0;">
-        <div class="highlight" style="margin-top: 0;"><pre style="margin: 0;"><code>import torch
-
-prompt = "Write a haiku about GPU kernels."
-inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-with torch.inference_mode():
-    out = model.generate(**inputs, max_new_tokens=64)
-
-print(tokenizer.decode(out[0], skip_special_tokens=True))</code></pre></div>
-      </td>
-      <td style="vertical-align: top; padding-top: 0;">
-        <div class="highlight" style="margin-top: 0;"><pre style="margin: 0;"><code>import torch
-
-prompt = "Write a haiku about GPU kernels."
-inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-with torch.inference_mode():
-    out = model.generate(**inputs, max_new_tokens=64)
-
-print(tokenizer.decode(out[0], skip_special_tokens=True))</code></pre></div>
-      </td>
-    </tr>
-  </tbody>
-</table>
-:::
-
-
-### Tokenizers (Transformers vs NeMo AutoModel)
-
-NeMo AutoModel provides `NeMoAutoTokenizer` as a Transformers-like auto-tokenizer with a small registry for specialized backends (and a safe fallback when no specialization is needed).
-
-:::{raw} html
-<table>
-  <thead>
-    <tr>
-      <th style="width: 50%;">🤗 Hugging Face (<code>transformers</code>)</th>
-      <th style="width: 50%;">NeMo AutoModel (<code>nemo_automodel</code>)</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td style="vertical-align: top;">
-        <div class="highlight"><pre><code>from transformers import AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")</code></pre></div>
-      </td>
-      <td style="vertical-align: top;">
-        <div class="highlight"><pre><code>from nemo_automodel import NeMoAutoTokenizer
-
-tok = NeMoAutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")</code></pre></div>
-      </td>
-    </tr>
-  </tbody>
-</table>
-:::
-
-## Checkpoints: Save in NeMo AutoModel, Load Everywhere
-
-NeMo AutoModel training recipes write checkpoints in Hugging Face-compatible layouts, including consolidated safetensors that you can load directly with Transformers:
-
-- See [checkpointing](checkpointing.md) for checkpoint formats and example directory layouts.
-- See [model coverage](../model-coverage/overview.md) for notes on how model support depends on the pinned Transformers version.
-
-If your goal is: **train/fine-tune in NeMo AutoModel → deploy in the HF ecosystem**, the recommended workflow is to enable consolidated safetensors checkpoints and then load them with the standard HF APIs or downstream inference engines.
diff --git a/fern/versions/v0.4/pages/guides/huggingface-api-compatibility.mdx b/docs/guides/huggingface-api-compatibility.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/huggingface-api-compatibility.mdx
rename to docs/guides/huggingface-api-compatibility.mdx
diff --git a/docs/guides/installation.md b/docs/guides/installation.md
deleted file mode 100644
index c8bbfad040..0000000000
--- a/docs/guides/installation.md
+++ /dev/null
@@ -1,323 +0,0 @@
-# Install NeMo AutoModel
-
-This guide explains how to install NeMo AutoModel for LLM, VLM, and OMNI models on various platforms and environments. Depending on your use case, there are several ways to install it:
-
-| Method                  | Dev Mode | Use Case                                                          | Recommended For             |
-| ----------------------- | ---------|----------------------------------------------------------------- | ---------------------------- |
-| 📦 **PyPI**             | - | Install stable release with minimal setup                         | Most users, production usage |
-| 🐳 **Docker**           | - | Use in isolated GPU environments, e.g., with NeMo container       | Multi-node deployments     |
-| 🐍 **Git Repo**         | ✅ | Use the latest code without cloning or installing extras manually | Power users, testers         |
-| 🧪 **Editable Install** | ✅ | Contribute to the codebase or make local modifications            | Contributors, researchers    |
-| 🐳 **Docker + Mount**   | ✅ | Use in isolated GPU environments, e.g., with NeMo container       | Multi-node deployments     |
-
-## Choose Your Installation Method
-
-Pick the installation method that matches your needs and platform.
-
-### Decision Criteria
-
-| Method | Best For | Pros | Cons |
-|--------|----------|------|------|
-| **Docker Container** | Production, multi-node, Debian-based systems | Reproducible environment, pre-configured dependencies, GPU driver isolation | Larger download size, container overhead |
-| **virtualenv (PyPI/Git)** | Local development, quick prototyping, macOS | Fast setup, lightweight, direct code access | Manual dependency management, platform-specific issues |
-
-### When to Use Docker Containers
-
-Use Docker containers when you need:
-
-- **Multi-node deployments**: Containers ensure consistency across cluster nodes
-- **Production environments**: Reproducible builds with tested dependency versions
-- **GPU driver compatibility**: Isolates CUDA/driver versions from host system
-- **Debian-based systems**: Recommended for Ubuntu, Debian, and derivatives due to dependency complexity
-- **Complex dependencies**: Pre-configured environment with all optimizations (TransformerEngine, DeepEP, etc.)
-- **Team consistency**: Same environment across development, testing, and production
-
-### When to Use virtualenv
-
-Use virtualenv (PyPI, Git, or editable install) when you need:
-
-- **Local development**: Fast iteration on code changes
-- **Quick prototyping**: Minimal setup for experimentation
-- **macOS systems**: Better native support without container overhead
-- **Frequent code changes**: Contributors working on the codebase (use editable install)
-- **Compatible GPU drivers**: System has correct CUDA toolkit and drivers installed
-- **Lightweight setup**: Minimal disk space and memory footprint
-
-### Platform-Specific Recommendations
-
-#### Linux (Debian-based: Ubuntu, Debian)
-
-**Recommended: Docker Container**
-
-Debian-based systems can have dependency conflicts with system packages. Containers provide isolation and consistency.
-
-```bash
-docker pull nvcr.io/nvidia/nemo-automodel:25.11.00
-docker run --gpus all -it --rm --shm-size=8g nvcr.io/nvidia/nemo-automodel:25.11.00
-```
-
-**Alternative: virtualenv** (if Docker is not available)
-
-Ensure CUDA 11.8+ and compatible drivers are installed:
-
-```bash
-# Check CUDA version
-nvidia-smi
-
-# Install via PyPI
-pip3 install nemo-automodel
-```
-
-#### Linux (RHEL, CentOS, Fedora)
-
-**Recommended: Docker Container**
-
-Containers avoid enterprise Linux package management complexity.
-
-Follow the same Docker commands as Debian-based systems above.
-
-#### macOS
-
-**Recommended: virtualenv**
-
-Docker on macOS has GPU limitations. Use native Python installation:
-
-```bash
-# Using PyPI
-pip3 install nemo-automodel
-
-# Or using uv for reproducible environments
-uv pip install nemo-automodel
-```
-
-:::{note}
-GPU training on macOS is not supported. Use macOS for CPU-based experimentation or remote cluster submission.
-:::
-
-#### Windows
-
-**Recommended: WSL2 + Docker**
-
-Run NeMo AutoModel in WSL2 with Docker Desktop:
-
-1. Install WSL2 and Docker Desktop.
-2. Use Docker container within WSL2 (follow Linux instructions).
-
-**Alternative: WSL2 and virtualenv**
-
-Install directly in WSL2 Ubuntu environment (follow Debian instructions).
-
-### Common Issues and Solutions
-
-**GPU driver compatibility errors**
-- **Problem**: CUDA version mismatch between host and application
-- **Solution**: Use Docker container to isolate driver versions
-
-**Dependency conflicts on Debian/Ubuntu**
-- **Problem**: System packages conflict with Python packages
-- **Solution**: Use Docker container or create isolated virtualenv with `uv`
-
-**Out of memory during container startup**
-- **Problem**: Insufficient shared memory for PyTorch data loading
-- **Solution**: Increase `--shm-size` parameter (e.g., `--shm-size=16g`)
-
-**TransformerEngine import failures**
-- **Problem**: Incorrect CUDA toolkit or missing dependencies
-- **Solution**: Use pre-configured Docker container
-
-## Prerequisites
-
-### System Requirements
-- **Python**: 3.10 or higher
-- **CUDA**: 11.8 or higher (for GPU support)
-- **Memory**: Minimum 16GB RAM, 32GB+ recommended
-- **Storage**: At least 50GB free space for models and datasets
-
-### Hardware Requirements
-
-- **GPU**: NVIDIA GPU with 8GB+ VRAM (16GB+ recommended)
-- **CPU**: Multi-core processor (8+ cores recommended)
-- **Network**: Stable internet connection for downloading models
-
-## Installation Options for Non-Developers
-This section explains the easiest installation options for non-developers, including using pip3 via PyPI or leveraging a preconfigured NVIDIA NeMo Docker container. Both methods offer quick access to the latest stable release of NeMo AutoModel with all required dependencies.
-
-### Install via PyPI (Recommended)
-
-For most users, the easiest way to get started is using `pip3`.
-
-```bash
-pip3 install nemo-automodel
-```
-:::{tip}
-This installs the latest stable release of NeMo AutoModel from PyPI.
-
-To verify the install, run `python -c "import nemo_automodel; print(nemo_automodel.__version__)"`. See [nemo-automodel on PyPI](https://pypi.org/project/nemo-automodel/).
-:::
-
-### Install with NeMo Docker Container
-You can use NeMo AutoModel with the NeMo Docker container. Pull the container by running:
-```bash
-docker pull nvcr.io/nvidia/nemo-automodel:25.11.00
-```
-:::{note}
-The above `docker` command uses the [`25.11.00`](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo-automodel?version=25.11.00) container. Use the [most recent container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo-automodel) version to ensure you get the latest version of AutoModel and its dependencies like PyTorch, Transformers, etc.
-:::
-
-Then you can enter the container using:
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v "$(pwd)"/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:25.11.00
-```
-
-:::{important}
-**Persist your checkpoints.** By default, checkpoints are written to `checkpoints/` inside the container. Because `--rm` destroys the container on exit, any data stored only inside the container is lost. Always bind-mount a host directory for the checkpoint path (as shown with `-v` above) so that your trained weights survive after the container stops. You can also mount additional directories for datasets and Hugging Face cache:
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v /path/to/your/checkpoints:/opt/Automodel/checkpoints \
-  -v /path/to/your/datasets:/datasets \
-  -v /path/to/your/hf_cache:/root/.cache/huggingface \
-  nvcr.io/nvidia/nemo-automodel:25.11.00
-```
-:::
-
-:::{tip}
-**Models that require CUDA-specific packages (e.g., Nemotron).** Some model families—such as Nemotron Nano and Nemotron Flash—depend on packages like `mamba-ssm` and `causal-conv1d` that must be compiled against a matching CUDA toolkit. Installing these from source on a bare-metal host can be error-prone. The NeMo Automodel Docker container ships with these dependencies pre-built, so **using the container is the recommended approach** for fine-tuning Nemotron and other models with similar requirements.
-:::
-
----
-## Installation Options for Developers
-
-This section provides installation options for developers, including pulling the latest source from GitHub, using editable mode, or mounting the repo inside a NeMo Docker container.
-
-### Install from GitHub (Source)
-
-
-If you want the **latest features** from the `main` branch or want to contribute:
-
-#### Option A – Use `pip` With Git Repo
-```bash
-pip3 install git+https://github.com/NVIDIA-NeMo/Automodel.git
-```
-:::{note}
-This installs the repo as a standard Python package (not editable).
-:::
-
-#### Option B – Use `uv` With Git Repo
-```bash
-uv pip install git+https://github.com/NVIDIA-NeMo/Automodel.git
-```
-:::{note}
-`uv` handles virtual environments transparently and enables more reproducible installs.
-:::
-
-### Install in Developer Mode (Editable Install)
-
-To contribute or modify the code:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-pip3 install -e .
-```
-
-:::{note}
-This installs AutoModel in editable mode, so changes to the code are immediately reflected in Python.
-:::
-
-### Mount the Repo into a NeMo Docker Container
-
-To run `Automodel` inside a NeMo container while **mounting your local repo**, follow these steps:
-
-```bash
-# Step 1: Clone the AutoModel repository.
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-
-# Step 2: Pull a compatible container image (replace the tag as needed).
-docker pull nvcr.io/nvidia/nemo-automodel:25.11.00
-
-# Step 3: Run the container (repo mounted into the container workspace, optional Automodel mount under /opt for flexibility, increased shared memory for PyTorch/data loading), then enter the mounted repo, install Automodel in editable mode, and run a usage example as a quick sanity check.
-docker run --gpus all -it --rm \
-  -v $(pwd):/workspace/Automodel \
-  -v $(pwd)/Automodel:/opt/Automodel \
-  --shm-size=8g \
-  nvcr.io/nvidia/nemo-automodel:25.11.00 /bin/bash -c "\
-    cd /workspace/Automodel && \
-    pip install -e . && \
-    automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml"
-```
-:::{note}
-The above `docker` command mounts your local `Automodel` directory into the container at `/workspace/Automodel`.
-:::
-
-## Install Profiles
-
-NeMo AutoModel provides several install extras for different use cases.
-
-### Full Install (default)
-
-Installs the core library with all LLM training dependencies (PyTorch, CUDA, etc.):
-
-```bash
-pip3 install nemo-automodel
-```
-
-### CLI-Only Install (Login Nodes)
-
-If you only need to **submit jobs** from a login node or CI environment (SLURM,
-SkyPilot, NeMo-Run) and do **not** need to run training locally, use the
-lightweight CLI-only install:
-
-```bash
-pip3 install nemo-automodel[cli]
-```
-
-This installs only `pyyaml` -- no PyTorch, no CUDA. The `automodel` and `am`
-CLI commands will be available for SLURM and SkyPilot job submission. If you
-also need the NeMo-Run launcher, install it separately (`pip install nemo-run`).
-If you accidentally try to run a local/interactive job with this install, you
-will get a clear error with instructions to install the full package.
-
-### VLM Dependencies
-
-For vision-language model training, add the VLM extras:
-
-```bash
-pip3 install nemo-automodel[vlm]
-```
-
-### CUDA-Specific Packages
-
-For models requiring TransformerEngine, bitsandbytes, Mamba, or other
-CUDA-compiled packages:
-
-```bash
-pip3 install nemo-automodel[cuda]
-```
-
-### All Extras
-
-Install everything (CUDA, VLM, NeMo-Run, etc.):
-
-```bash
-pip3 install nemo-automodel[all]
-```
-
-:::{tip}
-You can combine extras: `pip3 install nemo-automodel[vlm,cuda]`
-:::
-
-## Summary
-| Goal                        | Command or Method                                               |
-| --------------------------- | --------------------------------------------------------------- |
-| Stable install (PyPI)       | `pip3 install nemo-automodel`                                   |
-| CLI-only (login nodes)      | `pip3 install nemo-automodel[cli]`                              |
-| Latest from GitHub          | `pip3 install git+https://github.com/NVIDIA-NeMo/Automodel.git` |
-| Editable install (dev mode) | `pip install -e .` after cloning                                |
-| Run without installing      | Use `PYTHONPATH=$(pwd)` to run scripts                          |
-| Use in Docker container     | Mount repo and `pip install -e .` inside container              |
-| Fast install (using `uv`)   | `uv pip install ...`                                            |
diff --git a/fern/versions/v0.4/pages/guides/installation.mdx b/docs/guides/installation.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/installation.mdx
rename to docs/guides/installation.mdx
diff --git a/docs/guides/llm/column-mapped-text-instruction-dataset.md b/docs/guides/llm/column-mapped-text-instruction-dataset.md
deleted file mode 100644
index 073b9ee5e0..0000000000
--- a/docs/guides/llm/column-mapped-text-instruction-dataset.md
+++ /dev/null
@@ -1,248 +0,0 @@
-# Use the ColumnMappedTextInstructionDataset
-
-This guide explains how to use `ColumnMappedTextInstructionDataset` to quickly and flexibly load instruction-answer datasets for LLM fine-tuning, with minimal code changes and support for common tokenization strategies.
-
-The `ColumnMappedTextInstructionDataset` is a lightweight, plug-and-play helper that lets you train on instruction-answer style corpora without writing custom Python for every new schema. You simply specify which columns map to logical fields like `context`, `question`, and `answer`, and the loader handles the rest automatically. This enables:
-
-- Quick prototyping across diverse instruction datasets
-- Schema flexibility without requiring code changes
-- Consistent field names for training loops, regardless of dataset source
-
-`ColumnMappedTextInstructionDataset` is a **map-style** dataset (`torch.utils.data.Dataset`): it supports `len(ds)` and `ds[i]`, and it loads data **non-streaming**.
-
-It supports two data sources out-of-the-box:
-
-1. **Local JSON/JSONL files** - pass a single file path or a list of paths on disk. Newline-delimited JSON works great.
-2. **Hugging Face Hub** - point to any dataset repo (`org/dataset`) that contains the required columns.
-
-For **streaming** (including **Delta Lake / Databricks**), use [`ColumnMappedTextInstructionIterableDataset`](column-mapped-text-instruction-iterable-dataset.md). The iterable variant always streams by design to avoid accidentally materializing entire datasets to disk/memory.
-
----
-## Quickstart
-The fastest way to sanity-check the loader is to point it at an existing Hugging Face dataset and print the first sample. This section provides a minimal, runnable example to help you quickly try out the dataset.
-
-```python
-from transformers import AutoTokenizer
-from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset import ColumnMappedTextInstructionDataset
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
-
-ds = ColumnMappedTextInstructionDataset(
-    path_or_dataset_id="Muennighoff/natural-instructions",
-    column_mapping={
-      "context": "definition",
-      "question": "inputs",
-      "answer": "targets"
-    },
-    tokenizer=tokenizer,
-    answer_only_loss_mask=True,
-)
-
-sample = ds[0]
-print(sample.keys())
-
-# Typical keys include: input_ids, labels, attention_mask (and an internal ___PAD_TOKEN_IDS___ helper).
-# Note: when answer_only_loss_mask=True, prompt tokens are masked in labels with -100
-# (the standard CrossEntropy "ignore_index").
-```
-
-The code above is intended only for a quick sanity check of the dataset and its tokenization output. For training or production use, configure the dataset using YAML as shown below. YAML offers a reproducible, maintainable, and scalable way to specify dataset and tokenization settings.
-
----
-## Usage Examples
-
-This section provides practical usage examples, including how to load remote datasets, work with local files, and configure pipelines using YAML recipes.
-
-### Local JSONL Example
-
-Assume you have a local newline-delimited JSON file at `/data/my_corpus.jsonl`
-with the simple schema `{instruction, output}`. A few sample rows:
-
-```json
-{"instruction": "Translate 'Hello' to French", "output": "Bonjour"}
-{"instruction": "Summarize the planet Neptune.", "output": "Neptune is the eighth planet from the Sun."}
-```
-
-You can load it using Python code like:
-
-```python
-local_ds = ColumnMappedTextInstructionDataset(
-    path_or_dataset_id=["/data/my_corpus_1.jsonl", "/data/my_corpus_2.jsonl"], # can also be a single path (string)
-    column_mapping={
-        "question": "instruction",
-        "answer": "output",
-    },
-    tokenizer=tokenizer,
-    answer_only_loss_mask=False,  # compute loss over full sequence
-)
-
-print(local_ds[0].keys())   # dict_keys(['input_ids', 'labels', 'attention_mask', '___PAD_TOKEN_IDS___'])
-```
-
-You can configure the dataset entirely from your recipe YAML. For example:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset
-  path_or_dataset_id:
-    - /data/my_corpus_1.jsonl
-    - /data/my_corpus_2.jsonl
-  column_mapping:
-    question: instruction
-    answer: output
-  answer_only_loss_mask: false
-```
-
-
-### Remote Dataset Example
-
-In the following section, we demonstrate how to load the instruction-tuning corpus
-[`Muennighoff/natural-instructions`](https://huggingface.co/datasets/Muennighoff/natural-instructions).
-The dataset schema is `{task_name, id, definition, inputs, targets}`.
-
-The following are examples from the training split:
-
-```json
-{
-  "task_name": "task001_quoref_question_generation",
-  "id": "task001-abc123",
-  "definition": "In this task, you're given passages that...",
-  "inputs": "Passage: A man is sitting at a piano...",
-  "targets": "What is the first name of the person who doubted it would be explosive?"
-}
-{
-  "task_name": "task002_math_word_problems",
-  "id": "task002-def456",
-  "definition": "Solve the following word problem.",
-  "inputs": "If there are 3 apples and you take 2...",
-  "targets": "1"
-}
-```
-
-For basic QA fine-tuning, we usually map `definition → context`, `inputs → question`, and `targets → answer` as follows:
-
-```python
-from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset import (
-    ColumnMappedTextInstructionDataset,
-)
-from transformers import AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
-
-remote_ds = ColumnMappedTextInstructionDataset(
-    path_or_dataset_id="Muennighoff/natural-instructions",  # Hugging Face repo ID
-    column_mapping={
-        "context": "definition",  # high-level context
-        "question": "inputs",      # the actual prompt / input
-        "answer": "targets",       # expected answer string
-    },
-    tokenizer=tokenizer,
-    split="train[:5%]",        # demo slice; omit (i.e., `split="train",`) for full data
-    answer_only_loss_mask=True,
-)
-```
-
-You can configure the entire dataset directly from your recipe YAML. For example:
-```yaml
-# dataset section of your recipe's config.yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset
-  path_or_dataset_id: Muennighoff/natural-instructions
-  split: train
-  column_mapping:
-    context: definition
-    question: inputs
-    answer: targets
-  answer_only_loss_mask: true
-```
-
-### Streaming / Delta Lake / Databricks
-
-:::{note}
-`ColumnMappedTextInstructionDataset` does not support streaming or Delta Lake / Databricks sources. For those, use [`ColumnMappedTextInstructionIterableDataset`](column-mapped-text-instruction-iterable-dataset.md).
-:::
-
-:::{note}
-Delta Lake / Databricks (including `delta_sql_query` and authentication) is supported only by `ColumnMappedTextInstructionIterableDataset`. See [`column-mapped-text-instruction-iterable-dataset.md`](column-mapped-text-instruction-iterable-dataset.md) for details.
-:::
-
-### Advanced Options
-| Arg                     | Default | Description |
-|-------------------------|---------|-------------|
-| `split`                 | `"train"` | Which split to pull from a HF repo (`train`, `validation`, etc.). Ignored for local JSON/JSONL. |
-| `name`                  | `None`    | Name of the Hugging Face dataset configuration/subset to load. |
-| `answer_only_loss_mask` | `True`    | Mask prompt tokens in `labels` with `-100` (the standard CrossEntropy `ignore_index`). |
-| `use_hf_chat_template`  | `False`   | If `True` and the tokenizer supports chat templates, format as a system/user/assistant conversation via `tokenizer.apply_chat_template(...)`. |
-| `seq_length`            | `None`    | Optional max sequence length; used for padding/truncation when enabled. |
-| `padding`               | `"do_not_pad"` | Padding strategy passed to the tokenizer (`"do_not_pad"`, `"max_length"`, `True`, etc.). |
-| `truncation`            | `"do_not_truncate"` | Truncation strategy passed to the tokenizer (`"do_not_truncate"`, `True`, etc.). |
-| `limit_dataset_samples` | `None`    | Optionally load only the first \(N\) samples (useful for debugging). |
-
----
-## Tokenization Paths
-This section explains how the dataset formats and tokenizes samples.
-
-`ColumnMappedTextInstructionDataset` produces standard next-token training tensors:
-
-- `input_ids`
-- `labels`
-- `attention_mask`
-
-When `answer_only_loss_mask=True`, prompt tokens are masked in `labels` with `-100` (the standard CrossEntropy `ignore_index`).
-
-The dataset supports two formatting paths:
-
-1. **Chat-template path (opt-in)**: if `use_hf_chat_template=True` and the tokenizer exposes a `chat_template` and `apply_chat_template`, the dataset builds messages like:
-
-   `[{"role": "system", "content": <context or "">}, {"role": "user", "content": <question or "">}, {"role": "assistant", "content": <answer>}]`
-
-   and tokenizes them via `tokenizer.apply_chat_template(..., tokenize=True, return_dict=True)`.
-
-2. **Plain prompt/completion path (default)**: otherwise the dataset concatenates prompt and answer and tokenizes the result.
-
-In both cases, `labels` are the next-token targets (shifted by one relative to `input_ids`). The dataset also includes an internal `___PAD_TOKEN_IDS___` field used downstream for padding.
-
----
-## Parameter Requirements
-
-The following section lists important requirements and caveats for correct usage.
-- `column_mapping` must include `answer`, and must include at least one of `context` or `question` (2- or 3-column mapping only).
-- If `use_hf_chat_template=True`, the tokenizer must support chat templates (`chat_template` + `apply_chat_template`).
-
----
-## Slurm Configuration for Distributed Training
-
-For distributed training on Slurm clusters, add a `slurm` section to your YAML configuration. This section configures the Slurm batch job parameters and automatically generates the appropriate `#SBATCH` directives.
-
-### Slurm Configuration
-
-SLURM jobs are submitted with `sbatch` directly — no YAML section needed.
-Copy the reference script, set the `CONFIG` variable to your YAML, and submit:
-
-```sh
-cp slurm.sub my_cluster.sub
-# Edit my_cluster.sub — change CONFIG, #SBATCH directives, container, mounts, etc.
-sbatch my_cluster.sub
-```
-
-All cluster-specific settings (nodes, GPUs, partition, container, mounts, secrets)
-live in your sbatch script. See the [cluster guide](../../launcher/slurm.md) for
-full examples (Pyxis, bare-metal, Apptainer).
-
-### Multi-Node Slurm Configuration
-
-:::{note}
-**Multi-Node Training**: When using Hugging Face datasets in multi-node setups, you need shared storage accessible by all nodes. Set `HF_DATASETS_CACHE` to a shared directory in your sbatch script (e.g., `export HF_DATASETS_CACHE=/shared/hf_cache`) to ensure all nodes can access the cached datasets.
-:::
-
-When using multiple nodes with Hugging Face datasets:
-
-1. **Shared Storage**: Ensure all nodes can access the same storage paths
-2. **HF Cache**: Export `HF_HOME` and `HF_DATASETS_CACHE` in your sbatch script pointing to shared directories
-3. **Mounts**: Add shared directories as container mounts in your sbatch script
-
-Configure all of this in your sbatch script (`my_cluster.sub`), not in the YAML.
-
-
----
-### That's It!
-With the mapping specified, the rest of the NeMo Automodel pipeline (pre-tokenization, packing, collate-fn, *etc.*) works as usual.
diff --git a/fern/versions/v0.4/pages/guides/llm/column-mapped-text-instruction-dataset.mdx b/docs/guides/llm/column-mapped-text-instruction-dataset.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/column-mapped-text-instruction-dataset.mdx
rename to docs/guides/llm/column-mapped-text-instruction-dataset.mdx
diff --git a/docs/guides/llm/column-mapped-text-instruction-iterable-dataset.md b/docs/guides/llm/column-mapped-text-instruction-iterable-dataset.md
deleted file mode 100644
index 748282981f..0000000000
--- a/docs/guides/llm/column-mapped-text-instruction-iterable-dataset.md
+++ /dev/null
@@ -1,221 +0,0 @@
-# Use the ColumnMappedTextInstructionIterableDataset (Streaming)
-
-This guide explains how to use `ColumnMappedTextInstructionIterableDataset` to **stream** instruction datasets for LLM fine-tuning, including **Delta Lake/Databricks** sources.
-
-Unlike `ColumnMappedTextInstructionDataset` (map-style, non-streaming), this class is a `torch.utils.data.IterableDataset` and **always** loads data in streaming mode. This is intentional: it helps ensure data is consumed as a stream and avoids accidentally materializing full datasets/tables to disk or memory (which is especially important for large or sensitive corpora).
-
-## When to Use This Dataset
-
-Use `ColumnMappedTextInstructionIterableDataset` when you need:
-
-- **Streaming-only behavior** (e.g., to reduce accidental data leakages from full dataset materialization)
-- **Delta Lake/Databricks** (Unity Catalog, cloud lakehouse storage, DBFS, etc.)
-- **Very large datasets** where map-style loading/caching is undesirable
-
-If you do *not* need streaming (and you want `len(ds)` / `ds[i]`), use [`ColumnMappedTextInstructionDataset`](column-mapped-text-instruction-dataset.md).
-
-## Key Differences vs ColumnMappedTextInstructionDataset
-
-- **Iterable**: you iterate (`for sample in ds:`); you cannot rely on `len(ds)` or `ds[i]`.
-- **Always streaming**: there is no `streaming=` flag; it is always enabled.
-- **Repeat behavior**: by default, `repeat_on_exhaustion=True` (infinite stream). Set `repeat_on_exhaustion=False` to do a single pass.
-- **(Optional) sharding/shuffle helpers**: use `.shard(num_shards, index)` / `.shuffle(buffer_size, seed)` when supported by the underlying backend.
-
-The column mapping and tokenization logic are shared with `ColumnMappedTextInstructionDataset`. See [Tokenization Paths](column-mapped-text-instruction-dataset.md#tokenization-paths) for details on output fields (`input_ids`, `labels`, `attention_mask`) and masking behavior.
-
-## Quickstart (Hugging Face Streaming)
-
-```python
-from transformers import AutoTokenizer
-
-from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset import (
-    ColumnMappedTextInstructionIterableDataset,
-)
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
-
-ds = ColumnMappedTextInstructionIterableDataset(
-    path_or_dataset_id="Muennighoff/natural-instructions",
-    split="train",
-    column_mapping={
-        "context": "definition",
-        "question": "inputs",
-        "answer": "targets",
-    },
-    tokenizer=tokenizer,
-    # Optional:
-    # limit_dataset_samples=10_000,
-    # repeat_on_exhaustion=False,   # do one pass instead of infinite stream
-)
-
-sample = next(iter(ds))
-print(sample.keys())  # input_ids / labels / attention_mask (and ___PAD_TOKEN_IDS___)
-```
-
-## Delta Lake/Databricks
-
-`ColumnMappedTextInstructionIterableDataset` supports Delta Lake tables from:
-
-- Local Delta tables (directories containing `_delta_log`)
-- Cloud storage (S3, Azure Blob/ADLS via `abfss://`, GCS via `gs://`)
-- Databricks (DBFS paths and Unity Catalog tables)
-
-### Installation
-
-Install the basic Delta Lake reader:
-
-```bash
-pip install deltalake
-```
-
-For **Unity Catalog access outside of Spark** (optional), install:
-
-```bash
-pip install databricks-sql-connector
-```
-
-### Local Delta Table
-
-```python
-ds = ColumnMappedTextInstructionIterableDataset(
-    path_or_dataset_id="/path/to/delta_table",  # directory containing _delta_log
-    column_mapping={"question": "prompt", "answer": "completion"},
-    tokenizer=tokenizer,
-)
-```
-
-### Databricks Unity Catalog
-
-Use the `delta://` prefix so the loader selects the Delta backend:
-
-```python
-ds = ColumnMappedTextInstructionIterableDataset(
-    path_or_dataset_id="delta://catalog.schema.instruction_data",
-    column_mapping={
-        "context": "system_prompt",
-        "question": "user_input",
-        "answer": "assistant_response",
-    },
-    tokenizer=tokenizer,
-    delta_storage_options={
-        "DATABRICKS_TOKEN": "dapi...",  # or set DATABRICKS_TOKEN env var
-        "DATABRICKS_HOST": "https://your-workspace.databricks.com",
-        # Optional (depending on how you connect):
-        # "DATABRICKS_HTTP_PATH": "/sql/1.0/warehouses/...",
-    },
-)
-```
-
-### Cloud Storage (S3/Azure/GCS)
-
-```python
-# S3 Delta table
-ds = ColumnMappedTextInstructionIterableDataset(
-    path_or_dataset_id="s3://my-bucket/path/to/delta_table",
-    column_mapping={"question": "input", "answer": "output"},
-    tokenizer=tokenizer,
-    delta_storage_options={
-        "AWS_ACCESS_KEY_ID": "...",
-        "AWS_SECRET_ACCESS_KEY": "...",
-        "AWS_REGION": "us-east-1",
-    },
-)
-
-# Azure (ADLS Gen2/ABFS)
-ds = ColumnMappedTextInstructionIterableDataset(
-    path_or_dataset_id="abfss://container@account.dfs.core.windows.net/delta_table",
-    column_mapping={"question": "input", "answer": "output"},
-    tokenizer=tokenizer,
-    delta_storage_options={
-        "AZURE_STORAGE_ACCOUNT_NAME": "...",
-        "AZURE_STORAGE_ACCOUNT_KEY": "...",
-    },
-)
-```
-
-### YAML Configuration (Delta Lake/Databricks)
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset.ColumnMappedTextInstructionIterableDataset
-  path_or_dataset_id: delta://catalog.schema.training_data
-  column_mapping:
-    context: system_prompt
-    question: user_message
-    answer: assistant_message
-  answer_only_loss_mask: true
-  delta_storage_options:
-    DATABRICKS_TOKEN: ${oc.env:DATABRICKS_TOKEN}
-    DATABRICKS_HOST: ${oc.env:DATABRICKS_HOST}
-    # Optional:
-    # DATABRICKS_HTTP_PATH: ${oc.env:DATABRICKS_HTTP_PATH}
-```
-
-## Streaming from a Delta SQL Query (Computed/Aliased Columns)
-
-If you want to generate columns dynamically (joins, filters, computed prompt strings, etc.), pass a SQL query that returns the fields referenced by your `column_mapping`.
-
-```python
-ds = ColumnMappedTextInstructionIterableDataset(
-    path_or_dataset_id="delta://catalog.schema.training_data",
-    column_mapping={"question": "question", "answer": "answer"},
-    tokenizer=tokenizer,
-    delta_storage_options={
-        "DATABRICKS_HOST": "https://your-workspace.databricks.com",
-        "DATABRICKS_TOKEN": "dapi...",
-        "DATABRICKS_HTTP_PATH": "/sql/1.0/warehouses/...",
-    },
-    delta_sql_query="""
-      SELECT
-        concat(system_prompt, '\n', user_message) AS question,
-        assistant_message AS answer
-      FROM catalog.schema.training_data
-      WHERE split = 'train'
-    """,
-)
-```
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset.ColumnMappedTextInstructionIterableDataset
-  path_or_dataset_id: delta://catalog.schema.training_data
-  column_mapping:
-    question: question
-    answer: answer
-  delta_sql_query: |
-    SELECT
-      concat(system_prompt, '\n', user_message) AS question,
-      assistant_message AS answer
-    FROM catalog.schema.training_data
-    WHERE split = 'train'
-  delta_storage_options:
-    DATABRICKS_HOST: ${oc.env:DATABRICKS_HOST}
-    DATABRICKS_TOKEN: ${oc.env:DATABRICKS_TOKEN}
-    DATABRICKS_HTTP_PATH: ${oc.env:DATABRICKS_HTTP_PATH}
-```
-
-:::{note}
-**SQL engine requirement:** `delta_sql_query` is executed via Spark (Databricks runtime/pyspark) when available, otherwise via `databricks-sql-connector`. It is not supported in a deltalake-only environment.
-:::
-
-:::{note}
-**Authentication:** The Delta Lake loader automatically picks up credentials from environment variables (`DATABRICKS_TOKEN`, `AWS_ACCESS_KEY_ID`, `AZURE_STORAGE_ACCOUNT_KEY`, etc.) if not explicitly provided in `delta_storage_options`.
-:::
-
-## Common Arguments
-
-| Arg                     | Default | Description |
-|-------------------------|---------|-------------|
-| `split`                 | `None`  | Which split to stream from a HF repo (`train`, `validation`, etc.). Ignored for local files and Delta tables. |
-| `name`                  | `None`  | Name of the dataset configuration/subset to load. |
-| `answer_only_loss_mask` | `True`  | Mask prompt tokens in `labels` with `-100` (CrossEntropy `ignore_index`). |
-| `use_hf_chat_template`  | `False` | If `True` and the tokenizer supports chat templates, format via `tokenizer.apply_chat_template(...)`. |
-| `seq_length`            | `None`  | Optional max sequence length; used for padding/truncation when enabled. |
-| `padding`               | `"do_not_pad"` | Padding strategy passed to the tokenizer. |
-| `truncation`            | `"do_not_truncate"` | Truncation strategy passed to the tokenizer. |
-| `limit_dataset_samples` | `None`  | Optionally limit the stream to the first \(N\) samples (best-effort; depends on backend). |
-| `repeat_on_exhaustion`  | `True`  | If `True`, restart the stream on exhaustion (useful for step-based training). |
-| `delta_storage_options` | `None`  | Storage/auth options for Delta Lake backends (Databricks, S3, Azure, GCS). |
-| `delta_version`         | `None`  | Specific Delta table version to read. |
-| `delta_sql_query`       | `None`  | SQL query to generate rows for Delta sources (Spark / Databricks SQL only). |
-
diff --git a/fern/versions/v0.4/pages/guides/llm/column-mapped-text-instruction-iterable-dataset.mdx b/docs/guides/llm/column-mapped-text-instruction-iterable-dataset.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/column-mapped-text-instruction-iterable-dataset.mdx
rename to docs/guides/llm/column-mapped-text-instruction-iterable-dataset.mdx
diff --git a/docs/guides/llm/databricks.md b/docs/guides/llm/databricks.md
deleted file mode 100644
index 022f666e28..0000000000
--- a/docs/guides/llm/databricks.md
+++ /dev/null
@@ -1,295 +0,0 @@
-# Model Training on Databricks
-
-Databricks is a widely used platform for managing data, models, applications, and compute on the cloud. This guide shows how to use AutoModel for scalable, performant model training on Databricks.
-
-The specific example here fine-tunes a [Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) model using the [SQuAD dataset](https://huggingface.co/datasets/rajpurkar/squad) from Hugging Face, but any AutoModel functionality (for example, {doc}`model pre-training <pretraining>`, {doc}`VLMs </model-coverage/vlm/index>`, {doc}`other supported models </model-coverage/overview>`) can also be run on Databricks.
-
-## Provision Compute
-
-Let's start by [provisioning](https://docs.databricks.com/aws/en/compute/configure) a Databricks classic compute cluster with the following setup:
-
-- Databricks runtime: [18.0 LTS (Machine Learning version)](https://docs.databricks.com/aws/en/release-notes/runtime/18.0ml)
-- Worker instance type: `g6e.12xlarge` on AWS (4x L40S GPUs per node)  
-- Number of workers: 2  
-- Global [environment variable](https://docs.databricks.com/aws/en/compute/configure#environment-variables): `GLOO_SOCKET_IFNAME=eth0` (see [this](https://docs.databricks.com/aws/en/machine-learning/train-model/distributed-training/spark-pytorch-distributor#gloo-failure-runtimeerror-connection-refused) for details)   
-- Cluster-scoped [init script](https://docs.databricks.com/aws/en/init-scripts/cluster-scoped):
-
-```bash
-#!/bin/bash
-
-# Install AutoModel on all nodes
-/databricks/python3/bin/pip install git+https://github.com/NVIDIA-NeMo/Automodel
-```
-
-This will provision three compute nodes – one driver node we'll attach a notebook to, and two worker nodes we'll use for multi-node training.
-
-Note that we've selected a small number of instances for demo purposes, but you can adjust the specific instance type and number of workers for your actual use case.
-
-## Train the Model
-
-With the above compute resources provisioned, we're ready to fine-tune a model using AutoModel.
-
-AutoModel uses YAML file recipes to configure various settings for the training process (for example, model, dataset, loss function, optimizer, etc.). Here we'll use this [preconfigured recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml) for fine-tuning a Llama-3.2-1B model using the SQuAD dataset from Hugging Face. In a notebook connected to our compute resource, download the configuration file:
-
-```bash
-# Download configuration file
-!curl -O https://raw.githubusercontent.com/NVIDIA-NeMo/Automodel/refs/heads/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-Here's what the model, dataset, and optimizer portions of the config file look like:
-
-```yaml
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-
-optimizer:
-  _target_: torch.optim.Adam
-  betas: [0.9, 0.999]
-  eps: 1e-8
-  lr: 1.0e-5
-  weight_decay: 0
-
-```
-
-See the full file for complete details (`!cat llama3_2_1b_squad.yaml`). 
-
-Finally, we'll [authenticate](https://huggingface.co/docs/hub/en/security-tokens) the VM running the notebook with Hugging Face so we can download the model and dataset:
-
-```python
-from getpass import getpass
-
-hf_token = getpass("HF token: ")
-```
-```bash
-!hf auth login --token {hf_token}
-```
-
-### Single-Node
-
-Since AutoModel is installed via the init script, the `automodel` CLI is available on all nodes.
-
-To run training on a single GPU, use this command:
-
-```bash
-!automodel llama3_2_1b_squad.yaml \
-    --step_scheduler.max_steps 20 \
-    --checkpoint.checkpoint_dir /Volumes/<catalog_name>/<schema_name>/<volume_name>/checkpoints_single/ \
-    --checkpoint.staging_dir /local_disk0/checkpoints_single/ \
-    --checkpoint.is_async True
-```
-
-In addition to specifying the configuration file, we also use these options:
-
-- `--step_scheduler.max_steps`: Limits the number of training steps taken. Again, this is for example purposes – adapt for your actual use case as needed.
-- `--checkpoint.checkpoint_dir`: Tells AutoModel where to {doc}`save model checkpoints </guides/checkpointing>` from training. We recommend saving model checkpoints in a Databricks Unity Catalog [volume](https://docs.databricks.com/aws/en/volumes/).
-- `--checkpoint.staging_dir`: Specifies a temporary staging location for model checkpoints. Files will be temporarily saved to this location before being moved to the final `checkpoint_dir` location. This is needed when saving checkpoints in Unity Catalog. 
-- `--checkpoint.is_async`: Uses asynchronous checkpointing. 
-
-Looking at GPU metrics in Databricks, we see our single GPU is being well utilized (\~95% utilization).
-
-:::{figure} ./databricks-gpu-metrics-single.png
-:name: databricks-gpu-metrics-single
-:alt: Single GPU utilization of ~95% during model training.
-:align: center
-
-Single GPU utilization of ~95% during model training.
-:::
-
-To utilize all four GPUs available on this `g6e.12xlarge` instance, add `--nproc-per-node=4` to the same command:
-
-```bash
-!automodel --nproc-per-node=4 llama3_2_1b_squad.yaml \
-    --step_scheduler.max_steps 20 \
-    --checkpoint.checkpoint_dir /Volumes/<catalog_name>/<schema_name>/<volume_name>/checkpoints_multi/ \
-    --checkpoint.staging_dir /local_disk0/checkpoints_multi/ \
-    --checkpoint.is_async True
-```
-
-The `automodel` CLI uses PyTorch's [Elastic Launch](https://docs.pytorch.org/docs/stable/elastic/run.html) internally to spawn and coordinate multiple training processes on the VM. Each training process runs on a separate GPU, and we can now see all four GPUs are being used (~95% utilization for each GPU).
-
-:::{figure} ./databricks-gpu-metrics-multi.png
-:name: databricks-gpu-metrics-multi
-:alt: Multi-GPU, single-node utilization of ~95% during model training.
-:align: center
-
-Multi-GPU, single-node utilization of ~95% during model training.
-:::
-
-
-### Multi-Node
-
-To scale further to multi-node training, we need to submit training jobs to all instances in our Databricks cluster.
-
-First, each instance needs to be authenticated with Hugging Face to download the model and dataset:
-
-```python
-# Ensure workers are authenticated with Hugging Face
-
-import subprocess
-import shlex
-
-def run_command(cmd):
-    p = subprocess.run(shlex.split(cmd), capture_output=True)
-    return p.stdout.decode()
- 
-rdd = sc.parallelize(range(sc.defaultParallelism))
-rdd.mapPartitions(lambda _: [run_command("hf auth login --token " + hf_token)]).collect();
-```
-
-Next, we use PySpark's [TorchDistributor](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.torch.distributor.TorchDistributor.html) to run the same training job across multiple instances like this:
-
-```py
-from pyspark.ml.torch.distributor import TorchDistributor
-import nemo_automodel.recipes.llm.train_ft as recipe_mod
-
-num_executor = 2            # Number of workers in cluster
-num_gpus_per_executor = 4   # Number of GPUs per worker
-distributor = TorchDistributor(
-    num_processes=num_executor * num_gpus_per_executor,
-    local_mode=False,
-    use_gpu=True,
-)
-
-train_file = recipe_mod.__file__
-args = [
-    "--config", "llama3_2_1b_squad.yaml",
-    "--step_scheduler.max_steps", "20",
-    "--checkpoint.checkpoint_dir", "/Volumes/<catalog_name>/<schema_name>/<volume_name>/checkpoints_dist/",
-    "--checkpoint.staging_dir", "/local_disk0/checkpoints_dist/",
-    "--checkpoint.is_async", "True",
-]
-distributor.run(train_file, *args)
-```
-
-`TorchDistributor` uses `torchrun` internally, so we point it at the recipe module directly (rather than the `automodel` CLI, which also wraps `torchrun`).
-
-We now see GPU utilization is \~95% for all GPUs on all worker nodes during training (8 GPUs in this particular case).
-
-## Track Experiments with MLflow
-
-Databricks includes built-in MLflow integration for tracking experiments, logging metrics, and storing artifacts. To use MLflow with AutoModel on Databricks, add the MLflow configuration to your YAML file.
-
-### Configure MLflow
-
-Edit your configuration file (e.g., `llama3_2_1b_squad.yaml`) to include the `mlflow` section:
-
-```yaml
-mlflow:
-  experiment_name: "automodel-databricks-llama3-squad"
-  run_name: ""
-  tracking_uri: "databricks"
-  artifact_location: null
-  tags:
-    platform: "databricks"
-    task: "squad-finetune"
-    model_family: "llama3.2"
-```
-
-For Databricks, the key configuration parameters are:
-
-- `tracking_uri`: Set to `"databricks"` to use Databricks' managed MLflow tracking server
-- `experiment_name`: Name of your experiment (will appear in the Databricks workspace)
-- `artifact_location`: Leave as `null` to use default Databricks artifact storage, or specify a Unity Catalog volume path like `/Volumes/<catalog>/<schema>/<volume>/mlflow-artifacts`
-- `tags`: Add custom tags to organize and filter your runs
-
-:::{note}
-Databricks automatically handles authentication when `tracking_uri` is set to `"databricks"`. No additional credentials are needed.
-:::
-
-### Run Training with MLflow
-
-Run training with MLflow tracking enabled using the same commands as before. The MLflow configuration will be read from your YAML file:
-
-**Single-node:**
-```bash
-!automodel llama3_2_1b_squad.yaml \
-    --step_scheduler.max_steps 20 \
-    --checkpoint.checkpoint_dir /Volumes/<catalog_name>/<schema_name>/<volume_name>/checkpoints/
-```
-
-**Multi-GPU:**
-```bash
-!automodel --nproc-per-node=4 llama3_2_1b_squad.yaml \
-    --step_scheduler.max_steps 20 \
-    --checkpoint.checkpoint_dir /Volumes/<catalog_name>/<schema_name>/<volume_name>/checkpoints/
-```
-
-**Multi-node with TorchDistributor:**
-```python
-import nemo_automodel.recipes.llm.train_ft as recipe_mod
-
-distributor = TorchDistributor(
-    num_processes=num_executor * num_gpus_per_executor,
-    local_mode=False,
-    use_gpu=True,
-)
-
-args = [
-    "--config", "llama3_2_1b_squad.yaml",
-    "--step_scheduler.max_steps", "20",
-    "--checkpoint.checkpoint_dir", "/Volumes/<catalog_name>/<schema_name>/<volume_name>/checkpoints/",
-]
-distributor.run(recipe_mod.__file__, *args)
-```
-
-### View Results
-
-During training, you'll see MLflow logging messages in your output:
-
-```
-MLflow run started: abc123def456
-View run at: databricks/#/mlflow/experiments/123/runs/abc123def456
-```
-
-To view your experiments and metrics:
-
-1. Navigate to the **Experiments** page in your Databricks workspace
-2. Find your experiment by name (e.g., `automodel-databricks-llama3-squad`)
-3. Click on a run to view metrics, parameters, and artifacts
-
-The Databricks MLflow UI displays:
-- Training and validation metrics over time
-- Model parameters and hyperparameters
-- Custom tags for filtering and comparison
-- Artifacts and model checkpoints
-- System metrics (GPU utilization, memory usage)
-
-### Store Artifacts in Unity Catalog
-
-To store MLflow artifacts in Unity Catalog volumes, specify the `artifact_location`:
-
-```yaml
-mlflow:
-  experiment_name: "automodel-databricks-llama3-squad"
-  tracking_uri: "databricks"
-  artifact_location: "/Volumes/<catalog_name>/<schema_name>/<volume_name>/mlflow-artifacts"
-  tags:
-    platform: "databricks"
-```
-
-This ensures your artifacts are stored in a governed, versioned location within Unity Catalog.
-
-### Additional Configuration
-
-You can override MLflow settings from the command line:
-
-```bash
-!automodel llama3_2_1b_squad.yaml \
-    --mlflow.experiment_name "custom-experiment-name" \
-    --mlflow.run_name "baseline-run-1" \
-    --mlflow.tags.learning_rate "1e-5"
-```
-
-For more details on MLflow configuration options and best practices, see the {doc}`MLflow logging guide </guides/mlflow-logging>`.
-
-## Conclusion
-
-This guide showed how to use AutoModel for model training on Databricks-managed compute. It's relatively straightforward to scale from a single-GPU to multi-GPU to multi-node training to best suit your needs. 
-
-While the example here fine-tunes a Llama-3.2-1B model using the SQuAD dataset, any supported AutoModel functionality (like model pre-training, VLMs, etc.) can also run, and scale, on Databricks. Check out {doc}`additional recipes and end-to-end examples </guides/overview>` to learn more. 
diff --git a/fern/versions/v0.4/pages/guides/llm/databricks.mdx b/docs/guides/llm/databricks.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/databricks.mdx
rename to docs/guides/llm/databricks.mdx
diff --git a/docs/guides/llm/dataset.md b/docs/guides/llm/dataset.md
deleted file mode 100644
index 8273662213..0000000000
--- a/docs/guides/llm/dataset.md
+++ /dev/null
@@ -1,186 +0,0 @@
-# Integrate Your Own Text Dataset
-
-This guide shows you how to integrate your own dataset into NeMo Automodel for training. You'll learn about two main dataset types: **completion datasets** for language modeling (like [HellaSwag](https://huggingface.co/datasets/rowan/hellaswag)) and **instruction datasets** for question-answering tasks (like [SQuAD](https://huggingface.co/datasets/rajpurkar/squad)). We'll cover how to create custom datasets by implementing the required methods and preprocessing functions, and finally show you how to specify your own data logic using YAML configuration with file paths—allowing you to define custom dataset processing without modifying the main codebase.
-
-## Quick Start Summary
-| **Type**        |  **Use Case**    | **Example** | **Preprocessor**               | **Section**              |
-| --------------- | ------------------ | -------------- | --------------------------------- | --------------------------- |
-| ✍️ Completion   | Language modeling  | HellaSwag      | `SFTSingleTurnPreprocessor`       | [Jump](#completion-datasets)  |
-| 🗣️ Instruction  | Question answering | SQuAD          | `make_*` function                 | [Jump](#instruction-datasets) |
-
-## Types of Supported Datasets
-
-NeMo Automodel supports a variety of datasets, depending on the task.
-### Completion Datasets
-
-**Completion datasets** are single text sequences designed for language modeling where the model learns to predict the next token given a context. These datasets typically contain a context (prompt) and a target (completion) that the model should learn to generate.
-
-#### Example: HellaSwag
-
-The [HellaSwag](https://huggingface.co/datasets/rowan/hellaswag) dataset is a popular completion dataset used for commonsense reasoning. It contains situations with multiple-choice endings where the model must choose the most plausible continuation.
-
-**HellaSwag dataset structure:**
-- **Context (`ctx`)**: A situation or scenario description
-- **Endings**: Multiple possible completions (4 options)
-- **Label**: Index of the correct ending
-
-**Example:**
-```
-Context: "A man is sitting at a piano in a large room."
-Endings: [
-  "He starts playing a beautiful melody.",
-  "He eats a sandwich while sitting there.",
-  "He suddenly becomes invisible.",
-  "He transforms into a robot."
-]
-Label: 0  # First ending is correct
-```
-
-#### Preprocessing with SFTSingleTurnPreprocessor
-
-NeMo Automodel provides the `SFTSingleTurnPreprocessor` class to handle completion datasets. This processor:
-
-1. **Extracts context and target** using `get_context()` and `get_target()`.
-2. **Tokenizes and cleans** context and target separately.
-3. **Concatenates** them into one sequence.
-4. **Creates loss mask**: `-100` for context, target IDs for target.
-5. **Pads** sequences to equal length.
-
-
-#### Create Your Own Completion Dataset
-
-To adapt your dataset into this format, define a class like this:
-
-```python
-from datasets import load_dataset
-from nemo_automodel.components.datasets.utils import SFTSingleTurnPreprocessor
-
-class MyCompletionDataset:
-    def __init__(self, path_or_dataset, tokenizer, split="train"):
-        raw_datasets = load_dataset(path_or_dataset, split=split)
-        processor = SFTSingleTurnPreprocessor(tokenizer)
-        self.dataset = processor.process(raw_datasets, self)
-
-    def get_context(self, examples):
-        """Extract context/prompt from your dataset"""
-        return examples["context_field"]  # Replace with your context field
-
-    def get_target(self, examples):
-        """Extract target/completion from your dataset"""
-        return examples["target_field"]   # Replace with your target field
-
-    def __getitem__(self, index):
-        return self.dataset[index]
-
-    def __len__(self):
-        return len(self.dataset)
-```
-
-
-### Instruction Datasets
-
-**Instruction datasets** are question-answer pairs where the model learns to respond to specific instructions or questions. These datasets are structured as context-question pairs with corresponding answers, making them ideal for teaching models to follow instructions and provide accurate responses.
-
-#### Example: SQuAD
-
-The [SQuAD (Stanford Question Answering Dataset)](https://huggingface.co/datasets/rajpurkar/squad) is a popular instruction dataset for reading comprehension. It contains questions based on Wikipedia articles along with their answers.
-
-**SQuAD dataset structure:**
-- **Context**: A paragraph of text from Wikipedia
-- **Question**: A question about the context
-- **Answers**: The correct answer with its position in the context
-
-#### Create Your Own Instruction Dataset
-
-The [`squad.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/squad.py) file contains the implementation for processing the SQuAD dataset into a format suitable for instruction tuning. It defines a dataset class and preprocessing functions that extract the context, question, and answer fields, concatenate them into a prompt-completion format, and apply tokenization, padding, and loss masking. This serves as a template for building custom instruction datasets by following a similar structure and adapting the extraction logic to your dataset's schema.
-
-Based on the SQuAD implementation in `squad.py`, you can create your own instruction dataset using the `make_squad_dataset` pattern:
-
-```python
-from datasets import load_dataset
-
-def make_my_instruction_dataset(
-    tokenizer,
-    seq_length=None,
-    limit_dataset_samples=None,
-    split="train",
-    dataset_name="your-dataset-name",
-):
-    if limit_dataset_samples:
-        split = f"{split}[:{limit_dataset_samples}]"
-
-    dataset = load_dataset(dataset_name, split=split)
-
-    return dataset.map(
-        your_own_fmt_fn,  # Your formatting function
-        batched=False,
-        remove_columns=dataset.column_names,
-    )
-```
-
-## YAML-based Custom Dataset Configuration
-
-NeMo Automodel supports YAML-based dataset specification using the `_target_` key. This lets you reference dataset-building classes or functions using either:
-
-- 1. Python Dotted Path
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
-  path_or_dataset: rowan/hellaswag
-  split: train
-```
-
-- 2. File Path + Function Name
-
-```
-<file-path>:<function-name>
-```
-
-Where:
-- `<file-path>`: The absolute path to a Python file containing your dataset function
-- `<function-name>`: The name of the function to call from that file
-
-```yaml
-dataset:
-  _target_: /path/to/your/custom_dataset.py:build_my_dataset
-  num_blocks: 111
-```
-This will call `build_my_dataset()` from the specified file with the other keys (e.g., `num_blocks`) as arguments. This approach allows you to integrate custom datasets via config alone—no need to alter the codebase or package structure.
-
-
-## Packed Sequence Support in NeMo AutoModel
-NeMo AutoModel supports **packed sequences**, a technique to optimize training with variable-length sequences (e.g., text) by minimizing padding.
-
-### What is a Packed Sequence?
-Instead of padding each sequence to a fixed length (wasting computation on `[PAD]` tokens), packed sequences:
-- Concatenate short sequences into a single continuous sequence.
-- Separate sequences with special tokens (e.g., `[EOS]`).
-- Track lengths via an "attention mask" to prevent cross-sequence information leakage.
-
-### Benefits
-- Reduces redundant computation on padding tokens leading to faster training.
-- Enables larger effective batch sizes leading to better GPU utilization.
-- Especially useful for language modeling and text datasets.
-
-
-### Enable Packed Sequences in NeMo Automodel
-
-To enable packed sequences, add these keys to your recipe's YAML config:
-```yaml
-packed_sequence:
-   # Set packed_sequence_size > 0 to run with packed sequences
-   packed_sequence_size: 1024
-   split_across_pack: False
-```
-
-The `packed_sequence` has two options:
-- **packed_sequence_size**: Defines the total token length of each packed sequence; higher values require more GPU memory.
-- **split_across_pack**: If `True`, a sequence can be split across different packed sequences.
-
-
-## Troubleshooting Tips
-
-- **Tokenization Mismatch?** Ensure your tokenizer aligns with the model's expected inputs.
-- **Dataset too large?** Use `limit_dataset_samples` in your YAML config to load a subset, useful for quick debugging.
-- **Loss not decreasing?** Verify that your loss mask correctly ignores prompt tokens.
diff --git a/fern/versions/v0.4/pages/guides/llm/dataset.mdx b/docs/guides/llm/dataset.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/dataset.mdx
rename to docs/guides/llm/dataset.mdx
diff --git a/docs/guides/llm/dsv4-flash.md b/docs/guides/llm/dsv4-flash.md
deleted file mode 100644
index bd3872fa49..0000000000
--- a/docs/guides/llm/dsv4-flash.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# Fine-Tune DeepSeek V4 Flash
-
-## Introduction
-
-[deepseek-ai/DeepSeek-V4-Flash](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) is the latest fine-grained Mixture-of-Experts language model from DeepSeek. It uses a 43-layer all-MoE backbone (no dense MLP layers) with 256 routed experts plus one shared expert per block and top-6 routing. The architecture introduces a hybrid per-layer attention zoo — Sliding-Window Attention (SWA), Compressed Sparse Attention (CSA, Compressor + Indexer), and Hierarchical Compressed Attention (HCA, Compressor only) — selectable per layer through `compress_ratios`. The first `num_hash_layers` blocks use a hash-clustering gate (`DeepseekV4HashGate`) for token-to-expert routing, and every block maintains `hc_mult=4` Hyper-Connection streams mixed via a learned col-norm-first Sinkhorn router.
-
-This guide walks you through fine-tuning DeepSeek V4 Flash on HellaSwag using NVIDIA NeMo Automodel. You will learn how to configure the recipe, launch training, and inspect the results.
-
-To set up your environment to run NeMo Automodel, follow the [installation guide](https://github.com/NVIDIA-NeMo/Automodel#-install-nemo-automodel).
-
-## Data
-
-### HellaSwag
-
-We use [HellaSwag](https://huggingface.co/datasets/Rowan/hellaswag), a commonsense natural-language-inference dataset consisting of context + four candidate continuations. The version used here is the standard `rowan/hellaswag` HuggingFace split, formatted for next-token-prediction fine-tuning.
-
-- **Train / validation splits** taken directly from the HuggingFace dataset.
-- **Tokenizer**: shared with the base model (`AutoTokenizer.from_pretrained` on the DeepSeek V4 Flash checkpoint).
-- **Padding**: `pad_seq_len_divisible=64` via the default collater.
-
-For the full HellaSwag dataset wrapper used in NeMo Automodel, see [`nemo_automodel.components.datasets.llm.hellaswag.HellaSwag`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/hellaswag.py).
-
-## Architecture Notes
-
-DeepSeek V4 Flash differs from V3 / V3.2 in several load-bearing ways. The state-dict adapter and pipeline-parallel forward in NeMo Automodel handle each of these transparently:
-
-- **Attention**: GQA with a single KV head broadcast to all 64 attention heads, Q-LoRA (`q_lora_rank=1024`), and grouped O-LoRA (`o_lora_rank=1024`, `o_groups=8`) — not MLA. Per-head non-learnable rsqrt on Q after `wq_b` matches the inference reference.
-- **Hybrid attention via `compress_ratios`**:
-  - `compress_ratio=0` → pure SWA with a learned per-head attention sink.
-  - `compress_ratio=4` → CSA: Compressor (overlap mode, pools `2 * ratio` raw tokens per compressed token) plus Indexer (selects top-k compressed positions per query). An explicit additive `[B, 1, S, P_total]` mask enforces per-query causal correctness.
-  - `compress_ratio=128` → HCA: Compressor only (non-overlap pooling), deterministic `p < (q + 1) // ratio` causal mask.
-- **Dual RoPE bases**: `theta=10000` for `compress_ratio==0` layers; `theta=160000` (with YaRN scaling) for `compress_ratio>0` layers, applied to both the main attention Q/KV and the Compressor sub-module on those layers. RoPE is encoded as INTERLEAVED pairs (`view_as_complex` style) to match the released checkpoint.
-- **Hash-routing first layers**: the first `num_hash_layers` (default 3) blocks use a `DeepseekV4HashGate` with a `tid2eid` lookup table. `input_ids` is threaded through the model and the V4-aware pipeline forward; under pipeline parallelism, hash layers live on stage 0 where `input_ids` is available.
-- **Hyper-Connections (HC)**: every block maintains `hc_mult=4` streams of the hidden state. The mixer follows the released `hc_split_sinkhorn` formulas: `pre = sigmoid + eps`, `post = 2 * sigmoid` (no `+eps`), `comb = softmax(dim=-1) + eps` followed by a col-norm-first Sinkhorn (`iters - 1` alternating row/col passes), producing a doubly-stochastic mixing matrix per block.
-- **MoE routing**: `sqrtsoftplus` scoring with `noaux_tc` topk method and clamped SwiGLU on routed experts (`swiglu_limit=10.0`).
-- **Optional MTP layers** via `num_nextn_predict_layers`.
-
-### Checkpoint format
-
-The released DSV4-Flash safetensors mix several quantization formats. The state-dict adapter handles all of them transparently:
-
-- **Routed experts**: FP4 `e2m1fn` packed two values per int8 byte, with per-row 32-col FP8 `e8m0fnu` scales — unpacked on load, re-emitted in matching packed placeholders on `to_hf` so DCP shape/dtype validation lines up with the on-disk layout.
-- **Shared experts + non-expert weights**: standard FP8 `e4m3fn` 128×128 block scales.
-- **Hash layers' gate has no bias on disk**: the adapter reads `num_hash_layers` from the checkpoint's `config.json` and drops the corresponding bias keys before DCP load.
-- **Indexer / Compressor key flattening**: on disk the Indexer sits as a sibling of the Compressor with its own nested compressor (`indexer.compressor.{ape,norm,wgate,wkv}` + `indexer.{wq_b,weights_proj}`); the adapter renames these to land at the flat `compressor.indexer.*` layout.
-
-A new in-tree `HuggingFaceStorageReader` recognizes `F8_E8M0` and `F8_E5M2` dtypes (the upstream reader silently dropped them), restoring DCP metadata on every rank for these checkpoints.
-
-## Launch Training
-
-A ready-to-use recipe ships at [`examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml). The yaml header documents how to scale `num_hidden_layers` and `ep_size` for the full 43-layer multi-node run.
-
-NeMo Automodel supports several ways to launch training — via the Automodel CLI with Slurm, interactive sessions, `torchrun`, and more. For full details on all launch options (Slurm batch jobs, multi-node configuration, environment variables, etc.), see the [Run on a Cluster](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/launcher/slurm.md) guide.
-
-### Standalone Slurm Script
-
-Below is a standalone Slurm script example for the HellaSwag recipe. Before running it, ensure your cluster environment is configured following the [Run on a Cluster](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/launcher/slurm.md) guide. Then submit the job:
-
-```bash
-export TRANSFORMERS_OFFLINE=1
-export HF_HOME=your/path/to/hf_cache
-export HF_DATASETS_OFFLINE=1
-export WANDB_API_KEY=your_wandb_key
-
-srun --output=output.out \
-     --error=output.err \
-     --container-image /your/path/to/automodel.image.sqsh --no-container-mount-home bash -c "
-  CUDA_DEVICE_MAX_CONNECTIONS=1 automodel \
-  examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml \
-  --nproc-per-node=8 \
-  --model.config.pretrained_model_name_or_path=/your/local/dsv4-flash \
-  --model.config.name_or_path=/your/local/dsv4-flash "
-```
-
-**Before you start**:
-- Hugging Face applies rate limits on downloads. We recommend cloning the model repository to your local filesystem beforehand.
-- Ensure your Hugging Face cache (`HF_HOME`) is configured and that the dataset is already cached locally.
-- To enable Weights & Biases logging, set your `WANDB_API_KEY` and configure the `wandb` section in the YAML file.
-- For the full 43-layer schedule, increase `ep_size` (and add `pp_size`) per the cluster you are running on; see the yaml header for guidance.
-
-## Layer-Parity Validation
-
-The bringup was validated against the official DeepSeek inference reference (`dsv4flash/inference/model.py`) by per-tensor dump bisection. On the 4-layer parity harness (`compress_ratios=[0, 0, 4, 128]`, `num_hash_layers=2`, PP=1, EP=8):
-
-- **Final-logits cosine similarity: 0.998 vs reference, top-1 token matches.**
-- Every block cosine similarity ≥ 0.987.
-
-## Training Results
-
-The training loss curve below is from a 43-layer full-finetune run on HellaSwag with the full attention zoo (SWA + CSA + HCA) live.
-
-<p align="center">
-  <img src="https://github.com/user-attachments/assets/b5ed8837-40cb-41c6-8b90-2789e5e872cc" alt="DeepSeek V4 Flash Training Loss Curve" width="600">
-</p>
diff --git a/fern/versions/v0.4/pages/guides/llm/dsv4-flash.mdx b/docs/guides/llm/dsv4-flash.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/dsv4-flash.mdx
rename to docs/guides/llm/dsv4-flash.mdx
diff --git a/docs/guides/llm/finetune.md b/docs/guides/llm/finetune.md
deleted file mode 100644
index f7adb86137..0000000000
--- a/docs/guides/llm/finetune.md
+++ /dev/null
@@ -1,934 +0,0 @@
-# Supervised Fine-Tuning (SFT) and Parameter-Efficient Fine-Tuning (PEFT)
-
-## Introduction
-
-Pretrained language models are general-purpose: they know a lot about language but nothing about your particular domain, terminology, or task. Fine-tuning bridges that gap — you fine-tune the model on your own examples so it produces answers that are accurate and relevant for your use case, without the cost of training a model from scratch. The result is a model optimized for your data that you can evaluate, publish, and deploy. This guide walks you through that process end-to-end with NeMo AutoModel — from installation through training, evaluation, and deployment — using [Meta LLaMA 3.2 1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and the [SQuAD v1.1](https://huggingface.co/datasets/rajpurkar/squad) dataset as a running example.
-
-NeMo AutoModel supports two fine-tuning modes:
-
-- **Supervised Fine-Tuning (SFT)** updates all model parameters. Use SFT when you need maximum accuracy and have sufficient compute.
-- **Parameter-Efficient Fine-Tuning (PEFT)** using [LoRA](https://arxiv.org/abs/2106.09685) freezes the base model and trains small low-rank adapters. PEFT reduces trainable parameters to less than 1% of the original model, lowering memory and storage costs.
-
-### Workflow Overview
-
-```text
-┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐
-│ 1. Install   │--->│ 2. Configure │--->│  3. Train    │--->│ 4. Inference │--->│ 5. Evaluate  │--->│ 6. Publish   │--->│  7. Deploy   │
-│              │    │              │    │              │    │              │    │              │    │  (optional)  │    │  (optional)  │
-│ pip install  │    │ YAML config  │    │ automodel CLI│    │ HF generate  │    │ Val loss +   │    │ HF Hub       │    │ vLLM serving │
-│ or Docker    │    │ Choose SFT   │    │ or torchrun  │    │ API          │    │ lm-eval-     │    │ upload       │    │              │
-│              │    │ or PEFT      │    │              │    │              │    │ harness      │    │              │    │              │
-└──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘
-```
-
-| Step | Section | SFT | PEFT |
-|------|---------|-----|------|
-| **1. Install** | [Install NeMo AutoModel](#install-nemo-automodel) | Same | Same |
-| **2. Configure** | [Configure Your Training Recipe](#configure-your-training-recipe) | YAML without `peft:` section | YAML with `peft:` section |
-| **3. Train** | [Fine-Tune the Model](#fine-tune-the-model) | Same command for both modes | Same command for both modes |
-| **4. Inference** | [Run Inference](#run-inference) | Load consolidated checkpoint directly | Load base model + adapter |
-| **5. Evaluate** | [Evaluate the Fine-Tuned Model](#evaluate-the-fine-tuned-model) | Validation loss during training; lm-eval-harness post-training | Same |
-| **6. Publish** | [Publish to HF Hub](#publish-to-the-hugging-face-hub) | Upload `model/consolidated/` | Upload `model/` (adapter only) |
-| **7. Deploy** | [Deploy with vLLM](#deploy-with-vllm) | `vllm.LLM(model=...)` | `vLLMHFExporter` with `--lora-model` |
-
-## Install NeMo AutoModel
-
-```bash
-pip3 install nemo-automodel
-```
-
-Alternatively, if you run into dependency or driver issues, use the pre-built Docker container:
-
-```bash
-docker pull nvcr.io/nvidia/nemo-automodel:26.02.00
-docker run --gpus all -it --rm --shm-size=8g -v $(pwd)/checkpoints:/tmp/checkpoints/ nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-:::{important}
-Docker containers are ephemeral — files written inside the container are lost when it stops. The `-v` flag in the `docker run` command above bind-mounts a local `checkpoints/` directory into the container so that saved checkpoints persist across runs. For more details, see [Saving Checkpoints When Using Docker](../checkpointing.md#saving-checkpoints-when-using-docker).
-:::
-
-For the full set of installation methods, see the [installation guide](../installation.md).
-
-## Configure Your Training Recipe
-
-
-Training is configured through a [YAML](https://en.wikipedia.org/wiki/YAML) config file with three required sections — **model**, **dataset**, and **step_scheduler** — plus an optional **peft** section. The sections below walk through each one. For the complete copy-pastable file, see [Full Config YAML](#full-config-yaml).
-
-Under the hood, both SFT and PEFT are executed by a **recipe**: a self-contained Python class that wires together model loading, dataset preparation, training, checkpointing, and logging. The fine-tuning recipe is [`TrainFinetuneRecipeForNextTokenPrediction`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py). The config file tells the recipe *what* to build; the recipe decides *how* to build it.
-
-:::{dropdown} How the Config System Works
-
-NeMo AutoModel configs use a convention borrowed from [Hydra](https://hydra.cc/): the special **`_target_`** key tells the framework *which* Python class or function to call, and **every other key** in the same YAML block is passed as a keyword argument to that call. For example:
-
-```yaml
-optimizer:
-  _target_: torch.optim.Adam
-  lr: 1.0e-5
-  weight_decay: 0
-```
-
-is equivalent to writing this Python code:
-
-```python
-from torch.optim import Adam
-
-optimizer = Adam(lr=1.0e-5, weight_decay=0)
-```
-
-The `_target_` value is a **dotted Python import path**: the same string you would use in an `import` statement. The framework resolves it at runtime by importing the module and looking up the attribute. This means you can point `_target_` at any class constructor or factory function, and the remaining keys become its arguments.
-
-```{tip}
-To discover which parameters a section accepts, look up the Python signature of its `_target_`. For instance, `torch.optim.Adam` accepts `lr`, `betas`, `eps`, and `weight_decay` — those are the keys you can set in the YAML.
-```
-
-**From YAML to running code.** Here is the path a config takes through the framework:
-
-```text
-finetune_config.yaml
-        │
-        ▼
-  ┌──────────────┐     load_yaml_config() parses the file into
-  │  ConfigNode  │◄─── a tree of ConfigNode objects, one per
-  └──────┬───────┘     YAML section.
-         │
-         ▼
-  ┌──────────────┐     The recipe's setup() method reads
-  │   Recipe     │◄─── each section from the ConfigNode tree
-  │   setup()    │     and passes it to the matching builder.
-  └──────┬───────┘
-         │
-    ┌────┴─────────────────────────────────┐
-    ▼            ▼            ▼            ▼
-build_model  build_optimizer build_dataloader build_loss_fn ...
-    │            │            │            │
-    ▼            ▼            ▼            ▼
-cfg.model     cfg.optimizer cfg.dataset   cfg.loss_fn
- .instantiate() .instantiate() .instantiate() .instantiate()
-    │            │            │            │
-    ▼            ▼            ▼            ▼
- Resolves      Resolves     Resolves     Resolves
- _target_,     _target_,    _target_,    _target_,
- calls it      calls it     calls it     calls it
- with kwargs   with kwargs  with kwargs  with kwargs
-```
-
-Each builder function calls **`.instantiate()`** on its config section. `.instantiate()` does two things:
-
-1. **Resolves `_target_`** — imports the Python path and obtains the callable (class or function).
-2. **Calls it** — passes every other key in the section as a keyword argument.
-
-Nested `_target_` blocks (like `collate_fn` inside `dataloader`) are recursively instantiated the same way.
-
-**The `recipe` key.** Every config file includes a top-level `recipe` key that tells the CLI *which recipe class* to run. You can write it as a **short name** or as a **fully-qualified Python path** — both resolve to the same class:
-
-```yaml
-# Short name (the CLI looks up the class automatically)
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-# Fully-qualified path (used as-is)
-recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-```
-
-The short name form is a convenience — the CLI scans all recipe modules under `nemo_automodel.recipes` and matches the bare class name. If you invoke the recipe script directly with `torchrun` instead of the `automodel` CLI, the `recipe` key is not required because the script itself *is* the recipe.
-
-**Not every section uses `_target_`.** Some sections like `step_scheduler`, `distributed`, and `checkpoint` are plain key-value groups consumed directly by the recipe — they control training schedule, parallelism strategy, and checkpoint behavior without instantiating a Python object.
-:::
-
-### Model
-
-```yaml
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-```
-
-| Key | Role |
-|-----|------|
-| `_target_` | Points to [`NeMoAutoModelForCausalLM.from_pretrained`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/_transformers/auto_model.py) — a factory method that downloads (or loads from cache) a pretrained Hugging Face model and wraps it with NeMo distributed-training support. |
-| `pretrained_model_name_or_path` | A keyword argument to `from_pretrained`. Any argument that [`from_pretrained`](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained) accepts can be added here (e.g. `cache_dir`, `torch_dtype`). |
-
-This guide uses **Meta Llama 3.2 1B** as a running example. Replace `pretrained_model_name_or_path` with any supported [Hugging Face model ID](../../model-coverage/llm/index.md).
-
-:::{dropdown} About Llama 3.2 1B
-Llama is a family of decoder-only transformer models developed by Meta. The 1B variant is a compact model suitable for research and edge deployment, featuring RoPE positional embeddings, grouped-query attention (GQA), and SwiGLU activations.
-:::
-
-:::{dropdown} Accessing gated models
-Some Hugging Face models are **gated**. If the model page shows a "Request access" button:
-
-1. Log in with your Hugging Face account and accept the license.
-2. Ensure the token you use (from `huggingface-cli login` or `HF_TOKEN`) belongs to the approved account.
-
-Pulling a gated model without an authorized token triggers a 403 error.
-:::
-
-### Dataset
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad  # HF-Hub ID used to pull the dataset
-  split: train
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-```
-
-| Key | Role |
-|-----|------|
-| `_target_` | Points to [`make_squad_dataset`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/squad.py) — a factory function that downloads the SQuAD dataset, tokenizes it, and returns a `torch.utils.data.Dataset`. To use a different dataset, change `_target_` to a different factory function (see [Integrate Your Own Text Dataset](dataset.md)). |
-| `dataset_name`, `split` | Keyword arguments passed to `make_squad_dataset`. Each dataset factory defines its own parameters — check the function signature to see what's available. |
-
-This guide uses **SQuAD v1.1** as a running example. Swap the dataset by changing `_target_` and the dataset arguments — see [Integrate Your Own Text Dataset](dataset.md) and [Dataset Overview](../dataset-overview.md).
-
-:::{dropdown} About SQuAD v1.1
-The Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset where each example consists of a Wikipedia passage, a question, and a span answer. SQuAD v1.1 guarantees all questions are answerable from the context, making it suitable for straightforward fine-tuning.
-
-Example:
-```json
-{
-    "context": "Architecturally, the school has a Catholic character. ...",
-    "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?",
-    "answers": { "text": ["Saint Bernadette Soubirous"], "answer_start": [515] }
-}
-```
-:::
-
-### PEFT (Optional)
-
-```yaml
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules: "*.proj"  # glob pattern matching linear layer FQNs
-  dim: 8                    # low-rank dimension of the adapters
-  alpha: 32                 # scaling factor for learned weights
-```
-
-| Key | Role |
-|-----|------|
-| `_target_` | Points to [`PeftConfig`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/_peft/lora.py) — a dataclass that describes which layers to adapt and how. Unlike the model and dataset sections, this instantiation produces a *config object*, not the adapter itself. The recipe passes the resulting `PeftConfig` into `build_model`, which applies LoRA adapters to the model. |
-| `target_modules` | A glob pattern matched against fully-qualified layer names (e.g. `"*.proj"` matches every layer whose name ends in `proj`). |
-| `dim` | The low-rank dimension *r* — controls adapter capacity. Larger values learn more but use more memory. |
-| `alpha` | Scaling factor applied to the adapter output (`alpha / dim`). Higher values give adapters more influence during training. |
-
-Including a `peft:` section enables LoRA fine-tuning. Remove it entirely to run SFT instead — see [Switching Between SFT and PEFT](#switching-between-sft-and-peft).
-
-#### QLoRA (Quantized Low-Rank Adaptation)
-
-If GPU memory is a constraint, [QLoRA](https://arxiv.org/abs/2305.14314) combines LoRA with 4-bit NormalFloat (NF4) quantization to reduce memory usage by up to 75% compared to full-parameter SFT in 16-bit precision, while maintaining comparable quality to standard LoRA.
-
-To enable QLoRA, add a `quantization:` section alongside the `peft:` section in your config. Note two differences from the standard PEFT config above: `target_modules` uses the broader `"*_proj"` pattern to apply LoRA to all projection layers (wider coverage compensates for precision loss from 4-bit weights), and `dim` is increased from 8 to 16 for additional adapter capacity.
-
-```yaml
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules: "*_proj"  # broader glob than "*.proj" to cover all projection layers
-  dim: 16                   # LoRA rank (higher than default to offset quantization)
-  alpha: 32                # scaling factor
-  dropout: 0.1             # LoRA dropout rate
-
-quantization:
-  load_in_4bit: True                   # enable 4-bit quantization
-  load_in_8bit: False                  # use 4-bit, not 8-bit
-  bnb_4bit_compute_dtype: bfloat16     # compute dtype
-  bnb_4bit_use_double_quant: True      # double quantization for extra savings
-  bnb_4bit_quant_type: nf4             # NormalFloat quantization type
-  bnb_4bit_quant_storage: bfloat16     # storage dtype for quantized weights
-```
-
-### Training Schedule
-
-```yaml
-step_scheduler:
-  num_epochs: 1     # Will train over the dataset once.
-```
-
-Unlike the sections above, `step_scheduler` has **no `_target_`** — it is not instantiated into a Python object. Instead, the recipe reads its keys directly to control the training loop (how many epochs to run, when to checkpoint, when to validate). This is typical of sections that configure *behavior* rather than *components*.
-
-All other settings (distributed strategy, optimizer, checkpointing, logging) use sensible defaults. See the [Full Configuration Reference](#full-configuration-reference) to customize them.
-
-### Full Config YAML
-
-:::{dropdown} finetune_config.yaml (click to expand)
-Save as `finetune_config.yaml`. This config runs PEFT (LoRA). To run SFT instead, remove the `peft:` section. For production-ready examples, see the hosted configs: [Llama 3.2 1B SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml) and [Llama 3.2 1B PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml).
-
-```yaml
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules: "*.proj"
-  dim: 8
-  alpha: 32
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-
-step_scheduler:
-  num_epochs: 1
-```
-:::
-
-## Fine-Tune the Model
-
-You can run the recipe using the AutoModel CLI or directly with `torchrun` (advanced).
-
-```bash
-automodel --nproc-per-node=8 finetune_config.yaml
-```
-
-The `--nproc-per-node=8` flag specifies the number of GPUs to use per node. Adjust the value to match your hardware (for a single GPU, omit the `--nproc-per-node` option).
-
-### Invoke the Recipe Script Directly (advanced)
-
-Alternatively, you can invoke the recipe [script](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py) directly using [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html), as shown below.
-
-``` bash
-torchrun --nproc-per-node=8 nemo_automodel/recipes/llm/train_ft.py -c finetune_config.yaml
-```
-
-### Sample Output
-Running the recipe using either the `automodel` app or by directly invoking the recipe script should produce
-the following log:
-```
-$ automodel finetune_config.yaml
-INFO:nemo_automodel.cli.app:Config: finetune_config.yaml
-INFO:nemo_automodel.cli.app:Recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-INFO:nemo_automodel.cli.app:Launching job interactively (local)
-cfg-path: finetune_config.yaml
-INFO:root:step 4 | epoch 0 | loss 1.5514 | grad_norm 102.0000 | mem: 11.66 GiB | tps 6924.50
-INFO:root:step 8 | epoch 0 | loss 0.7913 | grad_norm 46.2500 | mem: 14.58 GiB | tps 9328.79
-Saving checkpoint to checkpoints/epoch_0_step_10
-INFO:root:step 12 | epoch 0 | loss 0.4358 | grad_norm 23.8750 | mem: 15.48 GiB | tps 9068.99
-INFO:root:step 16 | epoch 0 | loss 0.2057 | grad_norm 12.9375 | mem: 16.47 GiB | tps 9148.28
-INFO:root:step 20 | epoch 0 | loss 0.2557 | grad_norm 13.4375 | mem: 12.35 GiB | tps 9196.97
-Saving checkpoint to checkpoints/epoch_0_step_20
-INFO:root:[val] step 20 | epoch 0 | loss 0.2469
-```
-
-Each log line reports the current loss, gradient norm, peak GPU memory, and tokens per second (TPS). Small fluctuations between steps (e.g., 0.2057 to 0.2557 above) are normal — look at the overall downward trend rather than individual values.
-
-### Checkpoint Contents
-
-Checkpoints are saved in native Hugging Face format, so no conversion is required — they work directly with Transformers, PEFT, vLLM, lm-eval-harness, and other tools in the Hugging Face ecosystem. SFT and PEFT produce different checkpoint layouts. **SFT checkpoints** contain the full model weights at `model/consolidated/` — a single, self-contained Hugging Face model directory created by gathering distributed shards into one location — and can be loaded directly. **PEFT checkpoints** contain only the adapter weights (~MBs instead of GBs) — at inference time you must load the original base model and apply the adapter on top. This distinction affects every downstream step (inference, publishing, deployment).
-
-:::{dropdown} Checkpoint directory structure
-**SFT checkpoint:**
-```bash
-$ tree checkpoints/epoch_0_step_10/
-checkpoints/epoch_0_step_10/
-├── config.yaml
-├── dataloader.pt
-├── model
-│   ├── consolidated
-│   │   ├── config.json
-│   │   ├── model-00001-of-00001.safetensors
-│   │   ├── model.safetensors.index.json
-│   │   ├── special_tokens_map.json
-│   │   ├── tokenizer.json
-│   │   ├── tokenizer_config.json
-│   │   └── generation_config.json
-│   ├── shard-00001-model-00001-of-00001.safetensors
-│   └── shard-00002-model-00001-of-00001.safetensors
-├── optim
-│   ├── __0_0.distcp
-│   └── __1_0.distcp
-├── rng.pt
-└── step_scheduler.pt
-
-4 directories, 11 files
-```
-
-**PEFT checkpoint:**
-```bash
-$ tree checkpoints/epoch_0_step_10/
-checkpoints/epoch_0_step_10/
-├── dataloader.pt
-├── config.yaml
-├── model
-│   ├── adapter_config.json
-│   ├── adapter_model.safetensors
-│   └── automodel_peft_config.json
-├── optim
-│   ├── __0_0.distcp
-│   └── __1_0.distcp
-├── rng.pt
-└── step_scheduler.pt
-
-2 directories, 8 files
-```
-:::
-
-## Run Inference
-
-Inference uses the Hugging Face `generate` API. Because SFT checkpoints are self-contained while PEFT checkpoints store only adapter weights (see [Checkpoint Contents](#checkpoint-contents)), the loading procedure differs between the two modes.
-
-### SFT Inference
-
-The SFT checkpoint at `model/consolidated/` is a complete Hugging Face model and can be loaded directly:
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-ckpt_path = "checkpoints/epoch_0_step_10/model/consolidated"
-tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
-model = AutoModelForCausalLM.from_pretrained(ckpt_path)
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-
-prompt = (
-    "Context: Architecturally, the school has a Catholic character. "
-    "Atop the Main Building's gold dome is a golden statue of the Virgin Mary. "
-    "Immediately in front of the Main Building and facing it, is a copper statue of Christ "
-    "with arms upraised with the legend 'Venite Ad Me Omnes'.\n\n"
-    "Question: What is atop the Main Building?\n\n"
-    "Answer:"
-)
-inputs = tokenizer(prompt, return_tensors="pt").to(device)
-output = model.generate(**inputs, max_new_tokens=50)
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-### PEFT Inference
-
-PEFT adapters must be loaded on top of the base model:
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
-
-base_model_name = "meta-llama/Llama-3.2-1B"
-tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-model = AutoModelForCausalLM.from_pretrained(base_model_name)
-
-adapter_path = "checkpoints/epoch_0_step_10/model/"
-model = PeftModel.from_pretrained(model, adapter_path)
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-
-prompt = (
-    "Context: Architecturally, the school has a Catholic character. "
-    "Atop the Main Building's gold dome is a golden statue of the Virgin Mary. "
-    "Immediately in front of the Main Building and facing it, is a copper statue of Christ "
-    "with arms upraised with the legend 'Venite Ad Me Omnes'.\n\n"
-    "Question: What is atop the Main Building?\n\n"
-    "Answer:"
-)
-inputs = tokenizer(prompt, return_tensors="pt").to(device)
-output = model.generate(**inputs, max_new_tokens=50)
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-## Evaluate the Fine-Tuned Model
-
-### During Training: Validation Loss
-
-The recipe automatically computes validation loss at the interval set by `val_every_steps`. Look for `[val]` lines in the training log:
-
-```text
-INFO:root:[val] step 20 | epoch 0 | loss 0.2469
-```
-
-A decreasing validation loss across checkpoints indicates the model is learning. If validation loss plateaus or increases while training loss continues to drop, the model may be overfitting — consider stopping earlier or reducing the learning rate.
-
-### After Training: lm-eval-harness
-
-For task-specific benchmarks (e.g., MMLU, GSM8k, HellaSwag accuracy), use [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) with the fine-tuned checkpoint:
-
-```bash
-pip install lm-eval
-
-# SFT checkpoint (using vLLM backend for faster evaluation)
-lm_eval --model vllm \
-  --model_args pretrained=checkpoints/epoch_0_step_20/model/consolidated/ \
-  --tasks hellaswag \
-  --batch_size auto
-
-# PEFT adapter (using Hugging Face backend with built-in PEFT support)
-lm_eval --model hf \
-  --model_args pretrained=meta-llama/Llama-3.2-1B,peft=checkpoints/epoch_0_step_20/model/ \
-  --tasks hellaswag \
-  --batch_size auto
-```
-
-:::{tip}
-The SFT example uses the `vllm` backend for faster evaluation (requires `pip install vllm`; see [Deploy with vLLM](#deploy-with-vllm) for setup details). The PEFT example uses the `hf` backend with lm-eval's built-in PEFT support to load the adapter on top of the base model.
-:::
-
-:::{tip}
-Run lm-eval-harness on the base model *before* fine-tuning to establish a baseline, then compare against the fine-tuned checkpoint.
-:::
-
-## Publish to the Hugging Face Hub
-
-Fine-tuned checkpoints and PEFT adapters are stored in Hugging Face-native format and can be uploaded directly to the Hub.
-
-1. Install the Hugging Face Hub library (if not already installed):
-
-```bash
-pip3 install huggingface_hub
-```
-
-2. Log in to Hugging Face:
-
-```bash
-huggingface-cli login
-```
-
-3. Upload:
-
-**SFT checkpoint:**
-```python
-from huggingface_hub import HfApi
-
-api = HfApi()
-api.upload_folder(
-    folder_path="checkpoints/epoch_0_step_10/model/consolidated",
-    repo_id="your-username/llama3.2_1b-finetuned-squad",
-    repo_type="model",
-)
-```
-
-**PEFT adapter:**
-```python
-from huggingface_hub import HfApi
-
-api = HfApi()
-api.upload_folder(
-    folder_path="checkpoints/epoch_0_step_10/model",
-    repo_id="your-username/llama3.2_1b-lora-squad",
-    repo_type="model",
-)
-```
-
-Once uploaded, load the checkpoint or adapter directly from the Hub:
-
-**SFT:**
-```python
-from transformers import AutoModelForCausalLM
-
-model = AutoModelForCausalLM.from_pretrained("your-username/llama3.2_1b-finetuned-squad")
-```
-
-**PEFT:**
-```python
-from transformers import AutoModelForCausalLM
-from peft import PeftModel
-
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
-model = PeftModel.from_pretrained(model, "your-username/llama3.2_1b-lora-squad")
-```
-
-## Deploy with vLLM
-
-[vLLM](https://github.com/vllm-project/vllm) is an efficient inference engine for production deployment of LLMs.
-
-:::{note}
-Make sure vLLM is installed (`pip install vllm`, or use an environment that includes it).
-:::
-
-### SFT Checkpoint with vLLM
-
-```python
-from vllm import LLM, SamplingParams
-
-llm = LLM(model="checkpoints/epoch_0_step_10/model/consolidated/", model_impl="transformers")
-params = SamplingParams(max_tokens=20)
-outputs = llm.generate("Toronto is a city in Canada.", sampling_params=params)
-print(f"Generated text: {outputs[0].outputs[0].text}")
-```
-```text
->>> Generated text:  It is the capital of Ontario. Toronto is a global hub for cultural tourism. The City of Toronto
-```
-
-### PEFT Adapter with vLLM
-
-PEFT adapter serving uses the `vLLMHFExporter` class, which is provided by the `nemo` package — a separate dependency from `nemo-automodel`.
-
-:::{important}
-Install both packages before proceeding:
-```bash
-pip install nemo vllm
-```
-:::
-
-```python
-from nemo.export.vllm_hf_exporter import vLLMHFExporter
-
-if __name__ == '__main__':
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model', required=True, type=str, help="Local path of the base model")
-    parser.add_argument('--lora-model', required=True, type=str, help="Local path of the LoRA adapter")
-    args = parser.parse_args()
-
-    lora_model_name = "lora_model"
-
-    exporter = vLLMHFExporter()
-    exporter.export(model=args.model, enable_lora=True)
-    exporter.add_lora_models(lora_model_name=lora_model_name, lora_model=args.lora_model)
-
-    print("vLLM Output: ", exporter.forward(input_texts=["How are you doing?"], lora_model_name=lora_model_name))
-```
-
-## Full Configuration Reference
-
-This section documents all available config fields for the fine-tuning recipe. For the quick-start config, see [Configure Your Training Recipe](#configure-your-training-recipe).
-
-### Switching Between SFT and PEFT
-
-The `peft:` section controls which mode runs:
-
-| Mode | What to do in the YAML |
-|------|----------------------|
-| **PEFT (LoRA)** | Include the `peft:` section as shown below. |
-| **SFT (full-parameter)** | Remove or comment out the `peft:` section entirely. |
-
-All other config sections remain the same for both modes.
-
-### Full Configuration
-
-:::{dropdown} Full Config
-:open:
-```yaml
-# Recipe
-# Selects which recipe class runs the training loop.
-# Use a short name (auto-discovered) or a fully-qualified Python path:
-#   recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-# Training Schedule
-# Controls epoch count, batch sizes, and how often to checkpoint / validate.
-# No _target_ — these are plain values read directly by the recipe.
-step_scheduler:
-  grad_acc_steps: 4       # number of micro-batches accumulated before each optimizer
-                          # step. Effective batch = grad_acc_steps × batch_size.
-  ckpt_every_steps: 10    # save a checkpoint every N gradient steps
-  val_every_steps: 10     # run the validation loop every N gradient steps
-  num_epochs: 1           # how many full passes over the training dataset
-
-# Process Group
-# Initializes the PyTorch distributed process group.
-# No _target_ — consumed directly by the recipe.
-# You normally would not need to tune this.
-dist_env:
-  backend: nccl           # communication backend: "nccl" (GPU, recommended) or "gloo" (CPU)
-  timeout_minutes: 1      # timeout for collective operations; increase for large models
-                          # that take longer to initialize
-
-# Distributed Strategy
-# Determines how model weights, data, and compute are split across GPUs.
-# No _target_ — consumed directly by the recipe.
-# See "Distributed Training: TP, PP, CP, and EP" in Advanced Topics for details.
-distributed:
-  strategy: fsdp2         # parallelism strategy: "fsdp2" (recommended), "megatron_fsdp",
-                          # or "ddp". FSDP2 shards parameters and optimizer states across
-                          # the data-parallel group.
-  dp_size: null           # data-parallel group size. null = auto-detect from
-                          # world_size ÷ (tp_size × cp_size × pp_size).
-  tp_size: 1              # tensor-parallel size: splits weight matrices across GPUs.
-                          # Set to 2, 4, or 8 if the model doesn't fit on one GPU.
-                          # Should divide evenly into the number of attention heads.
-  cp_size: 1              # context-parallel size: splits the input sequence across GPUs.
-                          # Increase for very long contexts (e.g. 32k+ tokens).
-  sequence_parallel: false # when true, extends TP to also shard activations along
-                          # the sequence dimension for additional memory savings
-
-# Random Number Generator
-# _target_ → StatefulRNG: a checkpointable RNG that ensures identical sequences
-# across training restarts. Seed and ranked are kwargs to StatefulRNG().
-rng:
-  _target_: nemo_automodel.components.training.rng.StatefulRNG
-  seed: 1111              # global random seed for reproducibility
-  ranked: true            # when true, each GPU rank gets a unique RNG stream derived
-                          # from the seed, so data shuffling differs per GPU
-
-# Model
-# _target_ → NeMoAutoModelForCausalLM.from_pretrained: downloads (or loads from
-# cache) a pretrained HuggingFace model and wraps it with NeMo distributed-training
-# support. Any from_pretrained kwarg is accepted (cache_dir, torch_dtype, etc.).
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-# PEFT (remove / comment this entire section for full-parameter SFT)
-# _target_ → PeftConfig: a dataclass describing which layers get LoRA adapters.
-# The recipe passes this config into build_model(), which attaches adapters
-# to the matching layers.
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules: "*.proj" # glob pattern matched against fully-qualified layer names;
-                           # "*.proj" matches every layer ending in "proj"
-  dim: 8                   # low-rank dimension r — controls adapter capacity.
-                           # Larger values are more expressive but use more memory.
-  alpha: 32                # LoRA scaling factor: adapter output is scaled by alpha/dim.
-                           # Higher values give adapters more influence during training.
-  use_triton: True         # use an optimized Triton kernel for LoRA forward/backward
-                           # (requires the triton package)
-
-# Checkpointing
-# No _target_ — plain key-value group consumed by the recipe.
-checkpoint:
-  enabled: true            # set to false to skip saving checkpoints entirely
-  checkpoint_dir: checkpoints/  # output directory. Docker users: bind-mount this path
-                                # (e.g. -v $(pwd)/checkpoints:/workspace/checkpoints)
-                                # to persist checkpoints across container restarts.
-  model_save_format: safetensors  # "safetensors" (recommended, faster and safer) or
-                                  # "torch_save" (legacy pickle-based format)
-  save_consolidated: True  # when true, writes a single HuggingFace-compatible checkpoint
-                           # to model/consolidated/ that can be loaded directly by
-                           # Transformers, vLLM, etc. Requires safetensors format.
-
-# Training Dataset
-# _target_ → make_squad_dataset: a factory function that downloads the SQuAD
-# dataset, tokenizes it, and returns a torch Dataset. To use a different dataset,
-# change _target_ to another factory function (see the dataset guide).
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad  # HuggingFace Hub dataset ID
-  split: train                   # which split to use (train, validation, test)
-
-# Validation Dataset
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-  limit_dataset_samples: 64  # cap validation set to 64 samples for faster eval loops;
-                             # remove this line to use the full validation set
-
-# Training Dataloader
-# _target_ → StatefulDataLoader: a checkpointable DataLoader from torchdata that
-# saves and restores iteration state across training restarts, so resumed runs
-# don't re-process already-seen batches.
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-                               # function that pads and batches individual samples
-                               # into tensors; can be swapped for custom collation
-  batch_size: 8                # samples per micro-batch per GPU
-  shuffle: true                # whether to shuffle the dataset each epoch
-
-# Validation Dataloader
-validation_dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-  batch_size: 8
-
-# Loss Function
-# _target_ → MaskedCrossEntropy: standard cross-entropy loss that automatically
-# ignores padding tokens so they don't affect the gradient.
-# Other available loss functions (swap _target_ to use):
-#   - nemo_automodel.components.loss.chunked_ce.ChunkedCrossEntropy
-#       Computes CE in chunks along the sequence dimension to reduce peak memory.
-#       Useful for very long sequences. Accepts chunk_len (default 32).
-#   - nemo_automodel.components.loss.linear_ce.FusedLinearCrossEntropy
-#       Fuses the final linear projection (lm_head) with the CE computation,
-#       avoiding the full logit tensor. Significant **memory savings** for large vocabs.
-#   - nemo_automodel.components.loss.te_parallel_ce.TEParallelCrossEntropy
-#       TE-based parallel CE with a Triton kernel. Designed for tensor-parallel
-#       setups where logits are sharded across TP ranks.
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-# Optimizer
-# _target_ → torch.optim.Adam: any torch.optim class can be used here (e.g.
-# AdamW, SGD). All remaining keys become kwargs to the constructor.
-optimizer:
-  _target_: torch.optim.Adam
-  lr: 1.0e-5               # learning rate — the most important hyperparameter to tune
-  betas: [0.9, 0.999]      # Adam momentum coefficients (β₁ for mean, β₂ for variance)
-  eps: 1e-8                 # small constant added to the denominator for numerical stability
-  weight_decay: 0           # L2 regularization strength (0 = no regularization)
-
-# Logging (optional)
-# Uncomment to enable Weights & Biases experiment tracking.
-# wandb:
-#   project: <your_wandb_project>    # W&B project name
-#   entity: <your_wandb_entity>      # W&B team or username
-#   name: <your_wandb_exp_name>      # display name for this run
-#   save_dir: <your_wandb_save_dir>  # local directory for W&B artifacts
-```
-:::
-
-### Config Field Reference
-
-| Section | Required? | What to change |
-|---------|-----------|----------------|
-| `model` | Yes | Set `pretrained_model_name_or_path` to your Hugging Face model ID. Source: [`auto_model.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/_transformers/auto_model.py). |
-| `peft` | PEFT only | Remove entirely for SFT. Adjust `dim` and `alpha` to tune adapter capacity. `use_triton: True` enables an optimized LoRA kernel (requires the `triton` package). For reduced memory usage, see [QLoRA](#qlora-quantized-low-rank-adaptation). Source: [`lora.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/_peft/lora.py). |
-| `dataset` | Yes | Change `_target_`, `dataset_name`, and `split` for your data. Source: [`squad.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/squad.py). |
-| `dataloader` | Optional | Adjust `batch_size` and `shuffle`. Uses [`StatefulDataLoader`](https://meta-pytorch.org/data/main/torchdata.stateful_dataloader.html) for checkpointable iteration. Collation: [`utils.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/utils.py). |
-| `loss_fn` | Optional | Default is [`MaskedCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/masked_ce.py). Alternatives: [`ChunkedCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/chunked_ce.py) (long sequences), [`FusedLinearCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/linear_ce.py) (large vocabs), [`TEParallelCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/te_parallel_ce.py) (tensor-parallel). |
-| `rng` | Optional | Controls reproducibility. Source: [`rng.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/training/rng.py). |
-| `step_scheduler` | Yes | `grad_acc_steps` sets how many micro-batches accumulate per gradient step. `ckpt_every_steps` and `val_every_steps` are counted in gradient steps. |
-| `distributed` | Yes | `dp_size: null` means auto-detect from world size. Adjust `tp_size` for tensor parallelism across GPUs. |
-| `checkpoint` | Recommended | Set `checkpoint_dir` to a persistent path, especially in Docker. |
-| `optimizer` | Optional | Defaults are reasonable. Any `torch.optim` class can be substituted via `_target_`. |
-| `wandb` | Optional | Uncomment and configure to enable Weights & Biases logging. |
-
-For the fine-tuning recipe itself, see [`train_ft.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py). For more example configs, browse [`examples/llm_finetune/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune).
-
-## Distributed Training: TP, PP, CP, and EP
-
-The `distributed:` section controls how the model and data are split across GPUs. NeMo AutoModel supports five parallelism dimensions, each of which slices the workload differently:
-
-| Dimension | Key | What it shards | When to use |
-|-----------|-----|---------------|-------------|
-| **Data Parallel (DP)** | `dp_size` | Replicates the model on each group of GPUs; each replica trains on a different data batch. | Default. Scales batch size linearly with GPU count. |
-| **Tensor Parallel (TP)** | `tp_size` | Splits individual weight matrices (attention, MLP) across GPUs within a node. | Model is too large to fit on a single GPU, or you want to reduce per-GPU memory at the cost of extra communication. |
-| **Pipeline Parallel (PP)** | `pp_size` | Assigns different *layers* (stages) to different GPUs and pipelines micro-batches through them. | Very deep models that don't fit even with TP, or multi-node training where TP's all-reduce is too expensive across nodes. |
-| **Context Parallel (CP)** | `cp_size` | Splits the input *sequence* across GPUs so each GPU processes a portion of the context. | Very long sequences that exceed single-GPU memory. |
-| **Expert Parallel (EP)** | `ep_size` | Distributes MoE experts across GPUs so each GPU holds a subset of experts. | Mixture-of-Experts models only. |
-
-These dimensions compose with each other. The relationship between them and total GPU count is:
-
-```text
-world_size = pp_size × dp_size × cp_size × tp_size
-```
-
-When `dp_size` is set to `null` (the default), it is inferred automatically:
-
-```text
-dp_size = world_size ÷ (tp_size × cp_size × pp_size)
-```
-
-EP does not appear in this formula — experts are distributed across the DP×CP rank groups, with the constraint that `(dp_size × cp_size)` must be divisible by `ep_size`.
-
-#### Data Parallel (default)
-
-Data parallelism is the default. With `strategy: fsdp2`, FSDP2 shards both model parameters and optimizer states across the DP group, so memory usage shrinks as you add GPUs:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null   # auto-detected from world_size ÷ (tp × cp × pp)
-  tp_size: 1
-  cp_size: 1
-```
-
-#### Tensor Parallelism
-
-TP splits weight matrices across GPUs within a single node. Set `tp_size` to the number of GPUs you want to shard over (typically 2, 4, or 8 — should divide evenly into the number of attention heads):
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 4
-  cp_size: 1
-  sequence_parallel: false   # set to true for additional memory savings
-```
-
-`sequence_parallel: true` extends TP to also shard activation memory along the sequence dimension, further reducing per-GPU memory at the cost of additional communication.
-
-#### Pipeline Parallelism
-
-PP assigns groups of layers to different GPUs and streams micro-batches through the stages. It requires an additional nested `pipeline:` section:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 4
-  pp_size: 4
-  cp_size: 1
-  activation_checkpointing: true
-
-  pipeline:
-    pp_schedule: interleaved1f1b  # pipeline schedule (1f1b or interleaved1f1b)
-    pp_microbatch_size: 1         # micro-batch size per pipeline step
-    layers_per_stage: 4           # how many layers each stage handles
-    scale_grads_in_schedule: false
-```
-
-| Key | Role |
-|-----|------|
-| `pp_schedule` | The micro-batch schedule. `1f1b` is simpler; `interleaved1f1b` overlaps compute and communication for better throughput. |
-| `pp_microbatch_size` | Number of samples per micro-batch fed into the pipeline. Must satisfy: `local_batch_size ÷ pp_microbatch_size ≥ pp_size`. |
-| `layers_per_stage` | How many transformer layers each pipeline stage contains. If omitted, the framework splits layers evenly across `pp_size` stages. |
-
-:::{note}
-PP requires the model to define a `_pp_plan` that tells the framework how to split layers into stages. All built-in models include this plan; custom models must add one.
-:::
-
-#### Context Parallelism
-
-CP splits the sequence across GPUs — useful for very long contexts that exceed single-GPU memory. Set `cp_size` to the desired split factor:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 1
-  cp_size: 2
-```
-
-:::{important}
-When `cp_size > 1`, fused RoPE is automatically disabled. Some models also require the Transformer Engine (TE) attention backend for CP with packed sequences — the framework will raise an error with instructions if this applies.
-:::
-
-#### Expert Parallelism (MoE models)
-
-EP distributes MoE experts across GPUs. Set `ep_size` to the number of GPUs that share the full set of experts:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  tp_size: 1
-  cp_size: 1
-  pp_size: 1
-  ep_size: 8
-  activation_checkpointing: true
-```
-
-EP only applies to Mixture-of-Experts models (e.g. Qwen3-MoE, Mixtral, DeepSeek-V3). For dense models, leave `ep_size` at `1` or omit it.
-
-#### Combining Multiple Dimensions
-
-You can combine TP, PP, CP, and EP in a single config. For example, a large MoE model on a multi-node cluster might use:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 1
-  cp_size: 2
-  pp_size: 1
-  ep_size: 4
-  activation_checkpointing: true
-```
-
-When choosing a combination, keep these rules in mind:
-
-- **`world_size` must be evenly divisible by** `pp_size × tp_size × cp_size` (the quotient becomes `dp_size`).
-- **`(dp_size × cp_size) % ep_size == 0`** — EP shares the DP×CP groups.
-- **TP within a node, PP across nodes** is the typical layout — TP requires fast NVLink bandwidth, while PP tolerates higher latency.
-- **Start simple.** Use DP-only first. Add TP if the model doesn't fit on one GPU. Add PP for very large models. Add CP for long sequences. Add EP only for MoE architectures.
-
-## Next Steps
-
-- [Integrate Your Own Text Dataset](dataset.md) — swap the SQuAD example for your own data.
-- [Recipes and End-to-End Examples](../overview.md) — browse the full set of recipes available in NeMo AutoModel. See also the [`examples/llm_finetune/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune) directory for ready-to-run configs.
-- [Dataset Overview](../dataset-overview.md) — see all supported dataset types across LLM, VLM, and retrieval tasks.
-- [Knowledge Distillation](knowledge-distillation.md) — distill a fine-tuned model into a smaller one.
diff --git a/fern/versions/nightly/pages/guides/llm/finetune.mdx b/docs/guides/llm/finetune.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/finetune.mdx
rename to docs/guides/llm/finetune.mdx
diff --git a/docs/guides/llm/hy3.md b/docs/guides/llm/hy3.md
deleted file mode 100644
index 2bfa3ff519..0000000000
--- a/docs/guides/llm/hy3.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# Fine-Tune Hy3-preview (HunyuanLarge)
-
-## Introduction
-
-[tencent/Hy3-preview](https://huggingface.co/tencent/Hy3-preview) is a 295B Mixture-of-Experts language model from Tencent. It features 80 transformer layers (layer 0 dense, layers 1–79 MoE), 192 routed experts plus one shared expert per MoE block with top-8 sigmoid routing, Grouped Query Attention (64 Q / 8 KV heads, `head_dim=128`), per-head QK RMSNorm applied before RoPE, and an `expert_bias` buffer (surfaced as `e_score_correction_bias` in the Automodel gate) for expert-load correction during inference. The model supports a 256K context window via long-context RoPE (`rope_theta=11158840`).
-
-This guide walks you through fine-tuning Hy3-preview on HellaSwag using NVIDIA NeMo Automodel. You will learn how to configure the recipe, launch training, and inspect the results.
-
-To set up your environment to run NeMo Automodel, follow the [installation guide](https://github.com/NVIDIA-NeMo/Automodel#-install-nemo-automodel).
-
-## Data
-
-### HellaSwag
-
-We use [HellaSwag](https://huggingface.co/datasets/Rowan/hellaswag), a commonsense natural-language-inference dataset consisting of context + four candidate continuations. The version used here is the standard `rowan/hellaswag` HuggingFace split, formatted for next-token-prediction fine-tuning.
-
-- **Train / validation splits** taken directly from the HuggingFace dataset.
-- **Tokenizer**: shared with the base model (`AutoTokenizer.from_pretrained` on the Hy3-preview checkpoint).
-- **Padding**: `pad_seq_len_divisible=64` via the default collater.
-
-For the full HellaSwag dataset wrapper used in NeMo Automodel, see [`nemo_automodel.components.datasets.llm.hellaswag.HellaSwag`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/hellaswag.py).
-
-## Architecture Notes
-
-Hy3-preview is a large-scale MoE with a few details that are worth calling out explicitly. The NeMo Automodel state-dict adapter and training recipe handle all of these transparently:
-
-- **Dense-first MoE layout**: layer 0 is a standard dense MLP (`intermediate_size=1536`); layers 1–79 use the MoE block (192 routed experts + 1 shared expert). This is controlled by `first_k_dense_replace=1` in the config.
-- **GQA with per-head QK RMSNorm**: 64 Q heads and 8 KV heads (`head_dim=128`). A separate RMSNorm is applied independently to each head's Q and K projections before RoPE is applied — this is distinct from a single pre-attention layer norm and requires care when remapping projection weights.
-- **Sigmoid routing with expert-bias correction**: expert selection uses a sigmoid score (not softmax). The `e_score_correction_bias` buffer tracks per-expert load imbalance; during fine-tuning the bias update factor is set to zero (`gate_bias_update_factor=0.0`) so the bias stays frozen. The buffer is created in the Automodel gate to ensure the HF checkpoint loads cleanly.
-- **Shared expert**: each MoE block contains one always-active shared expert (`num_shared_experts=1`) whose output is added unconditionally alongside the routed output.
-- **MTP layers**: the released checkpoint contains additional multi-token-prediction layers at indices ≥ 80 (`num_nextn_predict_layers`). These are filtered out by the state-dict adapter on load and are not used during standard SFT.
-- **Long-context RoPE**: `rope_theta=11158840` with dynamic NTK-aware scaling (`beta_slow` / `beta_fast`) extending the context to 256K tokens.
-
-## Checkpoint Format
-
-The released `tencent/Hy3-preview` safetensors use a per-expert split layout and Tencent-specific key names that differ from the batched `GroupedExperts` convention used inside Automodel. The `HYV3StateDictAdapter` converts between the two transparently in three steps:
-
-**1. Per-expert tensors → grouped form.**
-On disk every expert is stored as three separate rank-2 tensors:
-
-```
-model.layers.{L}.mlp.experts.{E}.gate_proj.weight   # [moe_inter, hidden]
-model.layers.{L}.mlp.experts.{E}.up_proj.weight     # [moe_inter, hidden]
-model.layers.{L}.mlp.experts.{E}.down_proj.weight   # [hidden, moe_inter]
-```
-
-The adapter merges these across all 192 experts and stacks gate + up into a single fused tensor, landing at the Automodel layout:
-
-```
-model.layers.{L}.mlp.experts.gate_and_up_projs      # [n_local, hidden, 2*moe_inter]
-model.layers.{L}.mlp.experts.down_projs             # [n_local, moe_inter, hidden]
-```
-
-where `n_local = n_experts / ep_size` (the slice owned by the current EP rank).
-
-**2. Three HYV3-specific key renames.**
-
-| On-disk (HF) key | Native Automodel key |
-|---|---|
-| `mlp.expert_bias` | `mlp.gate.e_score_correction_bias` |
-| `mlp.router.gate.weight` | `mlp.gate.weight` |
-| `mlp.shared_mlp.*` | `mlp.shared_experts.*` |
-
-All other keys (attention projections, norms, embeddings, `lm_head`) are identical between formats.
-
-**3. MTP layer filtering.**
-Keys at layer indices ≥ `num_hidden_layers` (default 80) are silently dropped on load.
-
-## Launch Training
-
-A ready-to-use recipe ships at [`examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml). The yaml header documents how to adjust `ep_size` and `pp_size` for different cluster configurations.
-
-NeMo Automodel supports several ways to launch training — via the Automodel CLI with Slurm, interactive sessions, `torchrun`, and more. For full details on all launch options (Slurm batch jobs, multi-node configuration, environment variables, etc.), see the [Run on a Cluster](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/launcher/slurm.md) guide.
-
-### Standalone Slurm Script
-
-Below is a standalone Slurm script example for the HellaSwag recipe. Before running it, ensure your cluster environment is configured following the [Run on a Cluster](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/launcher/slurm.md) guide. Then submit the job:
-
-```bash
-export TRANSFORMERS_OFFLINE=1
-export HF_HOME=your/path/to/hf_cache
-export HF_DATASETS_OFFLINE=1
-export WANDB_API_KEY=your_wandb_key
-
-srun --output=output.out \
-     --error=output.err \
-     --container-image /your/path/to/automodel.image.sqsh --no-container-mount-home bash -c "
-  CUDA_DEVICE_MAX_CONNECTIONS=1 automodel \
-  examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml \
-  --nproc-per-node=8 \
-  --model.config.pretrained_model_name_or_path=/your/local/hy3-preview \
-  --model.config.name_or_path=/your/local/hy3-preview "
-```
-
-**Before you start**:
-- Hugging Face applies rate limits on downloads. We recommend cloning the model repository to your local filesystem beforehand.
-- Ensure your Hugging Face cache (`HF_HOME`) is configured and that the dataset is already cached locally.
-- To enable Weights & Biases logging, set your `WANDB_API_KEY` and uncomment the `wandb` section at the bottom of the YAML file.
-- The full recipe uses `pp_size=4` and `ep_size=32` (128 GPUs total). Valid `ep_size` values are any divisor of 192 (e.g. 8, 16, 24, 32, 48, 64, 96, 192); adjust `pp_size` and `--nproc-per-node` to match your node count.
-- For a quick end-to-end smoke test on 8 GPUs use [`examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml).
diff --git a/fern/versions/nightly/pages/guides/llm/hy3.mdx b/docs/guides/llm/hy3.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/hy3.mdx
rename to docs/guides/llm/hy3.mdx
diff --git a/docs/guides/llm/knowledge-distillation.md b/docs/guides/llm/knowledge-distillation.md
deleted file mode 100644
index cf88e9aa2e..0000000000
--- a/docs/guides/llm/knowledge-distillation.md
+++ /dev/null
@@ -1,181 +0,0 @@
-# Knowledge Distillation
-
-This guide walks through fine-tuning a **student** LLM with the help of a
-larger **teacher** model using the `kd` (knowledge distillation) recipe.
-
-In particular, we will show how to distill a 3B (`meta-llama/Llama-3.2-3B`) model into a 1B (`meta-llama/Llama-3.2-1B`) model.
-
-## What is Knowledge Distillation?
-
-Knowledge distillation (KD) transfers the *dark knowledge* of a high-capacity
-teacher model to a smaller student by minimizing the divergence between their
-predicted distributions. The student learns from both the ground-truth labels
-(Cross-Entropy loss, **CE**) and the soft targets of the teacher (Kullback-Leibler
-loss, **KD**):
-
-
-$$
-  \mathcal{L} = (1-\alpha) \cdot \mathcal{L}_{\textrm{CE}}(p^{s}, y) + \alpha \cdot \mathcal{KL}(p^{s}, p^{t})
-$$
-
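-where $\alpha$ is the `kd_ratio`, $T$ is the softmax `temperature`, and $y$ denotes the labels. The student distribution $p^{s}$ is the temperature-$T$ softmax of the student logits $z^{s}$ (and $p^{t}$ is defined analogously from the teacher logits $z^{t}$):
-$$p^{s} = \mathrm{softmax}(z^{s}, T)$$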
-
-## Prepare the YAML Config
-
-A ready-to-use example is provided at
-`examples/llm_kd/llama3_2/llama3_2_1b_kd.yaml`.  Important sections:
-
-* `model` – the student to be fine-tuned (1 B parameters in the example)
-* `teacher_model` – a larger frozen model used for supervision (3 B parameters in the example)
-* `kd_ratio` – blend between CE and KD loss
-* `temperature` – softens probability distributions before KL-divergence
-* `peft` – **optional** LoRA config (commented). Uncomment to train only a
-  handful of parameters.
-
-Feel free to tweak these values as required.
-
-### Example YAML
-
-```yaml
-# Example config for knowledge distillation fine-tuning
-# Run with:
-#   automodel examples/llm_kd/llama3_2/llama3_2_1b_kd.yaml
-
-step_scheduler:
-  global_batch_size: 32
-  local_batch_size: 1
-  ckpt_every_steps: 200
-  val_every_steps: 100  # will run every x number of gradient steps
-  num_epochs: 2
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 1
-
-rng:
-  _target_: nemo_automodel.components.training.rng.StatefulRNG
-  seed: 1111
-  ranked: true
-
-# Student
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-  torch_dtype: bf16
-
-# Teacher
-teacher_model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-3B
-  torch_dtype: bf16
-
-checkpoint:
-  enabled: true
-  checkpoint_dir: checkpoints/
-  model_save_format: safetensors
-  save_consolidated: false
-
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 1
-  cp_size: 1
-  pp_size: 1
-  sequence_parallel: false
-
-# PEFT can be enabled by uncommenting below – student weights will remain small
-# peft:
-#   _target_: nemo_automodel.components._peft.lora.PeftConfig
-#   target_modules: '*_proj'
-#   dim: 16
-#   alpha: 32
-#   use_triton: true
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-# Knowledge-distillation hyper-params
-kd_ratio: 0.5          # 0 → pure CE, 1 → pure KD
-kd_loss_fn:
-  _target_: nemo_automodel.components.loss.kd_loss.KDLoss
-  ignore_index: -100
-  temperature: 1.0
-  fp32_upcast: true
-
-# Optimizer
-optimizer:
-  _target_: torch.optim.Adam
-  betas: [0.9, 0.999]
-  eps: 1e-8
-  lr: 1.0e-5
-  weight_decay: 0
-
-# Dataset / Dataloader
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-  shuffle: false
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
-  path_or_dataset: rowan/hellaswag
-  split: validation
-  num_samples_limit: 64
-
-validation_dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-```
-
-### Current Limitations
-
-* Pipeline parallelism (`pp_size > 1`) is not yet supported – planned for a future release.
-* Distilling Vision-Language models (`vlm` recipe) is currently not supported.
-* Student and teacher models must share the same tokenizer for now; support for different tokenizers will be added in the future.
-
-## Launch Training
-
-### Single-GPU Quick Run
-
-```bash
-# Runs on a single device of the current host
-automodel examples/llm_kd/llama3_2/llama3_2_1b_kd.yaml
-```
-
-### Multi-GPU (Single Node)
-
-```bash
-# Leverage all GPUs on the local machine
-torchrun --nproc-per-node $(nvidia-smi -L | wc -l) \
-    nemo_automodel/recipes/llm/kd.py \
-    -c examples/llm_kd/llama3_2/llama3_2_1b_kd.yaml
-```
-
-### Slurm Cluster
-
-The CLI seamlessly submits Slurm jobs when a `slurm` section is added to the
-YAML.  Refer to `docs/guides/installation.md` for cluster instructions.
-
-## Monitoring
-
-Metrics such as *train_loss*, *kd_loss*, *learning_rate* and *tokens/sec* are
-logged to **WandB** when the corresponding section is enabled.
-
-## Checkpoints and Inference
-
-- Checkpoints are written under the directory configured in the `checkpoint.checkpoint_dir` field at every `ckpt_every_steps`.
-- The final student model is saved according to the `checkpoint` section (e.g., `model_save_format: safetensors`, consolidated weights if `save_consolidated: true`).
-
-Load the distilled model:
-
-```python
-import nemo_automodel as am
-student = am.NeMoAutoModelForCausalLM.from_pretrained("checkpoints/final")
-print(student("Translate to French: I love coding!").text)
-```
diff --git a/fern/versions/v0.4/pages/guides/llm/knowledge-distillation.mdx b/docs/guides/llm/knowledge-distillation.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/knowledge-distillation.mdx
rename to docs/guides/llm/knowledge-distillation.mdx
diff --git a/docs/guides/llm/large-moe-finetune.md b/docs/guides/llm/large-moe-finetune.md
deleted file mode 100644
index ec9048eaef..0000000000
--- a/docs/guides/llm/large-moe-finetune.md
+++ /dev/null
@@ -1,130 +0,0 @@
-# Fine-Tune Large MoE LLMs
-
-## Introduction
-
-Mixture-of-Experts (MoE) architectures have become the dominant design for frontier language models, activating only a fraction of their total parameters per token to deliver strong performance at reduced compute cost. This guide walks through fine-tuning four example MoE LLMs with NVIDIA NeMo Automodel. For a full list of supported architectures, see the [LLM model coverage](../../model-coverage/llm/index.md) page.
-
-| Model | HF Checkpoint | Validated Using |
-|-------|--------------|-----------------|
-| GLM-5 | [`zai-org/GLM-5`](https://huggingface.co/zai-org/GLM-5) | 256 H100 GPUs (32 nodes x 8) |
-| MiniMax-M2.5 | [`MiniMaxAI/MiniMax-M2.5`](https://huggingface.co/MiniMaxAI/MiniMax-M2.5) | 64 H100 GPUs (8 nodes x 8) |
-| Step-3.5 Flash | [`stepfun-ai/Step-3.5-Flash`](https://huggingface.co/stepfun-ai/Step-3.5-Flash) | 64 H100 GPUs (8 nodes x 8) |
-| DeepSeek-V3.2 | [`deepseek-ai/DeepSeek-V3.2`](https://huggingface.co/deepseek-ai/DeepSeek-V3.2) | 256 H100 GPUs (32 nodes x 8) |
-
-To set up your environment to run NeMo Automodel, follow the [installation guide](https://github.com/NVIDIA-NeMo/Automodel#-install-nemo-automodel).
-
-## Data
-
-### HellaSwag Dataset
-
-All four recipes use the [HellaSwag](https://huggingface.co/datasets/rowan/hellaswag) dataset, a commonsense natural language inference benchmark where the model must predict the most plausible continuation of a given scenario.
-
-- **Source**: `rowan/hellaswag`
-- **Split**: `train` (used for both training and validation in these recipes)
-- **Task**: Next-token prediction on commonsense sentence completions
-
-For details on how to swap in your own dataset, see the [LLM Dataset Guide](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/guides/llm/dataset.md) and the [Dataset Overview](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/guides/dataset-overview.md).
-
-## Recipes
-
-### MiniMax-M2.5
-
-[`examples/llm_finetune/minimax_m2/minimax_m2.5_hellaswag_pp.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/minimax_m2/minimax_m2.5_hellaswag_pp.yaml) — validated using **64 H100 GPUs** (8 nodes x 8).
-
-Key distributed settings:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  pp_size: 2
-  ep_size: 32
-  pipeline:
-    pp_schedule: interleaved1f1b
-    layers_per_stage: 2
-```
-
-### GLM-5
-
-[`examples/llm_finetune/glm/glm_5_hellaswag_pp.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_5_hellaswag_pp.yaml) — validated using **256 H100 GPUs** (32 nodes x 8).
-
-Key distributed settings:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  pp_size: 4
-  ep_size: 64
-  activation_checkpointing: true
-  pipeline:
-    pp_schedule: interleaved1f1b
-    layers_per_stage: 2
-```
-
-### Step-3.5 Flash (StepFun)
-
-[`examples/llm_finetune/stepfun/step_3.5_flash_hellaswag_pp.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/stepfun/step_3.5_flash_hellaswag_pp.yaml) — validated using **64 H100 GPUs** (8 nodes x 8).
-
-Key distributed settings:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  pp_size: 2
-  ep_size: 32
-  pipeline:
-    pp_schedule: interleaved1f1b
-    layers_per_stage: 2
-```
-
-### DeepSeek-V3.2
-
-[`examples/llm_finetune/deepseek_v32/deepseek_v32_hellaswag_pp.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/deepseek_v32/deepseek_v32_hellaswag_pp.yaml) — validated using **256 H100 GPUs** (32 nodes x 8).
-
-Key distributed settings:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  pp_size: 4
-  ep_size: 64
-  activation_checkpointing: true
-  pipeline:
-    pp_schedule: interleaved1f1b
-    layers_per_stage: 2
-```
-
-## Launch Training
-
-NeMo Automodel supports several ways to launch training—via the Automodel CLI with Slurm, interactive sessions, `torchrun`, and more. For full details on all launch options (Slurm batch jobs, multi-node configuration, environment variables, etc.), see the [Run on a Cluster](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/launcher/slurm.md) guide.
-
-### Automodel CLI
-
-```bash
-automodel finetune llm -c examples/llm_finetune/glm/glm_5_hellaswag_pp.yaml
-```
-
-Replace the recipe path with the one for your target model.
-
-### torchrun
-
-```bash
-export TRANSFORMERS_OFFLINE=1
-export HF_HOME=your/path/to/hf_cache
-export HF_DATASETS_OFFLINE=1
-export WANDB_API_KEY=your_wandb_key
-
-torchrun --nproc-per-node=8 \
-         --nnodes=8 \
-         --rdzv_backend=c10d \
-         --rdzv_endpoint=${MASTER_ADDR}:${PORT} \
-  nemo_automodel/recipes/llm/benchmark.py \
-    -c examples/llm_finetune/glm/glm_5_hellaswag_pp.yaml \
-    --model.pretrained_model_name_or_path=/your/local/model_weights
-```
-
-Replace the `-c` path, `--nnodes`, and `--model.pretrained_model_name_or_path` for the model you want to fine-tune.
-
-**Before you start**:
-- Hugging Face applies rate limits on downloads. We recommend cloning the model repository to your local filesystem beforehand.
-- Ensure your Hugging Face cache (`HF_HOME`) is configured and that the dataset is already cached locally.
-- To enable Weights & Biases logging, set your `WANDB_API_KEY` and configure the `wandb` section in the YAML file.
diff --git a/fern/versions/v0.4/pages/guides/llm/large-moe-finetune.mdx b/docs/guides/llm/large-moe-finetune.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/large-moe-finetune.mdx
rename to docs/guides/llm/large-moe-finetune.mdx
diff --git a/docs/guides/llm/nanogpt-pretraining.md b/docs/guides/llm/nanogpt-pretraining.md
deleted file mode 100644
index 8f84dc09bc..0000000000
--- a/docs/guides/llm/nanogpt-pretraining.md
+++ /dev/null
@@ -1,455 +0,0 @@
-# LLM Pre-Training
-
-This guide covers **FineWeb** data preparation, **defining** a [NanoGPT‑style](https://github.com/KellerJordan/modded-nanogpt) model, and **launching and monitoring** a NeMo AutoModel pre‑training run.
-
-## Set Up Your Environment
-
-In this guide, we will use an interactive environment to install NeMo AutoModel from Git. You can also install NeMo AutoModel from PyPI or use our bi-monthly Docker container (see the [Installation Guide](../installation.md)).
-
-```bash
-# clone / install AutoModel (editable for local hacks)
-cd /path/to/workspace/ # specify to your path as needed.
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel/
-pip install -e ".[all]"    # installs NeMo AutoModel + optional extras
-```
-
-:::{note}
-For this guide, we will use a single machine equipped with 8xH100 NVIDIA GPUs.
-:::
-
-:::{tip}
-To run this guide on a single GPU, use the single-GPU command in the **Launch Training** section below and scale down the YAML (for example, reduce `step_scheduler.global_batch_size` / `local_batch_size`, and shrink the model using `model.n_layer` / `model.n_embd` / `model.n_head`). For more launch patterns, see [Run on Your Local Workstation](../../launcher/local-workstation.md).
-:::
-
-## Preprocess the FineWeb Dataset
-
-:::{warning}
-**File Size Limitation**: The `nanogpt_data_processor.py` script has a **4GB file size limit** (~2^32 bytes) due to 32-bit position tracking in the BOS index. This translates to:
-- **~2 billion tokens** when using uint16 (vocabularies < 65,536 tokens, e.g., GPT-2)
-- **~1 billion tokens** when using uint32 (larger vocabularies)
-
-Always use the `--max-tokens` flag to stay within these limits (e.g., `--max-tokens 2B` or `--max-tokens 1.5B`).
-
-For larger datasets, please see [pretraining.md](pretraining.md) which supports sharded preprocessing without these constraints.
-:::
-
-### Quick Introduction to the FineWeb Dataset
-The [FineWeb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) dataset consists of more than 18.5T tokens of cleaned and deduplicated English web data from [CommonCrawl](https://commoncrawl.org/). For this guide, we use the **`sample-10BT` subset** (10 billion tokens), from which we extract a smaller sample (e.g., 500M tokens) that fits within the preprocessing tool's limits.
-
-Briefly, FineWeb is built by extracting main text from CommonCrawl WARC HTML, keeping English pages using fastText language scoring, applying multiple quality filters (e.g., Gopher repetition/quality checks, C4-style rules, and custom heuristics for list-like or repeated/poorly formatted lines), and then MinHash-deduplicating each crawl independently (5-gram shingling with 14×8 hash functions). Basic PII normalization is applied (e.g., anonymizing emails and public IPs). The result is released per-crawl (and convenient sampled subsets), ready for high-throughput streaming.
-
-:::{tip}
-To train on more than 2B tokens from FineWeb, see [pretraining.md](pretraining.md) which uses Megatron Core's sharded dataset format without file size constraints.
-:::
-
-### Preprocessing and Tokenization
-
-For the purposes of this guide, we provide a data preprocessing tool at [`nanogpt_data_processor.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/nanogpt_data_processor.py) that streams datasets from the Hugging Face Hub, tokenizes them with Hugging Face's `transformers.AutoTokenizer` (default: GPT-2), and writes the tokens to disk as **memory-mapped binary shards**. During training, the [`NanogptDataset`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/nanogpt_dataset.py) class streams these shards efficiently.
-
-
-```bash
-# Step into repo root
-cd /path/to/workspace/Automodel/
-
-# Generate 500 million tokens using the 10B raw split
-python tools/nanogpt_data_processor.py \
-  --dataset HuggingFaceFW/fineweb \
-  --set-name sample-10BT \
-  --max-tokens 500M      # stop after 500 million tokens; specify as needed, reduce for smaller runs.
-
-# Shards are stored in:  tools/fineweb_max_tokens_500M/
-#    dataset.bin (single binary file with all tokens)
-```
-
-**How the preprocessor works:** The script streams data iteratively from the Hugging Face Hub (avoiding loading the entire dataset into memory), uses a multiprocessing pipeline with separate reader and writer processes, and parallelizes tokenization across multiple CPU cores using `ProcessPoolExecutor`. This design enables efficient processing of very large datasets while maintaining low memory overhead. By default, it uses the `gpt2` tokenizer; pass the `--tokenizer` option to use a different one.
-
-Consider the following options:
-1. Adjust `--max-tokens` to control how many tokens to process (must stay within the 4GB file size limit mentioned above).
-2. Adjust `--chunk-size` for processing batch size.
-3. Use `--num-workers` to control parallelization.
-4. Specify `--output-dir` to change the output location.
-
-## Understand the NeMo AutoModel Training Workflow
-
-NeMo AutoModel follows a simple but powerful flow for training:
-
-1. A Python recipe script (for example, [`examples/llm_pretrain/pretrain.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_pretrain/pretrain.py)) serves as the entry point that wires up all training components based on a YAML configuration file. Any configuration option can be overridden using CLI arguments (e.g., `--model.name abc`).
-2. The YAML file describes each component of the training job (such as `model`, `dataset`, `optimizer`, `distributed`, `checkpoint`, and optional `wandb`).
-3. Each component is constructed from its `_target_`, which points to a Python callable (function or class constructor) to instantiate. The remaining keys in that YAML block become keyword arguments for that callable.
-
-How `_target_` is resolved:
-- Import path to a Python object (for example, `my_pkg.models.build_model`).
-- Local Python file path plus object name (for example, `/abs/path/to/my_model.py:build_model`).
-- Library callables such as Hugging Face `transformers.AutoModelForCausalLM.from_config`.
-
-Nested objects can also specify their own `_target_` (common when building Hugging Face `config` objects first and passing them into a `from_config` method). Any YAML key can be overridden at launch time from the CLI, making it easy to tweak hyperparameters without editing files.
-
-With this context, let’s define a model using `_target_`, then point the dataset at your preprocessed shards, and finally review the full YAML.
-
-## Define Your Own Model Architecture
-
-NeMo AutoModel relies on a YAML-driven configuration to build every training component. In particular, the `model._target_` must reference a callable that returns an `nn.Module` (or a compatible Hugging Face model). You can point `_target_` at:
-
-- An import path to a Python object.
-- A local Python file plus the object name using `path.py:object_name`.
-- A library callable such as `transformers.AutoModelForCausalLM.from_config`.
-
-Below are examples for each pattern.
-
-### NanoGPT Source and File-Path `_target_`
-
-Below is the minimal GPT‑2 [implementation](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/models/gpt2.py) used for this NanoGPT‑style pretraining flow.
-It is a pure‑PyTorch model with tied embeddings and standard transformer blocks:
-
-```
-"""
-Self-contained GPT-2 (Causal LM) implementation.
-
-This module defines a pure-PyTorch model and defines the necessary
-building blocks (attention, MLP, transformer block, and language-model head).
-The public *build_gpt2_model* helper returns an ``nn.Module``.
-"""
-import math
-from typing import Any
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-# The attention layer
-class CausalSelfAttention(nn.Module):
-    """Multi-head self-attention with a causal mask."""
-
-    def __init__(self, embed_dim: int, num_heads: int, attn_dropout: float = 0.0):
-        super().__init__()
-
-        if embed_dim % num_heads != 0:
-            raise ValueError("embed_dim must be divisible by num_heads")
-
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.head_dim = embed_dim // num_heads
-
-        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
-        self.out_proj = nn.Linear(embed_dim, embed_dim)
-        self.attn_dropout = attn_dropout
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:  # (B, T, C)
-        bsz, seq_len, _ = x.shape
-
-        # Project to QKV and reshape: (B, T, 3*C) → (B, n_head, T, head_dim)
-        qkv = self.qkv_proj(x).view(bsz, seq_len, 3, self.num_heads, self.head_dim)
-        q, k, v = qkv.unbind(dim=2)
-        q, k, v = (t.transpose(1, 2) for t in (q, k, v))  # (B, n_head, T, head_dim)
-
-        # Use torch's optimized SDPA when available (PyTorch ≥2.0)
-        if hasattr(F, "scaled_dot_product_attention"):
-            attn_output = F.scaled_dot_product_attention(
-                q, k, v, dropout_p=self.attn_dropout, is_causal=True
-            )  # (B, n_head, T, head_dim)
-        else:
-            # Fallback implementation with an explicit causal mask
-            scores = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)
-            causal_mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device, dtype=torch.bool))
-            scores = scores.masked_fill(~causal_mask, float("-inf"))
-            attn_weights = F.softmax(scores, dim=-1)
-            attn_weights = F.dropout(attn_weights, p=self.attn_dropout, training=self.training)
-            attn_output = attn_weights @ v  # (B, n_head, T, head_dim)
-
-        # Merge heads
-        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, self.embed_dim)
-        return self.out_proj(attn_output)
-
-# The MLP
-class MLP(nn.Module):
-    """GPT-2 feed-forward network (GEGLU → Linear)."""
-
-    def __init__(self, embed_dim: int, expansion_factor: int = 4):
-        super().__init__()
-        hidden_dim = expansion_factor * embed_dim
-        self.fc1 = nn.Linear(embed_dim, hidden_dim)
-        self.act = nn.GELU()
-        self.fc2 = nn.Linear(hidden_dim, embed_dim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:  # (B, T, C)
-        return self.fc2(self.act(self.fc1(x)))
-
-# Transformers
-class TransformerBlock(nn.Module):
-    """A single transformer block (LN → Attn → Add → LN → MLP → Add)."""
-
-    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
-        super().__init__()
-        self.ln_1 = nn.LayerNorm(embed_dim)
-        self.attn = CausalSelfAttention(embed_dim, num_heads, dropout)
-        self.ln_2 = nn.LayerNorm(embed_dim)
-        self.mlp = MLP(embed_dim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = x + self.attn(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-# The GPT-2 model definition
-class GPT2LMHeadModel(nn.Module):
-    """Minimal GPT-2 Causal-LM with tied input/output embeddings."""
-
-    def __init__(
-        self,
-        *,
-        vocab_size: int,
-        n_positions: int,
-        n_embd: int,
-        n_layer: int,
-        n_head: int,
-        dropout: float = 0.1,
-    ) -> None:
-        super().__init__()
-
-        self.wte = nn.Embedding(vocab_size, n_embd)
-        self.wpe = nn.Embedding(n_positions, n_embd)
-        self.drop = nn.Dropout(dropout)
-
-        self.h = nn.ModuleList([TransformerBlock(n_embd, n_head, dropout) for _ in range(n_layer)])
-        self.ln_f = nn.LayerNorm(n_embd)
-
-        # Language model head (weights tied to token embedding matrix)
-        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
-        self.lm_head.weight = self.wte.weight  # weight tying
-
-        # Initialize parameters following GPT-2 scheme
-        self._init_weights()
-
-    def forward(self, input_ids: torch.LongTensor) -> torch.Tensor:  # (B, T) → (B, T, V)
-        batch_size, seq_len = input_ids.shape
-
-        if seq_len > self.wpe.num_embeddings:
-            raise ValueError(f"Sequence length {seq_len} exceeds maximum context size {self.wpe.num_embeddings}.")
-
-        pos_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
-
-        x = self.wte(input_ids) + self.wpe(pos_ids)
-        x = self.drop(x)
-
-        for block in self.h:
-            x = block(x)
-
-        x = self.ln_f(x)
-        logits = self.lm_head(x)
-        return logits
-
-    def _init_weights(self):
-        """Parameter initialization following GPT-2 conventions."""
-
-        for module in self.modules():
-            if isinstance(module, nn.Linear):
-                # GPT-2 uses normal(0, 0.02)
-                nn.init.normal_(module.weight, mean=0.0, std=0.02)
-                if module.bias is not None:
-                    nn.init.zeros_(module.bias)
-            elif isinstance(module, nn.Embedding):
-                nn.init.normal_(module.weight, mean=0.0, std=0.02)
-
-# Helper entrypoint
-def build_gpt2_model(
-    *,
-    vocab_size: int = 50257,
-    n_positions: int = 2048,
-    n_ctx: int | None = None,
-    n_embd: int = 768,
-    n_layer: int = 12,
-    n_head: int = 12,
-    bos_token_id: int = 50256,  # kept for API backward-compat (unused)
-    eos_token_id: int = 50256,  # kept for API backward-compat (unused)
-    attn_implementation: str = "flash_attention_2",  # retained but ignored
-    **extra_cfg: Any,  # ignored to preserve call-sites that used to pass config tweaks
-) -> nn.Module:
-    """Instantiate and return a *pure-PyTorch* GPT-2 language model.
-
-    The function intentionally keeps the same signature as the original
-    wrapper so existing YAML/CLI configurations continue to work.
-    Extra keyword arguments are quietly ignored.
-    """
-
-    # Map legacy *n_ctx* to *n_positions* if provided.
-    if n_ctx is not None and n_ctx != n_positions:
-        n_positions = n_ctx
-
-    # Issue a gentle warning if the user passes unused extra kwargs.
-    if extra_cfg:
-        invalid = ", ".join(extra_cfg.keys())
-        print(
-            f"[build_gpt2_model] Warning: Ignoring unsupported keyword arguments: {invalid}.",
-            flush=True,
-        )
-
-    return GPT2LMHeadModel(
-        vocab_size=vocab_size,
-        n_positions=n_positions,
-        n_embd=n_embd,
-        n_layer=n_layer,
-        n_head=n_head,
-    )
-```
-
-In short, `build_gpt2_model(...)` constructs a compact GPT‑2 with configurable depth/width/heads and returns an `nn.Module` that outputs logits over the vocabulary. It’s intentionally lean (no KV‑cache or generation helpers) but perfectly suited for forward/backward passes and next‑token prediction.
-
-To use this exact implementation directly from a file path, point `_target_` to the file and object name (`path.py:object`). Absolute paths are recommended:
-
-```yaml
-model:
-  _target_: /abs/path/to/repo/nemo_automodel/components/models/gpt2.py:build_gpt2_model
-  vocab_size: 50258
-  n_positions: 2048
-  n_embd: 768
-  n_layer: 12
-  n_head: 12
-```
-
-This loads the file on disk and calls `build_gpt2_model(...)` with the remaining keys as keyword arguments.
-
-### Import Path to a Callable (Function or Class)
-
-Instead of a file path, you can reference the callable using its import path:
-
-```yaml
-# examples/llm_pretrain/nanogpt_pretrain.yaml
-model:
-  _target_: nemo_automodel.components.models.gpt2.build_gpt2_model
-  vocab_size: 50258
-  n_positions: 2048
-  n_embd: 768
-  n_layer: 12
-  n_head: 12
-```
-
-### Hugging Face Models using `from_config` Function
-
-You can instantiate any Hugging Face causal LM with a config-first flow by targeting a `from_config` callable and providing a nested `config` node. The nested node is itself resolved using `_target_`, so you can compose Hugging Face configs directly in YAML.
-
-```yaml
-model:
-  _target_: transformers.AutoModelForCausalLM.from_config
-  # Nested object: built first, then passed to from_config(config=...)
-  config:
-    _target_: transformers.AutoConfig.from_pretrained
-    pretrained_model_name_or_path: gpt2   # or "Qwen/Qwen2-1.5B", etc.
-    n_layer: 12
-    n_head: 12
-    n_positions: 2048
-    vocab_size: 50258
-```
-
-Alternatively, target a specific architecture:
-
-```yaml
-model:
-  _target_: transformers.GPT2LMHeadModel.from_config
-  config:
-    _target_: transformers.GPT2Config
-    n_layer: 12
-    n_head: 12
-    n_positions: 2048
-    vocab_size: 50258
-```
-
-:::{note}
-- The `model._target_` may reference an import path or a local Python file using the `path.py:object` form.
-- Any nested mapping that includes `_target_` (e.g., `config:`) is instantiated first and its result is passed upward. This is how the Hugging Face `from_config` pattern works.
-- You can keep using the same training recipe (optimizer, data, distributed settings); only the `model:` block changes.
-:::
-
-## Inspect and Adjust the YAML Configuration
-
-[`examples/llm_pretrain/nanogpt_pretrain.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_pretrain/nanogpt_pretrain.yaml) is a complete configuration that:
-* Defines a GPT-2 model using the `build_gpt2_model` shorthand (easy to scale up).
-* Points `file_pattern` at preprocessed binary data files (configure based on your preprocessing output).
-* Uses the new `NanogptDataset` with `seq_len=1024`.
-* Sets a vanilla `AdamW` optimizer with learning rate `2e-4`.
-* Includes FSDP2 distributed training configuration.
-
-Key configuration sections:
-
-```yaml
-# Model configuration (two options available)
-model:
-  _target_: nemo_automodel.components.models.gpt2.build_gpt2_model
-  vocab_size: 50258
-  n_positions: 2048
-  n_embd: 768
-  n_layer: 12
-  n_head: 12
-
-# Dataset configuration
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.nanogpt_dataset.NanogptDataset
-  file_pattern: "tools/fineweb_max_tokens_500M/dataset.bin"
-  seq_len: 1024
-  shuffle_files: true
-
-# Distributed training
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 1
-  cp_size: 1
-```
-
-**About `_target_` configuration**: The `_target_` field specifies import paths to classes and functions within the nemo_automodel package (or any Python module). For example, `nemo_automodel.components.models.gpt2.build_gpt2_model` imports and calls the GPT-2 model builder function. You can also specify paths to your own Python files (e.g., `my_custom_models.MyTransformer`) to use custom `nn.Module` implementations, allowing full flexibility in model architecture while leveraging the training infrastructure.
-
-Update the `file_pattern` to match your data location. For example, if using `tools/nanogpt_data_processor.py` with the default settings: `"tools/fineweb_max_tokens_500M/dataset.bin"`
-
-Scale **width/depth**, `batch_size`, or `seq_len` as needed - the recipe is model-agnostic.
-
-## Launch Training
-
-```bash
-# Single-GPU run (good for local testing)
-python examples/llm_pretrain/pretrain.py \
-  --config examples/llm_pretrain/nanogpt_pretrain.yaml
-
-# Multi-GPU (e.g., 8x H100)
-torchrun --standalone --nproc-per-node 8 \
-  examples/llm_pretrain/pretrain.py \
-  --config examples/llm_pretrain/nanogpt_pretrain.yaml
-
-# Using the automodel CLI:
-# single-GPU
-automodel examples/llm_pretrain/nanogpt_pretrain.yaml
-
-# multi-GPU (8 GPUs)
-automodel --nproc-per-node 8 examples/llm_pretrain/nanogpt_pretrain.yaml
-```
-:::{tip}
-Adjust the `distributed` section in the YAML config to change between DDP, FSDP2, etc.
-:::
-
-The `TrainFinetuneRecipeForNextTokenPrediction` class handles:
-* Distributed (FSDP2 / TP / CP) wrapping if requested in the YAML.
-* Gradient accumulation, LR scheduling, checkpointing, optional W&B logging.
-* Validation loops if you supply `validation_dataset`.
-
-Checkpoints are written under `checkpoints/` by default as `safetensors` or `torch_save` (YAML-configurable).
-
-## Monitor and Evaluate Training
-
-* **TPS** (tokens per second), **gradient norm**, and **loss** statistics print every optimization step.
-* Enable `wandb` in the YAML for dashboards (`wandb.project`, `wandb.entity`, etc.).
-* Periodic checkpoints can be loaded using `TrainFinetuneRecipeForNextTokenPrediction.load_checkpoint()`.
-
-Example W&B configuration:
-```yaml
-wandb:
-  project: "nanogpt-pretraining"
-  entity: "your-wandb-entity"
-  name: "nanogpt-500M-tokens"
-```
-
-## Explore Further Work
-
-1. **Scaling up**: Swap the GPT-2 config for `LlamaForCausalLM`, `Qwen2`, or any Hugging Face-compatible causal model; increase `n_layer`, `n_embd`, etc.
-2. **Mixed precision** - FSDP2 + `bfloat16` (`dtype: bfloat16` in distributed config) for memory savings.
-3. **Sequence packing** - set `packed_sequence.packed_sequence_size` > 0 to pack variable-length contexts and boost utilization.
-4. **Custom datasets** - implement your own `IterableDataset` or convert existing corpora to the `.bin` format using `tools/nanogpt_data_processor.py` as a template.
-5. **BOS alignment** - set `align_to_bos: true` in the dataset config to ensure sequences start with BOS tokens (requires `bos_token` parameter).
diff --git a/fern/versions/v0.4/pages/guides/llm/nanogpt-pretraining.mdx b/docs/guides/llm/nanogpt-pretraining.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/nanogpt-pretraining.mdx
rename to docs/guides/llm/nanogpt-pretraining.mdx
diff --git a/docs/guides/llm/pretraining.md b/docs/guides/llm/pretraining.md
deleted file mode 100644
index 08153709a1..0000000000
--- a/docs/guides/llm/pretraining.md
+++ /dev/null
@@ -1,750 +0,0 @@
-# Pretraining Megatron Core Datasets
-
-## Introduction
-
-Pretraining builds a base large language model (LLM) by training a randomly initialized model to predict the next token across massive, unlabeled datasets.
-
-Robust pretraining establishes a foundation of linguistic competence and world knowledge that scales with data, parameters, and compute. This base model then serves as the necessary starting point for later fine-tuning or domain-specific adaptation.
-
-NeMo AutoModel provides an end-to-end recipe to run LLM pretraining with Hugging Face–native models and Megatron-Core style datasets.
-
-## Model and Dataset Context
-
-In this guide, we pretrain OpenAI’s `GPT2-124M` model on a FineWeb-Edu subset of 10 billion tokens.
-
-### About the FineWeb-Edu Dataset
-
-[FineWeb-Edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu) is a dataset consisting of 1.3T tokens of educational web pages filtered from the larger [FineWeb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) dataset. The educational web pages were filtered from the main dataset using a fine-tuned [Bert](https://huggingface.co/docs/transformers/en/model_doc/bert)-like classifier. Further reading on the filtering process can be found [here](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1).
-
-Here’s a glimpse of what the data looks like:
-```json
-{
-    "id": "<urn:uuid:673b1bf6-2c30-40ae-992b-c387d00a836a>",
-    "dump": "CC-MAIN-2013-20",
-    "text": "No. 24; Updated March 2011
-    Click here to download and print a PDF version of this document.
-    Parents are usually the first to recognize that their child has a problem with emotions or behavior. Still, the decision to seek professional help can be difficult and painful for a parent. The first step is to gently try to talk to the child. An honest open talk about feelings can often help. Parents may choose to consult with the child's physicians, teachers, members of the clergy, or other adults who know the child well. These steps may resolve the problems for the child and family.
-    Following are a few signs which may indicate that a child and adolescent psychiatric evaluation will be useful ...",
-    "url": "https://www.aacap.org/AACAP/Families_and_Youth/Facts_for_Families/FFF-Guide/When-to-Seek-Help-for-Your-Child-024.aspx",
-    "date": null,
-    "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/warc/CC-MAIN-20130516092621-00000-ip-10-60-113-184.ec2.internal.warc.gz",
-    "language": "en",
-    "language_score": 0.927742,
-    "token_count": 755,
-    "score": 3.375,
-    "int_score": 3,
-}
-```
-
-#### Download the FineWeb-Edu Dataset
-
-For this guide, we use the FineWeb-Edu 10BT sample—a collection of approximately 10 billion tokens randomly drawn from the full FineWeb-Edu dataset. To prepare the data, run the following commands:
-
-```bash
-# run this inside the AutoModel directory
-
-git clone https://github.com/facebookresearch/lingua.git
-cd lingua
-pip install -r requirements.txt
-python setup/download_prepare_hf_data.py fineweb_edu_10bt <MEMORY> --data_dir <DATA_DIR> --seed 42 --nchunks 1
-cd ..
-mv lingua/fineweb_edu .
-```
-Replace `<MEMORY>` with the amount of system memory allocated to `terashuf` (the tool used for sample shuffling), and set `<DATA_DIR>` to the root directory where the data will be stored. You can run the following example command:
-```bash
-python setup/download_prepare_hf_data.py fineweb_edu_10bt 16 --data_dir ./fineweb_edu --seed 42 --nchunks 1
-```
-
-The expected directory structure is like this:
-```bash
-$ tree fineweb_edu/
-fineweb_edu/
-├── fineweb_edu_10bt
-│   ├── datatrove
-│   │   ├── completions
-│   │   │   ├── 00000
-│   │   │   ├── 00001
-│   │   │   ├── 00002
-│   │   │   ├── 00003
-│   │   │   ├── 00004
-│   │   │   ├── 00005
-│   │   │   │   ...
-│   │   │   └── 00063
-│   │   ├── executor.json
-│   │   ├── logs
-│   │   │   ├── task_00000.log
-│   │   │   ├── task_00001.log
-│   │   │   ├── task_00002.log
-│   │   │   ├── task_00003.log
-│   │   │   ├── task_00004.log
-│   │   │   ├── task_00005.log
-│   │   │   │   ...
-│   │   │   └── task_00063.log
-│   │   ├── stats
-│   │   │   ├── 00000.json
-│   │   │   ├── 00001.json
-│   │   │   ├── 00002.json
-│   │   │   ├── 00003.json
-│   │   │   ├── 00004.json
-│   │   │   ├── 00005.json
-│   │   │   │   ...
-│   │   │   └── 00063.json
-│   │   └── stats.json
-│   ├── fineweb_edu_10bt.chunk.00000.jsonl
-│   │   ...
-│   ├── fineweb_edu_10bt.chunk.00013.jsonl
-│   ├── sample
-│   │   └── 10BT
-│   │       ├── 000_00000.parquet
-│   │       │   ...
-│   │       └── 013_00000.parquet
-│   └── terashuf
-│       ├── LICENSE
-│       ├── Makefile
-│       ├── README.md
-│       ├── terashuf
-│       └── terashuf.cc
-└── fineweb_edu_10bt_shuffled
-    ├── fineweb_edu_10bt.chunk.00.jsonl
-    └── fineweb_edu_10bt.val.jsonl
-```
-
-## Preprocess to a Megatron Core Dataset
-NeMo AutoModel provides tooling to perform the task of tokenizing and saving in the Megatron Core dataset format. You can use it as follows:
-
-```bash
-uv run tools/preprocess_megatron_dataset.py --input "fineweb_edu/fineweb_edu_10bt/fineweb_edu_10bt.chunk.*.jsonl" --json-keys text --output-prefix processed_data --output-path fineweb_edu/megatron_gpt2/ --workers 8 --pretrained-model-name-or-path openai-community/gpt2 --append-eod
-```
-
-The directory should look like this:
-```bash
-$ tree fineweb_edu/megatron_gpt2/
-fineweb_edu/megatron_gpt2/
-├── processed_data_0_text_document.bin
-├── processed_data_0_text_document.idx
-├── processed_data_10_text_document.bin
-├── processed_data_10_text_document.idx
-├── processed_data_11_text_document.bin
-├── processed_data_11_text_document.idx
-├── processed_data_12_text_document.bin
-├── processed_data_12_text_document.idx
-├── processed_data_13_text_document.bin
-├── processed_data_13_text_document.idx
-├── processed_data_1_text_document.bin
-├── processed_data_1_text_document.idx
-├── processed_data_2_text_document.bin
-├── processed_data_2_text_document.idx
-├── processed_data_3_text_document.bin
-├── processed_data_3_text_document.idx
-├── processed_data_4_text_document.bin
-├── processed_data_4_text_document.idx
-├── processed_data_5_text_document.bin
-├── processed_data_5_text_document.idx
-├── processed_data_6_text_document.bin
-├── processed_data_6_text_document.idx
-├── processed_data_7_text_document.bin
-├── processed_data_7_text_document.idx
-├── processed_data_8_text_document.bin
-├── processed_data_8_text_document.idx
-├── processed_data_9_text_document.bin
-└── processed_data_9_text_document.idx
-
-1 directory, 28 files
-```
-
-:::{tip}
-Replace `--workers` with the amount of CPU cores you'd like to use to tokenize in parallel.
-:::
-
-## Use a Recipe for Pretraining
-
-This example demonstrates how to perform pretraining on a large language model using NVIDIA's NeMo AutoModel library. We use the LLM [training recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py), specifically `TrainFinetuneRecipeForNextTokenPrediction`, which orchestrates the pretraining process — including loading, dataset preparation, optimizer setup, distributed training, checkpointing, and logging.
-
-### What is a Recipe?
-
-A recipe in NeMo AutoModel is a **self-contained orchestration module** that wires together all
-components needed to perform a specific task (e.g., pretraining).
-Think of it as the equivalent of a Trainer class, but highly modular, stateful, and reproducible.
-
-The `TrainFinetuneRecipeForNextTokenPrediction` class is one such recipe. It inherits from `BaseRecipe` and implements:
-
-- `setup()`: builds all training components from the config
-
-- `run_train_validation_loop()`: executes training + validation steps
-
-- Misc: Checkpoint handling, logging, and RNG setup.
-
-### Recipe Config Example
-
-Below is the configuration from `examples/llm_pretrain/megatron_pretrain_gpt2.yaml`:
-
-```yaml
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# To run this recipe, please use the following command:
-# torchrun --nproc-per-node=8 examples/llm_pretrain/pretrain.py --config examples/llm_pretrain/megatron_pretrain_gpt2.yaml
-# Adjust --nproc-per-node to the number of GPUs available on your host machine.
-
-# The model section is responsible for configuring the model we want to finetune.
-# Since we want to use the GPT2-124M model, we pass `openai-community/gpt2` to the
-# `pretrained_model_name_or_path` option.
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config
-  config:
-    _target_: transformers.AutoConfig.from_pretrained
-    pretrained_model_name_or_path: openai-community/gpt2
-
-# As mentioned earlier, we are using the FineWeb-Edu dataset. NeMo AutoModel provides the MegatronPretraining
-# class which prepares the dataset by loading, packing, and shuffling. We use the "train" split for
-# training.
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining
-  paths: fineweb_edu/megatron_gpt2/processed_data_*_text_document*  # REPLACE THIS
-  index_mapping_dir: fineweb_edu/megatron_gpt2/mapping_dir  # REPLACE THIS
-  tokenizer:
-    _target_: nemo_automodel._transformers.auto_tokenizer.NeMoAutoTokenizer.from_pretrained
-    pretrained_model_name_or_path: openai-community/gpt2
-  seq_length: 1024
-  split: "0.99, 0.01, 0.00"  # train, validation, test
-  splits_to_build: "train"  # has to be one of train, validation, test
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: torch.utils.data.default_collate
-  dataloader_type: "single"  # or "cyclic"
-
-# Similarly, for validation we use the "validation" split
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining
-  paths: fineweb_edu/megatron_gpt2/processed_data_*_text_document*  # REPLACE THIS
-  index_mapping_dir: fineweb_edu/megatron_gpt2/mapping_dir  # REPLACE THIS
-  tokenizer:
-    _target_: nemo_automodel._transformers.auto_tokenizer.NeMoAutoTokenizer.from_pretrained
-    pretrained_model_name_or_path: openai-community/gpt2
-  seq_length: 1024
-  split: "0.99, 0.01, 0.00"  # train, validation, test
-  splits_to_build: "validation"  # has to be one of train, validation, test
-  num_val_samples: 1024
-
-step_scheduler:
-  global_batch_size: 512
-  local_batch_size: 32
-  ckpt_every_steps: 1000 # checkpoints state every 1000 steps
-  val_every_steps: 250  # validates every 250 steps
-  num_epochs: 1
-  max_steps: 18500
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 1
-
-rng:
-  _target_: nemo_automodel.components.training.rng.StatefulRNG
-  seed: 1111
-  ranked: true
-
-checkpoint:
-  enabled: true
-  checkpoint_dir: checkpoints/
-  model_save_format: torch_save # torch_save or safetensors
-  save_consolidated: false # saves the model in a consolidated safetensors format. Requires model_save_format to be safetensors.
-
-# For distributed processing, we use FSDP2.
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  dp_replicate_size: null  # dp_shard_size = dp_size / dp_replicate_size when set. For DDP use strategy: ddp.
-  tp_size: 1
-  cp_size: 1
-  sequence_parallel: false
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: torch.utils.data.default_collate
-
-validation_dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: torch.utils.data.default_collate
-
-# We will use the standard AdamW optimizer, but you can specify any optimizer you want, by changing
-# the import path using the _target_ option.
-optimizer:
-  _target_: torch.optim.AdamW
-  betas: [0.9, 0.95]
-  lr: 0.0006
-  weight_decay: 0.1
-
-# We will use a cosine LR schedule with 700 warm-up steps.
-# This means the LR will linearly increase to a maximum of 6e-4, after which
-# it will decay to 0 over the course of training.
-lr_scheduler:
-  lr_decay_style: cosine
-  lr_warmup_steps: 700
-  min_lr: 0.0
-
-# Uncomment and configure for W&B logging
-# wandb:
-#   project: <your_wandb_project>
-#   entity: <your_wandb_entity>
-#   name: <your_wandb_exp_name>
-#   save_dir: <your_wandb_save_dir>
-
-```
-:::{tip}
-If you want to add weights to the dataset blends, you can do so by passing in a list. For example, `paths: ["30", "fineweb_edu/megatron_gpt2/processed_data_0_text_document", "70", "fineweb_edu/megatron_gpt2/processed_data_1_text_document"]`.
-:::
-
-## Load Large Models
-In distributed training, the typical model-loading pipeline has each GPU load the entire model and then retain only the shard it needs. This approach becomes problematic when the model size exceeds the memory capacity of a single GPU. For instance, a 70B-parameter model requires about 140GB of memory for its parameters when using the BF16 data type (2 bytes per parameter). Since most widely used GPUs are limited to 80GB, the full model cannot be directly loaded onto a single device.
-
-In these scenarios, you can pass `is_meta_device: true` in the model config. The model will then be instantiated using [PyTorch's Meta device](https://docs.pytorch.org/docs/stable/meta.html) which loads no data, but stores all other parameter metadata necessary for sharding the model. Once the model is sharded, the model weights will be populated by only loading the weights required by the respective model shard.
-
-
-## Run the Pretraining Recipe
-
-Assuming you saved, or plan to use, the provided config at `examples/llm_pretrain/megatron_pretrain_gpt2.yaml`:
-
-```bash
-uv run torchrun --nproc-per-node=2 examples/llm_pretrain/pretrain.py --config examples/llm_pretrain/megatron_pretrain_gpt2.yaml
-```
-
-### Sample Output
-
-You should see step‑wise logs reporting loss, memory usage, and tokens per second. Checkpoints will be saved under the `checkpoints/` directory as configured.
-
-```bash
-$ uv run torchrun --nproc-per-node=2 examples/llm_pretrain/pretrain.py --config examples/llm_pretrain/megatron_pretrain_gpt2.yaml
-cfg-path: examples/llm_pretrain/megatron_pretrain_gpt2.yaml
-cfg-path: examples/llm_pretrain/megatron_pretrain_gpt2.yaml
-> initializing torch distributed with 2 workers.
-2025-09-01 07:13:17 | INFO | nemo_automodel.components.loggers.log_utils | Setting logging level to 20
-2025-09-01 07:13:17 | INFO | root | Experiment_details:
-2025-09-01 07:13:17 | INFO | root | Timestamp: '2025-09-01T07:13:17'
-2025-09-01 07:13:17 | INFO | root | User: root
-2025-09-01 07:13:17 | INFO | root | Host: 9126f6644eca
-2025-09-01 07:13:17 | INFO | root | World size: 2
-2025-09-01 07:13:17 | INFO | root | Backend: nccl
-2025-09-01 07:13:17 | INFO | root | Recipe: TrainFinetuneRecipeForNextTokenPrediction
-2025-09-01 07:13:17 | INFO | root | Model name: null
-2025-09-01 07:13:17 | INFO | root | Recipe config:
-2025-09-01 07:13:17 | INFO | root |   step_scheduler:
-2025-09-01 07:13:17 | INFO | root |     global_batch_size: 512
-2025-09-01 07:13:17 | INFO | root |     local_batch_size: 32
-2025-09-01 07:13:17 | INFO | root |     ckpt_every_steps: 1000
-2025-09-01 07:13:17 | INFO | root |     val_every_steps: 250
-2025-09-01 07:13:17 | INFO | root |     num_epochs: 1
-2025-09-01 07:13:17 | INFO | root |     max_steps: 18500
-2025-09-01 07:13:17 | INFO | root |   dist_env:
-2025-09-01 07:13:17 | INFO | root |     backend: nccl
-2025-09-01 07:13:17 | INFO | root |     timeout_minutes: 1
-2025-09-01 07:13:17 | INFO | root |   rng:
-2025-09-01 07:13:17 | INFO | root |     _target_: <class 'nemo_automodel.components.training.rng.StatefulRNG'>
-2025-09-01 07:13:17 | INFO | root |     seed: 1111
-2025-09-01 07:13:17 | INFO | root |     ranked: True
-2025-09-01 07:13:17 | INFO | root |   model:
-2025-09-01 07:13:17 | INFO | root |     _target_: <bound method _BaseNeMoAutoModelClass.from_config of <class 'nemo_automodel._transformers.auto_model.NeMoAutoModelForCausalLM'>>
-2025-09-01 07:13:17 | INFO | root |     config:
-2025-09-01 07:13:17 | INFO | root |       _target_: <bound method AutoConfig.from_pretrained of <class 'transformers.models.auto.configuration_auto.AutoConfig'>>
-2025-09-01 07:13:17 | INFO | root |       pretrained_model_name_or_path: openai-community/gpt2
-2025-09-01 07:13:17 | INFO | root |   checkpoint:
-2025-09-01 07:13:17 | INFO | root |     enabled: True
-2025-09-01 07:13:17 | INFO | root |     checkpoint_dir: checkpoints/
-2025-09-01 07:13:17 | INFO | root |     model_save_format: torch_save
-2025-09-01 07:13:17 | INFO | root |     save_consolidated: False
-2025-09-01 07:13:17 | INFO | root |   distributed:
-2025-09-01 07:13:17 | INFO | root |     strategy: fsdp2
-2025-09-01 07:13:17 | INFO | root |     dp_size: None
-2025-09-01 07:13:17 | INFO | root |     dp_replicate_size: None
-2025-09-01 07:13:17 | INFO | root |     tp_size: 1
-2025-09-01 07:13:17 | INFO | root |     cp_size: 1
-2025-09-01 07:13:17 | INFO | root |     sequence_parallel: False
-2025-09-01 07:13:17 | INFO | root |   loss_fn:
-2025-09-01 07:13:17 | INFO | root |     _target_: <class 'nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy'>
-2025-09-01 07:13:17 | INFO | root |   dataset:
-2025-09-01 07:13:17 | INFO | root |     _target_: <class 'nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining'>
-2025-09-01 07:13:17 | INFO | root |     paths: fineweb_edu/megatron_gpt2/processed_data_*_text_document*
-2025-09-01 07:13:17 | INFO | root |     index_mapping_dir: fineweb_edu/megatron_gpt2/mapping_dir
-2025-09-01 07:13:17 | INFO | root |     tokenizer:
-2025-09-01 07:13:17 | INFO | root |       _target_: <bound method AutoTokenizer.from_pretrained of <class 'transformers.models.auto.tokenization_auto.AutoTokenizer'>>
-2025-09-01 07:13:17 | INFO | root |       pretrained_model_name_or_path: openai-community/gpt2
-2025-09-01 07:13:17 | INFO | root |     seq_length: 1024
-2025-09-01 07:13:17 | INFO | root |     split: (0.99, 0.01, 0.0)
-2025-09-01 07:13:17 | INFO | root |     splits_to_build: train
-2025-09-01 07:13:17 | INFO | root |   dataloader:
-2025-09-01 07:13:17 | INFO | root |     _target_: <class 'torchdata.stateful_dataloader.stateful_dataloader.StatefulDataLoader'>
-2025-09-01 07:13:17 | INFO | root |     collate_fn: <function default_collate at 0x76c3155f8720>
-2025-09-01 07:13:17 | INFO | root |   validation_dataset:
-2025-09-01 07:13:17 | INFO | root |     _target_: <class 'nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining'>
-2025-09-01 07:13:17 | INFO | root |     paths: fineweb_edu/megatron_gpt2/processed_data_*_text_document*
-2025-09-01 07:13:17 | INFO | root |     index_mapping_dir: fineweb_edu/megatron_gpt2/mapping_dir
-2025-09-01 07:13:17 | INFO | root |     tokenizer:
-2025-09-01 07:13:17 | INFO | root |       _target_: <bound method AutoTokenizer.from_pretrained of <class 'transformers.models.auto.tokenization_auto.AutoTokenizer'>>
-2025-09-01 07:13:17 | INFO | root |       pretrained_model_name_or_path: openai-community/gpt2
-2025-09-01 07:13:17 | INFO | root |     seq_length: 1024
-2025-09-01 07:13:17 | INFO | root |     split: (0.99, 0.01, 0.0)
-2025-09-01 07:13:17 | INFO | root |     splits_to_build: validation
-2025-09-01 07:13:17 | INFO | root |     num_val_samples: 1024
-2025-09-01 07:13:17 | INFO | root |   validation_dataloader:
-2025-09-01 07:13:17 | INFO | root |     _target_: <class 'torchdata.stateful_dataloader.stateful_dataloader.StatefulDataLoader'>
-2025-09-01 07:13:17 | INFO | root |     collate_fn: <function default_collate at 0x76c3155f8720>
-2025-09-01 07:13:17 | INFO | root |   optimizer:
-2025-09-01 07:13:17 | INFO | root |     _target_: <class 'torch.optim.adamw.AdamW'>
-2025-09-01 07:13:17 | INFO | root |     betas: [0.9, 0.95]
-2025-09-01 07:13:17 | INFO | root |     lr: 0.0006
-2025-09-01 07:13:17 | INFO | root |     weight_decay: 0.1
-2025-09-01 07:13:17 | INFO | root |   lr_scheduler:
-2025-09-01 07:13:17 | INFO | root |     lr_decay_style: cosine
-2025-09-01 07:13:17 | INFO | root |     lr_warmup_steps: 700
-2025-09-01 07:13:17 | INFO | root |     min_lr: 0.0
-2025-09-01 07:13:17 | INFO | root | Library versions:
-2025-09-01 07:13:17 | INFO | root | - nemo_automodel: 0.2.0rc0 (/opt/Automodel/nemo_automodel/__init__.py)
-2025-09-01 07:13:17 | INFO | root | - transformers: 4.55.4 (/opt/venv/lib/python3.12/site-packages/transformers/__init__.py)
-2025-09-01 07:13:17 | INFO | root | - torch: 2.8.0+cu128 CUDA 12.8
-2025-09-01 07:13:27 | INFO | root | Patched model with SDPA method= [<SDPBackend.CUDNN_ATTENTION: 3>, <SDPBackend.FLASH_ATTENTION: 1>, <SDPBackend.EFFICIENT_ATTENTION: 2>, <SDPBackend.MATH: 0>]
-2025-09-01 07:13:27 | INFO | root | Model summary:
-2025-09-01 07:13:27 | INFO | root | --------------------------------
-2025-09-01 07:13:27 | INFO | root | Trainable parameters: 124,439,808
-2025-09-01 07:13:27 | INFO | root | Total parameters: 124,439,808
-2025-09-01 07:13:27 | INFO | root | Trainable parameters percentage: 100.00%
-2025-09-01 07:13:27 | INFO | root | Param L2 norm: 234.2000
-2025-09-01 07:13:27 | INFO | root | --------------------------------
-/opt/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. 
-  warnings.warn(  # warn only once
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Let split_matrix = [(0, 0.99), (0.99, 1.0), None]
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.builder | Building GPTDataset splits with sizes=[9472000, 37888, None] and config=[random_seed: 1234, sequence_length: 1024, blend: [['fineweb_edu/megatron_gpt2/processed_data_0_text_document', 'fineweb_edu/megatron_gpt2/processed_data_10_text_document', 'fineweb_edu/megatron_gpt2/processed_data_11_text_document', 'fineweb_edu/megatron_gpt2/processed_data_12_text_document', 'fineweb_edu/megatron_gpt2/processed_data_13_text_document', 'fineweb_edu/megatron_gpt2/processed_data_1_text_document', 'fineweb_edu/megatron_gpt2/processed_data_2_text_document', 'fineweb_edu/megatron_gpt2/processed_data_3_text_document', 'fineweb_edu/megatron_gpt2/processed_data_4_text_document', 'fineweb_edu/megatron_gpt2/processed_data_5_text_document', 'fineweb_edu/megatron_gpt2/processed_data_6_text_document', 'fineweb_edu/megatron_gpt2/processed_data_7_text_document', 'fineweb_edu/megatron_gpt2/processed_data_8_text_document', 'fineweb_edu/megatron_gpt2/processed_data_9_text_document'], None], blend_per_split: None, split: 0.99, 0.01, 0.0, num_dataset_builder_threads: 1, path_to_cache: fineweb_edu/megatron_gpt2/mapping_dir, mmap_bin_files: True, tokenizer: openai-community/gpt2, mid_level_dataset_surplus: 0.005, reset_position_ids: False, reset_attention_mask: False, eod_mask_loss: False, create_attention_mask: False, drop_last_partial_validation_sequence: True, add_extra_token_to_sequence: True, split_matrix: [(0, 0.99), (0.99, 1.0), None]]
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_0_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 713000 | Documents: 713000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 728328
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_10_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 734000 | Documents: 734000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 725047
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_11_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 724000 | Documents: 724000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 726124
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_12_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 745000 | Documents: 745000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 723682
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_13_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 738000 | Documents: 738000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 725268
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_1_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 727000 | Documents: 727000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 726263
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_2_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 730000 | Documents: 730000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 726543
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_3_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 725000 | Documents: 725000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 726632
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_4_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 732000 | Documents: 732000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 726860
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_5_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 726000 | Documents: 726000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 727143
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_6_text_document.idx
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 735000 | Documents: 735000
-2025-09-01 07:13:28 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 725603
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_7_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 732000 | Documents: 732000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 726076
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_8_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 182101 | Documents: 182101
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 182792
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_9_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 729000 | Documents: 729000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset train indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 726153
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-/opt/venv/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4807: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. 
-  warnings.warn(  # warn only once
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.builder | Build and save the BlendedDataset indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.builder |  Build and save the dataset and dataset sample indexes
-2025-09-01 07:13:29 | INFO | root | Instantiating MegatronPretrainingSampler with total_samples: 9472000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Let split_matrix = [(0, 0.99), (0.99, 1.0), None]
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.builder | Building GPTDataset splits with sizes=[9472000, 1024, None] and config=[random_seed: 1234, sequence_length: 1024, blend: [['fineweb_edu/megatron_gpt2/processed_data_0_text_document', 'fineweb_edu/megatron_gpt2/processed_data_10_text_document', 'fineweb_edu/megatron_gpt2/processed_data_11_text_document', 'fineweb_edu/megatron_gpt2/processed_data_12_text_document', 'fineweb_edu/megatron_gpt2/processed_data_13_text_document', 'fineweb_edu/megatron_gpt2/processed_data_1_text_document', 'fineweb_edu/megatron_gpt2/processed_data_2_text_document', 'fineweb_edu/megatron_gpt2/processed_data_3_text_document', 'fineweb_edu/megatron_gpt2/processed_data_4_text_document', 'fineweb_edu/megatron_gpt2/processed_data_5_text_document', 'fineweb_edu/megatron_gpt2/processed_data_6_text_document', 'fineweb_edu/megatron_gpt2/processed_data_7_text_document', 'fineweb_edu/megatron_gpt2/processed_data_8_text_document', 'fineweb_edu/megatron_gpt2/processed_data_9_text_document'], None], blend_per_split: None, split: 0.99, 0.01, 0.0, num_dataset_builder_threads: 1, path_to_cache: fineweb_edu/megatron_gpt2/mapping_dir, mmap_bin_files: True, tokenizer: openai-community/gpt2, mid_level_dataset_surplus: 0.005, reset_position_ids: False, reset_attention_mask: False, eod_mask_loss: False, create_attention_mask: False, drop_last_partial_validation_sequence: True, add_extra_token_to_sequence: True, split_matrix: [(0, 0.99), (0.99, 1.0), None]]
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_0_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 713000 | Documents: 713000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7221
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_10_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 734000 | Documents: 734000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7215
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_11_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 724000 | Documents: 724000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7502
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_12_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 745000 | Documents: 745000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7209
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_13_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 738000 | Documents: 738000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7453
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_1_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 727000 | Documents: 727000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7492
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_2_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 730000 | Documents: 730000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7464
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_3_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 725000 | Documents: 725000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7362
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_4_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 732000 | Documents: 732000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7520
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_5_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 726000 | Documents: 726000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7326
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_6_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 735000 | Documents: 735000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7498
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_7_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 732000 | Documents: 732000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7531
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_8_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 182101 | Documents: 182101
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 1912
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Loading index file fineweb_edu/megatron_gpt2/processed_data_9_text_document.idx
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence lengths
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting sequence pointers
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Extracting document indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.indexed_dataset | Sequences: 729000 | Documents: 729000
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | Build and save the GPTDataset valid indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of samples: 7462
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.gpt_dataset | > total number of epochs: 1
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.builder | Build and save the BlendedDataset indices
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.datasets.llm.megatron.builder |  Build and save the dataset and dataset sample indexes
-2025-09-01 07:13:29 | INFO | root | Instantiating MegatronPretrainingSampler with total_samples: 1024
-2025-09-01 07:13:29 | INFO | nemo_automodel.recipes.llm.train_ft | Building LR scheduler with total_steps=18500, warmup_steps=700, decay_style=cosine
-2025-09-01 07:13:29 | INFO | nemo_automodel.components.optim.scheduler | learning rate decay style: cosine
-2025-09-01 07:13:29 | INFO | root | Model Part 0:
-2025-09-01 07:13:29 | INFO | root | FSDPGPT2LMHeadModel(
-2025-09-01 07:13:29 | INFO | root |   (transformer): GPT2Model(
-2025-09-01 07:13:29 | INFO | root |     (wte): Embedding(50257, 768)
-2025-09-01 07:13:29 | INFO | root |     (wpe): Embedding(1024, 768)
-2025-09-01 07:13:29 | INFO | root |     (drop): Dropout(p=0.1, inplace=False)
-2025-09-01 07:13:29 | INFO | root |     (h): ModuleList(
-2025-09-01 07:13:29 | INFO | root |       (0-11): 12 x FSDPGPT2Block(
-2025-09-01 07:13:29 | INFO | root |         (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-2025-09-01 07:13:29 | INFO | root |         (attn): GPT2Attention(
-2025-09-01 07:13:29 | INFO | root |           (c_attn): Conv1D(nf=2304, nx=768)
-2025-09-01 07:13:29 | INFO | root |           (c_proj): Conv1D(nf=768, nx=768)
-2025-09-01 07:13:29 | INFO | root |           (attn_dropout): Dropout(p=0.1, inplace=False)
-2025-09-01 07:13:29 | INFO | root |           (resid_dropout): Dropout(p=0.1, inplace=False)
-2025-09-01 07:13:29 | INFO | root |         )
-2025-09-01 07:13:29 | INFO | root |         (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-2025-09-01 07:13:29 | INFO | root |         (mlp): GPT2MLP(
-2025-09-01 07:13:29 | INFO | root |           (c_fc): Conv1D(nf=3072, nx=768)
-2025-09-01 07:13:29 | INFO | root |           (c_proj): Conv1D(nf=768, nx=3072)
-2025-09-01 07:13:29 | INFO | root |           (act): NewGELUActivation()
-2025-09-01 07:13:29 | INFO | root |           (dropout): Dropout(p=0.1, inplace=False)
-2025-09-01 07:13:29 | INFO | root |         )
-2025-09-01 07:13:29 | INFO | root |       )
-2025-09-01 07:13:29 | INFO | root |     )
-2025-09-01 07:13:29 | INFO | root |     (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
-2025-09-01 07:13:29 | INFO | root |   )
-2025-09-01 07:13:29 | INFO | root |   (lm_head): Linear(in_features=768, out_features=50257, bias=False)
-2025-09-01 07:13:29 | INFO | root | )
-2025-09-01 07:13:29 | INFO | root | Optimizer:
-2025-09-01 07:13:29 | INFO | root | AdamW (
-2025-09-01 07:13:29 | INFO | root | Parameter Group 0
-2025-09-01 07:13:29 | INFO | root |     amsgrad: False
-2025-09-01 07:13:29 | INFO | root |     betas: [0.9, 0.95]
-2025-09-01 07:13:29 | INFO | root |     capturable: False
-2025-09-01 07:13:29 | INFO | root |     decoupled_weight_decay: True
-2025-09-01 07:13:29 | INFO | root |     differentiable: False
-2025-09-01 07:13:29 | INFO | root |     eps: 1e-08
-2025-09-01 07:13:29 | INFO | root |     foreach: None
-2025-09-01 07:13:29 | INFO | root |     fused: None
-2025-09-01 07:13:29 | INFO | root |     lr: 5.9999999999999995e-05
-2025-09-01 07:13:29 | INFO | root |     maximize: False
-2025-09-01 07:13:29 | INFO | root |     weight_decay: 0.1
-2025-09-01 07:13:29 | INFO | root | )
-2025-09-01 07:13:29 | INFO | root | LR scheduler:
-2025-09-01 07:13:29 | INFO | root | OptimizerParamScheduler(
-2025-09-01 07:13:29 | INFO | root |     optimizer: AdamW
-2025-09-01 07:13:29 | INFO | root |     learning_rate:
-2025-09-01 07:13:29 | INFO | root |         init_lr: 5.9999999999999995e-05
-2025-09-01 07:13:29 | INFO | root |         max_lr: 0.0006
-2025-09-01 07:13:29 | INFO | root |         min_lr: 0.0
-2025-09-01 07:13:29 | INFO | root |         warmup_steps: 700
-2025-09-01 07:13:29 | INFO | root |         decay_steps: 18500
-2025-09-01 07:13:29 | INFO | root |         decay_style: cosine
-2025-09-01 07:13:29 | INFO | root |     weight_decay:
-2025-09-01 07:13:29 | INFO | root |         start_wd: 0.1
-2025-09-01 07:13:29 | INFO | root |         end_wd: 0.1
-2025-09-01 07:13:29 | INFO | root |         incr_steps: 18500
-2025-09-01 07:13:29 | INFO | root |         incr_style: constant
-2025-09-01 07:13:29 | INFO | root |     current_step: 0
-2025-09-01 07:13:29 | INFO | root | )
-2025-09-01 07:13:29 | INFO | root | Step scheduler:
-2025-09-01 07:13:29 | INFO | root | - Gradient accumulation steps: 8
-2025-09-01 07:13:29 | INFO | root | - Checkpoint every steps: 1000
-2025-09-01 07:13:29 | INFO | root | - Current Epoch: 0
-2025-09-01 07:13:29 | INFO | root | - Number of epochs: 1
-2025-09-01 07:13:29 | INFO | root | - Validation every steps: 250
-2025-09-01 07:13:29 | INFO | root | - Max train steps: 18500
-2025-09-01 07:13:33 | INFO | root | step 1 | epoch 0 | loss 10.9521 | grad_norm 12.9375 | lr 6.08e-05 | mem 38.39 GiB | tps 132005.57(66002.79/gpu) | num_label_tokens 524288
-2025-09-01 07:13:37 | INFO | root | step 2 | epoch 0 | loss 10.1146 | grad_norm 6.0312 | lr 6.15e-05 | mem 38.63 GiB | tps 146246.38(73123.19/gpu) | num_label_tokens 524288
-2025-09-01 07:13:41 | INFO | root | step 3 | epoch 0 | loss 9.7842 | grad_norm 3.0781 | lr 6.23e-05 | mem 38.63 GiB | tps 145236.76(72618.38/gpu) | num_label_tokens 524288
-2025-09-01 07:13:44 | INFO | root | step 4 | epoch 0 | loss 9.6514 | grad_norm 2.2812 | lr 6.31e-05 | mem 38.63 GiB | tps 144882.21(72441.11/gpu) | num_label_tokens 524288
-2025-09-01 07:13:48 | INFO | root | step 5 | epoch 0 | loss 9.5964 | grad_norm 2.2188 | lr 6.39e-05 | mem 38.63 GiB | tps 144711.55(72355.78/gpu) | num_label_tokens 524288
-```
-For each training batch, the fine-tuning recipe logs the current loss, along with current peak memory usage and tokens per second (TPS).
-
-As training progresses, you should observe the model loss beginning to converge. To verify your results, you can compare your convergence curves against the baseline benchmarks provided in the [llm.c repository](https://github.com/karpathy/llm.c/discussions/481).
-
-:::{figure} ./gpt2_loss.png
-:name: gpt2-train-loss
-:alt: Example of GPT-2 training convergence on FineWeb-Edu-10B
-:align: center
-
-Example of GPT-2 training convergence on FineWeb-Edu-10B.
-:::
diff --git a/fern/versions/nightly/pages/guides/llm/pretraining.mdx b/docs/guides/llm/pretraining.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/llm/pretraining.mdx
rename to docs/guides/llm/pretraining.mdx
diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md
deleted file mode 100644
index cafd6ab950..0000000000
--- a/docs/guides/llm/retrieval-dataset.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# Retrieval Dataset (Embedding Fine-tuning)
-
-NeMo Automodel supports **retrieval model fine-tuning** using a retrieval-style dataset: each training example is a **query** paired with **one positive** document and **one or more negative** documents.
-
-This dataset is used by the retrieval recipes (see `examples/retrieval/bi_encoder/` and `examples/retrieval/cross_encoder/`) together with the `BiEncoderCollator`.
-
-## What the Bi-Encoder Consumes
-
-The dataset factory `nemo_automodel.components.datasets.llm.make_retrieval_dataset` returns a Hugging Face `datasets.Dataset`. At runtime it transforms each raw record into the training-time schema:
-
-- `question`: query string
-- `doc_text`: list of document texts in the order `[positive, negative_1, negative_2, ...]`
-- `doc_image`: list of images (or empty strings), aligned with `doc_text`
-- `query_instruction` / `passage_instruction`: optional, used when `use_dataset_instruction: true` and the corpus provides instructions via metadata
-
-## Supported Input Formats
-
-NeMo Automodel supports **two** input schemas:
-
-### Corpus ID-Based JSON (Merlin/NeMo-Retriever Style)
-
-This is the format used by NeMo retriever pipelines where documents live in a separate **corpus** and training examples reference documents by **ID**.
-
-**Training file example (single JSON):**
-
-```json
-{
-  "corpus": [
-    { "path": "/abs/path/to/wiki_corpus" }
-  ],
-  "data": [
-    {
-      "question_id": "q_001",
-      "question": "Explain transformers",
-      "corpus_id": "wiki_corpus",
-      "pos_doc": [{ "id": "d_123" }],
-      "neg_doc": [{ "id": "d_456" }, "d_789"]
-    }
-  ]
-}
-```
-
-**Corpus requirements**
-
-Each corpus directory must contain a `merlin_metadata.json` file.
-
-Minimal example:
-
-```json
-{ "class": "TextQADataset", "corpus_id": "wiki_corpus" }
-```
-
-:::{note}
-- `pos_doc` and `neg_doc` can be lists of `{"id": ...}` dicts or raw IDs (they are normalized internally).
-- If you set `use_dataset_instruction: true`, optional fields like `query_instruction` and `passage_instruction` in `merlin_metadata.json` are surfaced to the collator.
-:::
-
-### Inline-Text JSONL (No Corpus Required)
-
-This is convenient for custom fine-tuning pipelines where the documents are included **inline**.
-
-**JSONL example (one example per line):**
-
-```json
-{"query":"Explain transformers","pos_doc":"Transformers are a type of neural network...","neg_doc":["RNNs are...","CNNs are..."]}
-{"query":"What is Python?","pos_doc":["A programming language."],"neg_doc":"A snake."}
-```
-
-:::{note}
-- `query` is accepted (`question` is also accepted as an alias).
-- `pos_doc` and `neg_doc` can be either:
-  - strings (interpreted as document text), or
-  - lists of strings, or
-  - dicts with at least `text` (optionally `image`, `nr_ocr`) for multimodal use cases.
-- If `corpus_id` is not provided, it defaults to `__inline__`.
-- `use_dataset_instruction: true` has no effect for pure inline records (instructions come from corpus metadata).
-:::
-
-## YAML Usage (Dataset + Collator)
-
-Use the dataset factory plus the bi-encoder collator:
-
-```yaml
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  dataset:
-    _target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset
-    data_dir_list:
-      - /abs/path/to/train.jsonl   # or train.json (corpus-id format)
-    data_type: train
-    n_passages: 5                 # 1 positive + 4 negatives
-    do_shuffle: true
-    use_dataset_instruction: false
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator
-    q_max_len: 512
-    p_max_len: 512
-    query_prefix: "query:"
-    passage_prefix: "passage:"
-    pad_to_multiple_of: 8
-```
-
-## Requirements
-
-- `pos_doc` must be **non-empty**.
-- If training requests negatives (e.g., `n_passages > 1`), `neg_doc` must contain **at least one** document.
\ No newline at end of file
diff --git a/fern/versions/v0.4/pages/guides/llm/retrieval-dataset.mdx b/docs/guides/llm/retrieval-dataset.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/retrieval-dataset.mdx
rename to docs/guides/llm/retrieval-dataset.mdx
diff --git a/docs/guides/llm/sequence-classification.md b/docs/guides/llm/sequence-classification.md
deleted file mode 100644
index 99effa7e4f..0000000000
--- a/docs/guides/llm/sequence-classification.md
+++ /dev/null
@@ -1,119 +0,0 @@
-# Sequence Classification (SFT/PEFT)
-
-## Introduction
-
-Sequence classification tasks (e.g., sentiment analysis, topic classification, GLUE tasks) map input text to a discrete label. NeMo AutoModel provides a lightweight recipe specialized for this setting that integrates with popular pretrained model formats and dataset sources. Integration with Hugging Face is supported.
-
-This guide shows how to train a sequence classification model using the `TrainFinetuneRecipeForSequenceClassification` recipe, including optional Parameter-Efficient Fine-Tuning (LoRA).
-
-## Quickstart
-
-Use the example config for GLUE MRPC with RoBERTa-large + LoRA:
-
-```bash
-python3 examples/llm_seq_cls/seq_cls.py --config examples/llm_seq_cls/glue/mrpc_roberta_lora.yaml
-```
-
-- Loads `roberta-large` with `num_labels: 2`
-- Builds GLUE MRPC datasets (train/validation)
-- Optionally, enables LoRA via the `peft` block
-- Trains and validates per `step_scheduler`
-
-## What is the Sequence Classification Recipe?
-
-`TrainFinetuneRecipeForSequenceClassification` is a config-driven trainer that orchestrates:
-- Model and optimizer construction
-- Dataset/Dataloader setup
-- Training and validation loops
-- Checkpointing and logging
-
-It follows the same design as the SFT recipe in the fine-tune guide, but uses a standard cross-entropy classification loss and a simplified batching pipeline.
-
-## Minimal Config Anatomy
-
-```yaml
-# GLUE MRPC with RoBERTa-large + LoRA
-step_scheduler:
-  global_batch_size: 32
-  local_batch_size: 32
-  ckpt_every_steps: 200
-  val_every_steps: 100
-  num_epochs: 2
-  max_steps: 10
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 1
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForSequenceClassification.from_pretrained
-  pretrained_model_name_or_path: roberta-large
-  num_labels: 2
-
-checkpoint:
-  enabled: true
-  checkpoint_dir: checkpoints/
-  model_save_format: safetensors
-  save_consolidated: true
-
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  dp_replicate_size: null
-  tp_size: 1
-  cp_size: 1
-  sequence_parallel: false
-
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules:
-  - "*.query"
-  - "*.value"
-  dim: 8
-  alpha: 16
-  dropout: 0.1
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.seq_cls.GLUE_MRPC
-  split: train
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.seq_cls.GLUE_MRPC
-  split: validation
-
-validation_dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-
-optimizer:
-  _target_: torch.optim.AdamW
-  betas: [0.9, 0.999]
-  eps: 1e-8
-  lr: 3.0e-4
-  weight_decay: 0
-
-
-```
-
-## Dataset Notes
-
-- For single-sentence datasets (e.g., `yelp_review_full`, `imdb`), use `YelpReviewFull` or `IMDB` from `nemo_automodel.components.datasets.llm.seq_cls`.
-- For GLUE MRPC (sentence-pair classification), use `GLUE_MRPC`, which tokenizes `(sentence1, sentence2)` with padding/truncation.
-
-## LoRA (PEFT) Settings
-
-- `target_modules`: glob to select linear layers (e.g., `"*.proj"`).
-- `dim` (rank), `alpha`, `dropout`: tune per model/compute budget. Values `dim=8, alpha=16, dropout=0.1` are a good starting point for RoBERTa.
-- The recipe automatically applies the adapters; no additional code changes are required.
-
-## Running with torchrun
-
-```bash
-torchrun --nproc-per-node=2 examples/llm_seq_cls/seq_cls.py --config examples/llm_seq_cls/glue/mrpc_roberta_lora.yaml
-```
-You can adjust the number of GPUs as necessary using the `--nproc-per-node` knob.
-
diff --git a/fern/versions/v0.4/pages/guides/llm/sequence-classification.mdx b/docs/guides/llm/sequence-classification.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/sequence-classification.mdx
rename to docs/guides/llm/sequence-classification.mdx
diff --git a/docs/guides/llm/toolcalling.md b/docs/guides/llm/toolcalling.md
deleted file mode 100644
index 29c2899998..0000000000
--- a/docs/guides/llm/toolcalling.md
+++ /dev/null
@@ -1,122 +0,0 @@
-# Function Calling with FunctionGemma
-
-This tutorial walks through fine-tuning [FunctionGemma](https://huggingface.co/google/functiongemma-270m-it), Google's 270M function-calling model, with NeMo AutoModel on the xLAM function-calling dataset.
-
-
-## FunctionGemma Introduction
-FunctionGemma is a lightweight, 270M-parameter variant built on the Gemma 3 architecture with a function-calling chat format. It is intended to be fine-tuned for task-specific function calling, and its compact size makes it practical for edge or resource-constrained deployments.
-- Gemma 3 architecture, updated tokenizer, and function-calling chat format.
-- Trained specifically for function calling: multiple tool definitions, parallel calls, tool responses, and natural-language summaries.
-- Small/edge friendly: ~270M params for fast, dense inference on-device.
-- Text-only, function-oriented model (not a general dialogue model), best used after task-specific finetuning.
-
-## Prerequisites
-- Install NeMo AutoModel and its extras: `pip install nemo-automodel`.
-- A FunctionGemma checkpoint available locally or using <https://huggingface.co/google/functiongemma-270m-it>.
-- Small model footprint: can be fine-tuned on a single GPU; scale batch/sequence as needed.
-
-## xLAM Dataset
-The xLAM function-calling dataset contains user queries, tool schemas, and tool call traces. It covers diverse tools and arguments so models learn to emit structured tool calls.
-- Dataset URL: https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k
-- Each sample provides:
-  - `query`: the user request.
-  - `tools`: tool definitions (lightweight schema).
-  - `answers`: tool calls with serialized arguments.
-
-Example entry:
-```json
-{
-  "id": 123,
-  "query": "Book me a table for two at 7pm in Seattle.",
-  "tools": [
-    {
-      "name": "book_table",
-      "description": "Book a restaurant table",
-      "parameters": {
-        "party_size": {"type": "int"},
-        "time": {"type": "string"},
-        "city": {"type": "string"}
-      }
-    }
-  ],
-  "answers": [
-    {
-      "name": "book_table",
-      "arguments": "{\"party_size\":2,\"time\":\"19:00\",\"city\":\"Seattle\"}"
-    }
-  ]
-}
-```
-
-
-The helper `make_xlam_dataset` converts each xLAM row into OpenAI-style tool schemas and tool calls, then renders them through the chat template so loss is applied only on the tool-call arguments:
-
-```python
-def _format_example(
-    example,
-    tokenizer,
-    eos_token_id,
-    pad_token_id,
-    seq_length=None,
-    padding=None,
-    truncation=None,
-):
-    tools = _convert_tools(_json_load_if_str(example["tools"]))
-    tool_calls = _convert_tool_calls(_json_load_if_str(example["answers"]), example_id=example.get("id"))
-
-    formatted_text = [
-        {"role": "user", "content": example["query"]},
-        {"role": "assistant", "content": "", "tool_calls": tool_calls},
-    ]
-
-    return format_chat_template(
-        tokenizer=tokenizer,
-        formatted_text=formatted_text,
-        tools=tools,
-        eos_token_id=eos_token_id,
-        pad_token_id=pad_token_id,
-        seq_length=seq_length,
-        padding=padding,
-        truncation=truncation,
-        answer_only_loss_mask=True,
-    )
-```
-
-
-
-## Run Full-Parameter SFT
-Use the ready-made config at [`examples/llm_finetune/gemma/functiongemma_xlam.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/gemma/functiongemma_xlam.yaml) to start fine-tuning:
-
-
-
-With the config in place, launch training (8 GPUs shown; adjust `--nproc-per-node` as needed):
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/gemma/functiongemma_xlam.yaml
-```
-
-You should be able to see a training loss curve similar to the one shown below:
-
-<p align="center">
-  <img src="https://raw.githubusercontent.com/NVIDIA-NeMo/Automodel/main/docs/guides/llm/functiongemma-sft-loss.png" alt="FunctionGemma SFT loss" width="400">
-</p>
-
-## Run PEFT (LoRA)
-To apply LoRA (PEFT), uncomment the `peft` block in the config and tune rank/alpha/targets per the [SFT/PEFT guide](finetune.md). Example override:
-
-```yaml
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules: '*_proj'
-  dim: 16
-  alpha: 16
-  use_triton: true
-```
-Then fine-tune with the same recipe. Adjust the number of GPUs as needed.
-```bash
-automodel examples/llm_finetune/gemma/functiongemma_xlam.yaml
-```
-
-<p align="center">
-  <img src="https://raw.githubusercontent.com/NVIDIA-NeMo/Automodel/main/docs/guides/llm/functiongemma-peft-loss.png" alt="FunctionGemma PEFT loss" width="400">
-</p>
diff --git a/fern/versions/v0.4/pages/guides/llm/toolcalling.mdx b/docs/guides/llm/toolcalling.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/llm/toolcalling.mdx
rename to docs/guides/llm/toolcalling.mdx
diff --git a/docs/guides/mlflow-logging.md b/docs/guides/mlflow-logging.md
deleted file mode 100644
index 7b5422c3b0..0000000000
--- a/docs/guides/mlflow-logging.md
+++ /dev/null
@@ -1,256 +0,0 @@
-# MLflow Logging
-
-## Introduction
-
-MLflow is an open-source platform for managing the machine learning lifecycle, including experiment tracking, model versioning, and deployment. NeMo AutoModel integrates with MLflow to log training metrics, parameters, and artifacts during model training.
-
-With MLflow integration, you can:
-- Track and compare experiments across multiple runs
-- Log hyperparameters and training configurations
-- Monitor training and validation metrics in real-time
-- Store model checkpoints and artifacts
-- Visualize experiment results through the MLflow UI
-- Share results with team members
-
-## Prerequisites
-
-Before using MLflow logging in NeMo AutoModel, ensure you have:
-
-1. **MLflow installed**: MLflow is installed with `nemo-automodel` by default. If you see an import error in your environment, install it manually:
-   ```bash
-   pip install mlflow
-   # or:
-   uv pip install mlflow
-   ```
-
-2. **MLflow tracking server** (optional): For production use, set up a tracking server to centralize experiment data. For local development, MLflow will use a local file-based store by default.
-
-## Configuration
-
-Enable MLflow logging by adding an `mlflow` section to your recipe YAML configuration:
-
-```yaml
-mlflow:
-  experiment_name: "automodel-llm-llama3_2_1b_squad-finetune"
-  run_name: ""
-  tracking_uri: null
-  artifact_location: null
-  tags:
-    task: "squad-finetune"
-    model_family: "llama3.2"
-    model_size: "1b"
-    dataset: "squad"
-    framework: "automodel"
-```
-
-### Configuration Parameters
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `experiment_name` | str | "automodel-experiment" | Name of the MLflow experiment. All runs are grouped under this experiment. |
-| `run_name` | str | "" | Optional name for the current run. If empty, MLflow generates a unique name. |
-| `tracking_uri` | str | null | URI of the MLflow tracking server. If null, uses local file-based storage. |
-| `artifact_location` | str | null | Location to store artifacts. If null, uses default MLflow location. |
-| `tags` | dict | {} | Dictionary of tags to attach to the run for organization and filtering. |
-
-### Tracking URI Options
-
-The `tracking_uri` parameter determines where MLflow stores experiment data:
-
-- **Local file storage (default)**: `null` or `file:///path/to/mlruns`
-- **Remote tracking server**: `http://your-mlflow-server:5000`
-- **Database backend**: `postgresql://user:password@host:port/database`
-
-For team collaboration, we recommend setting up a remote tracking server.
-
-## What Gets Logged
-
-NeMo AutoModel automatically logs the following information to MLflow:
-
-### Metrics
-- Training loss at each step
-- Validation loss and metrics
-- Learning rate schedule
-- Gradient norms (if gradient clipping is enabled)
-
-### Parameters
-- Model configuration (architecture, size, pretrained checkpoint)
-- Training hyperparameters (learning rate, batch size, optimizer settings)
-- Dataset information
-- Parallelism configuration (DP, TP, CP settings)
-
-### Tags
-- Custom tags from configuration
-- Automatically added tags:
-  - Model name from `pretrained_model_name_or_path`
-  - Global and local batch sizes
-
-### Artifacts
-- Model checkpoints (if configured)
-- Training configuration files
-
-:::{note}
-Only rank 0 in distributed training logs to MLflow to avoid duplicate entries and reduce overhead.
-:::
-
-## Usage Example
-
-Here's a complete example of training with MLflow logging enabled:
-
-### Configure Your Recipe
-
-Add the MLflow configuration to your YAML file (e.g., `llama3_2_1b_squad.yaml`):
-
-```yaml
-step_scheduler:
-  global_batch_size: 64
-  local_batch_size: 8
-  ckpt_every_steps: 1000
-  val_every_steps: 10
-  num_epochs: 1
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-mlflow:
-  experiment_name: "llama3-squad-finetune"
-  run_name: "baseline-run-1"
-  tracking_uri: null  # Uses local storage
-  tags:
-    task: "question-answering"
-    dataset: "squad"
-    model: "llama-3.2-1b"
-```
-
-### Run Training
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-During training, you'll see MLflow logging messages:
-
-```
-MLflow run started: abc123def456
-View run at: file:///path/to/mlruns/#/experiments/1/runs/abc123def456
-```
-
-### View Results in MLflow UI
-
-Launch the MLflow UI to visualize your experiments:
-
-```bash
-mlflow ui
-```
-
-By default, the UI runs at `http://localhost:5000`. Open this URL in your browser to:
-- Compare metrics across runs
-- View parameter configurations
-- Download artifacts
-- Filter and search experiments by tags
-
-## Integration with Other Loggers
-
-MLflow can be used alongside other logging tools like Weights & Biases (WandB). Simply enable both in your configuration:
-
-```yaml
-# Enable both MLflow and WandB
-mlflow:
-  experiment_name: "my-experiment"
-  tags:
-    framework: "automodel"
-
-wandb:
-  project: "my-project"
-  entity: "my-team"
-  name: "my-run"
-```
-
-Both loggers will track the same metrics independently, allowing you to leverage the strengths of each platform.
-
-## Best Practices
-
-### Experiment Organization
-
-1. **Use descriptive experiment names**: Group related runs under meaningful experiment names.
-   ```yaml
-   experiment_name: "llama3-squad-ablation-study"
-   ```
-
-2. **Tag your runs**: Add tags for easy filtering and comparison.
-   ```yaml
-   tags:
-     model_size: "1b"
-     learning_rate: "1e-5"
-     optimizer: "adam"
-   ```
-
-3. **Use run names for variants**: Differentiate runs within an experiment.
-   ```yaml
-   run_name: "lr-1e5-bs64"
-   ```
-
-### Remote Tracking Server
-
-For team collaboration, set up a shared MLflow tracking server:
-
-```yaml
-mlflow:
-  tracking_uri: "http://mlflow-server.example.com:5000"
-  experiment_name: "team-llm-experiments"
-```
-
-### Artifact Storage
-
-For large-scale experiments, configure a dedicated artifact location:
-
-```yaml
-mlflow:
-  artifact_location: "s3://my-bucket/mlflow-artifacts"
-```
-
-Supported storage backends include S3, Azure Blob Storage, Google Cloud Storage, and network file systems.
-
-### Performance Considerations
-
-- MLflow logging adds minimal overhead since only rank 0 logs.
-- Metrics are logged asynchronously to avoid blocking training.
-- For very frequent logging (every step), consider increasing `val_every_steps` to reduce I/O.
-
-## Troubleshooting
-
-### MLflow Not Installed
-
-If you see an import error:
-```
-ImportError: MLflow is not installed. Please install it (e.g. pip install mlflow).
-```
-
-Install MLflow:
-```bash
-pip install mlflow
-# or:
-uv pip install mlflow
-```
-
-### Connection Issues
-
-If you can't connect to a remote tracking server:
-- Verify the `tracking_uri` is correct
-- Check network connectivity and firewall rules
-- Ensure the tracking server is running
-
-### Missing Metrics
-
-If metrics aren't appearing in MLflow:
-- Verify you're running on rank 0 or check rank 0 logs
-- Ensure the MLflow run started successfully (check for "MLflow run started" message)
-- Check that metrics are being computed during training
-
-## References
-
-- [MLflow Documentation](https://mlflow.org/docs/latest/index.html)
-- [MLflow Tracking](https://mlflow.org/docs/latest/tracking.html)
-- [MLflow Python API](https://mlflow.org/docs/latest/python_api/index.html)
-- [NeMo AutoModel Examples](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples)
\ No newline at end of file
diff --git a/fern/versions/v0.4/pages/guides/mlflow-logging.mdx b/docs/guides/mlflow-logging.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/mlflow-logging.mdx
rename to docs/guides/mlflow-logging.mdx
diff --git a/docs/guides/omni/gemma3-3n.md b/docs/guides/omni/gemma3-3n.md
deleted file mode 100644
index ecb81eb97d..0000000000
--- a/docs/guides/omni/gemma3-3n.md
+++ /dev/null
@@ -1,333 +0,0 @@
-# Fine-Tune Gemma 3 and Gemma 3n
-
-This document explains how to fine-tune Gemma 3 and Gemma 3n using NeMo AutoModel. It outlines key operations, including initiating SFT and PEFT-LoRA runs and managing experiment configurations using YAML.
-
-To set up your environment to run NeMo AutoModel, follow the [Installation Guide](https://github.com/NVIDIA-NeMo/Automodel#-install-nemo-automodel).
-
-## Data
-
-### MedPix-VQA Dataset
-
-The [MedPix-VQA](https://huggingface.co/datasets/mmoukouba/MedPix-VQA) dataset is a comprehensive medical Visual Question-Answering dataset designed for training and evaluating VQA models in the medical domain. It contains medical images from MedPix, a well-known medical image database, paired with questions and answers that focus on medical image interpretation.
-
-The dataset consists of 20,500 examples with the following structure:
-- **Training Set**: 17,420 examples (85%)
-- **Validation Set**: 3,080 examples (15%)
-- **Columns**: `image_id`, `mode`, `case_id`, `question`, `answer`
-
-### Preprocess the Dataset
-
-NeMo AutoModel provides built-in preprocessing for the MedPix-VQA dataset through the `make_medpix_dataset` function. Here's how the preprocessing works:
-
-```python
-from nemo_automodel.components.datasets.vlm.datasets import make_medpix_dataset
-
-# Load and preprocess the dataset
-dataset = make_medpix_dataset(
-    path_or_dataset="mmoukouba/MedPix-VQA",
-    split="train"
-)
-```
-
-The preprocessing pipeline performs the following steps:
-
-1. **Loads the dataset** using the Hugging Face `datasets` library.
-2. **Extracts question-answer pairs** by processing the `question` and `answer` fields from the dataset.
-3. **Converts to the Hugging Face message list format** to restructure the data into a chat-style format compatible with the Autoprocessor's `apply_chat_template` function.
-
-```python
-# Example of the conversation format created
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image", "image": example["image_id"]},
-            {"type": "text", "text": example["question"]},
-        ],
-    },
-    {
-        "role": "assistant",
-        "content": [{"type": "text", "text": example["answer"]}]
-    },
-]
-```
-
-### Use the Collate Functions
-
-NeMo AutoModel provides specialized collate functions for different VLM processors. The collate function is responsible for batching examples and preparing them for model input.
-
-Both Gemma 3 and Gemma 3n models work seamlessly with the Hugging Face `AutoProcessor` and use the default collate function:
-
-```python
-processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
-# For Gemma 3n, get processor:
-# processor = AutoProcessor.from_pretrained("google/gemma-3n-e4b-it")
-
-# For Gemma 3 and Gemma 3n, use the default collate function
-def default_collate_fn(examples: list, processor) -> dict[str, torch.Tensor]:
-    batch = processor.apply_chat_template(
-        [example["conversation"] for example in examples],
-        tokenize=True,
-        add_generation_prompt=False,
-        return_tensors="pt",
-        return_dict=True,
-    )
-
-    labels = batch["input_ids"].clone()[:, 1:]
-    labels = torch.cat([labels, -100 * torch.ones_like(labels[:, :1])], dim=1)
-    batch["labels"] = labels
-    loss_mask = create_batch_loss_masks(
-        batch["input_ids"], processor, start_of_response_token=start_of_response_token
-    )
-    batch["loss_mask"] = loss_mask
-
-    return batch
-```
-
-The default collate function:
-- Applies the processor's chat template to convert message lists into model-ready inputs.
-- Creates labels for training to guide supervised learning.
-- Masks prompts and special tokens so that only answer tokens are considered during loss calculation.
-
-### Preprocess Custom Datasets
-
-When using a custom dataset with a model whose Hugging Face `AutoProcessor` supports the `apply_chat_template` method, you'll need to convert your data into the Hugging Face message list format expected by the `apply_chat_template`.
-We provide [examples](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/vlm/datasets.py) demonstrating how to perform this conversion.
-
-Some models, such as [Qwen2.5 VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), have specific preprocessing requirements and require custom collate functions. For instance, Qwen2.5-VL uses the `qwen_vl_utils.process_vision_info` function to process images:
-
-```python
-
-texts = [processor.apply_chat_template(example["conversation"], tokenize=False) for example in examples]
-image_inputs = [process_vision_info(example["conversation"])[0] for example in examples]
-
-batch = processor(
-    text=texts,
-    images=image_inputs,
-    padding=True,
-    return_tensors="pt",
-)
-
-```
-If your dataset requires custom preprocessing logic, you can define a custom collate function. To use it, specify the function in your YAML configuration:
-
-```yaml
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  batch_size: 1
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.vlm.collate_fns.qwen2_5_collate_fn
-```
-
-We provide [example custom collate functions](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/vlm/collate_fns.py) that you can use as references for your implementation.
-
-## Run the Fine-Tune Script
-
-Use the `automodel` CLI to launch fine-tuning with a YAML configuration file.
-
-### Apply YAML-Based Configuration
-
-NeMo AutoModel uses a flexible configuration system that combines YAML configuration files with command-line overrides. This allows you to maintain base configurations while easily experimenting with different parameters.
-
-The simplest way to run fine-tuning is with a YAML configuration file. We provide configs for both Gemma 3 and Gemma 3n.
-
-:::{note}
-These VLM recipes require the optional `vlm` dependency set. If you see `ImportError: qwen_vl_utils is not installed`, install VLM dependencies first:
-
-```bash
-uv sync --frozen --extra vlm
-```
-
-(If you're using pip: `pip3 install "nemo-automodel[vlm]"`.)
-:::
-
-#### Run Gemma 3 Fine-Tuning
-
-* **Single-GPU**
-
-```bash
-automodel examples/vlm_finetune/gemma3/gemma3_vl_4b_medpix.yaml
-```
-
-* **Multi-GPU**
-
-```bash
-automodel --nproc-per-node=2 examples/vlm_finetune/gemma3/gemma3_vl_4b_medpix.yaml
-```
-
-#### Run Gemma 3n Fine-Tuning
-
-* **Single-GPU**
-
-```bash
-automodel examples/vlm_finetune/gemma3n/gemma3n_vl_4b_medpix.yaml
-```
-
-* **Multi-GPU**
-
-```bash
-automodel --nproc-per-node=2 examples/vlm_finetune/gemma3n/gemma3n_vl_4b_medpix.yaml
-```
-
-#### Override Configuration Parameters
-
-You can override any configuration parameter using dot-notation without modifying the YAML file:
-
-```bash
-automodel examples/vlm_finetune/gemma3/gemma3_vl_4b_medpix.yaml \
-    --step_scheduler.ckpt_every_steps 100 \
-    --step_scheduler.max_steps 1000 \
-    --optimizer.lr 2e-5 \
-    --rng.seed 1234
-```
-
-### Configure Model Freezing
-
-NeMo AutoModel supports parameter freezing, allowing you to control which parts of a model remain trainable during fine-tuning. This is especially useful for VLMs, where you may want to preserve the pre-trained visual and audio encoders while adapting only the language model components.
-
-With the freezing configuration, you can selectively freeze specific parts of the model to suit your training objectives:
-
-```yaml
-freeze_config:
-  freeze_embeddings: true        # Freeze embeddings
-  freeze_vision_tower: true      # Freeze vision encoder (recommended for VLMs)
-  freeze_audio_tower: true       # Freeze audio encoder (for multimodal models)
-  freeze_language_model: false   # Allow language model adaptation
-```
-
-### Run Parameter-Efficient Fine-Tuning
-
-For memory-efficient training, you can use Low-Rank Adaptation (LoRA) instead of full fine-tuning. NeMo AutoModel provides a dedicated PEFT recipe for Gemma 3:
-
-To run PEFT with Gemma 3:
-
-```bash
-automodel examples/vlm_finetune/gemma3/gemma3_vl_4b_medpix_peft.yaml
-```
-
-The LoRA configuration excludes vision and audio components from adaptation to preserve pre-trained visual representations:
-
-```yaml
-peft:
-  peft_fn: nemo_automodel._peft.lora.apply_lora_to_linear_modules
-  match_all_linear: False
-  exclude_modules:  # exclude all vision and audio modules and lm_head
-    - "*vision_tower*"
-    - "*vision*"
-    - "*visual*"
-    - "*audio*"
-    - "*image_encoder*"
-    - "*lm_head*"
-  dim: 8
-  alpha: 32
-  use_triton: True
-```
-
-The training loss should look similar to the example below:
-
-```{image} medpix_peft.jpg
-:alt: Training Loss Curve
-:class: bg-primary
-:width: 400px
-:align: center
-```
-
-### Checkpointing
-
-We support training state checkpointing in either [Safetensors](https://huggingface.co/docs/safetensors/en/index) or [PyTorch DCP](https://docs.pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html) format.
-
-```yaml
-checkpoint:
-  enabled: true
-  checkpoint_dir: vlm_checkpoints/
-  model_save_format: torch_save  # or "safetensors"
-  save_consolidated: false
-```
-
-#### Integrate Weights & Biases
-
-You can enable W&B logging by setting your API key and configuring the logger:
-
-```bash
-export WANDB_API_KEY=<YOUR_WANDB_API_KEY>
-```
-
-Then, add the W&B configuration to your YAML file:
-
-```yaml
-wandb:
-  project: nemo_automodel_vlm
-  entity: your_entity
-  name: gemma3_medpix_vqa_experiment
-  save_dir: ./wandb_logs
-```
-
-## Run Inference
-
-After fine-tuning your Gemma 3 or Gemma 3n model, you can use it for inference on new image-text tasks.
-
-### Generation Script
-
-The inference functionality is provided through [`examples/vlm_generate/generate.py`](../../../examples/vlm_generate/generate.py), which supports loading fine-tuned checkpoints and performing image-text generation.
-
-#### Basic Usage
-
-```bash
-uv run examples/vlm_generate/generate.py \
-    --checkpoint-path /path/to/checkpoint \
-    --prompt "Describe this image." \
-    --base-model google/gemma-3-4b-it \
-    --image /path/to/image.jpg
-```
-
-The output can be either `text` (default) or `json`, with an optional write file.
-
-For models trained on MedPix-VQA, load the trained checkpoint and generate outputs using the following command. Be sure to specify the same base model used during training:
-
-```bash
-uv run examples/vlm_generate/generate.py \
-    --checkpoint-path vlm_checkpoints/epoch_0_step_200 \
-    --prompt "What medical condition is shown in this image?" \
-    --base-model google/gemma-3-4b-it \
-    --image medical_image.jpg
-```
-
-When checkpoints are saved from PEFT training, they contain only the adapter weights. To use them for generation, you need to specify the PEFT configuration.
-Run the following command to load and generate from adapters trained on MedPix-VQA:
-
-```bash
-uv run examples/vlm_generate/generate.py \
-    --checkpoint-path peft_vlm_checkpoints/epoch_0_step_200/ \
-    --prompt "What medical condition is shown in this image?" \
-    --image-url medical_image.jpg \
-    --base-model google/gemma-3-4b-it \
-    --is-peft \
-    --peft-exclude-modules *vision_tower* *vision* *visual* *audio* *image_encoder* *lm_head*
-```
-
-Given the following image:
-
-```{image} medpix.jpg
-:alt: Sample image from the MedPix dataset
-:class: bg-primary
-:width: 200px
-:align: center
-```
-
-And the prompt:
-
-```
-How does the interhemispheric fissure appear in this image?
-```
-
-Example Gemma 3 response:
-
-```
-The interhemispheric fissure appears as a dark streak, indicating significant tissue loss.
-```
-
-Example Gemma 3n response:
-
-```
-The interhemispheric fissure appears somewhat obscured by the fluid-filled mass.
-```
diff --git a/fern/versions/v0.4/pages/guides/omni/gemma3-3n.mdx b/docs/guides/omni/gemma3-3n.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/omni/gemma3-3n.mdx
rename to docs/guides/omni/gemma3-3n.mdx
diff --git a/docs/guides/overview.md b/docs/guides/overview.md
deleted file mode 100644
index 058819490d..0000000000
--- a/docs/guides/overview.md
+++ /dev/null
@@ -1,118 +0,0 @@
-## Recipes and End-to-End Examples
-
-NeMo Automodel is organized around two key concepts: recipes and components.
-
-Recipes are executable scripts configured with YAML files. Each recipe defines its own training and validation loop, orchestrated through a `step_scheduler`. It specifies the model, dataset, loss function, optimizer, scheduler, checkpointing, and distributed training settings—allowing end-to-end training with a single command.
-
-Components are modular, plug-and-play building blocks referenced using the `_target_` field. These include models, datasets, loss functions, and distribution managers. Recipes assemble these components, making it easy to swap them out to change precision, distribution strategy, dataset, or task—without modifying the training loop itself.
-
-This page maps the ready-to-run recipes found in the `examples/` directory to their intended use cases, representative model families, and the most relevant how-to guides.
-
-- Examples root: [examples/ (GitHub)](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples)
-- Getting started: [Installation](installation.md)
-
-## Large Language Models (LLM)
-This section provides practical recipes and configurations for working with large language models across three core workflows: fine-tuning, pretraining, and knowledge distillation.
-
-### Fine-Tuning
-
-End-to-end fine-tuning recipes for many open models. Each subfolder contains YAML configurations showing task setups (e.g., SQuAD, HellaSwag), precision options (e.g., FP8), and parameter-efficient methods (e.g., LoRA/QLoRA).
-
-- Folder: [examples/llm_finetune](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune)
-- Representative families: Llama 3.1/3.2/3.3, Gemma 2/3, Falcon 3, Mistral/Mixtral, Nemotron, Granite, Starcoder, Qwen, Baichuan, GLM, OLMo, Phi, GPT-OSS, Moonlight
-- How-to guide: [LLM finetuning](llm/finetune.md)
-
-### Pretraining
-
-Starter configurations and scripts for pretraining with datasets from different stacks (e.g., PyTorch, Megatron Core).
-
-- Folder: [examples/llm_pretrain](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_pretrain)
-- Example models: GPT-2 baseline, NanoGPT, DeepSeek-V3, Moonlight 16B TE (Slurm)
-- How-to guides:
-  - [LLM pretraining](llm/pretraining.md)
-  - [Pretraining with NanoGPT](llm/nanogpt-pretraining.md)
-
-### Knowledge Distillation (KD)
-
-Recipes for distilling knowledge from a large teacher model into a smaller, more efficient student model.
-
-- Folder: [examples/llm_kd](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_kd)
-- Example model: Llama 3.2 1B
-- How-to guide: [Knowledge distillation](llm/knowledge-distillation.md)
-
-### Benchmark Configurations
-
-Curated configurations for benchmarking different training stacks and settings (e.g., Torch vs. TransformerEngine + DeepEP, various model sizes, MoE variants).
-
-- Folder: [examples/llm_benchmark](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark)
-- Representative configurations: DeepSeek-V3, GPT-OSS (20B/120B), Kimi K2, Moonlight 16B, Qwen3 MoE 30B
-
-
-## Vision Language Models (VLM)
-This section provides practical recipes and configurations for working with vision language models, covering fine-tuning and generation workflows for multimodal tasks.
-
-### Fine-Tuning
-
-Fine-tuning recipes for VLMs.
-
-- Folder: [examples/vlm_finetune](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/vlm_finetune)
-- Representative family: Gemma 3 (various configurations)
-- How-to guide: [Gemma 3n: Efficient multimodal fine-tuning](omni/gemma3-3n.md)
-
-### Generation
-
-Simple generation script and configurations for VLMs.
-
-- Folder: [examples/vlm_generate](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/vlm_generate)
-
-## Audio Models (ASR)
-
-This section provides recipes for fine-tuning omni / audio-capable models on automatic speech recognition (ASR) tasks. The recipes reuse the VLM training stack but operate on `{audio, text}` HuggingFace datasets (AMI, LibriSpeech, GigaSpeech, CommonVoice, etc.).
-
-### Fine-Tuning
-
-End-to-end ASR fine-tuning of `Qwen3-Omni-30B-A3B-Instruct` on any HuggingFace audio dataset, including a thinker-only checkpoint export step for downstream `transformers` / vLLM loading.
-
-- Folder: [examples/audio_finetune/qwen3_omni_asr](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/audio_finetune/qwen3_omni_asr)
-- Representative model: Qwen3-Omni-30B-A3B-Instruct
-- How-to guide: [Fine-tune Qwen3-Omni for ASR](audio/qwen3-omni-asr.md)
-
-## Diffusion Models (Text-to-Image & Text-to-Video)
-
-Text-to-image and text-to-video diffusion models can generate visual content from natural language descriptions. Fine-tuning lets you adapt these models to a specific style, domain, or dataset — for example, generating product videos in your brand's aesthetic. Pretraining gives you full control when no existing model fits your needs.
- 
-This section walks through the full workflow in NeMo AutoModel: preparing your dataset, training the model, and generating outputs.
-
-### Fine-Tuning
-
-Fine-tuning recipes for adapting pretrained diffusion models to your data.
-
-- Folder: [examples/diffusion/finetune](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/diffusion/finetune)
-- Representative models: FLUX.1-dev (T2I, 12B), Wan 2.1 T2V 1.3B, HunyuanVideo 1.5
-- How-to guide: [Diffusion fine-tuning](diffusion/finetune.md)
-
-### Pretraining
-
-Pretraining recipes for training diffusion models from scratch on large-scale datasets.
-
-- Folder: [examples/diffusion/pretrain](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/diffusion/pretrain)
-- Representative models: Wan 2.1 T2V 1.3B, FLUX.1-dev
-- How-to guide: [Diffusion fine-tuning (pretraining section)](diffusion/finetune.md#configure-your-training-recipe)
-
-### Generation
-
-Generation scripts and configs for running inference with pretrained or fine-tuned diffusion models.
-
-- Folder: [examples/diffusion/generate](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/diffusion/generate)
-- Representative models: Wan 2.1 1.3B, FLUX.1-dev, HunyuanVideo
-- How-to guide: [Diffusion generation](diffusion/finetune.md#generation--inference)
-
-### Dataset Preparation
-
-Preprocessing pipeline to create `.meta` files containing VAE latents and text embeddings.
-
-- How-to guide: [Diffusion dataset preparation](diffusion/dataset.md)
-
----
-
-If you are new to the project, begin with the [Installation](installation.md) guide. Then, select a recipe category above and follow its linked how-to guide(s). The provided YAML configurations can serve as templates—customize them by adapting model names, datasets, and precision settings to match your specific needs.
diff --git a/fern/versions/nightly/pages/guides/overview.mdx b/docs/guides/overview.mdx
similarity index 88%
rename from fern/versions/nightly/pages/guides/overview.mdx
rename to docs/guides/overview.mdx
index 63becad471..8f44d8b9aa 100644
--- a/fern/versions/nightly/pages/guides/overview.mdx
+++ b/docs/guides/overview.mdx
@@ -69,6 +69,18 @@ Simple generation script and configurations for VLMs.
 
 - Folder: [examples/vlm_generate](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/vlm_generate)
 
+## Audio Models (ASR)
+
+This section provides recipes for fine-tuning omni / audio-capable models on automatic speech recognition (ASR) tasks. The recipes reuse the VLM training stack but operate on `{audio, text}` Hugging Face datasets (AMI, LibriSpeech, GigaSpeech, Common Voice, etc.).
+
+### Fine-Tuning
+
+End-to-end ASR fine-tuning of `Qwen3-Omni-30B-A3B-Instruct` on any Hugging Face audio dataset, including a thinker-only checkpoint export step for downstream `transformers` / vLLM loading.
+
+- Folder: [examples/audio_finetune/qwen3_omni_asr](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/audio_finetune/qwen3_omni_asr)
+- Representative model: Qwen3-Omni-30B-A3B-Instruct
+- How-to guide: [Fine-tune Qwen3-Omni for ASR](/recipes-e2e-examples/qwen3-omni-asr)
+
 ## Diffusion Models (Text-to-Image & Text-to-Video)
 
 Text-to-image and text-to-video diffusion models can generate visual content from natural language descriptions. Fine-tuning lets you adapt these models to a specific style, domain, or dataset — for example, generating product videos in your brand's aesthetic. Pretraining gives you full control when no existing model fits your needs.
diff --git a/docs/guides/pipelining.md b/docs/guides/pipelining.md
deleted file mode 100644
index 751f98e24f..0000000000
--- a/docs/guides/pipelining.md
+++ /dev/null
@@ -1,742 +0,0 @@
-# Pipeline Parallelism with AutoPipeline
-
-## Introduction
-
-As large language models continue to grow in size, training and fine-tuning them efficiently across multiple GPUs has become increasingly challenging. While data parallelism works well for smaller models, models with billions of parameters require more sophisticated parallelization strategies to overcome memory constraints and communication overhead.
-
-Pipeline parallelism addresses these challenges by splitting a model's layers across different devices and processing them in a pipelined fashion. Each device processes a different stage of the model, enabling training of models that wouldn't fit on a single device while maintaining high GPU utilization through overlapped computation.
-
-AutoPipeline is NeMo AutoModel's high-level pipeline parallelism interface specifically designed for Hugging Face models, making pipeline parallelism as simple as data parallelism. Built on PyTorch's native `torch.distributed.pipelining`, AutoPipeline provides seamless pipeline parallelism support for any Hugging Face decoder-only causal language model with minimal code changes.
-
-For custom models and more granular control, the functional API in `nemo_automodel.components.distributed.pipelining.functional` provides modular, accessible building blocks that can be used with any PyTorch model architecture.
-
-This guide walks you through the complete process of using AutoPipeline for Hugging Face models and the functional API for custom models. You'll learn how to configure pipeline stages, integrate with existing training workflows, optimize performance, and combine pipeline parallelism with other parallelization strategies.
-
-
-**Prerequisites:**
-
-```bash
-# Install uv from https://docs.astral.sh/uv/getting-started/installation/
-# Initialize the virtual environment using uv
-uv venv
-
-# Install the latest stable release from PyPI
-uv pip install nemo-automodel
-
-# Or install from source for the latest features
-uv pip install git+https://github.com/NVIDIA-NeMo/Automodel.git
-```
-:::{important}
-Before proceeding with this guide, please ensure that you have NeMo AutoModel installed on your machine.
-For a complete guide and additional options please consult the AutoModel [Installation Guide](./installation.md).
-:::
-
-## Key Features
-
-AutoPipeline provides the following capabilities:
-
-- **Universal Hugging Face Support**: Works with any Hugging Face decoder-only causal language model including Llama, Qwen, Mistral, Gemma, and more
-- **PyTorch Native Integration**: Built on PyTorch's `torch.distributed.pipelining` for optimal performance
-- **Flexible Configuration**: Multiple scheduling strategies, configurable microbatch sizes, and automatic or manual layer splitting
-- **Mixed Parallelism Support**: Combine pipeline parallelism with data parallelism, tensor parallelism, and FSDP
-- **Modular Functional API**: For custom models, the functional module provides accessible, low-level building blocks
-- **Minimal Opinions**: Easy to extend and integrate with existing training workflows
-
-## Quick Start with AutoPipeline (Hugging Face Models)
-
-Here's a minimal example to get started with AutoPipeline using 2 pipeline stages with a Hugging Face model:
-
-```python
-import torch
-from torch.distributed.device_mesh import init_device_mesh
-from nemo_automodel.components.distributed.pipelining import AutoPipeline
-from transformers import AutoModelForCausalLM
-from transformers.integrations.accelerate import init_empty_weights
-from transformers.modeling_utils import no_init_weights
-from transformers.utils import ContextManagers
-
-def loss_fn(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
-    """Define loss function for pipeline training."""
-    return torch.nn.functional.cross_entropy(
-        logits.float().view(-1, logits.size(-1)),
-        targets.view(-1),
-        ignore_index=-100
-    )
-
-if __name__ == "__main__":
-    # 1) Initialize device mesh with 2 pipeline stages
-    world_mesh = init_device_mesh("cuda", mesh_shape=(2,), mesh_dim_names=("pp",))
-
-    # 2) Load model on meta device to avoid OOM with large models
-    init_ctx = ContextManagers([no_init_weights(), init_empty_weights()])
-    with init_ctx:
-        model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
-
-    # 3) Configure and build pipeline
-    ap = AutoPipeline(
-        world_mesh=world_mesh,
-        pp_axis_name="pp",
-        pp_schedule="1f1b",
-        pp_microbatch_size=1,
-        pp_batch_size=8,  # Total batch size across pipeline
-        device=torch.cuda.current_device(),
-        dtype=torch.bfloat16,
-    ).build(model, loss_fn=loss_fn)
-
-    # 4) Access pipeline components
-    print(ap.debug_summary())
-    print(ap.pretty_print_stages())
-```
-
-### Run the Quick Start Example
-
-Save the above code as `pipeline_example.py` and run with:
-
-```bash
-# Run with 2 GPUs for 2 pipeline stages
-uv run torchrun --nproc-per-node=2 pipeline_example.py
-```
-
-For a complete training example:
-
-```bash
-# Run fine-tuning with 2-way pipeline parallelism using Llama 3.1 8B
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_1/llama3_1_8b_hellaswag_pp.yaml
-```
-
-## Configuration Options
-
-### Basic Configuration
-
-AutoPipeline provides comprehensive control over pipeline behavior:
-
-```python
-ap = AutoPipeline(
-    # Device mesh configuration
-    world_mesh=world_mesh,           # DeviceMesh with pipeline axis
-    pp_axis_name="pp",              # Name of pipeline axis (default: "pp")
-
-    # Schedule configuration
-    pp_schedule="1f1b",             # Pipeline schedule ("1f1b", "looped_bfs", etc.)
-    pp_microbatch_size=1,           # Microbatch size per stage
-    # pp_batch_size is automatically inferred from dataloader.batch_size
-
-    # Stage configuration
-    layers_per_stage=None,          # Layers per stage (None for auto)
-    module_fqns_per_model_part=None,  # Manual module assignment
-
-    # Model patching (HF-specific)
-    patch_inner_model=True,         # Make decoder forward stage-friendly
-    patch_causal_lm_model=True,     # Make CausalLM wrapper return tensors (hidden/logits)
-).build(model, loss_fn=loss_fn)
-```
-
-### Model Patching (`patch_inner_model`, `patch_causal_lm_model`)
-
-AutoPipeline splits a model by deep-copying it per stage and pruning away modules that don't belong to that stage. Many Hugging Face models assume the full module tree is present and return `ModelOutput` objects; after pruning, their original `forward()` often breaks (or returns objects that are awkward to pipeline).
-
-These two flags switch AutoPipeline to lightweight, pipeline-friendly `forward()` implementations that return tensors (see `nemo_automodel.components.distributed.pipelining.hf_utils.patch_hf_model_for_pp`):
-
-- **`patch_inner_model`**: patches the *decoder module* (`model.model` for `...ForCausalLM`, otherwise the module itself) so each stage can run even after pruning.
-  - **Stage 0** (has `embed_tokens`): takes token IDs and produces hidden states.
-  - **Middle stages** (no `embed_tokens`): take hidden states from the previous stage (using `inputs_embeds`, or a float tensor passed through `input_ids`) and produce hidden states.
-  - Handles sliced layer containers (e.g., `layers` becoming dict-like after stage pruning) and returns a **tensor** of hidden states so stages can be chained.
-
-  For compilation/performance, this patched forward prefers a precomputed `causal_mask_mapping` dict (it will fall back to computing masks and warn if you don't provide it).
-
-- **`patch_causal_lm_model`**: patches the *`...ForCausalLM` wrapper* forward (the module that owns `lm_head`) so pipeline stages return tensors:
-  - Returns **hidden states** when `lm_head` is absent on that stage.
-  - Returns **logits** when `lm_head` is present (typically only the last stage).
-  - Supports `logits_to_keep` to compute logits for only the last `k` tokens.
-  
-  Note: this is only used when the module you pipeline is a `...ForCausalLM`-style wrapper (i.e., it has a `.model` attribute). If you pass a base decoder module directly, `patch_causal_lm_model` typically has no effect.
-
-#### When Should I Change These?
-
-- **Leave both `True` (default)** for standard Hugging Face `AutoModelForCausalLM` / `...ForCausalLM` models. This is the common case and gives the expected behavior: token IDs -> hidden states -> logits across stages.
-- **Set both `False`** when your model already has a pipeline-friendly forward (returns tensors and can accept hidden states when embeddings are absent) or it needs custom kwargs/paths that the HF patch doesn't preserve (common for NeMo AutoModel-native model implementations, packed-sequence/`thd` paths, extra args like `padding_mask`, etc.). Many benchmark configs for NeMo-native models do this (for example `examples/llm_benchmark/qwen/qwen3_moe_30b_torch.yaml`).
-- **Set `patch_inner_model=False, patch_causal_lm_model=True`** when your inner model is already stage-friendly, but the wrapper forward still returns a `ModelOutput` and you only want the wrapper simplified to “hidden states or logits”.
-
-If you disable `patch_causal_lm_model`, your last stage will typically output hidden states instead of logits; in that case, make sure your `loss_fn` (or your last-stage module) applies the LM head explicitly.
-
-### Automatic vs. Manual Layer Distribution
-
-AutoPipeline offers flexible control over how your model is split across pipeline stages:
-
-#### Automatic Distribution
-Let AutoPipeline automatically balance layers across stages:
-
-```python
-ap = AutoPipeline(
-    world_mesh=world_mesh,
-    pp_schedule="1f1b",
-    layers_per_stage=8,  # Each stage gets ~8 transformer layers
-).build(model, loss_fn=loss_fn)
-```
-
-#### Manual Distribution
-Specify exactly which modules go to each stage:
-
-```python
-from nemo_automodel.components.distributed.pipelining.functional import (
-    generate_hf_model_fqn_per_model_part
-)
-
-# Generate balanced assignments
-module_fqns = generate_hf_model_fqn_per_model_part(
-    num_stages=4,
-    num_layers=32,
-    include_embeddings=True,
-    include_lm_head=True,
-    include_rotary_emb=True,
-    fqn_prefix="model."
-)
-
-# Or define custom assignments
-custom_module_fqns = [
-    # Stage 0: Embeddings + first 8 layers
-    ["model.embed_tokens", "model.rotary_emb"] +
-    [f"model.layers.{i}" for i in range(8)],
-
-    # Stage 1: Next 8 layers
-    ["model.rotary_emb"] + [f"model.layers.{i}" for i in range(8, 16)],
-
-    # Stage 2: Next 8 layers
-    ["model.rotary_emb"] + [f"model.layers.{i}" for i in range(16, 24)],
-
-    # Stage 3: Final 8 layers + output
-    ["model.rotary_emb"] + [f"model.layers.{i}" for i in range(24, 32)] +
-    ["model.norm", "lm_head"]
-]
-
-ap = AutoPipeline(
-    world_mesh=world_mesh,
-    module_fqns_per_model_part=custom_module_fqns,
-).build(model, loss_fn=loss_fn)
-```
-
-## Understand Model Splitting
-
-When AutoPipeline splits your model, it intelligently distributes components across pipeline stages. Here's how a typical model gets split:
-
-### Example: 32-Layer Model Across 2 Stages
-
-```python
-# Stage 0 (Rank 0): Input processing + first half
-stage_0_modules = [
-    "model.embed_tokens",     # Token embeddings
-    "model.layers.0-15",      # First 16 transformer layers
-    "model.rotary_emb"        # Position embeddings (shared)
-]
-
-# Stage 1 (Rank 1): Second half + output processing
-stage_1_modules = [
-    "model.layers.16-31",     # Last 16 transformer layers
-    "model.norm",             # Final layer norm
-    "lm_head",               # Language modeling head
-    "model.rotary_emb"        # Position embeddings (shared)
-]
-```
-
-### Example: 32-Layer Model Across 4 Stages
-
-```python
-# Stage 0 (Rank 0): Input processing
-stage_0_modules = [
-    "model.embed_tokens",     # Token embeddings
-    "model.layers.0-7",       # First 8 transformer layers
-    "model.rotary_emb"        # Position embeddings (shared)
-]
-
-# Stage 1 (Rank 1): Early layers
-stage_1_modules = [
-    "model.layers.8-15",      # Next 8 transformer layers
-    "model.rotary_emb"
-]
-
-# Stage 2 (Rank 2): Middle layers
-stage_2_modules = [
-    "model.layers.16-23",     # Next 8 transformer layers
-    "model.rotary_emb"
-]
-
-# Stage 3 (Rank 3): Output processing
-stage_3_modules = [
-    "model.layers.24-31",     # Final 8 transformer layers
-    "model.norm",             # Final layer norm
-    "lm_head",               # Language modeling head
-    "model.rotary_emb"
-]
-```
-
-Key observations:
-- **Embeddings** only exist on the first stage
-- **Language modeling head** only exists on the last stage
-- **Rotary embeddings** are shared across all stages (for position encoding)
-- **Transformer layers** are evenly distributed
-
-## Use the Functional API for Custom Models
-
-While AutoPipeline is specifically designed as a high-level interface for Hugging Face models, the functional API in `nemo_automodel.components.distributed.pipelining.functional` provides more modular and accessible building blocks that can be used with any PyTorch model, including custom architectures. This separation allows for cleaner code organization where AutoPipeline handles Hugging Face-specific optimizations while the functional module remains model-agnostic.
-
-### Key Functional API Components
-
-The functional API provides several utilities for building custom pipeline parallel systems:
-
-#### Stage ID Calculation
-```python
-from nemo_automodel.components.distributed.pipelining.functional import stage_ids_this_rank
-
-# Calculate which stages run on this rank
-# For a "loop" style schedule (default)
-stage_ids = stage_ids_this_rank(pp_rank=0, pp_size=4, num_stages=8, style="loop")
-# Returns: (0, 4) - rank 0 gets stages 0 and 4
-
-# For a "v" style schedule (for zero-bubble schedules)
-stage_ids = stage_ids_this_rank(pp_rank=0, pp_size=4, num_stages=8, style="v")
-# Returns: (0, 7) - rank 0 gets stages 0 and 7
-```
-
-#### Module Name Generation
-```python
-from nemo_automodel.components.distributed.pipelining.functional import (
-    generate_hf_model_fqn_per_model_part
-)
-
-# Generate balanced module assignments for any model
-module_names = generate_hf_model_fqn_per_model_part(
-    num_stages=4,
-    num_layers=32,
-    include_embeddings=True,
-    include_lm_head=True,
-    include_rotary_emb=False,  # Set based on your model
-    fqn_prefix=""  # Use "model." for nested models
-)
-```
-
-#### Virtual Stage Calculation
-```python
-from nemo_automodel.components.distributed.pipelining.functional import calculate_virtual_stages
-
-# Calculate virtual stages for interleaved schedules
-num_virtual_stages, stages_per_rank = calculate_virtual_stages(
-    num_layers=32,
-    layers_per_stage=4,  # Each virtual stage has 4 layers
-    pp_size=4,
-    is_single_stage_schedule=False,
-    round_to_pp_multiple="up"  # Round up to nearest multiple of pp_size
-)
-```
-
-#### Pipeline Schedule Build
-```python
-from nemo_automodel.components.distributed.pipelining.functional import build_pipeline_schedule
-
-# Build a schedule for your stages
-schedule = build_pipeline_schedule(
-    pipeline_parallel_schedule_csv=None,  # Optional CSV schedule
-    pipeline_parallel_schedule="1f1b",
-    microbatch_size=1,
-    local_batch_size=8,
-    stages=stages,  # List of PipelineStage objects
-    loss_fn=loss_fn,
-    scale_grads=False
-)
-```
-
-### Example: Pipeline Parallelism for Custom Models
-
-Here's how to use the functional API to implement pipeline parallelism for a custom model:
-
-```python
-import torch
-import torch.nn as nn
-from torch.distributed.device_mesh import init_device_mesh
-from torch.distributed.pipelining import PipelineStage
-from nemo_automodel.components.distributed.pipelining.functional import (
-    stage_ids_this_rank,
-    build_pipeline_schedule,
-    calculate_virtual_stages
-)
-
-class CustomTransformerBlock(nn.Module):
-    def __init__(self, hidden_size):
-        super().__init__()
-        self.attention = nn.MultiheadAttention(hidden_size, num_heads=8)
-        self.mlp = nn.Sequential(
-            nn.Linear(hidden_size, hidden_size * 4),
-            nn.GELU(),
-            nn.Linear(hidden_size * 4, hidden_size)
-        )
-        self.norm1 = nn.LayerNorm(hidden_size)
-        self.norm2 = nn.LayerNorm(hidden_size)
-
-    def forward(self, x):
-        # Simplified transformer block
-        attn_out, _ = self.attention(x, x, x)
-        x = self.norm1(x + attn_out)
-        x = self.norm2(x + self.mlp(x))
-        return x
-
-class CustomModel(nn.Module):
-    def __init__(self, vocab_size, hidden_size, num_layers):
-        super().__init__()
-        self.embedding = nn.Embedding(vocab_size, hidden_size)
-        self.layers = nn.ModuleList([
-            CustomTransformerBlock(hidden_size) for _ in range(num_layers)
-        ])
-        self.output_proj = nn.Linear(hidden_size, vocab_size)
-
-    def forward(self, input_ids):
-        x = self.embedding(input_ids)
-        for layer in self.layers:
-            x = layer(x)
-        return self.output_proj(x)
-
-def split_custom_model_for_pipeline(model, pp_rank, pp_size, num_stages):
-    """Split a custom model into pipeline stages."""
-
-    # Determine which stages this rank handles
-    stage_indices = stage_ids_this_rank(pp_rank, pp_size, num_stages, style="loop")
-
-    stages = []
-    for stage_idx in stage_indices:
-        # Create a stage-specific version of the model
-        # This is a simplified example - you'd need to implement proper splitting
-        stage_model = create_stage_model(model, stage_idx, num_stages)
-
-        # Create PipelineStage
-        stage = PipelineStage(
-            stage_model,
-            stage_idx,
-            num_stages,
-            device=torch.cuda.current_device(),
-            group=None  # Set your process group here
-        )
-        stages.append(stage)
-
-    return stages
-
-# Usage
-def main():
-    # Initialize device mesh
-    world_mesh = init_device_mesh("cuda", mesh_shape=(4,), mesh_dim_names=("pp",))
-    pp_rank = world_mesh["pp"].get_local_rank()
-    pp_size = world_mesh["pp"].size()
-
-    # Create model
-    model = CustomModel(vocab_size=50000, hidden_size=768, num_layers=24)
-
-    # Calculate virtual stages
-    num_virtual_stages, stages_per_rank = calculate_virtual_stages(
-        num_layers=24,
-        layers_per_stage=3,  # 8 virtual stages total
-        pp_size=4,
-        is_single_stage_schedule=False
-    )
-
-    # Split model into stages
-    stages = split_custom_model_for_pipeline(model, pp_rank, pp_size, num_virtual_stages)
-
-    # Define loss function
-    def loss_fn(logits, targets):
-        return nn.functional.cross_entropy(
-            logits.view(-1, logits.size(-1)),
-            targets.view(-1)
-        )
-
-    # Build pipeline schedule
-    schedule = build_pipeline_schedule(
-        pipeline_parallel_schedule_csv=None,
-        pipeline_parallel_schedule="interleaved_1f1b",  # Good for multi-stage
-        microbatch_size=1,
-        local_batch_size=8,
-        stages=stages,
-        loss_fn=loss_fn,
-        scale_grads=True
-    )
-
-    # Training loop
-    for batch in dataloader:
-        # Use schedule.step() for training
-        losses = []
-        schedule.step(batch["input_ids"], target=batch["labels"], losses=losses)
-
-        # losses will contain the loss values from the last stage
-        if losses:
-            print(f"Loss: {sum(losses) / len(losses)}")
-```
-
-### Advanced: Custom Model Splitting Logic
-
-For more complex custom models, you can implement your own splitting logic:
-
-```python
-from nemo_automodel.components.distributed.pipelining.functional import pipeline_model
-
-def custom_parallelize_fn(
-    model, world_mesh, moe_mesh, *,
-    pp_enabled, dp_axis_names, **kwargs
-):
-    """Custom parallelization function for each pipeline stage."""
-    # Apply your custom parallelization logic here
-    # This is called for each pipeline stage
-    if dp_axis_names:
-        # Apply data parallelism
-        pass
-    # Add any other parallelization strategies
-    pass
-
-# Use pipeline_model for complete pipeline setup
-schedule, model_parts, has_first, has_last, stages = pipeline_model(
-    model=your_custom_model,
-    world_mesh=world_mesh,
-    moe_mesh=None,
-    pp_axis_name="pp",
-    dp_axis_names=("dp",),
-    layers_per_stage=4,
-    pipeline_parallel_schedule="1f1b",
-    pipeline_parallel_schedule_csv=None,
-    microbatch_size=1,
-    local_batch_size=8,
-    device=torch.cuda.current_device(),
-    loss_fn=loss_fn,
-    parallelize_fn=custom_parallelize_fn,
-    module_fqns_per_model_part=None,  # Provide custom module names
-    patch_inner_model=False,  # Custom model: don't apply HF forward patches
-    patch_causal_lm_model=False,  # Custom model: don't apply HF forward patches
-)
-```
-
-### Tips for Using Functional API with Custom Models
-
-The functional API is designed to be more accessible and modular than AutoPipeline:
-
-1. **Module Naming**: Ensure your model has consistent module naming that can be mapped to stages
-2. **State Management**: Handle model state (embeddings, buffers) carefully across stages
-3. **Communication**: First and last stages need special handling for inputs/outputs
-4. **Flexibility**: The functional API gives you complete control over how models are split and parallelized
-5. **Testing**: Start with a small model and verify correct splitting before scaling up
-
-The functional module's modular design makes it easier to integrate pipeline parallelism into existing custom model training workflows without the Hugging Face-specific assumptions that AutoPipeline makes.
-
-## Mixed Parallelism
-
-AutoPipeline can be combined with other parallelization strategies for optimal performance:
-
-```python
-def parallelize_fn(
-    model, world_mesh, moe_mesh, *,
-    pp_enabled, dp_axis_names,
-    cp_axis_name=None, tp_axis_name=None, ep_axis_name=None
-):
-    """Apply additional parallelization to each pipeline stage."""
-    # Example: Apply FSDP to each stage
-    if dp_axis_names:
-        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-        # Wrap model with FSDP (simplified example)
-        # In practice, you'd configure FSDP parameters
-        pass
-
-    # Example: Apply tensor parallelism
-    if tp_axis_name:
-        # Apply tensor parallelism to attention/MLP layers
-        pass
-
-# Build pipeline with custom parallelization
-ap = AutoPipeline(world_mesh=world_mesh).build(
-    model,
-    loss_fn=loss_fn,
-    parallelize_fn=parallelize_fn
-)
-```
-
-## Monitor and Debug
-
-AutoPipeline provides comprehensive tools for understanding your pipeline configuration:
-
-### Pipeline Information
-```python
-# Get pipeline info
-info = ap.info
-print(f"Pipeline enabled: {info.enabled}")
-print(f"Has first stage: {info.has_first_stage}")
-print(f"Has last stage: {info.has_last_stage}")
-
-# Access model parts
-model_parts = ap.parts  # List of pipeline stages
-stage_modules = ap.list_stage_modules()  # Module names per stage
-```
-
-### Analysis
-```python
-# Parameter distribution
-stage_param_counts = ap.get_stage_param_counts()
-total_params = ap.get_total_param_count()
-trainable_params = ap.get_total_param_count(trainable_only=True)
-
-for i, params in enumerate(stage_param_counts):
-    percentage = (params / total_params) * 100
-    print(f"Stage {i}: {params:,} parameters ({percentage:.1f}%)")
-
-# Debug summary
-print(ap.debug_summary())
-print(ap.pretty_print_stages(max_modules_per_stage=10))
-
-# Visualize schedule
-ap.visualize_current_schedule("pipeline_schedule.png")
-```
-
-### Gradient Management
-```python
-# Scale gradients for mixed parallelism
-ap.scale_grads_by_divisor(divisor=8)
-
-# Clip gradients across pipeline stages
-grad_norm = ap.clip_grad_norm(max_norm=1.0, norm_type=2.0)
-```
-
-## Add Pipeline Parallelism to Existing Configurations
-
-You can easily add pipeline parallelism to any existing training configuration through command-line overrides or YAML modifications.
-
-### Command-Line Override Method
-
-Add pipeline parallelism to an existing config using command-line arguments:
-
-```bash
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --distributed.strategy fsdp2 \
-    --distributed.pp_size 2 \
-    --distributed.pipeline.pp_schedule 1f1b \
-    --distributed.pipeline.pp_microbatch_size 1 \
-    --distributed.pipeline.round_virtual_stages_to_pp_multiple up \
-    --distributed.pipeline.scale_grads_in_schedule false
-```
-
-Key parameters to override:
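-- `--distributed.pp_size`: Number of pipeline stages (must match `--nproc-per-node` when no other parallelism is used; when combined with DP/TP, the product of the parallel sizes must match the GPU count, as in the mixed parallelism examples below)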
-- `pp_batch_size` is automatically inferred from `--dataloader.batch_size`
-- `--distributed.pipeline.pp_schedule`: Pipeline schedule (1f1b, interleaved_1f1b, etc.)
-
-### YAML Configuration Method
-
-Add these sections to your existing YAML config:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: 1
-  tp_size: 1
-  cp_size: 1
-  pp_size: 4  # Enable 4-way pipeline parallelism
-  sequence_parallel: false
-  pipeline:
-    pp_schedule: 1f1b
-    pp_microbatch_size: 1
-    # pp_batch_size is automatically inferred from dataloader.batch_size
-    round_virtual_stages_to_pp_multiple: up
-    scale_grads_in_schedule: false
-    layers_per_stage: null  # Auto-compute, or specify number
-```
-
-### Mixed Parallelism Examples
-
-#### Pipeline + Data Parallelism (4 GPUs Total)
-```bash
-automodel --nproc-per-node=4 your_config.yaml \
-    --distributed.pp_size 2 \
-    --distributed.dp_size 2 \
-    --dataloader.batch_size 16
-```
-
-#### Pipeline + Tensor Parallelism (4 GPUs Total)
-```bash
-automodel --nproc-per-node=4 your_config.yaml \
-    --distributed.pp_size 2 \
-    --distributed.tp_size 2 \
-    --dataloader.batch_size 8
-```
-
-#### Full Hybrid: PP + DP + TP (8 GPUs Total)
-```bash
-automodel --nproc-per-node=8 your_config.yaml \
-    --distributed.pp_size 2 \
-    --distributed.dp_size 2 \
-    --distributed.tp_size 2 \
-    --dataloader.batch_size 32
-```
-
-## Integrate with Training Recipes
-
-AutoPipeline seamlessly integrates with NeMo AutoModel's recipe system. Here's a complete example YAML configuration:
-
-```yaml
-# config.yaml
-distributed:
-  strategy: fsdp2
-  dp_size: 1
-  tp_size: 1
-  cp_size: 1
-  pp_size: 2          # 2-way pipeline parallelism
-  sequence_parallel: false
-  pipeline:
-    pp_schedule: 1f1b
-    pp_microbatch_size: 1
-    # pp_batch_size is automatically inferred from dataloader.batch_size
-    layers_per_stage: null  # Auto-compute layer distribution
-    round_virtual_stages_to_pp_multiple: up
-    scale_grads_in_schedule: false
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.SQuAD
-  path_or_dataset: squad
-  split: train
-
-dataloader:
-  batch_size: 8
-  shuffle: true
-```
-
-Run training with:
-```bash
-# Run with 2 GPUs for 2-way pipeline parallelism
-automodel --nproc-per-node=2 config.yaml
-```
-
-## Troubleshooting
-
-### Common Issues
-
-**Model doesn't fit in memory:**
-- Increase number of pipeline stages
-- Reduce microbatch size
-- Enable gradient checkpointing
-
-**Pipeline bubbles reducing efficiency:**
-- Increase batch size to have more microbatches
-- Try different schedules (e.g., `interleaved_1f1b`)
-- Adjust virtual stages configuration
-
-**Uneven stage distribution:**
-- Use manual module assignment for fine control
-- Adjust `layers_per_stage` parameter
-- Check parameter counts with `get_stage_param_counts()`
-
-## Conclusion
-
-AutoPipeline and the functional API together provide a complete pipeline parallelism solution for both Hugging Face and custom models. AutoPipeline offers a high-level, optimized interface specifically for Hugging Face models, while the functional module provides modular, accessible building blocks for custom architectures.
-
-Key takeaways:
-- Pipeline parallelism enables training of models too large for a single GPU
-- AutoPipeline provides a simple API for Hugging Face models with powerful customization options
-- The functional API offers modular components for implementing pipeline parallelism with any PyTorch model
-- Both can be combined with other parallelization strategies for optimal performance
-- Use built-in monitoring tools to understand and optimize your pipeline
diff --git a/fern/versions/nightly/pages/guides/pipelining.mdx b/docs/guides/pipelining.mdx
similarity index 100%
rename from fern/versions/nightly/pages/guides/pipelining.mdx
rename to docs/guides/pipelining.mdx
diff --git a/docs/guides/quantization-aware-training.md b/docs/guides/quantization-aware-training.md
deleted file mode 100644
index 04b91d0db6..0000000000
--- a/docs/guides/quantization-aware-training.md
+++ /dev/null
@@ -1,311 +0,0 @@
-# Quantization-Aware Training (QAT)
-
-NeMo Automodel supports Quantization-Aware Training (QAT) for Supervised Fine-Tuning (SFT) using [TorchAO](https://github.com/pytorch/ao). QAT simulates quantization effects during the training process, allowing models to adapt to lower precision representations while learning. This approach produces quantized models that maintain significantly higher accuracy compared to applying quantization after training is complete.
-
-## What is Quantization-Aware Training?
-
-Quantization-Aware Training simulates the effects of quantization during the training process. By introducing fake quantization operations in the forward pass, the model learns to adapt to lower precision representations, maintaining better accuracy when deployed with actual quantization.
-
-### Benefits of QAT
-
-- **Better accuracy**: Models trained with QAT maintain higher accuracy when quantized compared to post-training quantization
-- **Efficient deployment**: Quantized models require less memory and compute resources
-- **Edge device support**: Enables deployment on resource-constrained devices
-- **Production optimization**: Reduces inference costs while maintaining model quality
-
-### QAT vs. Post-Training Quantization
-
-| Aspect | QAT | Post-Training Quantization |
-|--------|-----|---------------------------|
-| **Accuracy** | Higher - model adapts during training | Lower - no adaptation |
-| **Training time** | Longer - requires retraining | None - applied after training |
-| **Use case** | Production deployments requiring best accuracy | Quick prototyping or less critical applications |
-| **Flexibility** | Can fine-tune quantization parameters | Limited to fixed quantization schemes |
-
-## Requirements
-
-To use QAT in NeMo Automodel, you need:
-
-- **Software**: TorchAO library must be installed
-- **Hardware**: Compatible NVIDIA GPU (recommended: A100 or newer)
-- **Model**: Any supported model architecture for SFT
-
-## Install TorchAO
-
-Make sure you have TorchAO installed. Follow the [installation guide](https://github.com/pytorch/ao?tab=readme-ov-file#-installation) for TorchAO.
-
-```bash
-pip install torchao
-```
-
-## How QAT Works in NeMo Automodel
-
-NeMo Automodel integrates TorchAO's QAT quantizers into the training pipeline. During training:
-
-1. **Model preparation**: The quantizer prepares the model by inserting fake quantization operations
-2. **Forward pass**: Weights and activations are quantized using fake quantization
-3. **Backward pass**: Gradients flow through the fake quantization operations
-4. **Weight updates**: Model learns to minimize loss while accounting for quantization effects
-
-### Supported Quantization Schemes
-
-NeMo Automodel supports two TorchAO QAT quantizers:
-
-#### Int8 Dynamic Activation + Int4 Weight (8da4w-qat)
-- **Quantizer**: `Int8DynActInt4WeightQATQuantizer`
-- **Activations**: INT8 with dynamic quantization
-- **Weights**: INT4 quantization
-- **Use case**: Balanced accuracy and efficiency
-- **Memory savings**: ~4x compared to FP16/BF16
-
-#### Int4 Weight-Only (4w-qat)
-- **Quantizer**: `Int4WeightOnlyQATQuantizer`
-- **Activations**: Full precision
-- **Weights**: INT4 quantization
-- **Use case**: Maximum memory savings with minimal accuracy loss
-- **Memory savings**: ~4x for weights only
-
-## Configuration
-
-To enable QAT in your training configuration, you need to specify the quantizer in your YAML configuration file.
-
-### Basic Configuration
-
-```yaml
-# Enable QAT with Int8 Dynamic Activation + Int4 Weight quantization
-qat:
-  enabled: true
-  quantizer:
-    _target_: torchao.quantization.qat.Int8DynActInt4WeightQATQuantizer
-    groupsize: 256
-```
-
-### Int4 Weight-Only Configuration
-
-```yaml
-# Enable QAT with Int4 Weight-Only quantization
-qat:
-  enabled: true
-  quantizer:
-    _target_: torchao.quantization.qat.Int4WeightOnlyQATQuantizer
-    groupsize: 256
-```
-
-### Configuration Parameters
-
-| Parameter | Type | Description |
-|-----------|------|-------------|
-| `enabled` | bool | Enable or disable QAT |
-| `quantizer._target_` | str | Fully qualified class name of the TorchAO quantizer |
-| `quantizer.groupsize` | int | Group size for weight quantization (typically 128 or 256) |
-
-### Delayed Fake Quantization
-
-You can optionally delay the activation of fake quantization to allow the model to train normally for a few steps before introducing quantization effects:
-
-```yaml
-qat:
-  enabled: true
-  quantizer:
-    _target_: torchao.quantization.qat.Int8DynActInt4WeightQATQuantizer
-    groupsize: 256
-  delay_fake_quant_steps: 1000  # Enable fake quant after 1000 steps
-```
-
-## Training Workflow
-
-### 1. Prepare Your Configuration
-
-Create a YAML configuration file with QAT enabled:
-
-```yaml
-model:
-  model_name: meta-llama/Llama-3.2-1B
-
-task:
-  type: sft
-  
-qat:
-  enabled: true
-  quantizer:
-    _target_: torchao.quantization.qat.Int8DynActInt4WeightQATQuantizer
-    groupsize: 256
-
-trainer:
-  max_steps: 10000
-  val_check_interval: 500
-```
-
-### 2. Run Training
-
-Launch training with your QAT-enabled configuration:
-
-```bash
-automodel --nproc-per-node=8 your_qat_config.yaml
-```
-
-### 3. Monitor Training
-
-During training, the model will:
-- Apply fake quantization to weights and activations
-- Learn to minimize loss while accounting for quantization effects
-- Produce checkpoints that can be converted to actual quantized models
-
-### 4. Deploy Quantized Model
-
-After training, convert the QAT checkpoint to a fully quantized model for deployment:
-
-```python
-from torchao.quantization import quantize_, int8_dynamic_activation_int4_weight
-
-# Load your trained model
-model = load_model_from_checkpoint(checkpoint_path)
-
-# Apply actual quantization (not fake quantization)
-quantize_(model, int8_dynamic_activation_int4_weight())
-
-# Deploy the quantized model
-model.eval()
-```
-
-## Performance Considerations
-
-### Training Performance
-
-- **Training time**: QAT adds overhead during training due to fake quantization operations
-- **Memory usage**: Similar to full-precision training during the training phase
-- **Convergence**: May require slightly more training steps to converge compared to full-precision training
-
-### Inference Performance
-
-After converting to actual quantization:
-
-- **Speed**: 2-4x faster inference depending on hardware and model size
-- **Memory**: ~4x reduction in model size
-- **Accuracy**: Minimal degradation compared to full-precision models (typically <1% difference)
-
-### When to Use QAT
-
-QAT is most beneficial when:
-
-- **Deploying to production**: Where inference efficiency is critical
-- **Edge devices**: Resource-constrained environments
-- **Large-scale serving**: Reducing infrastructure costs
-- **Accuracy is important**: When post-training quantization causes unacceptable accuracy loss
-
-### When Not to Use QAT
-
-Consider alternatives when:
-
-- **Quick prototyping**: Post-training quantization is faster
-- **Small models**: Quantization overhead may not be worth it
-- **Limited training resources**: QAT requires retraining the model
-- **Accuracy is not critical**: Post-training quantization may be sufficient
-
-## Best Practices
-
-### 1. Start with Post-Training Quantization
-
-Before investing in QAT, try post-training quantization to establish a baseline:
-
-```python
-# Quick post-training quantization test
-from torchao.quantization import quantize_, int8_dynamic_activation_int4_weight
-quantize_(model, int8_dynamic_activation_int4_weight())
-```
-
-If accuracy is acceptable, you may not need QAT.
-
-### 2. Choose the Right Quantization Scheme
-
-- **8da4w-qat**: Best balance of accuracy and efficiency for most use cases
-- **4w-qat**: Use when memory is the primary constraint and activations can remain full precision
-
-### 3. Tune Group Size
-
-The `groupsize` parameter affects the granularity of quantization:
-
-- **Smaller groups (128)**: Better accuracy, slightly more memory
-- **Larger groups (256)**: More efficient, may have minor accuracy impact
-
-Start with 256 and reduce to 128 if accuracy is insufficient.
-
-### 4. Monitor Validation Metrics
-
-Track validation metrics closely during QAT training:
-
-- Compare against full-precision baseline
-- Watch for convergence issues
-- Adjust learning rate if needed (QAT may benefit from slightly lower learning rates)
-
-### 5. Use Delayed Fake Quantization
-
-For better convergence, consider delaying fake quantization:
-
-```yaml
-qat:
-  delay_fake_quant_steps: 1000  # Let model train normally first
-```
-
-This allows the model to learn basic patterns before introducing quantization constraints.
-
-## Accuracy vs. Efficiency Trade-offs
-
-### Expected Accuracy Impact
-
-| Quantization Method | Typical Accuracy Loss | Memory Savings |
-|---------------------|----------------------|----------------|
-| Full Precision (BF16) | Baseline | Baseline |
-| Post-Training Quantization | 1-3% | 4x |
-| QAT (8da4w) | <1% | 4x |
-| QAT (4w) | <1.5% | 4x (weights only) |
-
-### Optimization Strategies
-
-If accuracy is below expectations:
-
-1. **Increase training steps**: QAT may need more training to converge
-2. **Reduce learning rate**: Lower learning rates can help with quantization constraints
-3. **Use 8da4w instead of 4w**: Better accuracy with minimal additional cost
-4. **Reduce group size**: Smaller groups provide finer-grained quantization
-5. **Delay fake quantization**: Give the model time to learn before quantizing
-
-## Limitations and Known Issues
-
-### Current Limitations
-
-- **SFT only**: QAT is currently supported for Supervised Fine-Tuning tasks only
-- **Model compatibility**: Not all model architectures may be compatible with TorchAO quantizers
-- **Training overhead**: QAT adds computational overhead during training
-
-### Troubleshooting
-
-#### Issue: Training diverges or doesn't converge
-
-**Solution**: Try these approaches:
-- Reduce learning rate by 2-5x
-- Increase `delay_fake_quant_steps` to 2000-5000
-- Use a smaller group size (128 instead of 256)
-- Verify your baseline model trains successfully without QAT
-
-#### Issue: Accuracy is significantly worse than expected
-
-**Solution**:
-- Ensure you're comparing against the same baseline (same training steps, data, etc.)
-- Try 8da4w quantization instead of 4w
-- Reduce group size to 128
-- Increase training steps by 20-30%
-
-#### Issue: Out of memory during training
-
-**Solution**:
-- QAT should have similar memory usage to full-precision training
-- Reduce batch size if needed
-- Use gradient accumulation to maintain effective batch size
-
-## References
-
-- [TorchAO Documentation](https://github.com/pytorch/ao)
-- [TorchAO QAT Guide](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat)
-- [Quantization Fundamentals](https://pytorch.org/docs/stable/quantization.html)
-- [Quantizing Deep Convolutional Networks for Efficient Inference: A Whitepaper](https://arxiv.org/abs/1806.08342)
diff --git a/fern/versions/v0.4/pages/guides/quantization-aware-training.mdx b/docs/guides/quantization-aware-training.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/quantization-aware-training.mdx
rename to docs/guides/quantization-aware-training.mdx
diff --git a/docs/guides/vlm/dataset.md b/docs/guides/vlm/dataset.md
deleted file mode 100644
index 557709cab9..0000000000
--- a/docs/guides/vlm/dataset.md
+++ /dev/null
@@ -1,161 +0,0 @@
-# Integrate Your Own Multi-Modal Dataset
-
-This guide shows you how to integrate your own dataset into NeMo Automodel for training.
-You'll learn about **multi-modal datasets** that combine text with images or other modalities. We'll cover how to create custom datasets by implementing the required methods and preprocessing functions, and finally show you how to specify your own data logic using YAML configuration with file paths—allowing you to define custom dataset processing without modifying the main codebase.
-
-## Quick Start Summary
-| **Type**        |  **Use Case**    | **Example** | **Preprocessor**               | **Section**              |
-| --------------- | ------------------ | -------------- | --------------------------------- | --------------------------- |
-| 🖼️ Multi-modal  | Vision + Language  | MedPix-VQA     | `apply_chat_template`, collate fn | [Jump](#multi-modal-datasets) |
-| 🎤 Audio        | Speech + Language  | Common Voice 17| `apply_chat_template`, collate fn | [Jump](#audio-datasets) |
-
-
-## Multi-modal Datasets
-
-Multi-modal datasets combine text with other input types (e.g., images, audio, or video) and are essential for training Vision-Language Models (VLMs). These datasets introduce specific challenges such as aligning modalities, batching diverse data types, and formatting prompts for multi-turn, multi-modal dialogue.
-
-NeMo Automodel supports multi-modal dataset integration through flexible preprocessing, custom formatting, and YAML-based configuration.
-
-### Typical Types in Multi-modal Datasets
-A multi-modal dataset typically contains:
-- **Images, videos, audio** or other non-text modalities.
-- **Textual inputs** such as questions, instructions, or captions.
-- **Answers** or expected outputs from the model.
-
-These are formatted into structured conversations or instruction-response pairs for use with VLMs like BLIP, Llava, or Flamingo.
-
-#### Example: MedPix-VQA Dataset
-
-The [MedPix-VQA](https://huggingface.co/datasets/mmoukouba/MedPix-VQA) dataset is a comprehensive medical Visual Question Answering dataset designed for training and evaluating VQA models in the medical domain. It contains radiological images (from MedPix, a well-known medical image dataset) and associated QA pairs used for medical image interpretation.
-
-**Structure**:
-- 20,500 total examples
-- Columns: `image_id`, `mode`, `case_id`, `question`, `answer`
-
-```json
-{
-  "image_id": "medpix_0143.jpg",
-  "mode": "CT",
-  "case_id": "case_101",
-  "question": "What abnormality is visible in the left hemisphere?",
-  "answer": "Subdural hematoma"
-}
-```
-
-The example dataset preprocessing performs the following steps:
-
-1. Loads the dataset using Hugging Face's `datasets` library.
-2. Extracts the `question` and `answer`.
-3. Transforms the data into a chat-like format that is compatible with the `apply_chat_template` function of Hugging Face's `AutoProcessor`. For example:
-
-```python
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image", "image": example["image_id"]},
-            {"type": "text", "text": example["question"]},
-        ],
-    },
-    {
-        "role": "assistant",
-        "content": [{"type": "text", "text": example["answer"]}]
-    },
-]
-```
-
-For more detailed examples of how to process multi-modal datasets for VLMs, see the examples in [`datasets.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/vlm/datasets.py).
-
-## Audio Datasets
-
-Audio datasets combine speech input with text transcriptions and are essential for training models capable of speech recognition and transcription tasks. NeMo Automodel supports audio dataset integration through specialized preprocessing functions and custom collate functions for multimodal models like Phi-4.
-
-### Example: Common Voice 17 Dataset
-
-The [Common Voice 17](https://huggingface.co/datasets/ysdede/commonvoice_17_tr_fixed) dataset is a comprehensive speech recognition dataset containing audio clips and corresponding transcriptions. This particular version focuses on Turkish speech data and has been preprocessed and fixed for compatibility with modern training frameworks.
-
-**Structure**:
-- **Audio**: Speech recordings in various formats
-- **Transcription**: Text transcriptions of the spoken content
-- **Use case**: Speech-to-text transcription for multimodal models
-
-```json
-{
-  "audio": {
-    "path": "common_voice_tr_17528071.mp3",
-    "array": [-0.1600779, -0.13843077],
-    "sampling_rate": 16000
-  },
-  "transcription": "Kosova başkentinswki yolcu sayısı arttı."
-}
-```
-
-The example dataset preprocessing performs the following steps:
-
-1. Loads the dataset using Hugging Face's `datasets` library.
-2. Extracts the `audio` and `transcription` fields.
-
-For more detailed examples of how to process multi-modal datasets, see the examples in [`datasets.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/vlm/datasets.py).
-
-
-### Collate Functions
-
-NeMo Automodel provides specialized collate functions for different VLM processors. The collate function is responsible for batching examples and preparing them for model input.
-
-Multi-modal models require custom collate functions to batch and process each sample correctly. If your model uses a Hugging Face `AutoProcessor`, you can use it directly. Otherwise, you can define your own collate logic and point to it in your YAML config. We provide [example custom collate functions](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/vlm/collate_fns.py) that you can use as references for your implementation. After you implement your own collate function, you can specify it in your YAML config.
-
-
-## YAML-based Custom Dataset Configuration
-
-NeMo Automodel supports YAML-based dataset specification using the `_target_` key. This lets you reference dataset-building classes or functions using either:
-
-- 1. Python Dotted Path
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
-  path_or_dataset: rowan/hellaswag
-  split: train
-```
-
-- 2. File Path + Function Name
-
-```
-<file-path>:<function-name>
-```
-
-Where:
-- `<file-path>`: The absolute path to a Python file containing your dataset function
-- `<function-name>`: The name of the function to call from that file
-
-```yaml
-dataset:
-  _target_: /path/to/your/custom_dataset.py:build_my_dataset
-  num_blocks: 111
-```
-This will call `build_my_dataset()` from the specified file with the other keys (e.g., num_blocks) as arguments. This approach allows you to integrate custom datasets via config alone—no need to alter the codebase or package structure.
-
-
-## Custom Chat Template
-
-By default, VLM fine-tuning uses the chat template built into the model's `AutoProcessor`. To use a custom template, add `chat_template` under `dataset:` in your YAML config:
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
-  split: train
-  chat_template: /path/to/template.jinja
-```
-
-Accepted values:
-- A Jinja template string (e.g., `"{% for msg in messages %}..."`)
-- A path to a `.jinja` template file
-- A path to a JSON file containing a `chat_template` key (e.g., `tokenizer_config.json`)
-
-The override is applied to both `processor.chat_template` and `processor.tokenizer.chat_template` before dataset loading.
-
-## Troubleshooting Tips
-
-- **Tokenization Mismatch?** Ensure your tokenizer aligns with the model's expected inputs.
-- **Dataset too large?** Use `limit_dataset_samples` in your YAML config to load a subset, useful for quick debugging.
-- **Loss not decreasing?** Verify that your loss mask correctly ignores prompt tokens.
diff --git a/fern/versions/v0.4/pages/guides/vlm/dataset.mdx b/docs/guides/vlm/dataset.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/vlm/dataset.mdx
rename to docs/guides/vlm/dataset.mdx
diff --git a/docs/guides/vlm/gemma4.md b/docs/guides/vlm/gemma4.md
deleted file mode 100644
index 229b13247a..0000000000
--- a/docs/guides/vlm/gemma4.md
+++ /dev/null
@@ -1,514 +0,0 @@
-# Fine-Tuning Gemma 4 31B on CORD-v2 Receipts — End-to-End Guide
-
-**A step-by-step guide for fine-tuning Gemma 4 31B to extract structured receipt data
-from scanned images using [NeMo Automodel](https://github.com/NVIDIA-NeMo/Automodel).**
-
----
-
-## What is Gemma 4 31B?
-
-Gemma 4 31B is a dense vision-language model with a 60-layer transformer decoder,
-SigLIP vision encoder, and support for multimodal inputs (images, audio, text).
-
-Key architectural details:
-- Mixed attention: sliding window (512 tokens) + full attention (every 6th layer)
-- 32 attention heads, 16 KV heads (GQA)
-- Hidden dim 5376, vocab size 262,144
-- bfloat16, final logit softcapping at 30.0
-- Thinking-channel support (`<|channel>thought\n<channel|>` prefix)
-
-## The Task
-
-We fine-tune Gemma 4 31B on the **CORD-v2** (Consolidated Receipt Dataset) to extract
-structured fields from scanned receipts:
-
-| Field | Example |
-|-------|---------|
-| `menu` | Item names, quantities, unit prices, sub-totals |
-| `sub_total` | Subtotal details (subtotal price, discount, tax, etc.) |
-| `total` | Total price, cash price, change price, etc. |
-| `void_menu` | Voided items (if any) |
-
-The **base model** produces free-form descriptions. After fine-tuning, it outputs
-**structured XML-like token sequences** matching the receipt fields.
-
-## Guide Overview
-
-| Step | Description |
-|------|-------------|
-| **Step 0** | Environment setup |
-| **Step 1** | Explore the CORD-v2 dataset |
-| **Step 2** | Evaluate the base model (before fine-tuning) |
-| **Step 3** | Training configuration |
-| **Step 4** | Launch fine-tuning |
-| **Step 5** | Evaluate the fine-tuned model |
-| **Step 6** | Compare results |
-
-## Hardware Requirements
-
-- **8x A100 80 GB** (or 8x H100) GPUs required for 31B with FSDP2 + activation checkpointing
-- **Estimated training time**: ~45 min on 8x H100 (800 training samples, 500 steps)
-
----
-
-## Step 0 — Environment Setup
-
-This guide runs **inside** the NeMo Automodel Docker container:
-
-```bash
-docker run -it --rm --gpus all --ipc=host --network host \
-    -v $(pwd):/workspace \
-    nvcr.io/nvidia/nemo-automodel:26.02
-
-# Inside the container:
-huggingface-cli login          # needed for gated model access
-cd /opt/Automodel
-```
-
-> **Note**: Gemma 4 requires a `transformers` version that includes the model implementation. Make sure a compatible version of `transformers` is installed.
-
----
-
-## Step 1 — Explore the CORD-v2 Dataset
-
-[CORD-v2](https://huggingface.co/datasets/naver-clova-ix/cord-v2) is a Consolidated
-Receipt Dataset for Post-OCR Parsing containing scanned receipts with structured
-ground-truth JSON labels.
-
-```python
-import json
-from datasets import load_dataset
-
-dataset = load_dataset("naver-clova-ix/cord-v2")
-
-print(f"Train      : {len(dataset['train'])} samples")
-print(f"Validation : {len(dataset['validation'])} samples")
-print(f"Test       : {len(dataset['test'])} samples")
-
-# Inspect a sample
-ex = dataset["train"][0]
-gt = json.loads(ex["ground_truth"])["gt_parse"]
-print(f"\nGround-truth keys: {list(gt.keys())}")
-
-for key in gt:
-    if isinstance(gt[key], list):
-        print(f"\n  {key} ({len(gt[key])} items):")
-        for item in gt[key][:2]:
-            print(f"    {item}")
-    else:
-        print(f"\n  {key}: {gt[key]}")
-```
-
-Expected output:
-```
-Train      : 800 samples
-Validation : 100 samples
-Test       : 100 samples
-
-Ground-truth keys: ['menu', 'sub_total', 'total', 'void_menu']
-
-  menu (7 items):
-    {'nm': 'ABRA KADABRA FLAME GRILLED', 'num': '1', 'unitprice': '39,000', 'cnt': '1', 'price': '39,000'}
-    {'nm': 'Lemon Tea', 'num': '1', 'unitprice': '7,000', 'cnt': '1', 'price': '7,000'}
-
-  sub_total: {'subtotal_price': '87,000', 'discount_price': '0', 'tax_price': '7,909'}
-
-  total: {'total_price': '87,000', 'cashprice': '100,000', 'changeprice': '13,000'}
-
-  void_menu: []
-```
-
-### Target format: JSON-to-token conversion
-
-NeMo Automodel converts structured JSON into an XML-like **token sequence** using
-the `json2token()` function. This is the format the model is trained to produce:
-
-```python
-from nemo_automodel.components.datasets.vlm.utils import json2token
-
-token_seq = json2token(gt, sort_json_key=True)
-print(f"Token sequence (first 300 chars):\n  {token_seq[:300]}...")
-print(f"\nTotal length: {len(token_seq)} chars")
-```
-
-Expected output:
-```
-Token sequence (first 300 chars):
-  <s_menu><s_cnt>1</s_cnt><s_nm>ABRA KADABRA FLAME GRILLED</s_nm><s_num>1</s_num>
-  <s_price>39,000</s_price><s_unitprice>39,000</s_unitprice><sep/><s_cnt>1</s_cnt>
-  <s_nm>Lemon Tea</s_nm><s_num>1</s_num><s_price>7,000</s_price><s_unitprice>7,000
-  </s_unitprice><sep/>...
-
-Total length: 827 chars
-```
-
----
-
-## Step 2 — Evaluate the Base Model (Before Fine-Tuning)
-
-Load the pretrained Gemma 4 31B model and run it on receipt images. The base model
-will produce free-form descriptions instead of structured token sequences.
-
-```python
-import os
-import json
-import torch
-from transformers import AutoProcessor
-from nemo_automodel import NeMoAutoModelForImageTextToText
-from nemo_automodel.components.datasets.vlm.utils import json2token
-from datasets import load_dataset
-
-# --- Helpers ---
-
-def compute_ned(pred: str, target: str) -> float:
-    """Normalized Edit Distance (0 = perfect match, 1 = completely different)."""
-    m, n = len(pred), len(target)
-    if max(m, n) == 0:
-        return 0.0
-    dp = list(range(n + 1))
-    for i in range(1, m + 1):
-        prev, dp[0] = dp[0], i
-        for j in range(1, n + 1):
-            tmp = dp[j]
-            dp[j] = prev if pred[i - 1] == target[j - 1] else 1 + min(dp[j], dp[j - 1], prev)
-            prev = tmp
-    return dp[n] / max(m, n)
-
-
-def run_gemma4_inference(model, processor, pil_image, prompt="Describe this image.",
-                         max_new_tokens=1024):
-    """Run Gemma 4 inference on a single image."""
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": pil_image},
-                {"type": "text", "text": prompt},
-            ],
-        },
-    ]
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_tensors="pt",
-        return_dict=True,
-    ).to(model.device)
-
-    with torch.inference_mode():
-        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
-
-    generated_text = processor.decode(outputs[0], skip_special_tokens=True)
-    prompt_length = len(processor.decode(inputs["input_ids"][0], skip_special_tokens=True))
-    return generated_text[prompt_length:].strip()
-
-
-def evaluate_receipts(model, processor, test_dataset, n_samples=20):
-    """Evaluate model on receipt test set, return avg NED and per-sample results."""
-    model.eval()
-    results = []
-    n = min(n_samples, len(test_dataset))
-    for i in range(n):
-        ex = test_dataset[i]
-        gt = json.loads(ex["ground_truth"])["gt_parse"]
-        target = json2token(gt, sort_json_key=True)
-        pred = run_gemma4_inference(model, processor, ex["image"])
-        ned = compute_ned(pred, target)
-        results.append({"idx": i, "ned": ned, "pred": pred, "target": target, "gt": gt})
-        print(f"  Sample {i:2d}: NED = {ned:.4f}")
-    avg_ned = sum(r["ned"] for r in results) / len(results)
-    print(f"\n  Average NED: {avg_ned:.4f}")
-    return avg_ned, results
-
-# --- Load base model ---
-
-MODEL_PATH = "google/gemma-4-31B-it"
-
-processor = AutoProcessor.from_pretrained(MODEL_PATH)
-base_model = NeMoAutoModelForImageTextToText.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    use_liger_kernel=True,
-    attn_implementation="flash_attention_2",
-    text_config={"use_cache": False},
-).eval().to("cuda")
-
-print(f"Parameters: {sum(p.numel() for p in base_model.parameters()):,}")
-
-# --- Evaluate ---
-
-dataset = load_dataset("naver-clova-ix/cord-v2")
-print("\nEvaluating base model on receipt test set:")
-base_avg_ned, base_results = evaluate_receipts(base_model, processor, dataset["test"])
-```
-
-Expected base model output (receipt image):
-```
-  Sample  0: NED = 0.8734
-  Sample  1: NED = 0.9012
-  ...
-  Average NED: 0.8850
-```
-
-**Example base model prediction** (free-form, not structured):
-```
-The image shows a receipt from a restaurant. The total amount is 87,000 with items
-including ABRA KADABRA FLAME GRILLED for 39,000 and Lemon Tea for 7,000...
-```
-
-> The base model produces readable descriptions but not the structured token format
-> we need. Fine-tuning teaches it to output `<s_menu><s_nm>...</s_nm>...` sequences.
-
----
-
-## Step 3 — Training Configuration
-### YAML config
-
-Save the YAML below as `gemma4_31b_cord_v2.yaml` to train on the CORD-v2 dataset.
-
-```yaml
-
-step_scheduler:
-  global_batch_size: 8
-  local_batch_size: 1
-  ckpt_every_steps: 100
-  val_every_steps: 100
-  max_steps: 500
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 60
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
-  pretrained_model_name_or_path: google/gemma-4-31B-it
-  torch_dtype: torch.bfloat16
-  use_liger_kernel: true
-  use_sdpa_patching: false
-  attn_implementation: flash_attention_2
-  text_config:
-    use_cache: false
-
-checkpoint:
-  enabled: true
-  checkpoint_dir: vlm_checkpoints/gemma4_31b_cord_v2/
-  model_save_format: safetensors
-  save_consolidated: true
-
-distributed:
-  strategy: fsdp2
-  activation_checkpointing: true
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-dataset:
-  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
-  path_or_dataset: naver-clova-ix/cord-v2
-  split: train
-
-dataloader:
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.vlm.collate_fns.gemma4_prefix_collate_fn
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
-  path_or_dataset: naver-clova-ix/cord-v2
-  split: validation
-
-optimizer:
-  _target_: torch.optim.AdamW
-  lr: 1e-5
-  weight_decay: 0.01
-  betas: [0.9, 0.95]
-
-lr_scheduler:
-  lr_decay_style: cosine
-
-freeze_config:
-  freeze_embeddings: true
-  freeze_vision_tower: true
-  freeze_audio_tower: true
-  freeze_language_model: false
-```
-
-### Why `gemma4_prefix_collate_fn`?
-
-Gemma 4 31B instruction-tuned models always emit a thinking-channel prefix
-(`<|channel>thought\n<channel|>`) before the actual response. When this prefix
-is absent from training sequences, the model predicts `<|channel>` but the label
-says answer text, inflating initial loss to ~9. The `gemma4_prefix_collate_fn`
-injects this prefix (masked as -100 in labels so the model is not penalized for it)
-and brings initial loss down to ~3.
-
----
-
-## Step 4 — Launch Fine-Tuning
-
-```bash
-torchrun --nproc-per-node=8 \
-    examples/vlm_finetune/finetune.py \
-    -c gemma4_31b_cord_v2.yaml \
-    2>&1 | tee logs/train_gemma4_31b_cord_v2.log
-```
-
-### What to watch
-
-- **Loss** drops rapidly from ~0.73 to ~0.04 in the first 50 steps, then stabilizes around 0.005
-- **Validation loss** reaches ~0.018 by step 199 (best checkpoint)
-- Training takes ~15 min on 8x H100 (300 steps, 800 training samples)
-
-### Training log
-
-```
-step    0 | loss 0.7350 | grad_norm  35.65 | lr 1.18e-06 | mem 60.90 GiB | tps/gpu  45
-step   10 | loss 0.5489 | grad_norm  26.19 | lr 2.98e-06 | mem 40.36 GiB | tps/gpu 425
-step   20 | loss 0.1455 | grad_norm  10.53 | lr 4.78e-06 | mem 40.42 GiB | tps/gpu 438
-step   50 | loss 0.0406 | grad_norm  27.16 | lr 1.00e-05 | mem 40.34 GiB | tps/gpu 377
-step  100 | loss 0.0148 | grad_norm   7.02 | lr 9.70e-06 | mem 40.36 GiB | tps/gpu 449
-step  200 | loss 0.0065 | grad_norm   2.28 | lr 7.52e-06 | mem 40.44 GiB | tps/gpu 441
-step  300 | loss 0.0041 | grad_norm   2.10 | lr 3.16e-06 | mem 40.53 GiB | tps/gpu 448
-
-Validation:
-  step  99 | val_loss 0.0225
-  step 199 | val_loss 0.0183  <-- LOWEST_VAL (best checkpoint)
-  step 299 | val_loss 0.0192
-```
-
-### Checkpoints saved
-
-```
-vlm_checkpoints/gemma4_31b_cord_v2/
-  epoch_0_step_99/
-  epoch_0_step_199/
-  epoch_0_step_299/
-    model/
-      consolidated/          <-- HF-compatible checkpoint for inference
-        config.json
-        model.safetensors.index.json
-        model-00001-of-00013.safetensors
-        ...
-    optim/
-    rng/
-    dataloader/
-  LATEST -> epoch_0_step_299
-  LOWEST_VAL -> epoch_0_step_199
-  training.jsonl             <-- per-step metrics
-  validation.jsonl           <-- per-validation metrics
-```
-
-> **Tip**: `LOWEST_VAL` symlink points to the checkpoint with the best validation loss.
-> Use this for inference evaluation.
-
----
-
-## Step 5 — Evaluate the Fine-Tuned Model
-
-### Load consolidated checkpoint with HF AutoModelForMultimodalLM
-
-Because we set `save_consolidated: true` in the config, each checkpoint contains
-an HF-compatible `model/consolidated/` directory. Use HF's `AutoModelForMultimodalLM`
-for inference (generation), and load the processor from the **base model** path.
-
-```python
-import json
-import os
-import torch
-from datasets import load_dataset
-from transformers import AutoProcessor, AutoModelForMultimodalLM
-from nemo_automodel.components.datasets.vlm.utils import json2token
-
-# Paths
-BASE_MODEL = "google/gemma-4-31B-it"
-CKPT_DIR = "vlm_checkpoints/gemma4_31b_cord_v2"
-best_ckpt = os.path.realpath(os.path.join(CKPT_DIR, "LOWEST_VAL"))
-consolidated = os.path.join(best_ckpt, "model", "consolidated")
-
-# Load processor from base model, model from fine-tuned checkpoint
-processor = AutoProcessor.from_pretrained(BASE_MODEL)
-model = AutoModelForMultimodalLM.from_pretrained(
-    consolidated,
-    dtype=torch.bfloat16,
-    device_map="auto",
-).eval()
-
-# Evaluate on test set
-dataset = load_dataset("naver-clova-ix/cord-v2")
-print("Evaluating fine-tuned model:")
-ft_avg_ned, ft_results = evaluate_receipts(model, processor, dataset["test"])
-```
-
-### Fine-tuned output (test sample 1 -- perfect NED=0.0)
-
-```
-<s_total><s_total_price>91000</s_total_price><s_cashprice>91000</s_cashprice>
-</s_total><s_menu><s_price>17500</s_price><s_nm>J.STB PROMO</s_nm><sep/>
-<s_price>46000</s_price><s_nm>Y.B.BAT</s_nm><sep/><s_price>27500</s_price>
-<s_nm>Y.BASO PROM</s_nm></s_menu>
-```
-
-### Parsing the structured output
-
-You can convert the token sequence back to a structured dict:
-
-```python
-import re
-
-def token2json(token_seq):
-    """Convert a token sequence back to a JSON-like dict."""
-    result = {}
-    pattern = r"<s_(\w+)>(.*?)</s_\1>"
-    matches = re.findall(pattern, token_seq, re.DOTALL)
-    for key, value in matches:
-        if "<sep/>" in value:
-            items = value.split("<sep/>")
-            result[key] = [token2json(item) if "<s_" in item else item for item in items]
-        elif "<s_" in value:
-            result[key] = token2json(value)
-        else:
-            result[key] = value
-    return result
-
-parsed = token2json(prediction)
-print(json.dumps(parsed, indent=2))
-```
-
-Example parsed output (test sample 4):
-```json
-{
-  "total": {"total_price": "174,600", "changeprice": "25,400", "cashprice": "200,000"},
-  "sub_total": {"subtotal_price": "194,000", "discount_price": "19,400"},
-  "menu": [
-    {"price": "82,000", "nm": "ICE BLACKCOFFE"},
-    {"price": "44,000", "nm": "C.Capuccino (L)"},
-    {"price": "30,000", "nm": "C.WHITE COFFE"},
-    {"price": "38,000", "nm": "C.Capuccino (L)"}
-  ]
-}
-```
-
----
-
-## Step 6 — Results Comparison
-
-### Metrics (20 test samples)
-
-| Metric | Fine-Tuned (epoch_1_step_199) |
-|--------|-------------------------------|
-| **Average NED** | **0.0601** |
-| **Field-Level Accuracy** | **92.6%** |
-| Perfect matches (NED=0.0) | 10/20 (50%) |
-| Near-perfect (NED<0.05) | 14/20 (70%) |
-
-### Field-level extraction accuracy (actual)
-
-```
-Field                 Correct / Total  Accuracy
---------------------------------------------------
-total_price                18 /    19     94.7%
-subtotal_price             13 /    14     92.9%
-tax_price                   7 /     8     87.5%
-cashprice                  13 /    15     86.7%
-changeprice                12 /    12    100.0%
-OVERALL                    63 /    68     92.6%
-```
diff --git a/fern/versions/v0.4/pages/guides/vlm/gemma4.mdx b/docs/guides/vlm/gemma4.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/vlm/gemma4.mdx
rename to docs/guides/vlm/gemma4.mdx
diff --git a/docs/guides/vlm/mistral-medium-3-5.md b/docs/guides/vlm/mistral-medium-3-5.md
deleted file mode 100644
index e9a9cd8671..0000000000
--- a/docs/guides/vlm/mistral-medium-3-5.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# Fine-Tune Mistral Medium 3.5 VLM
-
-## Introduction
-
-[Mistral Medium 3.5](https://huggingface.co/mistralai) is Mistral AI's
-new flagship model. It is a **128B dense** transformer that merges
-*Mistral Medium 3.1*, *Magistral Medium*, and *Devstral 2* into a single
-checkpoint with a configurable reasoning mode, supports a **256k-token
-context window**, and serves as the default model in Mistral Vibe and
-Le Chat.
-
-The model ships natively in FP8, which,
-combined with its dense (non-MoE) layout, makes it materially smaller to
-deploy than comparably-capable MoE systems — full inference fits in a
-single H200 node or 2 × H100 nodes, and the recipe in this guide
-fine-tunes the full VLM end-to-end on 8 × H100 nodes (64 GPUs).
-
-**Architecture at a glance**
-
-- 88 Ministral-3 decoder layers (hidden 12288, 96 attention heads, 8 KV
-  heads, GQA), llama-style RoPE + RMSNorm + SwiGLU MLP.
-- Dense — no MoE routing. Compactness vs. MoE peers translates directly
-  into smaller per-GPU memory and easier multi-node sharding.
-- Pixtral vision tower + multi-modal projector for image inputs.
-- FP8 on disk; dequantized to BF16 per local TP shard inside the
-  standard DCP load path.
-
-This guide walks you through fine-tuning Mistral Medium 3.5 on a medical
-Visual Question Answering task using NVIDIA NeMo AutoModel. You will
-learn how to prepare the dataset, launch training on a Slurm cluster,
-and inspect the results.
-
-To set up your environment to run NeMo AutoModel, follow the
-[installation guide](https://github.com/NVIDIA-NeMo/Automodel#-install-nemo-automodel).
-
-## Data
-
-### MedPix-VQA Dataset
-
-We use the [MedPix-VQA](https://huggingface.co/datasets/mmoukouba/MedPix-VQA)
-dataset, a comprehensive medical Visual Question Answering dataset
-containing radiological images paired with question-answer pairs for
-medical image interpretation.
-
-- **20,500 total examples** (85% train / 15% validation)
-- **Columns**: `image_id`, `mode`, `case_id`, `question`, `answer`
-
-For a full walkthrough of how MedPix-VQA is preprocessed and integrated
-into NeMo AutoModel — including the chat-template conversion and collate
-functions — see the
-[Multi-Modal Dataset Guide](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/guides/vlm/dataset.md#multi-modal-datasets).
-
-## Launch Training
-
-We provide a ready-to-use recipe at
-[`examples/vlm_finetune/mistral3p5/mistral3p5_128b_medpix.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/mistral3p5/mistral3p5_128b_medpix.yaml).
-This recipe is configured for **8 nodes × 8 H100-80GB GPUs (64 GPUs total)**
-with TP=8, PP=8, DP=1. The vision tower and multi-modal projector are
-frozen by default and only the Ministral-3 language model is trained;
-flip `freeze_config.freeze_vision_tower: false` to train the vision
-side as well.
-
-NeMo AutoModel supports several ways to launch training — via the
-AutoModel CLI with Slurm, interactive sessions, `torchrun`, and more.
-For full details on all launch options (Slurm batch jobs, multi-node
-configuration, environment variables, etc.), see the
-[Run on a Cluster](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/launcher/slurm.md)
-guide.
-
-
-**Before you start**:
-
-- Hugging Face applies rate limits on downloads. We recommend cloning
-  the model repository to your local filesystem beforehand.
-- Ensure your Hugging Face cache (`HF_HOME`) is configured and that the
-  dataset is already cached locally.
-- To enable Weights & Biases logging, set your `WANDB_API_KEY` and
-  configure the `wandb` section in the YAML file.
-
-
-## Training Results
-
-The recipe produces a healthy initial loss aligned with the HF
-reference forward on matched samples. On MedPix-VQA the first
-optimizer step lands around per-token loss **3.2** and grad-norm
-**~930** (clipped to `max_grad_norm=1.0`), descending past 1.8 within
-a handful of steps. The HF reference forward (single-sample, FP8
-dequantize on-load) on the same first batch produces per-token loss
-**3.47**, confirming the distributed forward is numerically
-equivalent within bf16 + TP-reduction tolerance.
-
-The training loss curves for Mistral Medium 3.5 fine-tuned on
-MedPix-VQA are shown below.
-
-<p align="center">
-  <img src="https://raw.githubusercontent.com/NVIDIA-NeMo/Automodel/main/docs/guides/vlm/mistralm35.png" alt="Mistral Medium 3.5 Training Loss Curve" width="500">
-</p>
diff --git a/fern/versions/v0.4/pages/guides/vlm/mistral-medium-3-5.mdx b/docs/guides/vlm/mistral-medium-3-5.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/vlm/mistral-medium-3-5.mdx
rename to docs/guides/vlm/mistral-medium-3-5.mdx
diff --git a/docs/guides/vlm/nemotron-omni.md b/docs/guides/vlm/nemotron-omni.md
deleted file mode 100644
index 2ca682c306..0000000000
--- a/docs/guides/vlm/nemotron-omni.md
+++ /dev/null
@@ -1,479 +0,0 @@
-# Fine-Tuning NemotronOmni on CORD-v2 Receipts — End-to-End Guide
-
-**A step-by-step guide for fine-tuning NemotronOmni (33B MoE) to extract structured
-receipt data from scanned images using [NeMo Automodel](https://github.com/NVIDIA-NeMo/Automodel).
-Covers both full SFT and LoRA PEFT.**
-
----
-
-## What is NemotronOmni?
-
-NemotronOmni (`NemotronH_Nano_Omni_Reasoning_V3`) is a ~33B multimodal MoE model supporting
-image, video, and audio inputs.
-
-Key architectural details:
-- **LLM backbone**: NemotronV3 hybrid Mamba2 + Attention + MoE, 52 layers, hidden dim 2688
-- **Vision encoder**: RADIO v2.5-H (ViT-Huge), 256 vision tokens per tile
-- **Audio encoder**: Parakeet FastConformer (1024-dim)
-- **MoE**: 128 experts per MoE layer, top-6 routing with sigmoid gating
-- **Total parameters**: 33B (31.5B trainable with frozen vision/audio towers)
-
-## Fine-Tune for Receipt Field Extraction
-
-We fine-tune NemotronOmni on the **CORD-v2** (Consolidated Receipt Dataset) to extract
-structured fields from scanned receipts:
-
-| Field | Example |
-|-------|---------|
-| `menu` | Item names, quantities, prices |
-| `sub_total` | Subtotal, tax, discount |
-| `total` | Total price, cash paid, change |
-
-The **base model** produces free-form descriptions. After fine-tuning, it outputs
-**structured XML-like token sequences** matching the receipt fields.
-
-## Guide Overview
-
-| Step | Description |
-|------|-------------|
-| **Step 0** | Environment setup |
-| **Step 1** | Explore the CORD-v2 dataset |
-| **Step 2** | Training configuration (SFT and LoRA) |
-| **Step 3** | Launch fine-tuning |
-| **Step 4** | Run inference on the fine-tuned model |
-| **Step 5** | Compare SFT vs LoRA results |
-
-## Hardware Requirements
-
-- **8x H100 80 GB** GPUs required (MoE with EP=8)
-- **SFT memory**: ~49 GiB per GPU
-- **LoRA memory**: ~30 GiB per GPU
-- **Estimated training time**: ~10 min on 8x H100 (400 steps, 800 training samples)
-
----
-
-## Step 0 — Set Up the Environment
-
-```bash
-# Inside the NeMo AutoModel container (26.04+):
-cd /opt/Automodel
-
-# Or from a source checkout:
-git clone -b nemotron-omni ssh://git@gitlab-master.nvidia.com:12051/huiyingl/automodel-omni.git
-cd automodel-omni
-```
-
-:::{note}
-NemotronOmni requires `mamba_ssm`, `causal_conv1d`, and `decord` packages, which are included in the NeMo AutoModel container.
-:::
-
----
-
-## Step 1 — Explore the CORD-v2 Dataset
-
-[CORD-v2](https://huggingface.co/datasets/naver-clova-ix/cord-v2) contains scanned
-receipts with structured ground-truth JSON labels.
-
-```python
-import json
-from datasets import load_dataset
-
-dataset = load_dataset("naver-clova-ix/cord-v2")
-
-print(f"Train      : {len(dataset['train'])} samples")
-print(f"Validation : {len(dataset['validation'])} samples")
-print(f"Test       : {len(dataset['test'])} samples")
-
-# Inspect a sample
-ex = dataset["train"][0]
-gt = json.loads(ex["ground_truth"])["gt_parse"]
-print(f"\nGround-truth keys: {list(gt.keys())}")
-```
-
-Expected output:
-```
-Train      : 800 samples
-Validation : 100 samples
-Test       : 100 samples
-
-Ground-truth keys: ['menu', 'sub_total', 'total', 'void_menu']
-```
-
-### Target Format: JSON-to-Token Conversion
-
-NeMo Automodel converts structured JSON into an XML-like **token sequence** using
-the `json2token()` function. This is the format the model is trained to produce:
-
-```
-<s_total><s_total_price>45,500</s_total_price><s_changeprice>4,500</s_changeprice>
-<s_cashprice>50,000</s_cashprice></s_total><s_menu><s_price>16,500</s_price>
-<s_nm>REAL GANACHE</s_nm><s_cnt>1</s_cnt><sep/><s_price>13,000</s_price>
-<s_nm>EGG TART</s_nm><s_cnt>1</s_cnt></s_menu>
-```
-
----
-
-## Step 2 — Training Configuration
-
-### Full SFT Config
-
-**Config file**: `examples/vlm_finetune/nemotron_omni/nemotron_omni_cord_v2.yaml`
-
-```yaml
-recipe: FinetuneRecipeForVLM
-
-step_scheduler:
-  global_batch_size: 8
-  local_batch_size: 1
-  ckpt_every_steps: 100
-  val_every_steps: 200
-  max_steps: 400
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
-  pretrained_model_name_or_path: <path_to_nemotron_omni_v2.0>
-  trust_remote_code: true
-  torch_dtype: torch.bfloat16
-  backend:
-    _target_: nemo_automodel.components.models.common.BackendConfig
-    attn: sdpa
-    linear: torch
-    rms_norm: torch_fp32
-    rope_fusion: false
-    enable_deepep: false
-    fake_balanced_gate: false
-    enable_hf_state_dict_adapter: true
-
-distributed:
-  strategy: fsdp2
-  ep_size: 8            # 128 MoE experts across 8 GPUs
-
-freeze_config:
-  freeze_embeddings: true
-  freeze_vision_tower: true
-  freeze_audio_tower: true
-  freeze_language_model: false
-
-dataset:
-  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
-  path_or_dataset: naver-clova-ix/cord-v2
-  split: train
-
-dataloader:
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.vlm.collate_fns.nemotron_omni_collate_fn
-    max_length: 4096
-
-optimizer:
-  _target_: torch.optim.AdamW
-  lr: 1e-4
-  weight_decay: 0.01
-  betas: [0.9, 0.95]
-```
-
-### LoRA PEFT Config
-
-**Config file**: `examples/vlm_finetune/nemotron_omni/nemotron_omni_cord_v2_peft.yaml`
-
-Adds a `peft:` block to apply LoRA to language model linear layers only:
-
-```yaml
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  match_all_linear: false
-  exclude_modules:
-    - "*vision_tower*"
-    - "*vision_model*"
-    - "*audio*"
-    - "*sound*"
-    - "*lm_head*"
-    - "*mlp1*"
-  dim: 64
-  alpha: 128
-  use_triton: true
-
-optimizer:
-  _target_: torch.optim.AdamW
-  lr: 1e-3
-```
-
-### Collate function
-
-NemotronOmni uses InternVL-style image handling where each `<image>` token in the
-input is replaced by 256 vision embeddings during the model's forward pass. The
-collate function:
-1. Extracts images from the conversation
-2. Applies the chat template (which adds `<think></think>` prefix for the assistant turn)
-3. Processes images through the NemotronOmni processor
-4. Builds `image_flags` tensors and creates training labels
-
----
-
-## Step 3 — Launch Fine-Tuning
-
-### Full SFT
-
-```bash
-torchrun --nproc-per-node=8 \
-    examples/vlm_finetune/finetune.py \
-    -c examples/vlm_finetune/nemotron_omni/nemotron_omni_cord_v2.yaml
-```
-
-### LoRA PEFT
-
-```bash
-torchrun --nproc-per-node=8 \
-    examples/vlm_finetune/finetune.py \
-    -c examples/vlm_finetune/nemotron_omni/nemotron_omni_cord_v2_peft.yaml
-```
-
-### Training log — Full SFT
-
-```
-Trainable parameters: 31,570,023,872
-Trainable parameters percentage: 95.63%
-
-step    0 | loss 0.6866 | grad_norm  7.57 | lr 1.00e-04 | mem 37.29 GiB | tps/gpu   33
-step   10 | loss 0.0705 | grad_norm  1.00 | lr 1.00e-04 | mem 48.95 GiB | tps/gpu 2419
-step   50 | loss 0.0173 | grad_norm  0.43 | lr 1.00e-04 | mem 48.72 GiB | tps/gpu 2615
-step  100 | loss 0.0115 | grad_norm  0.37 | lr 1.00e-04 | mem 48.84 GiB | tps/gpu 2642
-step  200 | loss 0.0099 | grad_norm  0.20 | lr 1.00e-04 | mem 48.76 GiB | tps/gpu 2616
-step  300 | loss 0.0056 | grad_norm  0.15 | lr 1.00e-04 | mem 48.72 GiB | tps/gpu 2087
-step  399 | loss 0.0039 | grad_norm  0.17 | lr 1.00e-04 | mem 48.79 GiB | tps/gpu 2616
-
-Validation:
-  step  99 | val_loss 0.0363
-  step 199 | val_loss 0.0342  <-- LOWEST_VAL
-  step 299 | val_loss 0.0414
-  step 399 | val_loss 0.0425
-```
-
-### Training log — LoRA PEFT
-
-```
-Trainable parameters: 55,422,976
-Trainable parameters percentage: 0.17%
-
-step    0 | loss 0.6866 | grad_norm  1.92 | lr 1.00e-03 | mem 30.26 GiB | tps/gpu   34
-step   10 | loss 0.0557 | grad_norm  0.30 | lr 1.00e-03 | mem 30.16 GiB | tps/gpu 2455
-step   50 | loss 0.0392 | grad_norm  0.32 | lr 1.00e-03 | mem 30.16 GiB | tps/gpu 3352
-step  100 | loss 0.0309 | grad_norm  0.27 | lr 1.00e-03 | mem 30.20 GiB | tps/gpu 2456
-step  200 | loss 0.0280 | grad_norm  0.23 | lr 1.00e-03 | mem 30.34 GiB | tps/gpu 2477
-step  300 | loss 0.0326 | grad_norm  0.31 | lr 1.00e-03 | mem 30.52 GiB | tps/gpu 2737
-step  399 | loss 0.0171 | grad_norm  0.24 | lr 1.00e-03 | mem 30.33 GiB | tps/gpu 3258
-
-Validation:
-  step  99 | val_loss 0.0449  <-- LOWEST_VAL
-  step 199 | val_loss 0.0524
-  step 299 | val_loss 0.0482
-  step 399 | val_loss 0.0566
-```
-
-### Checkpoints saved
-
-```
-checkpoint_dir/
-  epoch_0_step_99/
-  epoch_1_step_199/
-  epoch_2_step_299/
-  epoch_3_step_399/
-    model/
-      consolidated/          <-- HF-compatible checkpoint for inference
-        config.json
-        model.safetensors.index.json
-        model-00001-of-00017.safetensors
-        ...
-    optim/
-    rng/
-    dataloader/
-  LATEST -> epoch_3_step_399
-  LOWEST_VAL -> epoch_1_step_199
-  training.jsonl
-  validation.jsonl
-```
-
-For LoRA, the checkpoint saves adapter weights instead:
-```
-  model/
-    adapter_model.safetensors   (~27 MB)
-    adapter_config.json
-```
-
-> **Tip**: `LOWEST_VAL` symlink points to the checkpoint with the best validation loss.
-
----
-
-## Step 4 — Run Inference on the Fine-Tuned Model
-
-### Full SFT inference
-
-Load the consolidated checkpoint and run inference on a handful of validation samples
-to spot-check structured output.
-
-```python
-import torch
-import json
-from transformers import AutoModel, AutoProcessor
-from datasets import load_dataset
-from nemo_automodel.components.datasets.vlm.utils import json2token
-
-CKPT = "<checkpoint_dir>/LOWEST_VAL/model/consolidated"
-
-# Load processor
-processor = AutoProcessor.from_pretrained(CKPT, trust_remote_code=True)
-tokenizer = processor.tokenizer
-
-# `device_map` streams weights directly to GPU; skipping the AutoModel.from_config
-# CPU-instantiation step saves ~5 min on the 30B v3 dump.
-model = AutoModel.from_pretrained(
-    CKPT, trust_remote_code=True, torch_dtype=torch.bfloat16,
-    device_map={"": torch.cuda.current_device()},
-)
-
-# Reset RADIO's `summary_idxs` (non-persistent buffer; can be a meta tensor after load)
-if hasattr(model, "vision_model") and hasattr(model.vision_model, "radio_model"):
-    model.vision_model.radio_model.summary_idxs = None
-
-model.eval()
-
-# Load dataset
-dataset = load_dataset("naver-clova-ix/cord-v2")
-
-# v3 processor returns extra placeholder-expansion metadata that is NOT a generate() kwarg.
-PROCESSOR_METADATA_KEYS = ("num_patches", "num_tokens", "imgs_sizes")
-
-# Run inference on the first 5 validation samples
-for i in range(5):
-    sample = dataset["validation"][i]
-    image = sample["image"].convert("RGB")
-    gt = json.loads(sample["ground_truth"])["gt_parse"]
-    gt_text = json2token(gt, sort_json_key=True)
-
-    # Build prompt — enable_thinking=False for structured output
-    messages = [{"role": "user", "content": "<image>\nDescribe this image."}]
-    text = tokenizer.apply_chat_template(
-        messages, tokenize=False,
-        add_generation_prompt=True, enable_thinking=False,
-    )
-    inputs = processor(text=text, images=[image], return_tensors="pt")
-    for k in PROCESSOR_METADATA_KEYS:
-        inputs.pop(k, None)
-    inputs = {k: v.to("cuda") if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
-
-    with torch.no_grad():
-        output_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
-
-    generated = tokenizer.decode(
-        output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True,
-    ).strip()
-
-    print(f"\n=== Sample {i} ===")
-    print(f"Ground truth: {gt_text}")
-    print(f"Prediction:   {generated}")
-```
-
-### LoRA PEFT inference
-
-NeMo Automodel saves LoRA adapters under its internal wrapper FQNs
-(e.g. `language_model.model.layers.X.mixer.in_proj`), which differ from the HF
-base model namespace (`language_model.backbone.layers.X.mixer.in_proj`).
-To apply the adapter, merge the delta weights directly into the base model with
-a small FQN translation:
-
-```python
-import json, re
-import torch
-from pathlib import Path
-from safetensors import safe_open
-from transformers import AutoModel, AutoProcessor
-
-BASE    = "<path_to_nemotron_omni_v3>"
-ADAPTER = "<ckpt_dir>/LOWEST_VAL/model"
-
-# Load base directly to GPU. Skip AutoModel.from_config — instantiating a 30B
-# model on CPU just to read the class type adds 5+ minutes.
-processor = AutoProcessor.from_pretrained(BASE, trust_remote_code=True)
-model = AutoModel.from_pretrained(
-    BASE, trust_remote_code=True, dtype=torch.bfloat16,
-    device_map={"": torch.cuda.current_device()},
-)
-if hasattr(model, "vision_model") and hasattr(model.vision_model, "radio_model"):
-    model.vision_model.radio_model.summary_idxs = None
-
-# Wrapper -> HF base FQN translation. vision_projector.* targets are listed in
-# adapter_config.json but no tensors are saved for them, so we just skip those.
-def translate(fqn):
-    if fqn.startswith("language_model.model."):
-        return "language_model.backbone." + fqn[len("language_model.model."):]
-    return None
-
-cfg   = json.loads((Path(ADAPTER) / "adapter_config.json").read_text())
-scale = cfg["lora_alpha"] / cfg["r"]
-
-pairs = {}
-with safe_open(str(Path(ADAPTER) / "adapter_model.safetensors"), framework="pt") as f:
-    for k in f.keys():
-        m = re.match(r"^base_model\.model\.(.+)\.lora_(A|B)\.weight$", k)
-        if m:
-            pairs.setdefault(m.group(1), {})[m.group(2)] = f.get_tensor(k)
-
-modules = dict(model.named_modules())
-for wrapper_fqn, ab in pairs.items():
-    hf_fqn = translate(wrapper_fqn)
-    if hf_fqn is None or hf_fqn not in modules:
-        continue
-    W = modules[hf_fqn].weight
-    A = ab["A"].to(device=W.device, dtype=torch.float32)
-    B = ab["B"].to(device=W.device, dtype=torch.float32)
-    with torch.no_grad():
-        W.add_(((B @ A) * scale).to(W.dtype))
-
-model.eval()
-# ... then run the same generate() loop as in the SFT example above.
-```
-
-**Resources** — single GPU; ~60 GB GPU RAM for the bf16 30B base.
-**Runtime** — ~75 s base load + ~1 s LoRA merge + ~5–15 s per sample.
-
----
-
-## Step 5 — Results Comparison
-
-### Evaluation on 5 CORD-v2 Validation Samples
-
-#### Full SFT (lr=1e-4, 400 steps, epoch_3_step_399)
-
-| Sample | Ground Truth | Prediction | Match |
-|--------|-------------|------------|-------|
-| 1 | `<s_total>...<s_nm>REAL GANACHE</s_nm>...<s_nm>EGG TART</s_nm>...<s_nm>PIZZA TOAST</s_nm>...` | Exact match | 100% |
-| 2 | `<s_total>...<s_nm>JAMUR</s_nm>...<s_nm>TAHU</s_nm>...` | Exact match | 100% |
-| 3 | `<s_total>...<s_nm>Gojek Chicken Chilli Sauce H</s_nm>...` | Correct values, slight name segmentation diff | 33% |
-| 4 | `<s_total>...<s_nm>VANILLA CHOCO HEART CAKE</s_nm>...` | Exact match | 100% |
-| 5 | `<s_total>...<s_nm>Sate Padang</s_nm>...` | Correct, extra `<s_unitprice>` field | ~0% |
-
-**3/5 exact matches. All samples produce correct structured output.**
-
-#### LoRA PEFT (rank=64, lr=1e-3, 400 steps, epoch_0_step_99)
-
-| Sample | Ground Truth | Prediction | Match |
-|--------|-------------|------------|-------|
-| 1 | `<s_total>...<s_nm>REAL GANACHE</s_nm>...` | Exact match | 100% |
-| 2 | `<s_total>...<s_nm>JAMUR</s_nm>...<s_nm>TAHU</s_nm>...` | Exact match | 100% |
-| 3 | `<s_total>...<s_nm>Gojek Chicken Chilli Sauce H</s_nm>...` | Correct values, slight name segmentation diff | 33% |
-| 4 | `<s_total>...<s_nm>VANILLA CHOCO HEART CAKE</s_nm>...` | Exact match | 100% |
-| 5 | `<s_total>...<s_nm>Sate Padang</s_nm>...` | Exact match | 100% |
-
-**4/5 exact matches. All samples produce correct structured output.**
-
-### Summary
-
-| | Full SFT | LoRA PEFT |
-|---|---|---|
-| Trainable params | 31.5B (95.63%) | 55M (0.17%) |
-| Learning rate | 1e-4 | 1e-3 |
-| GPU memory | ~49 GiB | ~30 GiB |
-| Training time (8x H100) | ~10 min | ~6 min |
-| Best val loss | 0.034 (step 199) | 0.045 (step 99) |
-| Final train loss | 0.004 | 0.017 |
-| Checkpoint size | ~64 GB | ~27 MB |
-| Exact matches (5 val) | 3/5 | 4/5 |
diff --git a/fern/versions/v0.4/pages/guides/vlm/nemotron-omni.mdx b/docs/guides/vlm/nemotron-omni.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/vlm/nemotron-omni.mdx
rename to docs/guides/vlm/nemotron-omni.mdx
diff --git a/docs/guides/vlm/qwen3-5.md b/docs/guides/vlm/qwen3-5.md
deleted file mode 100644
index 65f0088cdb..0000000000
--- a/docs/guides/vlm/qwen3-5.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# Fine-Tune Qwen3.5-VL
-
-## Introduction
-
-[Qwen/Qwen3.5-397B-A17B](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) is the latest vision-language model in the Qwen series developed by Alibaba. It’s a 397B-parameter (17B active) hybrid MoE model that uses a repeated layout of Gated DeltaNet and Gated Attention blocks, each paired with sparse MoE (512 experts; 10 routed + 1 shared active). Qwen3.5 is a major upgrade that unifies vision+language, boosts efficiency and multilingual coverage, delivering higher performance at lower latency/cost for developers and enterprises. Qwen3.5-397B-A17B shows competitive benchmark performance across knowledge, reasoning, coding, and agent tasks.
-<p align="center">
-  <img src="https://raw.githubusercontent.com/NVIDIA-NeMo/Automodel/main/docs/guides/vlm/qwen3_5scores.png" alt="Qwen3.5 benchmark" width="500">
-</p>
-
-This guide walks you through fine-tuning Qwen3.5 on a medical Visual Question Answering task using NVIDIA NeMo Automodel. You will learn how to prepare the dataset, launch training on a Slurm cluster, and inspect the results.
-
-To set up your environment to run NeMo Automodel, follow the [installation guide](https://github.com/NVIDIA-NeMo/Automodel#-install-nemo-automodel).
-
-## Data
-
-### MedPix-VQA Dataset
-
-We use the [MedPix-VQA](https://huggingface.co/datasets/mmoukouba/MedPix-VQA) dataset, a comprehensive medical Visual Question Answering dataset containing radiological images paired with question-answer pairs for medical image interpretation.
-
-- **20,500 total examples** (85% train / 15% validation)
-- **Columns**: `image_id`, `mode`, `case_id`, `question`, `answer`
-
-For a full walkthrough of how MedPix-VQA is preprocessed and integrated into NeMo Automodel—including the chat-template conversion and collate functions—see the [Multi-Modal Dataset Guide](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/guides/vlm/dataset.md#multi-modal-datasets).
-
-## Launch Training
-
-We provide a ready-to-use recipe at [`examples/vlm_finetune/qwen3_5_moe/qwen3_5_moe_medpix.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5_moe/qwen3_5_moe_medpix.yaml). This recipe is configured to run on 32 x 8 H100 nodes.
-
-NeMo Automodel supports several ways to launch training—via the Automodel CLI with Slurm, interactive sessions, `torchrun`, and more. For full details on all launch options (Slurm batch jobs, multi-node configuration, environment variables, etc.), see the [Run on a Cluster](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/launcher/slurm.md) guide.
-
-### Standalone Slurm Script
-
-We also provide a standalone Slurm script example for Qwen3.5. Before running it, ensure your cluster environment is configured following the [Run on a Cluster](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/launcher/slurm.md) guide. Then submit the job with the following command:
-
-```bash
-export TRANSFORMERS_OFFLINE=1
-export HF_HOME=your/path/to/hf_cache
-export HF_DATASETS_OFFLINE=1
-export WANDB_API_KEY=your_wandb_key
-
-srun --output=output.out \
-     --error=output.err \
-     --container-image /your/path/to/automodel26.02.image.sqsh --no-container-mount-home bash -c "
-  CUDA_DEVICE_MAX_CONNECTIONS=1 automodel \
-  examples/vlm_finetune/qwen3_5_moe/qwen3_5_moe_medpix.yaml \
-  --nproc-per-node=8 \
-  --model.pretrained_model_name_or_path=/your/local/qwen3.5weights \
-  --processor.pretrained_model_name_or_path=/your/local/qwen3.5weights "
-```
-
-**Before you start**:
-- Hugging Face applies rate limits on downloads. We recommend cloning the model repository to your local filesystem beforehand.
-- Ensure your Hugging Face cache (`HF_HOME`) is configured and that the dataset is already cached locally.
-- To enable Weights & Biases logging, set your `WANDB_API_KEY` and configure the `wandb` section in the YAML file.
-
-## Training Results
-
-The training loss curves for Qwen3.5-VL fine-tuned on MedPix-VQA are shown below.
-
-<p align="center">
-  <img src="https://raw.githubusercontent.com/NVIDIA-NeMo/Automodel/main/docs/guides/vlm/qwen3_5.png" alt="Qwen3.5-VL Training Loss Curve" width="500">
-</p>
\ No newline at end of file
diff --git a/fern/versions/v0.4/pages/guides/vlm/qwen3-5.mdx b/docs/guides/vlm/qwen3-5.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/guides/vlm/qwen3-5.mdx
rename to docs/guides/vlm/qwen3-5.mdx
diff --git a/docs/index.md b/docs/index.md
deleted file mode 100644
index 7c13422298..0000000000
--- a/docs/index.md
+++ /dev/null
@@ -1,304 +0,0 @@
----
-
-description: "NeMo AutoModel is a PyTorch DTensor-native SPMD open-source training library for scalable LLM and VLM training and fine-tuning with day-0 Hugging Face model support"
-
-categories:
-
-- documentation
-- home
-tags:
-- training
-- fine-tuning
-- distributed
-- gpu-accelerated
-- spmd
-- dtensor
-personas:
-- Machine Learning Engineers
-- Data Scientists
-- Researchers
-- DevOps Professionals
-difficulty: beginner
-content_type: index
----
-
-(automodel-home)=
-
-# NeMo AutoModel Documentation
-
-PyTorch-native training that scales from 1 GPU to thousands with a single config change. Load any Hugging Face model, point at your data, and start training; no checkpoint conversion and no boilerplate.
-**Quick links:** [🤗 HF Compatible](guides/huggingface-api-compatibility.md) | [🚀 Performance](performance-summary.md) | [📐 Scalability](about/key-features.md) | [🎯 SFT & PEFT](guides/llm/finetune.md) | [🎨 Diffusion](guides/diffusion/finetune.md) | [👁️ VLM](guides/vlm/gemma4.md)
-
-::::{grid} 2 2 2 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`book;1.5em;sd-mr-1` About
-:link: about/index
-:link-type: doc
-Overview of NeMo AutoModel and its capabilities.
-:::
-
-:::{grid-item-card} {octicon}`zap;1.5em;sd-mr-1` Key Features
-:link: about/key-features
-:link-type: doc
-Supported workflows, parallelism, recipes, and benchmarks.
-:::
-
-:::{grid-item-card} {octicon}`hubot;1.5em;sd-mr-1` 🤗 HF Integration
-:link: guides/huggingface-api-compatibility
-:link-type: doc
-A `transformers`-compatible library with accelerated model implementations.
-:::
-
-:::{grid-item-card} {octicon}`checklist;1.5em;sd-mr-1` Model Coverage
-:link: model-coverage/overview
-:link-type: doc
-Built on `transformers` for day-0 model support and OOTB compatibility.
-:::
-
-::::
-
-## Get Started
-
-```bash
-uv pip install nemo-automodel
-
-automodel --nproc-per-node=2 llama3_2_1b_squad.yaml
-```
-
-See the [installation guide](guides/installation.md) for Docker, source builds, and multi-node setup.
-See the [configuration guide](guides/configuration.md) for YAML recipes and CLI overrides.
-Launch on a [local workstation](launcher/local-workstation.md) or [SLURM cluster](launcher/slurm.md).
-
-## Latest Model Support
-
-New models are added regularly. Pick a model below to start fine-tuning, or see the [full release log](model-coverage/latest-models.md).
-
-| Date | Modality | Model |
-|------|----------|-------|
-| 2026-05-18 | Audio | Qwen3-Omni ASR ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/audio_finetune/qwen3_omni_asr/ami_sft.yaml)) |
-| 2026-04-07 | LLM | [GLM-5.1](https://github.com/NVIDIA-NeMo/Automodel/discussions/1719) ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_5.1_hellaswag_pp.yaml)) |
-| 2026-04-02 | VLM | Gemma 4 ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma4/gemma4_4b.yaml)) |
-| 2026-03-16 | VLM | [Mistral Small 4](https://github.com/NVIDIA-NeMo/Automodel/discussions/1558) ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/mistral4/mistral4_medpix.yaml)) |
-| 2026-03-11 | LLM | [Nemotron Super v3](https://github.com/NVIDIA-NeMo/Automodel/discussions/976) ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/nemotron/nemotron_super_v3_hellaswag.yaml)) |
-| 2026-03-03 | Diffusion | FLUX.1-dev ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/diffusion/finetune/flux_t2i_flow.yaml)) |
-
-## Recipes & Guides
-
-Find the right guide for your task: fine-tuning, pretraining, distillation, diffusion, and more.
-
-| I want to...                | Choose this when...                                                                 | Input Data                                        | Model     | Guide                                                     |
-| --------------------------- | ----------------------------------------------------------------------------------- | ------------------------------------------------- | --------- | --------------------------------------------------------- |
-| **SFT (full fine-tune)**    | You need maximum accuracy and have the GPU budget to update all weights             | Instruction / chat dataset                        | LLM       | [Start fine-tuning](guides/llm/finetune.md)               |
-| **PEFT (LoRA)**             | You want to fine-tune on limited GPU memory; updates <1 % of parameters             | Instruction / chat dataset                        | LLM       | [Start LoRA](guides/llm/finetune.md)     |
-| **Tool / function calling** | Your model needs to call APIs or tools with structured arguments                    | Function-calling dataset (queries + tool schemas) | LLM       | [Add tool calling](guides/llm/toolcalling.md)             |
-| **Fine-tune VLM**           | Your task involves both images and text (e.g., visual QA, captioning)               | Image + text dataset                              | VLM       | [Fine-tune VLM](guides/omni/gemma3-3n.md)                 |
-| **Fine-tune Gemma 4**       | You want to fine-tune Gemma 4 for structured extraction from images (e.g., receipts) | Image + text dataset                              | VLM       | [Fine-tune Gemma 4](guides/vlm/gemma4.md)                 |
-| **Fine-tune dLLM**          | You want to fine-tune a diffusion language model (e.g., LLaDA) using masked denoising | Instruction / chat dataset                        | dLLM      | [Fine-tune dLLM](guides/dllm/finetune.md)                 |
-| **Fine-tune Diffusion**     | You want to fine-tune a diffusion model for image or video generation               | Video / Image dataset                             | Diffusion | [Fine-tune Diffusion](guides/diffusion/finetune.md)       |
-| **Fine-tune VLM-MoE**       | You need large-scale vision-language training with sparse MoE efficiency            | Image + text dataset                              | VLM (MoE) | [Fine-tune VLM-MoE](guides/vlm/qwen3-5.md)                |
-| **Fine-tune Audio ASR**     | Adapt Qwen3-Omni for speech recognition on HF audio datasets                        | Audio + transcript dataset                        | Qwen3-Omni | [Fine-tune Qwen3-Omni ASR](guides/audio/qwen3-omni-asr.md) |
-| **Embedding fine-tune**     | You want to improve text similarity for search, retrieval, or RAG         | Text pairs / retrieval corpus                     | LLM       | {bdg-info}`Coming Soon`                                   |
-| **Fine-tune a large MoE**   | You are adapting a large sparse MoE model (DeepSeek-V3, GLM-5, etc.) to your domain | Text dataset (e.g., HellaSwag)                    | LLM (MoE) | [Fine-tune MoE](guides/llm/large-moe-finetune.md)         |
-| **Fine-tune DeepSeek V4 Flash** | You want to fine-tune the DeepSeek V4 Flash hybrid-attention MoE (SWA / CSA / HCA + hash-routing) | Text dataset (e.g., HellaSwag)                    | LLM (MoE) | [Fine-tune DeepSeek V4 Flash](guides/llm/dsv4-flash.md)   |
-| **Fine-tune Hy3-preview**       | You want to fine-tune Tencent's 295B MoE with sigmoid routing and per-head QK RMSNorm              | Text dataset (e.g., HellaSwag)                    | LLM (MoE) | [Fine-tune Hy3-preview](guides/llm/hy3.md)                |
-| **Sequence classification** | You need to classify text into categories (sentiment, topic, NLI)                   | Text + labels (e.g., GLUE MRPC)                   | LLM       | [Train classifier](guides/llm/sequence-classification.md) |
-| **QAT fine-tune**           | You want a quantized model that keeps accuracy for efficient deployment             | Text dataset                                      | LLM       | [Enable QAT](guides/quantization-aware-training.md)       |
-| **Knowledge distillation**  | You want a smaller, faster model that retains most of the teacher's quality         | Instruction dataset + teacher model               | LLM       | [Distill a model](guides/llm/knowledge-distillation.md)   |
-| **Pretrain an LLM**         | You are building a base model from scratch on your own corpus                       | Large unlabeled text corpus (e.g., FineWeb-Edu)   | LLM       | [Start pretraining](guides/llm/pretraining.md)            |
-| **Pretrain (NanoGPT)**      | You want quick pretraining experiments on a single node                             | FineWeb / text corpus                             | LLM       | [Try NanoGPT](guides/llm/nanogpt-pretraining.md)          |
-
-## Performance
-
-Training throughput on NVIDIA GPUs with optimized kernels for Hugging Face models.
-
-
-| Model            | GPUs | TFLOPs/sec/GPU | Tokens/sec/GPU | Optimizations          |
-| ---------------- | ---- | -------------- | -------------- | ---------------------- |
-| DeepSeek V3 671B | 256  | 250            | 1,002          | TE + DeepEP            |
-| GPT-OSS 20B      | 8    | 279            | 13,058         | TE + DeepEP + FlexAttn |
-| Qwen3 MoE 30B    | 8    | 277            | 12,040         | TE + DeepEP            |
-
-
-See the [full benchmark results](performance-summary.md) for configuration details and more models.
-
-## Advanced Topics
-
-Parallelism, precision, checkpointing strategies, and experiment tracking.
-
-::::{grid} 1 2 2 3
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`git-merge;1.5em;sd-mr-1` Pipeline Parallelism
-:link: guides/pipelining
-:link-type: doc
-Torch-native pipelining composable with FSDP2 and DTensor.
-+++
-{bdg-secondary}`3d-parallelism`
-:::
-
-:::{grid-item-card} {octicon}`zap;1.5em;sd-mr-1` FP8 Training
-:link: guides/fp8-training
-:link-type: doc
-Mixed-precision FP8 training with torchao.
-+++
-{bdg-secondary}`FP8` {bdg-secondary}`mixed-precision`
-:::
-
-:::{grid-item-card} {octicon}`database;1.5em;sd-mr-1` Checkpointing
-:link: guides/checkpointing
-:link-type: doc
-Distributed checkpoints with SafeTensors output.
-+++
-{bdg-secondary}`DCP` {bdg-secondary}`safetensors`
-:::
-
-:::{grid-item-card} {octicon}`shield-check;1.5em;sd-mr-1` Gradient Checkpointing
-:link: guides/gradient-checkpointing
-:link-type: doc
-Trade compute for memory with activation checkpointing.
-+++
-{bdg-secondary}`memory-efficiency`
-:::
-
-:::{grid-item-card} {octicon}`meter;1.5em;sd-mr-1` Quantization-Aware Training
-:link: guides/quantization-aware-training
-:link-type: doc
-Train with quantization for deployment-ready models.
-+++
-{bdg-secondary}`QAT`
-:::
-
-:::{grid-item-card} {octicon}`graph;1.5em;sd-mr-1` Experiment Tracking
-:link: guides/mlflow-logging
-:link-type: doc
-Track experiments and metrics with MLflow and Wandb.
-+++
-{bdg-secondary}`MLflow` {bdg-secondary}`Wandb`
-:::
-
-::::
-
-## For Developers
-
-::::{grid} 1 2 2 3
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`file-directory;1.5em;sd-mr-1` Repo Internals
-:link: repository-structure
-:link-type: doc
-Components, recipes, and CLI architecture.
-:::
-
-:::{grid-item-card} {octicon}`code;1.5em;sd-mr-1` API Reference
-:link: apidocs/index
-:link-type: doc
-Auto-generated Python API documentation.
-:::
-
-:::{grid-item-card} {octicon}`plug;1.5em;sd-mr-1` Use as a Library
-:link: about/index
-:link-type: doc
-Drop-in accelerated backend for TRL, lm-eval-harness, OpenRLHF, or any code that loads Hugging Face models.
-:::
-
-::::
-
----
-
-::::{toctree}
-:hidden:
-:caption: Get Started
-About <about/index.md>
-Key Features <about/key-features.md>
-Installation <guides/installation.md>
-Configuration <guides/configuration.md>
-🤗 HF Compatibility <guides/huggingface-api-compatibility.md>
-Repo Structure <repository-structure.md>
-Release Notes <release-notes.md>
-::::
-
-::::{toctree}
-:hidden:
-:caption: Announcements
-announcements.md
-::::
-
-::::{toctree}
-:hidden:
-:caption: Performance
-performance-summary.md
-::::
-
-::::{toctree}
-:hidden:
-:caption: Model Coverage
-Overview <model-coverage/overview.md>
-Release Log <model-coverage/latest-models.md>
-Large Language Models <model-coverage/llm/index.md>
-Vision Language Models <model-coverage/vlm/index.md>
-Omni <model-coverage/omni/index.md>
-Diffusion <model-coverage/diffusion/index.md>
-Embedding Models <model-coverage/embedding/index.md>
-Reranking Models <model-coverage/reranker/index.md>
-::::
-
-::::{toctree}
-:hidden:
-:caption: Recipes & E2E Examples
-Overview <guides/overview.md>
-SFT & PEFT <guides/llm/finetune.md>
-Function Calling <guides/llm/toolcalling.md>
-guides/llm/knowledge-distillation.md
-Large MoE Fine-Tuning <guides/llm/large-moe-finetune.md>
-DeepSeek V4 Flash <guides/llm/dsv4-flash.md>
-Hy3-preview <guides/llm/hy3.md>
-Pretraining <guides/llm/pretraining.md>
-NanoGPT Pretraining <guides/llm/nanogpt-pretraining.md>
-Sequence Classification <guides/llm/sequence-classification.md>
-Gemma 3 / 3n <guides/omni/gemma3-3n.md>
-Gemma 4 <guides/vlm/gemma4.md>
-Qwen3.5-VL <guides/vlm/qwen3-5.md>
-Nemotron-Omni <guides/vlm/nemotron-omni.md>
-Mistral Medium 3.5 VL <guides/vlm/mistral-medium-3-5.md>
-Qwen3-Omni ASR <guides/audio/qwen3-omni-asr.md>
-Diffusion Fine-Tuning <guides/diffusion/finetune.md>
-dLLM Fine-Tuning <guides/dllm/finetune.md>
-QAT <guides/quantization-aware-training.md>
-Databricks <guides/llm/databricks.md>
-::::
-
-::::{toctree}
-:hidden:
-:caption: Datasets
-Overview <guides/dataset-overview.md>
-Text Dataset <guides/llm/dataset.md>
-Retrieval Dataset <guides/llm/retrieval-dataset.md>
-ColumnMapped Dataset <guides/llm/column-mapped-text-instruction-dataset.md>
-ColumnMapped Iterable <guides/llm/column-mapped-text-instruction-iterable-dataset.md>
-Multi-Modal Dataset <guides/vlm/dataset.md>
-Diffusion Dataset <guides/diffusion/dataset.md>
-::::
-
-::::{toctree}
-:hidden:
-:caption: Job Launchers
-Overview <launcher/overview.md>
-Local Workstation <launcher/local-workstation.md>
-SLURM Cluster <launcher/slurm.md>
-NeMo Run <launcher/nemo-run.md>
-SkyPilot <launcher/skypilot.md>
-SkyPilot k8s <launcher/skypilot-kubernetes.md>
-::::
-
-::::{toctree}
-:hidden:
-:caption: Development
-guides/checkpointing.md
-Gradient Checkpointing <guides/gradient-checkpointing.md>
-Pipeline Parallelism <guides/pipelining.md>
-guides/fp8-training.md
-guides/mlflow-logging.md
-API Reference <apidocs/index.rst>
-Breaking Changes <breaking-changes.md>
-::::
diff --git a/fern/versions/nightly/pages/index.mdx b/docs/index.mdx
similarity index 94%
rename from fern/versions/nightly/pages/index.mdx
rename to docs/index.mdx
index 6383d35c16..55dc2708eb 100644
--- a/fern/versions/nightly/pages/index.mdx
+++ b/docs/index.mdx
@@ -4,7 +4,7 @@ description: "NeMo AutoModel is a PyTorch DTensor-native SPMD open-source traini
 ---
 import { Tag } from "@/components/Tag";
 
-PyTorch-native training that scales from 1 GPU to thousands with a single config change. Load any Hugging Face model, point at your data, and start training -- no checkpoint conversion, no boilerplate.
+PyTorch-native training that scales from 1 GPU to thousands with a single config change. Load any Hugging Face model, point at your data, and start training, with no checkpoint conversion and no boilerplate.
 **Quick links:** [🤗 HF Compatible](/get-started/hf-compatibility) | [🚀 Performance](/performance/performance-summary) | [📐 Scalability](/get-started/key-features) | [🎯 SFT & PEFT](/recipes-e2e-examples/sft-peft) | [🎨 Diffusion](/recipes-e2e-examples/diffusion-fine-tuning) | [👁️ VLM](/recipes-e2e-examples/gemma-4)
 
 <Cards>
@@ -44,6 +44,7 @@ New models are added regularly. Pick a model below to start fine-tuning, or see
 
 | Date | Modality | Model |
 |------|----------|-------|
+| 2026-05-18 | Audio | Qwen3-Omni ASR ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/audio_finetune/qwen3_omni_asr/ami_sft.yaml)) |
 | 2026-04-07 | LLM | [GLM-5.1](https://github.com/NVIDIA-NeMo/Automodel/discussions/1719) ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_5.1_hellaswag_pp.yaml)) |
 | 2026-04-02 | VLM | Gemma 4 ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma4/gemma4_4b.yaml)) |
 | 2026-03-16 | VLM | [Mistral Small 4](https://github.com/NVIDIA-NeMo/Automodel/discussions/1558) ([recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/mistral4/mistral4_medpix.yaml)) |
@@ -52,7 +53,7 @@ New models are added regularly. Pick a model below to start fine-tuning, or see
 
 ## Recipes & Guides
 
-Find the right guide for your task -- fine-tuning, pretraining, distillation, diffusion, and more.
+Find the right guide for your task: fine-tuning, pretraining, distillation, diffusion, and more.
 
 | I want to...                | Choose this when...                                                                 | Input Data                                        | Model     | Guide                                                     |
 | --------------------------- | ----------------------------------------------------------------------------------- | ------------------------------------------------- | --------- | --------------------------------------------------------- |
@@ -64,6 +65,7 @@ Find the right guide for your task -- fine-tuning, pretraining, distillation, di
 | **Fine-tune dLLM**          | You want to fine-tune a diffusion language model (e.g., LLaDA) using masked denoising | Instruction / chat dataset                        | dLLM      | [Fine-tune dLLM](/recipes-e2e-examples/dllm-fine-tuning)                 |
 | **Fine-tune Diffusion**     | You want to fine-tune a diffusion model for image or video generation               | Video / Image dataset                             | Diffusion | [Fine-tune Diffusion](/recipes-e2e-examples/diffusion-fine-tuning)       |
 | **Fine-tune VLM-MoE**       | You need large-scale vision-language training with sparse MoE efficiency            | Image + text dataset                              | VLM (MoE) | [Fine-tune VLM-MoE](/recipes-e2e-examples/qwen3-5-vl)                |
+| **Fine-tune Audio ASR**     | You want to adapt Qwen3-Omni for speech recognition on Hugging Face audio datasets  | Audio + transcript dataset                        | Qwen3-Omni | [Fine-tune Qwen3-Omni ASR](/recipes-e2e-examples/qwen3-omni-asr) |
 | **Embedding fine-tune**     | You want to improve text similarity for search, retrieval, or RAG         | Text pairs / retrieval corpus                     | LLM       | <Tag variant="info">Coming Soon</Tag>                                   |
 | **Fine-tune a large MoE**   | You are adapting a large sparse MoE model (DeepSeek-V3, GLM-5, etc.) to your domain | Text dataset (e.g., HellaSwag)                    | LLM (MoE) | [Fine-tune MoE](/recipes-e2e-examples/large-moe-fine-tuning)         |
 | **Fine-tune DeepSeek V4 Flash** | You want to fine-tune the DeepSeek V4 Flash hybrid-attention MoE (SWA / CSA / HCA + hash-routing) | Text dataset (e.g., HellaSwag)                    | LLM (MoE) | [Fine-tune DeepSeek V4 Flash](/recipes-e2e-examples/deepseek-v4-flash)   |
diff --git a/docs/launcher/local-workstation.md b/docs/launcher/local-workstation.md
deleted file mode 100644
index cd5cf01a7d..0000000000
--- a/docs/launcher/local-workstation.md
+++ /dev/null
@@ -1,150 +0,0 @@
-# Run on Your Local Workstation
-
-Use this guide for local, single-node workflows on a workstation or an interactive Slurm allocation. For setup details, refer to our [Installation Guide](../guides/installation.md).
-For batch multi-node jobs, see the [Slurm](./slurm.md) or [SkyPilot](./skypilot.md) guides.
-
-NeMo AutoModel uses recipes to run end-to-end workflows. If you're new to recipes, see the [Repository Structure](../repository-structure.md) guide.
-
-## Quick Start: Choose Your Job Launch Option
-
-- **CLI (recommended)**
-  ```bash
-  automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-  ```
-
-- **Direct recipe script**
-  - Single GPU
-    ```bash
-    python nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-    ```
-  - Multi-GPU (single node)
-    ```bash
-    torchrun --nproc-per-node=2 nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-    ```
-
-## Run with AutoModel CLI (Single Node)
-
-The AutoModel CLI is the preferred method for most users. It offers a unified interface to launch training scaling from a local workstation (this guide) to large clusters (see our [cluster guide](./slurm.md)).
-
-### Basic Usage
-
-The CLI follows this format:
-```bash
-automodel [--nproc-per-node N] <config.yaml> [--key.subkey=override ...]
-```
-
-A short alias `am` is also available. Both commands also work with `uv run` (e.g., `uv run automodel <config.yaml>`).
-
-Where:
-- `<config.yaml>`: Path to your YAML configuration file (must contain a `recipe._target_` key)
-- `--nproc-per-node`: Optional override for the number of GPUs to use
-
-The recipe class is specified inside the YAML via the `recipe._target_` key:
-```yaml
-recipe:
-  _target_: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-```
-
-### Train on a Single GPU
-
-For simple fine-tuning on a single GPU:
-
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-### Train on Multiple GPUs (Single Node)
-
-For interactive single-node jobs, the CLI automatically detects the number of available GPUs and
-uses `torchrun` for multi-GPU training. You can manually specify the number of GPUs using the `--nproc-per-node` option:
-
-```bash
-automodel --nproc-per-node 2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-If you don't specify `--nproc-per-node`, it will use all available GPUs on your system.
-
-Looking for Slurm or cloud training? See [Slurm](./slurm.md) or [SkyPilot](./skypilot.md).
-
-## Run with uv (Development Mode)
-
-When you need more control over the environment or are actively developing with the codebase, you can use `uv` to run training scripts directly. This approach gives you direct access to the underlying Python scripts and is ideal for debugging or customization.
-
-### Train on a Single GPU
-
-```bash
-uv run nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-### Train on Multiple GPUs with Torchrun (Single Node)
-
-For multi-GPU single-node training, use `torchrun` directly:
-
-```bash
-uv run torchrun --nproc-per-node=2 nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-### Why Use uv?
-
-uv provides several advantages for development and experimentation:
-
-- **Automatic environment management**: uv automatically creates and manages virtual environments, ensuring consistent dependencies without manual setup.
-- **Lock file synchronization**: Keeps your local environment perfectly synchronized with the project's `uv.lock` file.
-- **No installation required**: Run scripts directly from the repository without installing packages system-wide.
-- **Development flexibility**: Direct access to Python scripts for debugging, profiling, and customization.
-- **Dependency isolation**: Each project gets its own isolated environment, preventing conflicts.
-
-## Run with Torchrun
-
-If you have NeMo AutoModel installed in your environment and prefer to run recipes directly without uv, you can use `torchrun` directly:
-
-### Train on a Single GPU
-
-```bash
-python nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-### Train on Multiple GPUs (Single Node)
-
-```bash
-torchrun --nproc-per-node=2 nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-This approach requires that you have already installed NeMo AutoModel and its dependencies in your Python environment (see the [installation guide](../guides/installation.md) for details).
-
-## Customize Configuration Settings
-
-All approaches use the same YAML configuration files. You can easily customize training by following the steps in this section.
-
-1. **Override config values**: Use command-line arguments to directly replace default settings.
-For example, if you want to fine-tune `Qwen/Qwen3-0.6B` instead of `meta-llama/Llama-3.2-1B`, you can use:
-   ```bash
-   automodel config.yaml --model.pretrained_model_name_or_path Qwen/Qwen3-0.6B
-   ```
-
-2. **Edit the config file**: Modify the YAML directly for persistent changes.
-
-3. **Create custom configs**: Copy and modify existing configurations from the `examples/` directory.
-
-## When to Use Which Approach
-
-**Use the AutoModel CLI when:**
-- You want a simple, unified interface
-- You are running locally on a single machine
-- You don't need to modify the underlying code
-- You prefer a higher-level abstraction
-
-**Use uv when:**
-- You're developing or debugging the codebase
-- You want automatic dependency management
-- You need maximum control over the execution
-- You want to avoid manual environment setup
-- You're experimenting with custom modifications
-
-**Use Torchrun when:**
-- You have a stable, pre-configured environment
-- You prefer explicit control over Python execution
-- You're working in environments where uv is not available
-- You're integrating with existing PyTorch workflows
-
-All approaches use the same configuration files and provide the same training capabilities on a single node. For multi-node training, see [Run on a Cluster](./slurm.md).
diff --git a/fern/versions/v0.4/pages/launcher/local-workstation.mdx b/docs/launcher/local-workstation.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/launcher/local-workstation.mdx
rename to docs/launcher/local-workstation.mdx
diff --git a/docs/launcher/nemo-run.md b/docs/launcher/nemo-run.md
deleted file mode 100644
index 1c19021c61..0000000000
--- a/docs/launcher/nemo-run.md
+++ /dev/null
@@ -1,237 +0,0 @@
-# Run with NeMo-Run
-
-In this guide, you will learn how to launch NeMo AutoModel training jobs using [NeMo-Run](https://github.com/NVIDIA-NeMo/Run). NeMo-Run supports multiple backends including Slurm, Kubernetes, Docker, and local execution. For cloud-based training, see [Run on Any Cloud with SkyPilot](./skypilot.md). For direct sbatch usage, see [Run on a Cluster (Slurm)](./slurm.md). For single-node workstation usage, see [Run on Your Local Workstation](./local-workstation.md).
-
-NeMo-Run is an open-source tool from NVIDIA that manages job submission across different execution backends. You define your compute configuration once in a Python file and reuse it across all your training jobs.
-
-## Before You Begin
-
-1. **Install NeMo-Run** (it is not bundled with AutoModel):
-
-```bash
-pip install nemo-run
-```
-
-2. **Create an executor definitions file** at `$NEMORUN_HOME/executors.py`. `NEMORUN_HOME` defaults to `~/.nemo_run`; set the environment variable to use a different location. This file tells NeMo-Run how to reach your compute target. Every executor you reference in a YAML config must be defined here. See [Executor Setup](#executor-setup) for a complete example.
-
-3. **Verify connectivity** to the target in your executor (e.g. SSH for Slurm, kubeconfig for Kubernetes).
-
-4. **Set required environment variables** (if needed by your training config):
-
-```bash
-export HF_TOKEN=hf_...          # Required for gated models (e.g. Llama)
-export WANDB_API_KEY=...        # Optional: Weights & Biases logging
-```
-
-## Executor Setup
-
-The `executor:` field in your YAML config is a name that maps to an entry in `$NEMORUN_HOME/executors.py`. This file must define a module-level `EXECUTOR_MAP` dictionary. NeMo-Run supports several executor types -- here are examples of the most common ones:
-
-### Slurm Executor
-
-```python
-import nemo_run as run
-
-def my_slurm_cluster():
-    executor = run.SlurmExecutor(
-        account="my_account",
-        partition="batch",
-        tunnel=run.SSHTunnel(
-            user="myuser",
-            host="login-node.example.com",
-            job_dir="/remote/path/nemo_run/jobs",
-        ),
-        nodes=1,
-        ntasks_per_node=8,
-        gpus_per_node=8,
-        mem="0",
-        exclusive=True,
-        packager=run.Packager(),
-    )
-    executor.container_image = "nvcr.io/nvidia/nemo-automodel:26.02"
-    executor.container_mounts = ["/data:/data", "/checkpoints:/checkpoints"]
-    executor.env_vars = {"HF_HOME": "/data/hf_cache"}
-    executor.time = "04:00:00"
-    return executor
-
-EXECUTOR_MAP = {
-    "my_slurm": my_slurm_cluster(),
-}
-```
-
-### Kubernetes Executor
-
-```python
-import nemo_run as run
-
-def my_k8s_cluster():
-    return run.KubeflowExecutor(
-        namespace="training",
-        image="nvcr.io/nvidia/nemo-automodel:26.02",
-        num_nodes=1,
-        nprocs_per_node=8,
-        gpus_per_node=8,
-    )
-
-EXECUTOR_MAP = {
-    "my_k8s": my_k8s_cluster(),
-}
-```
-
-### Multiple Executors
-
-You can define as many executors as you need for different backends, clusters, or resource configurations:
-
-```python
-EXECUTOR_MAP = {
-    "slurm_dev": my_slurm_dev(),
-    "slurm_prod": my_slurm_prod(),
-    "k8s": my_k8s_cluster(),
-}
-```
-
-- Keys in `EXECUTOR_MAP` are names you reference in YAML (`executor: slurm_dev`).
-- Values can be executor instances or zero-argument callables that return one.
-- Override fields in the YAML (`nodes`, `devices`, `container_image`, etc.) are applied on top of the executor defaults.
-
-## Quickstart
-
-Any existing AutoModel YAML config can be run via NeMo-Run by adding a `nemo_run:` section at the top. For example, given an existing config that you run locally:
-
-```bash
-automodel examples/llm_finetune/qwen/qwen3_moe_30b_te_packed_sequence.yaml
-```
-
-Add a `nemo_run:` block to submit it to a remote executor instead:
-
-```yaml
-# -- Add this section to any existing config ----------------------------------
-nemo_run:
-  executor: my_slurm             # Name from EXECUTOR_MAP in $NEMORUN_HOME/executors.py
-  container_image: /images/custom.sqsh  # Override executor's default image
-  nodes: 1                       # Override number of nodes
-  ntasks_per_node: 8             # GPUs per node
-  time: "04:00:00"               # Override time limit
-  job_name: qwen3_moe_finetune   # Experiment and job name
-
-# -- Everything below is your existing training config (unchanged) ------------
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-step_scheduler:
-  global_batch_size: 32
-  # ... rest of your config ...
-```
-
-Then run the same command:
-
-```bash
-automodel your_config.yaml
-```
-
-The CLI detects the `nemo_run:` key, strips it from the training config, loads the named executor from `$NEMORUN_HOME/executors.py`, and submits the job -- all in one command.
-
-## Configuration Reference
-
-### All `nemo_run:` Fields
-
-| Field | Default | Description |
-|---|---|---|
-| `executor` | `"local"` | Name from `EXECUTOR_MAP` in `$NEMORUN_HOME/executors.py`, or `"local"` for local execution |
-| `job_name` | `<recipe_class_name>` | Experiment and job name |
-| `detach` | `true` | Return immediately after submission |
-| `tail_logs` | `false` | Stream logs after submission |
-| `executors_file` | `$NEMORUN_HOME/executors.py` | Path to the executor definitions file |
-| `job_dir` | `./nemo_run_jobs` | Local directory for job artifacts (config snapshot) |
-| *(any other key)* | *(from executor)* | Applied directly to the executor via `setattr`. Use the executor's native attribute names (e.g. `nodes`, `ntasks_per_node`, `partition`, `container_image`, `time`, `env_vars`). Dicts are merged, lists are extended. |
-
-## Examples
-
-### Single-Node Fine-Tuning (1 x 8 GPUs)
-
-```yaml
-nemo_run:
-  executor: my_slurm
-  nodes: 1
-  ntasks_per_node: 8
-  job_name: single_node_finetune
-```
-
-### Multi-Node Distributed Training (2 x 8 GPUs)
-
-```yaml
-nemo_run:
-  executor: my_slurm
-  nodes: 2
-  ntasks_per_node: 8
-  time: "08:00:00"
-  job_name: multinode_pretrain
-```
-
-For multi-node jobs the launcher automatically adds `--nnodes`, `--node-rank`, `--rdzv-backend`, `--master-addr`, and `--master-port` to the `torchrun` command.
-
-### Custom Container Image and Mounts
-
-```yaml
-nemo_run:
-  executor: my_slurm
-  container_image: /images/automodel_nightly.sqsh
-  container_mounts:
-    - /scratch/datasets:/datasets
-    - /scratch/checkpoints:/checkpoints
-  env_vars:
-    HF_HOME: /datasets/hf_cache
-    NCCL_DEBUG: INFO
-```
-
-### Local Execution (No Cluster)
-
-Use `executor: local` to run on the current machine. No `$NEMORUN_HOME/executors.py` entry is needed:
-
-```yaml
-nemo_run:
-  executor: local
-  ntasks_per_node: 2
-  job_name: local_test
-```
-
-## Monitor and Manage Jobs
-
-NeMo-Run stores experiment metadata under `$NEMORUN_HOME/experiments/`. Set `tail_logs: true` in the YAML to stream job output after submission.
-
-For Slurm-based executors, standard Slurm commands also work:
-
-```bash
-squeue -u $USER                 # List your queued and running jobs
-scancel <job_id>                # Cancel a running or pending job
-sacct -j <job_id>               # View job accounting information
-```
-
-For Kubernetes-based executors, use `kubectl` to monitor pods and jobs.
-
-## How It Works
-
-1. The `automodel` CLI detects the `nemo_run:` key and imports `NemoRunLauncher`.
-2. The `nemo_run:` section is popped from the config. The remaining training config is written to `nemo_run_jobs/<timestamp>/job_config.yaml` for record-keeping.
-3. The launcher loads a pre-configured executor from `$NEMORUN_HOME/executors.py` by name (or creates a `LocalExecutor` for `executor: local`). Override fields are applied on top of the executor defaults.
-4. The training config YAML is embedded in a self-contained inline bash script via a heredoc, so no separate file transfer is needed.
-5. A `torchrun` command is built with `--nproc-per-node` and (for multi-node) distributed rendezvous arguments.
-6. The script is submitted via `nemo_run.Experiment`. By default the call returns immediately (`detach=True`).
-
-## Customize Configuration
-
-Override any training parameter from the command line, same as with local runs:
-
-```bash
-automodel config_with_nemo_run.yaml \
-  --model.pretrained_model_name_or_path meta-llama/Llama-3.2-3B
-```
-
-## When to Use NeMo-Run vs. SkyPilot vs. Slurm
-
-| | NeMo-Run | SkyPilot | Slurm (sbatch) |
-|---|---|---|---|
-| **Infrastructure** | Slurm, Kubernetes, Docker, local | Public cloud (AWS, GCP, Azure) | On-prem HPC |
-| **Container support** | Yes (Pyxis/Enroot, Docker, K8s pods) | N/A (cloud VMs) | Manual (in sbatch script) |
-| **Setup required** | `nemo-run` + `$NEMORUN_HOME/executors.py` | Cloud credentials + `sky check` | Cluster access + sbatch script |
-| **Job submission** | `automodel config.yaml` | `automodel config.yaml` | `sbatch slurm.sub` |
-| **Good for** | Managed multi-backend execution, reusable executor configs | Cloud burst, cost optimization, spot instances | Direct Slurm scripts, full control over sbatch |
diff --git a/fern/versions/nightly/pages/launcher/nemo-run.mdx b/docs/launcher/nemo-run.mdx
similarity index 100%
rename from fern/versions/nightly/pages/launcher/nemo-run.mdx
rename to docs/launcher/nemo-run.mdx
diff --git a/docs/launcher/overview.md b/docs/launcher/overview.md
deleted file mode 100644
index f4c28c4661..0000000000
--- a/docs/launcher/overview.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# Job Launchers
-
-NeMo AutoModel provides several ways to launch training. The right choice depends on your hardware and environment.
-
-## Which Launcher Should I Use?
-
-| Launcher | Best for | GPUs | Guide |
-|---|---|---|---|
-| **Local Workstation** | Getting started, debugging, single-node training | 1-8 on one machine | [Local Workstation](./local-workstation.md) |
-| **NeMo-Run** | Managed execution on Slurm, Kubernetes, Docker, local | 1+ | [NeMo-Run](./nemo-run.md) |
-| **SkyPilot** | Cloud training or Kubernetes clusters | Any | [SkyPilot](./skypilot.md) |
-| **Slurm** | Multi-node batch jobs on HPC clusters | 8+ across nodes | [Slurm](./slurm.md) |
-
-### I Have 1–2 GPUs on My Workstation
-
-Use the **interactive** launcher. No scheduler or cluster software is needed:
-
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-See the [Local Workstation](./local-workstation.md) guide.
-
-### I Have Access to a Slurm Cluster
-
-Add a `slurm:` section to your YAML config and submit with the same `automodel` command. The CLI generates the `torchrun` invocation and calls `sbatch` for you:
-
-```bash
-automodel config_with_slurm.yaml
-```
-
-See the [Slurm](./slurm.md) guide.
-
-### I Want Managed Job Submission (Slurm, Kubernetes, Docker)
-
-Add a `nemo_run:` section to your YAML config. NeMo-Run loads a pre-configured executor for your compute target and submits the job:
-
-```bash
-automodel config_with_nemo_run.yaml
-```
-
-See the [NeMo-Run](./nemo-run.md) guide.
-
-### I Want to Train on the Cloud
-
-Add a `skypilot:` section to your YAML config. SkyPilot provisions VMs on any major cloud and handles spot-instance preemption automatically:
-
-```bash
-automodel config_with_skypilot.yaml
-```
-
-See the [SkyPilot](./skypilot.md) guide.
-
-### I Want to Train on Kubernetes with SkyPilot
-
-Use the same `skypilot:` launcher, but set `cloud: kubernetes`. This is a good fit when your team already has a GPU-backed Kubernetes cluster and you want SkyPilot to handle job submission and multi-node orchestration:
-
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes.yaml
-```
-
-See the [SkyPilot + Kubernetes tutorial](./skypilot-kubernetes.md).
-
-## All Launchers Use the Same Config
-
-Every launcher shares the same YAML recipe format. The only difference is an optional launcher section (`slurm:`, `nemo_run:`, or `skypilot:`) that tells the CLI where to run. Without a launcher section, training runs interactively on the current machine.
diff --git a/fern/versions/v0.4/pages/launcher/overview.mdx b/docs/launcher/overview.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/launcher/overview.mdx
rename to docs/launcher/overview.mdx
diff --git a/docs/launcher/skypilot-kubernetes.md b/docs/launcher/skypilot-kubernetes.md
deleted file mode 100644
index 3b8692c252..0000000000
--- a/docs/launcher/skypilot-kubernetes.md
+++ /dev/null
@@ -1,235 +0,0 @@
-# SkyPilot + Kubernetes Tutorial
-
-This tutorial shows how to run NeMo AutoModel on a Kubernetes cluster through SkyPilot.
-
-You will:
-
-1. Check that SkyPilot can see your Kubernetes cluster and GPUs.
-2. Launch a small NeMo AutoModel fine-tuning job on one GPU.
-3. Scale the same job to two nodes.
-4. Follow logs and clean everything up when you are done.
-
-This guide is written for new AutoModel users, so it keeps the moving pieces as small as possible.
-
-## Before you begin
-
-You need:
-
-- a working Kubernetes context in `kubectl`
-- at least one GPU-backed node in the cluster
-- SkyPilot installed with Kubernetes support
-- a local NeMo AutoModel checkout
-- a Hugging Face token in `HF_TOKEN` if you plan to use a gated model such as Llama
-
-If you are setting up SkyPilot on Kubernetes for the first time, the official SkyPilot Kubernetes setup guide is here:
-
-- <https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html>
-
-Install the SkyPilot Kubernetes client in your AutoModel environment:
-
-```bash
-uv pip install "skypilot[kubernetes]"
-```
-
-Set the token once in your shell:
-
-```bash
-export HF_TOKEN=hf_your_token_here
-```
-
-## Step 1: Verify the cluster
-
-Start with three quick checks:
-
-```bash
-kubectl config current-context
-kubectl get nodes
-sky check kubernetes
-```
-
-You want `sky check kubernetes` to report that Kubernetes is enabled.
-
-Next, ask SkyPilot which GPUs it can request from the cluster:
-
-```bash
-sky show-gpus --infra k8s
-```
-
-Example output:
-
-```text
-$ sky show-gpus --infra k8s
-Kubernetes GPUs
-GPU   REQUESTABLE_QTY_PER_NODE  UTILIZATION
-L4    1, 2, 4                   8 of 8 free
-H100  1, 2, 4, 8                8 of 8 free
-
-Kubernetes per node GPU availability
-NODE                       GPU    UTILIZATION
-gpu-node-a                 H100   8 of 8 free
-```
-
-If you do not see any GPUs here, stop and fix the Kubernetes or SkyPilot setup first. AutoModel is ready, but SkyPilot still cannot place GPU jobs.
-
-## Step 2: Run a single-node job
-
-The easiest starting point is a one-GPU fine-tune using the existing Llama 3.2 1B SQuAD example.
-
-This repository now includes a Kubernetes-flavored SkyPilot config at [`examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes.yaml`](../../examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes.yaml).
-
-Launch it from the repo root:
-
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes.yaml
-```
-
-The important part of that YAML is the `skypilot:` block:
-
-```yaml
-skypilot:
-  cloud: kubernetes
-  accelerators: H100:1
-  use_spot: false
-  disk_size: 200
-  job_name: llama3-2-1b-k8s
-  hf_token: ${HF_TOKEN}
-```
-
-What AutoModel does for you:
-
-- writes a launcher-free copy of the training config to `skypilot_jobs/<timestamp>/job_config.yaml`
-- syncs the repo to the SkyPilot workdir
-- runs `torchrun` on the Kubernetes worker pod
-- forwards your training config unchanged after removing the `skypilot:` section
-
-Example submission output:
-
-```text
-$ automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes.yaml
-INFO Config: /workspace/Automodel/examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes.yaml
-INFO Recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-INFO Launching job via SkyPilot
-INFO SkyPilot job artifacts in: /workspace/Automodel/skypilot_jobs/1712150400
-```
-
-Then watch the cluster come up:
-
-```bash
-sky status
-sky logs llama3-2-1b-k8s
-kubectl get pods
-```
-
-Example log snippet:
-
-```text
-$ sky status
-Clusters
-NAME              LAUNCHED  RESOURCES                    STATUS
-llama3-2-1b-k8s   1m ago    1x Kubernetes(H100:1)       UP
-
-$ sky logs llama3-2-1b-k8s
-...
-torchrun --nproc_per_node=1 ~/sky_workdir/nemo_automodel/recipes/llm/train_ft.py -c /tmp/automodel_job_config.yaml
-...
-```
-
-## Step 3: Scale to two nodes
-
-Once the single-node job works, scaling out is just a small YAML change.
-
-Use the two-node example at [`examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes_2nodes.yaml`](../../examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes_2nodes.yaml):
-
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes_2nodes.yaml
-```
-
-The launcher block looks like this:
-
-```yaml
-skypilot:
-  cloud: kubernetes
-  accelerators: H100:1
-  num_nodes: 2
-  use_spot: false
-  disk_size: 200
-  job_name: llama3-2-1b-k8s-2nodes
-  hf_token: ${HF_TOKEN}
-```
-
-For multi-node jobs, AutoModel switches the generated command to a distributed `torchrun` launch that uses SkyPilot's node metadata:
-
-```text
-torchrun \
-  --nproc_per_node=1 \
-  --nnodes=$SKYPILOT_NUM_NODES \
-  --node_rank=$SKYPILOT_NODE_RANK \
-  --rdzv_backend=c10d \
-  --master_addr=$(echo $SKYPILOT_NODE_IPS | head -n1) \
-  --master_port=12375 \
-  ~/sky_workdir/nemo_automodel/recipes/llm/train_ft.py \
-  -c /tmp/automodel_job_config.yaml
-```
-
-That means you do not need to hand-build rendezvous arguments yourself.
-
-Use these commands while the job is starting:
-
-```bash
-sky status
-sky logs llama3-2-1b-k8s-2nodes
-kubectl get pods -o wide
-```
-
-What you want to see:
-
-- two SkyPilot-managed worker pods
-- both pods scheduled onto GPU nodes
-- logs that include `--nnodes=$SKYPILOT_NUM_NODES`
-
-## Step 4: Clean up
-
-When the run is finished, tear the cluster down so it stops consuming resources:
-
-```bash
-sky down llama3-2-1b-k8s
-sky down llama3-2-1b-k8s-2nodes
-```
-
-You can remove old local launcher artifacts too:
-
-```bash
-rm -rf skypilot_jobs
-```
-
-## Common first-run issues
-
-### `sky check kubernetes` fails
-
-Usually this means SkyPilot cannot use your current kubeconfig context yet. Re-check the context with `kubectl config current-context`, then compare it with SkyPilot's Kubernetes setup guide.
-
-### `sky show-gpus --infra k8s` shows no GPUs
-
-SkyPilot can only schedule GPUs that Kubernetes exposes. Make sure the GPU device plugin or operator is installed and the GPU nodes are healthy.
-
-### The job starts, but model download fails
-
-For gated models, make sure `HF_TOKEN` is exported in the shell that runs `automodel`. The SkyPilot launcher forwards it to the remote job.
-
-### Multi-node launch stalls during rendezvous
-
-Start with the single-node example first. If that works, check that:
-
-- your cluster has enough free GPU nodes for `num_nodes`
-- worker pods can talk to each other over the cluster network
-- the logs include the generated `torchrun` multi-node arguments shown above
-
-## Which file should I edit?
-
-If you want to adapt this tutorial for your own model, the quickest path is:
-
-1. Copy [`examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes.yaml`](../../examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot_kubernetes.yaml).
-2. Change the `model` and dataset sections.
-3. Keep the `skypilot:` block small until the first run succeeds.
-
-That way, when something goes wrong, you only have a few knobs to inspect.
diff --git a/fern/versions/v0.4/pages/launcher/skypilot-kubernetes.mdx b/docs/launcher/skypilot-kubernetes.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/launcher/skypilot-kubernetes.mdx
rename to docs/launcher/skypilot-kubernetes.mdx
diff --git a/docs/launcher/skypilot.md b/docs/launcher/skypilot.md
deleted file mode 100644
index 55d4061fac..0000000000
--- a/docs/launcher/skypilot.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Run with SkyPilot
-
-In this guide, you will learn how to launch NeMo AutoModel training jobs with [SkyPilot](https://docs.skypilot.co/en/stable/docs/). SkyPilot can target public clouds such as AWS, GCP, Azure, and Lambda, and it can also submit jobs to Kubernetes clusters. For a beginner-friendly Kubernetes walkthrough, see [SkyPilot + Kubernetes tutorial](./skypilot-kubernetes.md). For on-premises cluster usage without SkyPilot, see [Run on a Cluster (Slurm)](./slurm.md). For single-node workstation usage, see [Run on Your Local Workstation](./local-workstation.md).
-
-SkyPilot is an open-source framework that abstracts cloud infrastructure so you can train on whichever cloud is cheapest or most available at launch time — including automatic spot-instance handling for significant cost savings.
-
-## Before You Begin
-
-Complete the following setup steps before launching your first AutoModel job on a cloud provider.
-
-1. **Install SkyPilot** with the connector for your target infrastructure:
-
-```bash
-uv pip install "skypilot[gcp]"         # Google Cloud
-uv pip install "skypilot[aws]"         # Amazon Web Services
-uv pip install "skypilot[azure]"       # Microsoft Azure
-uv pip install "skypilot[lambda]"      # Lambda Cloud
-uv pip install "skypilot[kubernetes]"  # Any Kubernetes cluster
-```
-
-2. **Configure access** for your target infrastructure, then verify:
-
-```bash
-sky check
-```
-
-You should see at least one cloud listed as **OK**.
-
-3. **Set required environment variables:**
-
-```bash
-export HF_TOKEN=hf_...          # Required for gated models (e.g. Llama)
-export WANDB_API_KEY=...        # Optional: Weights & Biases logging
-```
-
-## Quickstart
-
-Add a `skypilot:` section to any existing config YAML, then run the same `automodel` command you already know:
-
-```bash
-automodel your_config_with_skypilot.yaml
-```
-
-The CLI detects the `skypilot:` key, strips it from the training config, uploads the code and config to a cloud VM, and launches training — all in one command.
-
-## Configuration Reference
-
-Below is an annotated example for fine-tuning Llama-3.2-1B on SQuAD on a GCP spot T4. A ready-to-run copy lives at [`examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml`](../../examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml).
-
-```yaml
-# ── SkyPilot launcher section ─────────────────────────────────────────────
-# Removed before the training config reaches the remote VM.
-skypilot:
-  cloud: gcp                  # aws | gcp | azure | lambda | kubernetes
-  accelerators: T4:1          # GPU type:count per node, e.g. A100:8
-  use_spot: true              # ~80 % cost reduction vs on-demand
-  disk_size: 100              # Remote VM disk size in GB
-  num_nodes: 1                # Increase for multi-node distributed training
-  region: us-central1         # Optional — SkyPilot picks cheapest if omitted
-  job_name: llama3_2_finetune # Also used as the SkyPilot cluster name
-
-  # Use env-var placeholders so secrets are never stored in YAML
-  hf_token: ${HF_TOKEN}
-  # wandb_key: ${WANDB_API_KEY}
-
-  # Optional: extra shell commands run on the VM after `pip install -e .`
-  # setup: |
-  #   pip install some-extra-dependency
-
-  # Optional: override the default output directory (default: ./skypilot_jobs)
-  # job_dir: /path/to/skypilot/jobs
-
-# ── Training config (forwarded to the VM unchanged) ───────────────────────
-step_scheduler:
-  global_batch_size: 64
-  local_batch_size: 8
-  num_epochs: 1
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-# ... rest of your training config ...
-```
-
-### All `skypilot:` Fields
-
-| Field | Default | Description |
-|---|---|---|
-| `cloud` | *(required)* | Cloud provider: `aws`, `gcp`, `azure`, `lambda`, `kubernetes` |
-| `accelerators` | `T4:1` | GPU type and count per node, e.g. `A100:8`, `V100:4` |
-| `num_nodes` | `1` | Number of VMs for distributed training |
-| `use_spot` | `true` | Use spot/preemptible instances |
-| `disk_size` | `100` | Remote VM disk size in GB |
-| `region` | *(auto)* | Cloud region; SkyPilot selects cheapest if omitted |
-| `zone` | *(auto)* | Availability zone within the region |
-| `instance_type` | *(auto)* | Specific instance type; auto-selected if omitted |
-| `job_name` | `<domain>_<command>` | Job and SkyPilot cluster name |
-| `setup` | *(auto)* | Extra setup commands run after `pip install -e .` |
-| `hf_home` | `~/.cache/huggingface` | Hugging Face cache directory on the remote VM |
-| `hf_token` | `$HF_TOKEN` env | Hugging Face token for gated model access |
-| `wandb_key` | `$WANDB_API_KEY` env | Weights & Biases API key |
-| `env_vars` | `{}` | Additional environment variables for the remote VM |
-| `job_dir` | `./skypilot_jobs` | Local directory for job artifacts (config snapshot, logs) |
-| `gpus_per_node` | *(parsed from `accelerators`)* | Override GPU count per node passed to `torchrun` |
-
-## Cloud Examples
-
-### AWS — On-Demand A10G
-
-```yaml
-skypilot:
-  cloud: aws
-  accelerators: A10G:1
-  use_spot: false
-  region: us-east-1
-  job_name: llm_aws_finetune
-  hf_token: ${HF_TOKEN}
-```
-
-### GCP — Spot V100, 8 GPUs (Single Node)
-
-```yaml
-skypilot:
-  cloud: gcp
-  accelerators: V100:8
-  use_spot: true
-  region: us-west1
-  job_name: llm_gcp_v100_8gpu
-  hf_token: ${HF_TOKEN}
-```
-
-### Multi-Node Distributed Training (2 x 8 x A100)
-
-```yaml
-skypilot:
-  cloud: gcp
-  accelerators: A100:8
-  num_nodes: 2
-  use_spot: false
-  job_name: llm_multinode_a100
-  hf_token: ${HF_TOKEN}
-```
-
-For multi-node jobs, the launcher automatically adds the SkyPilot rendezvous environment variables (`$SKYPILOT_NODE_RANK`, `$SKYPILOT_NUM_NODES`, `$SKYPILOT_NODE_IPS`) to the `torchrun` command.
-
-## Monitor and Manage Jobs
-
-After submitting, use standard SkyPilot commands:
-
-```bash
-sky status                    # List running clusters and their status
-sky logs <cluster_name>       # Stream training logs
-sky ssh <cluster_name>        # SSH into the VM for debugging
-sky cancel <cluster_name> <job_id>  # Cancel a running job
-sky down <cluster_name>       # Terminate the cluster and stop billing
-```
-
-## How It Works
-
-1. The `automodel` CLI detects the `skypilot:` key in the YAML and calls `launch_with_skypilot()`.
-2. The training config (with `skypilot:` removed) is written to a local `skypilot_jobs/<timestamp>/job_config.yaml`.
-3. A `sky.Task` is created with:
-   - **workdir** — the current directory synced to `~/sky_workdir` on the remote VM.
-   - **file_mounts** — the job config uploaded to `/tmp/automodel_job_config.yaml`.
-   - **setup** — `pip install -e .` (plus any custom `setup:` commands).
-   - **run** — a `torchrun` command pointing at the recipe script and config.
-4. `sky.launch()` provisions the VM, runs setup, then executes training. The call returns immediately (`detach_run=True`); use `sky logs` to follow progress.
-
-## Customize Configuration
-
-Override any training parameter from the command line, same as with local runs:
-
-```bash
-automodel config_with_skypilot.yaml \
-  --model.pretrained_model_name_or_path meta-llama/Llama-3.2-3B
-```
-
-## Kubernetes Users
-
-If you want to run on a Kubernetes cluster, use `cloud: kubernetes` and follow the dedicated [SkyPilot + Kubernetes tutorial](./skypilot-kubernetes.md). That guide includes:
-
-- a copy-paste single-node config
-- a two-node example
-- sample `sky` and `kubectl` output to help you sanity-check your setup
-- a short troubleshooting section for common first-run issues
-
-## When to Use SkyPilot vs. Slurm
-
-| | SkyPilot | Slurm |
-|---|---|---|
-| **Infrastructure** | Any public cloud | On-premises HPC cluster |
-| **Spot instances** | Yes (automatic) | Depends on cluster config |
-| **Setup required** | Cloud credentials + `sky check` | Cluster access |
-| **Good for** | Flexible cloud burst, cost optimization | Fixed on-prem GPU clusters |
diff --git a/fern/versions/v0.4/pages/launcher/skypilot.mdx b/docs/launcher/skypilot.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/launcher/skypilot.mdx
rename to docs/launcher/skypilot.mdx
diff --git a/docs/launcher/slurm.md b/docs/launcher/slurm.md
deleted file mode 100644
index 1ac5ec65c5..0000000000
--- a/docs/launcher/slurm.md
+++ /dev/null
@@ -1,192 +0,0 @@
-# Run on a Cluster
-
-In this guide, you will learn how to submit distributed training jobs on Slurm clusters (single- or multi-node). For single-node workstation usage, see [Run on Your Local Workstation](./local-workstation.md). For setup details, refer to our [Installation Guide](../guides/installation.md).
-
-NeMo AutoModel uses recipes to run end-to-end workflows. If you're new to recipes, see the [Repository Structure](../repository-structure.md) guide.
-
-
-## Quickstart
-
-```bash
-# Edit the reference script for your cluster, then submit:
-cp slurm.sub my_cluster.sub
-vim my_cluster.sub
-sbatch my_cluster.sub
-```
-
-For interactive testing on a Slurm node:
-  - Single node, single GPU
-    ```bash
-    automodel your_config.yaml
-    ```
-  - Single node, multiple GPUs
-    ```bash
-    automodel --nproc-per-node 8 your_config.yaml
-    ```
-
-## Submit a Batch Job with Slurm
-
-SLURM clusters vary widely: some use Pyxis containers, others use
-Singularity/Apptainer, and many run bare-metal with environment modules.
-Instead of trying to cover all variations in code, AutoModel provides a
-reference sbatch script that you copy and adapt to your cluster.
-
-### Getting Started
-
-1. Copy the reference script:
-
-```bash
-cp slurm.sub my_cluster.sub
-```
-
-2. Edit `my_cluster.sub` — change `CONFIG`, `#SBATCH` directives (account,
-   partition, nodes, time), container runtime, mounts, and secrets for your
-   cluster.
-
-3. Submit the job:
-
-```bash
-sbatch my_cluster.sub
-```
-
-### How It Works
-
-The reference `slurm.sub` script:
-
-1. Sets `CONFIG` to point at your YAML recipe config
-2. Allocates nodes via SBATCH directives
-3. Sets up the multi-node environment (`MASTER_ADDR`, `MASTER_PORT`)
-4. Runs `torchrun -m nemo_automodel.cli.app $CONFIG` on each node via `srun`
-5. Each torchrun worker detects the distributed environment and runs the recipe in-process
-
-All cluster-specific configuration (SBATCH directives, container runtime,
-mounts, NCCL tuning, secrets) lives in your sbatch script where you can see
-and edit it directly.
-
-
-### Examples
-
-**Pyxis container (NVIDIA clusters):**
-
-```bash
-#!/bin/bash
-#SBATCH -A my_account
-#SBATCH -p batch
-#SBATCH -t 01:00:00
-#SBATCH -N 8
-#SBATCH --gpus-per-node=8
-#SBATCH --ntasks-per-node=1
-#SBATCH -J automodel-finetune
-#SBATCH --output=slurm_jobs/%x_%j.out
-#SBATCH --error=slurm_jobs/%x_%j.err
-
-CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-
-CONT=/lustre/fsw/images/automodel.sqsh
-CONT_NAME=automodel-training
-CONT_MOUNT="\
-/home/$USER/Automodel:/opt/Automodel,\
-/home/$USER/.cache/huggingface:/root/.cache/huggingface"
-
-export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
-export MASTER_PORT=13742
-
-srun \
-    --container-name="${CONT_NAME}" \
-    --container-image="${CONT}" \
-    --container-mounts="${CONT_MOUNT}" \
-    --container-entrypoint \
-    --no-container-mount-home \
-    --export=ALL \
-    bash -c "\
-        cd /opt/Automodel && \
-        torchrun \
-            --nproc-per-node=\${SLURM_GPUS_PER_NODE:-8} \
-            --nnodes=\${SLURM_NNODES:-1} \
-            --rdzv_backend=c10d \
-            --rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \
-            -m nemo_automodel.cli.app ${CONFIG}"
-```
-
-**Bare-metal (no container):**
-
-```bash
-#!/bin/bash
-#SBATCH -A my_account
-#SBATCH -p gpu
-#SBATCH -N 2
-#SBATCH --gpus-per-node=8
-#SBATCH --time=01:00:00
-
-CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-
-export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
-export MASTER_PORT=13742
-
-module load cuda/12.8
-source /opt/venvs/automodel/bin/activate
-
-srun bash -c "\
-    torchrun \
-        --nproc-per-node=\${SLURM_GPUS_PER_NODE:-8} \
-        --nnodes=\${SLURM_NNODES:-1} \
-        --rdzv_backend=c10d \
-        --rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \
-        -m nemo_automodel.cli.app ${CONFIG}"
-```
-
-**Apptainer / Singularity:**
-
-```bash
-#!/bin/bash
-#SBATCH -A my_account
-#SBATCH -p gpu
-#SBATCH -N 2
-#SBATCH --gpus-per-node=8
-#SBATCH --time=01:00:00
-
-CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-
-export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
-export MASTER_PORT=13742
-
-srun apptainer exec --nv /shared/images/automodel.sif \
-    bash -c "\
-        torchrun \
-            --nproc-per-node=\${SLURM_GPUS_PER_NODE:-8} \
-            --nnodes=\${SLURM_NNODES:-1} \
-            --rdzv_backend=c10d \
-            --rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \
-            -m nemo_automodel.cli.app ${CONFIG}"
-```
-
-
-### Launch with Modified Code
-
-If the script is executed from within a Git repository accessible to Slurm
-workers, automodel will use the repository source over the installation
-inside the container image (it prepends `$CWD` to `PYTHONPATH` when it
-detects an editable checkout).
-
-```bash
-git clone git@github.com:NVIDIA-NeMo/Automodel.git automodel_test_repo
-cd automodel_test_repo/
-sbatch slurm.sub
-```
-
-## Customize Configuration Settings
-
-You can customize training by following the steps in this section.
-
-1. **Override config values**: Edit the `CONFIG` variable and add CLI overrides
-   in your torchrun command inside the sbatch script. For example, to change
-   the model:
-   ```bash
-   -m nemo_automodel.cli.app ${CONFIG} --model.pretrained_model_name_or_path Qwen/Qwen3-0.6B
-   ```
-
-2. **Edit the config file**: Modify the YAML directly for persistent changes.
-
-3. **Create custom configs**: Copy and modify existing configurations from the `examples/` directory.
-
-For single-node workflows, see our [Run on Your Local Workstation](./local-workstation.md) guide.
diff --git a/fern/versions/v0.4/pages/launcher/slurm.mdx b/docs/launcher/slurm.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/launcher/slurm.mdx
rename to docs/launcher/slurm.mdx
diff --git a/docs/model-coverage/diffusion/black-forest-labs/flux.md b/docs/model-coverage/diffusion/black-forest-labs/flux.md
deleted file mode 100644
index 40d51cafc3..0000000000
--- a/docs/model-coverage/diffusion/black-forest-labs/flux.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# FLUX.1-dev
-
-[FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) is a 12B parameter text-to-image diffusion transformer from Black Forest Labs, trained with flow matching. It produces high-fidelity images and is designed for non-commercial research and development use.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text-to-Image |
-| **Architecture** | DiT (Flow Matching) |
-| **Parameters** | 12B |
-| **HF Org** | [black-forest-labs](https://huggingface.co/black-forest-labs) |
-:::
-
-## Available Models
-
-- **FLUX.1-dev**: 12B parameters
-
-## Task
-
-- Text-to-Image (T2I)
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| FLUX.1-dev | [`black-forest-labs/FLUX.1-dev`](https://huggingface.co/black-forest-labs/FLUX.1-dev) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`flux_t2i_flow.yaml <../../../../examples/diffusion/finetune/flux_t2i_flow.yaml>` | Fine-tune — FLUX.1-dev with flow matching |
-| {download}`flux_t2i_flow.yaml <../../../../examples/diffusion/pretrain/flux_t2i_flow.yaml>` | Pretrain — FLUX.1-dev with flow matching |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/flux_t2i_flow.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/flux_t2i_flow.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [Diffusion Fine-Tuning Guide](../../../guides/diffusion/finetune.md).
-
-## Training
-
-See the [Diffusion Training and Fine-Tuning Guide](../../../guides/diffusion/finetune.md) and [Dataset Preparation](../../../guides/diffusion/dataset.md).
-
-## Hugging Face Model Cards
-
-- [black-forest-labs/FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev)
diff --git a/fern/versions/v0.4/pages/model-coverage/diffusion/black-forest-labs/flux.mdx b/docs/model-coverage/diffusion/black-forest-labs/flux.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/diffusion/black-forest-labs/flux.mdx
rename to docs/model-coverage/diffusion/black-forest-labs/flux.mdx
diff --git a/docs/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.md b/docs/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.md
deleted file mode 100644
index 4dc15df7f2..0000000000
--- a/docs/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# HunyuanVideo 1.5
-
-[HunyuanVideo 1.5](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v) is a 13B parameter text-to-video diffusion model from the Hunyuan community, supporting 720p resolution video generation with flow matching training.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text-to-Video |
-| **Architecture** | DiT (Flow Matching) |
-| **Parameters** | 13B |
-| **HF Org** | [hunyuanvideo-community](https://huggingface.co/hunyuanvideo-community) |
-:::
-
-## Available Models
-
-- **HunyuanVideo-1.5-Diffusers-720p_t2v**: 13B parameters
-
-## Task
-
-- Text-to-Video (T2V)
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| HunyuanVideo 1.5 720p T2V | [`hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`hunyuan_t2v_flow.yaml <../../../../examples/diffusion/finetune/hunyuan_t2v_flow.yaml>` | Fine-tune — HunyuanVideo 1.5 with flow matching |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/hunyuan_t2v_flow.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/hunyuan_t2v_flow.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [Diffusion Fine-Tuning Guide](../../../guides/diffusion/finetune.md).
-
-## Training
-
-See the [Diffusion Training and Fine-Tuning Guide](../../../guides/diffusion/finetune.md) and [Dataset Preparation](../../../guides/diffusion/dataset.md).
-
-## Hugging Face Model Cards
-
-- [hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v)
diff --git a/fern/versions/v0.4/pages/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx b/docs/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx
rename to docs/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx
diff --git a/docs/model-coverage/diffusion/index.md b/docs/model-coverage/diffusion/index.md
deleted file mode 100644
index 7b115035a6..0000000000
--- a/docs/model-coverage/diffusion/index.md
+++ /dev/null
@@ -1,41 +0,0 @@
-(diffusion-models)=
-
-# Diffusion Models
-
-## Introduction
-
-Diffusion models are a class of generative models that learn to produce images or videos by iteratively denoising samples from a noise distribution. NeMo AutoModel supports training diffusion models using **flow matching**, a framework that regresses velocity fields along straight interpolation paths between noise and data.
-
-NeMo AutoModel integrates with [Hugging Face Diffusers](https://huggingface.co/docs/diffusers) for model loading and generation, while providing its own distributed training infrastructure via the `TrainDiffusionRecipe`. This recipe handles FSDP2 parallelization, flow matching loss computation, multiresolution bucketed dataloading, and checkpoint management.
-
-## Supported Models
-
-| Owner | Model | Task | Architecture |
-|---|---|---|---|
-| Wan AI | [Wan 2.1 T2V](wan-ai/wan2-1-t2v.md) | Text-to-Video | DiT (Flow Matching) |
-| Black Forest Labs | [FLUX.1-dev](black-forest-labs/flux.md) | Text-to-Image | DiT (Flow Matching) |
-| Hunyuan Community | [HunyuanVideo 1.5](hunyuanvideo-community/hunyuanvideo.md) | Text-to-Video | DiT (Flow Matching) |
-| Qwen / Alibaba Cloud | [Qwen-Image](qwen/qwen-image.md) | Text-to-Image | DiT (Flow Matching) |
-
-## Supported Workflows
-
-- **Pretraining**: Train from randomly initialized weights on large-scale datasets
-- **Fine-tuning**: Adapt pretrained model weights to a specific dataset or style
-- **Generation**: Run inference with pretrained or fine-tuned checkpoints
-
-## Dataset
-
-Diffusion training requires pre-encoded `.meta` files containing VAE latents and text embeddings. Raw videos or images must be preprocessed before training. See the [Diffusion Dataset Preparation](../../guides/diffusion/dataset.md) guide.
-
-## Train Diffusion Models
-
-For a complete walkthrough of training configuration, model-specific settings, and launch commands, see the [Diffusion Training and Fine-Tuning Guide](../../guides/diffusion/finetune.md).
-
-```{toctree}
-:hidden:
-
-wan-ai/wan2-1-t2v
-black-forest-labs/flux
-hunyuanvideo-community/hunyuanvideo
-qwen/qwen-image
-```
diff --git a/fern/versions/v0.4/pages/model-coverage/diffusion/index.mdx b/docs/model-coverage/diffusion/index.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/diffusion/index.mdx
rename to docs/model-coverage/diffusion/index.mdx
diff --git a/docs/model-coverage/diffusion/qwen/qwen-image.md b/docs/model-coverage/diffusion/qwen/qwen-image.md
deleted file mode 100644
index 2592e7cba8..0000000000
--- a/docs/model-coverage/diffusion/qwen/qwen-image.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Qwen-Image
-
-[Qwen-Image](https://huggingface.co/Qwen/Qwen-Image) is Alibaba Cloud's text-to-image diffusion transformer. NeMo AutoModel supports Qwen-Image training via its flow-matching pipeline with a dedicated `qwen_image` adapter, enabling FSDP2 parallelization, multiresolution bucketed dataloading and LoRA-style fine-tuning.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text-to-Image |
-| **Architecture** | DiT (Flow Matching) |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen-Image**
-
-## Task
-
-- Text-to-Image (T2I)
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`qwen_image_t2i_flow.yaml <../../../../examples/diffusion/finetune/qwen_image_t2i_flow.yaml>` | Fine-tune — Qwen-Image with flow matching |
-| {download}`qwen_image_t2i_flow.yaml <../../../../examples/diffusion/pretrain/qwen_image_t2i_flow.yaml>` | Pretrain — Qwen-Image with flow matching |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/qwen_image_t2i_flow.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/qwen_image_t2i_flow.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [Diffusion Training and Fine-Tuning Guide](../../../guides/diffusion/finetune.md).
-
-## Fine-Tuning
-
-See the [Diffusion Training and Fine-Tuning Guide](../../../guides/diffusion/finetune.md).
-
-## Hugging Face Model Cards
-
-- [Qwen/Qwen-Image](https://huggingface.co/Qwen/Qwen-Image)
diff --git a/fern/versions/v0.4/pages/model-coverage/diffusion/qwen/qwen-image.mdx b/docs/model-coverage/diffusion/qwen/qwen-image.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/diffusion/qwen/qwen-image.mdx
rename to docs/model-coverage/diffusion/qwen/qwen-image.mdx
diff --git a/docs/model-coverage/diffusion/wan-ai/wan2-1-t2v.md b/docs/model-coverage/diffusion/wan-ai/wan2-1-t2v.md
deleted file mode 100644
index ff58648b6c..0000000000
--- a/docs/model-coverage/diffusion/wan-ai/wan2-1-t2v.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# Wan 2.1 T2V
-
-[Wan 2.1](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) is a text-to-video diffusion model from Wan AI, trained with flow matching on a large-scale video dataset. It generates high-quality short video clips from text prompts.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text-to-Video |
-| **Architecture** | DiT (Flow Matching) |
-| **Parameters** | 1.3B |
-| **HF Org** | [Wan-AI](https://huggingface.co/Wan-AI) |
-:::
-
-## Available Models
-
-- **Wan2.1-T2V-1.3B**: 1.3B parameters
-
-## Task
-
-- Text-to-Video (T2V)
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Wan 2.1 T2V 1.3B | [`Wan-AI/Wan2.1-T2V-1.3B-Diffusers`](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`wan2_1_t2v_flow.yaml <../../../../examples/diffusion/finetune/wan2_1_t2v_flow.yaml>` | Fine-tune — Wan 2.1 T2V with flow matching |
-| {download}`wan2_1_t2v_flow.yaml <../../../../examples/diffusion/pretrain/wan2_1_t2v_flow.yaml>` | Pretrain — Wan 2.1 T2V with flow matching |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/wan2_1_t2v_flow.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-torchrun --nproc-per-node=8 \
-  examples/diffusion/finetune/finetune.py \
-  -c examples/diffusion/finetune/wan2_1_t2v_flow.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [Diffusion Fine-Tuning Guide](../../../guides/diffusion/finetune.md).
-
-## Training
-
-See the [Diffusion Training and Fine-Tuning Guide](../../../guides/diffusion/finetune.md) and [Dataset Preparation](../../../guides/diffusion/dataset.md).
-
-## Hugging Face Model Cards
-
-- [Wan-AI/Wan2.1-T2V-1.3B-Diffusers](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers)
diff --git a/fern/versions/v0.4/pages/model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx b/docs/model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx
rename to docs/model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx
diff --git a/docs/model-coverage/embedding/index.md b/docs/model-coverage/embedding/index.mdx
similarity index 52%
rename from docs/model-coverage/embedding/index.md
rename to docs/model-coverage/embedding/index.mdx
index 7093ed1874..8c860cec33 100644
--- a/docs/model-coverage/embedding/index.md
+++ b/docs/model-coverage/embedding/index.mdx
@@ -1,12 +1,14 @@
-(embedding-models)=
-
-# Embedding Models
+---
+title: "Embedding Models"
+description: ""
+position: 1
+---
 
 ## Introduction
 
 Text embedding models transform text into dense vector representations that power semantic search, dense retrieval, retrieval-augmented generation (RAG), and classification tasks. NeMo AutoModel includes a training recipe for converting Llama decoder-only models into encoder architectures with bidirectional attention, and falls back to Hugging Face AutoModel for other encoder backbones.
 
-For cross-encoder pairwise scoring, see [Reranking Models](../reranker/index.md).
+For cross-encoder pairwise scoring, see [Reranking Models](/model-coverage/reranking-models/overview).
 
 Embedding models use bi-encoders to produce dense representations for queries and documents independently. They are the standard path for embedding generation and first-stage dense retrieval.
 
@@ -14,8 +16,8 @@ Embedding models use bi-encoders to produce dense representations for queries an
 
 | Owner | Model | Architecture | Auto Class | Tasks |
 |---|---|---|---|---|
-| NVIDIA | [Llama (Bidirectional)](nvidia/llama-bidirectional.md) | `LlamaBidirectionalModel` | [`NeMoAutoModelBiEncoder`](https://github.com/NVIDIA-NeMo/Automodel/blob/8dc00dcb4a35c2413c52c6e7eb7ac8f1c24836aa/nemo_automodel/_transformers/auto_model.py#L991) | Embedding, Dense Retrieval |
-| Mistral AI | [Ministral3 (Bidirectional)](mistralai/ministral3-bidirectional.md) | `Ministral3BidirectionalModel` | [`NeMoAutoModelBiEncoder`](https://github.com/NVIDIA-NeMo/Automodel/blob/8dc00dcb4a35c2413c52c6e7eb7ac8f1c24836aa/nemo_automodel/_transformers/auto_model.py#L991) | Embedding, Dense Retrieval |
+| NVIDIA | [Llama (Bidirectional)](/model-coverage/embedding-models/llama-bidirectional) | `LlamaBidirectionalModel` | [`NeMoAutoModelBiEncoder`](https://github.com/NVIDIA-NeMo/Automodel/blob/8dc00dcb4a35c2413c52c6e7eb7ac8f1c24836aa/nemo_automodel/_transformers/auto_model.py#L991) | Embedding, Dense Retrieval |
+| Mistral AI | [Ministral3 (Bidirectional)](/model-coverage/embedding-models/ministral3-bidirectional) | `Ministral3BidirectionalModel` | [`NeMoAutoModelBiEncoder`](https://github.com/NVIDIA-NeMo/Automodel/blob/8dc00dcb4a35c2413c52c6e7eb7ac8f1c24836aa/nemo_automodel/_transformers/auto_model.py#L991) | Embedding, Dense Retrieval |
 
 ### Hugging Face Auto Backbones
 
@@ -25,9 +27,9 @@ Any Hugging Face model that can be loaded with `AutoModel` can be used as an emb
 
 | Recipe | Description |
 |---|---|
-| {download}`llama3_2_1b.yaml <../../../examples/retrieval/bi_encoder/llama3_2_1b.yaml>` | Bi-encoder — Llama 3.2 1B embedding model |
-| {download}`llama_embed_nemotron_8b.yaml <../../../examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml>` | Bi-encoder — Llama-Embed-Nemotron-8B reproduction recipe |
-| {download}`ministral3_3b_instruct.yaml <../../../examples/retrieval/bi_encoder/ministral3_3b_instruct.yaml>` | Bi-encoder — Ministral3-3B recipe |
+| [llama3_2_1b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama3_2_1b.yaml) | Bi-encoder — Llama 3.2 1B embedding model |
+| [llama_embed_nemotron_8b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml) | Bi-encoder — Llama-Embed-Nemotron-8B reproduction recipe |
+| [ministral3_3b_instruct.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/ministral3_3b_instruct.yaml) | Bi-encoder — Ministral3-3B recipe |
 
 ## Supported Workflows
 
@@ -37,18 +39,11 @@ Any Hugging Face model that can be loaded with `AutoModel` can be used as an emb
 
 ## Dataset
 
-Retrieval fine-tuning requires query-document pairs: each example is a query paired with one positive document and one or more negative documents. Both inline JSONL and corpus ID-based JSON formats are supported. See the [Retrieval Dataset](../../guides/llm/retrieval-dataset.md) guide.
+Retrieval fine-tuning requires query-document pairs: each example is a query paired with one positive document and one or more negative documents. Both inline JSONL and corpus ID-based JSON formats are supported. See the [Retrieval Dataset](/datasets/retrieval-dataset) guide.
 
-<!--
+{/*
 @akoumpa: uncomment this when finetune guide is published.
 ## Train Embedding Models
 
-For a complete walkthrough of training configuration, model-specific settings, and launch commands, see the [Embedding and Reranking Fine-Tuning Guide](../../guides/retrieval/finetune.md).
--->
-
-```{toctree}
-:hidden:
-
-nvidia/llama-bidirectional
-mistralai/ministral3-bidirectional
-```
+For a complete walkthrough of training configuration, model-specific settings, and launch commands, see the [Embedding and Reranking Fine-Tuning Guide](/recipes-e2e-examples/retrieval-finetune).
+*/}
diff --git a/docs/model-coverage/embedding/mistralai/ministral3-bidirectional.md b/docs/model-coverage/embedding/mistralai/ministral3-bidirectional.mdx
similarity index 89%
rename from docs/model-coverage/embedding/mistralai/ministral3-bidirectional.md
rename to docs/model-coverage/embedding/mistralai/ministral3-bidirectional.mdx
index 37f85ed980..255d59bb3d 100644
--- a/docs/model-coverage/embedding/mistralai/ministral3-bidirectional.md
+++ b/docs/model-coverage/embedding/mistralai/ministral3-bidirectional.mdx
@@ -1,17 +1,18 @@
-# Ministral3 (Bidirectional) for Embedding
+---
+title: "Ministral3 (Bidirectional) for Embedding"
+description: ""
+---
 
 NeMo AutoModel provides a bidirectional variant of [Mistral AI's Ministral3](https://mistral.ai/news/ministraux/) for embedding and dense retrieval tasks. Unlike the standard causal (left-to-right) Ministral3 used for text generation, this variant uses **bidirectional attention**, so each token can attend to both past and future tokens in the sequence, producing richer representations for semantic similarity and dense retrieval.
 
 The bidirectional encoder can be loaded directly from text-only checkpoints (e.g. `mistralai/Ministral-3B-Instruct`) and also automatically extracts the language model from Ministral3 VLM checkpoints (e.g. `mistralai/Ministral-3-3B-Base-2512` or `mistralai/Ministral-3-3B-Instruct-2512`).
 
-:::{card}
 | | |
 |---|---|
 | **Tasks** | Embedding, Dense Retrieval |
 | **Architecture** | `Ministral3BidirectionalModel` |
 | **Parameters** | 3B |
 | **HF Org** | [mistralai](https://huggingface.co/mistralai) |
-:::
 
 ## Available Models
 
@@ -48,7 +49,7 @@ The bi-encoder supports multiple pooling strategies to aggregate token represent
 
 ## Try with NeMo AutoModel
 
-**1. Install NeMo AutoModel**. Refer to the ([Installation Guide](../../../guides/installation.md)) for information:
+**1. Install NeMo AutoModel**. Refer to the [Installation Guide](/get-started/installation) for more information:
 
 ```bash
 uv pip install nemo-automodel
@@ -68,13 +69,13 @@ torchrun --nproc-per-node=8 examples/retrieval/bi_encoder/finetune.py --config e
 torchrun --nproc-per-node=8 examples/retrieval/bi_encoder/finetune.py --config examples/retrieval/bi_encoder/ministral3_3b_instruct.yaml
 ```
 
-See the [Installation Guide](../../../guides/installation.md).
+See the [Installation Guide](/get-started/installation).
 
-<!-- TODO: uncomment when finetune guide is published.
+{/* TODO: uncomment when finetune guide is published.
 ## Fine-Tuning
 
-See the [Embedding and Reranking Fine-Tuning Guide](../../../guides/retrieval/finetune.md) for bi-encoder training instructions, including LoRA and PEFT configuration.
--->
+See the [Embedding and Reranking Fine-Tuning Guide](/recipes-e2e-examples/retrieval-finetune) for bi-encoder training instructions, including LoRA and PEFT configuration.
+*/}
 
 ## Hugging Face Model Cards
 
diff --git a/docs/model-coverage/embedding/nvidia/llama-bidirectional.md b/docs/model-coverage/embedding/nvidia/llama-bidirectional.mdx
similarity index 76%
rename from docs/model-coverage/embedding/nvidia/llama-bidirectional.md
rename to docs/model-coverage/embedding/nvidia/llama-bidirectional.mdx
index 3f0830ec37..dd4851039a 100644
--- a/docs/model-coverage/embedding/nvidia/llama-bidirectional.md
+++ b/docs/model-coverage/embedding/nvidia/llama-bidirectional.mdx
@@ -1,17 +1,18 @@
-# Llama (Bidirectional) for Embedding
+---
+title: "Llama (Bidirectional) for Embedding"
+description: ""
+---
 
 NeMo AutoModel provides a bidirectional variant of [Meta's Llama](https://www.llama.com/) for embedding and dense retrieval tasks. Unlike the standard causal (left-to-right) Llama used for text generation, this variant uses **bidirectional attention**, so each token can attend to both past and future tokens in the sequence, producing richer representations for semantic similarity and dense retrieval.
 
-For the cross-encoder variant, see [Llama (Bidirectional) for Reranking](../../reranker/nvidia/llama-bidirectional.md).
+For the cross-encoder variant, see [Llama (Bidirectional) for Reranking](/model-coverage/reranking-models/llama-bidirectional).
 
-:::{card}
 | | |
 |---|---|
 | **Tasks** | Embedding, Dense Retrieval |
 | **Architecture** | `LlamaBidirectionalModel` |
 | **Parameters** | 1B – 8B |
 | **HF Org** | [meta-llama](https://huggingface.co/meta-llama) |
-:::
 
 ## Available Models
 
@@ -50,12 +51,12 @@ The bi-encoder supports multiple pooling strategies to aggregate token represent
 
 | Recipe | Description |
 |---|---|
-| {download}`llama3_2_1b.yaml <../../../../examples/retrieval/bi_encoder/llama3_2_1b.yaml>` | Bi-encoder — Llama 3.2 1B embedding model |
-| {download}`llama_embed_nemotron_8b.yaml <../../../../examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml>` | Bi-encoder — reproduction recipe for [`nvidia/llama-embed-nemotron-8b`](https://huggingface.co/nvidia/llama-embed-nemotron-8b) (uses [`nvidia/embed-nemotron-dataset-v1`](https://huggingface.co/datasets/nvidia/embed-nemotron-dataset-v1)) |
+| [llama3_2_1b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama3_2_1b.yaml) | Bi-encoder — Llama 3.2 1B embedding model |
+| [llama_embed_nemotron_8b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml) | Bi-encoder — reproduction recipe for [`nvidia/llama-embed-nemotron-8b`](https://huggingface.co/nvidia/llama-embed-nemotron-8b) (uses [`nvidia/embed-nemotron-dataset-v1`](https://huggingface.co/datasets/nvidia/embed-nemotron-dataset-v1)) |
 
 ## Try with NeMo AutoModel
 
-**1. Install NeMo AutoModel**. Refer to the ([Installation Guide](../../../guides/installation.md)) for information:
+**1. Install NeMo AutoModel**. Refer to the [Installation Guide](/get-started/installation) for more information:
 
 ```bash
 uv pip install nemo-automodel
@@ -74,7 +75,7 @@ cd Automodel
 torchrun --nproc-per-node=8 examples/retrieval/bi_encoder/finetune.py --config examples/retrieval/bi_encoder/llama3_2_1b.yaml
 ```
 
-:::{dropdown} Run with Docker
+<Accordion title="Run with Docker">
 **1. Pull the container** and mount a checkpoint directory:
 
 ```bash
@@ -95,15 +96,15 @@ cd /opt/Automodel
 ```bash
 torchrun --nproc-per-node=8 examples/retrieval/bi_encoder/finetune.py --config examples/retrieval/bi_encoder/llama3_2_1b.yaml
 ```
-:::
+</Accordion>
 
-See the [Installation Guide](../../../guides/installation.md).
+See the [Installation Guide](/get-started/installation).
 
-<!-- TODO: uncomment when finetune guide is published.
+{/* TODO: uncomment when finetune guide is published.
 ## Fine-Tuning
 
-See the [Embedding and Reranking Fine-Tuning Guide](../../../guides/retrieval/finetune.md) for bi-encoder training instructions, including LoRA and PEFT configuration.
--->
+See the [Embedding and Reranking Fine-Tuning Guide](/recipes-e2e-examples/retrieval-finetune) for bi-encoder training instructions, including LoRA and PEFT configuration.
+*/}
 
 ## Hugging Face Model Card
 
diff --git a/docs/model-coverage/latest-models.md b/docs/model-coverage/latest-models.md
deleted file mode 100644
index 15c2329a4b..0000000000
--- a/docs/model-coverage/latest-models.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# Model Release Log
-
-A reverse-chronological log of every model added to NeMo AutoModel. The **Recipe** column links to a working example YAML you can run immediately.
-
-See the [Model Coverage Overview](overview.md) for release summaries, and the [LLM](llm/index.md) / [VLM](vlm/index.md) / [Omni](omni/index.md) / [Diffusion](diffusion/index.md) pages for the full architecture listings.
-
-| Date | Model | HF Model ID | Modality | Recipe | Try on Brev |
-|------|-------|-------------|----------|--------|------|
-| 2026-04-29 | Mistral Medium 3.5 | [`mistralai/Mistral-Medium-3.5-128B`](https://huggingface.co/mistralai/Mistral-Medium-3.5-128B) | VLM | [mistral3p5_128b_medpix.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/mistral3p5/mistral3p5_128b_medpix.yaml) | 🚧 |
-| 2026-04-28 | Hy3-preview | [`tencent/Hy3-preview`](https://huggingface.co/tencent/Hy3-preview) | LLM | [hy3_preview_deepep.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml) | 🚧 |
-| 2026-04-25 | DeepSeek V4 Flash | [`deepseek-ai/DeepSeek-V4-Flash`](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) | LLM | [deepseek_v4_flash_hellaswag.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml) | 🚧 |
-| 2026-04-22 | Qwen3.6-27B | [`Qwen/Qwen3.6-27B`](https://huggingface.co/Qwen/Qwen3.6-27B) | VLM | [qwen3_6_27b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5/qwen3_6_27b.yaml) | 🚧 |
-| 2026-04-16 | LLaVA-OneVision-1.5 (4B / 8B) | [`lmms-lab/LLaVA-OneVision-1.5-4B-Instruct`](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct) | VLM | [llava_ov_1_5_4b_finetune.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/llava_onevision/llava_ov_1_5_4b_finetune.yaml) | 🚧 |
-| 2026-04-16 | Qwen3.6 MoE | [`Qwen/Qwen3.6-35B-A3B`](https://huggingface.co/Qwen/Qwen3.6-35B-A3B) | VLM | [qwen3_6_35b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml) | 🚧 |
-| 2026-04-12 | MiniMax-M2.7 | [`MiniMaxAI/MiniMax-M2.7`](https://huggingface.co/MiniMaxAI/MiniMax-M2.7) | LLM | [minimax_m2.7_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/minimax_m2/minimax_m2.7_hellaswag_pp.yaml) | 🚧 |
-| 2026-04-07 | GLM-5.1 | [`zai-org/GLM-5.1`](https://huggingface.co/zai-org/GLM-5.1) | LLM | [glm_5.1_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_5.1_hellaswag_pp.yaml) | 🚧 |
-| 2026-04-02 | Gemma 4 | [`google/gemma-4-E4B-it`](https://huggingface.co/google/gemma-4-E4B-it) | VLM | [gemma4_4b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma4/gemma4_4b.yaml) | 🚧 |
-| 2026-03-16 | Mistral Small 4 | [`mistralai/Mistral-Small-4-119B-2603`](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603) | VLM | [mistral4_medpix.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/mistral4/mistral4_medpix.yaml) | 🚧 |
-| 2026-03-11 | Nemotron Super v3 | [`nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16) | LLM | [nemotron_super_v3_hellaswag.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/nemotron/nemotron_super_v3_hellaswag.yaml) | 🚧 |
-| 2026-03-11 | GLM-5 | [`zai-org/GLM-5`](https://huggingface.co/zai-org/GLM-5) | LLM | [glm_5_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_5_hellaswag_pp.yaml) | 🚧 |
-| 2026-03-03 | FLUX.1-dev | [`black-forest-labs/FLUX.1-dev`](https://huggingface.co/black-forest-labs/FLUX.1-dev) | Diffusion | [flux_t2i_flow.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/diffusion/finetune/flux_t2i_flow.yaml) | 🚧 |
-| 2026-03-03 | Wan 2.1 T2V | [`Wan-AI/Wan2.1-T2V-1.3B-Diffusers`](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) | Diffusion | [wan2_1_t2v_flow.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/diffusion/finetune/wan2_1_t2v_flow.yaml) | 🚧 |
-| 2026-03-03 | HunyuanVideo 1.5 | [`hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v) | Diffusion | [hunyuan_t2v_flow.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/diffusion/finetune/hunyuan_t2v_flow.yaml) | 🚧 |
-| 2026-03-02 | Qwen3.5 (0.8B – 9B) | [`Qwen/Qwen3.5-9B`](https://huggingface.co/Qwen/Qwen3.5-9B) | VLM | [qwen3_5_9b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5/qwen3_5_9b.yaml) | 🚧 |
-| 2026-02-16 | Qwen3.5 MoE | [`Qwen/Qwen3.5-397B-A17B`](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) | VLM | [qwen3_5_moe_medpix.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3_5_moe/qwen3_5_moe_medpix.yaml) | 🚧 |
-| 2026-02-13 | MiniMax-M2.5 | [`MiniMaxAI/MiniMax-M2.5`](https://huggingface.co/MiniMaxAI/MiniMax-M2.5) | LLM | [minimax_m2.5_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/minimax_m2/minimax_m2.5_hellaswag_pp.yaml) | 🚧 |
-| 2026-02-11 | GLM-4.7-Flash | [`zai-org/GLM-4.7-Flash`](https://huggingface.co/zai-org/GLM-4.7-Flash) | LLM | [glm_4.7_flash_te_packed_sequence.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_4.7_flash_te_packed_sequence.yaml) | 🚧 |
-| 2026-02-09 | MiniMax-M2.1 | [`MiniMaxAI/MiniMax-M2`](https://huggingface.co/MiniMaxAI/MiniMax-M2) | LLM | [minimax_m2.1_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/minimax_m2/minimax_m2.1_hellaswag_pp.yaml) | 🚧 |
-| 2026-02-06 | Qwen3-VL-235B | [`Qwen/Qwen3-VL-235B-A22B-Instruct`](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct) | VLM | [qwen3_vl_moe_235b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3/qwen3_vl_moe_235b.yaml) | 🚧 |
-| 2026-02-06 | GLM-4.7 | [`zai-org/GLM-4.7`](https://huggingface.co/zai-org/GLM-4.7) | LLM | [glm_4.7_te_deepep.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_4.7_te_deepep.yaml) | 🚧 |
-| 2026-02-06 | Step-3.5-Flash | [`stepfun-ai/Step-3.5-Flash`](https://huggingface.co/stepfun-ai/Step-3.5-Flash) | LLM | [step_3.5_flash_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/stepfun/step_3.5_flash_hellaswag_pp.yaml) | 🚧 |
-| 2026-02-05 | DeepSeek-V3.2 | [`deepseek-ai/DeepSeek-V3.2`](https://huggingface.co/deepseek-ai/DeepSeek-V3.2) | LLM | [deepseek_v32_hellaswag_pp.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/deepseek_v32/deepseek_v32_hellaswag_pp.yaml) | 🚧 |
-| 2026-02-04 | Kimi-K2.5 VL | [`moonshotai/Kimi-K2.5`](https://huggingface.co/moonshotai/Kimi-K2.5) | VLM | [kimi25vl_medpix.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/kimi/kimi25vl_medpix.yaml) | 🚧 |
-| 2026-01-30 | Kimi-VL | [`moonshotai/Kimi-VL-A3B-Instruct`](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct) | VLM | [kimi2vl_cordv2.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/kimi/kimi2vl_cordv2.yaml) | 🚧 |
-| 2026-01-12 | Nemotron Flash 1B | [`nvidia/Nemotron-Flash-1B`](https://huggingface.co/nvidia/Nemotron-Flash-1B) | LLM | [nemotron_flash_1b_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad.yaml) | 🚧 |
-| 2026-01-12 | Nemotron Parse v1.1 | [`nvidia/NVIDIA-Nemotron-Parse-v1.1`](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1) | VLM | [nemotron_parse_v1_1.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/nemotron/nemotron_parse_v1_1.yaml) | <a href="https://brev.nvidia.com/launchable/deploy/now?launchableID=env-3C6LDKU2DfOvpVTFhjw3YQ4djPM"><img src="https://brev-assets.s3.us-west-1.amazonaws.com/nv-lb-dark.svg" alt="Launch on Brev" height="23"></a> |
-| 2026-01-07 | Devstral-Small-2512 | [`mistralai/Devstral-Small-2-24B-Instruct-2512`](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512) | LLM | [devstral2_small_2512_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/devstral/devstral2_small_2512_squad.yaml) | 🚧 |
-| 2025-12-15 | Nemotron-3-Nano-30B-A3B | [`nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8`](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8) | LLM | [nemotron_nano_v3_hellaswag.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/nemotron/nemotron_nano_v3_hellaswag.yaml) | 🚧 |
-| 2025-12-05 | Ministral 3 (3B / 8B / 14B) | [`mistralai/Ministral-8B-Instruct-2410`](https://huggingface.co/mistralai/Ministral-8B-Instruct-2410) | VLM | [ministral3_8b_medpix.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/mistral/ministral3_8b_medpix.yaml) | 🚧 |
-| 2025-11-24 | GLM-4.5-Air | [`zai-org/GLM-4.5-Air`](https://huggingface.co/zai-org/GLM-4.5-Air) | LLM | [glm_4.5_air_te_deepep.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_4.5_air_te_deepep.yaml) | 🚧 |
-| 2025-11-19 | InternVL 3.5 | [`OpenGVLab/InternVL3_5-4B`](https://huggingface.co/OpenGVLab/InternVL3_5-4B) | VLM | [internvl_3_5_4b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/internvl/internvl_3_5_4b.yaml) | 🚧 |
-| 2025-11-10 | Qwen3-Omni | [`Qwen/Qwen3-30B-A3B`](https://huggingface.co/Qwen/Qwen3-30B-A3B) | Omni | [qwen3_omni_moe_30b_te_deepep.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3/qwen3_omni_moe_30b_te_deepep.yaml) | 🚧 |
-| 2025-10-24 | Qwen3-Next | [`Qwen/Qwen3-235B-A22B`](https://huggingface.co/Qwen/Qwen3-235B-A22B) | LLM | [qwen3_next_te_deepep.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/qwen/qwen3_next_te_deepep.yaml) | 🚧 |
-| 2025-10-23 | Qwen3-VL (4B / 8B) | [`Qwen/Qwen3-VL-8B-Instruct`](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct) | VLM | [qwen3_vl_4b_instruct_rdr.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen3/qwen3_vl_4b_instruct_rdr.yaml) | 🚧 |
-| 2025-10-05 | Mixtral 8x7B | [`mistralai/Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | LLM | [mixtral-8x7b-v0-1_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/mistral/mixtral-8x7b-v0-1_squad.yaml) | 🚧 |
-| 2025-09-29 | DeepSeek-V3 | [`deepseek-ai/DeepSeek-V3`](https://huggingface.co/deepseek-ai/DeepSeek-V3) | LLM | [deepseekv3_pretrain.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_pretrain/deepseekv3_pretrain.yaml) | 🚧 |
-| 2025-09-23 | GPT-OSS 20B / 120B | [`openai/gpt-oss-20b`](https://huggingface.co/openai/gpt-oss-20b) | LLM | [gpt_oss_20b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml) | 🚧 |
-| 2025-09-08 | Moonlight 16B | [`moonshotai/Moonlight-16B-A3B`](https://huggingface.co/moonshotai/Moonlight-16B-A3B) | LLM | [moonlight_16b_te.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/moonlight/moonlight_16b_te.yaml) | 🚧 |
-| 2025-08-27 | Mistral / Mistral-Nemo | [`mistralai/Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | LLM | [mistral_7b_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/mistral/mistral_7b_squad.yaml) | 🚧 |
-| 2025-08-27 | Qwen2 / Qwen2.5 | [`Qwen/Qwen2.5-7B`](https://huggingface.co/Qwen/Qwen2.5-7B) | LLM | [qwen2_5_7b_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/qwen/qwen2_5_7b_squad.yaml) | 🚧 |
-| 2025-08-27 | Gemma 2 / 3 | [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) | LLM | [gemma_2_9b_it_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/gemma/gemma_2_9b_it_squad.yaml) | 🚧 |
-| 2025-08-27 | Phi 2 / 3 / 4 | [`microsoft/phi-4`](https://huggingface.co/microsoft/phi-4) | LLM | [phi_4_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/phi/phi_4_squad.yaml) | 🚧 |
-| 2025-08-27 | Granite 3.x | [`ibm-granite/granite-3.3-2b-instruct`](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct) | LLM | [granite_3_3_2b_instruct_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/granite/granite_3_3_2b_instruct_squad.yaml) | 🚧 |
-| 2025-08-27 | OLMo 2 | [`allenai/OLMo-2-0425-1B-Instruct`](https://huggingface.co/allenai/OLMo-2-0425-1B-Instruct) | LLM | [olmo_2_0425_1b_instruct_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/olmo/olmo_2_0425_1b_instruct_squad.yaml) | 🚧 |
-| 2025-08-27 | Seed-Coder / Seed-OSS | [`ByteDance-Seed/Seed-Coder-8B-Instruct`](https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct) | LLM | [seed_coder_8b_instruct_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/seed/seed_coder_8b_instruct_squad.yaml) | 🚧 |
-| 2025-08-27 | Baichuan 2 | [`baichuan-inc/Baichuan2-7B-Chat`](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) | LLM | [baichuan_2_7b_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/baichuan/baichuan_2_7b_squad.yaml) | 🚧 |
-| 2025-08-27 | Cohere Command-R | [`CohereForAI/c4ai-command-r-v01`](https://huggingface.co/CohereForAI/c4ai-command-r-v01) | LLM | [cohere_command_r_7b_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/cohere/cohere_command_r_7b_squad.yaml) | 🚧 |
-| 2025-08-27 | StarCoder 2 | [`bigcode/starcoder2-3b`](https://huggingface.co/bigcode/starcoder2-3b) | LLM | [starcoder_2_7b_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/starcoder/starcoder_2_7b_squad.yaml) | 🚧 |
-| 2025-08-27 | Falcon 3 | [`tiiuae/Falcon3-7B-Instruct`](https://huggingface.co/tiiuae/Falcon3-7B-Instruct) | LLM | [falcon3_7b_instruct_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/falcon/falcon3_7b_instruct_squad.yaml) | 🚧 |
-| 2025-08-27 | GLM-4 / GLM-4-MoE | [`zai-org/glm-4-9b-chat-hf`](https://huggingface.co/zai-org/glm-4-9b-chat-hf) | LLM | [glm_4_9b_chat_hf_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/glm/glm_4_9b_chat_hf_squad.yaml) | 🚧 |
-| 2025-08-27 | Qwen3 / Qwen3-MoE | [`Qwen/Qwen3-0.6B`](https://huggingface.co/Qwen/Qwen3-0.6B) | LLM | [qwen3_0p6b_hellaswag.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/qwen/qwen3_0p6b_hellaswag.yaml) | 🚧 |
-| 2025-08-23 | Gemma 3 VL | [`google/gemma-3-4b-it`](https://huggingface.co/google/gemma-3-4b-it) | VLM | [gemma3_vl_4b_cord_v2.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml) | 🚧 |
-| 2025-08-23 | Gemma 3n | [`google/gemma-3n-e4b-it`](https://huggingface.co/google/gemma-3n-e4b-it) | VLM | [gemma3n_vl_4b_medpix.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma3n/gemma3n_vl_4b_medpix.yaml) | 🚧 |
-| 2025-08-23 | Llama 3.x | [`meta-llama/Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | LLM | [llama3_2_1b_squad.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml) | 🚧 |
-| 2025-08-23 | Qwen2.5-VL | [`Qwen/Qwen2.5-VL-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) | VLM | [qwen2_5_vl_3b_rdr.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/qwen2_5/qwen2_5_vl_3b_rdr.yaml) | 🚧 |
-| 2025-08-23 | Phi-4-multimodal | [`microsoft/Phi-4-multimodal-instruct`](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) | Omni | [phi4_mm_cv17.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/phi4/phi4_mm_cv17.yaml) | 🚧 |
diff --git a/fern/versions/nightly/pages/model-coverage/latest-models.mdx b/docs/model-coverage/latest-models.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/latest-models.mdx
rename to docs/model-coverage/latest-models.mdx
diff --git a/docs/model-coverage/llm/allenai/olmo.md b/docs/model-coverage/llm/allenai/olmo.md
deleted file mode 100644
index a410ee6400..0000000000
--- a/docs/model-coverage/llm/allenai/olmo.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# OLMo
-
-[OLMo](https://allenai.org/olmo) (Open Language Model) is Allen AI's fully open language model — open weights, open training data, and open training code. OLMo-1B and OLMo-7B are trained on Dolma.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `OLMoForCausalLM` |
-| **Parameters** | 1B – 7B |
-| **HF Org** | [allenai](https://huggingface.co/allenai) |
-:::
-
-## Available Models
-
-- **OLMo-7B-hf**: 7B
-- **OLMo-1B-hf**: 1B
-
-## Architecture
-
-- `OLMoForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| OLMo 1B | [`allenai/OLMo-1B-hf`](https://huggingface.co/allenai/OLMo-1B-hf) |
-| OLMo 7B | [`allenai/OLMo-7B-hf`](https://huggingface.co/allenai/OLMo-7B-hf) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [allenai/OLMo-1B-hf](https://huggingface.co/allenai/OLMo-1B-hf)
-- [allenai/OLMo-7B-hf](https://huggingface.co/allenai/OLMo-7B-hf)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/allenai/olmo.mdx b/docs/model-coverage/llm/allenai/olmo.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/allenai/olmo.mdx
rename to docs/model-coverage/llm/allenai/olmo.mdx
diff --git a/docs/model-coverage/llm/allenai/olmo2.md b/docs/model-coverage/llm/allenai/olmo2.md
deleted file mode 100644
index 2a0a2f45ef..0000000000
--- a/docs/model-coverage/llm/allenai/olmo2.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# OLMo2
-
-[OLMo2](https://allenai.org/olmo) is Allen AI's second-generation open language model with improved architecture and training, including RMSNorm and rotary position embeddings.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `OLMo2ForCausalLM` |
-| **Parameters** | 1B – 13B |
-| **HF Org** | [allenai](https://huggingface.co/allenai) |
-:::
-
-## Available Models
-
-- **OLMo-2-0425-1B-Instruct**
-- **OLMo2-7B-1124**
-- **OLMo2-13B-1124**
-
-## Architecture
-
-- `OLMo2ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| OLMo2 7B | [`allenai/OLMo2-7B-1124`](https://huggingface.co/allenai/OLMo2-7B-1124) |
-| OLMo2 0425 1B Instruct | [`allenai/OLMo-2-0425-1B-Instruct`](https://huggingface.co/allenai/OLMo-2-0425-1B-Instruct) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`olmo_2_0425_1b_instruct_squad.yaml <../../../../examples/llm_finetune/olmo/olmo_2_0425_1b_instruct_squad.yaml>` | SFT — OLMo2 0425 1B Instruct on SQuAD |
-| {download}`olmo_2_0425_1b_instruct_squad_peft.yaml <../../../../examples/llm_finetune/olmo/olmo_2_0425_1b_instruct_squad_peft.yaml>` | LoRA — OLMo2 0425 1B Instruct on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/olmo/olmo_2_0425_1b_instruct_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/olmo/olmo_2_0425_1b_instruct_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [allenai/OLMo2-7B-1124](https://huggingface.co/allenai/OLMo2-7B-1124)
diff --git a/fern/versions/nightly/pages/model-coverage/llm/allenai/olmo2.mdx b/docs/model-coverage/llm/allenai/olmo2.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/allenai/olmo2.mdx
rename to docs/model-coverage/llm/allenai/olmo2.mdx
diff --git a/docs/model-coverage/llm/allenai/olmoe.md b/docs/model-coverage/llm/allenai/olmoe.md
deleted file mode 100644
index a4dd25186f..0000000000
--- a/docs/model-coverage/llm/allenai/olmoe.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# OLMoE
-
-[OLMoE](https://allenai.org/olmo) is Allen AI's open Mixture-of-Experts language model. It activates 1B parameters per token from a 7B total parameter pool.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `OLMoEForCausalLM` |
-| **Parameters** | 7B total / 1B active |
-| **HF Org** | [allenai](https://huggingface.co/allenai) |
-:::
-
-## Available Models
-
-- **OLMoE-1B-7B-0924**: 7B total, 1B activated
-- **OLMoE-1B-7B-0924-Instruct**: instruction-tuned variant
-
-## Architecture
-
-- `OLMoEForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| OLMoE 1B 7B | [`allenai/OLMoE-1B-7B-0924`](https://huggingface.co/allenai/OLMoE-1B-7B-0924) |
-| OLMoE 1B 7B Instruct | [`allenai/OLMoE-1B-7B-0924-Instruct`](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [allenai/OLMoE-1B-7B-0924](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/allenai/olmoe.mdx b/docs/model-coverage/llm/allenai/olmoe.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/allenai/olmoe.mdx
rename to docs/model-coverage/llm/allenai/olmoe.mdx
diff --git a/docs/model-coverage/llm/baai/aquila.md b/docs/model-coverage/llm/baai/aquila.md
deleted file mode 100644
index 570aec6010..0000000000
--- a/docs/model-coverage/llm/baai/aquila.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# Aquila / Aquila2
-
-[Aquila](https://huggingface.co/BAAI/Aquila-7B) is a Chinese-English bilingual language model from the Beijing Academy of Artificial Intelligence (BAAI).
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `AquilaForCausalLM` |
-| **Parameters** | 7B – 34B |
-| **HF Org** | [BAAI](https://huggingface.co/BAAI) |
-:::
-
-## Available Models
-
-- **Aquila-7B**
-- **AquilaChat-7B**: instruction-tuned
-- **Aquila2-34B**
-
-## Architecture
-
-- `AquilaForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Aquila 7B | [`BAAI/Aquila-7B`](https://huggingface.co/BAAI/Aquila-7B) |
-| AquilaChat 7B | [`BAAI/AquilaChat-7B`](https://huggingface.co/BAAI/AquilaChat-7B) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [BAAI/Aquila-7B](https://huggingface.co/BAAI/Aquila-7B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/baai/aquila.mdx b/docs/model-coverage/llm/baai/aquila.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/baai/aquila.mdx
rename to docs/model-coverage/llm/baai/aquila.mdx
diff --git a/docs/model-coverage/llm/baichuan-inc/baichuan.md b/docs/model-coverage/llm/baichuan-inc/baichuan.md
deleted file mode 100644
index c501d4f5db..0000000000
--- a/docs/model-coverage/llm/baichuan-inc/baichuan.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Baichuan / Baichuan2
-
-[Baichuan](https://github.com/baichuan-inc/Baichuan2) is a Chinese-English bilingual language model series from Baichuan Inc., featuring strong Chinese language performance.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `BaiChuanForCausalLM` |
-| **Parameters** | 7B – 13B |
-| **HF Org** | [baichuan-inc](https://huggingface.co/baichuan-inc) |
-:::
-
-## Available Models
-
-- **Baichuan2-13B-Chat**
-- **Baichuan2-7B-Chat**
-- **Baichuan-7B**
-
-## Architecture
-
-- `BaiChuanForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Baichuan2 13B Chat | [`baichuan-inc/Baichuan2-13B-Chat`](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat) |
-| Baichuan 7B | [`baichuan-inc/Baichuan-7B`](https://huggingface.co/baichuan-inc/Baichuan-7B) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`baichuan_2_7b_squad.yaml <../../../../examples/llm_finetune/baichuan/baichuan_2_7b_squad.yaml>` | SFT — Baichuan2 7B on SQuAD |
-| {download}`baichuan_2_7b_squad_peft.yaml <../../../../examples/llm_finetune/baichuan/baichuan_2_7b_squad_peft.yaml>` | LoRA — Baichuan2 7B on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/baichuan/baichuan_2_7b_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/baichuan/baichuan_2_7b_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [baichuan-inc/Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/baichuan-inc/baichuan.mdx b/docs/model-coverage/llm/baichuan-inc/baichuan.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/baichuan-inc/baichuan.mdx
rename to docs/model-coverage/llm/baichuan-inc/baichuan.mdx
diff --git a/docs/model-coverage/llm/baidu/ernie4-5.md b/docs/model-coverage/llm/baidu/ernie4-5.md
deleted file mode 100644
index c31222abcb..0000000000
--- a/docs/model-coverage/llm/baidu/ernie4-5.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# ERNIE 4.5
-
-[ERNIE 4.5](https://huggingface.co/baidu) is Baidu's dense and Mixture-of-Experts language model family with long-context text checkpoints on Hugging Face.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architectures** | `Ernie4_5ForCausalLM`, `Ernie4_5_MoeForCausalLM` |
-| **Parameters** | 0.36B dense; 21B total / 3B active MoE |
-| **Context Length** | 131,072 tokens |
-| **HF Org** | [baidu](https://huggingface.co/baidu) |
-:::
-
-## Available Models
-
-- **ERNIE-4.5-0.3B-PT**: dense text checkpoint with 0.36B parameters.
-- **ERNIE-4.5-21B-A3B-PT**: text MoE checkpoint with 21B total parameters and 3B activated parameters per token.
-
-## Architectures
-
-- `Ernie4_5ForCausalLM`: dense Hugging Face implementation path.
-- `Ernie4_5_MoeForCausalLM`: custom NeMo AutoModel implementation with expert parallelism support.
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| ERNIE 4.5 0.3B PT | [`baidu/ERNIE-4.5-0.3B-PT`](https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT) |
-| ERNIE 4.5 21B A3B PT | [`baidu/ERNIE-4.5-21B-A3B-PT`](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-PT) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`ernie4_5_0p3b_hellaswag.yaml <../../../../examples/llm_finetune/ernie4_5/ernie4_5_0p3b_hellaswag.yaml>` | SFT -- ERNIE 4.5 0.3B on HellaSwag with the Hugging Face implementation |
-| {download}`ernie4_5_21b_a3b_hellaswag.yaml <../../../../examples/llm_finetune/ernie4_5/ernie4_5_21b_a3b_hellaswag.yaml>` | SFT -- ERNIE 4.5 21B A3B on HellaSwag with TE attention and DeepEP |
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run a dense recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/ernie4_5/ernie4_5_0p3b_hellaswag.yaml
-```
-
-**4. Run the MoE recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/ernie4_5/ernie4_5_21b_a3b_hellaswag.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2. Navigate to the AutoModel directory**:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/ernie4_5/ernie4_5_21b_a3b_hellaswag.yaml
-```
-:::
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [baidu/ERNIE-4.5-0.3B-PT](https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT)
-- [baidu/ERNIE-4.5-21B-A3B-PT](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-PT)
diff --git a/fern/versions/nightly/pages/model-coverage/llm/baidu/ernie4-5.mdx b/docs/model-coverage/llm/baidu/ernie4-5.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/baidu/ernie4-5.mdx
rename to docs/model-coverage/llm/baidu/ernie4-5.mdx
diff --git a/docs/model-coverage/llm/bigcode/starcoder.md b/docs/model-coverage/llm/bigcode/starcoder.md
deleted file mode 100644
index 1c0c7a673b..0000000000
--- a/docs/model-coverage/llm/bigcode/starcoder.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# StarCoder
-
-[StarCoder](https://huggingface.co/blog/starcoder) is BigCode's code language model trained on the Stack dataset. It uses Multi-Query Attention and Fill-in-the-Middle (FIM) objectives. WizardCoder also uses this architecture.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Code Generation |
-| **Architecture** | `GPTBigCodeForCausalLM` |
-| **Parameters** | 1B – 15.5B |
-| **HF Org** | [bigcode](https://huggingface.co/bigcode) |
-:::
-
-## Available Models
-
-- **StarCoder**: 15.5B
-- **gpt_bigcode-santacoder**: 1.1B
-- **WizardCoder-15B-V1.0** (WizardLM)
-
-## Architecture
-
-- `GPTBigCodeForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| StarCoder | [`bigcode/starcoder`](https://huggingface.co/bigcode/starcoder) |
-| SantaCoder | [`bigcode/gpt_bigcode-santacoder`](https://huggingface.co/bigcode/gpt_bigcode-santacoder) |
-| WizardCoder 15B | [`WizardLM/WizardCoder-15B-V1.0`](https://huggingface.co/WizardLM/WizardCoder-15B-V1.0) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/bigcode/starcoder.mdx b/docs/model-coverage/llm/bigcode/starcoder.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/bigcode/starcoder.mdx
rename to docs/model-coverage/llm/bigcode/starcoder.mdx
diff --git a/docs/model-coverage/llm/bigcode/starcoder2.md b/docs/model-coverage/llm/bigcode/starcoder2.md
deleted file mode 100644
index f5b796541c..0000000000
--- a/docs/model-coverage/llm/bigcode/starcoder2.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# StarCoder2
-
-[StarCoder2](https://huggingface.co/blog/starcoder2) is BigCode's second-generation code language model, available in 3B, 7B, and 15B sizes, trained on 600+ programming languages from The Stack v2.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Code Generation |
-| **Architecture** | `Starcoder2ForCausalLM` |
-| **Parameters** | 3B – 15B |
-| **HF Org** | [bigcode](https://huggingface.co/bigcode) |
-:::
-
-## Available Models
-
-- **starcoder2-3b**: 3B
-- **starcoder2-7b**: 7B
-- **starcoder2-15b**: 15B
-
-## Architecture
-
-- `Starcoder2ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| StarCoder2 3B | [`bigcode/starcoder2-3b`](https://huggingface.co/bigcode/starcoder2-3b) |
-| StarCoder2 7B | [`bigcode/starcoder2-7b`](https://huggingface.co/bigcode/starcoder2-7b) |
-| StarCoder2 15B | [`bigcode/starcoder2-15b`](https://huggingface.co/bigcode/starcoder2-15b) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`starcoder_2_7b_squad.yaml <../../../../examples/llm_finetune/starcoder/starcoder_2_7b_squad.yaml>` | SFT — StarCoder2 7B on SQuAD |
-| {download}`starcoder_2_7b_hellaswag_fp8.yaml <../../../../examples/llm_finetune/starcoder/starcoder_2_7b_hellaswag_fp8.yaml>` | SFT — StarCoder2 7B on HellaSwag with FP8 |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/starcoder/starcoder_2_7b_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/starcoder/starcoder_2_7b_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [bigcode/starcoder2-7b](https://huggingface.co/bigcode/starcoder2-7b)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/bigcode/starcoder2.mdx b/docs/model-coverage/llm/bigcode/starcoder2.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/bigcode/starcoder2.mdx
rename to docs/model-coverage/llm/bigcode/starcoder2.mdx
diff --git a/docs/model-coverage/llm/bytedance-seed/seed.md b/docs/model-coverage/llm/bytedance-seed/seed.md
deleted file mode 100644
index d8b6a2f1d8..0000000000
--- a/docs/model-coverage/llm/bytedance-seed/seed.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Seed (ByteDance)
-
-[Seed-Coder](https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct) and [Seed-OSS](https://huggingface.co/ByteDance-Seed/Seed-OSS-36B-Instruct) are open-weight models from ByteDance. Both use the `Qwen2ForCausalLM` architecture under the hood.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `Qwen2ForCausalLM` |
-| **Parameters** | 8B – 36B |
-| **HF Org** | [ByteDance-Seed](https://huggingface.co/ByteDance-Seed) |
-:::
-
-## Available Models
-
-- **Seed-Coder-8B-Instruct**: 8B code model
-- **Seed-OSS-36B-Instruct**: 36B general model
-
-## Architecture
-
-- `Qwen2ForCausalLM` (reuses Qwen2 architecture)
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Seed-Coder 8B Instruct | [`ByteDance-Seed/Seed-Coder-8B-Instruct`](https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct) |
-| Seed-OSS 36B Instruct | [`ByteDance-Seed/Seed-OSS-36B-Instruct`](https://huggingface.co/ByteDance-Seed/Seed-OSS-36B-Instruct) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`seed_coder_8b_instruct_squad.yaml <../../../../examples/llm_finetune/seed/seed_coder_8b_instruct_squad.yaml>` | SFT — Seed-Coder 8B on SQuAD |
-| {download}`seed_coder_8b_instruct_squad_peft.yaml <../../../../examples/llm_finetune/seed/seed_coder_8b_instruct_squad_peft.yaml>` | LoRA — Seed-Coder 8B on SQuAD |
-| {download}`seed_oss_36B_hellaswag.yaml <../../../../examples/llm_finetune/seed/seed_oss_36B_hellaswag.yaml>` | SFT — Seed-OSS 36B on HellaSwag |
-| {download}`seed_oss_36B_hellaswag_peft.yaml <../../../../examples/llm_finetune/seed/seed_oss_36B_hellaswag_peft.yaml>` | LoRA — Seed-OSS 36B on HellaSwag |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/seed/seed_coder_8b_instruct_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/seed/seed_coder_8b_instruct_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [ByteDance-Seed/Seed-Coder-8B-Instruct](https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct)
-- [ByteDance-Seed/Seed-OSS-36B-Instruct](https://huggingface.co/ByteDance-Seed/Seed-OSS-36B-Instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/bytedance-seed/seed.mdx b/docs/model-coverage/llm/bytedance-seed/seed.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/bytedance-seed/seed.mdx
rename to docs/model-coverage/llm/bytedance-seed/seed.mdx
diff --git a/docs/model-coverage/llm/cohere/command-r.md b/docs/model-coverage/llm/cohere/command-r.md
deleted file mode 100644
index bc0e0b904c..0000000000
--- a/docs/model-coverage/llm/cohere/command-r.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Command-R
-
-[Cohere Command-R](https://cohere.com/command) is a series of enterprise-grade language models optimized for retrieval-augmented generation (RAG) and tool use. Command-R7B uses the updated `Cohere2ForCausalLM` architecture.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `CohereForCausalLM` / `Cohere2ForCausalLM` |
-| **Parameters** | 7B – 104B |
-| **HF Org** | [CohereForAI](https://huggingface.co/CohereForAI) |
-:::
-
-## Available Models
-
-- **c4ai-command-r-v01**: 35B
-- **c4ai-command-r-plus**: 104B
-- **c4ai-command-r7b-12-2024**: 7B (`Cohere2ForCausalLM`)
-
-## Architectures
-
-- `CohereForCausalLM` — Command-R v01, Plus
-- `Cohere2ForCausalLM` — Command-R7B
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Command-R v01 | [`CohereForAI/c4ai-command-r-v01`](https://huggingface.co/CohereForAI/c4ai-command-r-v01) |
-| Command-R7B | [`CohereForAI/c4ai-command-r7b-12-2024`](https://huggingface.co/CohereForAI/c4ai-command-r7b-12-2024) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`cohere_command_r_7b_squad.yaml <../../../../examples/llm_finetune/cohere/cohere_command_r_7b_squad.yaml>` | SFT — Command-R 7B on SQuAD |
-| {download}`cohere_command_r_7b_squad_peft.yaml <../../../../examples/llm_finetune/cohere/cohere_command_r_7b_squad_peft.yaml>` | LoRA — Command-R 7B on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/cohere/cohere_command_r_7b_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/cohere/cohere_command_r_7b_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
-- [CohereForAI/c4ai-command-r7b-12-2024](https://huggingface.co/CohereForAI/c4ai-command-r7b-12-2024)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/cohere/command-r.mdx b/docs/model-coverage/llm/cohere/command-r.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/cohere/command-r.mdx
rename to docs/model-coverage/llm/cohere/command-r.mdx
diff --git a/docs/model-coverage/llm/deepseek-ai/deepseek-v3.md b/docs/model-coverage/llm/deepseek-ai/deepseek-v3.md
deleted file mode 100644
index 2681279cfc..0000000000
--- a/docs/model-coverage/llm/deepseek-ai/deepseek-v3.md
+++ /dev/null
@@ -1,103 +0,0 @@
-# DeepSeek-V3
-
-[DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3) is a large-scale Mixture-of-Experts model with 671B total parameters and 37B activated per token. It features Multi-head Latent Attention (MLA), innovative load balancing, and Multi-Token Prediction (MTP). DeepSeek-V3.2 is an updated release with further improvements.
-
-[Moonlight](https://huggingface.co/moonshotai/Moonlight-16B-A3B) by Moonshot AI also uses this architecture with 16B total / 3B activated parameters.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `DeepseekV3ForCausalLM` / `DeepseekV32ForCausalLM` |
-| **Parameters** | 671B total / 37B active |
-| **HF Org** | [deepseek-ai](https://huggingface.co/deepseek-ai) |
-:::
-
-## Available Models
-
-- **DeepSeek-V3**: 671B total, 37B activated
-- **DeepSeek-V3.2** (`DeepseekV32ForCausalLM`): updated architecture
-- **Moonlight-16B-A3B** (Moonshot AI): 16B total, 3B activated
-
-## Architectures
-
-- `DeepseekV3ForCausalLM`
-- `DeepseekV32ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| DeepSeek-V3 | [`deepseek-ai/DeepSeek-V3`](https://huggingface.co/deepseek-ai/DeepSeek-V3) |
-| DeepSeek-V3-Base | [`deepseek-ai/DeepSeek-V3-Base`](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base) |
-| DeepSeek-V3.2 | [`deepseek-ai/DeepSeek-V3.2`](https://huggingface.co/deepseek-ai/DeepSeek-V3.2) |
-| Moonlight 16B A3B | [`moonshotai/Moonlight-16B-A3B`](https://huggingface.co/moonshotai/Moonlight-16B-A3B) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`deepseek_v32_hellaswag_pp.yaml <../../../../examples/llm_finetune/deepseek_v32/deepseek_v32_hellaswag_pp.yaml>` | SFT — DeepSeek-V3.2 on HellaSwag with pipeline parallelism |
-| {download}`moonlight_16b_te.yaml <../../../../examples/llm_finetune/moonlight/moonlight_16b_te.yaml>` | SFT — Moonlight 16B with Transformer Engine |
-| {download}`moonlight_16b_te_packed_sequence.yaml <../../../../examples/llm_finetune/moonlight/moonlight_16b_te_packed_sequence.yaml>` | SFT — Moonlight 16B with packed sequences |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-:::{note}
-This recipe was validated on **32 nodes × 8 GPUs (256 H100s)**. See the [Launcher Guide](../../../launcher/slurm.md) for multi-node setup.
-:::
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/deepseek_v32/deepseek_v32_hellaswag_pp.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/deepseek_v32/deepseek_v32_hellaswag_pp.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)
-- [deepseek-ai/DeepSeek-V3-Base](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base)
-- [moonshotai/Moonlight-16B-A3B](https://huggingface.co/moonshotai/Moonlight-16B-A3B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/deepseek-v3.mdx b/docs/model-coverage/llm/deepseek-ai/deepseek-v3.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/deepseek-v3.mdx
rename to docs/model-coverage/llm/deepseek-ai/deepseek-v3.mdx
diff --git a/docs/model-coverage/llm/deepseek-ai/deepseek.md b/docs/model-coverage/llm/deepseek-ai/deepseek.md
deleted file mode 100644
index 1957ed2372..0000000000
--- a/docs/model-coverage/llm/deepseek-ai/deepseek.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# DeepSeek
-
-[DeepSeek](https://github.com/deepseek-ai) is a series of open-weight language models from DeepSeek AI. The first-generation models (V1/V2) use standard transformer decoder and Multi-head Latent Attention architectures.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `DeepseekForCausalLM` |
-| **Parameters** | 7B – 67B |
-| **HF Org** | [deepseek-ai](https://huggingface.co/deepseek-ai) |
-:::
-
-## Available Models
-
-- **DeepSeek-V2**: 236B total, 21B activated (MoE)
-- **DeepSeek-V2-Chat**: instruction-tuned variant
-- **DeepSeek-LLM 7B/67B**: dense models
-
-## Architecture
-
-- `DeepseekForCausalLM` — DeepSeek v1/v2 dense models
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| DeepSeek LLM 7B Chat | [`deepseek-ai/deepseek-llm-7b-chat`](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat) |
-| DeepSeek LLM 67B Chat | [`deepseek-ai/deepseek-llm-67b-chat`](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [deepseek-ai/deepseek-llm-7b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat)
-- [deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/deepseek.mdx b/docs/model-coverage/llm/deepseek-ai/deepseek.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/deepseek.mdx
rename to docs/model-coverage/llm/deepseek-ai/deepseek.mdx
diff --git a/docs/model-coverage/llm/deepseek-ai/dsv4-flash.md b/docs/model-coverage/llm/deepseek-ai/dsv4-flash.md
deleted file mode 100644
index 4b4d8aacc4..0000000000
--- a/docs/model-coverage/llm/deepseek-ai/dsv4-flash.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# DeepSeek V4 Flash
-
-[DeepSeek V4 Flash](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) is DeepSeek's latest fine-grained Mixture-of-Experts language model. It uses a 43-layer all-MoE backbone with 256 routed experts plus one shared expert per block, top-6 routing, and a hybrid per-layer attention zoo (SWA / CSA / HCA) selectable through `compress_ratios`. The first `num_hash_layers` blocks use a hash-clustering gate, and every block maintains `hc_mult=4` Hyper-Connection streams mixed via a learned col-norm-first Sinkhorn router.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `DeepseekV4ForCausalLM` |
-| **Parameters** | fine-grained MoE, 256 routed + 1 shared expert |
-| **HF Org** | [deepseek-ai](https://huggingface.co/deepseek-ai) |
-:::
-
-## Available Models
-
-- **DeepSeek-V4-Flash**
-
-## Architecture
-
-- `DeepseekV4ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| DeepSeek V4 Flash | [`deepseek-ai/DeepSeek-V4-Flash`](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| [`deepseek_v4_flash_hellaswag.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml) | SFT — DeepSeek V4 Flash on HellaSwag with pipeline parallelism |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-:::{note}
-The full 43-layer schedule requires a multi-node run; see the recipe yaml header for `ep_size` / `pp_size` guidance. See the [Launcher Guide](../../../launcher/slurm.md) for multi-node setup.
-:::
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [Fine-Tune DeepSeek V4 Flash](../../../guides/llm/dsv4-flash.md) guide and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [deepseek-ai/DeepSeek-V4-Flash](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/dsv4-flash.mdx b/docs/model-coverage/llm/deepseek-ai/dsv4-flash.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/deepseek-ai/dsv4-flash.mdx
rename to docs/model-coverage/llm/deepseek-ai/dsv4-flash.mdx
diff --git a/docs/model-coverage/llm/eleutherai/gpt-j.md b/docs/model-coverage/llm/eleutherai/gpt-j.md
deleted file mode 100644
index 2cea2c8116..0000000000
--- a/docs/model-coverage/llm/eleutherai/gpt-j.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# GPT-J
-
-[GPT-J](https://github.com/kingoflolz/mesh-transformer-jax) is a 6B parameter transformer language model trained by EleutherAI on the Pile dataset. It was one of the earliest large open-weight models.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `GPTJForCausalLM` |
-| **Parameters** | 6B |
-| **HF Org** | [EleutherAI](https://huggingface.co/EleutherAI) |
-:::
-
-## Available Models
-
-- **gpt-j-6b**: 6B parameters
-- **gpt4all-j**: GPT-J fine-tuned for instruction following (Nomic AI)
-
-## Architecture
-
-- `GPTJForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| GPT-J 6B | [`EleutherAI/gpt-j-6b`](https://huggingface.co/EleutherAI/gpt-j-6b) |
-| GPT4All-J | [`nomic-ai/gpt4all-j`](https://huggingface.co/nomic-ai/gpt4all-j) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/eleutherai/gpt-j.mdx b/docs/model-coverage/llm/eleutherai/gpt-j.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/eleutherai/gpt-j.mdx
rename to docs/model-coverage/llm/eleutherai/gpt-j.mdx
diff --git a/docs/model-coverage/llm/eleutherai/gpt-neox.md b/docs/model-coverage/llm/eleutherai/gpt-neox.md
deleted file mode 100644
index 9622f6be26..0000000000
--- a/docs/model-coverage/llm/eleutherai/gpt-neox.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# GPT-NeoX / Pythia
-
-[GPT-NeoX](https://github.com/EleutherAI/gpt-neox) is EleutherAI's large-scale language model architecture. The same `GPTNeoXForCausalLM` architecture is used by the Pythia scaling suite, OpenAssistant, Databricks Dolly, and StableLM models.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `GPTNeoXForCausalLM` |
-| **Parameters** | 1B – 20B |
-| **HF Org** | [EleutherAI](https://huggingface.co/EleutherAI) |
-:::
-
-## Available Models
-
-- **GPT-NeoX-20B** (EleutherAI)
-- **Pythia** suite: 70M, 160M, 410M, 1B, 1.4B, 2.8B, 6.9B, 12B (EleutherAI)
-- **OA-SFT-Pythia-12B** (OpenAssistant)
-- **Dolly-v2-12B** (Databricks)
-- **StableLM-tuned-alpha-7B** (Stability AI)
-
-## Architecture
-
-- `GPTNeoXForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| GPT-NeoX 20B | [`EleutherAI/gpt-neox-20b`](https://huggingface.co/EleutherAI/gpt-neox-20b) |
-| Pythia 12B | [`EleutherAI/pythia-12b`](https://huggingface.co/EleutherAI/pythia-12b) |
-| OpenAssistant SFT Pythia 12B | [`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`](https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5) |
-| Dolly v2 12B | [`databrickslabs/dolly`](https://github.com/databrickslabs/dolly) |
-| StableLM tuned alpha 7B | [`stabilityai/stablelm-tuned-alpha-7b`](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b)
-- [EleutherAI/pythia-12b](https://huggingface.co/EleutherAI/pythia-12b)
diff --git a/fern/versions/nightly/pages/model-coverage/llm/eleutherai/gpt-neox.mdx b/docs/model-coverage/llm/eleutherai/gpt-neox.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/eleutherai/gpt-neox.mdx
rename to docs/model-coverage/llm/eleutherai/gpt-neox.mdx
diff --git a/docs/model-coverage/llm/google/gemma.md b/docs/model-coverage/llm/google/gemma.md
deleted file mode 100644
index 587efeae1e..0000000000
--- a/docs/model-coverage/llm/google/gemma.md
+++ /dev/null
@@ -1,102 +0,0 @@
-# Gemma
-
-[Google's Gemma](https://ai.google.dev/gemma) is a family of open-weight language models built on the same research and technology as Gemini. Gemma models are available in multiple sizes and versions, with improvements in each generation including local sliding window attention (Gemma 2) and interleaved global/local attention (Gemma 3).
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `GemmaForCausalLM` / `Gemma2ForCausalLM` / `Gemma3ForCausalLM` |
-| **Parameters** | 1B – 27B |
-| **HF Org** | [google](https://huggingface.co/google) |
-:::
-
-## Available Models
-
-- **Gemma 3**: 1B, 4B, 12B, 27B
-- **Gemma 2**: 2B, 9B, 27B
-- **Gemma (v1)**: 2B, 7B
-
-## Architectures
-
-- `GemmaForCausalLM` — Gemma v1
-- `Gemma2ForCausalLM` — Gemma 2
-- `Gemma3ForCausalLM` — Gemma 3
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Gemma 1.1 2B IT | [`google/gemma-1.1-2b-it`](https://huggingface.co/google/gemma-1.1-2b-it) |
-| Gemma 2B | [`google/gemma-2b`](https://huggingface.co/google/gemma-2b) |
-| Gemma 2 9B IT | [`google/gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) |
-| Gemma 2 27B | [`google/gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) |
-| Gemma 3 1B IT | [`google/gemma-3-1b-it`](https://huggingface.co/google/gemma-3-1b-it) |
-| Gemma 3 4B IT | [`google/gemma-3-4b-it`](https://huggingface.co/google/gemma-3-4b-it) |
-| Gemma 3 27B IT | [`google/gemma-3-27b-it`](https://huggingface.co/google/gemma-3-27b-it) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`gemma_2_9b_it_squad.yaml <../../../../examples/llm_finetune/gemma/gemma_2_9b_it_squad.yaml>` | SFT — Gemma 2 9B IT on SQuAD |
-| {download}`gemma_2_9b_it_squad_peft.yaml <../../../../examples/llm_finetune/gemma/gemma_2_9b_it_squad_peft.yaml>` | LoRA — Gemma 2 9B IT on SQuAD |
-| {download}`gemma_3_270m_squad.yaml <../../../../examples/llm_finetune/gemma/gemma_3_270m_squad.yaml>` | SFT — Gemma 3 270M on SQuAD |
-| {download}`gemma_3_270m_squad_peft.yaml <../../../../examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml>` | LoRA — Gemma 3 270M on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/gemma/gemma_2_9b_it_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/gemma/gemma_2_9b_it_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) for full SFT and LoRA instructions.
-
-## Hugging Face Model Cards
-
-- [google/gemma-2b](https://huggingface.co/google/gemma-2b)
-- [google/gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)
-- [google/gemma-3-1b-it](https://huggingface.co/google/gemma-3-1b-it)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/google/gemma.mdx b/docs/model-coverage/llm/google/gemma.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/google/gemma.mdx
rename to docs/model-coverage/llm/google/gemma.mdx
diff --git a/docs/model-coverage/llm/ibm/bamba.md b/docs/model-coverage/llm/ibm/bamba.md
deleted file mode 100644
index b17b0e8ffe..0000000000
--- a/docs/model-coverage/llm/ibm/bamba.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Bamba
-
-[Bamba](https://huggingface.co/ibm-ai-platform/Bamba-9B) is a hybrid SSM-attention language model from IBM, combining Mamba-2 selective state space layers with standard transformer attention for efficient long-context processing.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `BambaForCausalLM` |
-| **Parameters** | 9B |
-| **HF Org** | [ibm-ai-platform](https://huggingface.co/ibm-ai-platform) |
-:::
-
-## Available Models
-
-- **Bamba-9B**: 9B parameters
-
-## Architecture
-
-- `BambaForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Bamba 9B | [`ibm-ai-platform/Bamba-9B`](https://huggingface.co/ibm-ai-platform/Bamba-9B) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [ibm-ai-platform/Bamba-9B](https://huggingface.co/ibm-ai-platform/Bamba-9B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/ibm/bamba.mdx b/docs/model-coverage/llm/ibm/bamba.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/ibm/bamba.mdx
rename to docs/model-coverage/llm/ibm/bamba.mdx
diff --git a/docs/model-coverage/llm/ibm/granite-moe.md b/docs/model-coverage/llm/ibm/granite-moe.md
deleted file mode 100644
index 078f703577..0000000000
--- a/docs/model-coverage/llm/ibm/granite-moe.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# Granite MoE
-
-IBM Granite MoE models extend the Granite architecture with Mixture-of-Experts layers for more efficient scaling. PowerMoE (IBM Research) also uses this architecture.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `GraniteMoeForCausalLM` |
-| **Parameters** | 1B – 3B |
-| **HF Org** | [ibm-granite](https://huggingface.co/ibm-granite) |
-:::
-
-## Available Models
-
-- **Granite 3.0 1B A400M Base** — 1B total, 400M activated
-- **Granite 3.0 3B A800M Instruct** — 3B total, 800M activated
-- **PowerMoE-3B** (IBM Research) — 3B total
-- **MoE-7B-1B-Active-Shared-Experts** (IBM Research, test model)
-
-## Architectures
-
-- `GraniteMoeForCausalLM`
-- `GraniteMoeSharedForCausalLM` — variant with shared experts
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Granite 3.0 1B A400M Base | [`ibm-granite/granite-3.0-1b-a400m-base`](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-base) |
-| Granite 3.0 3B A800M Instruct | [`ibm-granite/granite-3.0-3b-a800m-instruct`](https://huggingface.co/ibm-granite/granite-3.0-3b-a800m-instruct) |
-| PowerMoE 3B | [`ibm/PowerMoE-3b`](https://huggingface.co/ibm/PowerMoE-3b) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [ibm-granite/granite-3.0-1b-a400m-base](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-base)
-- [ibm/PowerMoE-3b](https://huggingface.co/ibm/PowerMoE-3b)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/ibm/granite-moe.mdx b/docs/model-coverage/llm/ibm/granite-moe.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/ibm/granite-moe.mdx
rename to docs/model-coverage/llm/ibm/granite-moe.mdx
diff --git a/docs/model-coverage/llm/ibm/granite.md b/docs/model-coverage/llm/ibm/granite.md
deleted file mode 100644
index e34e044785..0000000000
--- a/docs/model-coverage/llm/ibm/granite.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# Granite
-
-[IBM Granite](https://www.ibm.com/granite) is IBM's family of enterprise-focused language models. Granite 3.x models are trained on a mix of code and language data and are optimized for enterprise tasks including summarization, classification, and RAG. PowerLM (IBM Research) also uses this architecture.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `GraniteForCausalLM` |
-| **Parameters** | 2B – 8B |
-| **HF Org** | [ibm-granite](https://huggingface.co/ibm-granite) |
-:::
-
-## Available Models
-
-- **Granite 3.3 2B Instruct**
-- **Granite 3.1 8B Instruct**
-- **Granite 3.0 2B Base**
-- **PowerLM-3B** (IBM Research)
-
-## Architecture
-
-- `GraniteForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Granite 3.0 2B Base | [`ibm-granite/granite-3.0-2b-base`](https://huggingface.co/ibm-granite/granite-3.0-2b-base) |
-| Granite 3.1 8B Instruct | [`ibm-granite/granite-3.1-8b-instruct`](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |
-| PowerLM 3B | [`ibm/PowerLM-3b`](https://huggingface.co/ibm/PowerLM-3b) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`granite_3_3_2b_instruct_squad.yaml <../../../../examples/llm_finetune/granite/granite_3_3_2b_instruct_squad.yaml>` | SFT — Granite 3.3 2B Instruct on SQuAD |
-| {download}`granite_3_3_2b_instruct_squad_peft.yaml <../../../../examples/llm_finetune/granite/granite_3_3_2b_instruct_squad_peft.yaml>` | LoRA — Granite 3.3 2B Instruct on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/granite/granite_3_3_2b_instruct_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/granite/granite_3_3_2b_instruct_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [ibm-granite/granite-3.0-2b-base](https://huggingface.co/ibm-granite/granite-3.0-2b-base)
-- [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/ibm/granite.mdx b/docs/model-coverage/llm/ibm/granite.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/ibm/granite.mdx
rename to docs/model-coverage/llm/ibm/granite.mdx
diff --git a/docs/model-coverage/llm/inceptionai/jais.md b/docs/model-coverage/llm/inceptionai/jais.md
deleted file mode 100644
index ae2f7f3373..0000000000
--- a/docs/model-coverage/llm/inceptionai/jais.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Jais
-
-[Jais](https://huggingface.co/inceptionai/jais-13b) is an Arabic-English bilingual language model from Inception (formerly G42/Inception AI), trained on a large Arabic and English corpus.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `JAISLMHeadModel` |
-| **Parameters** | 13B – 30B |
-| **HF Org** | [inceptionai](https://huggingface.co/inceptionai) |
-:::
-
-## Available Models
-
-- **jais-30b-chat-v3**: 30B
-- **jais-30b-v3**: 30B base
-- **jais-13b-chat**: 13B
-- **jais-13b**: 13B base
-
-## Architecture
-
-- `JAISLMHeadModel`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Jais 13B | [`inceptionai/jais-13b`](https://huggingface.co/inceptionai/jais-13b) |
-| Jais 13B Chat | [`inceptionai/jais-13b-chat`](https://huggingface.co/inceptionai/jais-13b-chat) |
-| Jais 30B v3 | [`inceptionai/jais-30b-v3`](https://huggingface.co/inceptionai/jais-30b-v3) |
-| Jais 30B Chat v3 | [`inceptionai/jais-30b-chat-v3`](https://huggingface.co/inceptionai/jais-30b-chat-v3) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [inceptionai/jais-13b](https://huggingface.co/inceptionai/jais-13b)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/inceptionai/jais.mdx b/docs/model-coverage/llm/inceptionai/jais.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/inceptionai/jais.mdx
rename to docs/model-coverage/llm/inceptionai/jais.mdx
diff --git a/docs/model-coverage/llm/inclusionai/ling-2.md b/docs/model-coverage/llm/inclusionai/ling-2.md
deleted file mode 100644
index f5ff8d409c..0000000000
--- a/docs/model-coverage/llm/inclusionai/ling-2.md
+++ /dev/null
@@ -1,70 +0,0 @@
-# Ling 2.0
-
-[Ling 2.0](https://huggingface.co/collections/inclusionAI/ling-20) is the Mixture-of-Experts
-LLM family from inclusionAI (Ant Group), released under the `bailing_moe` HF
-architecture (`BailingMoeV2ForCausalLM`).  The line spans a 16 B mini through
-a 1 T flagship while sharing the same architecture.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `BailingMoeV2ForCausalLM` |
-| **Parameters** | 16 B – 1 T total |
-| **HF Org** | [inclusionAI](https://huggingface.co/inclusionAI) |
-:::
-
-## Available Models
-
-- **Ling-mini-2.0**: 16 B total / ~1.4 B activated per token (20 layers, 256 experts, 8 activated).
-- **Ling-flash-2.0**: 100 B total / ~6 B activated per token (32 layers, 256 experts, 8 activated).
-- **Ling-1T**: 1 T total / ~50 B activated per token (80 layers, `first_k_dense_replace=4`).
-- **Ling-mini-base-2.0** / **Ling-flash-base-2.0**: base (pre-instruct) variants.
-
-All variants share the same architecture: GQA + per-head QK-RMSNorm + half RoPE
-(`partial_rotary_factor=0.5`) + sigmoid-routed grouped MoE with one shared
-expert and a per-expert correction bias (aux-loss-free routing).
-
-## Architecture
-
-- `BailingMoeV2ForCausalLM` (HF `model_type: "bailing_moe"`)
-- GQA attention; `use_qk_norm: true`
-- Half RoPE (`partial_rotary_factor=0.5`)
-- DeepSeek-V3-style routing: sigmoid scoring, per-expert bias, grouped top-k
-  (`n_group=8`, `topk_group=4`)
-- 1 shared expert at `moe_intermediate_size`
-- `first_k_dense_replace` dense MLP layer(s) at the start of the stack
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Ling-mini-2.0 | [`inclusionAI/Ling-mini-2.0`](https://huggingface.co/inclusionAI/Ling-mini-2.0) |
-| Ling-flash-2.0 | [`inclusionAI/Ling-flash-2.0`](https://huggingface.co/inclusionAI/Ling-flash-2.0) |
-| Ling-1T | [`inclusionAI/Ling-1T`](https://huggingface.co/inclusionAI/Ling-1T) |
-
-## Example Recipes
-
-| Recipe | Description | Min HW |
-|---|---|---|
-| {download}`ling_mini_2_0_squad.yaml <../../../../examples/llm_finetune/ling/ling_mini_2_0_squad.yaml>` | LoRA SFT — Ling-mini-2.0 on SQuAD | 2× H100 80GB |
-| {download}`ling_mini_2_0_hellaswag.yaml <../../../../examples/llm_finetune/ling/ling_mini_2_0_hellaswag.yaml>` | LoRA SFT — Ling-mini-2.0 on HellaSwag | 2× H100 80GB |
-| {download}`ling_mini_2_0_sft.yaml <../../../../examples/llm_finetune/ling/ling_mini_2_0_sft.yaml>` | Full SFT — Ling-mini-2.0 on HellaSwag, FSDP2 + EP=8 | 8× H100 80GB |
-| {download}`ling_flash_2_0_lora.yaml <../../../../examples/llm_finetune/ling/ling_flash_2_0_lora.yaml>` | LoRA SFT — Ling-flash-2.0 on HellaSwag | 8× H100 80GB |
-| {download}`ling_flash_2_0_sft.yaml <../../../../examples/llm_finetune/ling/ling_flash_2_0_sft.yaml>` | Full SFT — Ling-flash-2.0 on HellaSwag, FSDP2 + EP=32 | 32× H100 80GB (4 nodes) |
-| {download}`ling_1t_lora_pp.yaml <../../../../examples/llm_finetune/ling/ling_1t_lora_pp.yaml>` | LoRA SFT — Ling-1T on HellaSwag, FSDP2 + PP=8 + EP=8 | 64× H100 80GB (8 nodes) |
-| {download}`ling_1t_sft.yaml <../../../../examples/llm_finetune/ling/ling_1t_sft.yaml>` | Full SFT — Ling-1T on HellaSwag, FSDP2 + PP=4 + EP=64 | 256× H100 80GB (32 nodes) |
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)).
-
-**2. Run LoRA fine-tuning:**
-
-```bash
-automodel examples/llm_finetune/ling/ling_mini_2_0_squad.yaml --nproc-per-node 1
-```
-
-A single 80 GB H100 / A100 fits Ling-mini-2.0 in bf16 with the LoRA defaults
-in the example.  Set `distributed.ep_size > 1` for multi-GPU expert
-parallelism on the larger variants.
diff --git a/fern/versions/nightly/pages/model-coverage/llm/inclusionai/ling-2.mdx b/docs/model-coverage/llm/inclusionai/ling-2.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/inclusionai/ling-2.mdx
rename to docs/model-coverage/llm/inclusionai/ling-2.mdx
diff --git a/docs/model-coverage/llm/index.md b/docs/model-coverage/llm/index.md
deleted file mode 100644
index 2a81e3cbde..0000000000
--- a/docs/model-coverage/llm/index.md
+++ /dev/null
@@ -1,151 +0,0 @@
-# Large Language Models (LLMs)
-
-## Introduction
-Large Language Models (LLMs) power a variety of tasks such as dialogue systems, text classification, summarization, and more.
-NeMo AutoModel provides a simple interface for loading and fine-tuning LLMs hosted on the Hugging Face Hub.
-
-## Run LLMs with NeMo AutoModel
-To run LLMs with NeMo AutoModel, make sure you're using NeMo container version [`25.11.00`](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo-automodel?version=25.11.00) or later. If the model you intend to fine-tune requires a newer version of Transformers, you may need to upgrade to the latest version of NeMo AutoModel by using:
-
-```bash
-pip3 install --upgrade git+https://github.com/NVIDIA-NeMo/Automodel.git
-```
-
-For other installation options (e.g., uv), see the [NeMo AutoModel Installation Guide](../../guides/installation.md).
-
-## Supported Models
-
-NeMo AutoModel supports models loadable through [`AutoModelForCausalLM`](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM) in the [Text Generation](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending) category. During preprocessing, it uses `transformers.AutoTokenizer`, which is sufficient for most LLM cases. If your model requires custom text handling, override the tokenizer in your recipe YAML or provide a custom dataset `_target_`. See [LLM datasets](../../guides/llm/dataset.md) and [dataset overview](../../guides/dataset-overview.md).
-
-| Owner | Model Family | Architectures |
-|---|---|---|
-| Meta | [Llama](meta/llama.md) | `LlamaForCausalLM` |
-| Google | [Gemma](google/gemma.md) | `GemmaForCausalLM`, `Gemma2ForCausalLM`, `Gemma3ForCausalLM` |
-| Qwen / Alibaba Cloud | [Qwen2](qwen/qwen2.md) | `Qwen2ForCausalLM` |
-| Qwen / Alibaba Cloud | [Qwen2 MoE](qwen/qwen2-moe.md) | `Qwen2MoeForCausalLM` |
-| Qwen / Alibaba Cloud | [Qwen3](qwen/qwen3.md) | `Qwen3ForCausalLM` |
-| Qwen / Alibaba Cloud | [Qwen3 MoE](qwen/qwen3-moe.md) | `Qwen3MoeForCausalLM` |
-| Qwen / Alibaba Cloud | [Qwen3-Next](qwen/qwen3-next.md) | `Qwen3NextForCausalLM` |
-| Baidu | [ERNIE 4.5](baidu/ernie4-5.md) | `Ernie4_5ForCausalLM`, `Ernie4_5_MoeForCausalLM` |
-| DeepSeek | [DeepSeek](deepseek-ai/deepseek.md) | `DeepseekForCausalLM` |
-| DeepSeek | [DeepSeek-V3](deepseek-ai/deepseek-v3.md) | `DeepseekV3ForCausalLM`, `DeepseekV32ForCausalLM` |
-| DeepSeek | [DeepSeek V4 Flash](deepseek-ai/dsv4-flash.md) | `DeepseekV4ForCausalLM` |
-| Mistral AI | [Mistral](mistralai/mistral.md) | `MistralForCausalLM` |
-| Mistral AI | [Mixtral](mistralai/mixtral.md) | `MixtralForCausalLM` |
-| Mistral AI | [Ministral3 / Devstral](mistralai/ministral3.md) | `Mistral3ForConditionalGeneration` |
-| Microsoft | [Phi](microsoft/phi.md) | `PhiForCausalLM` |
-| Microsoft | [Phi-3 / Phi-4](microsoft/phi3.md) | `Phi3ForCausalLM` |
-| Microsoft | [Phi-3-Small](microsoft/phi3-small.md) | `Phi3SmallForCausalLM` |
-| NVIDIA | [Nemotron](nvidia/nemotron.md) | `NemotronForCausalLM` |
-| NVIDIA | [Nemotron-H](nvidia/nemotron-h.md) | `NemotronHForCausalLM` |
-| NVIDIA | [Nemotron-Flash](nvidia/nemotron-flash.md) | `NemotronFlashForCausalLM` |
-| NVIDIA | [Nemotron-Super](nvidia/nemotron-super.md) | `DeciLMForCausalLM` |
-| ZAI / Zhipu AI | [ChatGLM](thudm/chatglm.md) | `ChatGLMModel` |
-| ZAI / Zhipu AI | [GLM-4](thudm/glm4.md) | `GlmForCausalLM`, `Glm4ForCausalLM` |
-| ZAI / Zhipu AI | [GLM-4 MoE](thudm/glm4-moe.md) | `Glm4MoeForCausalLM`, `Glm4MoeLiteForCausalLM` |
-| ZAI / Zhipu AI | [GLM-5 / GLM-5.1](thudm/glm5-moe-dsa.md) | `GlmMoeDsaForCausalLM` |
-| IBM | [Granite](ibm/granite.md) | `GraniteForCausalLM` |
-| IBM | [Granite MoE](ibm/granite-moe.md) | `GraniteMoeForCausalLM`, `GraniteMoeSharedForCausalLM` |
-| IBM | [Bamba](ibm/bamba.md) | `BambaForCausalLM` |
-| Allen AI | [OLMo](allenai/olmo.md) | `OLMoForCausalLM` |
-| Allen AI | [OLMo2](allenai/olmo2.md) | `OLMo2ForCausalLM` |
-| Allen AI | [OLMoE](allenai/olmoe.md) | `OLMoEForCausalLM` |
-| OpenAI | [GPT-OSS](openai/gpt-oss.md) | `GptOssForCausalLM` |
-| OpenAI | [GPT-2](openai/gpt2.md) | `GPT2LMHeadModel` |
-| EleutherAI | [GPT-J](eleutherai/gpt-j.md) | `GPTJForCausalLM` |
-| EleutherAI | [GPT-NeoX / Pythia](eleutherai/gpt-neox.md) | `GPTNeoXForCausalLM` |
-| BigCode | [StarCoder](bigcode/starcoder.md) | `GPTBigCodeForCausalLM` |
-| BigCode | [StarCoder2](bigcode/starcoder2.md) | `Starcoder2ForCausalLM` |
-| BAAI | [Aquila](baai/aquila.md) | `AquilaForCausalLM` |
-| Baichuan Inc | [Baichuan](baichuan-inc/baichuan.md) | `BaiChuanForCausalLM` |
-| Cohere | [Command-R](cohere/command-r.md) | `CohereForCausalLM`, `Cohere2ForCausalLM` |
-| TII | [Falcon](tiiuae/falcon.md) | `FalconForCausalLM` |
-| LG AI Research | [EXAONE](lgai-exaone/exaone.md) | `ExaoneForCausalLM` |
-| InternLM | [InternLM](internlm/internlm.md) | `InternLMForCausalLM`, `InternLM2ForCausalLM`, `InternLM3ForCausalLM` |
-| Inception AI | [Jais](inceptionai/jais.md) | `JAISLMHeadModel` |
-| MiniMax | [MiniMax-M2](minimax/minimax-m2.md) | `MiniMaxM2ForCausalLM` |
-| OpenBMB | [MiniCPM](openbmb/minicpm.md) | `MiniCPMForCausalLM`, `MiniCPM3ForCausalLM` |
-| Moonshot AI | [Moonlight](moonshotai/moonlight.md) | `DeepseekV3ForCausalLM` |
-| ByteDance Seed | [Seed](bytedance-seed/seed.md) | `Qwen2ForCausalLM` |
-| Upstage | [Solar](upstage/solar.md) | `SolarForCausalLM` |
-| OrionStar | [Orion](orionstar/orion.md) | `OrionForCausalLM` |
-| Stability AI | [StableLM](stabilityai/stablelm.md) | `StableLmForCausalLM` |
-| Stepfun AI | [Step-3.5](stepfun-ai/step-3-5.md) | `Step3p5ForCausalLM` |
-| Parasail AI | [GritLM](parasail-ai/gritlm.md) | `GritLM` |
-| Tencent | [Hy3-preview](tencent/hy3.md) | `HYV3ForCausalLM` |
-| Xiaomi MiMo | [MiMo-V2-Flash](xiaomimimo/mimo-v2-flash.md) | `MiMoV2FlashForCausalLM` |
-| inclusionAI | [Ling 2.0](inclusionai/ling-2.md) | `BailingMoeV2ForCausalLM` |
-
-## Fine-Tuning LLMs with NeMo AutoModel
-
-The models listed above can be fine-tuned using NeMo AutoModel. We support two primary fine-tuning approaches:
-
-1. **Parameter-Efficient Fine-Tuning (PEFT)**: Updates only a small subset of parameters (typically <1%) using techniques like Low-Rank Adaptation (LoRA).
-2. **Supervised Fine-Tuning (SFT)**: Updates all or most model parameters for deeper adaptation.
-
-See the [Fine-Tuning Guide](../../guides/llm/finetune.md) to learn how to apply both methods to your data.
-
-:::{tip}
-In these guides, we use the `SQuAD v1.1` dataset for demonstration purposes, but you can use your own data. Update the recipe YAML `dataset` / `validation_dataset` sections accordingly. See [LLM datasets](../../guides/llm/dataset.md) and [dataset overview](../../guides/dataset-overview.md).
-:::
-
-```{toctree}
-:hidden:
-
-meta/llama
-google/gemma
-qwen/qwen2
-qwen/qwen2-moe
-qwen/qwen3
-qwen/qwen3-moe
-qwen/qwen3-next
-baidu/ernie4-5
-deepseek-ai/deepseek
-deepseek-ai/deepseek-v3
-deepseek-ai/dsv4-flash
-mistralai/mistral
-mistralai/mixtral
-mistralai/ministral3
-microsoft/phi
-microsoft/phi3
-microsoft/phi3-small
-nvidia/nemotron
-nvidia/nemotron-h
-nvidia/nemotron-flash
-nvidia/nemotron-super
-thudm/chatglm
-thudm/glm4
-thudm/glm4-moe
-thudm/glm5-moe-dsa
-ibm/granite
-ibm/granite-moe
-ibm/bamba
-allenai/olmo
-allenai/olmo2
-allenai/olmoe
-openai/gpt-oss
-openai/gpt2
-eleutherai/gpt-j
-eleutherai/gpt-neox
-bigcode/starcoder
-bigcode/starcoder2
-baai/aquila
-baichuan-inc/baichuan
-cohere/command-r
-tiiuae/falcon
-lgai-exaone/exaone
-internlm/internlm
-inceptionai/jais
-minimax/minimax-m2
-openbmb/minicpm
-moonshotai/moonlight
-bytedance-seed/seed
-upstage/solar
-orionstar/orion
-stabilityai/stablelm
-stepfun-ai/step-3-5
-parasail-ai/gritlm
-tencent/hy3
-xiaomimimo/mimo-v2-flash
-inclusionai/ling-2
-```
diff --git a/fern/versions/nightly/pages/model-coverage/llm/index.mdx b/docs/model-coverage/llm/index.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/index.mdx
rename to docs/model-coverage/llm/index.mdx
diff --git a/docs/model-coverage/llm/internlm/internlm.md b/docs/model-coverage/llm/internlm/internlm.md
deleted file mode 100644
index cb1f87ac7b..0000000000
--- a/docs/model-coverage/llm/internlm/internlm.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# InternLM
-
-[InternLM](https://github.com/InternLM/InternLM) is a bilingual (Chinese-English) language model series from Shanghai AI Laboratory, with versions 1, 2, and 3 each improving on reasoning, instruction following, and long-context capabilities.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `InternLMForCausalLM` / `InternLM2ForCausalLM` / `InternLM3ForCausalLM` |
-| **Parameters** | 7B – 8B |
-| **HF Org** | [internlm](https://huggingface.co/internlm) |
-:::
-
-## Available Models
-
-- **InternLM3-8B-Instruct** (InternLM3)
-- **InternLM2-7B**, **InternLM2-Chat-7B** (InternLM2)
-- **InternLM-7B**, **InternLM-Chat-7B** (InternLM v1)
-
-## Architectures
-
-- `InternLMForCausalLM` — InternLM v1
-- `InternLM2ForCausalLM` — InternLM2
-- `InternLM3ForCausalLM` — InternLM3
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| InternLM3 8B Instruct | [`internlm/internlm3-8b-instruct`](https://huggingface.co/internlm/internlm3-8b-instruct) |
-| InternLM2 7B | [`internlm/internlm2-7b`](https://huggingface.co/internlm/internlm2-7b) |
-| InternLM 7B | [`internlm/internlm-7b`](https://huggingface.co/internlm/internlm-7b) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [internlm/internlm3-8b-instruct](https://huggingface.co/internlm/internlm3-8b-instruct)
-- [internlm/internlm2-7b](https://huggingface.co/internlm/internlm2-7b)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/internlm/internlm.mdx b/docs/model-coverage/llm/internlm/internlm.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/internlm/internlm.mdx
rename to docs/model-coverage/llm/internlm/internlm.mdx
diff --git a/docs/model-coverage/llm/lgai-exaone/exaone.md b/docs/model-coverage/llm/lgai-exaone/exaone.md
deleted file mode 100644
index eff5716d35..0000000000
--- a/docs/model-coverage/llm/lgai-exaone/exaone.md
+++ /dev/null
@@ -1,88 +0,0 @@
-# EXAONE
-
-EXAONE is a bilingual (Korean-English) language model series from LG AI Research, with strong performance on Korean-language benchmarks.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `ExaoneForCausalLM` |
-| **Parameters** | 7.8B |
-| **HF Org** | [LGAI-EXAONE](https://huggingface.co/LGAI-EXAONE) |
-:::
-
-## Available Models
-
-- **EXAONE-3.0-7.8B-Instruct**
-- **EXAONE-3.5-7.8B-Instruct**
-
-## Architecture
-
-- `ExaoneForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| EXAONE 3.0 7.8B Instruct | [`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
diff --git a/fern/versions/nightly/pages/model-coverage/llm/lgai-exaone/exaone.mdx b/docs/model-coverage/llm/lgai-exaone/exaone.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/lgai-exaone/exaone.mdx
rename to docs/model-coverage/llm/lgai-exaone/exaone.mdx
diff --git a/docs/model-coverage/llm/meta/llama.md b/docs/model-coverage/llm/meta/llama.md
deleted file mode 100644
index b3a2333269..0000000000
--- a/docs/model-coverage/llm/meta/llama.md
+++ /dev/null
@@ -1,104 +0,0 @@
-# Llama
-
-[Meta's Llama](https://www.llama.com/) is a family of open-weight autoregressive language models built on the transformer decoder architecture. Key design choices include pre-normalization with RMSNorm, SwiGLU activations, and Rotary Positional Embeddings (RoPE). Llama 3+ models add Grouped Query Attention (GQA) for memory-efficient inference at larger scales.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `LlamaForCausalLM` |
-| **Parameters** | 1B – 405B |
-| **HF Org** | [meta-llama](https://huggingface.co/meta-llama) |
-:::
-
-## Available Models
-
-- **Llama 3.2**: 1B, 3B
-- **Llama 3.1**: 8B, 70B, 405B (128K context)
-- **Llama 3**: 8B, 70B
-- **Llama 2**: 7B, 13B, 70B
-- **LLaMA (v1)**: 7B, 13B, 30B, 65B
-- **Yi** (01-ai): 6B, 34B — uses `LlamaForCausalLM`
-
-## Architecture
-
-- `LlamaForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Llama 3.2 1B | [`meta-llama/Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) |
-| Llama 3.2 3B | [`meta-llama/Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) |
-| Llama 3.1 8B | [`meta-llama/Meta-Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) |
-| Llama 3.1 70B | [`meta-llama/Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) |
-| Llama 3.1 405B | [`meta-llama/Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) |
-| Llama 3 8B | [`meta-llama/Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
-| Llama 3 70B | [`meta-llama/Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) |
-| Llama 2 70B | [`meta-llama/Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) |
-| Yi 34B | [`01-ai/Yi-34B`](https://huggingface.co/01-ai/Yi-34B) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`llama3_2_1b_squad.yaml <../../../../examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml>` | SFT — Llama 3.2 1B on SQuAD |
-| {download}`llama_3_3_70b_instruct_squad.yaml <../../../../examples/llm_finetune/llama3_3/llama_3_3_70b_instruct_squad.yaml>` | SFT — Llama 3.3 70B Instruct on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) for full SFT and LoRA instructions.
-
-## Hugging Face Model Cards
-
-- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)
-- [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
-- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B)
-- [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/meta/llama.mdx b/docs/model-coverage/llm/meta/llama.mdx
similarity index 98%
rename from fern/versions/v0.4/pages/model-coverage/llm/meta/llama.mdx
rename to docs/model-coverage/llm/meta/llama.mdx
index 7291f5947f..8c9aaf5fca 100644
--- a/fern/versions/v0.4/pages/model-coverage/llm/meta/llama.mdx
+++ b/docs/model-coverage/llm/meta/llama.mdx
@@ -102,5 +102,6 @@ See the [LLM Fine-Tuning Guide](/recipes-e2e-examples/sft-peft) for full SFT and
 ## Hugging Face Model Cards
 
 - [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)
+- [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
 - [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B)
 - [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)
diff --git a/docs/model-coverage/llm/microsoft/phi.md b/docs/model-coverage/llm/microsoft/phi.md
deleted file mode 100644
index 9635a6efbb..0000000000
--- a/docs/model-coverage/llm/microsoft/phi.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Phi
-
-[Microsoft's Phi](https://azure.microsoft.com/en-us/products/phi) models are compact, high-capability language models designed to punch above their weight class. Phi-1.5 and Phi-2 use a standard transformer decoder architecture (`PhiForCausalLM`). For Phi-3 and Phi-4, see [Phi-3 / Phi-4](phi3.md).
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `PhiForCausalLM` |
-| **Parameters** | 1.3B – 2.7B |
-| **HF Org** | [microsoft](https://huggingface.co/microsoft) |
-:::
-
-## Available Models
-
-- **Phi-2**: 2.7B
-- **Phi-1.5**: 1.3B
-
-## Architecture
-
-- `PhiForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Phi-2 | [`microsoft/phi-2`](https://huggingface.co/microsoft/phi-2) |
-| Phi-1.5 | [`microsoft/phi-1_5`](https://huggingface.co/microsoft/phi-1_5) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`phi_2_squad.yaml <../../../../examples/llm_finetune/phi/phi_2_squad.yaml>` | SFT — Phi-2 on SQuAD |
-| {download}`phi_2_squad_peft.yaml <../../../../examples/llm_finetune/phi/phi_2_squad_peft.yaml>` | LoRA — Phi-2 on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/phi/phi_2_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/phi/phi_2_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [microsoft/phi-2](https://huggingface.co/microsoft/phi-2)
-- [microsoft/phi-1_5](https://huggingface.co/microsoft/phi-1_5)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi.mdx b/docs/model-coverage/llm/microsoft/phi.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi.mdx
rename to docs/model-coverage/llm/microsoft/phi.mdx
diff --git a/docs/model-coverage/llm/microsoft/phi3-small.md b/docs/model-coverage/llm/microsoft/phi3-small.md
deleted file mode 100644
index 59a1d4b457..0000000000
--- a/docs/model-coverage/llm/microsoft/phi3-small.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# Phi-3-Small
-
-[Phi-3-Small](https://azure.microsoft.com/en-us/products/phi) is Microsoft's 7B model using a distinct `Phi3SmallForCausalLM` architecture with blocksparse attention, separate from the standard Phi-3 family.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `Phi3SmallForCausalLM` |
-| **Parameters** | 7B |
-| **HF Org** | [microsoft](https://huggingface.co/microsoft) |
-:::
-
-## Available Models
-
-- **Phi-3-small-8k-instruct**: 7B, 8K context
-- **Phi-3-small-128k-instruct**: 7B, 128K context
-
-## Architecture
-
-- `Phi3SmallForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Phi-3-small-8k-instruct | [`microsoft/Phi-3-small-8k-instruct`](https://huggingface.co/microsoft/Phi-3-small-8k-instruct) |
-| Phi-3-small-128k-instruct | [`microsoft/Phi-3-small-128k-instruct`](https://huggingface.co/microsoft/Phi-3-small-128k-instruct) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [microsoft/Phi-3-small-8k-instruct](https://huggingface.co/microsoft/Phi-3-small-8k-instruct)
-- [microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi3-small.mdx b/docs/model-coverage/llm/microsoft/phi3-small.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi3-small.mdx
rename to docs/model-coverage/llm/microsoft/phi3-small.mdx
diff --git a/docs/model-coverage/llm/microsoft/phi3.md b/docs/model-coverage/llm/microsoft/phi3.md
deleted file mode 100644
index df28ef59c5..0000000000
--- a/docs/model-coverage/llm/microsoft/phi3.md
+++ /dev/null
@@ -1,101 +0,0 @@
-# Phi-3 / Phi-4
-
-[Phi-3](https://azure.microsoft.com/en-us/products/phi) and [Phi-4](https://azure.microsoft.com/en-us/products/phi) are Microsoft's high-capability small language models using a shared transformer decoder architecture (`Phi3ForCausalLM`). Phi-4-mini and Phi-4 achieve strong benchmark results at relatively small parameter counts.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `Phi3ForCausalLM` |
-| **Parameters** | 3.8B – 14B |
-| **HF Org** | [microsoft](https://huggingface.co/microsoft) |
-:::
-
-## Available Models
-
-- **Phi-4**: 14B
-- **Phi-4-mini-instruct**: 3.8B
-- **Phi-3.5-mini-instruct**: 3.8B
-- **Phi-3-medium-128k-instruct**: 14B
-- **Phi-3-mini-128k-instruct**: 3.8B
-- **Phi-3-mini-4k-instruct**: 3.8B
-
-## Architecture
-
-- `Phi3ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Phi-4 | [`microsoft/Phi-4`](https://huggingface.co/microsoft/Phi-4) |
-| Phi-4-mini-instruct | [`microsoft/Phi-4-mini-instruct`](https://huggingface.co/microsoft/Phi-4-mini-instruct) |
-| Phi-3-mini-4k-instruct | [`microsoft/Phi-3-mini-4k-instruct`](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |
-| Phi-3-mini-128k-instruct | [`microsoft/Phi-3-mini-128k-instruct`](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) |
-| Phi-3-medium-128k-instruct | [`microsoft/Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`phi_4_squad.yaml <../../../../examples/llm_finetune/phi/phi_4_squad.yaml>` | SFT — Phi-4 on SQuAD |
-| {download}`phi_4_squad_peft.yaml <../../../../examples/llm_finetune/phi/phi_4_squad_peft.yaml>` | LoRA — Phi-4 on SQuAD |
-| {download}`phi_3_mini_it_squad.yaml <../../../../examples/llm_finetune/phi/phi_3_mini_it_squad.yaml>` | SFT — Phi-3-mini Instruct on SQuAD |
-| {download}`phi_3_mini_it_squad_peft.yaml <../../../../examples/llm_finetune/phi/phi_3_mini_it_squad_peft.yaml>` | LoRA — Phi-3-mini Instruct on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/phi/phi_4_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/phi/phi_4_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [microsoft/Phi-4](https://huggingface.co/microsoft/Phi-4)
-- [microsoft/Phi-4-mini-instruct](https://huggingface.co/microsoft/Phi-4-mini-instruct)
-- [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi3.mdx b/docs/model-coverage/llm/microsoft/phi3.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/microsoft/phi3.mdx
rename to docs/model-coverage/llm/microsoft/phi3.mdx
diff --git a/docs/model-coverage/llm/minimax/minimax-m2.md b/docs/model-coverage/llm/minimax/minimax-m2.md
deleted file mode 100644
index e515cbeb66..0000000000
--- a/docs/model-coverage/llm/minimax/minimax-m2.md
+++ /dev/null
@@ -1,98 +0,0 @@
-# MiniMax-M2
-
-[MiniMax-M2](https://huggingface.co/MiniMaxAI) is MiniMax's large Mixture-of-Experts language model with linear attention for efficient long-context inference.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `MiniMaxM2ForCausalLM` |
-| **Parameters** | varies |
-| **HF Org** | [MiniMaxAI](https://huggingface.co/MiniMaxAI) |
-:::
-
-## Available Models
-
-- **MiniMax-M2.1**
-- **MiniMax-M2.5**
-- **MiniMax-M2.7**
-## Architecture
-
-- `MiniMaxM2ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| MiniMax M2.1 | [`MiniMaxAI/MiniMax-M2.1`](https://huggingface.co/MiniMaxAI/MiniMax-M2.1) |
-| MiniMax M2.5 | [`MiniMaxAI/MiniMax-M2.5`](https://huggingface.co/MiniMaxAI/MiniMax-M2.5) |
-| MiniMax M2.7 | [`MiniMaxAI/MiniMax-M2.7`](https://huggingface.co/MiniMaxAI/MiniMax-M2.7) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`minimax_m2.1_hellaswag_pp.yaml <../../../../examples/llm_finetune/minimax_m2/minimax_m2.1_hellaswag_pp.yaml>` | SFT — MiniMax-M2.1 on HellaSwag with pipeline parallelism |
-| {download}`minimax_m2.5_hellaswag_pp.yaml <../../../../examples/llm_finetune/minimax_m2/minimax_m2.5_hellaswag_pp.yaml>` | SFT — MiniMax-M2.5 on HellaSwag with pipeline parallelism |
-| {download}`minimax_m2.7_hellaswag_pp.yaml <../../../../examples/llm_finetune/minimax_m2/minimax_m2.7_hellaswag_pp.yaml>` | SFT — MiniMax-M2.7 on HellaSwag with pipeline parallelism |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-:::{note}
-This recipe was validated on **8 nodes × 8 GPUs (64 H100s)**. See the [Launcher Guide](../../../launcher/slurm.md) for multi-node setup.
-:::
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/minimax_m2/minimax_m2.1_hellaswag_pp.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/minimax_m2/minimax_m2.1_hellaswag_pp.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [MiniMaxAI/MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1)
-- [MiniMaxAI/MiniMax-M2.5](https://huggingface.co/MiniMaxAI/MiniMax-M2.5)
-- [MiniMaxAI/MiniMax-M2.7](https://huggingface.co/MiniMaxAI/MiniMax-M2.7)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/minimax/minimax-m2.mdx b/docs/model-coverage/llm/minimax/minimax-m2.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/minimax/minimax-m2.mdx
rename to docs/model-coverage/llm/minimax/minimax-m2.mdx
diff --git a/docs/model-coverage/llm/mistralai/ministral3.md b/docs/model-coverage/llm/mistralai/ministral3.md
deleted file mode 100644
index 6d9e933101..0000000000
--- a/docs/model-coverage/llm/mistralai/ministral3.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# Ministral3 / Devstral
-
-[Ministral](https://mistral.ai/news/ministraux/) is Mistral AI's efficient small model series optimized for on-device and edge use cases. [Devstral](https://mistral.ai/news/devstral/) is a code-focused model built on the same architecture, designed for software engineering agents.
-
-Both use the `Mistral3ForConditionalGeneration` architecture.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `Mistral3ForConditionalGeneration` |
-| **Parameters** | 3B – 24B |
-| **HF Org** | [mistralai](https://huggingface.co/mistralai) |
-:::
-
-## Available Models
-
-**Ministral3:**
-- **Ministral-3-3B-Instruct-2512**
-- **Ministral-3-8B-Instruct-2512**
-- **Ministral-3-14B-Instruct-2512**
-
-**Devstral:**
-- **Devstral-Small-2-24B-Instruct-2512**
-
-## Architecture
-
-- `Mistral3ForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Ministral-3 3B Instruct | [`mistralai/Ministral-3-3B-Instruct-2512`](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) |
-| Ministral-3 8B Instruct | [`mistralai/Ministral-3-8B-Instruct-2512`](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512) |
-| Ministral-3 14B Instruct | [`mistralai/Ministral-3-14B-Instruct-2512`](https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512) |
-| Devstral Small 2 24B | [`mistralai/Devstral-Small-2-24B-Instruct-2512`](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512) |
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/devstral/devstral2_small_2512_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/devstral/devstral2_small_2512_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [mistralai/Ministral-3-8B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512)
-- [mistralai/Devstral-Small-2-24B-Instruct-2512](https://huggingface.co/mistralai/Devstral-Small-2-24B-Instruct-2512)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/mistralai/ministral3.mdx b/docs/model-coverage/llm/mistralai/ministral3.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/mistralai/ministral3.mdx
rename to docs/model-coverage/llm/mistralai/ministral3.mdx
diff --git a/docs/model-coverage/llm/mistralai/mistral.md b/docs/model-coverage/llm/mistralai/mistral.md
deleted file mode 100644
index 33e24d981a..0000000000
--- a/docs/model-coverage/llm/mistralai/mistral.md
+++ /dev/null
@@ -1,95 +0,0 @@
-# Mistral
-
-[Mistral AI](https://mistral.ai/) models are efficient transformer decoder models featuring sliding window attention for long context support. Mistral-Nemo is a 12B model developed jointly with NVIDIA.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `MistralForCausalLM` |
-| **Parameters** | 7B – 12B |
-| **HF Org** | [mistralai](https://huggingface.co/mistralai) |
-:::
-
-## Available Models
-
-- **Mistral-7B**: v0.1, v0.2, v0.3
-- **Mistral-7B-Instruct**: v0.1, v0.2, v0.3
-- **Mistral-Nemo-Instruct-2407**: 12B
-
-## Architecture
-
-- `MistralForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Mistral 7B v0.1 | [`mistralai/Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) |
-| Mistral 7B Instruct v0.1 | [`mistralai/Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) |
-| Mistral Nemo Instruct 2407 | [`mistralai/Mistral-Nemo-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`mistral_7b_squad.yaml <../../../../examples/llm_finetune/mistral/mistral_7b_squad.yaml>` | SFT — Mistral 7B on SQuAD |
-| {download}`mistral_7b_squad_peft.yaml <../../../../examples/llm_finetune/mistral/mistral_7b_squad_peft.yaml>` | LoRA — Mistral 7B on SQuAD |
-| {download}`mistral_nemo_2407_squad.yaml <../../../../examples/llm_finetune/mistral/mistral_nemo_2407_squad.yaml>` | SFT — Mistral Nemo 2407 on SQuAD |
-| {download}`mistral_nemo_2407_squad_peft.yaml <../../../../examples/llm_finetune/mistral/mistral_nemo_2407_squad_peft.yaml>` | LoRA — Mistral Nemo 2407 on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/mistral/mistral_7b_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/mistral/mistral_7b_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
-- [mistralai/Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/mistralai/mistral.mdx b/docs/model-coverage/llm/mistralai/mistral.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/mistralai/mistral.mdx
rename to docs/model-coverage/llm/mistralai/mistral.mdx
diff --git a/docs/model-coverage/llm/mistralai/mixtral.md b/docs/model-coverage/llm/mistralai/mixtral.md
deleted file mode 100644
index b7fca46885..0000000000
--- a/docs/model-coverage/llm/mistralai/mixtral.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# Mixtral
-
-[Mixtral](https://mistral.ai/news/mixtral-of-experts/) is Mistral AI's Mixture-of-Experts model series. Each token is processed by a subset of experts, enabling a large total parameter count with efficient per-token compute.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `MixtralForCausalLM` |
-| **Parameters** | 47B total / 13B active |
-| **HF Org** | [mistralai](https://huggingface.co/mistralai) |
-:::
-
-## Available Models
-
-- **Mixtral-8x7B**: 8 experts, 2 active per token (~13B active)
-- **Mixtral-8x7B-Instruct**: instruction-tuned variant
-- **Mixtral-8x22B**: 8 experts, 2 active per token (~39B active)
-
-## Architecture
-
-- `MixtralForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Mixtral 8x7B v0.1 | [`mistralai/Mixtral-8x7B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) |
-| Mixtral 8x7B Instruct v0.1 | [`mistralai/Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`mixtral-8x7b-v0-1_squad.yaml <../../../../examples/llm_finetune/mistral/mixtral-8x7b-v0-1_squad.yaml>` | SFT — Mixtral 8x7B on SQuAD |
-| {download}`mixtral-8x7b-v0-1_squad_peft.yaml <../../../../examples/llm_finetune/mistral/mixtral-8x7b-v0-1_squad_peft.yaml>` | LoRA — Mixtral 8x7B on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/mistral/mixtral-8x7b-v0-1_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/mistral/mixtral-8x7b-v0-1_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
-- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/mistralai/mixtral.mdx b/docs/model-coverage/llm/mistralai/mixtral.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/mistralai/mixtral.mdx
rename to docs/model-coverage/llm/mistralai/mixtral.mdx
diff --git a/docs/model-coverage/llm/moonshotai/moonlight.md b/docs/model-coverage/llm/moonshotai/moonlight.md
deleted file mode 100644
index c64a8d30a6..0000000000
--- a/docs/model-coverage/llm/moonshotai/moonlight.md
+++ /dev/null
@@ -1,88 +0,0 @@
-# Moonlight
-
-[Moonlight](https://huggingface.co/moonshotai/Moonlight-16B-A3B) is a Mixture-of-Experts language model from Moonshot AI trained using Muon optimizer. It uses the `DeepseekV3ForCausalLM` architecture with 16B total parameters and 3B activated per token.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `DeepseekV3ForCausalLM` |
-| **Parameters** | 16B total / 3B active |
-| **HF Org** | [moonshotai](https://huggingface.co/moonshotai) |
-:::
-
-## Available Models
-
-- **Moonlight-16B-A3B**: 16B total, 3B activated
-
-## Architecture
-
-- `DeepseekV3ForCausalLM` (same architecture as DeepSeek-V3)
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Moonlight 16B A3B | [`moonshotai/Moonlight-16B-A3B`](https://huggingface.co/moonshotai/Moonlight-16B-A3B) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`moonlight_16b_te.yaml <../../../../examples/llm_finetune/moonlight/moonlight_16b_te.yaml>` | SFT — Moonlight 16B with Transformer Engine |
-| {download}`moonlight_16b_te_packed_sequence.yaml <../../../../examples/llm_finetune/moonlight/moonlight_16b_te_packed_sequence.yaml>` | SFT — Moonlight 16B with packed sequences |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/moonlight/moonlight_16b_te.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/moonlight/moonlight_16b_te.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [moonshotai/Moonlight-16B-A3B](https://huggingface.co/moonshotai/Moonlight-16B-A3B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/moonshotai/moonlight.mdx b/docs/model-coverage/llm/moonshotai/moonlight.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/moonshotai/moonlight.mdx
rename to docs/model-coverage/llm/moonshotai/moonlight.mdx
diff --git a/docs/model-coverage/llm/nvidia/nemotron-flash.md b/docs/model-coverage/llm/nvidia/nemotron-flash.md
deleted file mode 100644
index e0342c9f1f..0000000000
--- a/docs/model-coverage/llm/nvidia/nemotron-flash.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# Nemotron-Flash
-
-[NVIDIA Nemotron-Flash](https://huggingface.co/nvidia/Nemotron-Flash-1B) is a compact, fast language model designed for low-latency inference workloads.
-
-:::{note}
-This model requires `trust_remote_code: true` in your recipe YAML.
-:::
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `NemotronFlashForCausalLM` |
-| **Parameters** | 1B |
-| **HF Org** | [nvidia](https://huggingface.co/nvidia) |
-:::
-
-## Available Models
-
-- **Nemotron-Flash-1B**: 1B parameters
-
-## Architecture
-
-- `NemotronFlashForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Nemotron-Flash 1B | [`nvidia/Nemotron-Flash-1B`](https://huggingface.co/nvidia/Nemotron-Flash-1B) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`nemotron_flash_1b_squad.yaml <../../../../examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad.yaml>` | SFT — Nemotron-Flash 1B on SQuAD |
-| {download}`nemotron_flash_1b_squad_peft.yaml <../../../../examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml>` | LoRA — Nemotron-Flash 1B on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [nvidia/Nemotron-Flash-1B](https://huggingface.co/nvidia/Nemotron-Flash-1B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-flash.mdx b/docs/model-coverage/llm/nvidia/nemotron-flash.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-flash.mdx
rename to docs/model-coverage/llm/nvidia/nemotron-flash.mdx
diff --git a/docs/model-coverage/llm/nvidia/nemotron-h.md b/docs/model-coverage/llm/nvidia/nemotron-h.md
deleted file mode 100644
index 7c87558b10..0000000000
--- a/docs/model-coverage/llm/nvidia/nemotron-h.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# Nemotron-H
-
-[NVIDIA Nemotron-H](https://developer.nvidia.com/blog/nemotron-h-reasoning-enabling-throughput-gains-with-no-compromises/) is a hybrid Mamba-2 / transformer architecture that interleaves selective state space layers with standard attention layers for improved efficiency on long sequences.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `NemotronHForCausalLM` |
-| **Parameters** | 9B – 30B |
-| **HF Org** | [nvidia](https://huggingface.co/nvidia) |
-:::
-
-## Available Models
-
-- **NVIDIA-Nemotron-Nano-9B-v2**: 9B hybrid model
-- **NVIDIA-Nemotron-Nano-12B-v2**: 12B hybrid model
-- **NVIDIA-Nemotron-3-Nano-30B-A3B-BF16**: 30B total, 3B activated (sparse MoE + Mamba-2)
-
-## Architecture
-
-- `NemotronHForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Nemotron-Nano 9B v2 | [`nvidia/NVIDIA-Nemotron-Nano-9B-v2`](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2) |
-| Nemotron-Nano 12B v2 | [`nvidia/NVIDIA-Nemotron-Nano-12B-v2`](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2) |
-| Nemotron-3-Nano 30B A3B | [`nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16`](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`nemotron_nano_9b_squad.yaml <../../../../examples/llm_finetune/nemotron/nemotron_nano_9b_squad.yaml>` | SFT — Nemotron-Nano 9B on SQuAD |
-| {download}`nemotron_nano_9b_squad_peft.yaml <../../../../examples/llm_finetune/nemotron/nemotron_nano_9b_squad_peft.yaml>` | LoRA — Nemotron-Nano 9B on SQuAD |
-| {download}`nemotron_nano_v3_hellaswag.yaml <../../../../examples/llm_finetune/nemotron/nemotron_nano_v3_hellaswag.yaml>` | SFT — Nemotron-3-Nano 30B on HellaSwag |
-| {download}`nemotron_nano_v3_hellaswag_peft.yaml <../../../../examples/llm_finetune/nemotron/nemotron_nano_v3_hellaswag_peft.yaml>` | LoRA — Nemotron-3-Nano 30B on HellaSwag |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/nemotron/nemotron_nano_9b_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/nemotron/nemotron_nano_9b_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [nvidia/NVIDIA-Nemotron-Nano-9B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2)
-- [nvidia/NVIDIA-Nemotron-Nano-12B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2)
-- [nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-h.mdx b/docs/model-coverage/llm/nvidia/nemotron-h.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-h.mdx
rename to docs/model-coverage/llm/nvidia/nemotron-h.mdx
diff --git a/docs/model-coverage/llm/nvidia/nemotron-super.md b/docs/model-coverage/llm/nvidia/nemotron-super.md
deleted file mode 100644
index a8cee21608..0000000000
--- a/docs/model-coverage/llm/nvidia/nemotron-super.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Nemotron-Super (Llama-3.3-Nemotron-Super-49B)
-
-[Llama-3.3-Nemotron-Super-49B-v1](https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1) is a NVIDIA model derived from Llama-3.1-70B through Neural Architecture Search (NAS)-based pruning and knowledge distillation, resulting in a 49B model with strong reasoning capabilities. It uses the `DeciLMForCausalLM` architecture.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `DeciLMForCausalLM` |
-| **Parameters** | 49B |
-| **HF Org** | [nvidia](https://huggingface.co/nvidia) |
-:::
-
-## Available Models
-
-- **Llama-3.3-Nemotron-Super-49B-v1**: 49B parameters
-
-## Architecture
-
-- `DeciLMForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Llama-3.3-Nemotron-Super-49B-v1 | [`nvidia/Llama-3_3-Nemotron-Super-49B-v1`](https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [nvidia/Llama-3_3-Nemotron-Super-49B-v1](https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-super.mdx b/docs/model-coverage/llm/nvidia/nemotron-super.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron-super.mdx
rename to docs/model-coverage/llm/nvidia/nemotron-super.mdx
diff --git a/docs/model-coverage/llm/nvidia/nemotron.md b/docs/model-coverage/llm/nvidia/nemotron.md
deleted file mode 100644
index ff760ee75a..0000000000
--- a/docs/model-coverage/llm/nvidia/nemotron.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Nemotron / Minitron
-
-[NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/) and [Minitron](https://developer.nvidia.com/blog/how-to-prune-and-distill-llama-3-1-8b-to-an-nvidia-llama-3-1-minitron-4b-model/) are NVIDIA's family of language models. Minitron models are produced by pruning and distilling larger Llama/Nemotron models into compact, high-performance checkpoints.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `NemotronForCausalLM` |
-| **Parameters** | 8B |
-| **HF Org** | [nvidia](https://huggingface.co/nvidia) |
-:::
-
-## Available Models
-
-- **Minitron-8B-Base**: pruned and distilled from Llama-3.1-8B
-
-## Architecture
-
-- `NemotronForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Minitron 8B Base | [`nvidia/Minitron-8B-Base`](https://huggingface.co/nvidia/Minitron-8B-Base) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [nvidia/Minitron-8B-Base](https://huggingface.co/nvidia/Minitron-8B-Base)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron.mdx b/docs/model-coverage/llm/nvidia/nemotron.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/nvidia/nemotron.mdx
rename to docs/model-coverage/llm/nvidia/nemotron.mdx
diff --git a/docs/model-coverage/llm/openai/gpt-oss.md b/docs/model-coverage/llm/openai/gpt-oss.md
deleted file mode 100644
index aca34152af..0000000000
--- a/docs/model-coverage/llm/openai/gpt-oss.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# GPT-OSS
-
-[GPT-OSS](https://huggingface.co/openai/gpt-oss-20b) is OpenAI's open-weight model family featuring QuickGELU activations and activation clamping for training stability.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `GptOssForCausalLM` |
-| **Parameters** | 20B – 120B |
-| **HF Org** | [openai](https://huggingface.co/openai) |
-:::
-
-## Available Models
-
-- **gpt-oss-20b**: 20B parameters
-- **gpt-oss-120b**: 120B parameters
-
-## Architecture
-
-- `GptOssForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| GPT-OSS 20B | [`openai/gpt-oss-20b`](https://huggingface.co/openai/gpt-oss-20b) |
-| GPT-OSS 120B | [`openai/gpt-oss-120b`](https://huggingface.co/openai/gpt-oss-120b) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`gpt_oss_20b.yaml <../../../../examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml>` | SFT — GPT-OSS 20B |
-| {download}`gpt_oss_120b.yaml <../../../../examples/llm_finetune/gpt_oss/gpt_oss_120b.yaml>` | SFT — GPT-OSS 120B |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b)
-- [openai/gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/openai/gpt-oss.mdx b/docs/model-coverage/llm/openai/gpt-oss.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/openai/gpt-oss.mdx
rename to docs/model-coverage/llm/openai/gpt-oss.mdx
diff --git a/docs/model-coverage/llm/openai/gpt2.md b/docs/model-coverage/llm/openai/gpt2.md
deleted file mode 100644
index f5c9f17ecd..0000000000
--- a/docs/model-coverage/llm/openai/gpt2.md
+++ /dev/null
@@ -1,63 +0,0 @@
-# GPT-2
-
-[GPT-2](https://huggingface.co/openai-community/gpt2) is OpenAI's foundational decoder-only transformer. NeMo AutoModel uses it as a baseline for the Megatron pretraining smoke test and tutorials — its small footprint makes it a convenient target to validate data pipelines, distributed configs, and logging without needing large compute.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (pretraining baseline) |
-| **Architecture** | `GPT2LMHeadModel` |
-| **Parameters** | 124M – 1.5B |
-| **HF Org** | [openai-community](https://huggingface.co/openai-community) |
-:::
-
-## Available Models
-
-- **gpt2** (124M)
-- **gpt2-medium** (355M)
-- **gpt2-large** (774M)
-- **gpt2-xl** (1.5B)
-
-## Architecture
-
-- `GPT2LMHeadModel`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| GPT-2 | [`openai-community/gpt2`](https://huggingface.co/openai-community/gpt2) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`megatron_pretrain_gpt2.yaml <../../../../examples/llm_pretrain/megatron_pretrain_gpt2.yaml>` | Megatron pretraining smoke test — GPT-2 on FineWeb-Edu |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_pretrain/megatron_pretrain_gpt2.yaml
-```
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Pretraining Guide](../../../guides/llm/pretraining.md).
-
-## Hugging Face Model Cards
-
-- [openai-community/gpt2](https://huggingface.co/openai-community/gpt2)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/openai/gpt2.mdx b/docs/model-coverage/llm/openai/gpt2.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/openai/gpt2.mdx
rename to docs/model-coverage/llm/openai/gpt2.mdx
diff --git a/docs/model-coverage/llm/openbmb/minicpm.md b/docs/model-coverage/llm/openbmb/minicpm.md
deleted file mode 100644
index 83355a8ba9..0000000000
--- a/docs/model-coverage/llm/openbmb/minicpm.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# MiniCPM
-
-[MiniCPM](https://github.com/OpenBMB/MiniCPM) is a compact language model series from OpenBMB / Tsinghua University, designed to deliver strong performance at small parameter counts using model merging and continuous training techniques.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `MiniCPMForCausalLM` / `MiniCPM3ForCausalLM` |
-| **Parameters** | 2B – 4B |
-| **HF Org** | [openbmb](https://huggingface.co/openbmb) |
-:::
-
-## Available Models
-
-- **MiniCPM3-4B** (`MiniCPM3ForCausalLM`): 4B
-- **MiniCPM-2B-sft-bf16** (`MiniCPMForCausalLM`): 2B, SFT
-- **MiniCPM-2B-dpo-bf16** (`MiniCPMForCausalLM`): 2B, DPO
-
-## Architectures
-
-- `MiniCPMForCausalLM` — MiniCPM v1/v2
-- `MiniCPM3ForCausalLM` — MiniCPM3
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| MiniCPM 2B SFT | [`openbmb/MiniCPM-2B-sft-bf16`](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
-| MiniCPM3 4B | [`openbmb/MiniCPM3-4B`](https://huggingface.co/openbmb/MiniCPM3-4B) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate to the repository root so the relative recipe paths resolve:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16)
-- [openbmb/MiniCPM3-4B](https://huggingface.co/openbmb/MiniCPM3-4B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/openbmb/minicpm.mdx b/docs/model-coverage/llm/openbmb/minicpm.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/openbmb/minicpm.mdx
rename to docs/model-coverage/llm/openbmb/minicpm.mdx
diff --git a/docs/model-coverage/llm/orionstar/orion.md b/docs/model-coverage/llm/orionstar/orion.md
deleted file mode 100644
index 3345749269..0000000000
--- a/docs/model-coverage/llm/orionstar/orion.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# Orion
-
-[Orion](https://github.com/OrionStarAI/Orion) is a bilingual (Chinese-English) language model from OrionStar AI, with 14B parameters and strong performance on Chinese benchmarks.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `OrionForCausalLM` |
-| **Parameters** | 14B |
-| **HF Org** | [OrionStarAI](https://huggingface.co/OrionStarAI) |
-:::
-
-## Available Models
-
-- **Orion-14B-Base**: 14B
-- **Orion-14B-Chat**: 14B instruction-tuned
-
-## Architecture
-
-- `OrionForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Orion 14B Base | [`OrionStarAI/Orion-14B-Base`](https://huggingface.co/OrionStarAI/Orion-14B-Base) |
-| Orion 14B Chat | [`OrionStarAI/Orion-14B-Chat`](https://huggingface.co/OrionStarAI/Orion-14B-Chat) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate to the repository root so the relative recipe paths resolve:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [OrionStarAI/Orion-14B-Base](https://huggingface.co/OrionStarAI/Orion-14B-Base)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/orionstar/orion.mdx b/docs/model-coverage/llm/orionstar/orion.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/orionstar/orion.mdx
rename to docs/model-coverage/llm/orionstar/orion.mdx
diff --git a/docs/model-coverage/llm/parasail-ai/gritlm.md b/docs/model-coverage/llm/parasail-ai/gritlm.md
deleted file mode 100644
index f5e067c34c..0000000000
--- a/docs/model-coverage/llm/parasail-ai/gritlm.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# GritLM
-
-[GritLM](https://github.com/ContextualAI/gritlm) (Generative Representational Instruction Tuning) is a unified model that performs both generative language modeling and text embedding in a single model, from Parasail AI.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation + Embedding |
-| **Architecture** | `GritLM` |
-| **Parameters** | 7B |
-| **HF Org** | [parasail-ai](https://huggingface.co/parasail-ai) |
-:::
-
-## Available Models
-
-- **GritLM-7B-vllm**
-
-## Architecture
-
-- `GritLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| GritLM 7B vllm | [`parasail-ai/GritLM-7B-vllm`](https://huggingface.co/parasail-ai/GritLM-7B-vllm) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate to the repository root so the relative recipe paths resolve:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [parasail-ai/GritLM-7B-vllm](https://huggingface.co/parasail-ai/GritLM-7B-vllm)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/parasail-ai/gritlm.mdx b/docs/model-coverage/llm/parasail-ai/gritlm.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/parasail-ai/gritlm.mdx
rename to docs/model-coverage/llm/parasail-ai/gritlm.mdx
diff --git a/docs/model-coverage/llm/qwen/qwen2-moe.md b/docs/model-coverage/llm/qwen/qwen2-moe.md
deleted file mode 100644
index a0b4194d57..0000000000
--- a/docs/model-coverage/llm/qwen/qwen2-moe.md
+++ /dev/null
@@ -1,88 +0,0 @@
-# Qwen2 MoE
-
-[Qwen1.5-MoE](https://qwenlm.github.io/) is a Mixture-of-Experts variant from Alibaba Cloud that activates only a fraction of parameters per token, enabling efficient training and inference at scale.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `Qwen2MoeForCausalLM` |
-| **Parameters** | 14.3B total / 2.7B active |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen1.5-MoE-A2.7B**: 14.3B total parameters, 2.7B activated per token
-
-## Architecture
-
-- `Qwen2MoeForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Qwen1.5 MoE A2.7B | [`Qwen/Qwen1.5-MoE-A2.7B`](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) |
-| Qwen1.5 MoE A2.7B Chat | [`Qwen/Qwen1.5-MoE-A2.7B-Chat`](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`qwen1_5_moe_a2_7b_qlora.yaml <../../../../examples/llm_finetune/qwen/qwen1_5_moe_a2_7b_qlora.yaml>` | QLoRA — Qwen1.5 MoE A2.7B |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen1_5_moe_a2_7b_qlora.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen1_5_moe_a2_7b_qlora.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen2-moe.mdx b/docs/model-coverage/llm/qwen/qwen2-moe.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen2-moe.mdx
rename to docs/model-coverage/llm/qwen/qwen2-moe.mdx
diff --git a/docs/model-coverage/llm/qwen/qwen2.md b/docs/model-coverage/llm/qwen/qwen2.md
deleted file mode 100644
index 19cec6f9fb..0000000000
--- a/docs/model-coverage/llm/qwen/qwen2.md
+++ /dev/null
@@ -1,95 +0,0 @@
-# Qwen2
-
-[Qwen2](https://qwenlm.github.io/) is Alibaba Cloud's second-generation large language model series. It features grouped query attention, YaRN-based long-context extension, and dual chunk attention for long sequences. QwQ-32B-Preview, a reasoning-focused model, also uses this architecture.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `Qwen2ForCausalLM` |
-| **Parameters** | 0.5B – 72B |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen2.5**: 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B
-- **Qwen2**: 0.5B, 1.5B, 7B, 57B-A14B (MoE), 72B
-- **QwQ-32B-Preview** — reasoning model
-
-## Architecture
-
-- `Qwen2ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Qwen2.5 7B Instruct | [`Qwen/Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |
-| Qwen2.5 72B Instruct | [`Qwen/Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) |
-| Qwen2 7B Instruct | [`Qwen/Qwen2-7B-Instruct`](https://huggingface.co/Qwen/Qwen2-7B-Instruct) |
-| QwQ 32B Preview | [`Qwen/QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`qwen2_5_7b_squad.yaml <../../../../examples/llm_finetune/qwen/qwen2_5_7b_squad.yaml>` | SFT — Qwen2.5 7B on SQuAD |
-| {download}`qwq_32b_squad_peft.yaml <../../../../examples/llm_finetune/qwen/qwq_32b_squad_peft.yaml>` | LoRA — QwQ 32B on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen2_5_7b_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen2_5_7b_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) for full SFT and LoRA instructions.
-
-## Hugging Face Model Cards
-
-- [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)
-- [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct)
-- [Qwen/QwQ-32B-Preview](https://huggingface.co/Qwen/QwQ-32B-Preview)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen2.mdx b/docs/model-coverage/llm/qwen/qwen2.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen2.mdx
rename to docs/model-coverage/llm/qwen/qwen2.mdx
diff --git a/docs/model-coverage/llm/qwen/qwen3-moe.md b/docs/model-coverage/llm/qwen/qwen3-moe.md
deleted file mode 100644
index 977002b850..0000000000
--- a/docs/model-coverage/llm/qwen/qwen3-moe.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Qwen3 MoE
-
-[Qwen3 MoE](https://qwenlm.github.io/blog/qwen3/) is the Mixture-of-Experts variant of the Qwen3 series from Alibaba Cloud, activating a small fraction of parameters per token for efficient large-scale training.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `Qwen3MoeForCausalLM` |
-| **Parameters** | 30B – 235B total |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen3-30B-A3B**: 30B total parameters, 3B activated per token
-- **Qwen3-235B-A22B**: 235B total parameters, 22B activated per token
-
-## Architecture
-
-- `Qwen3MoeForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Qwen3 30B A3B | [`Qwen/Qwen3-30B-A3B`](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
-| Qwen3 235B A22B | [`Qwen/Qwen3-235B-A22B`](https://huggingface.co/Qwen/Qwen3-235B-A22B) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`qwen3_moe_30b_te_deepep.yaml <../../../../examples/llm_finetune/qwen/qwen3_moe_30b_te_deepep.yaml>` | SFT — Qwen3 MoE 30B with TE + DeepEP |
-| {download}`qwen3_moe_30b_lora.yaml <../../../../examples/llm_finetune/qwen/qwen3_moe_30b_lora.yaml>` | LoRA — Qwen3 MoE 30B |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen3_moe_30b_te_deepep.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen3_moe_30b_te_deepep.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B)
-- [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3-moe.mdx b/docs/model-coverage/llm/qwen/qwen3-moe.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3-moe.mdx
rename to docs/model-coverage/llm/qwen/qwen3-moe.mdx
diff --git a/docs/model-coverage/llm/qwen/qwen3-next.md b/docs/model-coverage/llm/qwen/qwen3-next.md
deleted file mode 100644
index ef4676892a..0000000000
--- a/docs/model-coverage/llm/qwen/qwen3-next.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Qwen3-Next
-
-Qwen3-Next is an advanced MoE language model from Alibaba Cloud's Qwen team designed for high-throughput inference with large total parameter counts and efficient per-token activation.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `Qwen3NextForCausalLM` |
-| **Parameters** | 80B total / 3B active |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen3-Next-80B-A3B**: 80B total parameters, 3B activated per token
-
-## Architecture
-
-- `Qwen3NextForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Qwen3-Next 80B A3B Instruct | [`Qwen/Qwen3-Next-80B-A3B-Instruct`](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`qwen3_next_te_deepep.yaml <../../../../examples/llm_finetune/qwen/qwen3_next_te_deepep.yaml>` | SFT — Qwen3-Next with TE + DeepEP |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-:::{note}
-This recipe was validated on **4 nodes × 8 GPUs (32 H100s)**. See the [Launcher Guide](../../../launcher/slurm.md) for multi-node setup.
-:::
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen3_next_te_deepep.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen3_next_te_deepep.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3-next.mdx b/docs/model-coverage/llm/qwen/qwen3-next.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3-next.mdx
rename to docs/model-coverage/llm/qwen/qwen3-next.mdx
diff --git a/docs/model-coverage/llm/qwen/qwen3.md b/docs/model-coverage/llm/qwen/qwen3.md
deleted file mode 100644
index a33d2ae4c5..0000000000
--- a/docs/model-coverage/llm/qwen/qwen3.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Qwen3
-
-[Qwen3](https://qwenlm.github.io/blog/qwen3/) is Alibaba Cloud's third-generation dense language model series, featuring improved reasoning, instruction following, and multilingual capabilities over Qwen2.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `Qwen3ForCausalLM` |
-| **Parameters** | 0.6B – 32B |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen3**: 0.6B, 1.7B, 4B, 8B, 14B, 32B
-
-## Architecture
-
-- `Qwen3ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Qwen3 0.6B | [`Qwen/Qwen3-0.6B`](https://huggingface.co/Qwen/Qwen3-0.6B) |
-| Qwen3 8B | [`Qwen/Qwen3-8B`](https://huggingface.co/Qwen/Qwen3-8B) |
-| Qwen3 32B | [`Qwen/Qwen3-32B`](https://huggingface.co/Qwen/Qwen3-32B) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`qwen3_0p6b_hellaswag.yaml <../../../../examples/llm_finetune/qwen/qwen3_0p6b_hellaswag.yaml>` | SFT — Qwen3 0.6B on HellaSwag |
-| {download}`qwen3_8b_squad_spark.yaml <../../../../examples/llm_finetune/qwen/qwen3_8b_squad_spark.yaml>` | SFT — Qwen3 8B on SQuAD (Spark) |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen3_0p6b_hellaswag.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/qwen/qwen3_0p6b_hellaswag.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) for full SFT and LoRA instructions.
-
-## Hugging Face Model Cards
-
-- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)
-- [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3.mdx b/docs/model-coverage/llm/qwen/qwen3.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/qwen/qwen3.mdx
rename to docs/model-coverage/llm/qwen/qwen3.mdx
diff --git a/docs/model-coverage/llm/stabilityai/stablelm.md b/docs/model-coverage/llm/stabilityai/stablelm.md
deleted file mode 100644
index 44adcc079d..0000000000
--- a/docs/model-coverage/llm/stabilityai/stablelm.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# StableLM
-
-[StableLM](https://huggingface.co/stabilityai) is Stability AI's series of open language models, available in base and instruction-tuned variants across multiple sizes.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `StableLmForCausalLM` |
-| **Parameters** | 3B – 7B |
-| **HF Org** | [stabilityai](https://huggingface.co/stabilityai) |
-:::
-
-## Available Models
-
-- **stablelm-3b-4e1t**: 3B
-- **stablelm-base-alpha-7b-v2**: 7B
-
-## Architecture
-
-- `StableLmForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| StableLM 3B 4E1T | [`stabilityai/stablelm-3b-4e1t`](https://huggingface.co/stabilityai/stablelm-3b-4e1t) |
-| StableLM Base Alpha 7B v2 | [`stabilityai/stablelm-base-alpha-7b-v2`](https://huggingface.co/stabilityai/stablelm-base-alpha-7b-v2) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate to the repository root so the relative recipe paths resolve:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [stabilityai/stablelm-3b-4e1t](https://huggingface.co/stabilityai/stablelm-3b-4e1t)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/stabilityai/stablelm.mdx b/docs/model-coverage/llm/stabilityai/stablelm.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/stabilityai/stablelm.mdx
rename to docs/model-coverage/llm/stabilityai/stablelm.mdx
diff --git a/docs/model-coverage/llm/stepfun-ai/step-3-5.md b/docs/model-coverage/llm/stepfun-ai/step-3-5.md
deleted file mode 100644
index 578f782689..0000000000
--- a/docs/model-coverage/llm/stepfun-ai/step-3-5.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Step-3.5
-
-[Step-3.5-Flash](https://huggingface.co/stepfun-ai/Step-3.5-Flash) is a Mixture-of-Experts language model from Stepfun AI, designed for efficient inference.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `Step3p5ForCausalLM` |
-| **Parameters** | varies |
-| **HF Org** | [stepfun-ai](https://huggingface.co/stepfun-ai) |
-:::
-
-## Available Models
-
-- **Step-3.5-Flash**
-
-## Architecture
-
-- `Step3p5ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Step-3.5-Flash | [`stepfun-ai/Step-3.5-Flash`](https://huggingface.co/stepfun-ai/Step-3.5-Flash) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`step_3.5_flash_hellaswag_pp.yaml <../../../../examples/llm_finetune/stepfun/step_3.5_flash_hellaswag_pp.yaml>` | SFT — Step-3.5-Flash on HellaSwag with pipeline parallelism |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-:::{note}
-This recipe was validated on **16 nodes × 8 GPUs (128 H100s)**. See the [Launcher Guide](../../../launcher/slurm.md) for multi-node setup.
-:::
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/stepfun/step_3.5_flash_hellaswag_pp.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/stepfun/step_3.5_flash_hellaswag_pp.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [stepfun-ai/Step-3.5-Flash](https://huggingface.co/stepfun-ai/Step-3.5-Flash)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/stepfun-ai/step-3-5.mdx b/docs/model-coverage/llm/stepfun-ai/step-3-5.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/stepfun-ai/step-3-5.mdx
rename to docs/model-coverage/llm/stepfun-ai/step-3-5.mdx
diff --git a/docs/model-coverage/llm/tencent/hy3.md b/docs/model-coverage/llm/tencent/hy3.md
deleted file mode 100644
index ea2d8be1e1..0000000000
--- a/docs/model-coverage/llm/tencent/hy3.md
+++ /dev/null
@@ -1,63 +0,0 @@
-# Hy3 (HunyuanLarge)
-
-[Hy3-preview](https://huggingface.co/tencent/Hy3-preview) is a 295B Mixture-of-Experts language model from Tencent. It features 80 transformer layers (layer 0 dense, layers 1–79 MoE), 192 routed experts plus 1 shared expert with top-8 sigmoid routing, Grouped Query Attention (64 Q / 8 KV heads), per-head QK RMSNorm, RoPE, and an `e_score_correction_bias` gate buffer for expert-load correction. It supports a 256K context window.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `HYV3ForCausalLM` |
-| **Parameters** | 295B total |
-| **HF Org** | [tencent](https://huggingface.co/tencent) |
-:::
-
-## Available Models
-
-- **Hy3-preview**: 295B total, top-8 routed experts activated per token
-
-## Architectures
-
-- `HYV3ForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Hy3-preview | [`tencent/Hy3-preview`](https://huggingface.co/tencent/Hy3-preview) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`hy3_preview_deepep.yaml <../../../../examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml>` | SFT — Hy3-preview with DeepEP |
-
-## Try with NeMo AutoModel
-
-**1. Install** ([NeMo AutoModel](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/hy_v3/hy3_preview_deepep.yaml
-```
-
-See the [NeMo AutoModel Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [tencent/Hy3-preview](https://huggingface.co/tencent/Hy3-preview)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/tencent/hy3.mdx b/docs/model-coverage/llm/tencent/hy3.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/tencent/hy3.mdx
rename to docs/model-coverage/llm/tencent/hy3.mdx
diff --git a/docs/model-coverage/llm/thudm/chatglm.md b/docs/model-coverage/llm/thudm/chatglm.md
deleted file mode 100644
index 3f57f10b3a..0000000000
--- a/docs/model-coverage/llm/thudm/chatglm.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# ChatGLM
-
-[ChatGLM](https://github.com/zai-org/ChatGLM-6B) is a bilingual (Chinese-English) conversational language model from Zhipu AI. ChatGLM2 and ChatGLM3 extend the original with improved performance, longer context, and more efficient attention.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `ChatGLMModel` |
-| **Parameters** | 6B |
-| **HF Org** | [zai-org](https://huggingface.co/zai-org) |
-:::
-
-## Available Models
-
-- **ChatGLM3-6B**
-- **ChatGLM2-6B**
-
-## Architecture
-
-- `ChatGLMModel` / `ChatGLMForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| ChatGLM3 6B | [`zai-org/chatglm3-6b`](https://huggingface.co/zai-org/chatglm3-6b) |
-| ChatGLM2 6B | [`zai-org/chatglm2-6b`](https://huggingface.co/zai-org/chatglm2-6b) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (the recipes are under `examples/`):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b)
-- [zai-org/chatglm2-6b](https://huggingface.co/zai-org/chatglm2-6b)
diff --git a/fern/versions/nightly/pages/model-coverage/llm/thudm/chatglm.mdx b/docs/model-coverage/llm/thudm/chatglm.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/thudm/chatglm.mdx
rename to docs/model-coverage/llm/thudm/chatglm.mdx
diff --git a/docs/model-coverage/llm/thudm/glm4-moe.md b/docs/model-coverage/llm/thudm/glm4-moe.md
deleted file mode 100644
index 2348078168..0000000000
--- a/docs/model-coverage/llm/thudm/glm4-moe.md
+++ /dev/null
@@ -1,101 +0,0 @@
-# GLM-4 MoE (GLM-4.5 / GLM-4.7)
-
-[GLM-4.5 and GLM-4.7](https://huggingface.co/zai-org) are Mixture-of-Experts variants of the GLM family released under the `zai-org` HuggingFace organization. GLM-4.7-Flash is a lighter variant with fewer active parameters.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `Glm4MoeForCausalLM` / `Glm4MoeLiteForCausalLM` |
-| **Parameters** | varies |
-| **HF Org** | [zai-org](https://huggingface.co/zai-org) |
-:::
-
-## Available Models
-
-- **GLM-4.5-Air** (`Glm4MoeForCausalLM`)
-- **GLM-4.7** (`Glm4MoeForCausalLM`)
-- **GLM-4.7-Flash** (`Glm4MoeLiteForCausalLM`): lightweight MoE variant
-
-## Architectures
-
-- `Glm4MoeForCausalLM` — GLM-4.5, GLM-4.7
-- `Glm4MoeLiteForCausalLM` — GLM-4.7-Flash
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| GLM-4.5-Air | [`zai-org/GLM-4.5-Air`](https://huggingface.co/zai-org/GLM-4.5-Air) |
-| GLM-4.7 | [`zai-org/GLM-4.7`](https://huggingface.co/zai-org/GLM-4.7) |
-| GLM-4.7-Flash | [`zai-org/GLM-4.7-Flash`](https://huggingface.co/zai-org/GLM-4.7-Flash) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`glm_4.5_air_te_deepep.yaml <../../../../examples/llm_finetune/glm/glm_4.5_air_te_deepep.yaml>` | SFT — GLM-4.5-Air with TE + DeepEP |
-| {download}`glm_4.7_te_deepep.yaml <../../../../examples/llm_finetune/glm/glm_4.7_te_deepep.yaml>` | SFT — GLM-4.7 with TE + DeepEP |
-| {download}`glm_4.7_flash_te_deepep.yaml <../../../../examples/llm_finetune/glm/glm_4.7_flash_te_deepep.yaml>` | SFT — GLM-4.7-Flash with TE + DeepEP |
-| {download}`glm_4.7_flash_te_packed_sequence.yaml <../../../../examples/llm_finetune/glm/glm_4.7_flash_te_packed_sequence.yaml>` | SFT — GLM-4.7-Flash with packed sequences |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-:::{note}
-This recipe was validated on **8 nodes × 8 GPUs (64 H100s)**. See the [Launcher Guide](../../../launcher/slurm.md) for multi-node setup.
-:::
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/glm/glm_4.5_air_te_deepep.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/glm/glm_4.5_air_te_deepep.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [zai-org/GLM-4.5-Air](https://huggingface.co/zai-org/GLM-4.5-Air)
-- [zai-org/GLM-4.7](https://huggingface.co/zai-org/GLM-4.7)
-- [zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/thudm/glm4-moe.mdx b/docs/model-coverage/llm/thudm/glm4-moe.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/thudm/glm4-moe.mdx
rename to docs/model-coverage/llm/thudm/glm4-moe.mdx
diff --git a/docs/model-coverage/llm/thudm/glm4.md b/docs/model-coverage/llm/thudm/glm4.md
deleted file mode 100644
index 8cf0ee2e18..0000000000
--- a/docs/model-coverage/llm/thudm/glm4.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# GLM-4
-
-[GLM-4](https://github.com/zai-org/GLM-4) is Zhipu AI's fourth-generation General Language Model, featuring strong multilingual capabilities and tool-use support.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `GlmForCausalLM` / `Glm4ForCausalLM` |
-| **Parameters** | 9B – 32B |
-| **HF Org** | [zai-org](https://huggingface.co/zai-org) |
-:::
-
-## Available Models
-
-- **GLM-4-9B-Chat-HF** (`GlmForCausalLM`): 9B
-- **GLM-4-32B-0414** (`Glm4ForCausalLM`): 32B
-
-## Architectures
-
-- `GlmForCausalLM` — GLM-4 series
-- `Glm4ForCausalLM` — GLM-4-0414 series
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| GLM-4-9B-Chat-HF | [`zai-org/glm-4-9b-chat-hf`](https://huggingface.co/zai-org/glm-4-9b-chat-hf) |
-| GLM-4-32B-0414 | [`zai-org/GLM-4-32B-0414`](https://huggingface.co/zai-org/GLM-4-32B-0414) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`glm_4_9b_chat_hf_squad.yaml <../../../../examples/llm_finetune/glm/glm_4_9b_chat_hf_squad.yaml>` | SFT — GLM-4 9B on SQuAD |
-| {download}`glm_4_9b_chat_hf_hellaswag_fp8.yaml <../../../../examples/llm_finetune/glm/glm_4_9b_chat_hf_hellaswag_fp8.yaml>` | SFT — GLM-4 9B on HellaSwag with FP8 |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/glm/glm_4_9b_chat_hf_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/glm/glm_4_9b_chat_hf_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [zai-org/glm-4-9b-chat-hf](https://huggingface.co/zai-org/glm-4-9b-chat-hf)
-- [zai-org/GLM-4-32B-0414](https://huggingface.co/zai-org/GLM-4-32B-0414)
diff --git a/fern/versions/nightly/pages/model-coverage/llm/thudm/glm4.mdx b/docs/model-coverage/llm/thudm/glm4.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/thudm/glm4.mdx
rename to docs/model-coverage/llm/thudm/glm4.mdx
diff --git a/docs/model-coverage/llm/thudm/glm5-moe-dsa.md b/docs/model-coverage/llm/thudm/glm5-moe-dsa.md
deleted file mode 100644
index f7869f18a0..0000000000
--- a/docs/model-coverage/llm/thudm/glm5-moe-dsa.md
+++ /dev/null
@@ -1,123 +0,0 @@
-# GLM-5 / GLM-5.1 (MoE + DSA)
-
-[GLM-5](https://huggingface.co/zai-org/GLM-5) and [GLM-5.1](https://huggingface.co/zai-org/GLM-5.1) are Zhipu AI's latest open-source large Mixture-of-Experts models featuring a DeepSeek-style MLA (Multi-head Latent Attention) + DSA (DeepSeek Sparse Attention) architecture. GLM-5.1 shares the `glm_moe_dsa` architecture with GLM-5, with updated weights.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE) |
-| **Architecture** | `GlmMoeDsaForCausalLM` |
-| **Parameters** | 256 routed experts, 8 active per token |
-| **HF Org** | [zai-org](https://huggingface.co/zai-org) |
-:::
-
-## Key Features
-
-- **Mixture of Experts (MoE)**: 256 routed experts with 8 active per token
-- **78 layers**, hidden size 6144, with MLA using KV compression (kv_lora_rank=512) and head_dim=64
-- **~200k context window** (max_position_embeddings=202,752)
-- **3 dense layers** followed by MoE layers (first_k_dense_replace=3)
-
-## Available Models
-
-- **GLM-5** (`GlmMoeDsaForCausalLM`)
-- **GLM-5.1** (`GlmMoeDsaForCausalLM`): updated weights
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| GLM-5 | [`zai-org/GLM-5`](https://huggingface.co/zai-org/GLM-5) |
-| GLM-5.1 | [`zai-org/GLM-5.1`](https://huggingface.co/zai-org/GLM-5.1) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`glm_5_hellaswag_pp.yaml <../../../../examples/llm_finetune/glm/glm_5_hellaswag_pp.yaml>` | SFT — GLM-5 with EP=64, PP=4 on 32 nodes |
-| {download}`glm_5.1_hellaswag_pp.yaml <../../../../examples/llm_finetune/glm/glm_5.1_hellaswag_pp.yaml>` | SFT — GLM-5.1 with EP=64, PP=4 on 32 nodes |
-
-## Parallel Setup
-
-The recipe scales training using Expert Parallelism and Pipeline Parallelism (EP=64, PP=4 across 32 nodes of 8× H100 GPUs).
-
-```yaml
-distributed:
-  strategy: fsdp2
-  tp_size: 1
-  cp_size: 1
-  pp_size: 4
-  ep_size: 64
-  sequence_parallel: false
-  activation_checkpointing: true
-  pipeline:
-    pp_schedule: interleaved1f1b
-    pp_microbatch_size: 1
-    round_virtual_stages_to_pp_multiple: down
-    scale_grads_in_schedule: false
-    patch_inner_model: false
-    patch_causal_lm_model: false
-    layers_per_stage: 2
-  moe:
-    reshard_after_forward: false
-    wrap_outer_model: false
-```
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-:::{note}
-This recipe was validated on **32 nodes × 8 GPUs (256 H100s)**. See the [Launcher Guide](../../../launcher/slurm.md) for multi-node setup.
-:::
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/glm/glm_5_hellaswag_pp.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/glm/glm_5_hellaswag_pp.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md) and the [Large MoE Fine-Tuning Guide](../../../guides/llm/large-moe-finetune.md).
-
-## Hugging Face Model Cards
-
-- [zai-org/GLM-5](https://huggingface.co/zai-org/GLM-5)
-- [zai-org/GLM-5.1](https://huggingface.co/zai-org/GLM-5.1)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/thudm/glm5-moe-dsa.mdx b/docs/model-coverage/llm/thudm/glm5-moe-dsa.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/thudm/glm5-moe-dsa.mdx
rename to docs/model-coverage/llm/thudm/glm5-moe-dsa.mdx
diff --git a/docs/model-coverage/llm/tiiuae/falcon.md b/docs/model-coverage/llm/tiiuae/falcon.md
deleted file mode 100644
index 0f9d0b679b..0000000000
--- a/docs/model-coverage/llm/tiiuae/falcon.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# Falcon
-
-[Falcon](https://falconllm.tii.ae/) is a series of open language models from the Technology Innovation Institute (TII) in Abu Dhabi, known for being trained on a high-quality curated web corpus (RefinedWeb).
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `FalconForCausalLM` |
-| **Parameters** | 7B – 40B |
-| **HF Org** | [tiiuae](https://huggingface.co/tiiuae) |
-:::
-
-## Available Models
-
-- **Falcon-40B**, **Falcon-40B-Instruct**
-- **Falcon-7B**, **Falcon-7B-Instruct**
-- **Falcon-RW-7B**
-- **Falcon3-7B-Instruct**
-
-## Architecture
-
-- `FalconForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Falcon 7B | [`tiiuae/falcon-7b`](https://huggingface.co/tiiuae/falcon-7b) |
-| Falcon 40B | [`tiiuae/falcon-40b`](https://huggingface.co/tiiuae/falcon-40b) |
-| Falcon RW 7B | [`tiiuae/falcon-rw-7b`](https://huggingface.co/tiiuae/falcon-rw-7b) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`falcon3_7b_instruct_squad.yaml <../../../../examples/llm_finetune/falcon/falcon3_7b_instruct_squad.yaml>` | SFT — Falcon3 7B Instruct on SQuAD |
-| {download}`falcon3_7b_instruct_squad_peft.yaml <../../../../examples/llm_finetune/falcon/falcon3_7b_instruct_squad_peft.yaml>` | LoRA — Falcon3 7B Instruct on SQuAD |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/falcon/falcon3_7b_instruct_squad.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/falcon/falcon3_7b_instruct_squad.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b)
-- [tiiuae/falcon-40b](https://huggingface.co/tiiuae/falcon-40b)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/tiiuae/falcon.mdx b/docs/model-coverage/llm/tiiuae/falcon.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/tiiuae/falcon.mdx
rename to docs/model-coverage/llm/tiiuae/falcon.mdx
diff --git a/docs/model-coverage/llm/upstage/solar.md b/docs/model-coverage/llm/upstage/solar.md
deleted file mode 100644
index bcde2fcddc..0000000000
--- a/docs/model-coverage/llm/upstage/solar.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Solar Pro
-
-[Solar Pro](https://huggingface.co/upstage/solar-pro-preview-instruct) is an enterprise language model from Upstage, built on a depth up-scaling technique applied to Llama-based architectures.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation |
-| **Architecture** | `SolarForCausalLM` |
-| **Parameters** | 22B |
-| **HF Org** | [upstage](https://huggingface.co/upstage) |
-:::
-
-## Available Models
-
-- **solar-pro-preview-instruct**
-
-## Architecture
-
-- `SolarForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Solar Pro Preview Instruct | [`upstage/solar-pro-preview-instruct`](https://huggingface.co/upstage/solar-pro-preview-instruct) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base LLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (the recipes are under `examples/`):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [upstage/solar-pro-preview-instruct](https://huggingface.co/upstage/solar-pro-preview-instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/llm/upstage/solar.mdx b/docs/model-coverage/llm/upstage/solar.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/llm/upstage/solar.mdx
rename to docs/model-coverage/llm/upstage/solar.mdx
diff --git a/docs/model-coverage/llm/xiaomimimo/mimo-v2-flash.md b/docs/model-coverage/llm/xiaomimimo/mimo-v2-flash.md
deleted file mode 100644
index 58b18944b3..0000000000
--- a/docs/model-coverage/llm/xiaomimimo/mimo-v2-flash.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# MiMo-V2-Flash
-
-[MiMo-V2-Flash](https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash) is Xiaomi's
-hybrid attention Mixture-of-Experts language model. It alternates full and
-sliding-window attention layers, uses a `sigmoid_with_bias` router with
-group-limited expert routing, and ships as an FP8 HF checkpoint.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Text Generation (MoE, hybrid attention) |
-| **Architecture** | `MiMoV2FlashForCausalLM` |
-| **Parameters** | 309B total / 15B active |
-| **HF Org** | [XiaomiMiMo](https://huggingface.co/XiaomiMiMo) |
-:::
-
-## Available Models
-
-- **MiMo-V2-Flash**: hybrid full/sliding-window attention with FP8 weights.
-
-## Architecture
-
-- `MiMoV2FlashForCausalLM`
-- Sliding-window attention via the `MiMoV2FlashAttention(is_swa=True)` path.
-- MoE blocks use the shared `nemo_automodel.components.moe.layers.MoE`
-  with `score_func="sigmoid_with_bias"` and `gate_precision=fp32` so
-  routing decisions stay numerically stable when activations are bf16.
-- FP8 round-trip in `MiMoV2FlashStateDictAdapter` covers the bulk of
-  attention/expert weights; layer norms, the gate, `lm_head`, and
-  `embed_tokens` stay in bf16 per `NON_QUANTIZED_KEY_PATTERNS`.
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| MiMo-V2-Flash | [`XiaomiMiMo/MiMo-V2-Flash`](https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`mimo_v2_flash_hellaswag.yaml <../../../../examples/llm_finetune/mimo_v2_flash/mimo_v2_flash_hellaswag.yaml>` | SFT — MiMo-V2-Flash on HellaSwag |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/mimo_v2_flash/mimo_v2_flash_hellaswag.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2. Navigate to the AutoModel directory**:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/llm_finetune/mimo_v2_flash/mimo_v2_flash_hellaswag.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Fine-Tuning
-
-See the [LLM Fine-Tuning Guide](../../../guides/llm/finetune.md).
-
-## Hugging Face Model Cards
-
-- [XiaomiMiMo/MiMo-V2-Flash](https://huggingface.co/XiaomiMiMo/MiMo-V2-Flash)
diff --git a/fern/versions/nightly/pages/model-coverage/llm/xiaomimimo/mimo-v2-flash.mdx b/docs/model-coverage/llm/xiaomimimo/mimo-v2-flash.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/llm/xiaomimimo/mimo-v2-flash.mdx
rename to docs/model-coverage/llm/xiaomimimo/mimo-v2-flash.mdx
diff --git a/docs/model-coverage/omni/index.md b/docs/model-coverage/omni/index.md
deleted file mode 100644
index d494632840..0000000000
--- a/docs/model-coverage/omni/index.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Omni Models
-
-Omni models go beyond image-text understanding to support additional modalities such as audio, video, or a combination of all — text, image, audio, and video in a single unified model.
-
-## Run Omni Models with NeMo AutoModel
-
-To run omni models with NeMo AutoModel, use NeMo container version [`25.11.00`](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo-automodel?version=25.11.00) or later. If the model you want to fine-tune requires a newer version of Transformers, you may need to upgrade:
-
-```bash
-pip3 install --upgrade git+https://github.com/NVIDIA-NeMo/Automodel.git
-```
-
-For other installation options, see our [NeMo AutoModel Installation Guide](../../guides/installation.md).
-
-## Supported Models
-
-| Owner | Model | Modalities | Architecture |
-|---|---|---|---|
-| Qwen / Alibaba Cloud | [Qwen3-Omni](qwen/qwen3-omni.md) | Text · Image · Audio · Video | `Qwen3OmniForConditionalGeneration` |
-| Microsoft | [Phi-4-multimodal](microsoft/phi4-multimodal.md) | Text · Image · Audio | `Phi4MultimodalForCausalLM` |
-| NVIDIA | [Nemotron-3-Nano-Omni](nvidia/nemotron-omni.md) | Text · Image · Audio | `NemotronH_Nano_Omni_Reasoning_V3` |
-
-## Fine-Tune Omni Models
-
-All supported omni models can be fine-tuned using full SFT or PEFT (LoRA) approaches. See the [Omni Fine-Tuning Guide](../../guides/omni/gemma3-3n.md) for general setup instructions.
-
-```{toctree}
-:hidden:
-
-qwen/qwen3-omni
-microsoft/phi4-multimodal
-nvidia/nemotron-omni
-```
diff --git a/fern/versions/v0.4/pages/model-coverage/omni/index.mdx b/docs/model-coverage/omni/index.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/omni/index.mdx
rename to docs/model-coverage/omni/index.mdx
diff --git a/docs/model-coverage/omni/microsoft/phi4-multimodal.md b/docs/model-coverage/omni/microsoft/phi4-multimodal.md
deleted file mode 100644
index 6b3a27178d..0000000000
--- a/docs/model-coverage/omni/microsoft/phi4-multimodal.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Phi-4-multimodal
-
-[Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) is Microsoft's multimodal extension of Phi-4, supporting text, image, and audio inputs — making it suitable for speech, vision, and combined multimodal tasks.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Omnimodal (Text·Image·Audio) |
-| **Architecture** | `Phi4MultimodalForCausalLM` |
-| **Parameters** | 5.6B |
-| **HF Org** | [microsoft](https://huggingface.co/microsoft) |
-:::
-
-## Available Models
-
-- **Phi-4-multimodal-instruct**: 5.6B
-
-## Architecture
-
-- `Phi4MultimodalForCausalLM`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Phi-4-multimodal-instruct | [`microsoft/Phi-4-multimodal-instruct`](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`phi4_mm_cv17.yaml <../../../../examples/vlm_finetune/phi4/phi4_mm_cv17.yaml>` | CommonVoice 17 | SFT — Phi-4-multimodal on CommonVoice (audio-text) |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/phi4/phi4_mm_cv17.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/phi4/phi4_mm_cv17.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [Omni Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM / Omni Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/omni/microsoft/phi4-multimodal.mdx b/docs/model-coverage/omni/microsoft/phi4-multimodal.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/omni/microsoft/phi4-multimodal.mdx
rename to docs/model-coverage/omni/microsoft/phi4-multimodal.mdx
diff --git a/docs/model-coverage/omni/nvidia/nemotron-omni.md b/docs/model-coverage/omni/nvidia/nemotron-omni.md
deleted file mode 100644
index a77c917f0e..0000000000
--- a/docs/model-coverage/omni/nvidia/nemotron-omni.md
+++ /dev/null
@@ -1,55 +0,0 @@
-# Nemotron-3-Nano-Omni
-
-[Nemotron-3-Nano-Omni-30B-A3B-Reasoning](https://huggingface.co/nvidia) is NVIDIA's
-omnimodal reasoning model. It pairs a NemotronH (hybrid Mamba-2 + Attention) MoE
-language backbone with a RADIO v2.5-H vision encoder and a Parakeet (FastConformer)
-sound encoder, supporting interleaved text, image, and audio inputs.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Omnimodal (Text·Image·Audio) |
-| **Architecture** | `NemotronH_Nano_Omni_Reasoning_V3` |
-| **Parameters** | 30B total / 3B active |
-| **HF Org** | [nvidia](https://huggingface.co/nvidia) |
-:::
-
-## Available Models
-
-- **Nemotron-3-Nano-Omni-30B-A3B-Reasoning**: 30B total, 3B activated (MoE)
-
-## Architecture
-
-- `NemotronH_Nano_Omni_Reasoning_V3`
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`nemotron_omni_cord_v2.yaml <../../../../examples/vlm_finetune/nemotron_omni/nemotron_omni_cord_v2.yaml>` | CORD-v2 | Full SFT — receipt parsing |
-| {download}`nemotron_omni_cord_v2_peft.yaml <../../../../examples/vlm_finetune/nemotron_omni/nemotron_omni_cord_v2_peft.yaml>` | CORD-v2 | LoRA PEFT — receipt parsing |
-
-## Try with NeMo AutoModel
-
-**1. Install** ([NeMo AutoModel](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo (8x H100 example):
-
-```bash
-automodel examples/vlm_finetune/nemotron_omni/nemotron_omni_cord_v2.yaml --nproc-per-node 8
-```
-
-For a full walkthrough — dataset preparation, SFT vs. LoRA configs, and
-post-training inference — see the
-[Nemotron-Omni guide](../../../guides/vlm/nemotron-omni.md).
diff --git a/fern/versions/v0.4/pages/model-coverage/omni/nvidia/nemotron-omni.mdx b/docs/model-coverage/omni/nvidia/nemotron-omni.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/omni/nvidia/nemotron-omni.mdx
rename to docs/model-coverage/omni/nvidia/nemotron-omni.mdx
diff --git a/docs/model-coverage/omni/qwen/qwen3-omni.md b/docs/model-coverage/omni/qwen/qwen3-omni.md
deleted file mode 100644
index 456a74d4d7..0000000000
--- a/docs/model-coverage/omni/qwen/qwen3-omni.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Qwen3-Omni
-
-[Qwen3-Omni](https://qwenlm.github.io/blog/qwen3/) is Alibaba Cloud's omnimodal model supporting text, image, audio, and video inputs in a single unified architecture with a MoE language backbone.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Omnimodal (Text·Image·Audio·Video) |
-| **Architecture** | `Qwen3OmniForConditionalGeneration` |
-| **Parameters** | 30B total / 3B active |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen3-Omni-30B-A3B-Instruct**: 30B total, 3B activated (MoE)
-
-## Architecture
-
-- `Qwen3OmniForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Qwen3-Omni 30B A3B Instruct | [`Qwen/Qwen3-Omni-30B-A3B-Instruct`](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`qwen3_omni_moe_30b_te_deepep.yaml <../../../../examples/vlm_finetune/qwen3/qwen3_omni_moe_30b_te_deepep.yaml>` | MedPix-VQA | SFT — Qwen3-Omni 30B with TE + DeepEP |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/qwen3/qwen3_omni_moe_30b_te_deepep.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/qwen3/qwen3_omni_moe_30b_te_deepep.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [Omni Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM / Omni Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [Qwen/Qwen3-Omni-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct)
diff --git a/fern/versions/nightly/pages/model-coverage/omni/qwen/qwen3-omni.mdx b/docs/model-coverage/omni/qwen/qwen3-omni.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/omni/qwen/qwen3-omni.mdx
rename to docs/model-coverage/omni/qwen/qwen3-omni.mdx
diff --git a/docs/model-coverage/overview.md b/docs/model-coverage/overview.md
deleted file mode 100644
index f0efd2c64a..0000000000
--- a/docs/model-coverage/overview.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Model Coverage Overview
-
-NeMo AutoModel integrates with Hugging Face `transformers`. Any LLM or VLM that can be instantiated through `transformers` can also be used using NeMo AutoModel, subject to runtime, third-party software dependencies, and feature compatibility.
-
-## Supported Hugging Face Auto Classes
-
-| Auto Class | Task | Status | Details |
-|------------|------|--------|---------|
-| `AutoModelForCausalLM` | Text Generation (LLM) | Supported | See [LLM model list](llm/index.md). |
-| `AutoModelForImageTextToText` | Image-Text-to-Text (VLM) | Supported | See [VLM model list](vlm/index.md). |
-| `AutoModelForSequenceClassification` | Sequence Classification | WIP | Early support; interfaces may change. |
-| Diffusers Pipelines | Diffusion Generation (T2I, T2V) | Supported | See [Diffusion model list](diffusion/index.md). |
-| `NeMoAutoModelBiEncoder` | Embedding Models | Supported | See [Embedding model list](embedding/index.md). |
-| `NeMoAutoModelCrossEncoder` | Reranking Models | Supported | See [Reranking model list](reranker/index.md). |
-
-## Release Log
-
-The table below tracks when model support and key features were added across NeMo AutoModel releases. For the full list of tested architectures and example configs, see the [LLM](llm/index.md) and [VLM](vlm/index.md) pages.
-
-| Release | Date | New Models | Key Features |
-|---------|------|------------|--------------|
-| **0.3.0** (upcoming) | — | Kimi-VL, Kimi-K25-VL, Gemma 3n, Nemotron-Parse, Qwen3-VL-MoE, Qwen3-Omni, InternVL 3.5, Ministral3, Phi-4-multimodal, Devstral-Small-2, Step-3.5-Flash, Qwen3-Next, Nemotron-3-Nano-30B, FLUX.1-dev, Wan 2.1 T2V, HunyuanVideo 1.5 | MoE LoRA, expanded VLM coverage, diffusion model training (flow matching) |
-| **0.2.0** | Dec 2025 | GPT-OSS 20B/120B, Qwen3, Qwen3-MoE, GLM-4/4-MoE, Qwen2.5-VL, Qwen3-VL | Single- and multi-turn tool calling, streaming dataset, QAT for SFT, sequence classification, async DCP checkpointing, MLflow, CP + sequence packing for MoE |
-| **0.1.0** | Oct 2025 | DeepSeek V3/V3.2, 40+ LLM architectures, Gemma 3 VLM | Pretraining, knowledge distillation, FP8 (torchao), pipeline parallelism, HSDP, auto pipelining, ColumnMapped dataset |
-| **0.1.0a0** | Sep 2025 | Initial LLM and VLM support (Llama, Mistral, Qwen2, Gemma, Phi, and more) | MegatronFSDP, packed sequences, Triton LoRA kernels |
-
-
-## Day-0 Support
-
-- NeMo AutoModel closely tracks the latest `transformers` version and updates its dependency regularly.
-- New models released on the Hugging Face Hub may require the latest `transformers` version, necessitating a package upgrade.
-- We are working on a CI pipeline that automatically bumps the supported `transformers` version when a new release is detected, enabling even faster day-0 support.
-
-
-## Custom Model Registry
-
-NeMo AutoModel includes a custom model registry that allows teams to:
-
-- Add custom implementations to extend support to models not yet covered upstream.
-- Provide optimized or faster implementations for specific models while retaining the same AutoModel interface.
-
-## Having Issues?
-
-If a model from the Hub doesn't work as expected, see the [Troubleshooting Guide](troubleshooting.md) for common issues and solutions.
diff --git a/fern/versions/v0.4/pages/model-coverage/overview.mdx b/docs/model-coverage/overview.mdx
similarity index 87%
rename from fern/versions/v0.4/pages/model-coverage/overview.mdx
rename to docs/model-coverage/overview.mdx
index 5d8e72f282..fa7f939008 100644
--- a/fern/versions/v0.4/pages/model-coverage/overview.mdx
+++ b/docs/model-coverage/overview.mdx
@@ -3,7 +3,7 @@ title: "Model Coverage Overview"
 description: ""
 position: 1
 ---
-NeMo AutoModel integrates with Hugging Face `transformers`. Any LLM or VLM that can be instantiated through `transformers` can also be used via NeMo AutoModel, subject to runtime, third-party software dependencies, and feature compatibility.
+NeMo AutoModel integrates with Hugging Face `transformers`. Any LLM or VLM that can be instantiated through `transformers` can also be used with NeMo AutoModel, subject to runtime, third-party software dependencies, and feature compatibility.
 
 ## Supported Hugging Face Auto Classes
 
@@ -13,6 +13,8 @@ NeMo AutoModel integrates with Hugging Face `transformers`. Any LLM or VLM that
 | `AutoModelForImageTextToText` | Image-Text-to-Text (VLM) | Supported | See [VLM model list](/model-coverage/vision-language-models/overview). |
 | `AutoModelForSequenceClassification` | Sequence Classification | WIP | Early support; interfaces may change. |
 | Diffusers Pipelines | Diffusion Generation (T2I, T2V) | Supported | See [Diffusion model list](/model-coverage/diffusion/overview). |
+| `NeMoAutoModelBiEncoder` | Embedding Models | Supported | See [Embedding model list](/model-coverage/embedding-models/overview). |
+| `NeMoAutoModelCrossEncoder` | Reranking Models | Supported | See [Reranking model list](/model-coverage/reranking-models/overview). |
 
 ## Release Log
 
@@ -31,7 +33,6 @@ The table below tracks when model support and key features were added across NeM
 - New models released on the Hugging Face Hub may require the latest `transformers` version, necessitating a package upgrade.
 - We are working on a CI pipeline that automatically bumps the supported `transformers` version when a new release is detected, enabling even faster day-0 support.
 
-**Note:** To use newly released models, you may need to upgrade your NeMo AutoModel installation — just as you would upgrade `transformers` to access the latest models. AutoModel mirrors the familiar `transformers` `Auto*` APIs while adding optional performance accelerations and distributed training features.
 
 ## Custom Model Registry
 
diff --git a/docs/model-coverage/reranker/index.md b/docs/model-coverage/reranker/index.mdx
similarity index 69%
rename from docs/model-coverage/reranker/index.md
rename to docs/model-coverage/reranker/index.mdx
index 3a663e095d..fc22d03f88 100644
--- a/docs/model-coverage/reranker/index.md
+++ b/docs/model-coverage/reranker/index.mdx
@@ -1,18 +1,20 @@
-(reranking-models)=
-
-# Reranking Models
+---
+title: "Reranking Models"
+description: ""
+position: 1
+---
 
 ## Introduction
 
 Reranking models use cross-encoders to score a query-document pair jointly. They are typically used after an embedding model has produced an initial candidate set. NeMo AutoModel supports optimized bidirectional Llama rerankers and falls back to Hugging Face `AutoModelForSequenceClassification` for other architectures.
 
-For first-stage dense retrieval, see [Embedding Models](../embedding/index.md).
+For first-stage dense retrieval, see [Embedding Models](/model-coverage/embedding-models/overview).
 
 ## Optimized Backbones (Bidirectional Attention)
 
 | Owner | Model | Architecture | Wrapper Class | Tasks |
 |---|---|---|---|---|
-| NVIDIA | [llama-nemotron-rerank-1b-v2](nvidia/llama-bidirectional.md) | `LlamaBidirectionalForSequenceClassification` | `NeMoAutoModelCrossEncoder` | Reranking |
+| NVIDIA | [llama-nemotron-rerank-1b-v2](/model-coverage/reranking-models/llama-bidirectional) | `LlamaBidirectionalForSequenceClassification` | `NeMoAutoModelCrossEncoder` | Reranking |
 
 ## Hugging Face Auto Backbones
 
@@ -25,16 +27,10 @@ Any Hugging Face model loadable using `AutoModelForSequenceClassification` can b
 
 ## Dataset
 
-Retrieval fine-tuning requires query-document pairs: each example is a query paired with one positive document and one or more negative documents. Both inline JSONL and corpus ID-based JSON formats are supported. See the [Retrieval Dataset](../../guides/llm/retrieval-dataset.md) guide.
+Retrieval fine-tuning requires query-document pairs: each example is a query paired with one positive document and one or more negative documents. Both inline JSONL and corpus ID-based JSON formats are supported. See the [Retrieval Dataset](/datasets/retrieval-dataset) guide.
 
-<!-- TODO: uncomment when finetune guide is published.
+{/* TODO: uncomment when finetune guide is published.
 ## Train Reranking Models
 
-For a complete walkthrough of training configuration, model-specific settings, and launch commands, see the [Embedding and Reranking Fine-Tuning Guide](../../guides/retrieval/finetune.md).
--->
-
-```{toctree}
-:hidden:
-
-nvidia/llama-bidirectional
-```
+For a complete walkthrough of training configuration, model-specific settings, and launch commands, see the [Embedding and Reranking Fine-Tuning Guide](/recipes-e2e-examples/retrieval-finetune).
+*/}
diff --git a/docs/model-coverage/reranker/nvidia/llama-bidirectional.md b/docs/model-coverage/reranker/nvidia/llama-bidirectional.mdx
similarity index 81%
rename from docs/model-coverage/reranker/nvidia/llama-bidirectional.md
rename to docs/model-coverage/reranker/nvidia/llama-bidirectional.mdx
index 7ae40d502a..110b811236 100644
--- a/docs/model-coverage/reranker/nvidia/llama-bidirectional.md
+++ b/docs/model-coverage/reranker/nvidia/llama-bidirectional.mdx
@@ -1,17 +1,18 @@
-# Llama (Bidirectional) for Reranking
+---
+title: "Llama (Bidirectional) for Reranking"
+description: ""
+---
 
 NeMo AutoModel provides a bidirectional variant of [Meta's Llama](https://www.llama.com/) for reranking tasks. Unlike the standard causal (left-to-right) Llama used for text generation, this variant uses **bidirectional attention**, allowing the query and document to interact across the full sequence before a classification head produces a relevance score.
 
-For the bi-encoder variant, see [Llama (Bidirectional) for Embedding](../../embedding/nvidia/llama-bidirectional.md).
+For the bi-encoder variant, see [Llama (Bidirectional) for Embedding](/model-coverage/embedding-models/llama-bidirectional).
 
-:::{card}
 | | |
 |---|---|
 | **Tasks** | Reranking |
 | **Architecture** | `LlamaBidirectionalForSequenceClassification` |
 | **Parameters** | 1B – 8B |
 | **HF Org** | [meta-llama](https://huggingface.co/meta-llama) |
-:::
 
 ## Available Models
 
@@ -39,11 +40,11 @@ The cross-encoder path is used for pairwise relevance scoring and reranking.
 
 | Recipe | Description |
 |---|---|
-| {download}`llama3_2_1b.yaml <../../../../examples/retrieval/cross_encoder/llama3_2_1b.yaml>` | Cross-encoder — Llama 3.2 1B reranker |
+| [llama3_2_1b.yaml](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/cross_encoder/llama3_2_1b.yaml) | Cross-encoder — Llama 3.2 1B reranker |
 
 ## Try with NeMo AutoModel
 
-**1. Install NeMo AutoModel**. Refer to the ([Installation Guide](../../../guides/installation.md)) for information:
+**1. Install NeMo AutoModel**. Refer to the [Installation Guide](/get-started/installation) for details:
 
 ```bash
 uv pip install nemo-automodel
@@ -62,7 +63,7 @@ cd Automodel
 torchrun --nproc-per-node=8 examples/retrieval/cross_encoder/finetune.py --config examples/retrieval/cross_encoder/llama3_2_1b.yaml
 ```
 
-:::{dropdown} Run with Docker
+<Accordion title="Run with Docker">
 **1. Pull the container** and mount a checkpoint directory:
 
 ```bash
@@ -83,15 +84,15 @@ cd /opt/Automodel
 ```bash
 torchrun --nproc-per-node=8 examples/retrieval/cross_encoder/finetune.py --config examples/retrieval/cross_encoder/llama3_2_1b.yaml
 ```
-:::
+</Accordion>
 
-See the [Installation Guide](../../../guides/installation.md).
+See the [Installation Guide](/get-started/installation).
 
-<!-- TODO: uncomment when finetune guide is published.
+{/* TODO: uncomment when finetune guide is published.
 ## Fine-Tuning
 
-See the [Embedding and Reranking Fine-Tuning Guide](../../../guides/retrieval/finetune.md) for cross-encoder training instructions, including LoRA/PEFT configuration.
--->
+See the [Embedding and Reranking Fine-Tuning Guide](/recipes-e2e-examples/retrieval-finetune) for cross-encoder training instructions, including LoRA/PEFT configuration.
+*/}
 
 ## Hugging Face Model Cards
 
diff --git a/docs/model-coverage/troubleshooting.md b/docs/model-coverage/troubleshooting.md
deleted file mode 100644
index cbaf8fc08b..0000000000
--- a/docs/model-coverage/troubleshooting.md
+++ /dev/null
@@ -1,23 +0,0 @@
-:orphan:
-
-# Troubleshooting Unsupported Models
-
-Sometimes a model listed on the Hugging Face Hub may not work with NeMo AutoModel.
-If you encounter any such model, please open a [GitHub issue](https://github.com/NVIDIA-NeMo/Automodel/issues) with the model ID and any stack trace you see.
-
-## Common Issues
-
-| Issue | Example Error | Solution |
-|-------|---------------|----------|
-| Model has explicitly disabled training in its definition code | — | Request support via a [GitHub issue](https://github.com/NVIDIA-NeMo/Automodel/issues). We can add the model through our custom registry. |
-| Model requires a newer `transformers` version | `The checkpoint you are trying to load has model type deepseek_v32 but Transformers does not recognize this architecture.` | Upgrade `transformers` (and NeMo AutoModel if needed), or open a GitHub issue. |
-| Model upper-bounds `transformers`, requiring an older version | — | Open a [GitHub issue](https://github.com/NVIDIA-NeMo/Automodel/issues). |
-| Unsupported checkpoint format | `OSError: meta-llama/Llama-2-70b does not appear to have a file named pytorch_model.bin, model.safetensors, ...` | Open a GitHub issue or request the model publisher to share a SafeTensors checkpoint. |
-
-These cases typically stem from upstream packaging or dependency constraints. You would encounter the same issues when using `transformers` directly, as AutoModel mirrors the familiar load and fine-tune semantics.
-
-## Steps to Try
-
-1. **Upgrade NeMo AutoModel** to a release that supports the required `transformers` version. See [Installation](../guides/installation.md).
-2. **Enable remote code** — if the model uses custom code, set `trust_remote_code: true` in your `model:` config. See [Hugging Face API Compatibility](../guides/huggingface-api-compatibility.md).
-3. **Open a GitHub issue** with the model ID and error so we can prioritize support or add a registry-backed implementation.
diff --git a/fern/versions/v0.4/pages/model-coverage/troubleshooting.mdx b/docs/model-coverage/troubleshooting.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/troubleshooting.mdx
rename to docs/model-coverage/troubleshooting.mdx
diff --git a/docs/model-coverage/vlm/google/gemma3-vl.md b/docs/model-coverage/vlm/google/gemma3-vl.md
deleted file mode 100644
index 16b357b7d3..0000000000
--- a/docs/model-coverage/vlm/google/gemma3-vl.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# Gemma 3 VL / Gemma 3n
-
-[Gemma 3 VL](https://ai.google.dev/gemma/docs/core) is Google's multimodal extension of Gemma 3, supporting image-text inputs for tasks like image captioning and visual question answering. Gemma 3n is a next-generation efficiency-focused variant.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `Gemma3ForConditionalGeneration` |
-| **Parameters** | 4B – 27B |
-| **HF Org** | [google](https://huggingface.co/google) |
-:::
-
-## Available Models
-
-- **Gemma 3 27B IT** (VL)
-- **Gemma 3 4B IT** (VL)
-- **Gemma 3n 4B** (VL)
-
-## Architecture
-
-- `Gemma3ForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Gemma 3 4B IT | [`google/gemma-3-4b-it`](https://huggingface.co/google/gemma-3-4b-it) |
-| Gemma 3 27B IT | [`google/gemma-3-27b-it`](https://huggingface.co/google/gemma-3-27b-it) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`gemma3_vl_4b_cord_v2.yaml <../../../../examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml>` | cord-v2 | SFT — Gemma 3 4B VL on CORD-v2 |
-| {download}`gemma3_vl_4b_cord_v2_peft.yaml <../../../../examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2_peft.yaml>` | cord-v2 | LoRA — Gemma 3 4B VL on CORD-v2 |
-| {download}`gemma3_vl_4b_cord_v2_megatron_fsdp.yaml <../../../../examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2_megatron_fsdp.yaml>` | cord-v2 | SFT — Gemma 3 4B VL with MegatronFSDP |
-| {download}`gemma3_vl_4b_medpix.yaml <../../../../examples/vlm_finetune/gemma3/gemma3_vl_4b_medpix.yaml>` | MedPix-VQA | SFT — Gemma 3 4B VL on MedPix |
-| {download}`gemma3n_vl_4b_medpix.yaml <../../../../examples/vlm_finetune/gemma3n/gemma3n_vl_4b_medpix.yaml>` | MedPix-VQA | SFT — Gemma 3n 4B VL on MedPix |
-| {download}`gemma3n_vl_4b_medpix_peft.yaml <../../../../examples/vlm_finetune/gemma3n/gemma3n_vl_4b_medpix_peft.yaml>` | MedPix-VQA | LoRA — Gemma 3n 4B VL on MedPix |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [Gemma 3 & Gemma 3n Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md) for detailed instructions on dataset preparation, configuration, and multi-GPU training.
-
-## Hugging Face Model Cards
-
-- [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it)
-- [google/gemma-3-27b-it](https://huggingface.co/google/gemma-3-27b-it)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/google/gemma3-vl.mdx b/docs/model-coverage/vlm/google/gemma3-vl.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/google/gemma3-vl.mdx
rename to docs/model-coverage/vlm/google/gemma3-vl.mdx
diff --git a/docs/model-coverage/vlm/google/gemma4.md b/docs/model-coverage/vlm/google/gemma4.md
deleted file mode 100644
index 27597692d1..0000000000
--- a/docs/model-coverage/vlm/google/gemma4.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# Gemma 4
-
-[Gemma 4](https://ai.google.dev/gemma) is Google's next-generation multimodal Gemma family, supporting image-text inputs with a Mixture-of-Experts (MoE) language backbone at larger scales. NeMo AutoModel replaces the HF-native dense matmul over experts with the NeMo `GroupedExperts` backend, enabling Expert Parallelism (EP) via the standard MoE parallelizer.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `Gemma4ForConditionalGeneration` |
-| **Parameters** | 2B – 31B (dense) · 26B-A4B (MoE) |
-| **HF Org** | [google](https://huggingface.co/google) |
-:::
-
-## Available Models
-
-- **Gemma 4 E2B IT** (VL, dense)
-- **Gemma 4 E4B IT** (VL, dense, kv-shared layers)
-- **Gemma 4 31B IT** (VL, dense)
-- **Gemma 4 26B-A4B IT** (VL, MoE)
-
-## Architecture
-
-- `Gemma4ForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Gemma 4 E2B IT | [`google/gemma-4-E2B-it`](https://huggingface.co/google/gemma-4-E2B-it) |
-| Gemma 4 E4B IT | [`google/gemma-4-E4B-it`](https://huggingface.co/google/gemma-4-E4B-it) |
-| Gemma 4 31B IT | [`google/gemma-4-31B-it`](https://huggingface.co/google/gemma-4-31B-it) |
-| Gemma 4 26B-A4B IT (MoE) | [`google/gemma-4-26B-A4B-it`](https://huggingface.co/google/gemma-4-26B-A4B-it) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`gemma4_2b.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_2b.yaml>` | SFT — Gemma 4 E2B on MedPix |
-| {download}`gemma4_2b_peft.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_2b_peft.yaml>` | LoRA — Gemma 4 E2B on MedPix |
-| {download}`gemma4_4b.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_4b.yaml>` | SFT — Gemma 4 E4B on MedPix |
-| {download}`gemma4_4b_peft.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_4b_peft.yaml>` | LoRA — Gemma 4 E4B on MedPix |
-| {download}`gemma4_31b.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_31b.yaml>` | SFT — Gemma 4 31B on MedPix |
-| {download}`gemma4_31b_peft.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_31b_peft.yaml>` | LoRA — Gemma 4 31B on MedPix |
-| {download}`gemma4_31b_tp4.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_31b_tp4.yaml>` | SFT — Gemma 4 31B with TP=4 |
-| {download}`gemma4_31b_tp4_pp2.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_31b_tp4_pp2.yaml>` | SFT — Gemma 4 31B with TP=4, PP=2 |
-| {download}`gemma4_31b_tp4_pp4.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_31b_tp4_pp4.yaml>` | SFT — Gemma 4 31B with TP=4, PP=4 (multi-node) |
-| {download}`gemma4_26b_a4b_moe.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_26b_a4b_moe.yaml>` | SFT — Gemma 4 26B-A4B MoE on MedPix |
-| {download}`gemma4_26b_a4b_moe_peft.yaml <../../../../examples/vlm_finetune/gemma4/gemma4_26b_a4b_moe_peft.yaml>` | LoRA — Gemma 4 26B-A4B MoE on MedPix |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma4/gemma4_4b.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma4/gemma4_4b.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [google/gemma-4-E2B-it](https://huggingface.co/google/gemma-4-E2B-it)
-- [google/gemma-4-E4B-it](https://huggingface.co/google/gemma-4-E4B-it)
-- [google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it)
-- [google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/google/gemma4.mdx b/docs/model-coverage/vlm/google/gemma4.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/google/gemma4.mdx
rename to docs/model-coverage/vlm/google/gemma4.mdx
diff --git a/docs/model-coverage/vlm/huggingface/smolvlm.md b/docs/model-coverage/vlm/huggingface/smolvlm.md
deleted file mode 100644
index 90397a5a02..0000000000
--- a/docs/model-coverage/vlm/huggingface/smolvlm.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# SmolVLM
-
-[SmolVLM](https://huggingface.co/blog/smolvlm) is HuggingFace's compact vision language model designed for on-device and memory-constrained deployment, featuring an efficient image token compression strategy.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `SmolVLMForConditionalGeneration` |
-| **Parameters** | 256M – 2B |
-| **HF Org** | [HuggingFaceTB](https://huggingface.co/HuggingFaceTB) |
-:::
-
-## Available Models
-
-- **SmolVLM-Instruct**: 2B
-- **SmolVLM-256M-Instruct**: 256M
-
-## Architecture
-
-- `SmolVLMForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| SmolVLM Instruct | [`HuggingFaceTB/SmolVLM-Instruct`](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) |
-| SmolVLM 256M Instruct | [`HuggingFaceTB/SmolVLM-256M-Instruct`](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base VLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/huggingface/smolvlm.mdx b/docs/model-coverage/vlm/huggingface/smolvlm.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/huggingface/smolvlm.mdx
rename to docs/model-coverage/vlm/huggingface/smolvlm.mdx
diff --git a/docs/model-coverage/vlm/index.md b/docs/model-coverage/vlm/index.md
deleted file mode 100644
index 54157cf8a8..0000000000
--- a/docs/model-coverage/vlm/index.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# Vision Language Models (VLMs)
-
-## Introduction
-
-Vision Language Models (VLMs) integrate vision and language processing capabilities, enabling models to understand images and generate text descriptions, answer visual questions, and perform multimodal reasoning.
-
-NeMo AutoModel LLM APIs can be easily extended to support VLM tasks. While most of the training setup is the same as for LLMs, some additional steps are required to prepare the data and model for VLM training.
-
-## Run VLMs with NeMo AutoModel
-
-To run VLMs with NeMo AutoModel, use NeMo container version [`25.11.00`](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo-automodel?version=25.11.00) or later. If the model you want to fine-tune requires a newer version of Transformers, you may need to upgrade:
-
-```bash
-pip3 install --upgrade git+git@github.com:NVIDIA-NeMo/AutoModel.git
-```
-
-For other installation options, see our [Installation Guide](../../guides/installation.md).
-
-## Supported Models
-
-NeMo AutoModel supports [AutoModelForImageTextToText](https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForImageTextToText) in the [Image-Text-to-Text](https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending) category.
-
-| Owner | Model | Architectures |
-|---|---|---|
-| Moonshot AI | [Kimi-VL](moonshotai/kimi-vl.md) | `KimiVLForConditionalGeneration` |
-| Google | [Gemma 3 VL / Gemma 3n](google/gemma3-vl.md) | `Gemma3ForConditionalGeneration` |
-| Google | [Gemma 4](google/gemma4.md) | `Gemma4ForConditionalGeneration` |
-| Qwen / Alibaba Cloud | [Qwen2.5-VL](qwen/qwen2-5-vl.md) | `Qwen2VLForConditionalGeneration`, `Qwen2_5VLForConditionalGeneration` |
-| Qwen / Alibaba Cloud | [Qwen3-VL / Qwen3-VL-MoE](qwen/qwen3-vl.md) | `Qwen3VLForConditionalGeneration` |
-| Qwen / Alibaba Cloud | [Qwen3.5-VL](qwen/qwen3-5-vl.md) | `Qwen3_5VLForConditionalGeneration`, `Qwen3_5MoeVLForConditionalGeneration` |
-| NVIDIA | [Nemotron-Parse](nvidia/nemotron-parse.md) | `NemotronParseForConditionalGeneration` |
-| Mistral AI | [Ministral3 VL](mistralai/ministral3-vl.md) | `Mistral3ForConditionalGeneration` |
-| Mistral AI | [Mistral-Small-4](mistralai/mistral-small-4.md) | `MistralForConditionalGeneration` |
-| Mistral AI | [Mistral Medium 3.5](mistralai/mistral-medium-3-5.md) | `Mistral3ForConditionalGeneration` (FP8) |
-| InternLM / Shanghai AI Lab | [InternVL](internlm/internvl.md) | `InternVLForConditionalGeneration` |
-| Meta | [Llama 4](meta/llama4.md) | `Llama4ForConditionalGeneration` |
-| HuggingFace | [SmolVLM](huggingface/smolvlm.md) | `SmolVLMForConditionalGeneration` |
-| LLaVA | [LLaVA](llava-hf/llava.md) | `LlavaForConditionalGeneration`, `LlavaNextForConditionalGeneration`, `LlavaNextVideoForConditionalGeneration`, `LlavaOnevisionForConditionalGeneration` |
-| lmms-lab | [LLaVA-OneVision 1.5](lmms-lab/llava-onevision.md) | `LlavaOneVisionForConditionalGeneration` |
-
-## Fine-Tuning
-
-All supported models can be fine-tuned using either full SFT or PEFT (LoRA) approaches. See the [Gemma 3 Fine-Tuning Guide](../../guides/omni/gemma3-3n.md) for a complete walkthrough covering dataset preparation, configuration, and multi-GPU training.
-
-:::{tip}
-In these guides, we use the `quintend/rdr-items` and `naver-clova-ix/cord-v2` datasets for demonstration purposes. Update the recipe YAML `dataset` section to use your own data. See [VLM datasets](../../guides/vlm/dataset.md) and [dataset overview](../../guides/dataset-overview.md).
-:::
-
-```{toctree}
-:hidden:
-
-moonshotai/kimi-vl
-google/gemma3-vl
-google/gemma4
-qwen/qwen2-5-vl
-qwen/qwen3-vl
-qwen/qwen3-5-vl
-nvidia/nemotron-parse
-mistralai/ministral3-vl
-mistralai/mistral-small-4
-mistralai/mistral-medium-3-5
-internlm/internvl
-meta/llama4
-huggingface/smolvlm
-llava-hf/llava
-lmms-lab/llava-onevision
-```
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/index.mdx b/docs/model-coverage/vlm/index.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/index.mdx
rename to docs/model-coverage/vlm/index.mdx
diff --git a/docs/model-coverage/vlm/internlm/internvl.md b/docs/model-coverage/vlm/internlm/internvl.md
deleted file mode 100644
index 0f6da2ae0f..0000000000
--- a/docs/model-coverage/vlm/internlm/internvl.md
+++ /dev/null
@@ -1,88 +0,0 @@
-# InternVL
-
-[InternVL](https://github.com/OpenGVLab/InternVL) is a vision language model from Shanghai AI Laboratory (OpenGVLab), combining a large vision encoder with an InternLM language backbone for strong multimodal performance.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `InternVLForConditionalGeneration` |
-| **Parameters** | 4B – 8B |
-| **HF Org** | [OpenGVLab](https://huggingface.co/OpenGVLab) |
-:::
-
-## Available Models
-
-- **InternVL3.5-4B**
-- **InternVL3.5-8B**
-
-## Architecture
-
-- `InternVLForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| InternVL3.5 4B | [`OpenGVLab/InternVL3_5-4B`](https://huggingface.co/OpenGVLab/InternVL3_5-4B) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`internvl_3_5_4b.yaml <../../../../examples/vlm_finetune/internvl/internvl_3_5_4b.yaml>` | MedPix-VQA | SFT — InternVL3.5 4B on MedPix |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/internvl/internvl_3_5_4b.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/internvl/internvl_3_5_4b.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [OpenGVLab/InternVL3_5-4B](https://huggingface.co/OpenGVLab/InternVL3_5-4B)
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/internlm/internvl.mdx b/docs/model-coverage/vlm/internlm/internvl.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/internlm/internvl.mdx
rename to docs/model-coverage/vlm/internlm/internvl.mdx
diff --git a/docs/model-coverage/vlm/llava-hf/llava.md b/docs/model-coverage/vlm/llava-hf/llava.md
deleted file mode 100644
index 3c9cc28e0c..0000000000
--- a/docs/model-coverage/vlm/llava-hf/llava.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# LLaVA
-
-[LLaVA](https://llava-vl.github.io/) (Large Language and Vision Assistant) is a pioneering open-source multimodal model connecting a vision encoder to a language model via a projection layer. Multiple versions and variants are supported via the `llava-hf` organization on Hugging Face.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `LlavaForConditionalGeneration` / `LlavaNextForConditionalGeneration` |
-| **Parameters** | 7B – 34B |
-| **HF Org** | [llava-hf](https://huggingface.co/llava-hf) |
-:::
-
-## Available Models
-
-- **LLaVA-1.5** (`LlavaForConditionalGeneration`): 7B, 13B
-- **LLaVA-1.6 / LLaVA-NeXT** (`LlavaNextForConditionalGeneration`): 7B, 34B
-- **LLaVA-NeXT-Video** (`LlavaNextVideoForConditionalGeneration`): 7B
-- **LLaVA-OneVision** (`LlavaOnevisionForConditionalGeneration`): 7B
-
-## Architectures
-
-- `LlavaForConditionalGeneration` — LLaVA 1.5
-- `LlavaNextForConditionalGeneration` — LLaVA-NeXT / 1.6
-- `LlavaNextVideoForConditionalGeneration` — LLaVA-NeXT-Video
-- `LlavaOnevisionForConditionalGeneration` — LLaVA-OneVision
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| LLaVA 1.5 7B | [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) |
-| LLaVA 1.5 13B | [`llava-hf/llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) |
-| LLaVA-NeXT Mistral 7B | [`llava-hf/llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) |
-| LLaVA-NeXT 34B | [`llava-hf/llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) |
-| LLaVA-NeXT-Video 7B | [`llava-hf/LLaVA-NeXT-Video-7B-hf`](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) |
-| LLaVA-OneVision 7B | [`llava-hf/llava-onevision-qwen2-7b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base VLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
-- [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
-- [llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/llava-hf/llava.mdx b/docs/model-coverage/vlm/llava-hf/llava.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/llava-hf/llava.mdx
rename to docs/model-coverage/vlm/llava-hf/llava.mdx
diff --git a/docs/model-coverage/vlm/lmms-lab/llava-onevision.md b/docs/model-coverage/vlm/lmms-lab/llava-onevision.md
deleted file mode 100644
index 9d7031232d..0000000000
--- a/docs/model-coverage/vlm/lmms-lab/llava-onevision.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# LLaVA-OneVision 1.5
-
-[LLaVA-OneVision 1.5](https://github.com/EvolvingLMMs-Lab/LLaVA-OneVision-2) is a vision-language model combining a **Rice ViT** encoder with a **Qwen3** language backbone, capable of handling both image and video understanding. NeMo AutoModel ships a custom NVIDIA implementation (`LlavaOneVisionForConditionalGeneration`) with FSDP2/HSDP support, LoRA fine-tuning and distributed training.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `LlavaOneVisionForConditionalGeneration` |
-| **Parameters** | 4B · 8B |
-| **HF Org** | [lmms-lab](https://huggingface.co/lmms-lab) |
-:::
-
-## Available Models
-
-- **LLaVA-OneVision 1.5 4B**: Qwen3 4B text backbone + Rice ViT (1024 hidden, 24 layers)
-- **LLaVA-OneVision 1.5 8B**: Qwen3 8B text backbone + Rice ViT (1024 hidden, 24 layers)
-
-## Architecture
-
-- `LlavaOneVisionForConditionalGeneration`
-
-Vision tower is the **Rice Transformer**: 14×14 patch embed with 2D RoPE, standard Transformer blocks (LayerNorm + Attention + MLP), and a 2×2 spatial Patch Merger that projects to the language-model hidden size.
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| LLaVA-OneVision-1.5 4B Instruct | [`lmms-lab/LLaVA-OneVision-1.5-4B-Instruct`](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct) |
-| LLaVA-OneVision-1.5 8B Instruct | [`lmms-lab/LLaVA-OneVision-1.5-8B-Instruct`](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct) |
-
-## Example Recipes
-
-| Recipe | Description |
-|---|---|
-| {download}`llava_ov_1_5_4b_finetune.yaml <../../../../examples/vlm_finetune/llava_onevision/llava_ov_1_5_4b_finetune.yaml>` | SFT — LLaVA-OneVision-1.5 4B on LLaVA-Instruct-150K |
-| {download}`llava_ov_1_5_8b_lora.yaml <../../../../examples/vlm_finetune/llava_onevision/llava_ov_1_5_8b_lora.yaml>` | LoRA — LLaVA-OneVision-1.5 8B on LLaVA-Instruct-150K |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/llava_onevision/llava_ov_1_5_4b_finetune.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/llava_onevision/llava_ov_1_5_4b_finetune.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [lmms-lab/LLaVA-OneVision-1.5-4B-Instruct](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct)
-- [lmms-lab/LLaVA-OneVision-1.5-8B-Instruct](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct)
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/lmms-lab/llava-onevision.mdx b/docs/model-coverage/vlm/lmms-lab/llava-onevision.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/lmms-lab/llava-onevision.mdx
rename to docs/model-coverage/vlm/lmms-lab/llava-onevision.mdx
diff --git a/docs/model-coverage/vlm/meta/llama4.md b/docs/model-coverage/vlm/meta/llama4.md
deleted file mode 100644
index 9a3293e5e2..0000000000
--- a/docs/model-coverage/vlm/meta/llama4.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# Llama 4
-
-[Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) is Meta's first natively multimodal model family. Llama 4 Scout and Maverick are MoE models supporting interleaved image and text inputs.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `Llama4ForConditionalGeneration` |
-| **Parameters** | 17B active (MoE) |
-| **HF Org** | [meta-llama](https://huggingface.co/meta-llama) |
-:::
-
-## Available Models
-
-- **Llama-4-Scout-17B-16E-Instruct**: 17B active / 16 experts
-- **Llama-4-Maverick-17B-128E-Instruct**: 17B active / 128 experts
-
-## Architecture
-
-- `Llama4ForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Llama-4-Scout-17B-16E-Instruct | [`meta-llama/Llama-4-Scout-17B-16E-Instruct`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) |
-| Llama-4-Maverick-17B-128E-Instruct | [`meta-llama/Llama-4-Maverick-17B-128E-Instruct`](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct) |
-
-
-## Try with NeMo AutoModel
-
-Install NeMo AutoModel and follow the fine-tuning guide to configure a recipe for this model.
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get example recipes you can adapt:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Fine-tune** by adapting a base VLM recipe — override the model ID on the CLI:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-
-Replace `<MODEL_HF_ID>` with the model ID from **Example HF Models** above.
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** The recipes are at `/opt/Automodel/examples/` — navigate there:
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Fine-tune**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml \
-  --model.pretrained_model_name_or_path <MODEL_HF_ID>
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
-- [meta-llama/Llama-4-Maverick-17B-128E-Instruct](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/meta/llama4.mdx b/docs/model-coverage/vlm/meta/llama4.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/meta/llama4.mdx
rename to docs/model-coverage/vlm/meta/llama4.mdx
diff --git a/docs/model-coverage/vlm/mistralai/ministral3-vl.md b/docs/model-coverage/vlm/mistralai/ministral3-vl.md
deleted file mode 100644
index 15d9581eb1..0000000000
--- a/docs/model-coverage/vlm/mistralai/ministral3-vl.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Ministral3 VL
-
-[Ministral3](https://mistral.ai/news/ministraux/) is Mistral AI's efficient small model series. The vision-capable variants support image-text inputs for multimodal tasks.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `Mistral3ForConditionalGeneration` |
-| **Parameters** | 3B – 14B |
-| **HF Org** | [mistralai](https://huggingface.co/mistralai) |
-:::
-
-## Available Models
-
-- **Ministral-3-14B-Instruct-2512**
-- **Ministral-3-8B-Instruct-2512**
-- **Ministral-3-3B-Instruct-2512**
-
-## Architecture
-
-- `Mistral3ForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Ministral-3 3B Instruct | [`mistralai/Ministral-3-3B-Instruct-2512`](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) |
-| Ministral-3 8B Instruct | [`mistralai/Ministral-3-8B-Instruct-2512`](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512) |
-| Ministral-3 14B Instruct | [`mistralai/Ministral-3-14B-Instruct-2512`](https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`ministral3_3b_medpix.yaml <../../../../examples/vlm_finetune/mistral/ministral3_3b_medpix.yaml>` | MedPix-VQA | SFT — Ministral3 3B on MedPix |
-| {download}`ministral3_8b_medpix.yaml <../../../../examples/vlm_finetune/mistral/ministral3_8b_medpix.yaml>` | MedPix-VQA | SFT — Ministral3 8B on MedPix |
-| {download}`ministral3_14b_medpix.yaml <../../../../examples/vlm_finetune/mistral/ministral3_14b_medpix.yaml>` | MedPix-VQA | SFT — Ministral3 14B on MedPix |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/mistral/ministral3_3b_medpix.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/mistral/ministral3_3b_medpix.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [mistralai/Ministral-3-8B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/mistralai/ministral3-vl.mdx b/docs/model-coverage/vlm/mistralai/ministral3-vl.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/mistralai/ministral3-vl.mdx
rename to docs/model-coverage/vlm/mistralai/ministral3-vl.mdx
diff --git a/docs/model-coverage/vlm/mistralai/mistral-medium-3-5.md b/docs/model-coverage/vlm/mistralai/mistral-medium-3-5.md
deleted file mode 100644
index 1650d2a4a0..0000000000
--- a/docs/model-coverage/vlm/mistralai/mistral-medium-3-5.md
+++ /dev/null
@@ -1,158 +0,0 @@
-# Mistral Medium 3.5
-
-[Mistral Medium 3.5](https://huggingface.co/mistralai) is Mistral AI's
-flagship **128B dense** model that merges instruction-following, reasoning,
-and coding into a single checkpoint with a configurable reasoning mode.
-It unifies the lineage of *Mistral Medium 3.1*, *Magistral Medium*, and
-*Devstral 2* into one model, and ships natively in FP8 (per-tensor
-`weight_scale_inv`) so the full model fits inside an H200 node or 2 ×
-H100 nodes — a notable footprint advantage over comparably-capable
-Mixture-of-Experts (MoE) systems.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `Mistral3ForConditionalGeneration` (Pixtral vision tower + dense Ministral-3 text decoder) |
-| **Parameters** | 128B (dense, FP8 on disk) |
-| **Context Window** | 256k tokens |
-| **Languages** | 40+ (English, French, Spanish, German, Russian, Chinese, Japanese, Italian, Portuguese, Arabic, Hindi, Korean, plus Indic / Nordic / Eastern European tail) |
-| **License** | Modified MIT (open-weights, ≤ $20M annual revenue threshold) |
-| **HF Org** | [mistralai](https://huggingface.co/mistralai) |
-:::
-
-## Architecture
-
-Mistral Medium 3.5 is a **dense** transformer — no MoE routing — built on
-the same text backbone as
-[`mistralai/Devstral-2-123B-Instruct-2512`](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512):
-88 Ministral-3 decoder layers (hidden 12288, 96 attention heads,
-8 KV heads, GQA) with the standard llama-style RoPE + RMSNorm + SwiGLU
-MLP layout. The multimodal variant adds a Pixtral vision tower and
-multi-modal projector on top, making it an
-`AutoModelForImageTextToText` checkpoint.
-
-Compared with MoE models of similar capability, the dense layout
-trades sparse-activation throughput for a substantially smaller
-deployment footprint — relevant when you want to fine-tune or serve
-the model on a single node.
-
-## Key Strengths
-
-- **Compactness.** Dense 128B fits in fewer GPUs than the comparable
-  MoE class — a single H200 node or 2 × H100 nodes for inference.
-- **Configurable reasoning mode.** One checkpoint covers chat,
-  agentic, and reasoning workloads; the reasoning mode is toggled at
-  inference time.
-- **Strong agentic performance.** Competitive on tool-use and
-  decision-making benchmarks; suitable as a base for connector-driven
-  agent workflows.
-- **Long context.** 256k-token window for document parsing and
-  research-assistant use cases.
-
-Trade-offs disclosed in the model card: weaker non-agentic benchmark
-performance and more verbose outputs than some closed-source
-competitors.
-
-## Use Cases
-
-- Agentic workflows with connectors
-- Cloud and local async coding
-- Document parsing (multimodal — text + image)
-- Research assistants
-- General chat
-- Base model for downstream fine-tuning
-
-## Available Models
-
-- **Mistral-Medium-3.5 128B**
-
-## Class
-
-- HF: `Mistral3ForConditionalGeneration`
-- NeMo AutoModel custom: `Mistral3FP8VLMForConditionalGeneration`
-  ([source](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/models/mistral3_vlm/model.py))
-
-The custom class extends HF's `Mistral3ForConditionalGeneration` and
-attaches a `Mistral3FP8StateDictAdapter.for_vlm_full()` so the FP8
-checkpoint dequantizes per-shard inside the standard DCP load — the
-full BF16 model is never materialized on a single rank, allowing TP+PP
-training to fit on H100-80GB.
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Mistral Medium 3.5 128B | [`mistralai/Mistral-Medium-3.5`](https://huggingface.co/mistralai) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`mistral3p5_128b_medpix.yaml <../../../../examples/vlm_finetune/mistral3p5/mistral3p5_128b_medpix.yaml>` | MedPix-VQA | SFT — Mistral Medium 3.5 128B on MedPix, 8 nodes × 8 GPUs (TP=8 PP=8) |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-:::{note}
-This recipe was validated on **8 nodes × 8 GPUs (64 H100s)** with
-TP=8 PP=8 DP=1. See the [Launcher Guide](../../../launcher/slurm.md)
-for multi-node setup. Inference / single-node fine-tune fits in
-**1 × H200** or **2 × H100** nodes thanks to the dense + FP8 layout.
-:::
-
-**3. Run the recipe** via Slurm (see the
-[fine-tuning guide](../../../guides/vlm/mistral-medium-3-5.md) for a
-complete launch script):
-
-```bash
-sbatch your_slurm_script.sub
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/mistral3p5/mistral3p5_128b_medpix.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and the
-[Mistral Medium 3.5 Fine-Tuning Guide](../../../guides/vlm/mistral-medium-3-5.md).
-
-## Fine-Tuning
-
-See the [Mistral Medium 3.5 Fine-Tuning Guide](../../../guides/vlm/mistral-medium-3-5.md).
-
-## Hugging Face Model Cards
-
-- [mistralai](https://huggingface.co/mistralai)
-- Related architecture: [`mistralai/Devstral-2-123B-Instruct-2512`](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/mistralai/mistral-medium-3-5.mdx b/docs/model-coverage/vlm/mistralai/mistral-medium-3-5.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/mistralai/mistral-medium-3-5.mdx
rename to docs/model-coverage/vlm/mistralai/mistral-medium-3-5.mdx
diff --git a/docs/model-coverage/vlm/mistralai/mistral-small-4.md b/docs/model-coverage/vlm/mistralai/mistral-small-4.md
deleted file mode 100644
index 86dc86de18..0000000000
--- a/docs/model-coverage/vlm/mistralai/mistral-small-4.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Mistral-Small-4
-
-[Mistral-Small-4-119B](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603) is Mistral AI's multimodal MoE model supporting both text and image inputs at scale.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `MistralForConditionalGeneration` |
-| **Parameters** | 119B (MoE) |
-| **HF Org** | [mistralai](https://huggingface.co/mistralai) |
-:::
-
-## Available Models
-
-- **Mistral-Small-4-119B-2603**
-
-## Architecture
-
-- `MistralForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Mistral-Small-4 119B | [`mistralai/Mistral-Small-4-119B-2603`](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`mistral4_medpix.yaml <../../../../examples/vlm_finetune/mistral4/mistral4_medpix.yaml>` | MedPix-VQA | SFT — Mistral-Small-4 on MedPix |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-:::{note}
-This recipe was validated on **4 nodes × 8 GPUs (32 H100s)**. See the [Launcher Guide](../../../launcher/slurm.md) for multi-node setup.
-:::
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/mistral4/mistral4_medpix.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/mistral4/mistral4_medpix.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [mistralai/Mistral-Small-4-119B-2603](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603)
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/mistralai/mistral-small-4.mdx b/docs/model-coverage/vlm/mistralai/mistral-small-4.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/mistralai/mistral-small-4.mdx
rename to docs/model-coverage/vlm/mistralai/mistral-small-4.mdx
diff --git a/docs/model-coverage/vlm/moonshotai/kimi-vl.md b/docs/model-coverage/vlm/moonshotai/kimi-vl.md
deleted file mode 100644
index f8344a9936..0000000000
--- a/docs/model-coverage/vlm/moonshotai/kimi-vl.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# Kimi-VL
-
-[Kimi-VL](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct) and Kimi-K25-VL are vision language models from Moonshot AI. Kimi-VL-A3B uses a MoE language backbone (3B active parameters) with a vision encoder, supporting image understanding and multimodal reasoning.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `KimiVLForConditionalGeneration` |
-| **Parameters** | ~3B active (MoE) |
-| **HF Org** | [moonshotai](https://huggingface.co/moonshotai) |
-:::
-
-## Available Models
-
-- **Kimi-VL-A3B-Instruct**
-- **Kimi-K25-VL**
-
-## Architecture
-
-- `KimiVLForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Kimi-VL-A3B-Instruct | [`moonshotai/Kimi-VL-A3B-Instruct`](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`kimi2vl_cordv2.yaml <../../../../examples/vlm_finetune/kimi/kimi2vl_cordv2.yaml>` | cord-v2 | SFT — Kimi-VL on CORD-v2 |
-| {download}`kimi25vl_medpix.yaml <../../../../examples/vlm_finetune/kimi/kimi25vl_medpix.yaml>` | MedPix-VQA | SFT — Kimi-K25-VL on MedPix |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/kimi/kimi2vl_cordv2.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/kimi/kimi2vl_cordv2.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [moonshotai/Kimi-VL-A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/moonshotai/kimi-vl.mdx b/docs/model-coverage/vlm/moonshotai/kimi-vl.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/moonshotai/kimi-vl.mdx
rename to docs/model-coverage/vlm/moonshotai/kimi-vl.mdx
diff --git a/docs/model-coverage/vlm/nvidia/nemotron-parse.md b/docs/model-coverage/vlm/nvidia/nemotron-parse.md
deleted file mode 100644
index a579a3e9f5..0000000000
--- a/docs/model-coverage/vlm/nvidia/nemotron-parse.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Nemotron-Parse
-
-[Nemotron-Parse-v1.1](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1) is NVIDIA's document parsing VLM, specializing in extracting structured information from complex documents including tables, forms, and mixed-content PDFs.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Document Parsing |
-| **Architecture** | `NemotronParseForConditionalGeneration` |
-| **Parameters** | varies |
-| **HF Org** | [nvidia](https://huggingface.co/nvidia) |
-:::
-
-## Available Models
-
-- **Nemotron-Parse-v1.1**
-
-## Architecture
-
-- `NemotronParseForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Nemotron-Parse v1.1 | [`nvidia/NVIDIA-Nemotron-Parse-v1.1`](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description | Try on Brev |
-|---|---|---|---|
-| {download}`nemotron_parse_v1_1.yaml <../../../../examples/vlm_finetune/nemotron/nemotron_parse_v1_1.yaml>` | cord-v2 | SFT — Nemotron-Parse on CORD-v2 | [![Launch on Brev](https://brev-assets.s3.us-west-1.amazonaws.com/nv-lb-dark.svg)](https://brev.nvidia.com/launchable/deploy/now?launchableID=env-3C6LDKU2DfOvpVTFhjw3YQ4djPM) |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/nemotron/nemotron_parse_v1_1.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/nemotron/nemotron_parse_v1_1.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning Tutorial on Brev
-
-Launch the end-to-end Nemotron Parse fine-tuning tutorial on Brev with a single click:
-
-[![Launch on Brev](https://brev-assets.s3.us-west-1.amazonaws.com/nv-lb-dark.svg)](https://brev.nvidia.com/launchable/deploy/now?launchableID=env-3C6LDKU2DfOvpVTFhjw3YQ4djPM)
-
-See also the [tutorial notebook](https://github.com/NVIDIA-NeMo/Automodel/blob/main/tutorials/nemotron-parse/finetune.ipynb) and the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [nvidia/NVIDIA-Nemotron-Parse-v1.1](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1)
diff --git a/fern/versions/nightly/pages/model-coverage/vlm/nvidia/nemotron-parse.mdx b/docs/model-coverage/vlm/nvidia/nemotron-parse.mdx
similarity index 100%
rename from fern/versions/nightly/pages/model-coverage/vlm/nvidia/nemotron-parse.mdx
rename to docs/model-coverage/vlm/nvidia/nemotron-parse.mdx
diff --git a/docs/model-coverage/vlm/qwen/qwen2-5-vl.md b/docs/model-coverage/vlm/qwen/qwen2-5-vl.md
deleted file mode 100644
index fad741e688..0000000000
--- a/docs/model-coverage/vlm/qwen/qwen2-5-vl.md
+++ /dev/null
@@ -1,95 +0,0 @@
-# Qwen2.5-VL
-
-[Qwen2.5-VL](https://qwenlm.github.io/blog/qwen2.5-vl/) is Alibaba Cloud's vision language model series supporting image and video understanding. It features dynamic resolution processing and integrates with the Qwen2.5 language backbone.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `Qwen2_5VLForConditionalGeneration` |
-| **Parameters** | 2B – 72B |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen2.5-VL-72B-Instruct**
-- **Qwen2.5-VL-32B-Instruct**
-- **Qwen2.5-VL-7B-Instruct**
-- **Qwen2.5-VL-3B-Instruct**
-- **Qwen2-VL-7B-Instruct**, **Qwen2-VL-2B-Instruct** (Qwen2 VL)
-
-## Architectures
-
-- `Qwen2_5VLForConditionalGeneration` — Qwen2.5-VL
-- `Qwen2VLForConditionalGeneration` — Qwen2-VL
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Qwen2.5-VL 3B Instruct | [`Qwen/Qwen2.5-VL-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) |
-| Qwen2.5-VL 7B Instruct | [`Qwen/Qwen2.5-VL-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) |
-| Qwen2-VL 7B Instruct | [`Qwen/Qwen2-VL-7B-Instruct`](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`qwen2_5_vl_3b_rdr.yaml <../../../../examples/vlm_finetune/qwen2_5/qwen2_5_vl_3b_rdr.yaml>` | rdr-items | SFT — Qwen2.5-VL 3B on RDR Items |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/qwen2_5/qwen2_5_vl_3b_rdr.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/qwen2_5/qwen2_5_vl_3b_rdr.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)
-- [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen2-5-vl.mdx b/docs/model-coverage/vlm/qwen/qwen2-5-vl.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen2-5-vl.mdx
rename to docs/model-coverage/vlm/qwen/qwen2-5-vl.mdx
diff --git a/docs/model-coverage/vlm/qwen/qwen3-5-vl.md b/docs/model-coverage/vlm/qwen/qwen3-5-vl.md
deleted file mode 100644
index 50890fb5fe..0000000000
--- a/docs/model-coverage/vlm/qwen/qwen3-5-vl.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Qwen3.5-VL
-
-Qwen3.5-VL is Alibaba Cloud's next-generation vision language model series, including dense and MoE variants for image and multimodal understanding tasks.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `Qwen3_5VLForConditionalGeneration` |
-| **Parameters** | 4B – 35B+ |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen3.5-VL-4B**: 4B dense model
-- **Qwen3.5-VL-9B**: 9B dense model
-- **Qwen3.5-MoE**: large MoE variant (35B+)
-- **Qwen3.6-27B**: 27B dense model
-- **Qwen3.6-35B-A3B**: next-generation MoE variant (35B total, 3B active)
-
-## Architectures
-
-- `Qwen3_5VLForConditionalGeneration` — dense models
-- `Qwen3_5MoeVLForConditionalGeneration` — MoE variant
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`qwen3_5_4b.yaml <../../../../examples/vlm_finetune/qwen3_5/qwen3_5_4b.yaml>` | MedPix-VQA | SFT — Qwen3.5-VL 4B on MedPix |
-| {download}`qwen3_5_9b.yaml <../../../../examples/vlm_finetune/qwen3_5/qwen3_5_9b.yaml>` | MedPix-VQA | SFT — Qwen3.5-VL 9B on MedPix |
-| {download}`qwen3_5_moe_medpix.yaml <../../../../examples/vlm_finetune/qwen3_5_moe/qwen3_5_moe_medpix.yaml>` | MedPix-VQA | SFT — Qwen3.5-MoE on MedPix |
-| {download}`qwen3_5_35b.yaml <../../../../examples/vlm_finetune/qwen3_5_moe/qwen3_5_35b.yaml>` | MedPix-VQA | SFT — Qwen3.5 35B on MedPix |
-| {download}`qwen3_6_27b.yaml <../../../../examples/vlm_finetune/qwen3_5/qwen3_6_27b.yaml>` | MedPix-VQA | SFT — Qwen3.6-27B on MedPix |
-| {download}`qwen3_6_35b.yaml <../../../../examples/vlm_finetune/qwen3_5_moe/qwen3_6_35b.yaml>` | MedPix-VQA | SFT — Qwen3.6 35B-A3B on MedPix |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/qwen3_5/qwen3_5_4b.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/qwen3_5/qwen3_5_4b.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [Qwen](https://huggingface.co/Qwen)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen3-5-vl.mdx b/docs/model-coverage/vlm/qwen/qwen3-5-vl.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen3-5-vl.mdx
rename to docs/model-coverage/vlm/qwen/qwen3-5-vl.mdx
diff --git a/docs/model-coverage/vlm/qwen/qwen3-vl.md b/docs/model-coverage/vlm/qwen/qwen3-vl.md
deleted file mode 100644
index db9ef2d4bb..0000000000
--- a/docs/model-coverage/vlm/qwen/qwen3-vl.md
+++ /dev/null
@@ -1,95 +0,0 @@
-# Qwen3-VL / Qwen3-VL-MoE
-
-[Qwen3-VL](https://qwenlm.github.io/blog/qwen3/) is Alibaba Cloud's third-generation vision language model series. The MoE variant activates a fraction of parameters per token for efficient large-scale inference.
-
-:::{card}
-| | |
-|---|---|
-| **Task** | Image-Text-to-Text |
-| **Architecture** | `Qwen3VLForConditionalGeneration` |
-| **Parameters** | 4B – 235B |
-| **HF Org** | [Qwen](https://huggingface.co/Qwen) |
-:::
-
-## Available Models
-
-- **Qwen3-VL-8B-Instruct**: 8B
-- **Qwen3-VL-4B-Instruct**: 4B
-- **Qwen3-VL-MoE-30B**: 30B total (MoE)
-- **Qwen3-VL-MoE-235B**: 235B total (MoE)
-
-## Architecture
-
-- `Qwen3VLForConditionalGeneration`
-
-## Example HF Models
-
-| Model | HF ID |
-|---|---|
-| Qwen3-VL 4B Instruct | [`Qwen/Qwen3-VL-4B-Instruct`](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct) |
-| Qwen3-VL 8B Instruct | [`Qwen/Qwen3-VL-8B-Instruct`](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct) |
-
-## Example Recipes
-
-| Recipe | Dataset | Description |
-|---|---|---|
-| {download}`qwen3_vl_4b_instruct_rdr.yaml <../../../../examples/vlm_finetune/qwen3/qwen3_vl_4b_instruct_rdr.yaml>` | rdr-items | SFT — Qwen3-VL 4B on RDR Items |
-| {download}`qwen3_vl_8b_instruct_rdr.yaml <../../../../examples/vlm_finetune/qwen3/qwen3_vl_8b_instruct_rdr.yaml>` | rdr-items | SFT — Qwen3-VL 8B on RDR Items |
-| {download}`qwen3_vl_moe_30b_te_deepep.yaml <../../../../examples/vlm_finetune/qwen3/qwen3_vl_moe_30b_te_deepep.yaml>` | MedPix-VQA | SFT — Qwen3-VL-MoE 30B with TE + DeepEP |
-| {download}`qwen3_vl_moe_235b.yaml <../../../../examples/vlm_finetune/qwen3/qwen3_vl_moe_235b.yaml>` | MedPix-VQA | SFT — Qwen3-VL-MoE 235B |
-
-
-## Try with NeMo AutoModel
-
-**1. Install** ([full instructions](../../../guides/installation.md)):
-
-```bash
-pip install nemo-automodel
-```
-
-**2. Clone the repo** to get the example recipes:
-
-```bash
-git clone https://github.com/NVIDIA-NeMo/Automodel.git
-cd Automodel
-```
-
-**3. Run the recipe** from inside the repo:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/qwen3/qwen3_vl_4b_instruct_rdr.yaml
-```
-
-:::{dropdown} Run with Docker
-**1. Pull the container** and mount a checkpoint directory:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v $(pwd)/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-**2.** Navigate to the AutoModel directory (where the recipes are):
-
-```bash
-cd /opt/Automodel
-```
-
-**3. Run the recipe**:
-
-```bash
-automodel --nproc-per-node=8 examples/vlm_finetune/qwen3/qwen3_vl_4b_instruct_rdr.yaml
-```
-:::
-
-See the [Installation Guide](../../../guides/installation.md) and [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Fine-Tuning
-
-See the [VLM Fine-Tuning Guide](../../../guides/omni/gemma3-3n.md).
-
-## Hugging Face Model Cards
-
-- [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct)
-- [Qwen/Qwen3-VL-8B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct)
diff --git a/fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen3-vl.mdx b/docs/model-coverage/vlm/qwen/qwen3-vl.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/model-coverage/vlm/qwen/qwen3-vl.mdx
rename to docs/model-coverage/vlm/qwen/qwen3-vl.mdx
diff --git a/docs/performance-summary.md b/docs/performance-summary.md
deleted file mode 100644
index 622612cac9..0000000000
--- a/docs/performance-summary.md
+++ /dev/null
@@ -1,83 +0,0 @@
-# Performance Summary
-
-This document provides performance benchmarks for various large language models using NeMo AutoModel with the PyTorch backend.
-
-## Pre-Training Performance
-
-The table below shows training performance for full sequences with no padding across different model architectures and scales.
-
-### System: DGX-H100, Precision: BF16
-
-| Model | #GPUs | GBS | MBS | LBS | GA | Seq Length | TP | PP | CP | EP | VP | FSDP | Kernel Optimizations | Time per Global Step (s) | Model TFLOPs/sec/GPU | Tokens/sec/GPU |
-|-------|------:|----:|----:|----:|---:|-----------:|---:|---:|---:|---:|---:|-----:|---------|-------------------------:|---------------------:|---------------:|
-| Nemotron V3 Super 120B (26.02) | 64 | 512 | 2 | 2 | 4 | 4096 | 1 | 1 | 1 | 64 | - | 64 | TE + DeepEP + TorchSDPA | 7.286 | 334 | 4,497 |
-| Nemotron V3 Nano 30B (26.02) | 8 | 512 | 4 | 4 | 16 | 4096 | 1 | 1 | 1 | 8 | - | 8 | TE + DeepEP + TorchSDPA | 15.614 | 328 | 16,789 |
-| DeepSeek V3 671B | 1024 | 8192 | 1 | 8 | 4 | 4096 | 1 | 4 | 1 | 64 | 8 | 256 | TE + DeepEP | 37.87 | 216 | 865 |
-| DeepSeek V3 671B | 256 | 512 | 1 | 8 | 1 | 4096 | 1 | 4 | 1 | 64 | 8 | 64 | TE + DeepEP | 8.18 | 250 | 1,002 |
-| Kimi K2 | 256 | 512 | 1 | 8 | 2 | 4096 | 1 | 8 | 1 | 32 | 4 | 32 | TE + DeepEP | 8.86 | 189 | 924 |
-| Qwen3 MoE 30B | 8 | 512 | 4 | 4 | 16 | 4096 | 1 | 1 | 1 | 8 | - | 8 | TE + DeepEP | 21.773 | 277 | 12,040 |
-| GPT-OSS 20B | 8 | 256 | 2 | 2 | 16 | 4096 | 1 | 1 | 1 | - | - | 8 | TE + DeepEP + FlexAttn | 10.04 | 279 | 13,058 |
-| GPT-OSS 120B | 64 | 512 | 2 | 2 | 4 | 4096 | 1 | 1 | 1 | - | - | 64 | TE + DeepEP + FlexAttn | 4.30 | 231 | 7,626 |
-| Llama3 70B | 64 | 128 | 1 | 1 | 4 | 8192 | 1 | 1 | 2 | - | - | 32 | TE + fsdp2_prefetch | 18.90 | 389 | 866.77 |
-
-
-## Fine-Tuning (LoRA) Performance
-
-The table below shows fine-tuning (LoRA) performance for full sequences with no padding across different model architectures and scales.
-
-### System: DGX-H100, Precision: BF16
-
-| Model | #GPUs | GBS | MBS | LBS | GA | Seq Length | TP | PP | CP | EP | VP | FSDP | Kernel Optimizations | Time per Global Step (s) | Model TFLOPs/sec/GPU | Tokens/sec/GPU |
-|-------|------:|----:|----:|----:|---:|-----------:|---:|---:|---:|---:|---:|-----:|---------|-------------------------:|---------------------:|---------------:|
-| Llama3 8B | 1 | 32 | 2 | 2 | 16 | 4096 | 1 | 1 | 1 | - | 1 | 1 | TE + triton | 10.51 | 402 | 12472.87 |
-| Qwen2.5 7B | 1 | 32 | 2 | 2 | 16 | 4096 | 1 | 1 | 1 | - | 1 | 1 | TE + triton | 9.29 | 423 | 14110.05 |
-| Llama3 70B | 8 | 32 | 2 | 2 | 4 | 4096 | 2 | 1 | 1 | - | 1 | 4 | TE + triton + fsdp2_prefetch | 15.00 | 316 | 1091.85 |
-| Qwen2.5 32B | 8 | 32 | 2 | 2 | 4 | 4096 | 2 | 1 | 1 | - | 1 | 4 | TE + triton + fsdp2_prefetch | 7.28 | 301 | 2250.31 |
-| Llama3 70B 2-node | 16 | 32 | 2 | 2 | 2 | 4096 | 2 | 1 | 1 | - | 1 | 8 | TE + triton + fsdp2_prefetch | 8.32 | 285 | 984.85 |
-| Qwen2.5 32B 2-node | 16 | 32 | 2 | 2 | 2 | 4096 | 2 | 1 | 1 | - | 1 | 8 | TE + triton + fsdp2_prefetch | 3.95 | 277 | 2072.89 |
-
-## Glossary
-
-- **MFU**: Model FLOPs Utilization - ratio of achieved compute to peak hardware capability
-- **TP**: Tensor Parallelism - splits individual layers across GPUs
-- **PP**: Pipeline Parallelism - splits model layers into stages
-- **EP**: Expert Parallelism - distributes MoE experts across GPUs
-- **DP**: Data Parallelism - replicates model and splits data
-- **VP**: Virtual Pipeline - number of pipeline stages per GPU for interleaving
-- **MBS**: Micro-Batch Size - size of one forward pass in pipeline
-- **LBS**: Local Batch Size - size of one step per GPU
-- **GBS**: Global Batch Size - total batch size across all GPUs
-- **GA**: Gradient Accumulation - number of local-batches before optimizer step
-- **TE**: Transformer Engine kernel optimizations - RMSNorm, Linear and DotProductAttention
-- **DeepEP**: Deep Expert Parallelism - advanced EP routing for MoE models
-- **FlexAttn**: PyTorch's [Flex Attention](https://docs.pytorch.org/docs/stable/nn.attention.flex_attention.html)
-
-## Configuration Files
-
-Pre-training and fine-tuning (LoRA) benchmark configurations are available in [`examples/llm_benchmark/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark):
-
-- [`deepseek_v3_te_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/deepseek/deepseek_v3_te_deepep.yaml) - DeepSeek V3 with TE + DeepEP
-- [`kimi_k2_te_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/kimi/kimi_k2_te_deepep.yaml) - Kimi K2 optimized configuration
-- [`qwen3_moe_30b_te_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/qwen/qwen3_moe_30b_te_deepep.yaml) - Qwen3 MoE with TE + DeepEP
-- [`gptoss_20b_te_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/gpt_oss/gptoss_20b_te_deepep.yaml) - GPT-OSS 20B with optimizations
-- [`gptoss_120b_te_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/gpt_oss/gptoss_120b_te_deepep.yaml) - GPT-OSS 120B optimized
-- [`custom_llama3_1_70b_pretrain_benchmark_8nodes.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/llama3_3/custom_llama3_1_70b_pretrain_benchmark_8nodes.yaml) - Llama3-70B optimized
-- [`llama3_1_8b_peft_benchmark.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/llama3_1/llama3_1_8b_peft_benchmark.yaml) - Llama-8B fine-tuning (LoRA) optimized
-- [`qwen2_5_7b_peft_benchmark.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/qwen/qwen2_5_7b_peft_benchmark.yaml) - Qwen2.5-7B fine-tuning (LoRA) optimized
-- [`custom_llama3_3_70b_instruct_peft_benchmark.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark.yaml) - Llama-70B fine-tuning (LoRA) optimized
-- [`custom_qwen2_5_32b_peft_benchmark.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/qwen/custom_qwen2_5_32b_peft_benchmark.yaml) - Qwen2.5-32B fine-tuning (LoRA) optimized
-- [`custom_llama3_3_70b_instruct_peft_benchmark_2nodes.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark_2nodes.yaml) - Llama-70B fine-tuning (LoRA) optimized on 2 nodes
-- [`custom_qwen2_5_32b_peft_benchmark_2nodes.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_benchmark/qwen/custom_qwen2_5_32b_peft_benchmark_2nodes.yaml) - Qwen2.5-32B fine-tuning (LoRA) optimized on 2 nodes
-
-:::{note}
-- All benchmarks use mock data for consistent performance measurement.
-- Fake balanced gate is enabled to simulate ideal expert routing.
-- No gradient clipping applied for pure performance measurement.
-- MFU calculated using peak TFLOPs for the system (989 for BF16 H100).
-- Step times include forward and backward passes + optimizer step for the global batch.
-:::
-
-
-## Version Information
-- **Last Updated**: 2025-10-02
-- **NeMo AutoModel Version**: `main` Branch
diff --git a/fern/versions/v0.4/pages/performance-summary.mdx b/docs/performance-summary.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/performance-summary.mdx
rename to docs/performance-summary.mdx
diff --git a/docs/project.json b/docs/project.json
deleted file mode 100644
index 103b1f387b..0000000000
--- a/docs/project.json
+++ /dev/null
@@ -1 +0,0 @@
-{"name": "nemo-automodel", "version": "nightly"}
diff --git a/docs/release-notes.md b/docs/release-notes.md
deleted file mode 100644
index 7cbefe5b43..0000000000
--- a/docs/release-notes.md
+++ /dev/null
@@ -1,256 +0,0 @@
-# Release Notes
-
-## 0.4.0 · 26.04 (2026-04-28) · [PyPI](https://pypi.org/project/nemo-automodel/0.4.0/) · [GH](https://github.com/NVIDIA-NeMo/Automodel/releases/tag/v0.4.0) · [NGC Docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo-automodel/tags?version=26.04.00)
-
-### Highlights
-
-- **Discrete-diffusion LLMs (dLLM).** SFT and generation support for dLLM
-  models, including Llada.
-- **Embedding and retrieval training.** Reranker training, biencoder datasets
-  loaded directly from the Hugging Face Hub, in-batch negative sampling, and
-  ONNX export for biencoder models.
-- **SkyPilot launcher.** Native multi-node launch on cloud (SkyPilot,
-  including Kubernetes), in addition to local interactive runs. SkyPilot and
-  NeMo Run launchers are selected with YAML sections in the config; SLURM jobs
-  use the `sbatch slurm.sub` workflow.
-- **CLI install profile.** The `nemo-automodel[cli]` extra declares `pyyaml`
-  beyond the package's base dependencies for job-submission configs.
-- **Refreshed CLI.** `automodel <config.yaml>` (alias `am`) replaces the older
-  `automodel <command> <domain> -c <config>` form.
-
-### New Models
-
-- **LLM:** GLM-5, MiniMax-M2.5, Nemotron Super v3, Nemotron Nano 4B/8B.
-- **MoE / VLM:** Qwen3.5-MoE (397B-A17B, 35B-A3B).
-- **VLM:** Gemma 4, Mistral Small 4, Qwen3.5 small dense models.
-- **Diffusion:** FLUX.1-dev, Wan 2.1 T2V, HunyuanVideo 1.5; Wan
-  multi-resolution and LoRA recipes for diffusion.
-
-### Distributed Training
-
-- Context parallelism for Qwen3.5-MoE and Nemotron v3.
-- Pipeline parallelism for knowledge distillation.
-- HybridEP and UCCL-EP as alternative expert-parallel dispatchers.
-- FSDP2 weight prefetching and async TP optimization.
-- TP > 1 in knowledge distillation.
-
-### Performance and Kernels
-
-- TE Linear layers enabled for PEFT/LoRA.
-- `torch._grouped_mm` expert backend.
-- fp32 RMSNorm backend and `cast_model_to_dtype` controls.
-- TP-aware KD loss with distributed softmax and T² scaling.
-- FlashOptim optimizer integration.
-- Sequence-packing updates: Qwen3.5-MoE VLM neat-packing recipe with EP+PP;
-  Generic THD collation for chat datasets; CP/BSHD padding fixes.
-
-### PEFT
-
-- MoE LoRA: rank scaling, `torch_mm` integration, expert-LoRA init using
-  `config.expert_dim`.
-- `merge_lora` tool for materializing adapters into the base model.
-- QLoRA PEFT checkpoints saved with the HF adapter prefix.
-
-### Recipes and Workflow
-
-- New recipes for Gemma 4 (LoRA), Nemotron Nano 4B SQuAD, Mistral Small 4,
-  Tulu-3 E2E convergence, GPT-OSS 20B / Moonlight 16B convergence, and
-  reranker / biencoder training.
-- MFU logging for LLM and dLLM train recipes.
-- Native Comet ML experiment tracking.
-- NEFTune noisy embeddings for instruction fine-tuning.
-- Scheduler-driven manual garbage collection.
-- Common inference utility and `.generate()` with KV cache for Nemotron v3.
-
-### Checkpointing
-
-- `v4_compatible` checkpoint format.
-- Diffusion full fine-tuning and pretraining examples use safetensors
-  checkpoint format; diffusion LoRA examples use `torch_save`.
-- QLoRA / LoRA loading robustness; tied-weight handling moved out of
-  `_init_model`.
-
-### Fixes
-
-- FSDP2 meta-device crash for Qwen3.5 GatedDeltaNet fp32 params.
-- Activation checkpointing silently skipped on registered VLMs (ModuleList
-  flattening).
-- Gradient checkpointing for MoE models on single GPU (`ep_size=1`).
-- Gradient clipping with `torch_mm` + EP (GPT-OSS 120B recipe).
-- Rotary embeddings for v4 models; `inputs_embeds` passthrough for Nano v3.
-
-### Breaking Changes
-
-A migration guide for the new CLI, the `recipe` YAML section, the SLURM
-`sbatch`-script workflow, and the `nemo-automodel[cli]` install profile is in
-[Breaking Changes](breaking-changes.md).
-
----
-
-## 0.3.0 · 26.02 (2026-02-26) · [PyPI](https://pypi.org/project/nemo-automodel/0.3.0/) · [GH](https://github.com/NVIDIA-NeMo/Automodel/releases/tag/v0.3.0) · [NGC Docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo-automodel/tags?version=26.02.00)
-
-### Highlights
-
-- **Transformers v4 / v5 alignment.** New `transformers v4` API support and a
-  v5 refactor for device-mesh-only model init.
-- **Streaming safetensors writer** for faster checkpoint export.
-- **Faster fp8 dequant kernels** with DTensor dequantization fixes for DSv3.
-
-### New Models
-
-- **LLM:** DeepSeek V3.2, Step-3.5-Flash, MiniMax-M2.1,
-  Nemotron-3-Nano-30B-A3B, Nemotron Flash 1B, GLM-4.7,
-  Devstral-Small-2-24B.
-- **MoE / VLM / Omni:** Qwen3-VL (4B/8B), Qwen3-VL-MoE (30B/235B),
-  Kimi-VL, Kimi-K2.5 VL, Nemotron-Parse VLM, InternVL3.5-4B,
-  Ministral3 (3B/8B/14B), Phi-4-multimodal.
-
-### Distributed Training
-
-- v5 refactor: device-mesh-only model init.
-- TP plan for Ministral; Ministral3 ported to transformers v4.
-- Pipeline-parallelism validation support.
-- Parallel diffusers `generate()`.
-
-### Performance and Kernels
-
-- TE fp8 for models that support it.
-- `GroupedExpertsTE` backend (prerequisite for MoE fp8).
-- TE RoPE fusion for custom MoE models; norm fusion and RoPE cache for dense
-  models.
-- Improved import time.
-
-### PEFT
-
-- DoRA implementation.
-- LoRA support for custom MoEs.
-- LoRA support in Biencoder.
-
-### Datasets and Workflow
-
-- Databricks Delta Lake dataset support; consolidation for Databricks.
-- Parquet file support; inline text dataset format.
-- `ColumnMapped`: configurable special tokens, chat-template flags, and
-  answer-only masking.
-- Hard negative mining and biencoder + inline-dataset tests.
-- nsys benchmark support and model-layer name scoping in the CLI.
-- Updated checkpoint auto-loading with explicit `restore_from`.
-- Dion optimizer.
-- Functiongemma + xlam tool-calling recipes.
-
-### Fixes
-
-- `inputs_embeds` passthrough for Nano v3.
-- `from_pretrained` / `from_config` simplification with model-id pass-through.
-- Tied-embedding detection improvements.
-
----
-
-## 0.2.0 · 25.11 (2025-12-04) · [PyPI](https://pypi.org/project/nemo-automodel/0.2.0/) · [GH](https://github.com/NVIDIA-NeMo/Automodel/releases/tag/v0.2.0) · [NGC Docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo-automodel/tags?version=25.11.00)
-
-### Highlights
-
-- **Async checkpointing.** Checkpoint refactor with async DCP and HF
-  safetensors backport / consolidation.
-- **Custom MoE optimizations.** FSDP optimizations, packed-sequence + context
-  parallel through TE, configurable router precision, fp32 `lm_head` and
-  fp32 `apply_rope`.
-- **Performance documentation.** New performance-summary doc and benchmarking
-  recipe with configs.
-- **Multinode + cluster guidance.** Multinode configs and updated launcher
-  docs.
-
-### New Models
-
-- **MoE:** Qwen3 MoE custom implementation, Qwen3 Next, GPT-OSS (custom
-  implementation, dequantization, DGX Spark recipe), GLM 4 / 4.5 / 4.6 MoE,
-  GLM 4.5 Air, Moonlight 2L test, Phi 4 (TP plan).
-- **Omni / VLM:** Qwen3-Omni OOTB recipe and custom implementation.
-- **DeepSeek v3** with fp8 base checkpoint loading.
-- **Sequence classification:** Qwen3ForSequenceClassification registered;
-  generic SFT sequence-classification recipe.
-
-### Distributed Training
-
-- VLM expert-parallel recipe support.
-- PP for VLM; PEFT with PP.
-- Sharding optimization for SP / LoRA.
-- `clip_grad_norm` across all parallelism modes.
-- `fully_shard_by_dtype` option.
-- Out-of-tree (OOT) parallelism decorator.
-
-### Performance and Kernels
-
-- Mask creation moved into the data pipeline for better performance.
-- TE attention for GPT-OSS.
-- Faster fp8 dequant; auto-detect base-weights dequant.
-
-### PEFT
-
-- LoRA-aware `ColwiseParallel` / `RowwiseParallel`.
-- LoRA + TE.
-- MFU estimation for LoRA.
-- Additional PEFT LoRA recipes.
-
-### Datasets and Recipes
-
-- Multiturn chat dataset; VLM multiturn chat support.
-- Tool-calling dataset and recipe.
-- Streaming dataset.
-- Multiple validation datasets with per-dataset logging.
-- ColumnMapped: surface truncating + padding options.
-- Configurable max-clip-grad; configurable remote-logging frequency using 
-  `step_scheduler`.
-- Validation-loss checkpoint, run-val-at-ckpt, best-ckpt symlink.
-- InternVL recipe; Qwen3-VL 30B recipe; Llama-Embed-Nemotron-8B training.
-
-### Logging and Observability
-
-- MLflow integration.
-- Metric logger with JSONL output.
-- YAML logging-to-stdout improvements.
-
-### Workflow
-
-- Knowledge-distillation custom validation step; `ScopedModuleOffloading` to
-  reduce memory.
-- Model Registry component.
-- SIGTERM handling.
-- `NEMO_ENABLE_USER_MODULES` for user-extension modules.
-- Rank-0 download for custom models.
-- Dereference env vars in YAML.
-
----
-
-## 0.1.2 (2025-10-23) · [PyPI](https://pypi.org/project/nemo-automodel/0.1.2/) · [GH](https://github.com/NVIDIA-NeMo/Automodel/releases/tag/v0.1.2)
-
-Patch release.
-
-- **Fix:** `max_steps` now set inside the constructor (#650).
-- **Fix:** step scheduler switched to zero-based indexing (#627).
-- **Fix:** sample-limit handling for `ColumnMapped` datasets (#521).
-
----
-
-## 0.1.0 (2025-10-08) · [PyPI](https://pypi.org/project/nemo-automodel/0.1.0/) · [GH](https://github.com/NVIDIA-NeMo/Automodel/releases/tag/v0.1.0)
-
-Initial public release of NeMo AutoModel.
-
-### Highlights
-
-- PyTorch-native training framework for LLMs and VLMs with Hugging Face
-  Transformers integration via `NeMoAuto*` wrapper classes.
-- YAML-driven recipes for SFT and PEFT.
-- FSDP2 / HSDP / DDP distributed training with DTensor sharding.
-- Megatron-FSDP available as the default heavy-duty sharding option (replaces
-  the earlier nvFSDP path).
-- Knowledge distillation recipe.
-- MoE component with DeepSeek v3 model implementation.
-- `ColumnMappedTextInstructionDataset` for instruction tuning.
-- Gradient checkpointing.
-- SLURM launcher.
-
----
-
-For the list of newly supported models per release, see the
-[Model Coverage Release Log](model-coverage/latest-models.md).
diff --git a/fern/versions/nightly/pages/release-notes.mdx b/docs/release-notes.mdx
similarity index 100%
rename from fern/versions/nightly/pages/release-notes.mdx
rename to docs/release-notes.mdx
diff --git a/docs/repository-structure.md b/docs/repository-structure.md
deleted file mode 100644
index 72cb311fad..0000000000
--- a/docs/repository-structure.md
+++ /dev/null
@@ -1,119 +0,0 @@
-# Repository Structure
-
-This introductory guide presents the structure of the NeMo AutoModel repository, provides a brief overview of its parts, introduces concepts such as components and recipes, and explains how everything fits together.
-
-## What is NeMo AutoModel?
-NeMo AutoModel is a PyTorch library for fine-tuning and pretraining large-scale models. In particular, it provides:
-- **Optimized implementations** for training efficiency, including fused kernels and memory-saving techniques.
-- [**Day-0 support**](model-coverage/overview.md) for LLMs and VLMs available on the Hugging Face Hub.
-- **Seamless integration** with Hugging Face datasets, tokenizers, and related tools.
-- **Distributed training strategies** using FSDP2 and MegatronFSDP across multi-GPU and multi-node environments.
-- **End-to-end workflows** with recipes for data preparation, training, and evaluation.
-
-
-## Repository Structure
-The AutoModel source code is available under the [`nemo_automodel`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/nemo_automodel) directory. It is organized into three directories:
-- [`components/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/nemo_automodel/components)  - Self-contained modules
-- [`recipes/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/nemo_automodel/recipes) - End-to-end training workflows
-- [`cli/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/nemo_automodel/cli) - CLI entry-point and job launcher dispatch.
-
-### Components Directory
-The `components/` directory contains isolated modules used in training loops.
-Each component is designed to be dependency-light and reusable without cross-module imports.
-
-#### Directory Structure
-The following directory listing shows all components along with explanations of their contents.
-```
-$ tree -L 1 nemo_automodel/components/
-
-├── _peft/          - Implementations of PEFT methods, such as LoRA.
-├── attention/      - Efficient attention modules and related utilities (e.g., flash attention, rotary embeddings).
-├── checkpoint/     - Checkpoint save and load-related logic.
-├── config/         - Utils to load YAML files and CLI-parsing helpers.
-├── datasets/       - LLM and VLM datasets and utils (collate functions, preprocessing).
-├── distributed/    - Distributed processing primitives (DDP, FSDP2, MegatronFSDP) and pipeline parallelism (AutoPipeline).
-├── launcher/       - Job launcher for interactive and batch (Slurm, K8s) processing.
-├── loggers/        - Metric/event logging for Weights & Biases and other tools.
-├── loss/           - Loss functions (such as cross-entropy and linear cross-entropy, etc.).
-├── models/         - Optimized model implementations for LLMs and VLMs.
-├── moe/            - Mixture of Experts modules and routing utilities for scalable model architectures.
-├── optim/          - Optimizers and LR schedulers, including fused or second-order variants.
-├── quantization/   - Quantization layers and helpers for 4-bit/8-bit or other reduced-precision training and inference.
-├── training/       - Training and fine-tuning utils.
-└── utils/          - Small, dependency-free helpers (seed, profiler, timing, fs).
-```
-
-#### Key Features
-- Each component can be used independently in other projects.
-- Each component has its own dependencies, without cross-module imports.
-- Unit tests are colocated with the component they cover.
-
-### Recipes Directory
-Recipes define **end-to-end workflows** (data and model loading → training with custom loop → saving the output checkpoint)
-for a variety of tasks, such as training, fine-tuning, and knowledge distillation.
-
-#### Available Recipes
-The following directory listing shows all components along with explanations of their contents.
-```
-$ tree -L 2 nemo_automodel/recipes/
-├── llm
-│   ├── benchmark.py  - Benchmark recipe for LLMs
-│   ├── kd.py         - Knowledge Distillation for LLMs
-│   └── train_ft.py   - Train recipe for LLMs (Pretrain & Finetune SFT, PEFT).
-└── vlm
-    └── finetune.py   - Finetune recipe for VLMs (SFT, PEFT).
-```
-
-#### Run a Recipe
-
-Recipes are launched via the `automodel` CLI:
-```bash
-automodel --nproc-per-node 2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-The above command will fine-tune the Llama3.2-1B model on the SQuAD dataset with two GPUs using the [`llama3_2_1b_squad.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml) config.
-For a single-GPU run, omit `--nproc-per-node`:
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-Each recipe imports the components it needs from the `nemo_automodel/components/` catalog.
-The recipe/components structure enables you to:
-- Decouple individual components and replace them with custom implementations when needed.
-- Avoid rigid, class-based trainer structures by using linear scripts that expose training logic for maximum flexibility and control.
-
-<!-- For an in-depth explanation of the LLM recipe please also see the [LLM recipe deep-dive guide](docs/llm_recipe_deep_dive.md). -->
-
-#### Configure a Recipe
-An example YAML configuration is shown below. The complete config is available [here](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml):
-```yaml
-step_scheduler:
-  grad_acc_steps: 4
-  ckpt_every_steps: 1000
-  val_every_steps: 10  # will run every x number of gradient steps
-  num_epochs: 1
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-```
-
-More recipe examples are available under the [`examples/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples) directory.
-
-### CLI Directory (`cli/`)
-The `automodel` (or `am`) CLI application simplifies job execution across different environments, from
-single-GPU interactive sessions to batch multi-node runs. It supports interactive (local), SLURM,
-SkyPilot, and NeMo-Run launchers. The CLI lives at the repository root in the
-`cli/` package, separate from the core `nemo_automodel` library.
-
-
-## Next Steps
-
-Learn how to train models with NeMo AutoModel on:
-- **Your local workstation**: See [`docs/launcher/local-workstation.md`](launcher/local-workstation.md).
-- **A cluster**: See [`docs/launcher/slurm.md`](launcher/slurm.md).
diff --git a/fern/versions/v0.4/pages/repository-structure.mdx b/docs/repository-structure.mdx
similarity index 100%
rename from fern/versions/v0.4/pages/repository-structure.mdx
rename to docs/repository-structure.mdx
diff --git a/docs/versions1.json b/docs/versions1.json
deleted file mode 100644
index a2f2d46129..0000000000
--- a/docs/versions1.json
+++ /dev/null
@@ -1,26 +0,0 @@
-[
-    {
-        "version": "nightly",
-        "url": "https://docs.nvidia.com/nemo/automodel/nightly/"
-    },
-    {
-        "name": "0.4.0 (latest) · 26.04",
-        "version": "0.4.0",
-        "url": "https://docs.nvidia.com/nemo/automodel/latest/"
-    },
-    {
-        "name": "0.3.0 · 26.02",
-        "version": "0.3.0",
-        "url": "https://docs.nvidia.com/nemo/automodel/0.3.0/"
-    },
-    {
-        "name": "0.2.0 · 25.11",
-        "version": "0.2.0",
-        "url": "https://docs.nvidia.com/nemo/automodel/0.2.0/"
-    },
-    {
-        "name": "0.1.0",
-        "version": "0.1.0",
-        "url": "https://docs.nvidia.com/nemo/automodel/0.1.0/"
-    }
-]
diff --git a/fern/assets/NVIDIA_dark.svg b/fern/assets/NVIDIA_dark.svg
deleted file mode 100644
index fe67b898bf..0000000000
--- a/fern/assets/NVIDIA_dark.svg
+++ /dev/null
@@ -1,37 +0,0 @@
-<!-- SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
-<!-- SPDX-License-Identifier: LicenseRef-NvidiaProprietary -->
-<svg version="1.1" id="Layer_1" xmlns:x="ns_extend;" xmlns:i="ns_ai;" xmlns:graph="ns_graphs;" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 585.8 108" style="enable-background:new 0 0 585.8 108;" xml:space="preserve">
- <style type="text/css">
-  .st0{fill:#FFFFFF;}
-	.st1{fill:#76B900;}
- </style>
- <metadata>
-  <sfw xmlns="ns_sfw;">
-   <slices>
-   </slices>
-   <sliceSourceBounds bottomLeftOrigin="true" height="108" width="585.8" x="-330.3" y="-210.3">
-   </sliceSourceBounds>
-  </sfw>
- </metadata>
- <g>
-  <path class="st0" d="M578.2,86.6V85h1c0.5,0,1.3,0.1,1.3,0.7s-0.4,0.9-1.1,0.9H578.2 M578.2,87.7h0.7l1.6,2.8h1.7l-1.8-2.9
-		c0.9,0.1,1.6-0.6,1.7-1.5v-0.2c0-1.6-1.1-2-2.9-2h-2.6v6.7h1.5V87.7 M585.8,87.2c0-4-3.1-6.3-6.5-6.3s-6.5,2.3-6.5,6.3
-		s3.1,6.3,6.5,6.3S585.8,91.2,585.8,87.2 M583.9,87.2c0.1,2.5-1.9,4.7-4.4,4.8h-0.2c-2.6,0.1-4.9-2-5-4.6c-0.1-2.6,2-4.9,4.6-5
-		c2.6-0.1,4.9,2,5,4.6C583.9,87.1,583.9,87.1,583.9,87.2z">
-  </path>
-  <path class="st0" d="M347.5,20.4v70.9h20V20.4H347.5z M190,20.3v71h20.2V36.2l15.7,0.1c5.2,0,8.8,1.3,11.2,3.9
-		c3.2,3.4,4.4,8.8,4.4,18.8v32.4h19.6V52.1c0-28-17.8-31.8-35.3-31.8C226,20.3,190,20.3,190,20.3z M379.8,20.4v70.9h32.5
-		c17.3,0,23-2.9,29-9.3c4.3-4.5,7.1-14.5,7.1-25.4c0-10-2.3-18.9-6.5-24.4c-7.3-9.9-18-11.8-34-11.8
-		C407.8,20.4,379.8,20.4,379.8,20.4z M399.6,35.8h8.6c12.5,0,20.6,5.6,20.6,20.1s-8.1,20.2-20.6,20.2h-8.6V35.8z M318.7,20.4
-		l-16.7,56.2l-16-56.2h-21.6l22.8,70.9H316l23.1-70.9C339.1,20.4,318.7,20.4,318.7,20.4z M457.7,91.4h20V20.4h-20V91.4z M513.9,20.4
-		l-27.9,70.9h19.7l4.4-12.6h33.1l4.2,12.5h21.5l-28.2-70.8H513.9z M526.8,33.4L539,66.5h-24.6C514.3,66.5,526.8,33.4,526.8,33.4z">
-  </path>
-  <path class="st1" d="M60.9,32.2v-9.7c1-0.1,1.9-0.1,2.9-0.1c26.7-0.8,44.2,23,44.2,23S89.1,71.5,68.8,71.5c-2.7,0-5.3-0.4-7.9-1.3
-		V40.7c10.4,1.3,12.5,5.8,18.7,16.2l13.9-11.7c0,0-10.2-13.3-27.2-13.3C64.5,31.9,62.7,32,60.9,32.2 M60.9,0v14.5l2.9-0.2
-		c37.1-1.3,61.3,30.4,61.3,30.4S97.3,78.6,68.4,78.6c-2.5,0-5-0.2-7.5-0.7v9c2,0.2,4.1,0.4,6.2,0.4c26.9,0,46.4-13.8,65.3-30
-		c3.1,2.5,15.9,8.6,18.6,11.2c-17.9,15-59.7,27.1-83.4,27.1c-2.3,0-4.4-0.1-6.6-0.4V108h102.3V0C163.3,0,60.9,0,60.9,0z M60.9,70.3
-		v7.7C36,73.5,29.1,47.6,29.1,47.6s12-13.2,31.8-15.4v8.4h-0.1c-10.4-1.3-18.6,8.5-18.6,8.5S46.9,65.5,60.9,70.3 M16.7,46.5
-		c0,0,14.7-21.8,44.2-24v-7.9C28.2,17.2,0,44.8,0,44.8s16,46.3,60.9,50.5v-8.4C27.9,82.8,16.7,46.5,16.7,46.5z">
-  </path>
- </g>
-</svg>
diff --git a/fern/assets/NVIDIA_light.svg b/fern/assets/NVIDIA_light.svg
deleted file mode 100644
index 568ee177ba..0000000000
--- a/fern/assets/NVIDIA_light.svg
+++ /dev/null
@@ -1,36 +0,0 @@
-<!-- SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
-<!-- SPDX-License-Identifier: LicenseRef-NvidiaProprietary -->
-<svg version="1.1" id="Layer_1" xmlns:x="ns_extend;" xmlns:i="ns_ai;" xmlns:graph="ns_graphs;" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 585.8 108" style="enable-background:new 0 0 585.8 108;" xml:space="preserve">
- <style type="text/css">
-  .st0{fill:#76B900;}
- </style>
- <metadata>
-  <sfw xmlns="ns_sfw;">
-   <slices>
-   </slices>
-   <sliceSourceBounds bottomLeftOrigin="true" height="108" width="585.8" x="-330.3" y="-210.3">
-   </sliceSourceBounds>
-  </sfw>
- </metadata>
- <g>
-  <path d="M578.2,86.6V85h1c0.5,0,1.3,0.1,1.3,0.7s-0.4,0.9-1.1,0.9H578.2 M578.2,87.7h0.7l1.6,2.8h1.7l-1.8-2.9
-		c0.9,0.1,1.6-0.6,1.7-1.5v-0.2c0-1.6-1.1-2-2.9-2h-2.6v6.7h1.5V87.7 M585.8,87.2c0-4-3.1-6.3-6.5-6.3s-6.5,2.3-6.5,6.3
-		s3.1,6.3,6.5,6.3S585.8,91.2,585.8,87.2 M583.9,87.2c0.1,2.5-1.9,4.7-4.4,4.8h-0.2c-2.6,0.1-4.9-2-5-4.6c-0.1-2.6,2-4.9,4.6-5
-		c2.6-0.1,4.9,2,5,4.6C583.9,87.1,583.9,87.1,583.9,87.2z">
-  </path>
-  <path d="M347.5,20.4v70.9h20V20.4H347.5z M190,20.3v71h20.2V36.2l15.7,0.1c5.2,0,8.8,1.3,11.2,3.9c3.2,3.4,4.4,8.8,4.4,18.8v32.4
-		h19.6V52.1c0-28-17.8-31.8-35.3-31.8C226,20.3,190,20.3,190,20.3z M379.8,20.4v70.9h32.5c17.3,0,23-2.9,29-9.3
-		c4.3-4.5,7.1-14.5,7.1-25.4c0-10-2.3-18.9-6.5-24.4c-7.3-9.9-18-11.8-34-11.8C407.8,20.4,379.8,20.4,379.8,20.4z M399.6,35.8h8.6
-		c12.5,0,20.6,5.6,20.6,20.1s-8.1,20.2-20.6,20.2h-8.6V35.8z M318.7,20.4l-16.7,56.2l-16-56.2h-21.6l22.8,70.9H316l23.1-70.9
-		C339.1,20.4,318.7,20.4,318.7,20.4z M457.7,91.4h20V20.4h-20V91.4z M513.9,20.4l-27.9,70.9h19.7l4.4-12.6h33.1l4.2,12.5h21.5
-		l-28.2-70.8H513.9z M526.8,33.4L539,66.5h-24.6C514.3,66.5,526.8,33.4,526.8,33.4z">
-  </path>
-  <path class="st0" d="M60.9,32.2v-9.7c1-0.1,1.9-0.1,2.9-0.1c26.7-0.8,44.2,23,44.2,23S89.1,71.5,68.8,71.5c-2.7,0-5.3-0.4-7.9-1.3
-		V40.7c10.4,1.3,12.5,5.8,18.7,16.2l13.9-11.7c0,0-10.2-13.3-27.2-13.3C64.5,31.9,62.7,32,60.9,32.2 M60.9,0v14.5l2.9-0.2
-		c37.1-1.3,61.3,30.4,61.3,30.4S97.3,78.6,68.4,78.6c-2.5,0-5-0.2-7.5-0.7v9c2,0.2,4.1,0.4,6.2,0.4c26.9,0,46.4-13.8,65.3-30
-		c3.1,2.5,15.9,8.6,18.6,11.2c-17.9,15-59.7,27.1-83.4,27.1c-2.3,0-4.4-0.1-6.6-0.4V108h102.3V0C163.3,0,60.9,0,60.9,0z M60.9,70.3
-		v7.7C36,73.5,29.1,47.6,29.1,47.6s12-13.2,31.8-15.4v8.4h-0.1c-10.4-1.3-18.6,8.5-18.6,8.5S46.9,65.5,60.9,70.3 M16.7,46.5
-		c0,0,14.7-21.8,44.2-24v-7.9C28.2,17.2,0,44.8,0,44.8s16,46.3,60.9,50.5v-8.4C27.9,82.8,16.7,46.5,16.7,46.5z">
-  </path>
- </g>
-</svg>
diff --git a/fern/assets/NVIDIA_symbol.svg b/fern/assets/NVIDIA_symbol.svg
deleted file mode 100644
index fd57037f32..0000000000
--- a/fern/assets/NVIDIA_symbol.svg
+++ /dev/null
@@ -1,24 +0,0 @@
-<!-- SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
-<!-- SPDX-License-Identifier: LicenseRef-NvidiaProprietary -->
-<svg version="1.1" id="Layer_1" xmlns:x="ns_extend;" xmlns:i="ns_ai;" xmlns:graph="ns_graphs;" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 163.3 108" style="enable-background:new 0 0 163.3 108;" xml:space="preserve">
- <style type="text/css">
-  .st0{fill:#76B900;}
- </style>
- <metadata>
-  <sfw xmlns="ns_sfw;">
-   <slices>
-   </slices>
-   <sliceSourceBounds bottomLeftOrigin="true" height="108" width="163.3" x="-331" y="-210.3">
-   </sliceSourceBounds>
-  </sfw>
- </metadata>
- <g>
-  <path class="st0" d="M60.9,32.2v-9.7c1-0.1,1.9-0.1,2.9-0.1c26.7-0.8,44.2,23,44.2,23S89.1,71.5,68.8,71.5c-2.7,0-5.3-0.4-7.9-1.3
-		V40.7c10.4,1.3,12.5,5.8,18.7,16.2l13.9-11.7c0,0-10.2-13.3-27.2-13.3C64.5,31.9,62.7,32,60.9,32.2 M60.9,0v14.5l2.9-0.2
-		c37.1-1.3,61.3,30.4,61.3,30.4S97.3,78.6,68.4,78.6c-2.5,0-5-0.2-7.5-0.7v9c2,0.2,4.1,0.4,6.2,0.4c26.9,0,46.4-13.8,65.3-30
-		c3.1,2.5,15.9,8.6,18.6,11.2c-17.9,15-59.7,27.1-83.4,27.1c-2.3,0-4.4-0.1-6.6-0.4V108h102.3V0C163.3,0,60.9,0,60.9,0z M60.9,70.3
-		v7.7C36,73.5,29.1,47.6,29.1,47.6s12-13.2,31.8-15.4v8.4h-0.1c-10.4-1.3-18.6,8.5-18.6,8.5S46.9,65.5,60.9,70.3 M16.7,46.5
-		c0,0,14.7-21.8,44.2-24v-7.9C28.2,17.2,0,44.8,0,44.8s16,46.3,60.9,50.5v-8.4C27.9,82.8,16.7,46.5,16.7,46.5z">
-  </path>
- </g>
-</svg>
diff --git a/fern/components/CustomFooter.tsx b/fern/components/CustomFooter.tsx
deleted file mode 100644
index db2d9da6b0..0000000000
--- a/fern/components/CustomFooter.tsx
+++ /dev/null
@@ -1,96 +0,0 @@
-/**
- * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
- */
-
-/**
- * Custom footer for NVIDIA docs (Fern native header/footer).
- * Markup and class names match the original custom-app footer 1:1 so that
- * fern/main.css (footer + Built with Fern styles) applies correctly:
- * dark mode logo, responsive layout, and Built with Fern tooltip.
- */
-export default function CustomFooter() {
-  const currentYear = new Date().getFullYear();
-  const logoUrl =
-    "https://fern-image-hosting.s3.us-east-1.amazonaws.com/nvidia/NVIDIA_Logo_0.svg";
-
-  return (
-    <footer className="bd-footer">
-      <div className="bd-footer__inner">
-        <div className="footer-items__start">
-          <div className="footer-item">
-            <div className="footer-logos-container">
-              <a
-                className="footer-brand"
-                href="https://www.nvidia.com"
-                target="_blank"
-                rel="noopener"
-              >
-                <img src={logoUrl} className="logo__image only-light" alt="NVIDIA" />
-                <img src={logoUrl} className="logo__image only-dark" alt="NVIDIA" />
-              </a>
-              <div className="footer-brand-fern">
-                <a
-                  href="https://buildwithfern.com"
-                  className="built-with-fern-link"
-                  target="_blank"
-                  rel="noopener noreferrer"
-                >
-                  <svg
-                    width="145"
-                    height="16"
-                    viewBox="0 0 145 16"
-                    fill="none"
-                    xmlns="http://www.w3.org/2000/svg"
-                    className="built-with-fern-logo built-with-fern-logo-light"
-                    aria-hidden
-                  >
-                    <path d="M9.79656 4.8H14.5006C15.5139 4.8 16.3192 5.05067 16.9166 5.552C17.5139 6.04267 17.8126 6.71467 17.8126 7.568C17.8126 8.112 17.6739 8.608 17.3966 9.056C17.1192 9.504 16.7512 9.84 16.2926 10.064C16.8579 10.2667 17.3059 10.608 17.6366 11.088C17.9672 11.5573 18.1326 12.1173 18.1326 12.768C18.1326 13.7387 17.8286 14.5227 17.2206 15.12C16.6126 15.7067 15.7752 16 14.7086 16H9.79656V4.8ZM14.4846 14.528C15.1246 14.528 15.6206 14.3627 15.9726 14.032C16.3246 13.7013 16.5006 13.2373 16.5006 12.64C16.5006 12.0427 16.3246 11.5893 15.9726 11.28C15.6312 10.96 15.1352 10.8 14.4846 10.8H11.3966V14.528H14.4846ZM14.2766 9.424C14.8846 9.424 15.3539 9.28533 15.6846 9.008C16.0152 8.72 16.1806 8.32533 16.1806 7.824C16.1806 7.32267 16.0152 6.93867 15.6846 6.672C15.3539 6.40533 14.8846 6.272 14.2766 6.272H11.3966V9.424H14.2766ZM22.5778 16.224C21.6285 16.224 20.8871 15.9413 20.3538 15.376C19.8205 14.8107 19.5538 14 19.5538 12.944V8.304H21.1058V12.8C21.1058 13.472 21.2551 13.9787 21.5538 14.32C21.8631 14.6507 22.3005 14.816 22.8658 14.816C23.4525 14.816 23.9165 14.6293 24.2578 14.256C24.6098 13.872 24.7858 13.3707 24.7858 12.752V8.304H26.3378V16H24.9618V15.12C24.7165 15.4827 24.3858 15.76 23.9698 15.952C23.5538 16.1333 23.0898 16.224 22.5778 16.224ZM28.0746 8.304H29.6266V16H28.0746V8.304ZM27.9786 4.912H29.7066V6.752H27.9786V4.912ZM33.0334 16C32.4894 16 32.0948 15.888 31.8494 15.664C31.6041 15.44 31.4814 15.0667 31.4814 14.544V4.8H33.0334V14.064C33.0334 14.2667 33.0761 14.416 33.1614 14.512C33.2468 14.5973 33.3854 14.64 33.5774 14.64H34.5534V16H33.0334ZM37.9539 16C37.2819 16 36.7966 15.856 36.4979 15.568C36.1993 15.28 36.0499 14.8053 36.0499 14.144V9.664H34.0339V8.304H36.0499V6H37.6019V8.304H40.0179V9.664H37.6019V13.84C37.6019 14.1173 37.6659 14.32 37.7939 14.448C37.9219 14.576 38.1299 14.64 38.4179 14.64H40.0179V16H37.9539ZM43.5709 8.304H45.1869L46.8989 14.272L48.6109 8.304H50.3869L52.0989 14.272L53.8109 8.304H55.4269L53.0429 16H51.2189L49.5069 10.064L47.7789 
16H45.9549L43.5709 8.304ZM56.3746 8.304H57.9266V16H56.3746V8.304ZM56.2786 4.912H58.0066V6.752H56.2786V4.912ZM62.5971 16C61.9251 16 61.4397 15.856 61.1411 15.568C60.8424 15.28 60.6931 14.8053 60.6931 14.144V9.664H58.6771V8.304H60.6931V6H62.2451V8.304H64.6611V9.664H62.2451V13.84C62.2451 14.1173 62.3091 14.32 62.4371 14.448C62.5651 14.576 62.7731 14.64 63.0611 14.64H64.6611V16H62.5971ZM65.6727 4.8H67.2247V9.056C67.4807 8.736 67.8007 8.496 68.1847 8.336C68.5794 8.16533 69.0114 8.08 69.4807 8.08C70.4407 8.08 71.1927 8.368 71.7367 8.944C72.2807 9.50933 72.5527 10.3147 72.5527 11.36V16H71.0007V11.504C71.0007 10.832 70.8407 10.3307 70.5207 10C70.2114 9.65867 69.7687 9.488 69.1927 9.488C68.5954 9.488 68.1154 9.68 67.7527 10.064C67.4007 10.4373 67.2247 10.9333 67.2247 11.552V16H65.6727V4.8Z" fill="#1E1F24" />
-                    <path d="M92.3849 7.82856C91.3321 6.93847 89.746 6.58166 88.3403 7.62074C88.2756 7.66779 88.1952 7.58741 88.2442 7.52468C88.5775 7.09532 88.9638 6.63263 89.2755 6.16798C89.5931 5.69157 90.0675 5.35044 90.6145 5.18379C93.5259 4.30155 92.6515 0.00012207 92.6515 0.00012207C92.6515 0.00012207 88.154 0.290282 88.7089 4.17019C88.801 4.81913 88.6285 5.47983 88.2227 5.99545C87.7247 6.62479 87.1463 7.22667 86.7268 7.66191C86.6385 7.7521 86.4895 7.66583 86.5248 7.54428C86.9307 6.17778 87.2267 4.06432 85.821 2.70175L83.8428 1.05881L83.4625 1.56071C82.3312 3.05268 82.6626 5.15634 84.1565 6.28561C85.0132 6.93259 85.4014 7.63643 85.3407 8.40888C85.3034 8.87157 85.0936 9.30485 84.7799 9.64794C84.1898 10.2949 83.6389 10.9889 83.2135 11.7928C83.1546 11.9045 82.9841 11.8614 82.99 11.734C83.0507 10.4067 82.9233 7.41489 80.6883 6.34639L78.1866 5.37984L77.9925 5.9582C77.3632 7.82464 78.3925 9.81851 80.257 10.4518C81.8783 11.0027 82.4567 12.0476 82.0665 13.6141C82.0489 13.671 81.7666 15.2845 81.8058 16.0001H83.6036C83.6644 14.8904 84.829 14.1611 85.8386 14.614C86.1229 14.7414 86.415 14.9238 86.715 15.159C88.3227 16.4255 90.691 16.1256 91.9555 14.516L92.3163 14.0572L90.0421 12.4241C88.4815 11.1968 86.3994 11.7516 84.8584 12.8024C84.729 12.8907 84.5643 12.7495 84.6368 12.6084C86.4993 8.95391 88.9206 8.96175 89.8695 9.77341C91.0204 10.7576 92.7633 10.5812 93.7396 9.4264L94.02 9.09507L92.3829 7.82856H92.3849Z" fill="#51C233" />
-                    <path d="M111.257 4.27539C114.524 4.27557 116.739 6.46855 116.739 9.98145C116.739 10.3833 116.718 10.788 116.673 11.2568H108.84C108.974 12.6434 109.892 13.4053 111.391 13.4053C112.398 13.4052 113.045 12.9803 113.338 12.375H116.538C115.888 14.5682 114.189 16 111.37 16C107.991 15.9998 105.754 13.6502 105.754 10.0703H105.751C105.751 6.55739 107.99 4.27539 111.257 4.27539ZM132.095 4.27539C134.801 4.2756 136.503 6.02159 136.503 8.95117V15.665H133.369V9.28613C133.369 7.81028 132.697 7.09379 131.444 7.09375C130.192 7.09375 129.362 7.96679 129.362 9.37598V15.6621H126.23V4.61035H128.984V5.72852C129.634 4.76615 130.82 4.27539 132.095 4.27539ZM106.379 2.72949H103.313C102.663 2.72949 102.305 2.99745 102.305 3.64746V4.60938H105.706V7.33887H102.305V15.6621H99.171V7.33887H96.42V4.60938H99.171V3.26758C99.171 1.11907 100.402 0 102.528 0H106.379V2.72949ZM120.583 6.55371C120.851 5.30087 121.747 4.60645 123.156 4.60645H125.126V4.98535C125.126 6.28287 124.074 7.33493 122.776 7.33496C121.546 7.33496 120.963 7.96297 120.963 9.21582V15.6611H117.829V4.60645H120.583V6.55371ZM111.257 6.73633C109.736 6.73633 108.907 7.58722 108.818 8.88477H113.584V8.83984C113.584 7.58713 112.778 6.73647 111.257 6.73633Z" fill="#1E1F24" />
-                  </svg>
-                  <svg
-                    width="145"
-                    height="16"
-                    viewBox="0 0 145 16"
-                    fill="none"
-                    xmlns="http://www.w3.org/2000/svg"
-                    className="built-with-fern-logo built-with-fern-logo-dark"
-                    aria-hidden
-                  >
-                    <path d="M9.79656 4.8H14.5006C15.5139 4.8 16.3192 5.05067 16.9166 5.552C17.5139 6.04267 17.8126 6.71467 17.8126 7.568C17.8126 8.112 17.6739 8.608 17.3966 9.056C17.1192 9.504 16.7512 9.84 16.2926 10.064C16.8579 10.2667 17.3059 10.608 17.6366 11.088C17.9672 11.5573 18.1326 12.1173 18.1326 12.768C18.1326 13.7387 17.8286 14.5227 17.2206 15.12C16.6126 15.7067 15.7752 16 14.7086 16H9.79656V4.8ZM14.4846 14.528C15.1246 14.528 15.6206 14.3627 15.9726 14.032C16.3246 13.7013 16.5006 13.2373 16.5006 12.64C16.5006 12.0427 16.3246 11.5893 15.9726 11.28C15.6312 10.96 15.1352 10.8 14.4846 10.8H11.3966V14.528H14.4846ZM14.2766 9.424C14.8846 9.424 15.3539 9.28533 15.6846 9.008C16.0152 8.72 16.1806 8.32533 16.1806 7.824C16.1806 7.32267 16.0152 6.93867 15.6846 6.672C15.3539 6.40533 14.8846 6.272 14.2766 6.272H11.3966V9.424H14.2766ZM22.5778 16.224C21.6285 16.224 20.8871 15.9413 20.3538 15.376C19.8205 14.8107 19.5538 14 19.5538 12.944V8.304H21.1058V12.8C21.1058 13.472 21.2551 13.9787 21.5538 14.32C21.8631 14.6507 22.3005 14.816 22.8658 14.816C23.4525 14.816 23.9165 14.6293 24.2578 14.256C24.6098 13.872 24.7858 13.3707 24.7858 12.752V8.304H26.3378V16H24.9618V15.12C24.7165 15.4827 24.3858 15.76 23.9698 15.952C23.5538 16.1333 23.0898 16.224 22.5778 16.224ZM28.0746 8.304H29.6266V16H28.0746V8.304ZM27.9786 4.912H29.7066V6.752H27.9786V4.912ZM33.0334 16C32.4894 16 32.0948 15.888 31.8494 15.664C31.6041 15.44 31.4814 15.0667 31.4814 14.544V4.8H33.0334V14.064C33.0334 14.2667 33.0761 14.416 33.1614 14.512C33.2468 14.5973 33.3854 14.64 33.5774 14.64H34.5534V16H33.0334ZM37.9539 16C37.2819 16 36.7966 15.856 36.4979 15.568C36.1993 15.28 36.0499 14.8053 36.0499 14.144V9.664H34.0339V8.304H36.0499V6H37.6019V8.304H40.0179V9.664H37.6019V13.84C37.6019 14.1173 37.6659 14.32 37.7939 14.448C37.9219 14.576 38.1299 14.64 38.4179 14.64H40.0179V16H37.9539ZM43.5709 8.304H45.1869L46.8989 14.272L48.6109 8.304H50.3869L52.0989 14.272L53.8109 8.304H55.4269L53.0429 16H51.2189L49.5069 10.064L47.7789 
16H45.9549L43.5709 8.304ZM56.3746 8.304H57.9266V16H56.3746V8.304ZM56.2786 4.912H58.0066V6.752H56.2786V4.912ZM62.5971 16C61.9251 16 61.4397 15.856 61.1411 15.568C60.8424 15.28 60.6931 14.8053 60.6931 14.144V9.664H58.6771V8.304H60.6931V6H62.2451V8.304H64.6611V9.664H62.2451V13.84C62.2451 14.1173 62.3091 14.32 62.4371 14.448C62.5651 14.576 62.7731 14.64 63.0611 14.64H64.6611V16H62.5971ZM65.6727 4.8H67.2247V9.056C67.4807 8.736 67.8007 8.496 68.1847 8.336C68.5794 8.16533 69.0114 8.08 69.4807 8.08C70.4407 8.08 71.1927 8.368 71.7367 8.944C72.2807 9.50933 72.5527 10.3147 72.5527 11.36V16H71.0007V11.504C71.0007 10.832 70.8407 10.3307 70.5207 10C70.2114 9.65867 69.7687 9.488 69.1927 9.488C68.5954 9.488 68.1154 9.68 67.7527 10.064C67.4007 10.4373 67.2247 10.9333 67.2247 11.552V16H65.6727V4.8Z" fill="#EEEEF0" />
-                    <path d="M92.3848 7.82856C91.332 6.93847 89.7459 6.58166 88.3402 7.62074C88.2755 7.66779 88.1952 7.58741 88.2442 7.52468C88.5775 7.09532 88.9637 6.63263 89.2754 6.16798C89.593 5.69157 90.0675 5.35044 90.6145 5.18379C93.5259 4.30155 92.6515 0.00012207 92.6515 0.00012207C92.6515 0.00012207 88.154 0.290282 88.7088 4.17019C88.801 4.81913 88.6284 5.47983 88.2226 5.99545C87.7246 6.62479 87.1463 7.22667 86.7267 7.66191C86.6385 7.7521 86.4895 7.66583 86.5248 7.54428C86.9306 6.17778 87.2266 4.06432 85.8209 2.70175L83.8427 1.05881L83.4624 1.56071C82.3312 3.05268 82.6625 5.15634 84.1564 6.28561C85.0132 6.93259 85.4014 7.63643 85.3406 8.40888C85.3033 8.87157 85.0936 9.30485 84.7799 9.64794C84.1898 10.2949 83.6388 10.9889 83.2134 11.7928C83.1546 11.9045 82.984 11.8614 82.9899 11.734C83.0507 10.4067 82.9232 7.41489 80.6882 6.34639L78.1866 5.37984L77.9925 5.9582C77.3631 7.82464 78.3924 9.81851 80.2569 10.4518C81.8783 11.0027 82.4566 12.0476 82.0665 13.6141C82.0488 13.671 81.7665 15.2845 81.8057 16.0001H83.6036C83.6643 14.8904 84.8289 14.1611 85.8386 14.614C86.1229 14.7414 86.415 14.9238 86.7149 15.159C88.3226 16.4255 90.6909 16.1256 91.9555 14.516L92.3162 14.0572L90.042 12.4241C88.4814 11.1968 86.3993 11.7516 84.8583 12.8024C84.7289 12.8907 84.5642 12.7495 84.6368 12.6084C86.4993 8.95391 88.9206 8.96175 89.8695 9.77341C91.0203 10.7576 92.7632 10.5812 93.7396 9.4264L94.0199 9.09507L92.3829 7.82856H92.3848Z" fill="#51C233" />
-                    <path d="M111.257 4.27539C114.524 4.27557 116.739 6.46855 116.739 9.98145C116.739 10.3833 116.718 10.788 116.673 11.2568H108.84C108.974 12.6434 109.892 13.4053 111.391 13.4053C112.398 13.4052 113.045 12.9803 113.338 12.375H116.538C115.888 14.5682 114.189 16 111.37 16C107.991 15.9998 105.754 13.6502 105.754 10.0703H105.751C105.751 6.55739 107.989 4.27539 111.257 4.27539ZM132.095 4.27539C134.801 4.2756 136.503 6.02159 136.503 8.95117V15.665H133.369V9.28613C133.369 7.81028 132.697 7.09379 131.444 7.09375C130.191 7.09375 129.362 7.96679 129.362 9.37598V15.6621H126.229V4.61035H128.983V5.72852C129.633 4.76615 130.82 4.27539 132.095 4.27539ZM106.379 2.72949H103.312C102.662 2.72949 102.305 2.99745 102.305 3.64746V4.60938H105.706V7.33887H102.305V15.6621H99.1709V7.33887H96.4199V4.60938H99.1709V3.26758C99.1709 1.11907 100.402 0 102.528 0H106.379V2.72949ZM120.583 6.55371C120.851 5.30087 121.747 4.60645 123.156 4.60645H125.126V4.98535C125.126 6.28287 124.074 7.33493 122.776 7.33496C121.546 7.33496 120.963 7.96297 120.963 9.21582V15.6611H117.829V4.60645H120.583V6.55371ZM111.257 6.73633C109.736 6.73633 108.907 7.58722 108.817 8.88477H113.584V8.83984C113.584 7.58713 112.777 6.73647 111.257 6.73633Z" fill="#EEEEF0" />
-                  </svg>
-                  <span className="built-with-fern-tooltip">Developer-friendly docs for your API</span>
-                </a>
-              </div>
-            </div>
-          </div>
-          <div className="footer-item">
-            <div className="footer-links">
-              <a href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener">Privacy Policy</a>
-              <span className="pipe-separator"> | </span>
-              <a href="https://www.nvidia.com/en-us/preferences/cookie-policy/" target="_blank" rel="noopener">Manage My Privacy</a>
-              <span className="pipe-separator"> | </span>
-              <a href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener">Do Not Sell or Share My Data</a>
-              <span className="pipe-separator"> | </span>
-              <a href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank" rel="noopener">Terms of Service</a>
-              <span className="pipe-separator"> | </span>
-              <a href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener">Accessibility</a>
-              <span className="pipe-separator"> | </span>
-              <a href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank" rel="noopener">Corporate Policies</a>
-              <span className="pipe-separator"> | </span>
-              <a href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener">Product Security</a>
-              <span className="pipe-separator"> | </span>
-              <a href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener">Contact</a>
-            </div>
-          </div>
-          <div className="footer-item">
-            <p className="copyright">Copyright &#169; {currentYear}, NVIDIA Corporation.</p>
-          </div>
-        </div>
-      </div>
-    </footer>
-  );
-}
diff --git a/fern/main.css b/fern/main.css
deleted file mode 100644
index 936c895aaa..0000000000
--- a/fern/main.css
+++ /dev/null
@@ -1,872 +0,0 @@
-/*!
- * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
- *
- * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
- * property and proprietary rights in and to this material, related
- * documentation and any modifications thereto. Any use, reproduction,
- * disclosure or distribution of this material and related documentation
- * without an express license agreement from NVIDIA CORPORATION or
- * its affiliates is strictly prohibited.
- */
-
-/* Color themes for light and dark modes */
-:root {
-    /* Brand Colors */
-    --nv-color-green: #74B900;
-    --nv-color-green-2: #004B31;
-    --nv-color-black: #000000;
-    --nv-color-white: #FFFFFF;
-
-    /* Grey Scale - Light */
-    --nv-light-grey-1: #f7f7f7;
-    --nv-light-grey-2: #EEEEEE;
-    --nv-light-grey-3: #DDDDDD;
-    --nv-light-grey-4: #CCCCCC;
-    --nv-light-grey-5: #999999;
-
-    /* Grey Scale - Dark */
-    --nv-dark-grey-1: #111111;
-    --nv-dark-grey-2: #1A1A1A;
-    --nv-dark-grey-3: #222222;
-    --nv-dark-grey-4: #333333;
-    --nv-dark-grey-5: #666666;
-
-    /* Colors by Usage */
-    --nv-color-text: #000000;
-    --nv-color-bg-default: #FFFFFF;
-    --nv-color-bg-alt: #f7f7f7;
-    --nv-color-success: #76B900;
-    --nv-color-error: #f44336;
-
-    /* Theme-independent settings */
-    --rounded: 999px;
-}
-main {
-    min-height: calc(100vh - 200px);
-  }
-/* Typography - Headers */
-h1 {
-    font-size: 36px;
-    font-weight: 700;
-    line-height: 1.25em; /* 45px */
-}
-
-h2 {
-    font-size: 28px;
-    font-weight: 700;
-    line-height: 1.25em; /* 35px */
-}
-
-h3 {
-    font-size: 24px;
-    font-weight: 700;
-    line-height: 1.25em; /* 30px */
-}
-
-h4 {
-    font-size: 20px;
-    font-weight: 700;
-    line-height: 1.25em; /* 25px */
-}
-
-/* Typography - Paragraphs */
-.prose{
-    color: var(--nv-dark-grey-2) !important;
-}
-.dark .prose{
-    color: var(--nv-light-grey-2) !important;
-}
-p {
-    text-decoration-thickness: 3px;
-}
-.fern-mdx-link {
-    color: var(--tw-prose-body);
-    text-decoration-color: var(--accent);
-    font-weight: var(--font-weight-normal);
-}
-
-/* Badge links: hide redundant external-link icon (badges already indicate links) */
-.badge-links .fern-mdx-link svg {
-    display: none;
-}
-
-/* Light theme (default) */
-html:not([data-theme]),html[data-theme=light] {
-    --pst-color-background: #fff;
-    --pst-color-on-background: #fff;
-    --pst-color-shadow: #ccc;
-    --pst-color-heading: #000;
-    --pst-color-text-base: #1a1a1a;
-    --pst-color-text-muted: #666;
-    --pst-color-surface: #f7f7f7;
-    --pst-color-on-surface: #333;
-    --pst-color-primary: var(--nv-color-green-2);
-    --pst-color-table-row-hover-bg: var(--nv-color-green);
-    --pst-color-link: var(--pst-color-text-base);
-    --pst-color-link-hover: var(--pst-color-text-base);
-    --pst-color-inline-code: var(--pst-color-primary);
-    --pst-color-inline-code-links: var(--pst-color-primary);
-    --pst-color-secondary: var(--pst-color-primary);
-    --pst-color-secondary-bg: var(--nv-color-green);
-    --pst-color-accent: var(--nv-color-green);
-}
-
-/* Dark theme */
-html[data-theme=dark] {
-    --pst-color-background: #111;
-    --pst-color-on-background: #000;
-    --pst-color-shadow: #000;
-    --pst-color-heading: #fff;
-    --pst-color-text-base: #eee;
-    --pst-color-text-muted: #999;
-    --pst-color-surface: #1a1a1a;
-    --pst-color-on-surface: #ddd;
-    --pst-color-primary: var(--nv-color-green);
-    --pst-color-table-row-hover-bg: var(--nv-color-green-2);
-    --pst-color-link: var(--pst-color-text-base);
-    --pst-color-link-hover: var(--pst-color-text-base);
-    --pst-color-inline-code: var(--pst-color-primary);
-    --pst-color-inline-code-links: var(--pst-color-primary);
-    --pst-color-secondary: var(--pst-color-primary);
-    --pst-color-secondary-bg: var(--nv-color-green-2);
-    --pst-color-accent: var(--nv-color-green);
-}
-
-/* Product and verion selector styling */
-
-.fern-product-selector {
-  border-radius: 8px;
-  pointer-events: none !important;
-  padding-right: 2px;
-}
-
-.product-dropdown-trigger svg{
-    display: none !important;
-}
-
-.fern-product-selector .product-dropdown-trigger p{
-    font-weight: bold !important;
-}
-.fern-product-selector-radio-group {
-    display: grid;
-    grid-template-columns: repeat(3, 1fr);
-    gap: 8px;
-    max-width: 1000px;
-}
-
-@media (max-width: 768px) {
-    .fern-product-selector-radio-group {
-        grid-template-columns: repeat(2, 1fr);
-    }
-}
-.fern-version-selector {
-    transform: translateY(-1px);
-}
-
-.fern-version-selector .version-dropdown-trigger{
-    outline: 1px solid var(--border, var(--grayscale-a5)) !important;
-    border-radius: 5px;
-    transition: box-shadow 0.3s ease, outline 0.3s ease;
-}
-.product-dropdown-trigger{
-    padding-left: 0px !important;
-}
-
-.product-dropdown-trigger, .version-dropdown-trigger{
-    background-color: transparent !important;
-}
-.product-dropdown-trigger svg:hover{
-    stroke: var(--nv-color-green) !important;
-}
-.version-dropdown-trigger:hover{
-    box-shadow: 0 0 0 1px var(--nv-color-green) !important;
-}
-.version-dropdown-trigger svg:hover{
-    stroke: var(--nv-color-green) !important;
-}
-/* Sidebar styling */
-#fern-sidebar {
-    border-right: 1px solid var(--border, var(--grayscale-a5)) !important;
-    height: 100vh !important;
-}
-.fern-sidebar-link:not(:hover){
-    background-color: transparent !important;
-}
-.fern-sidebar-link {
-    padding-left: 1rem !important;
-    padding-right: 1rem !important;
-    padding-top: 0.5rem !important;
-    padding-bottom: 0.5rem !important;
-    border-radius: 0px !important;
-    &.nested {
-        padding-left: 1rem !important;
-    }
-}
-/* Section-level sidebar links (pages that have children) should match sidebar heading padding */
-.fern-sidebar-group > li > .fern-sidebar-link:has(+ .fern-sidebar-group) {
-    padding-left: 0.25rem !important;
-}
-.fern-sidebar-group{
-    padding: 0 !important
-}
-#fern-sidebar-scroll-area{
-    padding-right: 0 !important
-}
-
-/* header styling */
-.fern-header-content{
-    padding-left: 18.5px;
-    margin-top: -5px;
-    margin-bottom: -5px;
-}
-#fern-header {
-    border-color: var(--border, var(--grayscale-a5)) !important;
-}
-@keyframes header-background-fade {
-    0% {
-      background-color: transparent;
-    }
-    100% {
-      background-color: var(--header-background);
-    }
-  }
-
-[data-theme=default]#fern-header {
-animation: header-background-fade linear;
-animation-timeline: scroll();
-animation-range: 0 50px;
-}
-.fern-header-navbar-links .fern-button{
-    background-color: transparent !important;
-}
-.fern-header-navbar-links > button{
-    background-color: transparent !important;
-}
-.fern-header-logo-container > div > div > a > img{
-    padding-right: 0.5rem;
-}
-.fern-header-logo-container .font-heading{
-    font-size: 16px !important;
-    font-weight: bold !important;
-    color: var(--grayscale-a12) !important;
-    border-inline: 1px solid var(--border, var(--grayscale-a5));
-    padding: 15px 1rem;
-    margin: -20px 0.5rem;
-}
-@media (max-width: 1024px) {
-    .fern-header-logo-container .font-heading{
-        display: none !important;
-    }
-}
-/* Search bar styling */
-#fern-search-button{
-    background-color: transparent !important;
-    border-radius: var(--rounded);
-    transition: box-shadow 0.3s ease, outline 0.3s ease;
-}
-#fern-search-button:hover{
-    box-shadow: 0 0 0 1px var(--nv-color-green) !important;
-}
-#fern-search-button .fern-kbd{
-    display: none;
-}
-
-.fern-layout-footer-toolbar button{
-    background-color: transparent !important;
-    border-color: transparent !important;
-    padding-inline: 0px !important;
-}
-
-/* ========== Custom footer (native React component) – 1:1 with original ========== */
-.bd-footer {
-  border-top: 1px solid var(--border, var(--grayscale-a5)) !important;
-  font-family: NVIDIA, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif !important;
-  font-size: 0.875rem;
-  padding: 2rem 0;
-  width: 100%;
-}
-.bd-footer * {
-  font-family: inherit;
-}
-.bd-footer__inner {
-  padding: 0 2rem;
-}
-.footer-items__start {
-  display: flex;
-  flex-direction: column;
-  gap: 1.5rem;
-}
-.footer-logos-container {
-  display: flex;
-  align-items: center;
-  justify-content: space-between;
-  width: 100%;
-  gap: 1rem;
-}
-.footer-brand {
-  display: inline-block;
-  text-decoration: none;
-}
-.footer-brand .logo__image {
-  height: 24px;
-  width: auto;
-  transition: opacity 0.2s ease;
-}
-.footer-brand:hover .logo__image {
-  opacity: 0.8;
-}
-.footer-brand-fern {
-  display: flex;
-  align-items: center;
-  margin-left: auto;
-}
-/* Logo theme visibility – .dark is on ancestor in Fern */
-.only-light {
-  display: block;
-  filter: invert(1);
-}
-.only-dark {
-  display: none;
-}
-.dark .only-light {
-  display: none;
-}
-.dark .only-dark {
-  display: block;
-  filter: none;
-}
-.footer-links {
-  display: flex;
-  flex-wrap: wrap;
-  gap: 0.25rem 0.5rem;
-  line-height: 1.65;
-  margin: 0;
-  padding: 0;
-}
-.footer-links a {
-  color: var(--grayscale-a11);
-  text-decoration: none;
-  transition: color 0.2s ease;
-  white-space: nowrap;
-}
-.pipe-separator {
-  color: var(--grayscale-a11);
-  white-space: nowrap;
-}
-.copyright {
-  color: var(--grayscale-a11);
-  font-size: 0.875rem;
-  line-height: 1.65;
-  margin: 0;
-}
-@media (max-width: 768px) {
-  .bd-footer { padding: 1.5rem 0; }
-  .bd-footer__inner { padding: 0 1.5rem; }
-  .footer-items__start { gap: 1rem; }
-  .footer-links { flex-direction: row; gap: 0.5rem 0.75rem; }
-  .footer-links a { white-space: normal; word-break: break-word; }
-}
-@media (max-width: 480px) {
-  .footer-links { gap: 0.5rem; }
-  .footer-links a { font-size: 0.8125rem; }
-  .copyright { font-size: 0.8125rem; }
-}
-/* Built with Fern link + tooltip */
-.built-with-fern-link {
-  display: flex;
-  align-items: baseline;
-  gap: 0.25rem;
-  text-decoration: none;
-  position: relative;
-}
-.built-with-fern-logo {
-  height: 1rem;
-  margin: 0;
-  transition: filter 150ms ease;
-}
-.built-with-fern-logo path { fill: var(--grayscale-a12); }
-.built-with-fern-link:hover .built-with-fern-logo { filter: saturate(1) opacity(1); }
-.built-with-fern-link:hover .built-with-fern-logo path:nth-child(2) { fill: #51C233; }
-.built-with-fern-tooltip {
-  position: absolute;
-  top: 50%;
-  right: calc(100%);
-  bottom: auto;
-  left: auto;
-  transform: translateY(-50%);
-  margin: 0;
-  margin-right: 0.5rem;
-  padding: 0.5rem 0.75rem;
-  background-color: #FFFFFF;
-  color: #000000;
-  font-size: 0.85rem;
-  border-radius: 0.375rem;
-  border: 1px solid var(--grayscale-a5);
-  white-space: nowrap;
-  pointer-events: none;
-  opacity: 0;
-  transition: opacity 150ms ease;
-  transition-delay: 0s;
-  z-index: 50;
-  box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
-  width: max-content;
-}
-.built-with-fern-link:hover .built-with-fern-tooltip {
-  opacity: 1;
-  transition-delay: 0.75s;
-}
-.dark .built-with-fern-tooltip {
-  background-color: #000000;
-  color: #FFFFFF;
-}
-.built-with-fern-logo-dark { display: none; }
-.dark .built-with-fern-logo-light { display: none; }
-.dark .built-with-fern-logo-dark { display: block; }
-@media (prefers-color-scheme: dark) {
-  .built-with-fern-logo-light { display: none; }
-  .built-with-fern-logo-dark { display: block; }
-}
-
-/* Footer styling */
-.fern-footer-nav{
-    border-radius: var(--rounded);
-    background-color: transparent !important;
-    transition: box-shadow 0.3s ease, outline 0.3s ease;
-}
-/* Hide line numbers */
-.code-block-line-gutter {
-    display: none !important;
-}
-.fern-footer-prev h4, .fern-footer-next h4{
-    font-size: inherit !important;
-}
-.fern-sidebar-link.nested[data-state="active"]:before {
-    left: -0px !important;
-    bottom: -0px !important;
-    top: -0px !important;
-    width: 2px !important;
-}
-.fern-sidebar-link[data-state="active"] {
-    color: unset !important;
-}
-
-.fern-selection-item .fern-selection-item-icon{
-    border-color: transparent !important;
-}
-/* Button styling */
-.fern-button{
-    border-radius: var(--rounded);
-    font-weight: bold;
-}
-.fern-button.filled.primary{
-    color: var(--nv-color-black);
-}
-.dark .fern-button.filled.primary{
-    background-color: var(--nv-color-white);
-}
-.dark .fern-button.filled.primary:hover{
-    background-color: var(--nv-light-grey-2);
-}
-.fern-button.outlined.normal{
-    background-color: transparent;
-    --tw-ring-color: transparent;
-    color: var(--nv-color-black);
-}
-.fern-button.outlined.normal:hover{
-    color: var(--nv-color-green)
-}
-.dark .fern-button.outlined.normal{
-    color: var(--nv-color-white);
-}
-.dark .fern-button.outlined.normal:hover{
-    color: var(--nv-color-green);
-}
-/* Card styling */
-.fern-card{
-    transition: box-shadow 0.3s ease, outline 0.3s ease;
-}
-svg.card-icon{
-    height: 24px !important;
-    width: 24px !important;
-}
-.card-icon{
-    background-color: transparent !important;
-}
-.fern-card:hover{
-    box-shadow: 0 0 0 1px var(--nv-color-green) !important;
-}
-.fern-docs-badge{
-    border-radius: var(--rounded);
-}
-.fern-page-actions button:hover{
-    background-color: transparent !important;
-}
-.fern-page-actions a:hover{
-    background-color: transparent !important;
-}
-/* Moving logo to footer */
-#builtwithfern, #builtwithfern * {
-    display: none !important;
-}
-
-/* Landing Page Gradients */
-/* Top: Simple radial gradient (no mask, responsive) */
-.landing-gradient-top {
-    position: absolute;
-    top: 0;
-    left: 0;
-    right: 0;
-    height: 800px;
-    background: radial-gradient(ellipse 100% 100% at 50% 10%,
-        rgba(191, 242, 48, 0.15) 0%,
-        rgba(158, 228, 179, 0.12) 30%,
-        rgba(124, 215, 254, 0.12) 50%,
-        rgba(124, 215, 254, 0.06) 75%,
-        transparent 100%);
-    pointer-events: none;
-    z-index: 0;
-}
-
-/* Bottom: Masked gradient for organic transition */
-.landing-gradient-bottom {
-    position: absolute;
-    bottom: -282px;
-    left: 0;
-    right: 0;
-    height: 1232px;
-    background: linear-gradient(85deg, #BFF230 41.98%, #7CD7FE 99.52%);
-    opacity: 0.05;
-    pointer-events: none;
-    z-index: 5;
-    mask-image: url('https://www.figma.com/api/mcp/asset/27509afa-9c16-46bb-8415-4395e2e5a347');
-    mask-repeat: no-repeat;
-    mask-position: 0% -17px;
-    mask-size: 100% auto;
-    -webkit-mask-image: url('https://www.figma.com/api/mcp/asset/27509afa-9c16-46bb-8415-4395e2e5a347');
-    -webkit-mask-repeat: no-repeat;
-    -webkit-mask-position: 0% -17px;
-    -webkit-mask-size: 100% auto;
-}
-
-/* Landing Page Gradients Wrapper */
-.landing-page-gradients {
-    position: relative;
-    width: 100%;
-    margin-top: -100px;
-    padding-top: 100px;
-    overflow: visible;
-    background: #181818;
-}
-
-/* Hero Section (Landing page only) */
-.hero-section {
-    position: relative;
-    width: 100%;
-    padding: 3rem 6rem;
-    margin: 0 auto;
-    overflow: visible;
-    display: flex;
-    flex-direction: column;
-    align-items: center;
-    z-index: 10;
-}
-
-/* Hero Section Content - constrain width */
-.hero-section > * {
-    position: relative;
-    z-index: 100;
-    max-width: 1440px;
-    width: 100%;
-}
-
-/* Tablet and Mobile: fix spacing and layout */
-@media (max-width: 1024px) {
-    /* Extend dark background behind header */
-    .landing-page body, .landing-page html, .landing-page main {
-        background: #181818 !important;
-    }
-
-    .landing-page-gradients {
-        margin-top: -100px;
-        padding-top: 100px;
-    }
-
-    .hero-section {
-        padding: 2rem 2rem;
-    }
-
-    .hero-section > * {
-        max-width: none;
-    }
-
-    .hero-content-grid {
-        grid-template-columns: 1fr;
-        gap: 2rem;
-    }
-
-    .hero-heading {
-        font-size: 36px;
-    }
-
-    .hero-subtitle {
-        font-size: 16px;
-    }
-
-    .hero-title-section {
-        margin-bottom: 2rem;
-    }
-}
-
-/* Small mobile only */
-@media (max-width: 600px) {
-    .hero-heading {
-        font-size: 28px;
-    }
-
-    .hero-section {
-        padding: 1.5rem 1.5rem;
-    }
-}
-
-.hero-section h1,
-.hero-section h2,
-.hero-section h3,
-.hero-section h4,
-.hero-section h5,
-.hero-section h6 {
-    pointer-events: none !important;
-}
-/* Hero Title Section */
-.hero-title-section {
-    text-align: center;
-    margin-bottom: 4rem;
-    position: relative;
-    z-index: 100;
-}
-
-.hero-heading {
-    font-size: 48px;
-    font-weight: 700;
-    line-height: 1.2;
-    margin: 0 0 1rem 0;
-    color: var(--nv-color-white);
-}
-
-.hero-subtitle {
-    font-size: 18px;
-    line-height: 1.5;
-    margin: 0;
-    color: var(--nv-color-white);
-}
-
-/* Hero Content Grid */
-.hero-content-grid {
-    display: grid;
-    grid-template-columns: repeat(2, 1fr);
-    gap: 3rem;
-    align-items: start;
-    position: relative;
-    z-index: 100;
-}
-
-.hero-column {
-    display: flex;
-    flex-direction: column;
-    gap: 1rem;
-}
-
-.hero-column-title {
-    font-size: 24px;
-    font-weight: 700;
-    margin: 0;
-    color: var(--nv-color-white);
-}
-
-.hero-column-subtitle {
-    font-size: 16px;
-    margin: 0 0 1rem 0;
-    color: var(--nv-color-white);
-}
-
-/* Hero Card Container (Left Column) */
-.hero-card-container {
-    display: flex;
-    flex-direction: column;
-    border-radius: 8px;
-    overflow: hidden;
-    border: 1px solid var(--border, var(--grayscale-a5));
-    margin-top: 1.5rem !important;
-    background: rgba(26, 26, 26, 0.2);
-    backdrop-filter: blur(6px);
-}
-
-.hero-card-image {
-    width: 100%;
-    height: auto;
-    display: block;
-}
-
-.hero-card-content {
-    padding: 1.5rem;
-    display: flex;
-    flex-direction: row;
-    gap: 1rem;
-    align-items: center;
-    justify-content: space-between;
-    background: rgba(26, 26, 26, 0.2);
-    backdrop-filter: blur(6px);
-}
-
-.hero-card-text-wrapper {
-    flex: 1;
-}
-
-.hero-card-text {
-    margin: 0;
-    font-size: 14px;
-    line-height: 1.5;
-    color: var(--nv-color-white);
-}
-
-.hero-card-button-wrapper {
-    flex-shrink: 0;
-}
-.hero-card-button-wrapper .fern-mdx-link{
-    text-decoration: none !important;
-}
-
-.hero-card-button {
-    white-space: nowrap;
-}
-
-/* Hero Cards */
-
-.hero-column .fern-card {
-    padding: 9px 17px;
-    background-color: rgba(26, 26, 26, 0.2) !important;
-    backdrop-filter: blur(6px);
-}
-
-.hero-section .fern-card{
-    color: white !important;
-}
-
-.hero-column .card-icon {
-    font-size: 64px !important;
-    width: 64px !important;
-    height: 64px !important;
-}
-
-.hero-column .card-icon svg,
-.hero-column .card-icon i {
-    font-size: 64px !important;
-    width: 64px !important;
-    height: 64px !important;
-}
-
-.hero-column .fern-card-title {
-    font-size: 16px;
-    font-weight: 500;
-    line-height: 24px;
-}
-
-.hero-column .fern-card p {
-    font-size: 14px;
-    line-height: 20px;
-    color: white !important;
-}
-
-/* Body Section */
-.body-section {
-    display: flex;
-    padding: 4rem 16rem;
-    flex-direction: column;
-    justify-content: center;
-    align-items: center;
-    gap: 4rem;
-    align-self: stretch;
-    position: relative;
-    z-index: 1;
-    background: #181818;
-}
-
-/* Body Section Content - constrain width */
-.body-section > * {
-    max-width: 1440px;
-    width: 100%;
-    position: relative;
-    z-index: 10;
-}
-
-.code-block .fern-code-link{
-    text-decoration: underline !important;
-    text-decoration-color: var(--accent) !important;
-    text-underline-offset: 1px !important;
-    text-decoration-style: underline !important;
-}
-
-/* Mobile Styles */
-@media (max-width: 768px) {
-    .hero-section {
-        padding: 2rem 1.5rem;
-    }
-
-    .hero-title-section {
-        margin-bottom: 2rem;
-    }
-
-    .hero-heading {
-        font-size: 32px;
-    }
-
-    .hero-subtitle {
-        font-size: 16px;
-    }
-
-    .hero-content-grid {
-        grid-template-columns: 1fr;
-        gap: 2rem;
-    }
-
-    .hero-column-title {
-        font-size: 20px;
-    }
-
-    .hero-column-subtitle {
-        font-size: 14px;
-    }
-
-    .hero-card-content {
-        flex-direction: column;
-        align-items: flex-start;
-    }
-
-    .hero-card-button-wrapper {
-        align-self: flex-start;
-    }
-
-    .hero-column .card-icon,
-    .hero-column .card-icon svg,
-    .hero-column .card-icon i {
-        font-size: 40px !important;
-        width: 40px !important;
-        height: 40px !important;
-    }
-
-    .hero-column .fern-card-title {
-        font-size: 14px;
-    }
-
-    .hero-column .fern-card p {
-        font-size: 11px;
-    }
-
-    .body-section {
-        padding: 2rem 1.5rem;
-    }
-
-    .fern-selection-item-icon.use-icon {
-        display: none !important;
-    }
-}
\ No newline at end of file
diff --git a/fern/versions/nightly.yml b/fern/versions/nightly.yml
deleted file mode 100644
index dd2f6f15e4..0000000000
--- a/fern/versions/nightly.yml
+++ /dev/null
@@ -1,351 +0,0 @@
-navigation:
-  - section: "Get Started"
-    contents:
-      - page: "About NeMo AutoModel"
-        path: ./nightly/pages/about/index.mdx
-        slug: about
-      - page: "Key Features and Concepts"
-        path: ./nightly/pages/about/key-features.mdx
-        slug: key-features
-      - page: "Install NeMo AutoModel"
-        path: ./nightly/pages/guides/installation.mdx
-        slug: installation
-      - page: "YAML Configuration"
-        path: ./nightly/pages/guides/configuration.mdx
-        slug: configuration
-      - page: "🤗 Transformers API Compatibility"
-        path: ./nightly/pages/guides/huggingface-api-compatibility.mdx
-        slug: hf-compatibility
-      - page: "Repository Structure"
-        path: ./nightly/pages/repository-structure.mdx
-        slug: repo-structure
-      - page: "Release Notes"
-        path: ./nightly/pages/release-notes.mdx
-        slug: release-notes
-  - section: "Announcements"
-    contents:
-      - page: "Announcements"
-        path: ./nightly/pages/announcements.mdx
-  - section: "NeMo AutoModel Performance"
-    slug: performance
-    contents:
-      - page: "Performance Summary"
-        path: ./nightly/pages/performance-summary.mdx
-        slug: performance-summary
-  - section: "Model Coverage"
-    contents:
-      - page: "Model Coverage Overview"
-        path: ./nightly/pages/model-coverage/overview.mdx
-        slug: overview
-      - page: "Model Release Log"
-        path: ./nightly/pages/model-coverage/latest-models.mdx
-        slug: release-log
-      - section: "Large Language Models (LLMs)"
-        slug: large-language-models
-        contents:
-          - page: "Overview"
-            path: ./nightly/pages/model-coverage/llm/index.mdx
-            slug: overview
-          - page: "Llama"
-            path: ./nightly/pages/model-coverage/llm/meta/llama.mdx
-          - page: "Gemma"
-            path: ./nightly/pages/model-coverage/llm/google/gemma.mdx
-          - page: "Qwen2"
-            path: ./nightly/pages/model-coverage/llm/qwen/qwen2.mdx
-          - page: "Qwen2 MoE"
-            path: ./nightly/pages/model-coverage/llm/qwen/qwen2-moe.mdx
-          - page: "Qwen3"
-            path: ./nightly/pages/model-coverage/llm/qwen/qwen3.mdx
-          - page: "Qwen3 MoE"
-            path: ./nightly/pages/model-coverage/llm/qwen/qwen3-moe.mdx
-          - page: "Qwen3-Next"
-            path: ./nightly/pages/model-coverage/llm/qwen/qwen3-next.mdx
-          - page: "ERNIE 4.5"
-            path: ./nightly/pages/model-coverage/llm/baidu/ernie4-5.mdx
-          - page: "DeepSeek"
-            path: ./nightly/pages/model-coverage/llm/deepseek-ai/deepseek.mdx
-          - page: "DeepSeek-V3"
-            path: ./nightly/pages/model-coverage/llm/deepseek-ai/deepseek-v3.mdx
-          - page: "DeepSeek-V4 Flash"
-            path: ./nightly/pages/model-coverage/llm/deepseek-ai/dsv4-flash.mdx
-          - page: "Mistral"
-            path: ./nightly/pages/model-coverage/llm/mistralai/mistral.mdx
-          - page: "Mixtral"
-            path: ./nightly/pages/model-coverage/llm/mistralai/mixtral.mdx
-          - page: "Ministral3 / Devstral"
-            path: ./nightly/pages/model-coverage/llm/mistralai/ministral3.mdx
-          - page: "Phi"
-            path: ./nightly/pages/model-coverage/llm/microsoft/phi.mdx
-          - page: "Phi-3 / Phi-4"
-            path: ./nightly/pages/model-coverage/llm/microsoft/phi3.mdx
-          - page: "Phi-3-Small"
-            path: ./nightly/pages/model-coverage/llm/microsoft/phi3-small.mdx
-          - page: "Nemotron / Minitron"
-            path: ./nightly/pages/model-coverage/llm/nvidia/nemotron.mdx
-          - page: "Nemotron-H"
-            path: ./nightly/pages/model-coverage/llm/nvidia/nemotron-h.mdx
-          - page: "Nemotron-Flash"
-            path: ./nightly/pages/model-coverage/llm/nvidia/nemotron-flash.mdx
-          - page: "Nemotron-Super (Llama-3.3-Nemotron-Super-49B)"
-            path: ./nightly/pages/model-coverage/llm/nvidia/nemotron-super.mdx
-          - page: "ChatGLM"
-            path: ./nightly/pages/model-coverage/llm/thudm/chatglm.mdx
-          - page: "GLM-4"
-            path: ./nightly/pages/model-coverage/llm/thudm/glm4.mdx
-          - page: "GLM-4 MoE (GLM-4.5 / GLM-4.7)"
-            path: ./nightly/pages/model-coverage/llm/thudm/glm4-moe.mdx
-          - page: "GLM-5 MoE (DSA)"
-            path: ./nightly/pages/model-coverage/llm/thudm/glm5-moe-dsa.mdx
-          - page: "Granite"
-            path: ./nightly/pages/model-coverage/llm/ibm/granite.mdx
-          - page: "Granite MoE"
-            path: ./nightly/pages/model-coverage/llm/ibm/granite-moe.mdx
-          - page: "Bamba"
-            path: ./nightly/pages/model-coverage/llm/ibm/bamba.mdx
-          - page: "OLMo"
-            path: ./nightly/pages/model-coverage/llm/allenai/olmo.mdx
-          - page: "OLMo2"
-            path: ./nightly/pages/model-coverage/llm/allenai/olmo2.mdx
-          - page: "OLMoE"
-            path: ./nightly/pages/model-coverage/llm/allenai/olmoe.mdx
-          - page: "GPT-OSS"
-            path: ./nightly/pages/model-coverage/llm/openai/gpt-oss.mdx
-          - page: "GPT-2"
-            path: ./nightly/pages/model-coverage/llm/openai/gpt2.mdx
-          - page: "GPT-J"
-            path: ./nightly/pages/model-coverage/llm/eleutherai/gpt-j.mdx
-          - page: "GPT-NeoX / Pythia"
-            path: ./nightly/pages/model-coverage/llm/eleutherai/gpt-neox.mdx
-          - page: "StarCoder"
-            path: ./nightly/pages/model-coverage/llm/bigcode/starcoder.mdx
-          - page: "StarCoder2"
-            path: ./nightly/pages/model-coverage/llm/bigcode/starcoder2.mdx
-          - page: "Aquila / Aquila2"
-            path: ./nightly/pages/model-coverage/llm/baai/aquila.mdx
-          - page: "Baichuan / Baichuan2"
-            path: ./nightly/pages/model-coverage/llm/baichuan-inc/baichuan.mdx
-          - page: "Command-R"
-            path: ./nightly/pages/model-coverage/llm/cohere/command-r.mdx
-          - page: "Falcon"
-            path: ./nightly/pages/model-coverage/llm/tiiuae/falcon.mdx
-          - page: "EXAONE"
-            path: ./nightly/pages/model-coverage/llm/lgai-exaone/exaone.mdx
-          - page: "InternLM"
-            path: ./nightly/pages/model-coverage/llm/internlm/internlm.mdx
-          - page: "Jais"
-            path: ./nightly/pages/model-coverage/llm/inceptionai/jais.mdx
-          - page: "MiniMax-M2"
-            path: ./nightly/pages/model-coverage/llm/minimax/minimax-m2.mdx
-          - page: "MiniCPM"
-            path: ./nightly/pages/model-coverage/llm/openbmb/minicpm.mdx
-          - page: "Moonlight"
-            path: ./nightly/pages/model-coverage/llm/moonshotai/moonlight.mdx
-          - page: "Seed (ByteDance)"
-            path: ./nightly/pages/model-coverage/llm/bytedance-seed/seed.mdx
-          - page: "Solar Pro"
-            path: ./nightly/pages/model-coverage/llm/upstage/solar.mdx
-          - page: "Orion"
-            path: ./nightly/pages/model-coverage/llm/orionstar/orion.mdx
-          - page: "StableLM"
-            path: ./nightly/pages/model-coverage/llm/stabilityai/stablelm.mdx
-          - page: "Step-3.5"
-            path: ./nightly/pages/model-coverage/llm/stepfun-ai/step-3-5.mdx
-          - page: "GritLM"
-            path: ./nightly/pages/model-coverage/llm/parasail-ai/gritlm.mdx
-          - page: "Hy3-preview"
-            path: ./nightly/pages/model-coverage/llm/tencent/hy3.mdx
-          - page: "MiMo-V2-Flash"
-            path: ./nightly/pages/model-coverage/llm/xiaomimimo/mimo-v2-flash.mdx
-          - page: "Ling 2.0"
-            path: ./nightly/pages/model-coverage/llm/inclusionai/ling-2.mdx
-      - section: "Vision Language Models (VLMs)"
-        slug: vision-language-models
-        contents:
-          - page: "Overview"
-            path: ./nightly/pages/model-coverage/vlm/index.mdx
-            slug: overview
-          - page: "Kimi-VL"
-            path: ./nightly/pages/model-coverage/vlm/moonshotai/kimi-vl.mdx
-          - page: "Gemma 3 VL / Gemma 3n"
-            path: ./nightly/pages/model-coverage/vlm/google/gemma3-vl.mdx
-          - page: "Gemma 4"
-            path: ./nightly/pages/model-coverage/vlm/google/gemma4.mdx
-          - page: "Qwen2.5-VL"
-            path: ./nightly/pages/model-coverage/vlm/qwen/qwen2-5-vl.mdx
-          - page: "Qwen3-VL / Qwen3-VL-MoE"
-            path: ./nightly/pages/model-coverage/vlm/qwen/qwen3-vl.mdx
-          - page: "Qwen3.5-VL"
-            path: ./nightly/pages/model-coverage/vlm/qwen/qwen3-5-vl.mdx
-          - page: "Nemotron-Parse"
-            path: ./nightly/pages/model-coverage/vlm/nvidia/nemotron-parse.mdx
-          - page: "Ministral3 VL"
-            path: ./nightly/pages/model-coverage/vlm/mistralai/ministral3-vl.mdx
-          - page: "Mistral Medium 3.5"
-            path: ./nightly/pages/model-coverage/vlm/mistralai/mistral-medium-3-5.mdx
-          - page: "Mistral-Small-4"
-            path: ./nightly/pages/model-coverage/vlm/mistralai/mistral-small-4.mdx
-          - page: "InternVL"
-            path: ./nightly/pages/model-coverage/vlm/internlm/internvl.mdx
-          - page: "Llama 4"
-            path: ./nightly/pages/model-coverage/vlm/meta/llama4.mdx
-          - page: "LLaVA-OneVision"
-            path: ./nightly/pages/model-coverage/vlm/lmms-lab/llava-onevision.mdx
-          - page: "SmolVLM"
-            path: ./nightly/pages/model-coverage/vlm/huggingface/smolvlm.mdx
-          - page: "LLaVA"
-            path: ./nightly/pages/model-coverage/vlm/llava-hf/llava.mdx
-      - section: "Omni Models"
-        slug: omni
-        contents:
-          - page: "Overview"
-            path: ./nightly/pages/model-coverage/omni/index.mdx
-            slug: overview
-          - page: "Qwen3-Omni"
-            path: ./nightly/pages/model-coverage/omni/qwen/qwen3-omni.mdx
-          - page: "Phi-4-multimodal"
-            path: ./nightly/pages/model-coverage/omni/microsoft/phi4-multimodal.mdx
-          - page: "Nemotron-Omni"
-            path: ./nightly/pages/model-coverage/omni/nvidia/nemotron-omni.mdx
-      - section: "Diffusion Models"
-        slug: diffusion
-        contents:
-          - page: "Overview"
-            path: ./nightly/pages/model-coverage/diffusion/index.mdx
-            slug: overview
-          - page: "Wan 2.1 T2V"
-            path: ./nightly/pages/model-coverage/diffusion/wan-ai/wan2-1-t2v.mdx
-          - page: "FLUX.1-dev"
-            path: ./nightly/pages/model-coverage/diffusion/black-forest-labs/flux.mdx
-          - page: "HunyuanVideo 1.5"
-            path: ./nightly/pages/model-coverage/diffusion/hunyuanvideo-community/hunyuanvideo.mdx
-          - page: "Qwen-Image"
-            path: ./nightly/pages/model-coverage/diffusion/qwen/qwen-image.mdx
-  - section: "Recipes & E2E Examples"
-    contents:
-      - page: "Recipes and End-to-End Examples"
-        path: ./nightly/pages/guides/overview.mdx
-        slug: overview
-      - page: "Supervised Fine-Tuning (SFT) and Parameter-Efficient Fine-Tuning (PEFT) with NeMo AutoModel"
-        path: ./nightly/pages/guides/llm/finetune.mdx
-        slug: sft-peft
-      - page: "Function Calling with NeMo AutoModel using FunctionGemma"
-        path: ./nightly/pages/guides/llm/toolcalling.mdx
-        slug: function-calling
-      - page: "Knowledge Distillation with NeMo AutoModel"
-        path: ./nightly/pages/guides/llm/knowledge-distillation.mdx
-        slug: knowledge-distillation
-      - page: "Fine-Tune Large MoE LLMs"
-        path: ./nightly/pages/guides/llm/large-moe-finetune.mdx
-        slug: large-moe-fine-tuning
-      - page: "DeepSeek V4 Flash"
-        path: ./nightly/pages/guides/llm/dsv4-flash.mdx
-        slug: deepseek-v4-flash
-      - page: "Hy3-preview"
-        path: ./nightly/pages/guides/llm/hy3.mdx
-        slug: hy3-preview
-      - page: "Pretraining Megatron Core Datasets with NeMo AutoModel"
-        path: ./nightly/pages/guides/llm/pretraining.mdx
-        slug: pretraining
-      - page: "LLM Pre-Training with NeMo AutoModel"
-        path: ./nightly/pages/guides/llm/nanogpt-pretraining.mdx
-        slug: nanogpt-pretraining
-      - page: "Sequence Classification (SFT/PEFT) with NeMo AutoModel"
-        path: ./nightly/pages/guides/llm/sequence-classification.mdx
-        slug: sequence-classification
-      - page: "Fine-Tune Gemma 3 and Gemma 3n"
-        path: ./nightly/pages/guides/omni/gemma3-3n.mdx
-        slug: gemma-3-3n
-      - page: "Fine-Tuning Gemma 4 31B on CORD-v2 Receipts — End-to-End Guide"
-        path: ./nightly/pages/guides/vlm/gemma4.mdx
-        slug: gemma-4
-      - page: "Fine-Tune Qwen3.5-VL"
-        path: ./nightly/pages/guides/vlm/qwen3-5.mdx
-        slug: qwen3-5-vl
-      - page: "Nemotron-Omni"
-        path: ./nightly/pages/guides/vlm/nemotron-omni.mdx
-        slug: nemotron-omni
-      - page: "Mistral Medium 3.5 VL"
-        path: ./nightly/pages/guides/vlm/mistral-medium-3-5.mdx
-        slug: mistral-medium-3-5
-      - page: "Diffusion Model Fine-Tuning with NeMo AutoModel"
-        path: ./nightly/pages/guides/diffusion/finetune.mdx
-        slug: diffusion-fine-tuning
-      - page: "dLLM Fine-Tuning"
-        path: ./nightly/pages/guides/dllm/finetune.mdx
-        slug: dllm-fine-tuning
-      - page: "Quantization-Aware Training (QAT) in NeMo Automodel"
-        path: ./nightly/pages/guides/quantization-aware-training.mdx
-        slug: qat
-      - page: "Model Training on Databricks"
-        path: ./nightly/pages/guides/llm/databricks.mdx
-        slug: databricks
-  - section: "Datasets"
-    contents:
-      - page: "Dataset Overview: LLM, VLM, and Retrieval Datasets in NeMo AutoModel"
-        path: ./nightly/pages/guides/dataset-overview.mdx
-        slug: overview
-      - page: "Integrate Your Own Text Dataset"
-        path: ./nightly/pages/guides/llm/dataset.mdx
-        slug: text-dataset
-      - page: "Retrieval Dataset (Embedding Fine-tuning)"
-        path: ./nightly/pages/guides/llm/retrieval-dataset.mdx
-        slug: retrieval-dataset
-      - page: "Use the ColumnMappedTextInstructionDataset"
-        path: ./nightly/pages/guides/llm/column-mapped-text-instruction-dataset.mdx
-        slug: columnmapped-dataset
-      - page: "Use the ColumnMappedTextInstructionIterableDataset (Streaming)"
-        path: ./nightly/pages/guides/llm/column-mapped-text-instruction-iterable-dataset.mdx
-        slug: columnmapped-iterable
-      - page: "Integrate Your Own Multi-Modal Dataset"
-        path: ./nightly/pages/guides/vlm/dataset.mdx
-        slug: multi-modal-dataset
-      - page: "Diffusion Dataset Preparation"
-        path: ./nightly/pages/guides/diffusion/dataset.mdx
-        slug: diffusion-dataset
-  - section: "Job Launchers"
-    contents:
-      - page: "Job Launchers"
-        path: ./nightly/pages/launcher/overview.mdx
-        slug: overview
-      - page: "Run on Your Local Workstation"
-        path: ./nightly/pages/launcher/local-workstation.mdx
-        slug: local-workstation
-      - page: "Run on a Cluster"
-        path: ./nightly/pages/launcher/slurm.mdx
-        slug: slurm-cluster
-      - page: "Run with NeMo Run"
-        path: ./nightly/pages/launcher/nemo-run.mdx
-        slug: nemo-run
-      - page: "Run on Any Cloud with SkyPilot"
-        path: ./nightly/pages/launcher/skypilot.mdx
-        slug: skypilot
-      - page: "SkyPilot k8s"
-        path: ./nightly/pages/launcher/skypilot-kubernetes.mdx
-        slug: skypilot-k8s
-  - section: "Development"
-    contents:
-      - page: "Checkpointing in NeMo Automodel"
-        path: ./nightly/pages/guides/checkpointing.mdx
-        slug: checkpointing
-      - page: "Gradient (Activation) Checkpointing in NeMo AutoModel"
-        path: ./nightly/pages/guides/gradient-checkpointing.mdx
-        slug: gradient-checkpointing
-      - page: "Pipeline Parallelism with AutoPipeline"
-        path: ./nightly/pages/guides/pipelining.mdx
-        slug: pipeline-parallelism
-      - page: "FP8 Training in NeMo AutoModel"
-        path: ./nightly/pages/guides/fp8-training.mdx
-        slug: fp8-training
-      - page: "MLflow Logging in NeMo AutoModel"
-        path: ./nightly/pages/guides/mlflow-logging.mdx
-        slug: mlflow-logging
-      - page: "Breaking Changes"
-        path: ./nightly/pages/breaking-changes.mdx
-        slug: breaking-changes
-      - section: "API Reference"
-        slug: api-reference
-        contents:
-          - page: "Overview"
-            path: ./nightly/pages/api-reference/index.mdx
-            slug: overview
-          - folder: ../product-docs/nemo-automodel/Full-Library-Reference
diff --git a/fern/versions/v0.4/pages/automodel_diagram.png b/fern/versions/v0.4/pages/automodel_diagram.png
deleted file mode 100644
index bb56ea575a..0000000000
Binary files a/fern/versions/v0.4/pages/automodel_diagram.png and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/fp8_convergence.jpg b/fern/versions/v0.4/pages/guides/fp8_convergence.jpg
deleted file mode 100644
index 15733abf29..0000000000
Binary files a/fern/versions/v0.4/pages/guides/fp8_convergence.jpg and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-multi.png b/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-multi.png
deleted file mode 100644
index 0dcb08d7ef..0000000000
Binary files a/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-multi.png and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-single.png b/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-single.png
deleted file mode 100644
index 9f297a649e..0000000000
Binary files a/fern/versions/v0.4/pages/guides/llm/databricks-gpu-metrics-single.png and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/llm/functiongemma-peft-loss.png b/fern/versions/v0.4/pages/guides/llm/functiongemma-peft-loss.png
deleted file mode 100644
index 036ff9d8db..0000000000
Binary files a/fern/versions/v0.4/pages/guides/llm/functiongemma-peft-loss.png and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/llm/functiongemma-sft-loss.png b/fern/versions/v0.4/pages/guides/llm/functiongemma-sft-loss.png
deleted file mode 100644
index 8e05866ccd..0000000000
Binary files a/fern/versions/v0.4/pages/guides/llm/functiongemma-sft-loss.png and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/llm/gpt2_loss.png b/fern/versions/v0.4/pages/guides/llm/gpt2_loss.png
deleted file mode 100644
index 301577d71f..0000000000
Binary files a/fern/versions/v0.4/pages/guides/llm/gpt2_loss.png and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/omni/medpix.jpg b/fern/versions/v0.4/pages/guides/omni/medpix.jpg
deleted file mode 100644
index 3812e4414e..0000000000
Binary files a/fern/versions/v0.4/pages/guides/omni/medpix.jpg and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/omni/medpix_peft.jpg b/fern/versions/v0.4/pages/guides/omni/medpix_peft.jpg
deleted file mode 100644
index 1fb3f19031..0000000000
Binary files a/fern/versions/v0.4/pages/guides/omni/medpix_peft.jpg and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/vlm/mistralm35.png b/fern/versions/v0.4/pages/guides/vlm/mistralm35.png
deleted file mode 100644
index 132b75c287..0000000000
Binary files a/fern/versions/v0.4/pages/guides/vlm/mistralm35.png and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/vlm/qwen3_5.png b/fern/versions/v0.4/pages/guides/vlm/qwen3_5.png
deleted file mode 100644
index 1f0ddc0ef2..0000000000
Binary files a/fern/versions/v0.4/pages/guides/vlm/qwen3_5.png and /dev/null differ
diff --git a/fern/versions/v0.4/pages/guides/vlm/qwen3_5scores.png b/fern/versions/v0.4/pages/guides/vlm/qwen3_5scores.png
deleted file mode 100644
index f636a60890..0000000000
Binary files a/fern/versions/v0.4/pages/guides/vlm/qwen3_5scores.png and /dev/null differ
diff --git a/skills/README.md b/skills/README.md
index eebb5f1a4e..76b7aeea17 100644
--- a/skills/README.md
+++ b/skills/README.md
@@ -26,4 +26,4 @@ To invoke a skill manually, use `/<skill-name>` in your Claude Code session.
 | `cicd` | Commit/PR workflow, CI trigger mechanism, failure investigation |
 | `build-and-dependency` | Container setup, uv package management, environment variables, CLI usage |
 | `testing` | Unit and functional test layout, tier semantics (L0/L1/L2), adding tests |
-| `fern-docs` | Maintain the Fern docs site under `fern/` — pages, slugs, redirects, version aliases, library reference |
\ No newline at end of file
+| `fern-docs` | Maintain the Fern docs site under `docs/` (MDX content) + `docs/fern/` (infra) — pages, slugs, redirects, version aliases, library reference |
\ No newline at end of file
diff --git a/skills/fern-docs/SKILL.md b/skills/fern-docs/SKILL.md
index 21ecb731bd..cc5c07bd11 100644
--- a/skills/fern-docs/SKILL.md
+++ b/skills/fern-docs/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: fern-docs
-description: Maintain the NeMo AutoModel Fern docs site under fern/ — add, update, move, or remove pages; manage redirects, slugs, navigation, and version aliases; run validation and previews.
+description: Maintain the NeMo AutoModel Fern docs site under docs/ (MDX content) + docs/fern/ (infra) — add, update, move, or remove pages; manage redirects, slugs, navigation, and version aliases; run validation and previews.
 when_to_use: Editing or adding documentation pages, fixing broken links, renaming a slug, updating the sidebar, adding a redirect, regenerating the API library reference, debugging fern check / broken-link errors, cutting a new version train, 'edit docs', 'add doc page', 'fern check failing', 'preview fails locally'.
 ---
 
@@ -10,50 +10,54 @@ Unified skill for adding, updating, moving, and removing pages on the NeMo AutoM
 
 ## Scope rule
 
-**ALL docs edits happen under `fern/`.** The legacy Sphinx tree at `docs/` is read-only reference; do not add new pages there. New pages, release notes, migration guides — everything belongs under `fern/versions/nightly/pages/`.
+**Nightly MDX content lives directly under `docs/`, outside `docs/fern/`** (e.g. `docs/index.mdx`, `docs/guides/llm/finetune.mdx`). **`docs/fern/` holds only Fern build infrastructure** — config, theme, components, and the frozen v0.4 snapshot. New pages, release notes, migration guides → add as an `.mdx` file under `docs/` (in the matching subdirectory), never under `docs/fern/`.
 
 **Two real content trees, plus a GA alias YAML.**
 
-- `fern/versions/nightly/pages/` — bleeding-edge tree. Every PR lands here. Mounted at the `nightly` URL slug via `nightly.yml`.
-- `fern/versions/v0.4/pages/` — frozen 0.4.0 GA snapshot. Independent copy of every page. Only changes via deliberate back-port. Mounted at the `v0.4` URL slug via `v0.4.yml`.
-- `fern/versions/latest.yml` — GA alias. Its `path:` lines mount the current GA's content (today: `./v0.4/pages/...`). Repointed at the next GA's tree when one is cut.
+- `docs/` — bleeding-edge (nightly) tree. Every PR lands here. Mounted at the `nightly` URL slug via `docs/fern/versions/nightly.yml` (paths reach back up via `../../<rel>.mdx`).
+- `docs/fern/versions/v0.4/pages/` — frozen 0.4.0 GA snapshot. Independent copy of every page. Only changes via deliberate back-port. Mounted at the `v0.4` URL slug via `v0.4.yml`.
+- `docs/fern/versions/latest.yml` — GA alias. Its `path:` lines mount the current GA's content (today: `./v0.4/pages/...`). Repointed at the next GA's tree when one is cut.
 
-The two trees were byte-for-byte identical at the moment 0.4.0 shipped (today, just after the migration), but they will diverge as nightly accumulates post-release edits and v0.4 stays frozen. **Default editing target is `nightly/`.** Only touch `v0.4/` for explicit back-ports — call out the divergence in the PR description.
+The two trees were byte-for-byte identical at the moment 0.4.0 shipped, but they will diverge as nightly accumulates post-release edits and v0.4 stays frozen. **Default editing target is `docs/` top-level.** Only touch `docs/fern/versions/v0.4/pages/` for explicit back-ports — call out the divergence in the PR description.
 
 **Sidebar fidelity rule.** Section captions, page titles, and Model Coverage child ordering must match the **published v0.4.0 sidebar at docs.nvidia.com/nemo/automodel/latest** verbatim. Don't silently shorten a title or reorder siblings — the docs PM and content engineers diff against the published site and any drift is treated as a regression. If you want a shorter sidebar label, change the toctree-derived display name in the source — never just retitle in the MDX.
 
 ## Layout at a glance
 
 ```
-fern/
-├── fern.config.json              # Org slug + Fern CLI pin (4.62.4+)
-├── docs.yml                      # Site config: instances, versions, redirects, libraries, theme
-├── main.css                      # NVIDIA-green theme overrides
-├── assets/                       # Logos and shared SVGs (NVIDIA_dark/light/symbol)
-├── components/                   # BadgeLinks.tsx, Tag.tsx, CustomFooter.tsx
-├── versions/
-│   ├── nightly.yml               # Nav for bleeding-edge — paths → ./nightly/pages/
-│   ├── nightly/pages/            # Bleeding-edge MDX (edited every PR)
-│   ├── v0.4.yml                  # Nav for frozen 0.4.0 — paths → ./v0.4/pages/
-│   ├── v0.4/pages/               # Frozen 0.4.0 MDX (back-ports only)
-│   └── latest.yml                # GA alias — paths → ./v0.4/pages/ today; repointed at next GA cut
-└── product-docs/                 # GENERATED Python API reference (gitignored)
+docs/                                ← nightly MDX (top level)
+├── index.mdx, breaking-changes.mdx, release-notes.mdx, ...
+├── about/, guides/, model-coverage/, launcher/, api-reference/
+├── *.png / *.jpg                    ← page-scoped images
+└── fern/                            ← infra only
+    ├── fern.config.json             # Org slug + Fern CLI pin (5.29.0+)
+    ├── docs.yml                     # Site config + global-theme: nvidia (inherits
+    │                                #   logos / footer / theme CSS / fonts / OneTrust JS
+    │                                #   from NVIDIA/fern-components)
+    ├── components/                  # BadgeLinks.tsx, Tag.tsx
+    │                                #   (repo-specific; NVIDIA footer ships in global theme)
+    ├── versions/
+    │   ├── nightly.yml              # Nav for nightly — paths → ../../<rel>.mdx (up into docs/)
+    │   ├── v0.4.yml                 # Nav for frozen 0.4.0 — paths → ./v0.4/pages/
+    │   ├── v0.4/pages/              # Frozen 0.4.0 MDX (back-ports only)
+    │   └── latest.yml               # GA alias — paths → ./v0.4/pages/ today; repointed at next GA cut
+    └── product-docs/                # GENERATED Python API reference (gitignored)
 ```
 
 ```
-File                                                      URL
-─────────────────────────────────────────────────────────  ────────────────────────────────────────────
-fern/versions/nightly/pages/get-started/installation.mdx     /latest/get-started/installation
-                                                          /v0.4/get-started/installation
-                                                          /nightly/get-started/installation
+File                                                          URL
+─────────────────────────────────────────────────────────────  ────────────────────────────────────────────
+docs/get-started/installation.mdx                              /nightly/get-started/installation
+docs/fern/versions/v0.4/pages/get-started/installation.mdx     /latest/get-started/installation
+                                                               /v0.4/get-started/installation
 ```
 
 ## Operations
 
 ### Add a page
 
-1. Gather: title, target section, filename (kebab-case `.mdx`), subdirectory under `fern/versions/nightly/pages/`.
-2. Create the MDX with frontmatter:
+1. Gather: title, target section, filename (kebab-case `.mdx`), subdirectory under `docs/`.
+2. Create the MDX at `docs/<subdir>/<filename>.mdx` with frontmatter:
 
    ```mdx
    ---
@@ -65,28 +69,27 @@ fern/versions/nightly/pages/get-started/installation.mdx     /latest/get-started
    <body — typically no leading `# H1`; Fern renders the title automatically>
    ```
 
-3. Add a `- page:` entry to `fern/versions/nightly.yml` under the right `section:`, with an explicit `slug:` if the desired URL differs from the slugified title:
+3. Add a `- page:` entry to `docs/fern/versions/nightly.yml` under the right `section:`, with `path:` reaching up into `docs/` via `../../`:
 
    ```yaml
    - page: "<Page Title>"
-     path: ./nightly/pages/<subdir>/<filename>.mdx
+     path: ../../<subdir>/<filename>.mdx
      slug: <short-url-segment>
    ```
 
-4. **Sync the aliases:** `cp fern/versions/nightly.yml fern/versions/latest.yml && sed -i '' 's|./nightly/pages/|./v0.4/pages/|g' fern/versions/latest.yml`.
-5. `make docs-check` (runs `fern check`) and verify URL resolves on `make docs` preview.
+4. `make docs-check` (runs `fern check`) and verify URL resolves on `make docs` preview. There is no `nightly.yml` ↔ `latest.yml` alias-sync step under this layout — `latest.yml` mounts the frozen v0.4 tree, not nightly, so it's intentionally out of sync.
 
 ### Update a page
 
-1. Locate by path, title, or keyword: `grep -rn "<keyword>" fern/versions/nightly/pages/ --include="*.mdx"`.
-2. **Content only** — edit the single MDX file. There is no mirror to maintain (latest/nightly serve the same content).
-3. **Title change** — update the frontmatter `title:` and (if the page is in `versions/nightly.yml`) update the `- page:` entry's display label. Re-sync `latest.yml` and `v0.4.yml`.
-4. **Section move** — `git mv` the file, update `path:` in `versions/nightly.yml`, fix incoming links, re-sync aliases.
-5. **Slug change** — change `slug:` in the YAML (or rename the file and let the default slug update). Add a `redirects:` entry in `docs.yml` so the old URL keeps working.
+1. Locate by path, title, or keyword: `grep -rn "<keyword>" docs/ --include="*.mdx" --exclude-dir=fern`.
+2. **Content only** — edit the single MDX file at `docs/<...>.mdx`.
+3. **Title change** — update the frontmatter `title:` and update the `- page:` entry's display label in `docs/fern/versions/nightly.yml`.
+4. **Section move** — `git mv` the file within `docs/`, update `path:` in `nightly.yml`, fix incoming links.
+5. **Slug change** — change `slug:` in the YAML (or rename the file and let the default slug update). Add a `redirects:` entry in `docs/fern/docs.yml` so the old URL keeps working.
 
 ### Redirect quirks
 
-Four things to watch when editing `redirects:` in `fern/docs.yml`:
+Four things to watch when editing `redirects:` in `docs/fern/docs.yml`:
 
 1. **`:path*` does NOT match the empty-path case.** `/<basepath>/v0.4/:path*/index.html` will *not* match `/<basepath>/v0.4/index.html` (where `:path*` would have to be empty). Each version-root `index.html` needs its own explicit rule. NeMo Curator (NVIDIA-NeMo/Curator#1938) discovered this when their version-root URLs 404'd. AutoModel ships explicit rules for `latest`, `v0.4`, `nightly`, and the legacy `0.4` form — when you add a new version slug, add four new explicit rules: `<slug>/index.html`, `<slug>/index`, plus the same two for any legacy form (e.g. `0.5` → `v0.5`).
 2. **Older un-migrated versions need a fallback.** Whatever versions the published Sphinx site exposed (check the version-switcher dropdown on `docs.nvidia.com/nemo/<product>/latest/`) but you didn't migrate into Fern still need to resolve. The pattern: redirect each old slug's URLs to the equivalent path under `/latest/` so external bookmarks and search results land on the closest current page instead of 404ing. Five rules per old version: `<slug>/index.html`, `<slug>/index`, `<slug>/:path*/index.html`, `<slug>/:path*`, `<slug>/:path*.html` — all destinations `/latest/...`. AutoModel ships these for `0.3.0`, `0.2.0`, `0.1.0`.
@@ -95,17 +98,17 @@ Four things to watch when editing `redirects:` in `fern/docs.yml`:
 
 ### Remove a page
 
-1. Find incoming links: `grep -rn "<filename>" fern/versions/nightly/pages/ --include="*.mdx"`.
-2. `git rm fern/versions/nightly/pages/<path>.mdx`.
-3. Remove the `- page:` block from `versions/nightly.yml` (and re-sync `latest.yml` / `nightly.yml`).
+1. Find incoming links: `grep -rn "<filename>" docs/ --include="*.mdx" --exclude-dir=fern`.
+2. `git rm docs/<...>.mdx`.
+3. Remove the `- page:` block from `docs/fern/versions/nightly.yml`.
 4. Fix or delete incoming links.
-5. Add a redirect in `docs.yml` if the URL was public.
+5. Add a redirect in `docs/fern/docs.yml` if the URL was public.
 
 ### Worked example: add a guide
 
 Request: *"Add a fine-tuning guide for Qwen3.6 under Recipes & E2E Examples."*
 
-1. Create `fern/versions/nightly/pages/guides/llm/qwen3-6-finetune.mdx`:
+1. Create `docs/guides/llm/qwen3-6-finetune.mdx`:
 
    ```mdx
    ---
@@ -116,23 +119,22 @@ Request: *"Add a fine-tuning guide for Qwen3.6 under Recipes & E2E Examples."*
    This guide walks through fine-tuning Qwen3.6 with NeMo AutoModel...
    ```
 
-2. Add to `fern/versions/nightly.yml` under the `Recipes & E2E Examples` section, slotted in publication-order with the other fine-tune entries:
+2. Add to `docs/fern/versions/nightly.yml` under the `Recipes & E2E Examples` section, slotted in publication-order with the other fine-tune entries:
 
    ```yaml
    - page: "Fine-Tune Qwen3.6"
-     path: ./nightly/pages/guides/llm/qwen3-6-finetune.mdx
+     path: ../../guides/llm/qwen3-6-finetune.mdx
      slug: qwen3-6-finetune
    ```
 
-3. `cp fern/versions/nightly.yml fern/versions/latest.yml && sed -i '' 's|./nightly/pages/|./v0.4/pages/|g' fern/versions/latest.yml`.
-4. `make docs-check` then `make docs` to preview at `http://localhost:3002/latest/recipes-e2e-examples/qwen3-6-finetune`.
+3. `make docs-check` then `make docs` to preview at `http://localhost:3002/nightly/recipes-e2e-examples/qwen3-6-finetune`.
 
 ### Worked example: rename a slug with a redirect
 
 Request: *"Rename `/recipes-e2e-examples/sft-peft` to `/recipes-e2e-examples/fine-tuning`."*
 
-1. Edit `versions/nightly.yml`, change the `slug:` on the SFT & PEFT entry from `sft-peft` to `fine-tuning`.
-2. Add a redirect to `fern/docs.yml`:
+1. Edit `docs/fern/versions/nightly.yml`, change the `slug:` on the SFT & PEFT entry from `sft-peft` to `fine-tuning`.
+2. Add a redirect to `docs/fern/docs.yml`:
 
    ```yaml
    redirects:
@@ -140,8 +142,7 @@ Request: *"Rename `/recipes-e2e-examples/sft-peft` to `/recipes-e2e-examples/fin
        destination: "/:version/recipes-e2e-examples/fine-tuning"
    ```
 
-3. `grep -rn "/recipes-e2e-examples/sft-peft" fern/versions/nightly/pages/` and update incoming body links.
-4. Re-sync `latest.yml` and `v0.4.yml`.
+3. `grep -rn "/recipes-e2e-examples/sft-peft" docs/ --include="*.mdx" --exclude-dir=fern` and update incoming body links.
 
 ## Content guidelines
 
@@ -167,7 +168,7 @@ import { BadgeLinks } from "@/components/BadgeLinks";
 
 `<Tag variant="...">` accepts: `primary`, `secondary`, `success`, `warning`, `danger`, `info`, `light`, `dark` (1:1 with sphinx-design `{bdg-*}` variants).
 
-Images live in `fern/assets/` (shared across all pages) or alongside the MDX file (page-scoped). Reference page-scoped images with relative paths (`./image.png`), not absolute (`/image.png`) — Fern's path resolver doesn't normalize root-relative image paths the same way as link targets.
+Page-scoped images live alongside the MDX file (e.g. `docs/guides/audio/qwen_omni_asr.png`). Reference them with relative paths (`./image.png`), not absolute (`/image.png`) — Fern's path resolver doesn't normalize root-relative image paths the same way as link targets. The NVIDIA logos and favicon come from the `nvidia` global theme; do not add them locally.
 
 ## Frontmatter
 
@@ -204,15 +205,17 @@ For cross-repo references (yaml configs, Python source), use absolute GitHub URL
 make docs-check          # `fern check` — config + MDX validation
 ```
 
+Run from `docs/fern/` (`cd docs/fern && make docs-check`) or from anywhere with `make -C docs/fern docs-check`.
+
 `fern check` must pass before commit. The dev server's broken-link warnings for version-prefixed routes (e.g. `/latest/get-started/installation` from MDX that uses `/get-started/installation`) are **false positives** — Fern's strict validator doesn't resolve version-agnostic links. The published site renders them correctly. The URLMap-based `validate_fern_internal_links.py` (under the convert-to-fern toolkit) is authoritative.
 
-To regenerate the autodoc library reference (gitignored under `product-docs/`):
+To regenerate the autodoc library reference (gitignored under `docs/fern/product-docs/`):
 
 ```bash
 make docs                # runs `fern docs md generate` then `fern docs dev`
 ```
 
-`fern docs md generate` populates `product-docs/` from the `nemo_automodel` package source declared in `docs.yml` `libraries:` block. Without this step, a cold `fern docs dev` fails with `Folder not found: ./product-docs/...`.
+`fern docs md generate` populates `docs/fern/product-docs/` from the `nemo_automodel` package source declared in the `libraries:` block of `docs.yml`. Without this step, a cold `fern docs dev` fails with `Folder not found: ./product-docs/...`.
 
 ## Preview and publish
 
@@ -223,14 +226,14 @@ make docs                # runs `fern docs md generate` then `fern docs dev`
 | Shared preview URL on `*.docs.buildwithfern.com` (needs `DOCS_FERN_TOKEN`) | `make docs-preview` |
 | Trigger production publish workflow on `origin/main` | `make docs-publish` |
 
-PRs that touch `fern/**` get an automatic Fern preview URL posted as a 🌿 comment by `fern-docs-preview-comment.yml`. No manual step.
+PRs that touch `docs/**` get an automatic Fern preview URL posted as a 🌿 comment by `fern-docs-preview-comment.yml`. No manual step.
 
 ```
                     ┌─ fern-docs-ci.yml                  → fern check (push to pull-request/<n>)
-PR (touches fern/) ─┼─ fern-docs-preview-build.yml       → upload fern/ artifact (no secrets)
+PR (touches docs/) ─┼─ fern-docs-preview-build.yml       → upload docs/fern/ artifact (no secrets)
                     └─ fern-docs-preview-comment.yml     → 🌿 preview URL comment
 
-Push to main (touches fern/) → publish-fern-docs.yml → docs.nvidia.com/nemo/automodel
+Push to main (touches docs/) → publish-fern-docs.yml → docs.nvidia.com/nemo/automodel
 Tag push (docs/v*)           → publish-fern-docs.yml → docs.nvidia.com/nemo/automodel
 Manual dispatch              → publish-fern-docs.yml → docs.nvidia.com/nemo/automodel
 ```
@@ -241,11 +244,11 @@ The preview-comment + publish jobs require the `DOCS_FERN_TOKEN` org secret (alr
 
 When NeMo AutoModel ships a new GA (e.g. `v0.5`):
 
-1. `cp -r fern/versions/nightly fern/versions/v0.5` — frozen snapshot of the bleeding-edge tree at release.
-2. `cp fern/versions/nightly.yml fern/versions/v0.5.yml` and rewrite `./nightly/` path prefixes to `./v0.5/`.
-3. Update `fern/versions/latest.yml` to point at the new train: `cp fern/versions/v0.5.yml fern/versions/latest.yml`. (`latest` is the auto-bumping GA alias.)
-4. In `fern/docs.yml` `versions:`, add a new frozen-pin entry (`display-name: "0.5.0 · 26.07"`, `slug: v0.5`, `availability: stable`) and keep the previous pin (`v0.4`) for permalink stability.
-5. `fern/versions/nightly/pages/` keeps moving forward as the bleeding-edge tree; the new `fern/versions/v0.5/pages/` is the frozen GA snapshot and only changes via deliberate back-port.
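+1. `mkdir -p docs/fern/versions/v0.5/pages && rsync -a --exclude='/fern/' docs/ docs/fern/versions/v0.5/pages/` — fresh frozen snapshot of nightly at release time. The leading `/` anchors the exclude to the top of `docs/`, so only the `docs/fern/` infra tree is skipped.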
+2. `cp docs/fern/versions/nightly.yml docs/fern/versions/v0.5.yml` and rewrite `../../` path prefixes to `./v0.5/pages/`.
+3. Update `docs/fern/versions/latest.yml` to point at the new train: `cp docs/fern/versions/v0.5.yml docs/fern/versions/latest.yml`. (`latest` is the auto-bumping GA alias.)
+4. In `docs/fern/docs.yml` `versions:`, add a new frozen-pin entry (`display-name: "0.5.0 · 26.07"`, `slug: v0.5`, `availability: stable`) and keep the previous pin (`v0.4`) for permalink stability.
+5. `docs/` keeps moving forward as the bleeding-edge tree; the new `docs/fern/versions/v0.5/pages/` is the frozen GA snapshot and only changes via deliberate back-port.
 6. Promote `nightly` to `availability: stable` if and when its content tree gets cut over.
 7. Tag `docs/v0.5.0` and push to publish.
 
@@ -263,29 +266,28 @@ If sign-off is missing on a recent commit, amend with `git commit --amend -s`. P
 
 | Symptom | Fix |
 |---|---|
-| `fern check` YAML error | 2-space indent; `- page:` inside `contents:`; `path:` is relative to the version YAML; `slug:` must not collide with siblings |
+| `fern check` YAML error | 2-space indent; `- page:` inside `contents:`; `path:` is relative to the version YAML's own location (so `nightly.yml` entries reach back up into `docs/` via `../../`); `slug:` must not collide with siblings |
 | Page 404 in preview | Missing `slug:` override (default slugifies the long display title) or `position:` collision in an auto-discovered folder |
 | `Folder not found: ./product-docs/...` on `fern docs dev` | Run `make docs` once to populate the library reference |
 | `[ERR_PNPM_IGNORED_BUILDS]` on first `fern docs dev` | pnpm 10+ blocks esbuild's postinstall — `pnpm config set onlyBuiltDependencies '["esbuild"]' --location global`, then `rm -rf ~/.fern/app-preview` and retry |
 | Broken-link warning on version-agnostic path | `fern docs broken-links` false-positives; URLMap-based validator is authoritative |
 | `JSX expressions must have one parent element` | Wrap multi-element JSX in `<>...</>` or a `<div>` |
-| Old Sphinx URL breaks | Add a `redirects:` entry in `fern/docs.yml`; the redirect generator already handles `/index.html` and `.html` legacy forms |
+| Old Sphinx URL breaks | Add a `redirects:` entry in `docs/fern/docs.yml`; the redirect generator already handles `/index.html` and `.html` legacy forms |
 | Image not rendering | Use relative path (`./image.png`) for page-scoped images, not root-relative (`/image.png`) |
-| Sidebar caption looks shortened vs published site | Compare against `docs.nvidia.com/nemo/automodel/latest` and restore the verbatim title in `versions/nightly.yml` |
-| `latest.yml` or `v0.4.yml` drift from `nightly.yml` | Re-sync: `cp fern/versions/nightly.yml fern/versions/latest.yml && sed -i '' 's|./nightly/pages/|./v0.4/pages/|g' fern/versions/latest.yml` |
+| Sidebar caption looks shortened vs published site | Compare against `docs.nvidia.com/nemo/automodel/latest` and restore the verbatim title in `docs/fern/versions/nightly.yml` |
+| `path: ../../foo.mdx` doesn't resolve | Confirm the MDX file is at `docs/foo.mdx` (top level), not still under `docs/fern/versions/nightly/pages/` — that legacy tree no longer exists |
 
 ## Key references
 
 | File | Purpose |
 |---|---|
-| `fern/docs.yml` | Site config — `instances`, `versions`, `redirects`, `libraries`, theme |
-| `fern/versions/nightly.yml` | Canonical nav tree |
-| `fern/versions/{latest,v0.4}.yml` | Aliases — content copies of `nightly.yml` |
-| `fern/versions/nightly/pages/` | MDX content (130+ pages) |
-| `fern/components/` | `BadgeLinks.tsx`, `Tag.tsx`, `CustomFooter.tsx` |
-| `fern/main.css` | Theme overrides — NVIDIA green, badge spacing |
-| `fern/README.md` | Human-facing orientation |
-| `fern/Makefile` | `make docs / docs-check / docs-preview / docs-publish` (run from `fern/` or via `make -C fern`) |
+| `docs/fern/docs.yml` | Site config — `instances`, `versions`, `redirects`, `libraries`, theme |
+| `docs/fern/versions/nightly.yml` | Canonical nav tree — paths reach up into `docs/` via `../../` |
+| `docs/fern/versions/{latest,v0.4}.yml` | GA nav — `v0.4.yml` is the frozen pin, `latest.yml` is the auto-bumping GA alias (both mount `./v0.4/pages/...`) |
+| `docs/` (top-level `*.mdx`) | Nightly MDX content (~140 pages + page-scoped images) |
+| `docs/fern/versions/v0.4/pages/` | Frozen 0.4.0 snapshot (back-ports only) |
+| `docs/fern/components/` | `BadgeLinks.tsx`, `Tag.tsx` (repo-specific; NVIDIA footer ships via `global-theme: nvidia`) |
+| `docs/fern/README.md` | Human-facing orientation |
+| `docs/fern/Makefile` | `make docs / docs-check / docs-preview / docs-publish` (run from `docs/fern/` or via `make -C docs/fern`) |
 | `.github/workflows/fern-docs-*.yml` | CI: check, preview build, preview comment |
 | `.github/workflows/publish-fern-docs.yml` | CI: publish to docs.nvidia.com/nemo/automodel |
-| `docs/` | **Legacy** Sphinx source — read-only reference for fidelity checks |